Commit 91016738 authored by Thorsten Zachmann

This optimizes the Over composite op (OpOver) for RGBA F32 by using the Vc library.

New tests and benchmarks for the code have been added.

QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() Testing Composite Op: "normal" ( "RGBF32 Legacy" )
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   Mask   SrcRand DstRand" RESULT: 143 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "DstUnalig Mask   SrcRand DstRand" RESULT: 142 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "SrcUnalig Mask   SrcRand DstRand" RESULT: 143 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Unaligned Mask   SrcRand DstRand" RESULT: 144 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   NoMask SrcRand DstRand" RESULT: 59 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   NoMask SrcZero DstRand" RESULT: 9 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   NoMask SrcUnit DstRand" RESULT: 21 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   NoMask SrcRand DstZero" RESULT: 48 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   NoMask SrcZero DstZero" RESULT: 9 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   NoMask SrcUnit DstZero" RESULT: 18 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   NoMask SrcRand DstUnit" RESULT: 22 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   NoMask SrcZero DstUnit" RESULT: 9 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverLegacy() "Aligned   NoMask SrcUnit DstUnit" RESULT: 16 msec
PASS   : KisCompositionBenchmark::testRgbF32CompositeOverLegacy()
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() Testing Composite Op: "normal" ( "RGBF32 Optimized" )
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   Mask   SrcRand DstRand" RESULT: 17 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "DstUnalig Mask   SrcRand DstRand" RESULT: 17 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "SrcUnalig Mask   SrcRand DstRand" RESULT: 28 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Unaligned Mask   SrcRand DstRand" RESULT: 27 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   NoMask SrcRand DstRand" RESULT: 17 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   NoMask SrcZero DstRand" RESULT: 4 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   NoMask SrcUnit DstRand" RESULT: 13 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   NoMask SrcRand DstZero" RESULT: 16 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   NoMask SrcZero DstZero" RESULT: 4 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   NoMask SrcUnit DstZero" RESULT: 12 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   NoMask SrcRand DstUnit" RESULT: 13 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   NoMask SrcZero DstUnit" RESULT: 4 msec
QDEBUG : KisCompositionBenchmark::testRgbF32CompositeOverOptimized() "Aligned   NoMask SrcUnit DstUnit" RESULT: 12 msec
PASS   : KisCompositionBenchmark::testRgbF32CompositeOverOptimized()

(cherry picked from commit 3be07eee35505bf754f12617e5d9059c03c44d7c)
parent 5e70ee76
......@@ -32,11 +32,13 @@ private Q_SLOTS:
void checkRoundingAlphaDarken_05_10_08();
void checkRoundingOver();
void checkRoundingOverRgbaF32();
void compareAlphaDarkenOps();
void compareAlphaDarkenOpsNoMask();
void compareOverOps();
void compareOverOpsNoMask();
void compareRgbF32OverOps();
void testRgb8CompositeAlphaDarkenLegacy();
void testRgb8CompositeAlphaDarkenOptimized();
......@@ -44,6 +46,9 @@ private Q_SLOTS:
void testRgb8CompositeOverLegacy();
void testRgb8CompositeOverOptimized();
void testRgbF32CompositeOverLegacy();
void testRgbF32CompositeOverOptimized();
void testRgb8CompositeAlphaDarkenReal_Aligned();
void testRgb8CompositeOverReal_Aligned();
......
......@@ -79,6 +79,17 @@ struct OptimizedOpsSelector<KoLabU8Traits>
}
};
/**
 * Specialization of OptimizedOpsSelector for KoRgbF32Traits: chooses which
 * composite op implementations are created for this traits type.
 */
template<>
struct OptimizedOpsSelector<KoRgbF32Traits>
{
// Alpha darken: instantiates the KoCompositeOpAlphaDarken template for
// KoRgbF32Traits directly; no optimized factory is involved here.
static KoCompositeOp* createAlphaDarkenOp(const KoColorSpace *cs) {
return new KoCompositeOpAlphaDarken<KoRgbF32Traits>(cs);
}
// Over: delegated to KoOptimizedCompositeOpFactory::createOverOp128().
static KoCompositeOp* createOverOp(const KoColorSpace *cs) {
return KoOptimizedCompositeOpFactory::createOverOp128(cs);
}
};
template<class Traits>
struct AddGeneralOps<Traits, true>
{
......
......@@ -41,3 +41,8 @@ KoCompositeOp* KoOptimizedCompositeOpFactory::createOverOp32(const KoColorSpace
{
return createOptimizedClass<KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32> >(cs);
}
/**
 * Create the "over" composite op backed by KoOptimizedCompositeOpOver128
 * for the color space \p cs.
 *
 * The object is produced by createOptimizedClass() using the per-architecture
 * factory KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>.
 * NOTE(review): createOptimizedClass() is not visible here; presumably it
 * picks the implementation matching the running CPU -- confirm.
 */
KoCompositeOp* KoOptimizedCompositeOpFactory::createOverOp128(const KoColorSpace *cs)
{
return createOptimizedClass<KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128> >(cs);
}
......@@ -42,6 +42,7 @@ class PIGMENTCMS_EXPORT KoOptimizedCompositeOpFactory
public:
static KoCompositeOp* createAlphaDarkenOp32(const KoColorSpace *cs);
static KoCompositeOp* createOverOp32(const KoColorSpace *cs);
static KoCompositeOp* createOverOp128(const KoColorSpace *cs);
};
#endif /* KOOPTIMIZEDCOMPOSITEOPFACTORY_H */
......@@ -22,6 +22,7 @@
#include "KoOptimizedCompositeOpFactoryPerArch.h"
#include "KoOptimizedCompositeOpAlphaDarken32.h"
#include "KoOptimizedCompositeOpOver32.h"
#include "KoOptimizedCompositeOpOver128.h"
#include <QString>
#include "DebugPigment.h"
......@@ -48,6 +49,14 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32>::create<VC_IM
return new KoOptimizedCompositeOpOver32<VC_IMPL>(param);
}
/**
 * Per-architecture factory entry for KoOptimizedCompositeOpOver128:
 * returns a newly allocated KoOptimizedCompositeOpOver128<VC_IMPL>
 * constructed from \p param.
 *
 * NOTE(review): VC_IMPL is not defined in the visible code; presumably it is
 * set per compilation unit to the Vc implementation being built -- confirm.
 */
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::ReturnType
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::create<VC_IMPL>(ParamType param)
{
return new KoOptimizedCompositeOpOver128<VC_IMPL>(param);
}
#define __stringify(_s) #_s
#define stringify(_s) __stringify(_s)
......
......@@ -34,6 +34,9 @@ class KoOptimizedCompositeOpAlphaDarken32;
template<Vc::Implementation _impl>
class KoOptimizedCompositeOpOver32;
template<Vc::Implementation _impl>
class KoOptimizedCompositeOpOver128;
template<template<Vc::Implementation I> class CompositeOp>
struct KoOptimizedCompositeOpFactoryPerArch
{
......
......@@ -40,6 +40,14 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32>::create<Vc::S
return new KoCompositeOpOver<KoBgrU8Traits>(param);
}
/**
 * Factory entry for KoOptimizedCompositeOpOver128 when the Vc implementation
 * is Vc::ScalarImpl: instead of a KoOptimizedCompositeOpOver128 instance it
 * returns a newly allocated KoCompositeOpOver<KoRgbF32Traits> constructed
 * from \p param.
 */
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::ReturnType
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::create<Vc::ScalarImpl>(ParamType param)
{
return new KoCompositeOpOver<KoRgbF32Traits>(param);
}
template<>
KoReportCurrentArch::ReturnType
KoReportCurrentArch::create<Vc::ScalarImpl>(ParamType)
......
This diff is collapsed.
......@@ -25,7 +25,9 @@
#include <stdint.h>
#include <cstring>
#include <iostream>
#include <KoAlwaysInline.h>
#define BLOCKDEBUG 0
#pragma GCC diagnostic ignored "-Wcast-align"
......@@ -35,13 +37,13 @@ struct KoStreamedMath {
/**
* Composes src into dst without using vector instructions
*/
template<bool useMask, bool useFlow, class Compositor>
static void genericComposite32_novector(const KoCompositeOp::ParameterInfo& params)
template<bool useMask, bool useFlow, class Compositor, int pixelSize>
static void genericComposite_novector(const KoCompositeOp::ParameterInfo& params)
{
using namespace Arithmetic;
const qint32 linearInc = 4;
qint32 srcLinearInc = params.srcRowStride ? 4 : 0;
const qint32 linearInc = pixelSize;
qint32 srcLinearInc = params.srcRowStride ? pixelSize : 0;
quint8* dstRowStart = params.dstRowStart;
const quint8* maskRowStart = params.maskRowStart;
......@@ -74,6 +76,18 @@ template<bool useMask, bool useFlow, class Compositor>
}
}
/**
 * Convenience wrapper: runs genericComposite_novector() with a pixel size
 * of 4 bytes, forwarding \p params and the useMask / useFlow / Compositor
 * template arguments unchanged.
 */
template<bool useMask, bool useFlow, class Compositor>
static void genericComposite32_novector(const KoCompositeOp::ParameterInfo& params)
{
genericComposite_novector<useMask, useFlow, Compositor, 4>(params);
}
/**
 * Convenience wrapper: runs genericComposite_novector() with a pixel size
 * of 16 bytes, forwarding \p params and the useMask / useFlow / Compositor
 * template arguments unchanged.
 */
template<bool useMask, bool useFlow, class Compositor>
static void genericComposite128_novector(const KoCompositeOp::ParameterInfo& params)
{
genericComposite_novector<useMask, useFlow, Compositor, 16>(params);
}
/**
 * Convert \p value to quint8 by adding 0.5 and truncating the sum.
 *
 * No range check is performed on \p value.
 */
static inline quint8 round_float_to_uint(float value)
{
    return static_cast<quint8>(value + 0.5f);
}
......@@ -147,6 +161,48 @@ static inline void fetch_colors_32(const quint8 *data,
c3 = Vc::float_v(Vc::int_v( data_i & mask));
}
/**
*
*/
template <bool aligned>
static inline void fetch_all_32(const quint8 *data,
Vc::float_v &alpha,
Vc::float_v &c1,
Vc::float_v &c2,
Vc::float_v &c3) {
Vc::uint_v data_i;
if (aligned) {
data_i.load((const quint32*)data, Vc::Aligned);
} else {
data_i.load((const quint32*)data, Vc::Unaligned);
}
const quint32 lowByteMask = 0xFF;
Vc::uint_v mask(lowByteMask);
alpha = Vc::float_v(Vc::int_v(data_i >> 24));
c1 = Vc::float_v(Vc::int_v((data_i >> 16) & mask));
c2 = Vc::float_v(Vc::int_v((data_i >> 8) & mask));
c3 = Vc::float_v(Vc::int_v( data_i & mask));
}
template <bool aligned>
static inline void fetch_8_offset(const quint8 *data,
Vc::float_v &value,
const quint32 offset) {
Vc::uint_v data_i;
if (aligned) {
data_i.load((const quint32*)data, Vc::Aligned);
} else {
data_i.load((const quint32*)data, Vc::Unaligned);
}
const quint32 lowByteMask = 0xFF;
Vc::uint_v mask(lowByteMask);
value = Vc::float_v(Vc::int_v((data_i >> offset) & mask));
}
/**
* Pack color and alpha values to Vc::float_v::Size pixels 32-bit each
* (4 channels, 8 bit per channel). The color data is considered
......@@ -186,16 +242,16 @@ static inline void write_channels_32(quint8 *data,
* colorspaces. Uses \p Compositor strategy parameter for doing actual
* math of the composition
*/
template<bool useMask, bool useFlow, class Compositor>
static void genericComposite32(const KoCompositeOp::ParameterInfo& params)
template<bool useMask, bool useFlow, class Compositor, int pixelSize>
static void genericComposite(const KoCompositeOp::ParameterInfo& params)
{
using namespace Arithmetic;
const int vectorSize = Vc::float_v::Size;
const qint32 vectorInc = 4 * vectorSize;
const qint32 linearInc = 4;
const qint32 vectorInc = pixelSize * vectorSize;
const qint32 linearInc = pixelSize;
qint32 srcVectorInc = vectorInc;
qint32 srcLinearInc = 4;
qint32 srcLinearInc = pixelSize;
quint8* dstRowStart = params.dstRowStart;
const quint8* maskRowStart = params.maskRowStart;
......@@ -209,6 +265,12 @@ template<bool useMask, bool useFlow, class Compositor>
srcLinearInc = 0;
srcVectorInc = 0;
}
#if BLOCKDEBUG
int totalBlockAlign = 0;
int totalBlockAlignedVector = 0;
int totalBlockUnalignedVector = 0;
int totalBlockRest = 0;
#endif
for(quint32 r=params.rows; r>0; --r) {
// Hint: Mask is allowed to be unaligned
......@@ -243,9 +305,22 @@ template<bool useMask, bool useFlow, class Compositor>
} else if (params.cols > 2 * vectorSize) {
blockAlign = (vectorInc - dstAlignment) / 4;
const int restCols = params.cols - blockAlign;
*vectorBlock = restCols / vectorSize;
blockRest = restCols % vectorSize;
if (restCols > 0) {
*vectorBlock = restCols / vectorSize;
blockRest = restCols % vectorSize;
}
else {
blockAlign = params.cols;
*vectorBlock = 0;
blockRest = 0;
}
}
#if BLOCKDEBUG
totalBlockAlign += blockAlign;
totalBlockAlignedVector += blockAlignedVector;
totalBlockUnalignedVector += blockUnalignedVector;
totalBlockRest += blockRest;
#endif
for(int i = 0; i < blockAlign; i++) {
Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, optionalParams);
......@@ -296,10 +371,74 @@ template<bool useMask, bool useFlow, class Compositor>
}
}
#if BLOCKDEBUG
qDebug() << "I" << params.rows << totalBlockAlign << totalBlockAlignedVector << totalBlockUnalignedVector << totalBlockRest; // << srcAlignment << dstAlignment;
#endif
if (!params.srcRowStride) {
Vc::free<float>(reinterpret_cast<float*>(const_cast<quint8*>(srcRowStart)));
}
}
/**
 * Convenience wrapper: runs genericComposite() with a pixel size of
 * 4 bytes, forwarding \p params and the useMask / useFlow / Compositor
 * template arguments unchanged.
 */
template<bool useMask, bool useFlow, class Compositor>
static void genericComposite32(const KoCompositeOp::ParameterInfo& params)
{
genericComposite<useMask, useFlow, Compositor, 4>(params);
}
/**
 * Convenience wrapper: runs genericComposite() with a pixel size of
 * 16 bytes, forwarding \p params and the useMask / useFlow / Compositor
 * template arguments unchanged.
 */
template<bool useMask, bool useFlow, class Compositor>
static void genericComposite128(const KoCompositeOp::ParameterInfo& params)
{
genericComposite<useMask, useFlow, Compositor, 16>(params);
}
};
/**
 * Helpers that clear or copy exactly one pixel whose size in bytes is a
 * compile-time constant. Dedicated implementations exist for 4- and
 * 16-byte pixels; any other size reports a fatal error at runtime.
 *
 * The implementations go through std::memset / std::memmove rather than
 * storing through a quint32* / quint64* obtained by casting the quint8
 * pointer: the pixel memory is not necessarily an object of those integer
 * types (nor guaranteed to be aligned for them), so such type-punned
 * accesses are undefined behaviour under the strict aliasing rules.
 */
namespace KoStreamedMathFunctions {

/**
 * Fallback for pixel sizes without a dedicated implementation.
 * Calls qFatal("Not implemented"); the destination is never touched.
 */
template<int pixelSize>
ALWAYS_INLINE void clearPixel(quint8* /*dst*/)
{
    qFatal("Not implemented");
}

/**
 * Set all 4 bytes of the pixel at \p dst to zero.
 */
template<>
ALWAYS_INLINE void clearPixel<4>(quint8* dst)
{
    std::memset(dst, 0, 4);
}

/**
 * Set all 16 bytes of the pixel at \p dst to zero.
 */
template<>
ALWAYS_INLINE void clearPixel<16>(quint8* dst)
{
    std::memset(dst, 0, 16);
}

/**
 * Fallback for pixel sizes without a dedicated implementation.
 * Calls qFatal("Not implemented"); nothing is copied.
 */
template<int pixelSize>
ALWAYS_INLINE void copyPixel(const quint8* /*src*/, quint8* /*dst*/)
{
    qFatal("Not implemented");
}

/**
 * Copy the 4-byte pixel at \p src to \p dst.
 */
template<>
ALWAYS_INLINE void copyPixel<4>(const quint8 *src, quint8* dst)
{
    // memmove (not memcpy) so that src == dst stays well defined.
    std::memmove(dst, src, 4);
}

/**
 * Copy the 16-byte pixel at \p src to \p dst.
 */
template<>
ALWAYS_INLINE void copyPixel<16>(const quint8 *src, quint8* dst)
{
    // memmove (not memcpy) so that src == dst stays well defined.
    std::memmove(dst, src, 16);
}

}
#endif /* __KOSTREAMED_MATH_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment