Commit 72c50366 authored by Dmitry Kazakov

Optimized vector composite ops by a further 1.5-2 times

The Uint<->Float conversion is quite expensive in comparison to
Int<->Float (2-2.5 times slower). This happens because of the special
code that handles the sign bit of the number. So discarding this bit
by converting Uint->Int first gives a huge speedup.

Now the vector version of the composition is 1.8-8.7 times faster
than the old version (weighted: 3.2 times).

Many thanks to Matthias Kretz for pointing this out!

CCMAIL:kimageshop@kde.org
CCMAIL:kretz@kde.org
parent bdb00b30
......@@ -516,5 +516,97 @@ void KisCompositionBenchmark::benchmarkMemcpy()
freeTiles(tiles, 0, 0);
}
/**
 * Benchmarks the direct Vc::uint_v -> Vc::float_v conversion over a buffer
 * of dataSize 8-bit source values, storing the results as floats.
 *
 * The body is compiled only when HAVE_VC is defined; otherwise the test
 * is empty.
 */
void KisCompositionBenchmark::benchmarkUintFloat()
{
#ifdef HAVE_VC
    const int vecSize = Vc::float_v::Size;
    const int dataSize = 4096;

    quint8 *iData = static_cast<quint8*>(memalign(vecSize, dataSize));
    float *fData = static_cast<float*>(memalign(vecSize * sizeof(float), dataSize * sizeof(float)));

    // memalign() hands back uninitialized memory: fill the source buffer
    // with deterministic values so that the timed loop never reads
    // indeterminate data (done outside QBENCHMARK, so it is not measured)
    for (int i = 0; i < dataSize; i++) {
        iData[i] = static_cast<quint8>(i & 0xFF);
    }

    QBENCHMARK {
        for (int i = 0; i < dataSize; i += vecSize) {
            // convert uint -> float directly, this causes
            // the static_cast helper to be called
            Vc::float_v b(Vc::uint_v(iData + i));
            b.store(fData + i);
        }
    }

    free(iData);
    free(fData);
#endif
}
/**
 * Benchmarks the Vc::uint_v -> Vc::int_v -> Vc::float_v conversion chain
 * over a buffer of dataSize 8-bit source values, storing the results as
 * floats.
 *
 * The body is compiled only when HAVE_VC is defined; otherwise the test
 * is empty.
 */
void KisCompositionBenchmark::benchmarkUintIntFloat()
{
#ifdef HAVE_VC
    const int vecSize = Vc::float_v::Size;
    const int dataSize = 4096;

    quint8 *iData = static_cast<quint8*>(memalign(vecSize, dataSize));
    float *fData = static_cast<float*>(memalign(vecSize * sizeof(float), dataSize * sizeof(float)));

    // memalign() hands back uninitialized memory: fill the source buffer
    // with deterministic values so that the timed loop never reads
    // indeterminate data (done outside QBENCHMARK, so it is not measured)
    for (int i = 0; i < dataSize; i++) {
        iData[i] = static_cast<quint8>(i & 0xFF);
    }

    QBENCHMARK {
        for (int i = 0; i < dataSize; i += vecSize) {
            // convert uint->int->float, that avoids the special sign
            // treatment, and gives a 2.6 times speedup
            Vc::float_v b(Vc::int_v(Vc::uint_v(iData + i)));
            b.store(fData + i);
        }
    }

    free(iData);
    free(fData);
#endif
}
/**
 * Benchmarks the direct Vc::float_v -> Vc::uint_v conversion over a buffer
 * of dataSize floats, storing the results as 32-bit unsigned integers.
 *
 * The body is compiled only when HAVE_VC is defined; otherwise the test
 * is empty.
 */
void KisCompositionBenchmark::benchmarkFloatUint()
{
#ifdef HAVE_VC
    const int vecSize = Vc::float_v::Size;
    const int dataSize = 4096;

    quint32 *iData = static_cast<quint32*>(memalign(vecSize * sizeof(quint32), dataSize * sizeof(quint32)));
    float *fData = static_cast<float*>(memalign(vecSize * sizeof(float), dataSize * sizeof(float)));

    // memalign() hands back uninitialized memory: fill the source buffer
    // with deterministic values in the range 0..255, so that the timed
    // loop never converts indeterminate floats (NaN, out-of-range, ...).
    // Done outside QBENCHMARK, so it is not measured.
    for (int i = 0; i < dataSize; i++) {
        fData[i] = static_cast<float>(i & 0xFF);
    }

    QBENCHMARK {
        for (int i = 0; i < dataSize; i += vecSize) {
            // conversion float -> uint
            Vc::uint_v b(Vc::float_v(fData + i));
            b.store(iData + i);
        }
    }

    free(iData);
    free(fData);
#endif
}
/**
 * Benchmarks the Vc::float_v -> Vc::int_v -> Vc::uint_v conversion chain
 * over a buffer of dataSize floats, storing the results as 32-bit unsigned
 * integers.
 *
 * The body is compiled only when HAVE_VC is defined; otherwise the test
 * is empty.
 */
void KisCompositionBenchmark::benchmarkFloatIntUint()
{
#ifdef HAVE_VC
    const int vecSize = Vc::float_v::Size;
    const int dataSize = 4096;

    quint32 *iData = static_cast<quint32*>(memalign(vecSize * sizeof(quint32), dataSize * sizeof(quint32)));
    float *fData = static_cast<float*>(memalign(vecSize * sizeof(float), dataSize * sizeof(float)));

    // memalign() hands back uninitialized memory: fill the source buffer
    // with deterministic values in the range 0..255, so that the timed
    // loop never converts indeterminate floats (NaN, out-of-range, ...).
    // Done outside QBENCHMARK, so it is not measured.
    for (int i = 0; i < dataSize; i++) {
        fData[i] = static_cast<float>(i & 0xFF);
    }

    QBENCHMARK {
        for (int i = 0; i < dataSize; i += vecSize) {
            // conversion float -> int -> uint
            Vc::uint_v b(Vc::int_v(Vc::float_v(fData + i)));
            b.store(iData + i);
        }
    }

    free(iData);
    free(fData);
#endif
}
// NOTE(review): presumably expands to the main() entry point that runs the
// KisCompositionBenchmark test slots, with GUI support requested — confirm
// against the definition of QTEST_KDEMAIN.
QTEST_KDEMAIN(KisCompositionBenchmark, GUI)
......@@ -43,6 +43,11 @@ private slots:
void testRgb8CompositeOverReal_Aligned();
void benchmarkMemcpy();
void benchmarkUintFloat();
void benchmarkUintIntFloat();
void benchmarkFloatUint();
void benchmarkFloatIntUint();
};
#endif /* __KIS_COMPOSITION_BENCHMARK_H */
......@@ -94,7 +94,7 @@ static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha) {
*/
inline Vc::float_v fetch_mask_8(const quint8 *data) {
    // Load the 8-bit mask values from data into an unsigned integer vector
    Vc::uint_v data_i(data);

    // Convert through int_v instead of going uint -> float directly: the
    // unsigned conversion needs extra code to handle the sign bit and is
    // considerably slower. The values originate from quint8, so
    // reinterpreting them as signed integers does not change them.
    return Vc::float_v(Vc::int_v(data_i));
}
/**
......@@ -118,7 +118,7 @@ inline Vc::float_v fetch_alpha_32(const quint8 *data) {
data_i.load((const quint32*)data, Vc::Unaligned);
}
return Vc::float_v(data_i >> 24);
return Vc::float_v(Vc::int_v(data_i >> 24));
}
/**
......@@ -148,9 +148,9 @@ inline void fetch_colors_32(const quint8 *data,
const quint32 lowByteMask = 0xFF;
Vc::uint_v mask(lowByteMask);
c1 = Vc::float_v((data_i >> 16) & mask);
c2 = Vc::float_v((data_i >> 8) & mask);
c3 = Vc::float_v( data_i & mask);
c1 = Vc::float_v(Vc::int_v((data_i >> 16) & mask));
c2 = Vc::float_v(Vc::int_v((data_i >> 8) & mask));
c3 = Vc::float_v(Vc::int_v( data_i & mask));
}
/**
......@@ -175,11 +175,11 @@ inline void write_channels_32(quint8 *data,
const quint32 lowByteMask = 0xFF;
Vc::uint_v mask(lowByteMask);
Vc::uint_v v1 = Vc::uint_v(alpha) << 24;
Vc::uint_v v2 = (Vc::uint_v(c1) & mask) << 16;
Vc::uint_v v3 = (Vc::uint_v(c2) & mask) << 8;
Vc::uint_v v1 = Vc::uint_v(Vc::int_v(alpha)) << 24;
Vc::uint_v v2 = (Vc::uint_v(Vc::int_v(c1)) & mask) << 16;
Vc::uint_v v3 = (Vc::uint_v(Vc::int_v(c2)) & mask) << 8;
v1 = v1 | v2;
Vc::uint_v v4 = Vc::uint_v(c3) & mask;
Vc::uint_v v4 = Vc::uint_v(Vc::int_v(c3)) & mask;
v3 = v3 | v4;
*((Vc::uint_v*)data) = v1 | v3;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment