Commit 72c50366 authored by Dmitry Kazakov's avatar Dmitry Kazakov

Optimized vector composite ops by a further 1.5-2 times

Uint<->Float conversion is quite expensive compared to
Int<->Float conversion (2-2.5 times slower). This happens because of
the special code that handles the sign bit of the number, so discarding
this bit by converting Uint->Int first gives a huge speedup.

Now the vector version of the composition is 1.8-8.7 times faster
than the old version (weighted: 3.2 times).

Many thanks to Matthias Kretz for pointing this out!

CCMAIL:kimageshop@kde.org
CCMAIL:kretz@kde.org
parent bdb00b30
...@@ -516,5 +516,97 @@ void KisCompositionBenchmark::benchmarkMemcpy() ...@@ -516,5 +516,97 @@ void KisCompositionBenchmark::benchmarkMemcpy()
freeTiles(tiles, 0, 0); freeTiles(tiles, 0, 0);
} }
/**
 * Benchmarks the direct conversion of a Vc::uint_v vector, loaded from
 * 8-bit data, into a Vc::float_v vector.
 *
 * The body is compiled only when HAVE_VC is defined; otherwise the
 * test is empty.
 */
void KisCompositionBenchmark::benchmarkUintFloat()
{
#ifdef HAVE_VC
    const int vecSize = Vc::float_v::Size;
    const int dataSize = 4096;

    quint8 *iData = static_cast<quint8*>(memalign(vecSize, dataSize));
    float *fData = static_cast<float*>(memalign(vecSize * 4, dataSize * 4));

    // memalign() returns uninitialized memory; give the source buffer
    // defined contents before the timed loop reads from it.
    for (int i = 0; i < dataSize; i++) {
        iData[i] = static_cast<quint8>(i);
    }

    QBENCHMARK {
        for (int i = 0; i < dataSize; i += vecSize) {
            // convert uint -> float directly; this causes the
            // static_cast helper to be called
            Vc::float_v b(Vc::uint_v(iData + i));
            b.store(fData + i);
        }
    }

    free(iData);
    free(fData);
#endif
}
/**
 * Benchmarks the conversion of a Vc::uint_v vector, loaded from 8-bit
 * data, into a Vc::float_v vector through an intermediate Vc::int_v.
 *
 * The body is compiled only when HAVE_VC is defined; otherwise the
 * test is empty.
 */
void KisCompositionBenchmark::benchmarkUintIntFloat()
{
#ifdef HAVE_VC
    const int vecSize = Vc::float_v::Size;
    const int dataSize = 4096;

    quint8 *iData = static_cast<quint8*>(memalign(vecSize, dataSize));
    float *fData = static_cast<float*>(memalign(vecSize * 4, dataSize * 4));

    // memalign() returns uninitialized memory; give the source buffer
    // defined contents before the timed loop reads from it.
    for (int i = 0; i < dataSize; i++) {
        iData[i] = static_cast<quint8>(i);
    }

    QBENCHMARK {
        for (int i = 0; i < dataSize; i += vecSize) {
            // convert uint -> int -> float; this avoids the special
            // sign treatment and gives a 2.6 times speedup
            Vc::float_v b(Vc::int_v(Vc::uint_v(iData + i)));
            b.store(fData + i);
        }
    }

    free(iData);
    free(fData);
#endif
}
/**
 * Benchmarks the direct conversion of a Vc::float_v vector into a
 * Vc::uint_v vector.
 *
 * The body is compiled only when HAVE_VC is defined; otherwise the
 * test is empty.
 */
void KisCompositionBenchmark::benchmarkFloatUint()
{
#ifdef HAVE_VC
    const int vecSize = Vc::float_v::Size;
    const int dataSize = 4096;

    quint32 *iData = static_cast<quint32*>(memalign(vecSize * 4, dataSize * 4));
    float *fData = static_cast<float*>(memalign(vecSize * 4, dataSize * 4));

    // memalign() returns uninitialized memory. Fill the source with
    // small non-negative values so the conversion is never fed
    // arbitrary bit patterns (NaN, negative or huge numbers).
    for (int i = 0; i < dataSize; i++) {
        fData[i] = static_cast<float>(i & 0xFF);
    }

    QBENCHMARK {
        for (int i = 0; i < dataSize; i += vecSize) {
            // conversion float -> uint
            Vc::uint_v b(Vc::float_v(fData + i));
            b.store(iData + i);
        }
    }

    free(iData);
    free(fData);
#endif
}
/**
 * Benchmarks the conversion of a Vc::float_v vector into a Vc::uint_v
 * vector through an intermediate Vc::int_v.
 *
 * The body is compiled only when HAVE_VC is defined; otherwise the
 * test is empty.
 */
void KisCompositionBenchmark::benchmarkFloatIntUint()
{
#ifdef HAVE_VC
    const int vecSize = Vc::float_v::Size;
    const int dataSize = 4096;

    quint32 *iData = static_cast<quint32*>(memalign(vecSize * 4, dataSize * 4));
    float *fData = static_cast<float*>(memalign(vecSize * 4, dataSize * 4));

    // memalign() returns uninitialized memory. Fill the source with
    // small non-negative values so the conversion is never fed
    // arbitrary bit patterns (NaN, negative or huge numbers).
    for (int i = 0; i < dataSize; i++) {
        fData[i] = static_cast<float>(i & 0xFF);
    }

    QBENCHMARK {
        for (int i = 0; i < dataSize; i += vecSize) {
            // conversion float -> int -> uint
            Vc::uint_v b(Vc::int_v(Vc::float_v(fData + i)));
            b.store(iData + i);
        }
    }

    free(iData);
    free(fData);
#endif
}
QTEST_KDEMAIN(KisCompositionBenchmark, GUI) QTEST_KDEMAIN(KisCompositionBenchmark, GUI)
...@@ -43,6 +43,11 @@ private slots: ...@@ -43,6 +43,11 @@ private slots:
void testRgb8CompositeOverReal_Aligned(); void testRgb8CompositeOverReal_Aligned();
void benchmarkMemcpy(); void benchmarkMemcpy();
void benchmarkUintFloat();
void benchmarkUintIntFloat();
void benchmarkFloatUint();
void benchmarkFloatIntUint();
}; };
#endif /* __KIS_COMPOSITION_BENCHMARK_H */ #endif /* __KIS_COMPOSITION_BENCHMARK_H */
...@@ -94,7 +94,7 @@ static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha) { ...@@ -94,7 +94,7 @@ static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha) {
*/ */
inline Vc::float_v fetch_mask_8(const quint8 *data) { inline Vc::float_v fetch_mask_8(const quint8 *data) {
Vc::uint_v data_i(data); Vc::uint_v data_i(data);
return Vc::float_v(data_i); return Vc::float_v(Vc::int_v(data_i));
} }
/** /**
...@@ -118,7 +118,7 @@ inline Vc::float_v fetch_alpha_32(const quint8 *data) { ...@@ -118,7 +118,7 @@ inline Vc::float_v fetch_alpha_32(const quint8 *data) {
data_i.load((const quint32*)data, Vc::Unaligned); data_i.load((const quint32*)data, Vc::Unaligned);
} }
return Vc::float_v(data_i >> 24); return Vc::float_v(Vc::int_v(data_i >> 24));
} }
/** /**
...@@ -148,9 +148,9 @@ inline void fetch_colors_32(const quint8 *data, ...@@ -148,9 +148,9 @@ inline void fetch_colors_32(const quint8 *data,
const quint32 lowByteMask = 0xFF; const quint32 lowByteMask = 0xFF;
Vc::uint_v mask(lowByteMask); Vc::uint_v mask(lowByteMask);
c1 = Vc::float_v((data_i >> 16) & mask); c1 = Vc::float_v(Vc::int_v((data_i >> 16) & mask));
c2 = Vc::float_v((data_i >> 8) & mask); c2 = Vc::float_v(Vc::int_v((data_i >> 8) & mask));
c3 = Vc::float_v( data_i & mask); c3 = Vc::float_v(Vc::int_v( data_i & mask));
} }
/** /**
...@@ -175,11 +175,11 @@ inline void write_channels_32(quint8 *data, ...@@ -175,11 +175,11 @@ inline void write_channels_32(quint8 *data,
const quint32 lowByteMask = 0xFF; const quint32 lowByteMask = 0xFF;
Vc::uint_v mask(lowByteMask); Vc::uint_v mask(lowByteMask);
Vc::uint_v v1 = Vc::uint_v(alpha) << 24; Vc::uint_v v1 = Vc::uint_v(Vc::int_v(alpha)) << 24;
Vc::uint_v v2 = (Vc::uint_v(c1) & mask) << 16; Vc::uint_v v2 = (Vc::uint_v(Vc::int_v(c1)) & mask) << 16;
Vc::uint_v v3 = (Vc::uint_v(c2) & mask) << 8; Vc::uint_v v3 = (Vc::uint_v(Vc::int_v(c2)) & mask) << 8;
v1 = v1 | v2; v1 = v1 | v2;
Vc::uint_v v4 = Vc::uint_v(c3) & mask; Vc::uint_v v4 = Vc::uint_v(Vc::int_v(c3)) & mask;
v3 = v3 | v4; v3 = v3 | v4;
*((Vc::uint_v*)data) = v1 | v3; *((Vc::uint_v*)data) = v1 | v3;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment