Commit 0b4c3f93 authored by Dmitry Kazakov's avatar Dmitry Kazakov Committed by Thorsten Zachmann

Optimize autobrush mask generation by 15-20%

Just skip areas with n > 1. They are outside the brush bounds, so
there is no reason to work with them.

And yes, it gives something like 15% better performance :)

(cherry picked from commit 282d2f0c)
parent 5d59709c
......@@ -52,20 +52,38 @@ void KisMaskGeneratorBenchmark::benchmarkCircle()
}
}
#include <KoColorSpace.h>
#include <KoColorSpaceRegistry.h>
#include "kis_fixed_paint_device.h"
#include "kis_types.h"
#include "kis_brush_mask_applicator_base.h"
#include "krita_utils.h"
void KisMaskGeneratorBenchmark::benchmarkSIMD()
{
#ifdef HAVE_VC
int width = 1000;
float *buffer = Vc::malloc<float, Vc::AlignOnVector>(width);
const KoColorSpace * cs = KoColorSpaceRegistry::instance()->rgb8();
KisFixedPaintDeviceSP dev = new KisFixedPaintDevice(cs);
dev->setRect(QRect(0, 0, 100, 100));
dev->initialize();
MaskProcessingData data(dev, cs,
0.0, 1.0,
50, 50, 0);
KisCircleMaskGenerator gen(100, 0.5, 0.5, 0.5, 2, false);
KisBrushMaskApplicatorBase *applicator = gen.applicator();
applicator->initializeData(&data);
QVector<QRect> rects = KritaUtils::splitRectIntoPatches(dev->bounds(), QSize(63, 63));
KisCircleMaskGenerator gen(1000, 0.5, 0.5, 0.5, 2, true);
QBENCHMARK{
for(int y = 0; y < 1000; ++y)
{
// gen.processRowFast(buffer, width, y, 0.0f, 1.0f, 500.0f, 500.0f, 0.5f, 0.5f);
Q_FOREACH (const QRect &rc, rects) {
applicator->process(rc);
}
}
Vc::free(buffer);
#endif
}
......
......@@ -94,22 +94,33 @@ FastRowProcessor::process<Vc::CurrentImplementation::current()>(float* buffer, i
Vc::float_v yr = x_ * vSina + vCosaY_;
Vc::float_v n = pow2(xr * vXCoeff) + pow2(yr * vYCoeff);
Vc::float_m outsideMask = n > vOne;
if (useSmoothing) {
xr = Vc::abs(xr) + vOne;
yr = Vc::abs(yr) + vOne;
}
if (!outsideMask.isFull()) {
if (useSmoothing) {
xr = Vc::abs(xr) + vOne;
yr = Vc::abs(yr) + vOne;
}
Vc::float_v vNormFade = pow2(xr * vTransformedFadeX) + pow2(yr * vTransformedFadeY);
Vc::float_v vNormFade = pow2(xr * vTransformedFadeX) + pow2(yr * vTransformedFadeY);
//255 * n * (normeFade - 1) / (normeFade - n)
Vc::float_v vFade = n * (vNormFade - vOne) / (vNormFade - n);
//255 * n * (normeFade - 1) / (normeFade - n)
Vc::float_v vFade = n * (vNormFade - vOne) / (vNormFade - n);
// Mask out the inner circle of the mask
Vc::float_m mask = vNormFade < vOne;
vFade.setZero(mask);
vFade = Vc::min(vFade, vOne);
// Mask in the inner circle of the mask
Vc::float_m mask = vNormFade < vOne;
vFade.setZero(mask);
// Mask out the outer circle of the mask
vFade(outsideMask) = vOne;
vFade.store(bufferPointer, Vc::Aligned);
} else {
// Mask out everything outside the circle
vOne.store(bufferPointer, Vc::Aligned);
}
vFade.store(bufferPointer);
currentIndices = currentIndices + increment;
bufferPointer += Vc::float_v::Size;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment