Commit 8dc92f7d authored by Daria Shcherbatyuk's avatar Daria Shcherbatyuk Committed by Dmitry Kazakov

Using Vc::reciprocal() and SSE instead of division in OverCompositor32

Summary:
Instead of performing a regular division, we can compute the reciprocal
of the divisor (1 / divisor) and multiply it by the dividend of the
original fraction. Together, the use of Vc::reciprocal() and of SSE
intrinsics gives us a speedup of up to 30%.

{F3862276}
{F3862275}.

Reviewers: dkazakov, #krita
Reviewed By: dkazakov, #krita
Subscribers: alvinhochun, woltherav
Tags: #krita

Differential Revision: https://phabricator.kde.org/D7314
parent bd864d90
...@@ -13,7 +13,7 @@ include_directories(SYSTEM ...@@ -13,7 +13,7 @@ include_directories(SYSTEM
set(LINK_VC_LIB) set(LINK_VC_LIB)
if(HAVE_VC) if(HAVE_VC)
include_directories(${Vc_INCLUDE_DIR}) include_directories(${Vc_INCLUDE_DIR})
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Vc_DEFINITIONS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Vc_DEFINITIONS}")
set(LINK_VC_LIB ${Vc_LIBRARIES}) set(LINK_VC_LIB ${Vc_LIBRARIES})
endif() endif()
...@@ -38,7 +38,7 @@ set(kis_mask_generator_benchmark_SRCS kis_mask_generator_benchmark.cpp) ...@@ -38,7 +38,7 @@ set(kis_mask_generator_benchmark_SRCS kis_mask_generator_benchmark.cpp)
set(kis_low_memory_benchmark_SRCS kis_low_memory_benchmark.cpp) set(kis_low_memory_benchmark_SRCS kis_low_memory_benchmark.cpp)
set(kis_filter_selections_benchmark_SRCS kis_filter_selections_benchmark.cpp) set(kis_filter_selections_benchmark_SRCS kis_filter_selections_benchmark.cpp)
if (UNIX) if (UNIX)
#set(kis_composition_benchmark_SRCS kis_composition_benchmark.cpp) set(kis_composition_benchmark_SRCS kis_composition_benchmark.cpp)
endif() endif()
set(kis_thumbnail_benchmark_SRCS kis_thumbnail_benchmark.cpp) set(kis_thumbnail_benchmark_SRCS kis_thumbnail_benchmark.cpp)
...@@ -59,7 +59,7 @@ krita_add_benchmark(KisMaskGeneratorBenchmark TESTNAME krita-benchmarks-KisMaskG ...@@ -59,7 +59,7 @@ krita_add_benchmark(KisMaskGeneratorBenchmark TESTNAME krita-benchmarks-KisMaskG
krita_add_benchmark(KisLowMemoryBenchmark TESTNAME krita-benchmarks-KisLowMemory ${kis_low_memory_benchmark_SRCS}) krita_add_benchmark(KisLowMemoryBenchmark TESTNAME krita-benchmarks-KisLowMemory ${kis_low_memory_benchmark_SRCS})
krita_add_benchmark(KisFilterSelectionsBenchmark TESTNAME krita-image-KisFilterSelectionsBenchmark ${kis_filter_selections_benchmark_SRCS}) krita_add_benchmark(KisFilterSelectionsBenchmark TESTNAME krita-image-KisFilterSelectionsBenchmark ${kis_filter_selections_benchmark_SRCS})
if(UNIX) if(UNIX)
#krita_add_benchmark(KisCompositionBenchmark TESTNAME krita-benchmarks-KisComposition ${kis_composition_benchmark_SRCS}) krita_add_benchmark(KisCompositionBenchmark TESTNAME krita-benchmarks-KisComposition ${kis_composition_benchmark_SRCS})
endif() endif()
krita_add_benchmark(KisThumbnailBenchmark TESTNAME krita-benchmarks-KisThumbnail ${kis_thumbnail_benchmark_SRCS}) krita_add_benchmark(KisThumbnailBenchmark TESTNAME krita-benchmarks-KisThumbnail ${kis_thumbnail_benchmark_SRCS})
...@@ -80,10 +80,10 @@ target_link_libraries(KisLowMemoryBenchmark kritaimage Qt5::Test) ...@@ -80,10 +80,10 @@ target_link_libraries(KisLowMemoryBenchmark kritaimage Qt5::Test)
target_link_libraries(KisFilterSelectionsBenchmark kritaimage Qt5::Test) target_link_libraries(KisFilterSelectionsBenchmark kritaimage Qt5::Test)
if(UNIX) if(UNIX)
#target_link_libraries(KisCompositionBenchmark kritaimage Qt5::Test ${LINK_VC_LIB}) target_link_libraries(KisCompositionBenchmark kritaimage Qt5::Test ${LINK_VC_LIB})
#if(HAVE_VC) if(HAVE_VC)
# set_property(TARGET KisCompositionBenchmark APPEND PROPERTY COMPILE_OPTIONS "${Vc_ARCHITECTURE_FLAGS}") set_property(TARGET KisCompositionBenchmark APPEND PROPERTY COMPILE_OPTIONS "${Vc_ARCHITECTURE_FLAGS}")
#endif() endif()
endif() endif()
target_link_libraries(KisMaskGeneratorBenchmark kritaimage Qt5::Test) target_link_libraries(KisMaskGeneratorBenchmark kritaimage Qt5::Test)
target_link_libraries(KisThumbnailBenchmark kritaimage Qt5::Test) target_link_libraries(KisThumbnailBenchmark kritaimage Qt5::Test)
......
...@@ -26,6 +26,38 @@ ...@@ -26,6 +26,38 @@
#include "KoStreamedMath.h" #include "KoStreamedMath.h"
/**
 * Division helpers that replace the `/` operator with a
 * "multiply by the reciprocal" sequence whenever __SSE__ is defined,
 * and fall back to a plain division otherwise.
 *
 * The template parameter _impl is not referenced by any member below.
 * NOTE(review): presumably it only ties the helper to the Vc
 * implementation the caller is compiled for -- confirm.
 */
template<Vc::Implementation _impl>
struct OptiDiv {
    /**
     * Scalar quotient of divident by divisor.
     *
     * On the SSE path the quotient is built from _mm_rcp_ss(), which
     * yields an approximation of 1/divisor, so the result may differ
     * slightly from an exact division.
     */
    static ALWAYS_INLINE float divScalar(const float& divident, const float& divisor) {
#ifdef __SSE__
        // Work in the lowest lane only: rcp(divisor) * divident.
        const __m128 inverse = _mm_rcp_ss(_mm_set_ss(divisor));
        const __m128 quotient = _mm_mul_ss(inverse, _mm_set_ss(divident));

        float result;
        _mm_store_ss(&result, quotient);
        return result;
#else
        return divident / divisor;
#endif
    }

    /**
     * Per-lane quotient of divident by divisor.
     *
     * On the SSE path the division is expressed as a multiplication by
     * Vc::reciprocal(divisor).
     * NOTE(review): Vc::reciprocal() is presumably an approximation as
     * well -- confirm against the Vc documentation.
     */
    static ALWAYS_INLINE Vc::float_v divVector(Vc::float_v::AsArg divident, Vc::float_v::AsArg divisor) {
#ifdef __SSE__
        const Vc::float_v inverse = Vc::reciprocal(divisor);
        return divident * inverse;
#else
        return divident / divisor;
#endif
    }
};
template<typename channels_type, typename pixel_type, bool alphaLocked, bool allChannelsFlag> template<typename channels_type, typename pixel_type, bool alphaLocked, bool allChannelsFlag>
struct OverCompositor32 { struct OverCompositor32 {
struct OptionalParams { struct OptionalParams {
...@@ -97,7 +129,11 @@ struct OverCompositor32 { ...@@ -97,7 +129,11 @@ struct OverCompositor32 {
* be converted to zeroes, which is exactly what we need * be converted to zeroes, which is exactly what we need
*/ */
new_alpha = dst_alpha + (uint8Max - dst_alpha) * src_alpha * uint8MaxRec1; new_alpha = dst_alpha + (uint8Max - dst_alpha) * src_alpha * uint8MaxRec1;
src_blend = src_alpha / new_alpha;
// Optimized version of:
// src_blend = src_alpha / new_alpha;
src_blend = OptiDiv<_impl>::divVector(src_alpha, new_alpha);
} }
if (!(src_blend == oneValue).isFull()) { if (!(src_blend == oneValue).isFull()) {
...@@ -156,7 +192,10 @@ struct OverCompositor32 { ...@@ -156,7 +192,10 @@ struct OverCompositor32 {
} }
} else { } else {
dstAlpha += (uint8Max - dstAlpha) * srcAlpha * uint8Rec1; dstAlpha += (uint8Max - dstAlpha) * srcAlpha * uint8Rec1;
srcBlendNorm = srcAlpha / dstAlpha; // Optimized version of:
// srcBlendNorm = srcAlpha / dstAlpha;
srcBlendNorm = OptiDiv<_impl>::divScalar(srcAlpha, dstAlpha);
} }
if(allChannelsFlag) { if(allChannelsFlag) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment