/*
 *  SPDX-FileCopyrightText: 2021 Dmitry Kazakov <dimula73@gmail.com>
 *
 *  SPDX-License-Identifier: GPL-2.0-or-later
 */

#ifndef KoOptimizedRgbPixelDataScalerU8ToU16_H
#define KoOptimizedRgbPixelDataScalerU8ToU16_H

#include "KoOptimizedRgbPixelDataScalerU8ToU16Base.h"

#include "KoVcMultiArchBuildSupport.h"
#include "kis_debug.h"

#include <x86intrin.h>


template<Vc::Implementation _impl>
class KoOptimizedRgbPixelDataScalerU8ToU16 : public KoOptimizedRgbPixelDataScalerU8ToU16Base
{
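    /**
     * Convert 8-bit RGBA pixel data to 16-bit by replicating each byte
     * into the high byte (multiplication by 257), so that 255 maps
     * exactly to 65535.
     */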
    void convertU8ToU16(const quint8 *src, int srcRowStride,
                        quint8 *dst, int dstRowStride,
                        int numRows, int numColumns) const override
    {
        const int numColorChannels = 4 * numColumns;

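        // split the row into the widest available SIMD blocks plus a
        // scalar tail, so the vector loops never overrun the row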
#if defined __AVX2__
        const int channelsPerAvx2Block = 16;
        const int channelsPerSse2Block = 8;
        const int avx2Block = numColorChannels / channelsPerAvx2Block;
        const int rest = numColorChannels % channelsPerAvx2Block;
        const int sse2Block = rest / channelsPerSse2Block;
        const int scalarBlock = rest % channelsPerSse2Block;
#elif defined __SSE4_1__
        const int channelsPerSse2Block = 8;
        const int avx2Block = 0;
        const int sse2Block = numColorChannels / channelsPerSse2Block;
        const int scalarBlock = numColorChannels % channelsPerSse2Block;
#else
        const int avx2Block = 0;
        const int sse2Block = 0;
        const int scalarBlock = numColorChannels;
#endif

        //qWarning() << ppVar(avx2Block) << ppVar(sse2Block);

        for (int row = 0; row < numRows; row++) {

            const quint8 *srcPtr = src;
            quint16 *dstPtr = reinterpret_cast<quint16*>(dst);

#ifdef __AVX2__
            for (int i = 0; i < avx2Block; i++) {
                __m128i x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcPtr));

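                // zero-extend 16 x u8 to u16, then replicate each byte
                // into the high byte: v | (v << 8) == v * 257, which maps
                // 255 exactly to 65535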
                __m256i y = _mm256_cvtepu8_epi16(x);
                __m256i y_shifted = _mm256_slli_epi16(y, 8);
                y = _mm256_or_si256(y, y_shifted);

                _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstPtr), y);

                srcPtr += channelsPerAvx2Block;
                dstPtr += channelsPerAvx2Block;
            }
#else
            Q_UNUSED(avx2Block);
#endif

#ifdef __SSE4_1__
            for (int i = 0; i < sse2Block; i++) {
                __m128i x = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(srcPtr));

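                // same widen-and-replicate as the AVX2 loop above,
                // 8 channels per iteration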
                __m128i y = _mm_cvtepu8_epi16(x);
                __m128i y_shifted = _mm_slli_epi16(y, 8);
                y = _mm_or_si128(y, y_shifted);

                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstPtr), y);

                srcPtr += channelsPerSse2Block;
                dstPtr += channelsPerSse2Block;
            }
#else
            Q_UNUSED(sse2Block);
#endif

            for (int i = 0; i < scalarBlock; i++) {
                const quint16 value = *srcPtr;

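                // v | (v << 8) == v * 257, scaling 0..255 to 0..65535 exactly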
                *dstPtr = value | (value << 8);

                srcPtr++;
                dstPtr++;
            }


            src += srcRowStride;
            dst += dstRowStride;
        }
    }

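    /**
     * Convert 16-bit RGBA pixel data back to 8-bit via a rounding
     * division by 257, exactly inverting convertU8ToU16().
     */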
    void convertU16ToU8(const quint8 *src, int srcRowStride,
                        quint8 *dst, int dstRowStride,
                        int numRows, int numColumns) const override
    {
        const int numColorChannels = 4 * numColumns;

#if defined __AVX2__
        const int channelsPerAvx2Block = 32;
        const int channelsPerSse2Block = 16;
        const int avx2Block = numColorChannels / channelsPerAvx2Block;
        const int rest = numColorChannels % channelsPerAvx2Block;
        const int sse2Block = rest / channelsPerSse2Block;
        const int scalarBlock = rest % channelsPerSse2Block;

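        // the +128 bias rounds the division by 257 to nearest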
        __m256i offset1 = _mm256_set1_epi16(128);
        __m128i offset2 = _mm_set1_epi16(128);

#elif defined __SSE2__
        const int channelsPerSse2Block = 16;
        const int avx2Block = 0;
        const int sse2Block = numColorChannels / channelsPerSse2Block;
        const int scalarBlock = numColorChannels % channelsPerSse2Block;

        __m128i offset2 = _mm_set1_epi16(128);
#else
        const int avx2Block = 0;
        const int sse2Block = 0;
        const int scalarBlock = numColorChannels;
#endif

        //qWarning() << ppVar(avx2Block) << ppVar(sse2Block);

        for (int row = 0; row < numRows; row++) {

            const quint16 *srcPtr = reinterpret_cast<const quint16*>(src);
            quint8 *dstPtr = dst;

#ifdef __AVX2__
            for (int i = 0; i < avx2Block; i++) {

                __m256i x1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcPtr));
                __m256i x2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcPtr + 16));

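                // (v - (v >> 8) + 128) >> 8 is a rounding division by 257,
                // exactly inverting the u8 -> u16 expansion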
                __m256i x1_shifted = _mm256_srli_epi16(x1, 8);
                __m256i x2_shifted = _mm256_srli_epi16(x2, 8);

                x1 = _mm256_sub_epi16(x1, x1_shifted);
                x1 = _mm256_add_epi16(x1, offset1);
                x1 = _mm256_srli_epi16(x1, 8);

                x2 = _mm256_sub_epi16(x2, x2_shifted);
                x2 = _mm256_add_epi16(x2, offset1);
                x2 = _mm256_srli_epi16(x2, 8);

                x1 = _mm256_packus_epi16(x1, x2);

                // AVX2's _mm256_packus_epi16 packs within each 128-bit
                // lane (unlike its SSE2 counterpart), so the 64-bit
                // blocks must be permuted back into linear order
                x1 = _mm256_permute4x64_epi64(x1, 0xd8);

                _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstPtr), x1);

                srcPtr += channelsPerAvx2Block;
                dstPtr += channelsPerAvx2Block;
            }
#else
            Q_UNUSED(avx2Block);
#endif

#ifdef __SSE2__
            for (int i = 0; i < sse2Block; i++) {
                __m128i x1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcPtr));
                __m128i x2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcPtr + 8));

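                // same subtract/round/shift sequence as the AVX2 loop,
                // 16 channels per iteration; SSE2 _mm_packus_epi16 keeps
                // the results in order, so no permute is needed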
                __m128i x1_shifted = _mm_srli_epi16(x1, 8);
                __m128i x2_shifted = _mm_srli_epi16(x2, 8);

                x1 = _mm_sub_epi16(x1, x1_shifted);
                x1 = _mm_add_epi16(x1, offset2);
                x1 = _mm_srli_epi16(x1, 8);

                x2 = _mm_sub_epi16(x2, x2_shifted);
                x2 = _mm_add_epi16(x2, offset2);
                x2 = _mm_srli_epi16(x2, 8);

                x1 = _mm_packus_epi16(x1, x2);

                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstPtr), x1);

                srcPtr += channelsPerSse2Block;
                dstPtr += channelsPerSse2Block;
            }
#else
            Q_UNUSED(sse2Block);
#endif

            for (int i = 0; i < scalarBlock; i++) {
                const quint16 value = *srcPtr;

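                // same rounding division by 257 as in the vector loops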
                *dstPtr = (value - (value >> 8) + 128) >> 8;

                srcPtr++;
                dstPtr++;
            }


            src += srcRowStride;
            dst += dstRowStride;
        }
    }
};

#endif // KoOptimizedRgbPixelDataScalerU8ToU16_H