/*
 *  Copyright (c) 2012 Dmitry Kazakov <dimula73@gmail.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#ifndef __VECTOR_MATH_H
#define __VECTOR_MATH_H


#include "config-vc.h"

#ifdef HAVE_SANE_VC
#include <Vc/Vc>
#include <Vc/IO>
#endif

#include <stdint.h>

#ifndef ALWAYS_INLINE
#if defined __GNUC__
#define ALWAYS_INLINE inline __attribute__((__always_inline__))
#elif defined _MSC_VER
#define ALWAYS_INLINE __forceinline
#else
#define ALWAYS_INLINE inline
#endif
#endif

namespace KoStreamedMath {

/**
 * Composes src into dst without using vector instructions
 */
template<bool useMask, bool useFlow, class Compositor>
    void genericComposite32_novector(const KoCompositeOp::ParameterInfo& params)
{
    using namespace Arithmetic;

    const qint32 linearInc = 4;
    qint32 srcLinearInc = params.srcRowStride ? 4 : 0;

    quint8*       dstRowStart  = params.dstRowStart;
    const quint8* maskRowStart = params.maskRowStart;
    const quint8* srcRowStart  = params.srcRowStart;

    for(quint32 r=params.rows; r>0; --r) {
        const quint8 *mask = maskRowStart;
        const quint8 *src  = srcRowStart;
        quint8       *dst  = dstRowStart;

        int blockRest = params.cols;

        for(int i = 0; i < blockRest; i++) {
            Compositor::template compositeOnePixelScalar<useMask>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
            src += srcLinearInc;
            dst += linearInc;

            if (useMask) {
                mask++;
            }
        }

        srcRowStart  += params.srcRowStride;
        dstRowStart  += params.dstRowStride;

        if (useMask) {
            maskRowStart += params.maskRowStride;
        }
    }
}
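
/*
 * For illustration (a hypothetical call site, not part of this file):
 * a composite op would typically instantiate the template as follows,
 * where MyCompositor supplies the per-pixel math (see the interface
 * sketch near the end of this file):
 *
 *     KoStreamedMath::genericComposite32_novector<true, false, MyCompositor>(params);
 */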

/**
 * Linearly interpolate between two 8-bit values \p a and \p b
 * using a floating-point coefficient \p alpha
 */
static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha) {
    return quint8(qint16(b - a) * alpha + a);
}
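
/*
 * A quick worked example of the helper above: with a = 100, b = 200 and
 * alpha = 0.25f, qint16(b - a) is 100, so the result is
 * quint8(100 * 0.25f + 100) = 125.  Note that the float->int conversion
 * truncates rather than rounds (see the FIXME in write_channels_32).
 */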

#if defined HAVE_SANE_VC

/**
 * Get a vector containing the first Vc::float_v::Size values of the mask.
 * Each source mask element is considered to be an 8-bit integer
 */
inline Vc::float_v fetch_mask_8(const quint8 *data) {
    Vc::uint_v data_i(data);
    return Vc::float_v(data_i);
}
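
/*
 * For example, if the first Vc::float_v::Size mask bytes are
 * {0, 128, 255, ...}, the lanes of the returned vector are
 * {0.0f, 128.0f, 255.0f, ...}.
 */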

/**
 * Get alpha values from Vc::float_v::Size pixels, 32 bits each
 * (4 channels, 8 bits per channel).  The alpha value is considered
 * to be stored in the most significant byte of the pixel
 *
 * \p aligned controls whether \p data is fetched using an aligned
 *            instruction or not.
 *            1) Fetching aligned data with unaligned instruction
 *               degrades performance.
 *            2) Fetching unaligned data with aligned instruction
 *               causes #GP (General Protection Exception)
 */
template <bool aligned>
inline Vc::float_v fetch_alpha_32(const quint8 *data) {
    Vc::uint_v data_i;
    if (aligned) {
        data_i.load((const quint32*)data, Vc::Aligned);
    } else {
        data_i.load((const quint32*)data, Vc::Unaligned);
    }

    return Vc::float_v(data_i >> 24);
}
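
/*
 * For example, for a pixel stored as the 32-bit value 0x80204060, the
 * most significant byte is 0x80, so the corresponding lane of the
 * result is 128.0f.
 */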

/**
 * Get color values from Vc::float_v::Size pixels, 32 bits each
 * (4 channels, 8 bits per channel).  The color data is considered
 * to be stored in the 3 least significant bytes of each pixel.
 *
 * \p aligned controls whether \p data is fetched using an aligned
 *            instruction or not.
 *            1) Fetching aligned data with unaligned instruction
 *               degrades performance.
 *            2) Fetching unaligned data with aligned instruction
 *               causes #GP (General Protection Exception)
 */
template <bool aligned>
inline void fetch_colors_32(const quint8 *data,
                            Vc::float_v &c1,
                            Vc::float_v &c2,
                            Vc::float_v &c3) {
    Vc::uint_v data_i;
    if (aligned) {
        data_i.load((const quint32*)data, Vc::Aligned);
    } else {
        data_i.load((const quint32*)data, Vc::Unaligned);
    }

    const quint32 lowByteMask = 0xFF;
    Vc::uint_v mask(lowByteMask);

    c1 = Vc::float_v((data_i >> 16) & mask);
    c2 = Vc::float_v((data_i >> 8)  & mask);
    c3 = Vc::float_v( data_i        & mask);
}
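
/*
 * Continuing the example above: for a pixel stored as the 32-bit value
 * 0x80204060 the corresponding lanes are c1 = 32.0f (0x20),
 * c2 = 64.0f (0x40) and c3 = 96.0f (0x60).
 */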

/**
 * Pack color and alpha values into Vc::float_v::Size pixels, 32 bits each
 * (4 channels, 8 bits per channel).  The color data is stored in the
 * 3 least significant bytes of each pixel, the alpha in the most
 * significant byte
 *
 * NOTE: \p data must be an aligned pointer!
 */
inline void write_channels_32(quint8 *data,
                              Vc::float_v alpha,
                              Vc::float_v c1,
                              Vc::float_v c2,
                              Vc::float_v c3) {

    /**
     * FIXME: make the float->int conversion
     * use mathematical rounding
     */

    const quint32 lowByteMask = 0xFF;
    Vc::uint_v mask(lowByteMask);

    Vc::uint_v v1 = Vc::uint_v(alpha) << 24;
    Vc::uint_v v2 = (Vc::uint_v(c1) & mask) << 16;
    Vc::uint_v v3 = (Vc::uint_v(c2) & mask) <<  8;
    v1 = v1 | v2;
    Vc::uint_v v4 = Vc::uint_v(c3) & mask;
    v3 = v3 | v4;

    *((Vc::uint_v*)data) = v1 | v3;
}
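
/*
 * A minimal usage sketch of the fetch/write helpers above (illustrative
 * only; 'data' is assumed to be a vector-aligned pointer into 32-bit
 * pixel data):
 *
 *     Vc::float_v alpha = fetch_alpha_32<true>(data);
 *     Vc::float_v c1, c2, c3;
 *     fetch_colors_32<true>(data, c1, c2, c3);
 *     alpha = alpha * 0.5f; // e.g. apply 50% opacity
 *     write_channels_32(data, alpha, c1, c2, c3);
 */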

/**
 * Composes src pixels into dst pixels. Optimized for 32-bit-per-pixel
 * colorspaces. Uses the \p Compositor strategy parameter to do the
 * actual math of the composition
 */
template<bool useMask, bool useFlow, class Compositor>
    void genericComposite32(const KoCompositeOp::ParameterInfo& params)
{
    using namespace Arithmetic;

    const int vectorSize = Vc::float_v::Size;
    const qint32 vectorInc = 4 * vectorSize;
    const qint32 linearInc = 4;
    qint32 srcVectorInc = vectorInc;
    qint32 srcLinearInc = 4;

    quint8*       dstRowStart  = params.dstRowStart;
    const quint8* maskRowStart = params.maskRowStart;
    const quint8* srcRowStart  = params.srcRowStart;

    if (!params.srcRowStride) {
        // Zero stride means src is a single constant pixel: replicate it
        // into an aligned vector-sized buffer and stop advancing src
        quint32 *buf = Vc::malloc<quint32, Vc::AlignOnVector>(vectorSize);
        *((Vc::uint_v*)buf) = Vc::uint_v(*((const quint32*)params.srcRowStart));
        srcRowStart = reinterpret_cast<quint8*>(buf);
        srcLinearInc = 0;
        srcVectorInc = 0;
    }

    for(quint32 r=params.rows; r>0; --r) {
        // Hint: Mask is allowed to be unaligned
        const quint8 *mask = maskRowStart;

        const quint8 *src  = srcRowStart;
        quint8       *dst  = dstRowStart;

        const int pixelsAlignmentMask = vectorInc - 1;
        uintptr_t srcPtrValue = reinterpret_cast<uintptr_t>(src);
        uintptr_t dstPtrValue = reinterpret_cast<uintptr_t>(dst);
        uintptr_t srcAlignment = srcPtrValue & pixelsAlignmentMask;
        uintptr_t dstAlignment = dstPtrValue & pixelsAlignmentMask;

        // Uncomment if facing problems with alignment:
        // Q_ASSERT_X(!(dstAlignment & 3), "Compositioning",
        //            "Pixel data must be aligned on pixels borders!");

        // Split the row into three parts: a scalar head that brings dst
        // to vector alignment, a vectorized middle, and a scalar tail
        int blockAlign = params.cols;
        int blockAlignedVector = 0;
        int blockUnalignedVector = 0;
        int blockRest = 0;

        // Aligned vector loads can only be used for src when src and dst
        // share the same misalignment (or when src never advances)
        int *vectorBlock =
            srcAlignment == dstAlignment || !srcVectorInc ?
            &blockAlignedVector : &blockUnalignedVector;

        if (!dstAlignment) {
            blockAlign = 0;
            *vectorBlock = params.cols / vectorSize;
            blockRest = params.cols % vectorSize;
        } else if (params.cols > 2 * vectorSize) {
            // Process pixels up to the next vector boundary one by one,
            // then vectorize the rest of the row
            blockAlign = (vectorInc - dstAlignment) / 4;
            const int restCols = params.cols - blockAlign;
            *vectorBlock = restCols / vectorSize;
            blockRest = restCols % vectorSize;
        }

        // Scalar head: composite single pixels until dst is vector-aligned
        for(int i = 0; i < blockAlign; i++) {
            Compositor::template compositeOnePixelScalar<useMask>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
            src += srcLinearInc;
            dst += linearInc;

            if(useMask) {
                mask++;
            }
        }

        // Vectorized middle: both src and dst loads are aligned
        for (int i = 0; i < blockAlignedVector; i++) {
            Compositor::template compositeVector<useMask, true>(src, dst, mask, params.opacity, params.flow);
            src += srcVectorInc;
            dst += vectorInc;

            if (useMask) {
                mask += vectorSize;
            }
        }

        // Vectorized middle: src loads are unaligned, dst is aligned
        for (int i = 0; i < blockUnalignedVector; i++) {
            Compositor::template compositeVector<useMask, false>(src, dst, mask, params.opacity, params.flow);
            src += srcVectorInc;
            dst += vectorInc;

            if (useMask) {
                mask += vectorSize;
            }
        }


        // Scalar tail: composite the pixels left over after the vector loops
        for(int i = 0; i < blockRest; i++) {
            Compositor::template compositeOnePixelScalar<useMask>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
            src += srcLinearInc;
            dst += linearInc;

            if (useMask) {
                mask++;
            }
        }

        srcRowStart  += params.srcRowStride;
        dstRowStart  += params.dstRowStride;

        if (useMask) {
            maskRowStart += params.maskRowStride;
        }
    }

    // Free the temporary buffer allocated for the constant src pixel
    if (!params.srcRowStride) {
        Vc::free<float>(reinterpret_cast<float*>(const_cast<quint8*>(srcRowStart)));
    }
}
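
/*
 * A sketch of the interface a Compositor strategy is expected to
 * provide, reconstructed from the call sites above (the struct name is
 * illustrative; the exact parameter types must match the arguments
 * taken from KoCompositeOp::ParameterInfo):
 *
 *     struct SomeCompositor {
 *         template<bool useMask>
 *         static ALWAYS_INLINE void compositeOnePixelScalar(
 *             const quint8 *src, quint8 *dst, const quint8 *mask,
 *             float opacity, float flow, const QBitArray &channelFlags);
 *
 *         template<bool useMask, bool aligned>
 *         static ALWAYS_INLINE void compositeVector(
 *             const quint8 *src, quint8 *dst, const quint8 *mask,
 *             float opacity, float flow);
 *     };
 */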

#else /* if ! defined HAVE_SANE_VC */

/**
 * Fall back to the scalar version of the composition.
 *
 * Don't use this method! The scalar floating-point version of the
 * algorithm is up to 2 times slower than the basic integer
 * implementation! Use another composite op instead!
 */

template<bool useMask, bool useFlow, class Compositor>
    void genericComposite32(const KoCompositeOp::ParameterInfo& params)
{
    genericComposite32_novector<useMask, useFlow, Compositor>(params);
}

#endif /* HAVE_SANE_VC */

} // namespace KoStreamedMath

#endif /* __VECTOR_MATH_H */