/src/qtbase/src/gui/painting/qdrawhelper_ssse3.cpp
Line | Count | Source |
1 | | // Copyright (C) 2018 The Qt Company Ltd. |
2 | | // Copyright (C) 2018 Intel Corporation. |
3 | | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | | // Qt-Security score:significant reason:default |
5 | | |
6 | | #include <private/qdrawhelper_x86_p.h> |
7 | | |
8 | | #if defined(QT_COMPILER_SUPPORTS_SSSE3) |
9 | | |
10 | | #include <private/qdrawingprimitive_sse2_p.h> |
11 | | |
12 | | QT_BEGIN_NAMESPACE |
13 | | |
/* The instruction palignr uses immediate (compile-time) arguments, so we have to generate the
   code for each of the different shifts (4, 8, 12). Checking the alignment inside the loop is
   unfortunately way too slow.

   This macro expects the following names in scope at the expansion site:
     x, src, dst, minusOffsetToAlignSrcOn16Bytes, srcVectorPrevLoaded,
     alphaMask, nullVector, one, half, colorMask, alphaShuffleMask.
   It blends 4 ARGB32 pixels per iteration: src is read with aligned loads and re-aligned
   with palignr, with fast paths when all four source pixels are fully opaque (plain copy)
   or fully transparent (skip).
*/
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
        /* load the next aligned 16 bytes, then combine with the previous load via palignr */ \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            /* all four pixels opaque: straight copy */ \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            /* not fully transparent: result = src + dst * (1 - alpha) */ \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }
35 | | |
36 | | |
// Source-over blend of 'length' premultiplied ARGB32 pixels from src onto dst.
// nullVector, half, one, colorMask, alphaMask are constant across the whole image/texture, and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
static inline void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst, const quint32 *src, int length,
                               __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

    // How many pixels (0..3) src[x] sits past a 16-byte boundary now that dst[x] is aligned.
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;

    if (!minusOffsetToAlignSrcOn16Bytes) {
        /* src is aligned, usual algorithm but with aligned operations.
           See the SSE2 version for more documentation on the algorithm itself. */
        // pshufb mask broadcasting each pixel's alpha byte into the low byte of both of that
        // pixel's 16-bit lanes (the 0xff entries zero the corresponding destination byte).
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        for (; x < length-3; x += 4) {
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                // All four source pixels fully opaque: straight copy.
                _mm_store_si128((__m128i *)&dst[x], srcVector);
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
                // Not fully transparent either: result = s + d * (1 - alpha).
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm_sub_epi16(one, alphaChannel);
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                __m128i destMultipliedByOneMinusAlpha;
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                _mm_store_si128((__m128i *)&dst[x], result);
            }
        } /* end for() */
    } else if ((length - x) >= 8) {
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;  // pixel offset -> byte shift for palignr

        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        // palignr requires an immediate shift count, hence one loop instantiation per offset.
        switch (palignrOffset) {
        case 4:
            BLENDING_LOOP(4, length)
            break;
        case 8:
            BLENDING_LOOP(8, length)
            break;
        case 12:
            BLENDING_LOOP(12, length)
            break;
        }
    }
    // Scalar epilogue for any pixels the vector paths did not cover.
    for (; x < length; ++x)
        blend_pixel(dst[x], src[x]);
}
101 | | |
102 | | void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl, |
103 | | const uchar *srcPixels, int sbpl, |
104 | | int w, int h, |
105 | | int const_alpha) |
106 | 0 | { |
107 | 0 | const quint32 *src = (const quint32 *) srcPixels; |
108 | 0 | quint32 *dst = (quint32 *) destPixels; |
109 | 0 | if (const_alpha == 256) { |
110 | 0 | const __m128i alphaMask = _mm_set1_epi32(0xff000000); |
111 | 0 | const __m128i nullVector = _mm_setzero_si128(); |
112 | 0 | const __m128i half = _mm_set1_epi16(0x80); |
113 | 0 | const __m128i one = _mm_set1_epi16(0xff); |
114 | 0 | const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); |
115 | |
|
116 | 0 | for (int y = 0; y < h; ++y) { |
117 | 0 | BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask); |
118 | 0 | dst = (quint32 *)(((uchar *) dst) + dbpl); |
119 | 0 | src = (const quint32 *)(((const uchar *) src) + sbpl); |
120 | 0 | } |
121 | 0 | } else if (const_alpha != 0) { |
122 | | // dest = (s + d * sia) * ca + d * cia |
123 | | // = s * ca + d * (sia * ca + cia) |
124 | | // = s * ca + d * (1 - sa*ca) |
125 | 0 | const_alpha = (const_alpha * 255) >> 8; |
126 | 0 | const __m128i nullVector = _mm_setzero_si128(); |
127 | 0 | const __m128i half = _mm_set1_epi16(0x80); |
128 | 0 | const __m128i one = _mm_set1_epi16(0xff); |
129 | 0 | const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); |
130 | 0 | const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); |
131 | 0 | for (int y = 0; y < h; ++y) { |
132 | 0 | BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector) |
133 | 0 | dst = (quint32 *)(((uchar *) dst) + dbpl); |
134 | 0 | src = (const quint32 *)(((const uchar *) src) + sbpl); |
135 | 0 | } |
136 | 0 | } |
137 | 0 | } |
138 | | |
139 | | const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer, const uchar *src, int index, int count) |
140 | 0 | { |
141 | 0 | const quint24 *s = reinterpret_cast<const quint24 *>(src); |
142 | 0 | for (int i = 0; i < count; ++i) |
143 | 0 | buffer[i] = s[index + i]; |
144 | 0 | return buffer; |
145 | 0 | } |
146 | | |
147 | | extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len); |
148 | | |
149 | | const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer, const Operator *, const QSpanData *data, |
150 | | int y, int x, int length) |
151 | 0 | { |
152 | 0 | const uchar *line = data->texture.scanLine(y) + x * 3; |
153 | 0 | qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length); |
154 | 0 | return buffer; |
155 | 0 | } |
156 | | |
// Fill 'count' 24-bit (quint24) pixels at 'dest' with 'color'.
// The bulk is written 48 bytes (16 pixels) at a time using three 16-byte stores that
// hold the 3-byte pattern at byte phases 0, 1 and 2; the tail uses smaller, possibly
// overlapping stores (the overlap rewrites bytes with the values they already hold).
void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
{
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
    quint32 v = color;
    __m128i m = _mm_cvtsi32_si128(v);
    quint24 *end = dest + count;

    // pshufb indices selecting bytes 2,1,0 of the color repeatedly (presumably quint24's
    // in-memory byte order — matches the quint24 conversion; TODO confirm against quint24).
    // The table is 17 bytes so the same data can be loaded at offset 0 (phase 0)
    // and offset 1 (phase 1).
    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
    shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };

    __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(reinterpret_cast<const __m128i *>(shuffleMask)));
    __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(reinterpret_cast<const __m128i *>(shuffleMask + 1)));
    __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);   // pattern at phase 2 (for byte offset 32)

    for ( ; dest + 16 <= end; dest += 16) {
#ifdef __AVX__
        // Store using 32-byte AVX instruction
        __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
        mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
        _mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
#else
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 0, mval1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1, mval2);
#endif
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2, mval3);
    }

    if (count < 3) {
        // Tiny fills (0..2 pixels): write them directly, no vector stores.
        if (count > 1)
            end[-2] = v;
        if (count)
            end[-1] = v;
        return;
    }

    // less than 16px/48B left
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;      // remaining bytes; a multiple of 3
    if (left >= 24) {
        // 8px/24B or more left (24 is a multiple of 3, so the phase stays 0 afterwards)
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) + 0, mval1);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr) + 1, mval2);
        ptr += 24;
        left -= 24;
    }

    // less than 8px/24B left

    if (left >= 16) {
        // but more than 5px/15B left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) , mval1);
    } else if (left >= 8) {
        // but more than 2px/6B left
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr), mval1);
    }

    if (left) {
        // 1 or 2px left
        // store 8 bytes ending with the right values (will overwrite a bit)
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
    }
}
221 | | |
// Swap the first and third byte of each 3-byte pixel (R<->B for 24bpp RGB888),
// reading 'count' pixels from src and writing to dst. Handles both the in-place
// case (src == dst) and disjoint buffers.
void QT_FASTCALL rbSwap_888_ssse3(uchar *dst, const uchar *src, int count)
{
    int i = 0;

    // Each mask reverses the bytes within the 3-byte triples of one 16-byte vector.
    // A vector holds 5 whole triples plus part of a sixth; the /*!!*/ entries mark
    // bytes of triples straddling two vectors — those are passed through unchanged
    // and repaired by the scalar std::swap fixups below.
    const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, /*!!*/15);
    const static __m128i shuffleMask2 = _mm_setr_epi8(0, /*!!*/1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, /*!!*/14, 15);
    const static __m128i shuffleMask3 = _mm_setr_epi8(/*!!*/0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);

    // Main loop: 16 pixels (48 bytes) per iteration, three vectors at a time.
    for (; i + 15 < count; i += 16) {
        __m128i s1 = _mm_loadu_si128((const __m128i *)src);
        __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 16));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 32));
        s1 = _mm_shuffle_epi8(s1, shuffleMask1);
        s2 = _mm_shuffle_epi8(s2, shuffleMask2);
        s3 = _mm_shuffle_epi8(s3, shuffleMask3);
        _mm_storeu_si128((__m128i *)dst, s1);
        _mm_storeu_si128((__m128i *)(dst + 16), s2);
        _mm_storeu_si128((__m128i *)(dst + 32), s3);

        // Now fix the last four misplaced values
        std::swap(dst[15], dst[17]);    // triple straddling vectors 1 and 2
        std::swap(dst[30], dst[32]);    // triple straddling vectors 2 and 3

        src += 48;
        dst += 48;
    }

    // Scalar epilogue for the remaining 0..15 pixels.
    if (src != dst) {
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        // In place: only the two outer bytes of each triple need touching.
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}
264 | | |
265 | | QT_END_NAMESPACE |
266 | | |
267 | | #endif // QT_COMPILER_SUPPORTS_SSSE3 |