Coverage Report

Created: 2026-04-01 07:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/qtbase/src/gui/painting/qdrawhelper_ssse3.cpp
Line
Count
Source
1
// Copyright (C) 2018 The Qt Company Ltd.
2
// Copyright (C) 2018 Intel Corporation.
3
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
// Qt-Security score:significant reason:default
5
6
#include <private/qdrawhelper_x86_p.h>
7
8
#if defined(QT_COMPILER_SUPPORTS_SSSE3)
9
10
#include <private/qdrawingprimitive_sse2_p.h>
11
12
QT_BEGIN_NAMESPACE
13
14
/* The instruction palignr uses direct arguments, so we have to generate the code for the different
15
   shift (4, 8, 12). Checking the alignment inside the loop is unfortunately way too slow.
16
 */
17
#define BLENDING_LOOP(palignrOffset, length)\
18
0
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
19
0
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
20
0
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
21
0
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
22
0
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
23
0
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
24
0
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
25
0
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
26
0
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
27
0
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
28
0
            __m128i destMultipliedByOneMinusAlpha; \
29
0
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
30
0
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
31
0
            _mm_store_si128((__m128i *)&dst[x], result); \
32
0
        } \
33
0
        srcVectorPrevLoaded = srcVectorLastLoaded;\
34
0
    }
35
36
37
// Basically blend src over dst with the const alpha defined as constAlphaVector.
38
// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:
39
//const __m128i nullVector = _mm_set1_epi32(0);
40
//const __m128i half = _mm_set1_epi16(0x80);
41
//const __m128i one = _mm_set1_epi16(0xff);
42
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
43
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
44
//
45
// The computation being done is:
46
// result = s + d * (1-alpha)
47
// with shortcuts if fully opaque or fully transparent.
48
static inline void Q_DECL_VECTORCALL
49
BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst, const quint32 *src, int length,
50
                               __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
51
0
{
52
0
    int x = 0;
53
54
    /* First, get dst aligned. */
55
0
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
56
0
        blend_pixel(dst[x], src[x]);
57
0
    }
58
59
0
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;
60
61
0
    if (!minusOffsetToAlignSrcOn16Bytes) {
62
        /* src is aligned, usual algorithm but with aligned operations.
63
           See the SSE2 version for more documentation on the algorithm itself. */
64
0
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
65
0
        for (; x < length-3; x += 4) {
66
0
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]);
67
0
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
68
0
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
69
0
                _mm_store_si128((__m128i *)&dst[x], srcVector);
70
0
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
71
0
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
72
0
                alphaChannel = _mm_sub_epi16(one, alphaChannel);
73
0
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
74
0
                __m128i destMultipliedByOneMinusAlpha;
75
0
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
76
0
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
77
0
                _mm_store_si128((__m128i *)&dst[x], result);
78
0
            }
79
0
        } /* end for() */
80
0
    } else if ((length - x) >= 8) {
81
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */
82
0
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
83
0
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;
84
85
0
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
86
0
        switch (palignrOffset) {
87
0
        case 4:
88
0
            BLENDING_LOOP(4, length)
89
0
            break;
90
0
        case 8:
91
0
            BLENDING_LOOP(8, length)
92
0
            break;
93
0
        case 12:
94
0
            BLENDING_LOOP(12, length)
95
0
            break;
96
0
        }
97
0
    }
98
0
    for (; x < length; ++x)
99
0
        blend_pixel(dst[x], src[x]);
100
0
}
101
102
void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
103
                                     const uchar *srcPixels, int sbpl,
104
                                     int w, int h,
105
                                     int const_alpha)
106
0
{
107
0
    const quint32 *src = (const quint32 *) srcPixels;
108
0
    quint32 *dst = (quint32 *) destPixels;
109
0
    if (const_alpha == 256) {
110
0
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
111
0
        const __m128i nullVector = _mm_setzero_si128();
112
0
        const __m128i half = _mm_set1_epi16(0x80);
113
0
        const __m128i one = _mm_set1_epi16(0xff);
114
0
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
115
116
0
        for (int y = 0; y < h; ++y) {
117
0
            BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
118
0
            dst = (quint32 *)(((uchar *) dst) + dbpl);
119
0
            src = (const quint32 *)(((const uchar *) src) + sbpl);
120
0
        }
121
0
    } else if (const_alpha != 0) {
122
        // dest = (s + d * sia) * ca + d * cia
123
        //      = s * ca + d * (sia * ca + cia)
124
        //      = s * ca + d * (1 - sa*ca)
125
0
        const_alpha = (const_alpha * 255) >> 8;
126
0
        const __m128i nullVector = _mm_setzero_si128();
127
0
        const __m128i half = _mm_set1_epi16(0x80);
128
0
        const __m128i one = _mm_set1_epi16(0xff);
129
0
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
130
0
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
131
0
        for (int y = 0; y < h; ++y) {
132
0
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
133
0
            dst = (quint32 *)(((uchar *) dst) + dbpl);
134
0
            src = (const quint32 *)(((const uchar *) src) + sbpl);
135
0
        }
136
0
    }
137
0
}
138
139
const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer, const uchar *src, int index, int count)
140
0
{
141
0
    const quint24 *s = reinterpret_cast<const quint24 *>(src);
142
0
    for (int i = 0; i < count; ++i)
143
0
        buffer[i] = s[index + i];
144
0
    return buffer;
145
0
}
146
147
extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len);
148
149
const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer, const Operator *, const QSpanData *data,
150
                                                         int y, int x, int length)
151
0
{
152
0
    const uchar *line = data->texture.scanLine(y) + x * 3;
153
0
    qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length);
154
0
    return buffer;
155
0
}
156
157
void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
158
28.2M
{
159
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
160
28.2M
    quint32 v = color;
161
28.2M
    __m128i m = _mm_cvtsi32_si128(v);
162
28.2M
    quint24 *end = dest + count;
163
164
28.2M
    constexpr uchar x = 2, y = 1, z = 0;
165
28.2M
    alignas(__m128i) static const uchar
166
28.2M
    shuffleMask[16 + 1] = { x, y, z, x,  y, z, x, y,  z, x, y, z,  x, y, z, x,  y };
167
168
28.2M
    __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(reinterpret_cast<const __m128i *>(shuffleMask)));
169
28.2M
    __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(reinterpret_cast<const __m128i *>(shuffleMask + 1)));
170
28.2M
    __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);
171
172
360M
    for ( ; dest + 16 <= end; dest += 16) {
173
#ifdef __AVX__
174
        // Store using 32-byte AVX instruction
175
        __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
176
        mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
177
        _mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
178
#else
179
331M
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 0, mval1);
180
331M
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1, mval2);
181
331M
#endif
182
331M
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2, mval3);
183
331M
    }
184
185
28.2M
    if (count < 3) {
186
20.5M
        if (count > 1)
187
934k
            end[-2] = v;
188
20.5M
        if (count)
189
20.5M
            end[-1] = v;
190
20.5M
        return;
191
20.5M
    }
192
193
    // less than 16px/48B left
194
7.68M
    uchar *ptr = reinterpret_cast<uchar *>(dest);
195
7.68M
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
196
7.68M
    qptrdiff left = ptr_end - ptr;
197
7.68M
    if (left >= 24) {
198
        // 8px/24B or more left
199
4.12M
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) + 0, mval1);
200
4.12M
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr) + 1, mval2);
201
4.12M
        ptr += 24;
202
4.12M
        left -= 24;
203
4.12M
    }
204
205
    // less than 8px/24B left
206
207
7.68M
    if (left >= 16) {
208
        // but more than 5px/15B left
209
2.75M
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) , mval1);
210
4.92M
    } else if (left >= 8) {
211
        // but more than 2px/6B left
212
3.79M
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr), mval1);
213
3.79M
    }
214
215
7.68M
    if (left) {
216
        // 1 or 2px left
217
        // store 8 bytes ending with the right values (will overwrite a bit)
218
7.68M
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
219
7.68M
    }
220
7.68M
}
221
222
void QT_FASTCALL rbSwap_888_ssse3(uchar *dst, const uchar *src, int count)
223
0
{
224
0
    int i = 0;
225
226
0
    const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, /*!!*/15);
227
0
    const static __m128i shuffleMask2 = _mm_setr_epi8(0, /*!!*/1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, /*!!*/14, 15);
228
0
    const static __m128i shuffleMask3 = _mm_setr_epi8(/*!!*/0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);
229
230
0
    for (; i + 15 < count; i += 16) {
231
0
        __m128i s1 = _mm_loadu_si128((const __m128i *)src);
232
0
        __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 16));
233
0
        __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 32));
234
0
        s1 = _mm_shuffle_epi8(s1, shuffleMask1);
235
0
        s2 = _mm_shuffle_epi8(s2, shuffleMask2);
236
0
        s3 = _mm_shuffle_epi8(s3, shuffleMask3);
237
0
        _mm_storeu_si128((__m128i *)dst, s1);
238
0
        _mm_storeu_si128((__m128i *)(dst + 16), s2);
239
0
        _mm_storeu_si128((__m128i *)(dst + 32), s3);
240
241
        // Now fix the last four misplaced values
242
0
        std::swap(dst[15], dst[17]);
243
0
        std::swap(dst[30], dst[32]);
244
245
0
        src += 48;
246
0
        dst += 48;
247
0
    }
248
249
0
    if (src != dst) {
250
0
        SIMD_EPILOGUE(i, count, 15) {
251
0
            dst[0] = src[2];
252
0
            dst[1] = src[1];
253
0
            dst[2] = src[0];
254
0
            dst += 3;
255
0
            src += 3;
256
0
        }
257
0
    } else {
258
0
        SIMD_EPILOGUE(i, count, 15) {
259
0
            std::swap(dst[0], dst[2]);
260
0
            dst += 3;
261
0
        }
262
0
    }
263
0
}
264
265
QT_END_NAMESPACE
266
267
#endif // QT_COMPILER_SUPPORTS_SSSE3