/src/qtbase/src/gui/painting/qdrawhelper_ssse3.cpp
Line | Count | Source |
1 | | // Copyright (C) 2018 The Qt Company Ltd. |
2 | | // Copyright (C) 2018 Intel Corporation. |
3 | | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | | // Qt-Security score:significant reason:default |
5 | | |
6 | | #include <private/qdrawhelper_x86_p.h> |
7 | | |
8 | | #if defined(QT_COMPILER_SUPPORTS_SSSE3) |
9 | | |
10 | | #include <private/qdrawingprimitive_sse2_p.h> |
11 | | |
12 | | QT_BEGIN_NAMESPACE |
13 | | |
/* The instruction palignr uses immediate (compile-time) arguments, so we have to generate the
   code for each of the different shifts (4, 8, 12). Checking the alignment inside the loop is
   unfortunately way too slow.

   This macro expects the following names in scope at the expansion site:
     x, src, dst, minusOffsetToAlignSrcOn16Bytes, srcVectorPrevLoaded,
     alphaMask, nullVector, one, half, colorMask, alphaShuffleMask.
   It blends 4 ARGB32 pixels per iteration: src is read with aligned loads and re-aligned
   with palignr, with fast paths when all four source pixels are fully opaque (plain copy)
   or fully transparent (skip).
*/
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
        /* load the next aligned 16 bytes, then combine with the previous load via palignr */ \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            /* all four pixels opaque: straight copy */ \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            /* not fully transparent: result = src + dst * (1 - alpha) */ \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }
35 | | |
36 | | |
// Source-over blend of 'length' premultiplied ARGB32 pixels from src onto dst.
// nullVector, half, one, colorMask, alphaMask are constant across the whole image/texture, and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
static inline void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst, const quint32 *src, int length,
                               __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

    // How many pixels (0..3) src[x] sits past a 16-byte boundary now that dst[x] is aligned.
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;

    if (!minusOffsetToAlignSrcOn16Bytes) {
        /* src is aligned, usual algorithm but with aligned operations.
           See the SSE2 version for more documentation on the algorithm itself. */
        // pshufb mask broadcasting each pixel's alpha byte into the low byte of both of that
        // pixel's 16-bit lanes (the 0xff entries zero the corresponding destination byte).
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        for (; x < length-3; x += 4) {
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                // All four source pixels fully opaque: straight copy.
                _mm_store_si128((__m128i *)&dst[x], srcVector);
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
                // Not fully transparent either: result = s + d * (1 - alpha).
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm_sub_epi16(one, alphaChannel);
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                __m128i destMultipliedByOneMinusAlpha;
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                _mm_store_si128((__m128i *)&dst[x], result);
            }
        } /* end for() */
    } else if ((length - x) >= 8) {
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;  // pixel offset -> byte shift for palignr

        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        // palignr requires an immediate shift count, hence one loop instantiation per offset.
        switch (palignrOffset) {
        case 4:
            BLENDING_LOOP(4, length)
            break;
        case 8:
            BLENDING_LOOP(8, length)
            break;
        case 12:
            BLENDING_LOOP(12, length)
            break;
        }
    }
    // Scalar epilogue for any pixels the vector paths did not cover.
    for (; x < length; ++x)
        blend_pixel(dst[x], src[x]);
}
101 | | |
102 | | void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl, |
103 | | const uchar *srcPixels, int sbpl, |
104 | | int w, int h, |
105 | | int const_alpha) |
106 | 0 | { |
107 | 0 | const quint32 *src = (const quint32 *) srcPixels; |
108 | 0 | quint32 *dst = (quint32 *) destPixels; |
109 | 0 | if (const_alpha == 256) { |
110 | 0 | const __m128i alphaMask = _mm_set1_epi32(0xff000000); |
111 | 0 | const __m128i nullVector = _mm_setzero_si128(); |
112 | 0 | const __m128i half = _mm_set1_epi16(0x80); |
113 | 0 | const __m128i one = _mm_set1_epi16(0xff); |
114 | 0 | const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); |
115 | |
|
116 | 0 | for (int y = 0; y < h; ++y) { |
117 | 0 | BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask); |
118 | 0 | dst = (quint32 *)(((uchar *) dst) + dbpl); |
119 | 0 | src = (const quint32 *)(((const uchar *) src) + sbpl); |
120 | 0 | } |
121 | 0 | } else if (const_alpha != 0) { |
122 | | // dest = (s + d * sia) * ca + d * cia |
123 | | // = s * ca + d * (sia * ca + cia) |
124 | | // = s * ca + d * (1 - sa*ca) |
125 | 0 | const_alpha = (const_alpha * 255) >> 8; |
126 | 0 | const __m128i nullVector = _mm_setzero_si128(); |
127 | 0 | const __m128i half = _mm_set1_epi16(0x80); |
128 | 0 | const __m128i one = _mm_set1_epi16(0xff); |
129 | 0 | const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); |
130 | 0 | const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); |
131 | 0 | for (int y = 0; y < h; ++y) { |
132 | 0 | BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector) |
133 | 0 | dst = (quint32 *)(((uchar *) dst) + dbpl); |
134 | 0 | src = (const quint32 *)(((const uchar *) src) + sbpl); |
135 | 0 | } |
136 | 0 | } |
137 | 0 | } |
138 | | |
139 | | const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer, const uchar *src, int index, int count) |
140 | 0 | { |
141 | 0 | const quint24 *s = reinterpret_cast<const quint24 *>(src); |
142 | 0 | for (int i = 0; i < count; ++i) |
143 | 0 | buffer[i] = s[index + i]; |
144 | 0 | return buffer; |
145 | 0 | } |
146 | | |
147 | | extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len); |
148 | | |
149 | | const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer, const Operator *, const QSpanData *data, |
150 | | int y, int x, int length) |
151 | 0 | { |
152 | 0 | const uchar *line = data->texture.scanLine(y) + x * 3; |
153 | 0 | qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length); |
154 | 0 | return buffer; |
155 | 0 | } |
156 | | |
// Fill 'count' 24-bit (quint24) pixels at 'dest' with 'color'.
// The bulk is written 48 bytes (16 pixels) at a time using three 16-byte stores that
// hold the 3-byte pattern at byte phases 0, 1 and 2; the tail uses smaller, possibly
// overlapping stores (the overlap rewrites bytes with the values they already hold).
void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
{
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
    quint32 v = color;
    __m128i m = _mm_cvtsi32_si128(v);
    quint24 *end = dest + count;

    // pshufb indices selecting bytes 2,1,0 of the color repeatedly (presumably quint24's
    // in-memory byte order — matches the quint24 conversion; TODO confirm against quint24).
    // The table is 17 bytes so the same data can be loaded at offset 0 (phase 0)
    // and offset 1 (phase 1).
    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
    shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };

    __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(reinterpret_cast<const __m128i *>(shuffleMask)));
    __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(reinterpret_cast<const __m128i *>(shuffleMask + 1)));
    __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);   // pattern at phase 2 (for byte offset 32)

    for ( ; dest + 16 <= end; dest += 16) {
#ifdef __AVX__
        // Store using 32-byte AVX instruction
        __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
        mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
        _mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
#else
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 0, mval1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1, mval2);
#endif
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2, mval3);
    }

    if (count < 3) {
        // Tiny fills (0..2 pixels): write them directly, no vector stores.
        if (count > 1)
            end[-2] = v;
        if (count)
            end[-1] = v;
        return;
    }

    // less than 16px/48B left
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;      // remaining bytes; a multiple of 3
    if (left >= 24) {
        // 8px/24B or more left (24 is a multiple of 3, so the phase stays 0 afterwards)
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) + 0, mval1);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr) + 1, mval2);
        ptr += 24;
        left -= 24;
    }

    // less than 8px/24B left

    if (left >= 16) {
        // but more than 5px/15B left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) , mval1);
    } else if (left >= 8) {
        // but more than 2px/6B left
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr), mval1);
    }

    if (left) {
        // 1 or 2px left
        // store 8 bytes ending with the right values (will overwrite a bit)
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
    }
}
221 | | |
// Swap the first and third byte of each 3-byte pixel (R<->B for 24bpp RGB888),
// reading 'count' pixels from src and writing to dst. Handles both the in-place
// case (src == dst) and disjoint buffers.
void QT_FASTCALL rbSwap_888_ssse3(uchar *dst, const uchar *src, int count)
{
    int i = 0;

    // Each mask reverses the bytes within the 3-byte triples of one 16-byte vector.
    // A vector holds 5 whole triples plus part of a sixth; the /*!!*/ entries mark
    // bytes of triples straddling two vectors — those are passed through unchanged
    // and repaired by the scalar std::swap fixups below.
    const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, /*!!*/15);
    const static __m128i shuffleMask2 = _mm_setr_epi8(0, /*!!*/1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, /*!!*/14, 15);
    const static __m128i shuffleMask3 = _mm_setr_epi8(/*!!*/0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);

    // Main loop: 16 pixels (48 bytes) per iteration, three vectors at a time.
    for (; i + 15 < count; i += 16) {
        __m128i s1 = _mm_loadu_si128((const __m128i *)src);
        __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 16));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 32));
        s1 = _mm_shuffle_epi8(s1, shuffleMask1);
        s2 = _mm_shuffle_epi8(s2, shuffleMask2);
        s3 = _mm_shuffle_epi8(s3, shuffleMask3);
        _mm_storeu_si128((__m128i *)dst, s1);
        _mm_storeu_si128((__m128i *)(dst + 16), s2);
        _mm_storeu_si128((__m128i *)(dst + 32), s3);

        // Now fix the last four misplaced values
        std::swap(dst[15], dst[17]);    // triple straddling vectors 1 and 2
        std::swap(dst[30], dst[32]);    // triple straddling vectors 2 and 3

        src += 48;
        dst += 48;
    }

    // Scalar epilogue for the remaining 0..15 pixels.
    if (src != dst) {
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        // In place: only the two outer bytes of each triple need touching.
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}
264 | | |
265 | | QT_END_NAMESPACE |
266 | | |
267 | | #endif // QT_COMPILER_SUPPORTS_SSSE3 |