/src/FreeRDP/libfreerdp/primitives/sse/prim_YUV_sse4.1.c
Line | Count | Source |
1 | | /** |
2 | | * FreeRDP: A Remote Desktop Protocol Implementation |
3 | | * Optimized YUV/RGB conversion operations |
4 | | * |
5 | | * Copyright 2014 Thomas Erbesdobler |
6 | | * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com> |
7 | | * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com> |
8 | | * Copyright 2016-2017 Thincast Technologies GmbH |
9 | | * |
10 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
11 | | * you may not use this file except in compliance with the License. |
12 | | * You may obtain a copy of the License at |
13 | | * |
14 | | * http://www.apache.org/licenses/LICENSE-2.0 |
15 | | * |
16 | | * Unless required by applicable law or agreed to in writing, software |
17 | | * distributed under the License is distributed on an "AS IS" BASIS, |
18 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
19 | | * See the License for the specific language governing permissions and |
20 | | * limitations under the License. |
21 | | */ |
22 | | |
23 | | #include <winpr/wtypes.h> |
24 | | #include <freerdp/config.h> |
25 | | |
26 | | #include <winpr/sysinfo.h> |
27 | | #include <winpr/crt.h> |
28 | | #include <freerdp/types.h> |
29 | | #include <freerdp/primitives.h> |
30 | | |
31 | | #include "prim_internal.h" |
32 | | #include "prim_avxsse.h" |
33 | | #include "prim_YUV.h" |
34 | | |
35 | | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
36 | | #include <emmintrin.h> |
37 | | #include <tmmintrin.h> |
38 | | #include <smmintrin.h> |
39 | | |
40 | | static primitives_t* generic = NULL; |
41 | | |
42 | | /****************************************************************************/ |
43 | | /* sse41 YUV420 -> RGB conversion */ |
44 | | /****************************************************************************/ |
45 | | static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw, |
46 | | __m128i Vraw, UINT8 pos) |
47 | 0 | { |
48 | 0 | const __m128i mapY[] = { mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080), |
49 | 0 | mm_set_epu32(0x80800780, 0x80800680, 0x80800580, 0x80800480), |
50 | 0 | mm_set_epu32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880), |
51 | 0 | mm_set_epu32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) }; |
52 | 0 | const __m128i mapUV[] = { mm_set_epu32(0x80038002, 0x80018000, 0x80808080, 0x80808080), |
53 | 0 | mm_set_epu32(0x80078006, 0x80058004, 0x80808080, 0x80808080), |
54 | 0 | mm_set_epu32(0x800B800A, 0x80098008, 0x80808080, 0x80808080), |
55 | 0 | mm_set_epu32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) }; |
56 | 0 | const __m128i mask[] = { mm_set_epu32(0x80038080, 0x80028080, 0x80018080, 0x80008080), |
57 | 0 | mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080), |
58 | 0 | mm_set_epu32(0x80808003, 0x80808002, 0x80808001, 0x80808000) }; |
59 | 0 | const __m128i c128 = _mm_set1_epi16(128); |
60 | 0 | __m128i BGRX = _mm_and_si128(LOAD_SI128(dst), |
61 | 0 | mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)); |
62 | 0 | { |
63 | 0 | __m128i C; |
64 | 0 | __m128i D; |
65 | 0 | __m128i E; |
66 | | /* Load Y values and expand to 32 bit */ |
67 | 0 | { |
68 | 0 | C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */ |
69 | 0 | } |
70 | | /* Load U values and expand to 32 bit */ |
71 | 0 | { |
72 | 0 | const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */ |
73 | 0 | D = _mm_sub_epi16(U, c128); /* D = U - 128 */ |
74 | 0 | } |
75 | | /* Load V values and expand to 32 bit */ |
76 | 0 | { |
77 | 0 | const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */ |
78 | 0 | E = _mm_sub_epi16(V, c128); /* E = V - 128 */ |
79 | 0 | } |
80 | | /* Get the R value */ |
81 | 0 | { |
82 | 0 | const __m128i c403 = _mm_set1_epi16(403); |
83 | 0 | const __m128i e403 = |
84 | 0 | _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403)); |
85 | 0 | const __m128i Rs = _mm_add_epi32(C, e403); |
86 | 0 | const __m128i R32 = _mm_srai_epi32(Rs, 8); |
87 | 0 | const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128()); |
88 | 0 | const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128()); |
89 | 0 | const __m128i packed = _mm_shuffle_epi8(R, mask[0]); |
90 | 0 | BGRX = _mm_or_si128(BGRX, packed); |
91 | 0 | } |
92 | | /* Get the G value */ |
93 | 0 | { |
94 | 0 | const __m128i c48 = _mm_set1_epi16(48); |
95 | 0 | const __m128i d48 = |
96 | 0 | _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48)); |
97 | 0 | const __m128i c120 = _mm_set1_epi16(120); |
98 | 0 | const __m128i e120 = |
99 | 0 | _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120)); |
100 | 0 | const __m128i de = _mm_add_epi32(d48, e120); |
101 | 0 | const __m128i Gs = _mm_sub_epi32(C, de); |
102 | 0 | const __m128i G32 = _mm_srai_epi32(Gs, 8); |
103 | 0 | const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128()); |
104 | 0 | const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128()); |
105 | 0 | const __m128i packed = _mm_shuffle_epi8(G, mask[1]); |
106 | 0 | BGRX = _mm_or_si128(BGRX, packed); |
107 | 0 | } |
108 | | /* Get the B value */ |
109 | 0 | { |
110 | 0 | const __m128i c475 = _mm_set1_epi16(475); |
111 | 0 | const __m128i d475 = |
112 | 0 | _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475)); |
113 | 0 | const __m128i Bs = _mm_add_epi32(C, d475); |
114 | 0 | const __m128i B32 = _mm_srai_epi32(Bs, 8); |
115 | 0 | const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128()); |
116 | 0 | const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128()); |
117 | 0 | const __m128i packed = _mm_shuffle_epi8(B, mask[2]); |
118 | 0 | BGRX = _mm_or_si128(BGRX, packed); |
119 | 0 | } |
120 | 0 | } |
121 | 0 | STORE_SI128(dst++, BGRX); |
122 | 0 | return dst; |
123 | 0 | } |
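/* For reference, the vector arithmetic above is equivalent to the following
 * per-pixel scalar sketch (CLIP denotes clamping to the 0..255 range):
 *
 *   C = 256 * Y;  D = U - 128;  E = V - 128;
 *   R = CLIP((C           + 403 * E) >> 8);
 *   G = CLIP((C -  48 * D - 120 * E) >> 8);
 *   B = CLIP((C + 475 * D          ) >> 8);
 *
 * matching the factors documented before sse41_yuv2r/g/b further below. */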
124 | | |
125 | | static inline pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[], |
126 | | const UINT32* WINPR_RESTRICT srcStep, |
127 | | BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, |
128 | | const prim_size_t* WINPR_RESTRICT roi) |
129 | 0 | { |
130 | 0 | const UINT32 nWidth = roi->width; |
131 | 0 | const UINT32 nHeight = roi->height; |
132 | 0 | const UINT32 pad = roi->width % 16; |
133 | 0 | const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); |
134 | |
|
135 | 0 | for (size_t y = 0; y < nHeight; y++) |
136 | 0 | { |
137 | 0 | __m128i* dst = (__m128i*)(pDst + dstStep * y); |
138 | 0 | const BYTE* YData = pSrc[0] + y * srcStep[0]; |
139 | 0 | const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1]; |
140 | 0 | const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2]; |
141 | |
|
142 | 0 | for (UINT32 x = 0; x < nWidth - pad; x += 16) |
143 | 0 | { |
144 | 0 | const __m128i Y = LOAD_SI128(YData); |
145 | 0 | const __m128i uRaw = LOAD_SI128(UData); |
146 | 0 | const __m128i vRaw = LOAD_SI128(VData); |
147 | 0 | const __m128i U = _mm_shuffle_epi8(uRaw, duplicate); |
148 | 0 | const __m128i V = _mm_shuffle_epi8(vRaw, duplicate); |
149 | 0 | YData += 16; |
150 | 0 | UData += 8; |
151 | 0 | VData += 8; |
152 | 0 | dst = sse41_YUV444Pixel(dst, Y, U, V, 0); |
153 | 0 | dst = sse41_YUV444Pixel(dst, Y, U, V, 1); |
154 | 0 | dst = sse41_YUV444Pixel(dst, Y, U, V, 2); |
155 | 0 | dst = sse41_YUV444Pixel(dst, Y, U, V, 3); |
156 | 0 | } |
157 | |
|
158 | 0 | for (UINT32 x = 0; x < pad; x++) |
159 | 0 | { |
160 | 0 | const BYTE Y = *YData++; |
161 | 0 | const BYTE U = *UData; |
162 | 0 | const BYTE V = *VData; |
163 | 0 | dst = (__m128i*)writeYUVPixel((BYTE*)dst, PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX); |
164 | |
|
165 | 0 | if (x % 2) |
166 | 0 | { |
167 | 0 | UData++; |
168 | 0 | VData++; |
169 | 0 | } |
170 | 0 | } |
171 | 0 | } |
172 | |
|
173 | 0 | return PRIMITIVES_SUCCESS; |
174 | 0 | } |
175 | | |
176 | | static pstatus_t sse41_YUV420ToRGB(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3], |
177 | | BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat, |
178 | | const prim_size_t* WINPR_RESTRICT roi) |
179 | 0 | { |
180 | 0 | switch (DstFormat) |
181 | 0 | { |
182 | 0 | case PIXEL_FORMAT_BGRX32: |
183 | 0 | case PIXEL_FORMAT_BGRA32: |
184 | 0 | return sse41_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi); |
185 | | |
186 | 0 | default: |
187 | 0 | return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); |
188 | 0 | } |
189 | 0 | } |
190 | | |
191 | | static inline void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2], |
192 | | const BYTE* WINPR_RESTRICT pY[2], const BYTE* WINPR_RESTRICT pU[2], |
193 | | const BYTE* WINPR_RESTRICT pV[2], BOOL filter) |
194 | 0 | { |
195 | 0 | WINPR_ASSERT(pRGB); |
196 | 0 | WINPR_ASSERT(pY); |
197 | 0 | WINPR_ASSERT(pU); |
198 | 0 | WINPR_ASSERT(pV); |
199 | | |
200 | 0 | const UINT32 DstFormat = PIXEL_FORMAT_BGRX32; |
201 | 0 | const UINT32 bpp = 4; |
202 | |
|
203 | 0 | for (size_t i = 0; i < 2; i++) |
204 | 0 | { |
205 | 0 | for (size_t j = 0; j < 2; j++) |
206 | 0 | { |
207 | 0 | const BYTE Y = pY[i][offset + j]; |
208 | 0 | BYTE U = pU[i][offset + j]; |
209 | 0 | BYTE V = pV[i][offset + j]; |
210 | 0 | if ((i == 0) && (j == 0) && filter) |
211 | 0 | { |
212 | 0 | const INT32 avgU = |
213 | 0 | 4 * pU[0][offset] - pU[0][offset + 1] - pU[1][offset] - pU[1][offset + 1]; |
214 | 0 | const INT32 avgV = |
215 | 0 | 4 * pV[0][offset] - pV[0][offset + 1] - pV[1][offset] - pV[1][offset + 1]; |
216 | |
|
217 | 0 | U = CONDITIONAL_CLIP(avgU, pU[0][offset]); |
218 | 0 | V = CONDITIONAL_CLIP(avgV, pV[0][offset]); |
219 | 0 | } |
220 | |
|
221 | 0 | writeYUVPixel(&pRGB[i][(j + offset) * bpp], DstFormat, Y, U, V, writePixelBGRX); |
222 | 0 | } |
223 | 0 | } |
224 | 0 | } |
225 | | |
226 | | /* Inputs are uint16_t vectors */ |
227 | | static inline __m128i sse41_yuv2x_single(const __m128i Y, __m128i U, __m128i V, const short iMulU, |
228 | | const short iMulV) |
229 | 0 | { |
230 | 0 | const __m128i zero = _mm_set1_epi8(0); |
231 | |
|
232 | 0 | __m128i Ylo = _mm_unpacklo_epi16(Y, zero); |
233 | 0 | __m128i Yhi = _mm_unpackhi_epi16(Y, zero); |
234 | 0 | if (iMulU != 0) |
235 | 0 | { |
236 | 0 | const __m128i addX = _mm_set1_epi16(128); |
237 | 0 | const __m128i D = _mm_sub_epi16(U, addX); |
238 | 0 | const __m128i mulU = _mm_set1_epi16(iMulU); |
239 | 0 | const __m128i mulDlo = _mm_mullo_epi16(D, mulU); |
240 | 0 | const __m128i mulDhi = _mm_mulhi_epi16(D, mulU); |
241 | 0 | const __m128i Dlo = _mm_unpacklo_epi16(mulDlo, mulDhi); |
242 | 0 | Ylo = _mm_add_epi32(Ylo, Dlo); |
243 | |
|
244 | 0 | const __m128i Dhi = _mm_unpackhi_epi16(mulDlo, mulDhi); |
245 | 0 | Yhi = _mm_add_epi32(Yhi, Dhi); |
246 | 0 | } |
247 | 0 | if (iMulV != 0) |
248 | 0 | { |
249 | 0 | const __m128i addX = _mm_set1_epi16(128); |
250 | 0 | const __m128i E = _mm_sub_epi16(V, addX); |
251 | 0 | const __m128i mul = _mm_set1_epi16(iMulV); |
252 | 0 | const __m128i mulElo = _mm_mullo_epi16(E, mul); |
253 | 0 | const __m128i mulEhi = _mm_mulhi_epi16(E, mul); |
254 | 0 | const __m128i Elo = _mm_unpacklo_epi16(mulElo, mulEhi); |
255 | 0 | const __m128i esumlo = _mm_add_epi32(Ylo, Elo); |
256 | |
|
257 | 0 | const __m128i Ehi = _mm_unpackhi_epi16(mulElo, mulEhi); |
258 | 0 | const __m128i esumhi = _mm_add_epi32(Yhi, Ehi); |
259 | 0 | Ylo = esumlo; |
260 | 0 | Yhi = esumhi; |
261 | 0 | } |
262 | |
|
263 | 0 | const __m128i rYlo = _mm_srai_epi32(Ylo, 8); |
264 | 0 | const __m128i rYhi = _mm_srai_epi32(Yhi, 8); |
265 | 0 | const __m128i rY = _mm_packs_epi32(rYlo, rYhi); |
266 | 0 | return rY; |
267 | 0 | } |
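/* Note (explanatory sketch): the _mm_mullo_epi16/_mm_mulhi_epi16 pairs above,
 * interleaved by _mm_unpacklo_epi16/_mm_unpackhi_epi16, reassemble the full
 * signed 32-bit products (U - 128) * iMulU and (V - 128) * iMulV, so the sums
 * accumulate in 32 bits before the final arithmetic shift right by 8. */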
268 | | |
269 | | /* Inputs are uint8_t vectors */ |
270 | | static inline __m128i sse41_yuv2x(const __m128i Y, __m128i U, __m128i V, const short iMulU, |
271 | | const short iMulV) |
272 | 0 | { |
273 | 0 | const __m128i zero = _mm_set1_epi8(0); |
274 | | |
275 | | /* Ylo = Y * 256 |
276 | | * Ulo = uint8_t -> uint16_t |
277 | | * Vlo = uint8_t -> uint16_t |
278 | | */ |
279 | 0 | const __m128i Ylo = _mm_unpacklo_epi8(zero, Y); |
280 | 0 | const __m128i Ulo = _mm_unpacklo_epi8(U, zero); |
281 | 0 | const __m128i Vlo = _mm_unpacklo_epi8(V, zero); |
282 | 0 | const __m128i preslo = sse41_yuv2x_single(Ylo, Ulo, Vlo, iMulU, iMulV); |
283 | |
|
284 | 0 | const __m128i Yhi = _mm_unpackhi_epi8(zero, Y); |
285 | 0 | const __m128i Uhi = _mm_unpackhi_epi8(U, zero); |
286 | 0 | const __m128i Vhi = _mm_unpackhi_epi8(V, zero); |
287 | 0 | const __m128i preshi = sse41_yuv2x_single(Yhi, Uhi, Vhi, iMulU, iMulV); |
288 | 0 | const __m128i res = _mm_packus_epi16(preslo, preshi); |
289 | |
|
290 | 0 | return res; |
291 | 0 | } |
292 | | |
293 | | /* const INT32 r = ((256L * C(Y) + 0L * D(U) + 403L * E(V))) >> 8; */ |
294 | | static inline __m128i sse41_yuv2r(const __m128i Y, __m128i U, __m128i V) |
295 | 0 | { |
296 | 0 | return sse41_yuv2x(Y, U, V, 0, 403); |
297 | 0 | } |
298 | | |
299 | | /* const INT32 g = ((256L * C(Y) - 48L * D(U) - 120L * E(V))) >> 8; */ |
300 | | static inline __m128i sse41_yuv2g(const __m128i Y, __m128i U, __m128i V) |
301 | 0 | { |
302 | 0 | return sse41_yuv2x(Y, U, V, -48, -120); |
303 | 0 | } |
304 | | |
305 | | /* const INT32 b = ((256L * C(Y) + 475L * D(U) + 0L * E(V))) >> 8; */ |
306 | | static inline __m128i sse41_yuv2b(const __m128i Y, __m128i U, __m128i V) |
307 | 0 | { |
308 | 0 | return sse41_yuv2x(Y, U, V, 475, 0); |
309 | 0 | } |
310 | | |
311 | | static inline void sse41_BGRX_fillRGB_pixel(BYTE* WINPR_RESTRICT pRGB, __m128i Y, __m128i U, |
312 | | __m128i V) |
313 | 0 | { |
314 | 0 | const __m128i zero = _mm_set1_epi8(0); |
315 | | /* Y is scaled by 256 inside sse41_yuv2x */ |
316 | 0 | const __m128i r = sse41_yuv2r(Y, U, V); |
317 | 0 | const __m128i rx[2] = { _mm_unpackhi_epi8(r, zero), _mm_unpacklo_epi8(r, zero) }; |
318 | |
|
319 | 0 | const __m128i g = sse41_yuv2g(Y, U, V); |
320 | 0 | const __m128i b = sse41_yuv2b(Y, U, V); |
321 | |
|
322 | 0 | const __m128i bg[2] = { _mm_unpackhi_epi8(b, g), _mm_unpacklo_epi8(b, g) }; |
323 | |
|
324 | 0 | const __m128i mask = mm_set_epu8(0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, |
325 | 0 | 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF); |
326 | |
|
327 | 0 | __m128i* rgb = (__m128i*)pRGB; |
328 | 0 | const __m128i bgrx0 = _mm_unpacklo_epi16(bg[1], rx[1]); |
329 | 0 | _mm_maskmoveu_si128(bgrx0, mask, (char*)&rgb[0]); |
330 | 0 | const __m128i bgrx1 = _mm_unpackhi_epi16(bg[1], rx[1]); |
331 | 0 | _mm_maskmoveu_si128(bgrx1, mask, (char*)&rgb[1]); |
332 | 0 | const __m128i bgrx2 = _mm_unpacklo_epi16(bg[0], rx[0]); |
333 | 0 | _mm_maskmoveu_si128(bgrx2, mask, (char*)&rgb[2]); |
334 | 0 | const __m128i bgrx3 = _mm_unpackhi_epi16(bg[0], rx[0]); |
335 | 0 | _mm_maskmoveu_si128(bgrx3, mask, (char*)&rgb[3]); |
336 | 0 | } |
337 | | |
338 | | static inline __m128i odd1sum(__m128i u1) |
339 | 0 | { |
340 | 0 | const __m128i zero = _mm_set1_epi8(0); |
341 | 0 | const __m128i u1hi = _mm_unpackhi_epi8(u1, zero); |
342 | 0 | const __m128i u1lo = _mm_unpacklo_epi8(u1, zero); |
343 | 0 | return _mm_hadds_epi16(u1lo, u1hi); |
344 | 0 | } |
345 | | |
346 | | static inline __m128i odd0sum(__m128i u0, __m128i u1sum) |
347 | 0 | { |
348 | | /* Mask out even bytes, extend uint8_t to uint16_t by filling in zero bytes, |
349 | | * horizontally add the values */ |
350 | 0 | const __m128i mask = mm_set_epu8(0x80, 0x0F, 0x80, 0x0D, 0x80, 0x0B, 0x80, 0x09, 0x80, 0x07, |
351 | 0 | 0x80, 0x05, 0x80, 0x03, 0x80, 0x01); |
352 | 0 | const __m128i u0odd = _mm_shuffle_epi8(u0, mask); |
353 | 0 | return _mm_adds_epi16(u1sum, u0odd); |
354 | 0 | } |
355 | | |
356 | | static inline __m128i calcavg(__m128i u0even, __m128i sum) |
357 | 0 | { |
358 | 0 | const __m128i u4zero = _mm_slli_epi16(u0even, 2); |
359 | 0 | const __m128i uavg = _mm_sub_epi16(u4zero, sum); |
360 | 0 | const __m128i zero = _mm_set1_epi8(0); |
361 | 0 | const __m128i savg = _mm_packus_epi16(uavg, zero); |
362 | 0 | const __m128i smask = mm_set_epu8(0x80, 0x07, 0x80, 0x06, 0x80, 0x05, 0x80, 0x04, 0x80, 0x03, |
363 | 0 | 0x80, 0x02, 0x80, 0x01, 0x80, 0x00); |
364 | 0 | return _mm_shuffle_epi8(savg, smask); |
365 | 0 | } |
366 | | |
367 | | static inline __m128i diffmask(__m128i avg, __m128i u0even) |
368 | 0 | { |
369 | | /* Only positions whose value differs from the avg by 30 or more get the avg applied. |
370 | | * Use int16 for the calculations to avoid issues with signed 8-bit integers. |
371 | | */ |
372 | 0 | const __m128i diff = _mm_subs_epi16(u0even, avg); |
373 | 0 | const __m128i absdiff = _mm_abs_epi16(diff); |
374 | 0 | const __m128i val30 = _mm_set1_epi16(30); |
375 | 0 | return _mm_cmplt_epi16(absdiff, val30); |
376 | 0 | } |
377 | | |
378 | | static inline void sse41_filter(__m128i pU[2]) |
379 | 0 | { |
380 | 0 | const __m128i u1sum = odd1sum(pU[1]); |
381 | 0 | const __m128i sum = odd0sum(pU[0], u1sum); |
382 | | |
383 | | /* Mask out the odd bytes. Nothing more is needed to widen the uint8_t values to uint16_t */ |
384 | 0 | const __m128i emask = mm_set_epu8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, |
385 | 0 | 0x00, 0xff, 0x00, 0xff, 0x00, 0xff); |
386 | 0 | const __m128i u0even = _mm_and_si128(pU[0], emask); |
387 | 0 | const __m128i avg = calcavg(u0even, sum); |
388 | 0 | const __m128i umask = diffmask(avg, u0even); |
389 | |
|
390 | 0 | const __m128i u0orig = _mm_and_si128(u0even, umask); |
391 | 0 | const __m128i u0avg = _mm_andnot_si128(umask, avg); |
392 | 0 | const __m128i evenresult = _mm_or_si128(u0orig, u0avg); |
393 | 0 | const __m128i omask = mm_set_epu8(0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, |
394 | 0 | 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00); |
395 | 0 | const __m128i u0odd = _mm_and_si128(pU[0], omask); |
396 | 0 | const __m128i result = _mm_or_si128(evenresult, u0odd); |
397 | 0 | pU[0] = result; |
398 | 0 | } |
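/* This is the vectorised counterpart of the scalar filter in BGRX_fillRGB
 * above: for every even sample u0 it computes 4 * u0 minus the right, lower
 * and lower-right neighbours and, as with CONDITIONAL_CLIP, substitutes that
 * value only when it differs from u0 by 30 or more; odd samples and the
 * second row are left untouched. */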
399 | | |
400 | | static inline void sse41_BGRX_fillRGB(BYTE* WINPR_RESTRICT pRGB[2], const __m128i pY[2], |
401 | | __m128i pU[2], __m128i pV[2]) |
402 | 0 | { |
403 | 0 | WINPR_ASSERT(pRGB); |
404 | 0 | WINPR_ASSERT(pY); |
405 | 0 | WINPR_ASSERT(pU); |
406 | 0 | WINPR_ASSERT(pV); |
407 | | |
408 | 0 | sse41_filter(pU); |
409 | 0 | sse41_filter(pV); |
410 | |
|
411 | 0 | for (size_t i = 0; i < 2; i++) |
412 | 0 | { |
413 | 0 | sse41_BGRX_fillRGB_pixel(pRGB[i], pY[i], pU[i], pV[i]); |
414 | 0 | } |
415 | 0 | } |
416 | | |
417 | | static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW( |
418 | | BYTE* WINPR_RESTRICT pDst[2], const BYTE* WINPR_RESTRICT YData[2], |
419 | | const BYTE* WINPR_RESTRICT UData[2], const BYTE* WINPR_RESTRICT VData[2], UINT32 nWidth) |
420 | 0 | { |
421 | 0 | WINPR_ASSERT((nWidth % 2) == 0); |
422 | 0 | const UINT32 pad = nWidth % 16; |
423 | |
|
424 | 0 | size_t x = 0; |
425 | 0 | for (; x < nWidth - pad; x += 16) |
426 | 0 | { |
427 | 0 | const __m128i Y[] = { LOAD_SI128(&YData[0][x]), LOAD_SI128(&YData[1][x]) }; |
428 | 0 | __m128i U[] = { LOAD_SI128(&UData[0][x]), LOAD_SI128(&UData[1][x]) }; |
429 | 0 | __m128i V[] = { LOAD_SI128(&VData[0][x]), LOAD_SI128(&VData[1][x]) }; |
430 | |
|
431 | 0 | BYTE* dstp[] = { &pDst[0][x * 4], &pDst[1][x * 4] }; |
432 | 0 | sse41_BGRX_fillRGB(dstp, Y, U, V); |
433 | 0 | } |
434 | |
|
435 | 0 | for (; x < nWidth; x += 2) |
436 | 0 | { |
437 | 0 | BGRX_fillRGB(x, pDst, YData, UData, VData, TRUE); |
438 | 0 | } |
439 | |
|
440 | 0 | return PRIMITIVES_SUCCESS; |
441 | 0 | } |
442 | | |
443 | | static inline void BGRX_fillRGB_single(size_t offset, BYTE* WINPR_RESTRICT pRGB, |
444 | | const BYTE* WINPR_RESTRICT pY, const BYTE* WINPR_RESTRICT pU, |
445 | | const BYTE* WINPR_RESTRICT pV, WINPR_ATTR_UNUSED BOOL filter) |
446 | 0 | { |
447 | 0 | WINPR_ASSERT(pRGB); |
448 | 0 | WINPR_ASSERT(pY); |
449 | 0 | WINPR_ASSERT(pU); |
450 | 0 | WINPR_ASSERT(pV); |
451 | | |
452 | 0 | const UINT32 bpp = 4; |
453 | |
|
454 | 0 | for (size_t j = 0; j < 2; j++) |
455 | 0 | { |
456 | 0 | const BYTE Y = pY[offset + j]; |
457 | 0 | BYTE U = pU[offset + j]; |
458 | 0 | BYTE V = pV[offset + j]; |
459 | |
|
460 | 0 | writeYUVPixel(&pRGB[(j + offset) * bpp], PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX); |
461 | 0 | } |
462 | 0 | } |
463 | | |
464 | | static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_SINGLE_ROW( |
465 | | BYTE* WINPR_RESTRICT pDst, const BYTE* WINPR_RESTRICT YData, const BYTE* WINPR_RESTRICT UData, |
466 | | const BYTE* WINPR_RESTRICT VData, UINT32 nWidth) |
467 | 0 | { |
468 | 0 | WINPR_ASSERT((nWidth % 2) == 0); |
469 | | |
470 | 0 | for (size_t x = 0; x < nWidth; x += 2) |
471 | 0 | { |
472 | 0 | BGRX_fillRGB_single(x, pDst, YData, UData, VData, TRUE); |
473 | 0 | } |
474 | |
|
475 | 0 | return PRIMITIVES_SUCCESS; |
476 | 0 | } |
477 | | |
478 | | static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* WINPR_RESTRICT pSrc[], |
479 | | const UINT32 srcStep[], |
480 | | BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, |
481 | | const prim_size_t* WINPR_RESTRICT roi) |
482 | 0 | { |
483 | 0 | const UINT32 nWidth = roi->width; |
484 | 0 | const UINT32 nHeight = roi->height; |
485 | |
|
486 | 0 | size_t y = 0; |
487 | 0 | for (; y < nHeight - nHeight % 2; y += 2) |
488 | 0 | { |
489 | 0 | BYTE* dst[] = { (pDst + dstStep * y), (pDst + dstStep * (y + 1)) }; |
490 | 0 | const BYTE* YData[] = { pSrc[0] + y * srcStep[0], pSrc[0] + (y + 1) * srcStep[0] }; |
491 | 0 | const BYTE* UData[] = { pSrc[1] + y * srcStep[1], pSrc[1] + (y + 1) * srcStep[1] }; |
492 | 0 | const BYTE* VData[] = { pSrc[2] + y * srcStep[2], pSrc[2] + (y + 1) * srcStep[2] }; |
493 | |
|
494 | 0 | const pstatus_t rc = |
495 | 0 | sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(dst, YData, UData, VData, nWidth); |
496 | 0 | if (rc != PRIMITIVES_SUCCESS) |
497 | 0 | return rc; |
498 | 0 | } |
499 | 0 | for (; y < nHeight; y++) |
500 | 0 | { |
501 | 0 | BYTE* dst = (pDst + dstStep * y); |
502 | 0 | const BYTE* YData = pSrc[0] + y * srcStep[0]; |
503 | 0 | const BYTE* UData = pSrc[1] + y * srcStep[1]; |
504 | 0 | const BYTE* VData = pSrc[2] + y * srcStep[2]; |
505 | 0 | const pstatus_t rc = |
506 | 0 | sse41_YUV444ToRGB_8u_P3AC4R_BGRX_SINGLE_ROW(dst, YData, UData, VData, nWidth); |
507 | 0 | if (rc != PRIMITIVES_SUCCESS) |
508 | 0 | return rc; |
509 | 0 | } |
510 | | |
511 | 0 | return PRIMITIVES_SUCCESS; |
512 | 0 | } |
513 | | |
514 | | static pstatus_t sse41_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[], |
515 | | const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst, |
516 | | UINT32 dstStep, UINT32 DstFormat, |
517 | | const prim_size_t* WINPR_RESTRICT roi) |
518 | 0 | { |
519 | 0 | switch (DstFormat) |
520 | 0 | { |
521 | 0 | case PIXEL_FORMAT_BGRX32: |
522 | 0 | case PIXEL_FORMAT_BGRA32: |
523 | 0 | return sse41_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi); |
524 | | |
525 | 0 | default: |
526 | 0 | return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); |
527 | 0 | } |
528 | 0 | } |
529 | | |
530 | | /****************************************************************************/ |
531 | | /* sse41 RGB -> YUV420 conversion **/ |
532 | | /****************************************************************************/ |
533 | | |
534 | | /** |
535 | | * Note (nfedera): |
536 | | * The forward RGB to YUV transformation factors used here are based on the |
537 | | * values specified in [Rec. ITU-R BT.709-6] Section 3: |
538 | | * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en |
539 | | * |
540 | | * Y = 0.21260 * R + 0.71520 * G + 0.07220 * B + 0; |
541 | | * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128; |
542 | | * V = 0.50000 * R - 0.45415 * G - 0.04585 * B + 128; |
543 | | * |
544 | | * The most accurate integer arithmetic approximation when using 8-bit signed |
545 | | * integer factors with 16-bit signed integer intermediate results is: |
546 | | * |
547 | | * Y = ( ( 27 * R + 92 * G + 9 * B) >> 7 ); |
548 | | * U = ( (-29 * R - 99 * G + 128 * B) >> 8 ) + 128; |
549 | | * V = ( ( 128 * R - 116 * G - 12 * B) >> 8 ) + 128; |
550 | | * |
551 | | * Because the signed 8-bit range is [-128, 127], the U and V coefficients of 128 |
552 | | * are rounded down to 127. |
553 | | */ |
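/* A quick sanity check of the integer approximation (illustrative only): for
 * full-range white, R = G = B = 255, the Y factors sum to 27 + 92 + 9 = 128 and
 * the U and V factor rows each sum to 0, so
 *
 *   Y = (128 * 255) >> 7       = 255
 *   U = (  0 * 255) >> 8 + 128 = 128
 *   V = (  0 * 255) >> 8 + 128 = 128
 *
 * i.e. white maps to (Y, U, V) = (255, 128, 128) as expected. */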
554 | | |
555 | 0 | #define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9) |
556 | | #define BGRX_U_FACTORS \ |
557 | 0 | _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127) |
558 | | #define BGRX_V_FACTORS \ |
559 | 0 | _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12) |
560 | 0 | #define CONST128_FACTORS _mm_set1_epi8(-128) |
561 | | |
562 | 0 | #define Y_SHIFT 7 |
563 | 0 | #define U_SHIFT 8 |
564 | 0 | #define V_SHIFT 8 |
565 | | |
566 | | /* |
567 | | TODO: |
568 | | RGB[AX] can be supported simply by using the following factors. Instead of loading |
569 | | the globals directly, the functions below could be passed pointers to the correct |
570 | | vectors, depending on the source picture format. |
571 | | |
572 | | PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = { |
573 | | 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0 |
574 | | }; |
575 | | PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = { |
576 | | -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0 |
577 | | }; |
578 | | PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = { |
579 | | 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0 |
580 | | }; |
581 | | */ |
582 | | |
583 | | static inline void sse41_BGRX_TO_YUV(const BYTE* WINPR_RESTRICT pLine1, BYTE* WINPR_RESTRICT pYLine, |
584 | | BYTE* WINPR_RESTRICT pULine, BYTE* WINPR_RESTRICT pVLine) |
585 | 0 | { |
586 | 0 | const BYTE r1 = pLine1[2]; |
587 | 0 | const BYTE g1 = pLine1[1]; |
588 | 0 | const BYTE b1 = pLine1[0]; |
589 | |
|
590 | 0 | if (pYLine) |
591 | 0 | pYLine[0] = RGB2Y(r1, g1, b1); |
592 | 0 | if (pULine) |
593 | 0 | pULine[0] = RGB2U(r1, g1, b1); |
594 | 0 | if (pVLine) |
595 | 0 | pVLine[0] = RGB2V(r1, g1, b1); |
596 | 0 | } |
597 | | |
598 | | /* compute the luma (Y) component from a single rgb source line */ |
599 | | |
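/* How the dot product in the vectorised loop below works (explanatory sketch):
 * with BGRX byte order the Y factor vector lines up as 9, 92, 27, 0 against
 * B, G, R, X, so _mm_maddubs_epi16 produces the 16-bit pairs (9*B + 92*G) and
 * (27*R + 0*X) for each pixel, and _mm_hadds_epi16 then adds the pairs into
 * the full 27*R + 92*G + 9*B sum, which is shifted right by Y_SHIFT to give Y. */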
600 | | static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width) |
601 | 0 | { |
602 | 0 | const __m128i y_factors = BGRX_Y_FACTORS; |
603 | 0 | const __m128i* argb = (const __m128i*)src; |
604 | 0 | __m128i* ydst = (__m128i*)dst; |
605 | |
|
606 | 0 | UINT32 x = 0; |
607 | |
|
608 | 0 | for (; x < width - width % 16; x += 16) |
609 | 0 | { |
610 | | /* store 16 rgba pixels in 4 128 bit registers */ |
611 | 0 | __m128i x0 = LOAD_SI128(argb++); // 1st 4 pixels |
612 | 0 | { |
613 | 0 | x0 = _mm_maddubs_epi16(x0, y_factors); |
614 | |
|
615 | 0 | __m128i x1 = LOAD_SI128(argb++); // 2nd 4 pixels |
616 | 0 | x1 = _mm_maddubs_epi16(x1, y_factors); |
617 | 0 | x0 = _mm_hadds_epi16(x0, x1); |
618 | 0 | x0 = _mm_srli_epi16(x0, Y_SHIFT); |
619 | 0 | } |
620 | |
|
621 | 0 | __m128i x2 = LOAD_SI128(argb++); // 3rd 4 pixels |
622 | 0 | { |
623 | 0 | x2 = _mm_maddubs_epi16(x2, y_factors); |
624 | |
|
625 | 0 | __m128i x3 = LOAD_SI128(argb++); // 4th 4 pixels |
626 | 0 | x3 = _mm_maddubs_epi16(x3, y_factors); |
627 | 0 | x2 = _mm_hadds_epi16(x2, x3); |
628 | 0 | x2 = _mm_srli_epi16(x2, Y_SHIFT); |
629 | 0 | } |
630 | |
|
631 | 0 | x0 = _mm_packus_epi16(x0, x2); |
632 | | /* save to y plane */ |
633 | 0 | STORE_SI128(ydst++, x0); |
634 | 0 | } |
635 | |
|
636 | 0 | for (; x < width; x++) |
637 | 0 | { |
638 | 0 | sse41_BGRX_TO_YUV(&src[4ULL * x], &dst[x], NULL, NULL); |
639 | 0 | } |
640 | 0 | } |
641 | | |
642 | | /* compute the chrominance (UV) components from two rgb source lines */ |
643 | | |
644 | | static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1, |
645 | | const BYTE* WINPR_RESTRICT src2, |
646 | | BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2, |
647 | | UINT32 width) |
648 | 0 | { |
649 | 0 | const __m128i u_factors = BGRX_U_FACTORS; |
650 | 0 | const __m128i v_factors = BGRX_V_FACTORS; |
651 | 0 | const __m128i vector128 = CONST128_FACTORS; |
652 | |
|
653 | 0 | size_t x = 0; |
654 | |
|
655 | 0 | for (; x < width - width % 16; x += 16) |
656 | 0 | { |
657 | 0 | const __m128i* rgb1 = (const __m128i*)&src1[4ULL * x]; |
658 | 0 | const __m128i* rgb2 = (const __m128i*)&src2[4ULL * x]; |
659 | 0 | __m64* udst = (__m64*)&dst1[x / 2]; |
660 | 0 | __m64* vdst = (__m64*)&dst2[x / 2]; |
661 | | |
662 | | /* subsample 16x2 pixels into 16x1 pixels */ |
663 | 0 | __m128i x0 = LOAD_SI128(&rgb1[0]); |
664 | 0 | __m128i x4 = LOAD_SI128(&rgb2[0]); |
665 | 0 | x0 = _mm_avg_epu8(x0, x4); |
666 | |
|
667 | 0 | __m128i x1 = LOAD_SI128(&rgb1[1]); |
668 | 0 | x4 = LOAD_SI128(&rgb2[1]); |
669 | 0 | x1 = _mm_avg_epu8(x1, x4); |
670 | |
|
671 | 0 | __m128i x2 = LOAD_SI128(&rgb1[2]); |
672 | 0 | x4 = LOAD_SI128(&rgb2[2]); |
673 | 0 | x2 = _mm_avg_epu8(x2, x4); |
674 | |
|
675 | 0 | __m128i x3 = LOAD_SI128(&rgb1[3]); |
676 | 0 | x4 = LOAD_SI128(&rgb2[3]); |
677 | 0 | x3 = _mm_avg_epu8(x3, x4); |
678 | | |
679 | | /* subsample these 16x1 pixels into 8x1 pixels */ |
680 | | /** |
681 | | * shuffle controls |
682 | | * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88 |
683 | | * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd |
684 | | */ |
685 | 0 | x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88)); |
686 | 0 | x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd)); |
687 | 0 | x0 = _mm_avg_epu8(x0, x4); |
688 | 0 | x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88)); |
689 | 0 | x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd)); |
690 | 0 | x1 = _mm_avg_epu8(x1, x4); |
691 | | /* multiplications and subtotals */ |
692 | 0 | x2 = _mm_maddubs_epi16(x0, u_factors); |
693 | 0 | x3 = _mm_maddubs_epi16(x1, u_factors); |
694 | 0 | x4 = _mm_maddubs_epi16(x0, v_factors); |
695 | 0 | __m128i x5 = _mm_maddubs_epi16(x1, v_factors); |
696 | | /* the total sums */ |
697 | 0 | x0 = _mm_hadd_epi16(x2, x3); |
698 | 0 | x1 = _mm_hadd_epi16(x4, x5); |
699 | | /* shift the results */ |
700 | 0 | x0 = _mm_srai_epi16(x0, U_SHIFT); |
701 | 0 | x1 = _mm_srai_epi16(x1, V_SHIFT); |
702 | | /* pack the 16 words into bytes */ |
703 | 0 | x0 = _mm_packs_epi16(x0, x1); |
704 | | /* add 128 */ |
705 | 0 | x0 = _mm_sub_epi8(x0, vector128); |
706 | | /* the lower 8 bytes go to the u plane */ |
707 | 0 | _mm_storel_pi(udst, _mm_castsi128_ps(x0)); |
708 | | /* the upper 8 bytes go to the v plane */ |
709 | 0 | _mm_storeh_pi(vdst, _mm_castsi128_ps(x0)); |
710 | 0 | } |
711 | |
|
712 | 0 | for (; x < width - width % 2; x += 2) |
713 | 0 | { |
714 | 0 | BYTE u[4] = { 0 }; |
715 | 0 | BYTE v[4] = { 0 }; |
716 | 0 | sse41_BGRX_TO_YUV(&src1[4ULL * x], NULL, &u[0], &v[0]); |
717 | 0 | sse41_BGRX_TO_YUV(&src1[4ULL * (1ULL + x)], NULL, &u[1], &v[1]); |
718 | 0 | sse41_BGRX_TO_YUV(&src2[4ULL * x], NULL, &u[2], &v[2]); |
719 | 0 | sse41_BGRX_TO_YUV(&src2[4ULL * (1ULL + x)], NULL, &u[3], &v[3]); |
720 | 0 | const INT16 u4 = WINPR_ASSERTING_INT_CAST(INT16, (INT16)u[0] + u[1] + u[2] + u[3]); |
721 | 0 | const INT16 uu = WINPR_ASSERTING_INT_CAST(INT16, u4 / 4); |
722 | 0 | const BYTE u8 = CLIP(uu); |
723 | 0 | dst1[x / 2] = u8; |
724 | |
|
725 | 0 | const INT16 v4 = WINPR_ASSERTING_INT_CAST(INT16, (INT16)v[0] + v[1] + v[2] + v[3]); |
726 | 0 | const INT16 vu = WINPR_ASSERTING_INT_CAST(INT16, v4 / 4); |
727 | 0 | const BYTE v8 = CLIP(vu); |
728 | 0 | dst2[x / 2] = v8; |
729 | 0 | } |
730 | 0 | } |
731 | | |
732 | | static pstatus_t sse41_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep, |
733 | | BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[], |
734 | | const prim_size_t* WINPR_RESTRICT roi) |
735 | 0 | { |
736 | 0 | if (roi->height < 1 || roi->width < 1) |
737 | 0 | { |
738 | 0 | return !PRIMITIVES_SUCCESS; |
739 | 0 | } |
740 | | |
741 | 0 | size_t y = 0; |
742 | 0 | for (; y < roi->height - roi->height % 2; y += 2) |
743 | 0 | { |
744 | 0 | const BYTE* line1 = &pSrc[y * srcStep]; |
745 | 0 | const BYTE* line2 = &pSrc[(1ULL + y) * srcStep]; |
746 | 0 | BYTE* ydst1 = &pDst[0][y * dstStep[0]]; |
747 | 0 | BYTE* ydst2 = &pDst[0][(1ULL + y) * dstStep[0]]; |
748 | 0 | BYTE* udst = &pDst[1][y / 2 * dstStep[1]]; |
749 | 0 | BYTE* vdst = &pDst[2][y / 2 * dstStep[2]]; |
750 | |
|
751 | 0 | sse41_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width); |
752 | 0 | sse41_RGBToYUV420_BGRX_Y(line1, ydst1, roi->width); |
753 | 0 | sse41_RGBToYUV420_BGRX_Y(line2, ydst2, roi->width); |
754 | 0 | } |
755 | |
|
756 | 0 | for (; y < roi->height; y++) |
757 | 0 | { |
758 | 0 | const BYTE* line = &pSrc[y * srcStep]; |
759 | 0 | BYTE* ydst = &pDst[0][1ULL * y * dstStep[0]]; |
760 | 0 | sse41_RGBToYUV420_BGRX_Y(line, ydst, roi->width); |
761 | 0 | } |
762 | |
|
763 | 0 | return PRIMITIVES_SUCCESS; |
764 | 0 | } |
765 | | |
766 | | static pstatus_t sse41_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
767 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[], |
768 | | const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi) |
769 | 0 | { |
770 | 0 | switch (srcFormat) |
771 | 0 | { |
772 | 0 | case PIXEL_FORMAT_BGRX32: |
773 | 0 | case PIXEL_FORMAT_BGRA32: |
774 | 0 | return sse41_RGBToYUV420_BGRX(pSrc, srcStep, pDst, dstStep, roi); |
775 | | |
776 | 0 | default: |
777 | 0 | return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); |
778 | 0 | } |
779 | 0 | } |
780 | | |
781 | | /****************************************************************************/ |
782 | | /* sse41 RGB -> AVC444-YUV conversion **/ |
783 | | /****************************************************************************/ |
784 | | |
785 | | static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW( |
786 | | const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, |
787 | | BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2, |
788 | | BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5, |
789 | | BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width) |
790 | 0 | { |
791 | 0 | const __m128i* argbEven = (const __m128i*)srcEven; |
792 | 0 | const __m128i* argbOdd = (const __m128i*)srcOdd; |
793 | 0 | const __m128i y_factors = BGRX_Y_FACTORS; |
794 | 0 | const __m128i u_factors = BGRX_U_FACTORS; |
795 | 0 | const __m128i v_factors = BGRX_V_FACTORS; |
796 | 0 | const __m128i vector128 = CONST128_FACTORS; |
797 | |
|
798 | 0 | UINT32 x = 0; |
799 | 0 | for (; x < width - width % 16; x += 16) |
800 | 0 | { |
801 | | /* store 16 rgba pixels in 4 128 bit registers */ |
802 | 0 | const __m128i xe1 = LOAD_SI128(argbEven++); // 1st 4 pixels |
803 | 0 | const __m128i xe2 = LOAD_SI128(argbEven++); // 2nd 4 pixels |
804 | 0 | const __m128i xe3 = LOAD_SI128(argbEven++); // 3rd 4 pixels |
805 | 0 | const __m128i xe4 = LOAD_SI128(argbEven++); // 4th 4 pixels |
806 | 0 | const __m128i xo1 = LOAD_SI128(argbOdd++); // 1st 4 pixels |
807 | 0 | const __m128i xo2 = LOAD_SI128(argbOdd++); // 2nd 4 pixels |
808 | 0 | const __m128i xo3 = LOAD_SI128(argbOdd++); // 3rd 4 pixels |
809 | 0 | const __m128i xo4 = LOAD_SI128(argbOdd++); // 4th 4 pixels |
810 | 0 | { |
811 | | /* Y: multiplications with subtotals and horizontal sums */ |
812 | 0 | const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors), |
813 | 0 | _mm_maddubs_epi16(xe2, y_factors)), |
814 | 0 | Y_SHIFT); |
815 | 0 | const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors), |
816 | 0 | _mm_maddubs_epi16(xe4, y_factors)), |
817 | 0 | Y_SHIFT); |
818 | 0 | const __m128i ye = _mm_packus_epi16(ye1, ye2); |
819 | 0 | const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors), |
820 | 0 | _mm_maddubs_epi16(xo2, y_factors)), |
821 | 0 | Y_SHIFT); |
822 | 0 | const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors), |
823 | 0 | _mm_maddubs_epi16(xo4, y_factors)), |
824 | 0 | Y_SHIFT); |
825 | 0 | const __m128i yo = _mm_packus_epi16(yo1, yo2); |
826 | | /* store y [b1] */ |
827 | 0 | STORE_SI128(b1Even, ye); |
828 | 0 | b1Even += 16; |
829 | |
|
830 | 0 | if (b1Odd) |
831 | 0 | { |
832 | 0 | STORE_SI128(b1Odd, yo); |
833 | 0 | b1Odd += 16; |
834 | 0 | } |
835 | 0 | } |
836 | 0 | { |
837 | | /* We have now |
838 | | * 16 even U values in ue |
839 | | * 16 odd U values in uo |
840 | | * |
841 | | * We need to split these according to |
842 | | * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */ |
843 | 0 | __m128i ue; |
844 | 0 | __m128i uo = { 0 }; |
845 | 0 | { |
846 | 0 | const __m128i ue1 = |
847 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors), |
848 | 0 | _mm_maddubs_epi16(xe2, u_factors)), |
849 | 0 | U_SHIFT); |
850 | 0 | const __m128i ue2 = |
851 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors), |
852 | 0 | _mm_maddubs_epi16(xe4, u_factors)), |
853 | 0 | U_SHIFT); |
854 | 0 | ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128); |
855 | 0 | } |
856 | |
|
857 | 0 | if (b1Odd) |
858 | 0 | { |
859 | 0 | const __m128i uo1 = |
860 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors), |
861 | 0 | _mm_maddubs_epi16(xo2, u_factors)), |
862 | 0 | U_SHIFT); |
863 | 0 | const __m128i uo2 = |
864 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors), |
865 | 0 | _mm_maddubs_epi16(xo4, u_factors)), |
866 | 0 | U_SHIFT); |
867 | 0 | uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128); |
868 | 0 | } |
869 | | |
870 | | /* Now we need the following storage distribution: |
871 | | * 2x 2y -> b2 |
872 | | * x 2y+1 -> b4 |
873 | | * 2x+1 2y -> b6 */ |
874 | 0 | if (b1Odd) /* b2 */ |
875 | 0 | { |
876 | 0 | const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128()); |
877 | 0 | const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128()); |
878 | 0 | const __m128i hi = _mm_add_epi16(ueh, uoh); |
879 | 0 | const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128()); |
880 | 0 | const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128()); |
881 | 0 | const __m128i lo = _mm_add_epi16(uel, uol); |
882 | 0 | const __m128i added = _mm_hadd_epi16(lo, hi); |
883 | 0 | const __m128i avg16 = _mm_srai_epi16(added, 2); |
884 | 0 | const __m128i avg = _mm_packus_epi16(avg16, avg16); |
885 | 0 | _mm_storel_epi64((__m128i*)b2, avg); |
886 | 0 | } |
887 | 0 | else |
888 | 0 | { |
889 | 0 | const __m128i mask = |
890 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
891 | 0 | (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); |
892 | 0 | const __m128i ud = _mm_shuffle_epi8(ue, mask); |
893 | 0 | _mm_storel_epi64((__m128i*)b2, ud); |
894 | 0 | } |
895 | |
|
896 | 0 | b2 += 8; |
897 | |
|
898 | 0 | if (b1Odd) /* b4 */ |
899 | 0 | { |
900 | 0 | STORE_SI128(b4, uo); |
901 | 0 | b4 += 16; |
902 | 0 | } |
903 | |
|
904 | 0 | { |
905 | | /* b6 */ |
906 | 0 | const __m128i mask = |
907 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
908 | 0 | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
909 | 0 | const __m128i ude = _mm_shuffle_epi8(ue, mask); |
910 | 0 | _mm_storel_epi64((__m128i*)b6, ude); |
911 | 0 | b6 += 8; |
912 | 0 | } |
913 | 0 | } |
914 | 0 | { |
915 | | /* We have now |
916 | | * 16 even V values in ve |
917 | | * 16 odd V values in vo |
918 | | * |
919 | | * We need to split these according to |
920 | | * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */ |
921 | 0 | __m128i ve; |
922 | 0 | __m128i vo = { 0 }; |
923 | 0 | { |
924 | 0 | const __m128i ve1 = |
925 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors), |
926 | 0 | _mm_maddubs_epi16(xe2, v_factors)), |
927 | 0 | V_SHIFT); |
928 | 0 | const __m128i ve2 = |
929 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors), |
930 | 0 | _mm_maddubs_epi16(xe4, v_factors)), |
931 | 0 | V_SHIFT); |
932 | 0 | ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128); |
933 | 0 | } |
934 | |
|
935 | 0 | if (b1Odd) |
936 | 0 | { |
937 | 0 | const __m128i vo1 = |
938 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors), |
939 | 0 | _mm_maddubs_epi16(xo2, v_factors)), |
940 | 0 | V_SHIFT); |
941 | 0 | const __m128i vo2 = |
942 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors), |
943 | 0 | _mm_maddubs_epi16(xo4, v_factors)), |
944 | 0 | V_SHIFT); |
945 | 0 | vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128); |
946 | 0 | } |
947 | | |
948 | | /* Now we need the following storage distribution: |
949 | | * 2x 2y -> b3 |
950 | | * x 2y+1 -> b5 |
951 | | * 2x+1 2y -> b7 */ |
952 | 0 | if (b1Odd) /* b3 */ |
953 | 0 | { |
954 | 0 | const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128()); |
955 | 0 | const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128()); |
956 | 0 | const __m128i hi = _mm_add_epi16(veh, voh); |
957 | 0 | const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128()); |
958 | 0 | const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128()); |
959 | 0 | const __m128i lo = _mm_add_epi16(vel, vol); |
960 | 0 | const __m128i added = _mm_hadd_epi16(lo, hi); |
961 | 0 | const __m128i avg16 = _mm_srai_epi16(added, 2); |
962 | 0 | const __m128i avg = _mm_packus_epi16(avg16, avg16); |
963 | 0 | _mm_storel_epi64((__m128i*)b3, avg); |
964 | 0 | } |
965 | 0 | else |
966 | 0 | { |
967 | 0 | const __m128i mask = |
968 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
969 | 0 | (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); |
970 | 0 | const __m128i vd = _mm_shuffle_epi8(ve, mask); |
971 | 0 | _mm_storel_epi64((__m128i*)b3, vd); |
972 | 0 | } |
973 | |
|
974 | 0 | b3 += 8; |
975 | |
|
976 | 0 | if (b1Odd) /* b5 */ |
977 | 0 | { |
978 | 0 | STORE_SI128(b5, vo); |
979 | 0 | b5 += 16; |
980 | 0 | } |
981 | |
|
982 | 0 | { |
983 | | /* b7 */ |
984 | 0 | const __m128i mask = |
985 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
986 | 0 | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
987 | 0 | const __m128i vde = _mm_shuffle_epi8(ve, mask); |
988 | 0 | _mm_storel_epi64((__m128i*)b7, vde); |
989 | 0 | b7 += 8; |
990 | 0 | } |
991 | 0 | } |
992 | 0 | } |
993 | |
|
994 | 0 | general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, |
995 | 0 | b7, width); |
996 | 0 | } |
997 | | |
998 | | static pstatus_t sse41_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, |
999 | | WINPR_ATTR_UNUSED UINT32 srcFormat, UINT32 srcStep, |
1000 | | BYTE* WINPR_RESTRICT pDst1[], const UINT32 dst1Step[], |
1001 | | BYTE* WINPR_RESTRICT pDst2[], const UINT32 dst2Step[], |
1002 | | const prim_size_t* WINPR_RESTRICT roi) |
1003 | 0 | { |
1004 | 0 | if (roi->height < 1 || roi->width < 1) |
1005 | 0 | return !PRIMITIVES_SUCCESS; |
1006 | | |
1007 | 0 | size_t y = 0; |
1008 | 0 | for (; y < roi->height - roi->height % 2; y += 2) |
1009 | 0 | { |
1010 | 0 | const BYTE* srcEven = pSrc + y * srcStep; |
1011 | 0 | const BYTE* srcOdd = pSrc + (y + 1) * srcStep; |
1012 | 0 | const size_t i = y >> 1; |
1013 | 0 | const size_t n = (i & (size_t)~7) + i; |
1014 | 0 | BYTE* b1Even = pDst1[0] + y * dst1Step[0]; |
1015 | 0 | BYTE* b1Odd = (b1Even + dst1Step[0]); |
1016 | 0 | BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1]; |
1017 | 0 | BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2]; |
1018 | 0 | BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n; |
1019 | 0 | BYTE* b5 = b4 + 8ULL * dst2Step[0]; |
1020 | 0 | BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1]; |
1021 | 0 | BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2]; |
1022 | 0 | sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7, |
1023 | 0 | roi->width); |
1024 | 0 | } |
1025 | |
|
1026 | 0 | for (; y < roi->height; y++) |
1027 | 0 | { |
1028 | 0 | const BYTE* srcEven = pSrc + y * srcStep; |
1029 | 0 | BYTE* b1Even = pDst1[0] + y * dst1Step[0]; |
1030 | 0 | BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1]; |
1031 | 0 | BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2]; |
1032 | 0 | BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1]; |
1033 | 0 | BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2]; |
1034 | 0 | general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(0, srcEven, NULL, b1Even, NULL, b2, b3, NULL, NULL, |
1035 | 0 | b6, b7, roi->width); |
1036 | 0 | } |
1037 | |
|
1038 | 0 | return PRIMITIVES_SUCCESS; |
1039 | 0 | } |
1040 | | |
1041 | | static pstatus_t sse41_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
1042 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
1043 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
1044 | | const UINT32 dst2Step[], |
1045 | | const prim_size_t* WINPR_RESTRICT roi) |
1046 | 0 | { |
1047 | 0 | switch (srcFormat) |
1048 | 0 | { |
1049 | 0 | case PIXEL_FORMAT_BGRX32: |
1050 | 0 | case PIXEL_FORMAT_BGRA32: |
1051 | 0 | return sse41_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
1052 | 0 | dst2Step, roi); |
1053 | | |
1054 | 0 | default: |
1055 | 0 | return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
1056 | 0 | dst2Step, roi); |
1057 | 0 | } |
1058 | 0 | } |
1059 | | |
1060 | | /* Mapping of arguments: |
1061 | | * |
1062 | | * b1 [even lines] -> yLumaDstEven |
1063 | | * b1 [odd lines] -> yLumaDstOdd |
1064 | | * b2 -> uLumaDst |
1065 | | * b3 -> vLumaDst |
1066 | | * b4 -> yChromaDst1 |
1067 | | * b5 -> yChromaDst2 |
1068 | | * b6 -> uChromaDst1 |
1069 | | * b7 -> uChromaDst2 |
1070 | | * b8 -> vChromaDst1 |
1071 | | * b9 -> vChromaDst2 |
1072 | | */ |
1073 | | static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW( |
1074 | | const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, |
1075 | | BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd, |
1076 | | BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst, |
1077 | | BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2, |
1078 | | BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2, |
1079 | | BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2, |
1080 | | BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width) |
1081 | 0 | { |
1082 | 0 | const __m128i vector128 = CONST128_FACTORS; |
1083 | 0 | const __m128i* argbEven = (const __m128i*)srcEven; |
1084 | 0 | const __m128i* argbOdd = (const __m128i*)srcOdd; |
1085 | |
|
1086 | 0 | UINT32 x = 0; |
1087 | 0 | for (; x < width - width % 16; x += 16) |
1088 | 0 | { |
1089 | | /* store 16 rgba pixels in 4 128 bit registers |
1090 | | * for even and odd rows. |
1091 | | */ |
1092 | 0 | const __m128i xe1 = LOAD_SI128(argbEven++); /* 1st 4 pixels */ |
1093 | 0 | const __m128i xe2 = LOAD_SI128(argbEven++); /* 2nd 4 pixels */ |
1094 | 0 | const __m128i xe3 = LOAD_SI128(argbEven++); /* 3rd 4 pixels */ |
1095 | 0 | const __m128i xe4 = LOAD_SI128(argbEven++); /* 4th 4 pixels */ |
1096 | 0 | const __m128i xo1 = LOAD_SI128(argbOdd++); /* 1st 4 pixels */ |
1097 | 0 | const __m128i xo2 = LOAD_SI128(argbOdd++); /* 2nd 4 pixels */ |
1098 | 0 | const __m128i xo3 = LOAD_SI128(argbOdd++); /* 3rd 4 pixels */ |
1099 | 0 | const __m128i xo4 = LOAD_SI128(argbOdd++); /* 4th 4 pixels */ |
1100 | 0 | { |
1101 | | /* Y: multiplications with subtotals and horizontal sums */ |
1102 | 0 | const __m128i y_factors = BGRX_Y_FACTORS; |
1103 | 0 | const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors), |
1104 | 0 | _mm_maddubs_epi16(xe2, y_factors)), |
1105 | 0 | Y_SHIFT); |
1106 | 0 | const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors), |
1107 | 0 | _mm_maddubs_epi16(xe4, y_factors)), |
1108 | 0 | Y_SHIFT); |
1109 | 0 | const __m128i ye = _mm_packus_epi16(ye1, ye2); |
1110 | | /* store y [b1] */ |
1111 | 0 | STORE_SI128(yLumaDstEven, ye); |
1112 | 0 | yLumaDstEven += 16; |
1113 | 0 | } |
1114 | |
|
1115 | 0 | if (yLumaDstOdd) |
1116 | 0 | { |
1117 | 0 | const __m128i y_factors = BGRX_Y_FACTORS; |
1118 | 0 | const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors), |
1119 | 0 | _mm_maddubs_epi16(xo2, y_factors)), |
1120 | 0 | Y_SHIFT); |
1121 | 0 | const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors), |
1122 | 0 | _mm_maddubs_epi16(xo4, y_factors)), |
1123 | 0 | Y_SHIFT); |
1124 | 0 | const __m128i yo = _mm_packus_epi16(yo1, yo2); |
1125 | 0 | STORE_SI128(yLumaDstOdd, yo); |
1126 | 0 | yLumaDstOdd += 16; |
1127 | 0 | } |
1128 | |
|
1129 | 0 | { |
1130 | | /* We have now |
1131 | | * 16 even U values in ue |
1132 | | * 16 odd U values in uo |
1133 | | * |
1134 | | * We need to split these according to |
1135 | | * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */ |
1136 | | /* U: multiplications with subtotals and horizontal sums */ |
1137 | 0 | __m128i ue; |
1138 | 0 | __m128i uo; |
1139 | 0 | __m128i uavg; |
1140 | 0 | { |
1141 | 0 | const __m128i u_factors = BGRX_U_FACTORS; |
1142 | 0 | const __m128i ue1 = |
1143 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors), |
1144 | 0 | _mm_maddubs_epi16(xe2, u_factors)), |
1145 | 0 | U_SHIFT); |
1146 | 0 | const __m128i ue2 = |
1147 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors), |
1148 | 0 | _mm_maddubs_epi16(xe4, u_factors)), |
1149 | 0 | U_SHIFT); |
1150 | 0 | const __m128i ueavg = _mm_hadd_epi16(ue1, ue2); |
1151 | 0 | ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128); |
1152 | 0 | uavg = ueavg; |
1153 | 0 | } |
1154 | 0 | { |
1155 | 0 | const __m128i u_factors = BGRX_U_FACTORS; |
1156 | 0 | const __m128i uo1 = |
1157 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors), |
1158 | 0 | _mm_maddubs_epi16(xo2, u_factors)), |
1159 | 0 | U_SHIFT); |
1160 | 0 | const __m128i uo2 = |
1161 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors), |
1162 | 0 | _mm_maddubs_epi16(xo4, u_factors)), |
1163 | 0 | U_SHIFT); |
1164 | 0 | const __m128i uoavg = _mm_hadd_epi16(uo1, uo2); |
1165 | 0 | uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128); |
1166 | 0 | uavg = _mm_add_epi16(uavg, uoavg); |
1167 | 0 | uavg = _mm_srai_epi16(uavg, 2); |
1168 | 0 | uavg = _mm_packs_epi16(uavg, uoavg); |
1169 | 0 | uavg = _mm_sub_epi8(uavg, vector128); |
1170 | 0 | } |
1171 | | /* Now we need the following storage distribution: |
1172 | | * 2x 2y -> uLumaDst |
1173 | | * 2x+1 y -> yChromaDst1 |
1174 | | * 4x 2y+1 -> uChromaDst1 |
1175 | | * 4x+2 2y+1 -> vChromaDst1 */ |
1176 | 0 | { |
1177 | 0 | const __m128i mask = |
1178 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1179 | 0 | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
1180 | 0 | const __m128i ude = _mm_shuffle_epi8(ue, mask); |
1181 | 0 | _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude); |
1182 | 0 | yEvenChromaDst1 += 8; |
1183 | 0 | } |
1184 | |
|
1185 | 0 | if (yLumaDstOdd) |
1186 | 0 | { |
1187 | 0 | const __m128i mask = |
1188 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1189 | 0 | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
1190 | 0 | const __m128i udo /* codespell:ignore udo */ = _mm_shuffle_epi8(uo, mask); |
1191 | 0 | _mm_storel_epi64((__m128i*)yOddChromaDst1, udo); // codespell:ignore udo |
1192 | 0 | yOddChromaDst1 += 8; |
1193 | 0 | } |
1194 | |
|
1195 | 0 | if (yLumaDstOdd) |
1196 | 0 | { |
1197 | 0 | const __m128i mask = |
1198 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1199 | 0 | (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0); |
1200 | 0 | const __m128i ud = _mm_shuffle_epi8(uo, mask); |
1201 | 0 | int* uDst1 = (int*)uChromaDst1; |
1202 | 0 | int* vDst1 = (int*)vChromaDst1; |
1203 | 0 | const int* src = (const int*)&ud; |
1204 | 0 | _mm_stream_si32(uDst1, src[0]); |
1205 | 0 | _mm_stream_si32(vDst1, src[1]); |
1206 | 0 | uChromaDst1 += 4; |
1207 | 0 | vChromaDst1 += 4; |
1208 | 0 | } |
1209 | |
|
1210 | 0 | if (yLumaDstOdd) |
1211 | 0 | { |
1212 | 0 | _mm_storel_epi64((__m128i*)uLumaDst, uavg); |
1213 | 0 | uLumaDst += 8; |
1214 | 0 | } |
1215 | 0 | else |
1216 | 0 | { |
1217 | 0 | const __m128i mask = |
1218 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1219 | 0 | (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); |
1220 | 0 | const __m128i ud = _mm_shuffle_epi8(ue, mask); |
1221 | 0 | _mm_storel_epi64((__m128i*)uLumaDst, ud); |
1222 | 0 | uLumaDst += 8; |
1223 | 0 | } |
1224 | 0 | } |
1225 | |
|
1226 | 0 | { |
1227 | | /* V: multiplications with subtotals and horizontal sums */ |
1228 | 0 | __m128i ve; |
1229 | 0 | __m128i vo; |
1230 | 0 | __m128i vavg; |
1231 | 0 | { |
1232 | 0 | const __m128i v_factors = BGRX_V_FACTORS; |
1233 | 0 | const __m128i ve1 = |
1234 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors), |
1235 | 0 | _mm_maddubs_epi16(xe2, v_factors)), |
1236 | 0 | V_SHIFT); |
1237 | 0 | const __m128i ve2 = |
1238 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors), |
1239 | 0 | _mm_maddubs_epi16(xe4, v_factors)), |
1240 | 0 | V_SHIFT); |
1241 | 0 | const __m128i veavg = _mm_hadd_epi16(ve1, ve2); |
1242 | 0 | ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128); |
1243 | 0 | vavg = veavg; |
1244 | 0 | } |
1245 | 0 | { |
1246 | 0 | const __m128i v_factors = BGRX_V_FACTORS; |
1247 | 0 | const __m128i vo1 = |
1248 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors), |
1249 | 0 | _mm_maddubs_epi16(xo2, v_factors)), |
1250 | 0 | V_SHIFT); |
1251 | 0 | const __m128i vo2 = |
1252 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors), |
1253 | 0 | _mm_maddubs_epi16(xo4, v_factors)), |
1254 | 0 | V_SHIFT); |
1255 | 0 | const __m128i voavg = _mm_hadd_epi16(vo1, vo2); |
1256 | 0 | vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128); |
1257 | 0 | vavg = _mm_add_epi16(vavg, voavg); |
1258 | 0 | vavg = _mm_srai_epi16(vavg, 2); |
1259 | 0 | vavg = _mm_packs_epi16(vavg, voavg); |
1260 | 0 | vavg = _mm_sub_epi8(vavg, vector128); |
1261 | 0 | } |
1262 | | /* Now we need the following storage distribution: |
1263 | | * 2x 2y -> vLumaDst |
1264 | | * 2x+1 y -> yChromaDst2 |
1265 | | * 4x 2y+1 -> uChromaDst2 |
1266 | | * 4x+2 2y+1 -> vChromaDst2 */ |
1267 | 0 | { |
1268 | 0 | const __m128i mask = |
1269 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1270 | 0 | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
1271 | 0 | __m128i vde = _mm_shuffle_epi8(ve, mask); |
1272 | 0 | _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde); |
1273 | 0 | yEvenChromaDst2 += 8; |
1274 | 0 | } |
1275 | |
|
1276 | 0 | if (yLumaDstOdd) |
1277 | 0 | { |
1278 | 0 | const __m128i mask = |
1279 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1280 | 0 | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
1281 | 0 | __m128i vdo = _mm_shuffle_epi8(vo, mask); |
1282 | 0 | _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo); |
1283 | 0 | yOddChromaDst2 += 8; |
1284 | 0 | } |
1285 | |
|
1286 | 0 | if (yLumaDstOdd) |
1287 | 0 | { |
1288 | 0 | const __m128i mask = |
1289 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1290 | 0 | (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0); |
1291 | 0 | const __m128i vd = _mm_shuffle_epi8(vo, mask); |
1292 | 0 | int* uDst2 = (int*)uChromaDst2; |
1293 | 0 | int* vDst2 = (int*)vChromaDst2; |
1294 | 0 | const int* src = (const int*)&vd; |
1295 | 0 | _mm_stream_si32(uDst2, src[0]); |
1296 | 0 | _mm_stream_si32(vDst2, src[1]); |
1297 | 0 | uChromaDst2 += 4; |
1298 | 0 | vChromaDst2 += 4; |
1299 | 0 | } |
1300 | |
|
1301 | 0 | if (yLumaDstOdd) |
1302 | 0 | { |
1303 | 0 | _mm_storel_epi64((__m128i*)vLumaDst, vavg); |
1304 | 0 | vLumaDst += 8; |
1305 | 0 | } |
1306 | 0 | else |
1307 | 0 | { |
1308 | 0 | const __m128i mask = |
1309 | 0 | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1310 | 0 | (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); |
1311 | 0 | __m128i vd = _mm_shuffle_epi8(ve, mask); |
1312 | 0 | _mm_storel_epi64((__m128i*)vLumaDst, vd); |
1313 | 0 | vLumaDst += 8; |
1314 | 0 | } |
1315 | 0 | } |
1316 | 0 | } |
1317 | |
|
1318 | 0 | general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, yLumaDstEven, yLumaDstOdd, |
1319 | 0 | uLumaDst, vLumaDst, yEvenChromaDst1, yEvenChromaDst2, |
1320 | 0 | yOddChromaDst1, yOddChromaDst2, uChromaDst1, |
1321 | 0 | uChromaDst2, vChromaDst1, vChromaDst2, width); |
1322 | 0 | } |
1323 | | |
1324 | | static pstatus_t sse41_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, |
1325 | | WINPR_ATTR_UNUSED UINT32 srcFormat, UINT32 srcStep, |
1326 | | BYTE* WINPR_RESTRICT pDst1[], const UINT32 dst1Step[], |
1327 | | BYTE* WINPR_RESTRICT pDst2[], const UINT32 dst2Step[], |
1328 | | const prim_size_t* WINPR_RESTRICT roi) |
1329 | 0 | { |
1330 | 0 | if (roi->height < 1 || roi->width < 1) |
1331 | 0 | return !PRIMITIVES_SUCCESS; |
1332 | | |
1333 | 0 | size_t y = 0; |
1334 | 0 | for (; y < roi->height - roi->height % 2; y += 2) |
1335 | 0 | { |
1336 | 0 | const BYTE* srcEven = (pSrc + y * srcStep); |
1337 | 0 | const BYTE* srcOdd = (srcEven + srcStep); |
1338 | 0 | BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]); |
1339 | 0 | BYTE* dstLumaYOdd = (dstLumaYEven + dst1Step[0]); |
1340 | 0 | BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]); |
1341 | 0 | BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]); |
1342 | 0 | BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]); |
1343 | 0 | BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2; |
1344 | 0 | BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0]; |
1345 | 0 | BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0]; |
1346 | 0 | BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]); |
1347 | 0 | BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]); |
1348 | 0 | BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4; |
1349 | 0 | BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4; |
1350 | 0 | sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, |
1351 | 0 | dstLumaV, dstEvenChromaY1, dstEvenChromaY2, |
1352 | 0 | dstOddChromaY1, dstOddChromaY2, dstChromaU1, |
1353 | 0 | dstChromaU2, dstChromaV1, dstChromaV2, roi->width); |
1354 | 0 | } |
1355 | |
1356 | 0 | for (; y < roi->height; y++) |
1357 | 0 | { |
1358 | 0 | const BYTE* srcEven = (pSrc + y * srcStep); |
1359 | 0 | BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]); |
1360 | 0 | BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]); |
1361 | 0 | BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]); |
1362 | 0 | BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]); |
1363 | 0 | BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2; |
1364 | 0 | BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]); |
1365 | 0 | BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]); |
1366 | 0 | BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4; |
1367 | 0 | BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4; |
1368 | 0 | general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(0, srcEven, NULL, dstLumaYEven, NULL, dstLumaU, |
1369 | 0 | dstLumaV, dstEvenChromaY1, dstEvenChromaY2, NULL, |
1370 | 0 | NULL, dstChromaU1, dstChromaU2, dstChromaV1, |
1371 | 0 | dstChromaV2, roi->width); |
1372 | 0 | } |
1373 | |
1374 | 0 | return PRIMITIVES_SUCCESS; |
1375 | 0 | } |
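The pointer setup in the loop above packs two half-width chroma-Y areas into each auxiliary Y row and two quarter-width areas into each auxiliary U/V row. A small illustrative program (hypothetical frame size and strides, not part of the source) that prints those offsets with the same arithmetic:

#include <stdio.h>

int main(void)
{
    const unsigned width = 64;      /* hypothetical frame width */
    const unsigned dst2StepY = 64;  /* stride of the auxiliary Y plane */
    const unsigned dst2StepUV = 64; /* stride of the auxiliary U/V planes */
    const unsigned y = 2;           /* an even source row */

    printf("even chroma Y1 at %u, Y2 at %u\n", y * dst2StepY, y * dst2StepY + width / 2);
    printf("odd  chroma Y1 at %u, Y2 at %u\n", (y + 1) * dst2StepY,
           (y + 1) * dst2StepY + width / 2);
    printf("chroma U1/V1 at %u, U2/V2 at %u\n", (y / 2) * dst2StepUV,
           (y / 2) * dst2StepUV + width / 4);
    return 0;
}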
1376 | | |
1377 | | static pstatus_t sse41_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
1378 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
1379 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
1380 | | const UINT32 dst2Step[], |
1381 | | const prim_size_t* WINPR_RESTRICT roi) |
1382 | 0 | { |
1383 | 0 | switch (srcFormat) |
1384 | 0 | { |
1385 | 0 | case PIXEL_FORMAT_BGRX32: |
1386 | 0 | case PIXEL_FORMAT_BGRA32: |
1387 | 0 | return sse41_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
1388 | 0 | dst2Step, roi); |
1389 | | |
1390 | 0 | default: |
1391 | 0 | return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
1392 | 0 | dst2Step, roi); |
1393 | 0 | } |
1394 | 0 | } |
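A hypothetical caller sketch (not from the source) showing how this entry is reached through the primitives table, e.g. the one returned by primitives_get(). The headers are assumed to be the usual FreeRDP public ones, and plane allocation is deliberately oversized (full resolution everywhere) to keep the example short; a real caller sizes the main and auxiliary views according to the AVC444 layout.

#include <stdlib.h>
#include <freerdp/codec/color.h>
#include <freerdp/primitives.h>

static pstatus_t convert_frame_avc444v2(primitives_t* prims, const BYTE* bgrx, UINT32 width,
                                        UINT32 height, UINT32 srcStep)
{
    const UINT32 dst1Step[3] = { width, width, width };
    const UINT32 dst2Step[3] = { width, width, width };
    BYTE* pDst1[3] = { calloc(width, height), calloc(width, height), calloc(width, height) };
    BYTE* pDst2[3] = { calloc(width, height), calloc(width, height), calloc(width, height) };
    const prim_size_t roi = { width, height };
    pstatus_t rc = -1;

    if (pDst1[0] && pDst1[1] && pDst1[2] && pDst2[0] && pDst2[1] && pDst2[2])
        rc = prims->RGBToAVC444YUVv2(bgrx, PIXEL_FORMAT_BGRX32, srcStep, pDst1, dst1Step, pDst2,
                                     dst2Step, &roi);

    for (int i = 0; i < 3; i++)
    {
        free(pDst1[i]);
        free(pDst2[i]);
    }

    return rc;
}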
1395 | | |
1396 | | static pstatus_t sse41_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const UINT32 srcStep[], |
1397 | | BYTE* WINPR_RESTRICT pDstRaw[], const UINT32 dstStep[], |
1398 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1399 | 0 | { |
1400 | 0 | const UINT32 nWidth = roi->right - roi->left; |
1401 | 0 | const UINT32 nHeight = roi->bottom - roi->top; |
1402 | 0 | const UINT32 halfWidth = (nWidth + 1) / 2; |
1403 | 0 | const UINT32 halfPad = halfWidth % 16; |
1404 | 0 | const UINT32 halfHeight = (nHeight + 1) / 2; |
1405 | 0 | const UINT32 oddY = 1; |
1406 | 0 | const UINT32 evenY = 0; |
1407 | 0 | const UINT32 oddX = 1; |
1408 | 0 | const UINT32 evenX = 0; |
1409 | 0 | const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left, |
1410 | 0 | pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2, |
1411 | 0 | pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 }; |
1412 | 0 | BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left, |
1413 | 0 | pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left, |
1414 | 0 | pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left }; |
1415 | | |
1416 | | /* Y data is already here... */ |
1417 | | /* B1 */ |
1418 | 0 | for (size_t y = 0; y < nHeight; y++) |
1419 | 0 | { |
1420 | 0 | const BYTE* Ym = pSrc[0] + y * srcStep[0]; |
1421 | 0 | BYTE* pY = pDst[0] + y * dstStep[0]; |
1422 | 0 | memcpy(pY, Ym, nWidth); |
1423 | 0 | } |
1424 | | |
1425 | | /* The first half of U and V is already part of this frame. */
1426 | | /* B2 and B3 */ |
1427 | 0 | for (size_t y = 0; y < halfHeight; y++) |
1428 | 0 | { |
1429 | 0 | const size_t val2y = (2 * y + evenY); |
1430 | 0 | const size_t val2y1 = val2y + oddY; |
1431 | 0 | const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y; |
1432 | 0 | const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y; |
1433 | 0 | BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y; |
1434 | 0 | BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y; |
1435 | 0 | BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1; |
1436 | 0 | BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1; |
1437 | |
1438 | 0 | size_t x = 0; |
1439 | 0 | for (; x < halfWidth - halfPad; x += 16) |
1440 | 0 | { |
1441 | 0 | const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); |
1442 | 0 | const __m128i unpackLow = |
1443 | 0 | _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8); |
1444 | 0 | { |
1445 | 0 | const __m128i u = LOAD_SI128(&Um[x]); |
1446 | 0 | const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); |
1447 | 0 | const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); |
1448 | 0 | STORE_SI128(&pU[2ULL * x], uHigh); |
1449 | 0 | STORE_SI128(&pU[2ULL * x + 16], uLow); |
1450 | 0 | STORE_SI128(&pU1[2ULL * x], uHigh); |
1451 | 0 | STORE_SI128(&pU1[2ULL * x + 16], uLow); |
1452 | 0 | } |
1453 | 0 | { |
1454 | 0 | const __m128i u = LOAD_SI128(&Vm[x]); |
1455 | 0 | const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); |
1456 | 0 | const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); |
1457 | 0 | STORE_SI128(&pV[2 * x], uHigh); |
1458 | 0 | STORE_SI128(&pV[2 * x + 16], uLow); |
1459 | 0 | STORE_SI128(&pV1[2 * x], uHigh); |
1460 | 0 | STORE_SI128(&pV1[2 * x + 16], uLow); |
1461 | 0 | } |
1462 | 0 | } |
1463 | |
1464 | 0 | for (; x < halfWidth; x++) |
1465 | 0 | { |
1466 | 0 | const size_t val2x = 2 * x + evenX; |
1467 | 0 | const size_t val2x1 = val2x + oddX; |
1468 | 0 | pU[val2x] = Um[x]; |
1469 | 0 | pV[val2x] = Vm[x]; |
1470 | 0 | pU[val2x1] = Um[x]; |
1471 | 0 | pV[val2x1] = Vm[x]; |
1472 | 0 | pU1[val2x] = Um[x]; |
1473 | 0 | pV1[val2x] = Vm[x]; |
1474 | 0 | pU1[val2x1] = Um[x]; |
1475 | 0 | pV1[val2x1] = Vm[x]; |
1476 | 0 | } |
1477 | 0 | } |
1478 | |
1479 | 0 | return PRIMITIVES_SUCCESS; |
1480 | 0 | } |
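The unpackHigh/unpackLow shuffles above replicate each half-resolution U/V sample into two columns of two destination rows, 16 samples per iteration; the scalar tail that follows them does the same one sample at a time. A minimal scalar reference of that 2x2 replication (helper name is hypothetical):

#include <stddef.h>
#include <stdint.h>

static void upsample_chroma_2x2(const uint8_t* src, size_t halfWidth, uint8_t* dstRow0,
                                uint8_t* dstRow1)
{
    for (size_t x = 0; x < halfWidth; x++)
    {
        const uint8_t c = src[x];
        dstRow0[2 * x] = c;     /* even column, even row */
        dstRow0[2 * x + 1] = c; /* odd column,  even row */
        dstRow1[2 * x] = c;     /* even column, odd row  */
        dstRow1[2 * x + 1] = c; /* odd column,  odd row  */
    }
}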
1481 | | |
1482 | | static pstatus_t sse41_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3], |
1483 | | const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3], |
1484 | | const UINT32 dstStep[3], |
1485 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1486 | 0 | { |
1487 | 0 | const UINT32 mod = 16; |
1488 | 0 | UINT32 uY = 0; |
1489 | 0 | UINT32 vY = 0; |
1490 | 0 | const UINT32 nWidth = roi->right - roi->left; |
1491 | 0 | const UINT32 nHeight = roi->bottom - roi->top; |
1492 | 0 | const UINT32 halfWidth = (nWidth + 1) / 2; |
1493 | 0 | const UINT32 halfPad = halfWidth % 16; |
1494 | 0 | const UINT32 halfHeight = (nHeight + 1) / 2; |
1495 | 0 | const UINT32 oddY = 1; |
1496 | 0 | const UINT32 evenY = 0; |
1497 | 0 | const UINT32 oddX = 1; |
1498 | | /* The auxiliary frame is aligned to multiples of 16x16. |
1499 | | * We need the padded height for B4 and B5 conversion. */ |
1500 | 0 | const UINT32 padHeight = nHeight + 16 - nHeight % 16;
1501 | 0 | const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left, |
1502 | 0 | pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2, |
1503 | 0 | pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 }; |
1504 | 0 | BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left, |
1505 | 0 | pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left, |
1506 | 0 | pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left }; |
1507 | 0 | const __m128i zero = _mm_setzero_si128(); |
1508 | 0 | const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, |
1509 | 0 | (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80); |
1510 | | |
1511 | | /* The second half of U and V is a bit more tricky... */ |
1512 | | /* B4 and B5 */ |
1513 | 0 | for (size_t y = 0; y < padHeight; y++)
1514 | 0 | { |
1515 | 0 | const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y; |
1516 | 0 | BYTE* pX = NULL; |
1517 | |
1518 | 0 | if ((y) % mod < (mod + 1) / 2) |
1519 | 0 | { |
1520 | 0 | const UINT32 pos = (2 * uY++ + oddY); |
1521 | |
1522 | 0 | if (pos >= nHeight) |
1523 | 0 | continue; |
1524 | | |
1525 | 0 | pX = pDst[1] + 1ULL * dstStep[1] * pos; |
1526 | 0 | } |
1527 | 0 | else |
1528 | 0 | { |
1529 | 0 | const UINT32 pos = (2 * vY++ + oddY); |
1530 | |
1531 | 0 | if (pos >= nHeight) |
1532 | 0 | continue; |
1533 | | |
1534 | 0 | pX = pDst[2] + 1ULL * dstStep[2] * pos; |
1535 | 0 | } |
1536 | | |
1537 | 0 | memcpy(pX, Ya, nWidth); |
1538 | 0 | } |
1539 | | |
1540 | | /* B6 and B7 */ |
1541 | 0 | for (size_t y = 0; y < halfHeight; y++) |
1542 | 0 | { |
1543 | 0 | const size_t val2y = (y * 2 + evenY); |
1544 | 0 | const BYTE* Ua = pSrc[1] + srcStep[1] * y; |
1545 | 0 | const BYTE* Va = pSrc[2] + srcStep[2] * y; |
1546 | 0 | BYTE* pU = pDst[1] + dstStep[1] * val2y; |
1547 | 0 | BYTE* pV = pDst[2] + dstStep[2] * val2y; |
1548 | |
1549 | 0 | size_t x = 0; |
1550 | 0 | for (; x < halfWidth - halfPad; x += 16) |
1551 | 0 | { |
1552 | 0 | { |
1553 | 0 | const __m128i u = LOAD_SI128(&Ua[x]); |
1554 | 0 | const __m128i u2 = _mm_unpackhi_epi8(u, zero); |
1555 | 0 | const __m128i u1 = _mm_unpacklo_epi8(u, zero); |
1556 | 0 | _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); |
1557 | 0 | _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); |
1558 | 0 | } |
1559 | 0 | { |
1560 | 0 | const __m128i u = LOAD_SI128(&Va[x]); |
1561 | 0 | const __m128i u2 = _mm_unpackhi_epi8(u, zero); |
1562 | 0 | const __m128i u1 = _mm_unpacklo_epi8(u, zero); |
1563 | 0 | _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]); |
1564 | 0 | _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]); |
1565 | 0 | } |
1566 | 0 | } |
1567 | |
1568 | 0 | for (; x < halfWidth; x++) |
1569 | 0 | { |
1570 | 0 | const size_t val2x1 = (x * 2ULL + oddX); |
1571 | 0 | pU[val2x1] = Ua[x]; |
1572 | 0 | pV[val2x1] = Va[x]; |
1573 | 0 | } |
1574 | 0 | } |
1575 | |
1576 | 0 | return PRIMITIVES_SUCCESS; |
1577 | 0 | } |
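The chroma combination passes (this one and the v2 variant that follows) rely on _mm_maskmoveu_si128() to deposit the expanded samples into every other byte of the 4:4:4 row without touching the bytes in between. Ignoring the non-temporal store hint the intrinsic also carries, its byte-selection behaviour reduces to the following scalar model (illustrative helper):

#include <stddef.h>
#include <stdint.h>

static void maskmove_bytes(const uint8_t* src, const uint8_t* mask, uint8_t* dst, size_t n)
{
    for (size_t i = 0; i < n; i++)
    {
        if (mask[i] & 0x80) /* only bytes whose mask entry has the high bit set are stored */
            dst[i] = src[i];
    }
}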
1578 | | |
1579 | | static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3], |
1580 | | UINT32 nTotalWidth, WINPR_ATTR_UNUSED UINT32 nTotalHeight, |
1581 | | BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3], |
1582 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1583 | 0 | { |
1584 | 0 | const UINT32 nWidth = roi->right - roi->left; |
1585 | 0 | const UINT32 nHeight = roi->bottom - roi->top; |
1586 | 0 | const UINT32 halfWidth = (nWidth + 1) / 2; |
1587 | 0 | const UINT32 halfPad = halfWidth % 16; |
1588 | 0 | const UINT32 halfHeight = (nHeight + 1) / 2; |
1589 | 0 | const UINT32 quarterWidth = (nWidth + 3) / 4;
1590 | 0 | const UINT32 quarterPad = quarterWidth % 16;
1591 | 0 | const __m128i zero = _mm_setzero_si128(); |
1592 | 0 | const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, |
1593 | 0 | (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0); |
1594 | 0 | const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, |
1595 | 0 | 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80); |
1596 | 0 | const __m128i shuffle1 = |
1597 | 0 | _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11, |
1598 | 0 | (char)0x80, 10, (char)0x80, 9, (char)0x80, 8); |
1599 | 0 | const __m128i shuffle2 = |
1600 | 0 | _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3, |
1601 | 0 | (char)0x80, 2, (char)0x80, 1, (char)0x80, 0); |
1602 | | |
1603 | | /* B4 and B5: odd UV values for width/2, height */ |
1604 | 0 | for (size_t y = 0; y < nHeight; y++) |
1605 | 0 | { |
1606 | 0 | const size_t yTop = y + roi->top; |
1607 | 0 | const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; |
1608 | 0 | const BYTE* pYaV = pYaU + nTotalWidth / 2; |
1609 | 0 | BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left; |
1610 | 0 | BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left; |
1611 | |
1612 | 0 | size_t x = 0; |
1613 | 0 | for (; x < halfWidth - halfPad; x += 16) |
1614 | 0 | { |
1615 | 0 | { |
1616 | 0 | const __m128i u = LOAD_SI128(&pYaU[x]); |
1617 | 0 | const __m128i u2 = _mm_unpackhi_epi8(zero, u); |
1618 | 0 | const __m128i u1 = _mm_unpacklo_epi8(zero, u); |
1619 | 0 | _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); |
1620 | 0 | _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); |
1621 | 0 | } |
1622 | 0 | { |
1623 | 0 | const __m128i v = LOAD_SI128(&pYaV[x]); |
1624 | 0 | const __m128i v2 = _mm_unpackhi_epi8(zero, v); |
1625 | 0 | const __m128i v1 = _mm_unpacklo_epi8(zero, v); |
1626 | 0 | _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]); |
1627 | 0 | _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]); |
1628 | 0 | } |
1629 | 0 | } |
1630 | |
1631 | 0 | for (; x < halfWidth; x++) |
1632 | 0 | { |
1633 | 0 | const size_t odd = 2ULL * x + 1; |
1634 | 0 | pU[odd] = pYaU[x]; |
1635 | 0 | pV[odd] = pYaV[x]; |
1636 | 0 | } |
1637 | 0 | } |
1638 | | |
1639 | | /* B6 - B9 */ |
1640 | 0 | for (size_t y = 0; y < halfHeight; y++) |
1641 | 0 | { |
1642 | 0 | const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; |
1643 | 0 | const BYTE* pUaV = pUaU + nTotalWidth / 4; |
1644 | 0 | const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; |
1645 | 0 | const BYTE* pVaV = pVaU + nTotalWidth / 4; |
1646 | 0 | BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; |
1647 | 0 | BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; |
1648 | |
1649 | 0 | UINT32 x = 0; |
1650 | 0 | for (; x < quarterWidth - quarterPad; x += 16)
1651 | 0 | { |
1652 | 0 | { |
1653 | 0 | const __m128i uU = LOAD_SI128(&pUaU[x]); |
1654 | 0 | const __m128i uV = LOAD_SI128(&pVaU[x]); |
1655 | 0 | const __m128i uHigh = _mm_unpackhi_epi8(uU, uV); |
1656 | 0 | const __m128i uLow = _mm_unpacklo_epi8(uU, uV); |
1657 | 0 | const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2); |
1658 | 0 | const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1); |
1659 | 0 | const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2); |
1660 | 0 | const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1); |
1661 | 0 | _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]); |
1662 | 0 | _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]); |
1663 | 0 | _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]); |
1664 | 0 | _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]); |
1665 | 0 | } |
1666 | 0 | { |
1667 | 0 | const __m128i vU = LOAD_SI128(&pUaV[x]); |
1668 | 0 | const __m128i vV = LOAD_SI128(&pVaV[x]); |
1669 | 0 | const __m128i vHigh = _mm_unpackhi_epi8(vU, vV); |
1670 | 0 | const __m128i vLow = _mm_unpacklo_epi8(vU, vV); |
1671 | 0 | const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2); |
1672 | 0 | const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1); |
1673 | 0 | const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2); |
1674 | 0 | const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1); |
1675 | 0 | _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]); |
1676 | 0 | _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]); |
1677 | 0 | _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]); |
1678 | 0 | _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]); |
1679 | 0 | } |
1680 | 0 | } |
1681 | |
1682 | 0 | for (; x < quarterWidth; x++)
1683 | 0 | { |
1684 | 0 | pU[4 * x + 0] = pUaU[x]; |
1685 | 0 | pV[4 * x + 0] = pUaV[x]; |
1686 | 0 | pU[4 * x + 2] = pVaU[x]; |
1687 | 0 | pV[4 * x + 2] = pVaV[x]; |
1688 | 0 | } |
1689 | 0 | } |
1690 | |
1691 | 0 | return PRIMITIVES_SUCCESS; |
1692 | 0 | } |
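The B6-B9 vector loop above is a shuffled, masked-store version of its own scalar tail: the four quarter-resolution auxiliary streams land in columns 4x and 4x+2 of the U and V destination rows. A scalar reference (hypothetical helper name):

#include <stddef.h>
#include <stdint.h>

static void scatter_quarter_chroma(const uint8_t* pUaU, const uint8_t* pUaV, const uint8_t* pVaU,
                                   const uint8_t* pVaV, size_t quarterWidth, uint8_t* pU,
                                   uint8_t* pV)
{
    for (size_t x = 0; x < quarterWidth; x++)
    {
        pU[4 * x + 0] = pUaU[x]; /* column 4x   of the odd destination row */
        pV[4 * x + 0] = pUaV[x];
        pU[4 * x + 2] = pVaU[x]; /* column 4x+2 of the odd destination row */
        pV[4 * x + 2] = pVaV[x];
    }
}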
1693 | | |
1694 | | static pstatus_t sse41_YUV420CombineToYUV444(avc444_frame_type type, |
1695 | | const BYTE* WINPR_RESTRICT pSrc[3], |
1696 | | const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight, |
1697 | | BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3], |
1698 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1699 | 0 | { |
1700 | 0 | if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) |
1701 | 0 | return -1; |
1702 | | |
1703 | 0 | if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) |
1704 | 0 | return -1; |
1705 | | |
1706 | 0 | if (!roi) |
1707 | 0 | return -1; |
1708 | | |
1709 | 0 | switch (type) |
1710 | 0 | { |
1711 | 0 | case AVC444_LUMA: |
1712 | 0 | return sse41_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); |
1713 | | |
1714 | 0 | case AVC444_CHROMAv1: |
1715 | 0 | return sse41_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); |
1716 | | |
1717 | 0 | case AVC444_CHROMAv2: |
1718 | 0 | return sse41_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); |
1719 | | |
1720 | 0 | default: |
1721 | 0 | return -1; |
1722 | 0 | } |
1723 | 0 | } |
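A hypothetical call sequence (not from the source) showing how a decoder would drive this dispatcher for an AVC444v1 frame: the main view is combined first with AVC444_LUMA (B1-B3), then the auxiliary view with AVC444_CHROMAv1 (B4-B7), over the same region of interest. The FreeRDP headers and the member order of RECTANGLE_16 (left, top, right, bottom) are assumptions of this sketch.

#include <freerdp/types.h>
#include <freerdp/primitives.h>

static pstatus_t combine_avc444v1_frame(primitives_t* prims, const BYTE* mainView[3],
                                        const UINT32 mainStep[3], const BYTE* auxView[3],
                                        const UINT32 auxStep[3], BYTE* yuv444[3],
                                        const UINT32 dstStep[3], UINT32 width, UINT32 height)
{
    /* Combine the whole frame: left, top, right, bottom. */
    const RECTANGLE_16 roi = { 0, 0, (UINT16)width, (UINT16)height };

    /* Luma pass: copies Y and fills the first half of U/V from the main view. */
    pstatus_t rc = prims->YUV420CombineToYUV444(AVC444_LUMA, mainView, mainStep, width, height,
                                                yuv444, dstStep, &roi);
    if (rc != PRIMITIVES_SUCCESS)
        return rc;

    /* Chroma pass: fills the remaining U/V samples from the auxiliary view. */
    return prims->YUV420CombineToYUV444(AVC444_CHROMAv1, auxView, auxStep, width, height, yuv444,
                                        dstStep, &roi);
}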
1724 | | #endif |
1725 | | |
1726 | | void primitives_init_YUV_sse41_int(primitives_t* WINPR_RESTRICT prims) |
1727 | 0 | { |
1728 | 0 | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
1729 | 0 | generic = primitives_get_generic(); |
1730 | |
1731 | 0 | WLog_VRB(PRIM_TAG, "SSE4.1 optimizations");
1732 | 0 | prims->RGBToYUV420_8u_P3AC4R = sse41_RGBToYUV420; |
1733 | 0 | prims->RGBToAVC444YUV = sse41_RGBToAVC444YUV; |
1734 | 0 | prims->RGBToAVC444YUVv2 = sse41_RGBToAVC444YUVv2; |
1735 | 0 | prims->YUV420ToRGB_8u_P3AC4R = sse41_YUV420ToRGB; |
1736 | 0 | prims->YUV444ToRGB_8u_P3AC4R = sse41_YUV444ToRGB_8u_P3AC4R; |
1737 | 0 | prims->YUV420CombineToYUV444 = sse41_YUV420CombineToYUV444; |
1738 | | #else |
1739 | | WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or sse41 intrinsics not available"); |
1740 | | WINPR_UNUSED(prims); |
1741 | | #endif |
1742 | 0 | } |
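This initializer is only reached once the caller has decided that the SSE4.1 code paths should be used; FreeRDP performs its own runtime CPU detection elsewhere and it is not shown in this file. As a generic illustration only (explicitly not FreeRDP's mechanism), a compiler-builtin guard could look like this:

#include <stdbool.h>

static bool sse41_available(void)
{
#if defined(__GNUC__) || defined(__clang__)
    /* GCC/Clang runtime dispatch helper; other toolchains need their own check. */
    return __builtin_cpu_supports("sse4.1") != 0;
#else
    return false; /* keep the generic primitives */
#endif
}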