/src/FreeRDP/libfreerdp/primitives/sse/prim_YUV_ssse3.c
Line | Count | Source |
1 | | /** |
2 | | * FreeRDP: A Remote Desktop Protocol Implementation |
3 | | * Optimized YUV/RGB conversion operations |
4 | | * |
5 | | * Copyright 2014 Thomas Erbesdobler |
6 | | * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com> |
7 | | * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com> |
8 | | * Copyright 2016-2017 Thincast Technologies GmbH |
9 | | * |
10 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
11 | | * you may not use this file except in compliance with the License. |
12 | | * You may obtain a copy of the License at |
13 | | * |
14 | | * http://www.apache.org/licenses/LICENSE-2.0 |
15 | | * |
16 | | * Unless required by applicable law or agreed to in writing, software |
17 | | * distributed under the License is distributed on an "AS IS" BASIS, |
18 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
19 | | * See the License for the specific language governing permissions and |
20 | | * limitations under the License. |
21 | | */ |
22 | | |
23 | | #include <winpr/wtypes.h> |
24 | | #include <freerdp/config.h> |
25 | | |
26 | | #include <winpr/sysinfo.h> |
27 | | #include <winpr/crt.h> |
28 | | #include <freerdp/types.h> |
29 | | #include <freerdp/primitives.h> |
30 | | |
31 | | #include "prim_internal.h" |
32 | | #include "prim_YUV.h" |
33 | | |
34 | | #if defined(SSE2_ENABLED) |
35 | | #include <emmintrin.h> |
36 | | #include <tmmintrin.h> |
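| | /* emmintrin.h provides the SSE2 intrinsics, tmmintrin.h the SSSE3 ones used here (_mm_shuffle_epi8, _mm_maddubs_epi16, _mm_hadd_epi16, ...) */ |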
37 | | |
38 | | static primitives_t* generic = NULL; |
39 | | |
40 | | /****************************************************************************/ |
41 | | /* SSSE3 YUV420 -> RGB conversion */ |
42 | | /****************************************************************************/ |
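| | /* |
| |  * The fixed-point constants 403, 48, 120 and 475 used in ssse3_YUV444Pixel |
| |  * correspond to the BT.709 inverse transform scaled by 256 (cf. the forward |
| |  * factors documented before the RGB -> YUV section below): |
| |  * |
| |  *  R = Y + 1.5748 * (V - 128)                       -> (256 * Y + 403 * E) >> 8 |
| |  *  G = Y - 0.1873 * (U - 128) - 0.4681 * (V - 128)  -> (256 * Y - 48 * D - 120 * E) >> 8 |
| |  *  B = Y + 1.8556 * (U - 128)                       -> (256 * Y + 475 * D) >> 8 |
| |  * |
| |  * with D = U - 128 and E = V - 128; the mapY shuffle below already places Y |
| |  * so that it is scaled by 256. |
| |  */ |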
43 | | static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw, |
44 | | __m128i Vraw, UINT8 pos) |
45 | | { |
46 | | /* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer list */ |
47 | | /* Note: This also applies to Visual Studio 2013 before Update 4 */ |
48 | | #if !defined(_MSC_VER) || (_MSC_VER > 1600) |
49 | | const __m128i mapY[] = { _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), |
50 | | _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480), |
51 | | _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880), |
52 | | _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) }; |
53 | | const __m128i mapUV[] = { _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080), |
54 | | _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080), |
55 | | _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080), |
56 | | _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) }; |
57 | | const __m128i mask[] = { _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080), |
58 | | _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), |
59 | | _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) }; |
60 | | #else |
61 | | /* Note: must be in little-endian format ! */ |
62 | | const __m128i mapY[] = { { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, |
63 | | 0x80, 0x80, 0x03, 0x80, 0x80 }, |
64 | | { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80, |
65 | | 0x80, 0x80, 0x07, 0x80, 0x80 }, |
66 | | { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80, |
67 | | 0x80, 0x80, 0x0b, 0x80, 0x80 }, |
68 | | { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80, |
69 | | 0x80, 0x80, 0x0f, 0x80, 0x80 } |
70 | | |
71 | | }; |
72 | | const __m128i mapUV[] = { { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01, |
73 | | 0x80, 0x02, 0x80, 0x03, 0x80 }, |
74 | | { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05, |
75 | | 0x80, 0x06, 0x80, 0x07, 0x80 }, |
76 | | { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09, |
77 | | 0x80, 0x0a, 0x80, 0x0b, 0x80 }, |
78 | | { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d, |
79 | | 0x80, 0x0e, 0x80, 0x0f, 0x80 } }; |
80 | | const __m128i mask[] = { { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, |
81 | | 0x80, 0x80, 0x80, 0x03, 0x80 }, |
82 | | { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, |
83 | | 0x80, 0x80, 0x03, 0x80, 0x80 }, |
84 | | { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, |
85 | | 0x80, 0x03, 0x80, 0x80, 0x80 } }; |
86 | | #endif |
87 | | const __m128i c128 = _mm_set1_epi16(128); |
88 | | __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst), |
89 | | _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)); |
90 | | { |
91 | | __m128i C; |
92 | | __m128i D; |
93 | | __m128i E; |
94 | | /* Load Y values and expand to 32 bit */ |
95 | | { |
96 | | C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */ |
97 | | } |
98 | | /* Load U values and expand to 32 bit */ |
99 | | { |
100 | | const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */ |
101 | | D = _mm_sub_epi16(U, c128); /* D = U - 128 */ |
102 | | } |
103 | | /* Load V values and expand to 32 bit */ |
104 | | { |
105 | | const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */ |
106 | | E = _mm_sub_epi16(V, c128); /* E = V - 128 */ |
107 | | } |
108 | | /* Get the R value */ |
109 | | { |
110 | | const __m128i c403 = _mm_set1_epi16(403); |
111 | | const __m128i e403 = |
112 | | _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403)); |
113 | | const __m128i Rs = _mm_add_epi32(C, e403); |
114 | | const __m128i R32 = _mm_srai_epi32(Rs, 8); |
115 | | const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128()); |
116 | | const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128()); |
117 | | const __m128i packed = _mm_shuffle_epi8(R, mask[0]); |
118 | | BGRX = _mm_or_si128(BGRX, packed); |
119 | | } |
120 | | /* Get the G value */ |
121 | | { |
122 | | const __m128i c48 = _mm_set1_epi16(48); |
123 | | const __m128i d48 = |
124 | | _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48)); |
125 | | const __m128i c120 = _mm_set1_epi16(120); |
126 | | const __m128i e120 = |
127 | | _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120)); |
128 | | const __m128i de = _mm_add_epi32(d48, e120); |
129 | | const __m128i Gs = _mm_sub_epi32(C, de); |
130 | | const __m128i G32 = _mm_srai_epi32(Gs, 8); |
131 | | const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128()); |
132 | | const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128()); |
133 | | const __m128i packed = _mm_shuffle_epi8(G, mask[1]); |
134 | | BGRX = _mm_or_si128(BGRX, packed); |
135 | | } |
136 | | /* Get the B value */ |
137 | | { |
138 | | const __m128i c475 = _mm_set1_epi16(475); |
139 | | const __m128i d475 = |
140 | | _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475)); |
141 | | const __m128i Bs = _mm_add_epi32(C, d475); |
142 | | const __m128i B32 = _mm_srai_epi32(Bs, 8); |
143 | | const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128()); |
144 | | const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128()); |
145 | | const __m128i packed = _mm_shuffle_epi8(B, mask[2]); |
146 | | BGRX = _mm_or_si128(BGRX, packed); |
147 | | } |
148 | | } |
149 | | _mm_storeu_si128(dst++, BGRX); |
150 | | return dst; |
151 | | } |
152 | | |
153 | | static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* const WINPR_RESTRICT pSrc[], |
154 | | const UINT32* WINPR_RESTRICT srcStep, |
155 | | BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, |
156 | | const prim_size_t* WINPR_RESTRICT roi) |
157 | | { |
158 | | const UINT32 nWidth = roi->width; |
159 | | const UINT32 nHeight = roi->height; |
160 | | const UINT32 pad = roi->width % 16; |
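| | /* 'duplicate' expands the 8 leading chroma bytes to 16 (U0 U0 U1 U1 ...), i.e. a nearest-neighbour horizontal upsample of the 4:2:0 chroma for this row */ |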
161 | | const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); |
162 | | |
163 | | for (size_t y = 0; y < nHeight; y++) |
164 | | { |
165 | | __m128i* dst = (__m128i*)(pDst + dstStep * y); |
166 | | const BYTE* YData = pSrc[0] + y * srcStep[0]; |
167 | | const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1]; |
168 | | const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2]; |
169 | | |
170 | | for (UINT32 x = 0; x < nWidth - pad; x += 16) |
171 | | { |
172 | | const __m128i Y = _mm_loadu_si128((const __m128i*)YData); |
173 | | const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData); |
174 | | const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData); |
175 | | const __m128i U = _mm_shuffle_epi8(uRaw, duplicate); |
176 | | const __m128i V = _mm_shuffle_epi8(vRaw, duplicate); |
177 | | YData += 16; |
178 | | UData += 8; |
179 | | VData += 8; |
180 | | dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); |
181 | | dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); |
182 | | dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); |
183 | | dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); |
184 | | } |
185 | | |
186 | | for (UINT32 x = 0; x < pad; x++) |
187 | | { |
188 | | const BYTE Y = *YData++; |
189 | | const BYTE U = *UData; |
190 | | const BYTE V = *VData; |
191 | | const BYTE r = YUV2R(Y, U, V); |
192 | | const BYTE g = YUV2G(Y, U, V); |
193 | | const BYTE b = YUV2B(Y, U, V); |
194 | | dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0); |
195 | | |
196 | | if (x % 2) |
197 | | { |
198 | | UData++; |
199 | | VData++; |
200 | | } |
201 | | } |
202 | | } |
203 | | |
204 | | return PRIMITIVES_SUCCESS; |
205 | | } |
206 | | |
207 | | static pstatus_t ssse3_YUV420ToRGB(const BYTE* const WINPR_RESTRICT pSrc[3], |
208 | | const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, |
209 | | UINT32 dstStep, UINT32 DstFormat, |
210 | | const prim_size_t* WINPR_RESTRICT roi) |
211 | | { |
212 | | switch (DstFormat) |
213 | | { |
214 | | case PIXEL_FORMAT_BGRX32: |
215 | | case PIXEL_FORMAT_BGRA32: |
216 | | return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi); |
217 | | |
218 | | default: |
219 | | return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); |
220 | | } |
221 | | } |
222 | | |
223 | | static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* const WINPR_RESTRICT pSrc[], |
224 | | const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst, |
225 | | UINT32 dstStep, |
226 | | const prim_size_t* WINPR_RESTRICT roi) |
227 | | { |
228 | | const UINT32 nWidth = roi->width; |
229 | | const UINT32 nHeight = roi->height; |
230 | | const UINT32 pad = roi->width % 16; |
231 | | |
232 | | for (size_t y = 0; y < nHeight; y++) |
233 | | { |
234 | | __m128i* dst = (__m128i*)(pDst + dstStep * y); |
235 | | const BYTE* YData = pSrc[0] + y * srcStep[0]; |
236 | | const BYTE* UData = pSrc[1] + y * srcStep[1]; |
237 | | const BYTE* VData = pSrc[2] + y * srcStep[2]; |
238 | | |
239 | | for (size_t x = 0; x < nWidth - pad; x += 16) |
240 | | { |
241 | | __m128i Y = _mm_load_si128((const __m128i*)YData); |
242 | | __m128i U = _mm_load_si128((const __m128i*)UData); |
243 | | __m128i V = _mm_load_si128((const __m128i*)VData); |
244 | | YData += 16; |
245 | | UData += 16; |
246 | | VData += 16; |
247 | | dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); |
248 | | dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); |
249 | | dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); |
250 | | dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); |
251 | | } |
252 | | |
253 | | for (size_t x = 0; x < pad; x++) |
254 | | { |
255 | | const BYTE Y = *YData++; |
256 | | const BYTE U = *UData++; |
257 | | const BYTE V = *VData++; |
258 | | const BYTE r = YUV2R(Y, U, V); |
259 | | const BYTE g = YUV2G(Y, U, V); |
260 | | const BYTE b = YUV2B(Y, U, V); |
261 | | dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0); |
262 | | } |
263 | | } |
264 | | |
265 | | return PRIMITIVES_SUCCESS; |
266 | | } |
267 | | |
268 | | static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[], |
269 | | const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst, |
270 | | UINT32 dstStep, UINT32 DstFormat, |
271 | | const prim_size_t* WINPR_RESTRICT roi) |
272 | | { |
273 | | if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 || |
274 | | srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16) |
275 | | return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); |
276 | | |
277 | | switch (DstFormat) |
278 | | { |
279 | | case PIXEL_FORMAT_BGRX32: |
280 | | case PIXEL_FORMAT_BGRA32: |
281 | | return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi); |
282 | | |
283 | | default: |
284 | | return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); |
285 | | } |
286 | | } |
287 | | |
288 | | /****************************************************************************/ |
289 | | /* SSSE3 RGB -> YUV420 conversion **/ |
290 | | /****************************************************************************/ |
291 | | |
292 | | /** |
293 | | * Note (nfedera): |
294 | | * The used forward transformation factors from RGB to YUV are based on the |
295 | | * values specified in [Rec. ITU-R BT.709-6] Section 3: |
296 | | * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en |
297 | | * |
298 | | * Y = 0.21260 * R + 0.71520 * G + 0.07220 * B + 0; |
299 | | * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128; |
300 | | * V = 0.50000 * R - 0.45415 * G - 0.04585 * B + 128; |
301 | | * |
302 | | * The most accurate integer arithmetic approximation when using 8-bit signed |
303 | | * integer factors with 16-bit signed integer intermediate results is: |
304 | | * |
305 | | * Y = ( ( 27 * R + 92 * G + 9 * B) >> 7 ); |
306 | | * U = ( (-29 * R - 99 * G + 128 * B) >> 8 ) + 128; |
307 | | * V = ( ( 128 * R - 116 * G - 12 * B) >> 8 ) + 128; |
308 | | * |
309 | |  * Because the signed 8-bit range is [-128, 127], the U and V factors of 128 |
310 | |  * are rounded down to 127. |
311 | | */ |
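| |  |
| | /* |
| |  * Quick sanity check of the factors above: 27 + 92 + 9 = 128 = 2^7, |
| |  * -29 - 99 + 128 = 0 and 128 - 116 - 12 = 0, so a grey pixel (R = G = B) |
| |  * maps to Y = R and U = V = 128, e.g. white gives Y = (128 * 255) >> 7 = 255. |
| |  */ |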
312 | | |
313 | | #define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9) |
314 | | #define BGRX_U_FACTORS \ |
315 | | _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127) |
316 | | #define BGRX_V_FACTORS \ |
317 | | _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12) |
318 | | #define CONST128_FACTORS _mm_set1_epi8(-128) |
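| | /* CONST128_FACTORS holds -128 in every byte; _mm_sub_epi8 with it adds 128 modulo 256, mapping signed chroma results back to the unsigned 0..255 range */ |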
319 | | |
320 | | #define Y_SHIFT 7 |
321 | | #define U_SHIFT 8 |
322 | | #define V_SHIFT 8 |
323 | | |
324 | | /* |
325 | | TODO: |
326 | | RGB[AX] could be supported simply by using the following factors. Instead of loading the |
327 | | globals directly, the functions below could be passed pointers to the correct vectors |
328 | | depending on the source picture format. |
329 | | |
330 | | PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = { |
331 | | 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0 |
332 | | }; |
333 | | PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = { |
334 | | -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0 |
335 | | }; |
336 | | PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = { |
337 | | 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0 |
338 | | }; |
339 | | */ |
340 | | |
341 | | /* compute the luma (Y) component from a single rgb source line */ |
342 | | |
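| | /* |
| |  * Per 16 pixels: _mm_maddubs_epi16 multiplies the unsigned BGRX bytes by the |
| |  * signed factors and adds adjacent pairs (9 * B + 92 * G and 27 * R + 0 as |
| |  * 16-bit subtotals), then _mm_hadd_epi16 adds the two subtotals per pixel. |
| |  * A scalar equivalent for one pixel (illustrative sketch only): |
| |  * |
| |  *   BYTE y = (BYTE)((27 * r + 92 * g + 9 * b) >> Y_SHIFT); |
| |  */ |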
343 | | static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width) |
344 | | { |
345 | | __m128i x0; |
346 | | __m128i x1; |
347 | | __m128i x2; |
348 | | __m128i x3; |
349 | | const __m128i y_factors = BGRX_Y_FACTORS; |
350 | | const __m128i* argb = (const __m128i*)src; |
351 | | __m128i* ydst = (__m128i*)dst; |
352 | | |
353 | | for (UINT32 x = 0; x < width; x += 16) |
354 | | { |
355 | | /* store 16 rgba pixels in 4 128 bit registers */ |
356 | | x0 = _mm_load_si128(argb++); // 1st 4 pixels |
357 | | x1 = _mm_load_si128(argb++); // 2nd 4 pixels |
358 | | x2 = _mm_load_si128(argb++); // 3rd 4 pixels |
359 | | x3 = _mm_load_si128(argb++); // 4th 4 pixels |
360 | | /* multiplications and subtotals */ |
361 | | x0 = _mm_maddubs_epi16(x0, y_factors); |
362 | | x1 = _mm_maddubs_epi16(x1, y_factors); |
363 | | x2 = _mm_maddubs_epi16(x2, y_factors); |
364 | | x3 = _mm_maddubs_epi16(x3, y_factors); |
365 | | /* the total sums */ |
366 | | x0 = _mm_hadd_epi16(x0, x1); |
367 | | x2 = _mm_hadd_epi16(x2, x3); |
368 | | /* shift the results */ |
369 | | x0 = _mm_srli_epi16(x0, Y_SHIFT); |
370 | | x2 = _mm_srli_epi16(x2, Y_SHIFT); |
371 | | /* pack the 16 words into bytes */ |
372 | | x0 = _mm_packus_epi16(x0, x2); |
373 | | /* save to y plane */ |
374 | | _mm_storeu_si128(ydst++, x0); |
375 | | } |
376 | | } |
377 | | |
378 | | /* compute the chrominance (UV) components from two rgb source lines */ |
379 | | |
380 | | static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1, |
381 | | const BYTE* WINPR_RESTRICT src2, |
382 | | BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2, |
383 | | UINT32 width) |
384 | | { |
385 | | const __m128i u_factors = BGRX_U_FACTORS; |
386 | | const __m128i v_factors = BGRX_V_FACTORS; |
387 | | const __m128i vector128 = CONST128_FACTORS; |
388 | | __m128i x0; |
389 | | __m128i x1; |
390 | | __m128i x2; |
391 | | __m128i x3; |
392 | | __m128i x4; |
393 | | __m128i x5; |
394 | | const __m128i* rgb1 = (const __m128i*)src1; |
395 | | const __m128i* rgb2 = (const __m128i*)src2; |
396 | | __m64* udst = (__m64*)dst1; |
397 | | __m64* vdst = (__m64*)dst2; |
398 | | |
399 | | for (UINT32 x = 0; x < width; x += 16) |
400 | | { |
401 | | /* subsample 16x2 pixels into 16x1 pixels */ |
402 | | x0 = _mm_load_si128(rgb1++); |
403 | | x4 = _mm_load_si128(rgb2++); |
404 | | x0 = _mm_avg_epu8(x0, x4); |
405 | | x1 = _mm_load_si128(rgb1++); |
406 | | x4 = _mm_load_si128(rgb2++); |
407 | | x1 = _mm_avg_epu8(x1, x4); |
408 | | x2 = _mm_load_si128(rgb1++); |
409 | | x4 = _mm_load_si128(rgb2++); |
410 | | x2 = _mm_avg_epu8(x2, x4); |
411 | | x3 = _mm_load_si128(rgb1++); |
412 | | x4 = _mm_load_si128(rgb2++); |
413 | | x3 = _mm_avg_epu8(x3, x4); |
414 | | /* subsample these 16x1 pixels into 8x1 pixels */ |
415 | | /** |
416 | | * shuffle controls |
417 | | * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88 |
418 | | * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd |
419 | | */ |
420 | | x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88)); |
421 | | x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd)); |
422 | | x0 = _mm_avg_epu8(x0, x4); |
423 | | x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88)); |
424 | | x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd)); |
425 | | x1 = _mm_avg_epu8(x1, x4); |
426 | | /* multiplications and subtotals */ |
427 | | x2 = _mm_maddubs_epi16(x0, u_factors); |
428 | | x3 = _mm_maddubs_epi16(x1, u_factors); |
429 | | x4 = _mm_maddubs_epi16(x0, v_factors); |
430 | | x5 = _mm_maddubs_epi16(x1, v_factors); |
431 | | /* the total sums */ |
432 | | x0 = _mm_hadd_epi16(x2, x3); |
433 | | x1 = _mm_hadd_epi16(x4, x5); |
434 | | /* shift the results */ |
435 | | x0 = _mm_srai_epi16(x0, U_SHIFT); |
436 | | x1 = _mm_srai_epi16(x1, V_SHIFT); |
437 | | /* pack the 16 words into bytes */ |
438 | | x0 = _mm_packs_epi16(x0, x1); |
439 | | /* add 128 */ |
440 | | x0 = _mm_sub_epi8(x0, vector128); |
441 | | /* the lower 8 bytes go to the u plane */ |
442 | | _mm_storel_pi(udst++, _mm_castsi128_ps(x0)); |
443 | | /* the upper 8 bytes go to the v plane */ |
444 | | _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0)); |
445 | | } |
446 | | } |
447 | | |
448 | | static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
449 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[], |
450 | | const UINT32 dstStep[], |
451 | | const prim_size_t* WINPR_RESTRICT roi) |
452 | | { |
453 | | const BYTE* argb = pSrc; |
454 | | BYTE* ydst = pDst[0]; |
455 | | BYTE* udst = pDst[1]; |
456 | | BYTE* vdst = pDst[2]; |
457 | | |
458 | | if (roi->height < 1 || roi->width < 1) |
459 | | { |
460 | | return !PRIMITIVES_SUCCESS; |
461 | | } |
462 | | |
463 | | if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16) |
464 | | { |
465 | | return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); |
466 | | } |
467 | | |
468 | | for (UINT32 y = 0; y < roi->height - 1; y += 2) |
469 | | { |
470 | | const BYTE* line1 = argb; |
471 | | const BYTE* line2 = argb + srcStep; |
472 | | ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width); |
473 | | ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width); |
474 | | ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width); |
475 | | argb += 2ULL * srcStep; |
476 | | ydst += 2ULL * dstStep[0]; |
477 | | udst += 1ULL * dstStep[1]; |
478 | | vdst += 1ULL * dstStep[2]; |
479 | | } |
480 | | |
481 | | if (roi->height & 1) |
482 | | { |
483 | | /* pass the same last line of an odd height twice for UV */ |
484 | | ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width); |
485 | | ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width); |
486 | | } |
487 | | |
488 | | return PRIMITIVES_SUCCESS; |
489 | | } |
490 | | |
491 | | static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
492 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[], |
493 | | const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi) |
494 | | { |
495 | | switch (srcFormat) |
496 | | { |
497 | | case PIXEL_FORMAT_BGRX32: |
498 | | case PIXEL_FORMAT_BGRA32: |
499 | | return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi); |
500 | | |
501 | | default: |
502 | | return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); |
503 | | } |
504 | | } |
505 | | |
506 | | /****************************************************************************/ |
507 | | /* SSSE3 RGB -> AVC444-YUV conversion **/ |
508 | | /****************************************************************************/ |
509 | | |
510 | | static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW( |
511 | | const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, |
512 | | BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2, |
513 | | BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5, |
514 | | BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width) |
515 | | { |
516 | | const __m128i* argbEven = (const __m128i*)srcEven; |
517 | | const __m128i* argbOdd = (const __m128i*)srcOdd; |
518 | | const __m128i y_factors = BGRX_Y_FACTORS; |
519 | | const __m128i u_factors = BGRX_U_FACTORS; |
520 | | const __m128i v_factors = BGRX_V_FACTORS; |
521 | | const __m128i vector128 = CONST128_FACTORS; |
522 | | |
523 | | for (UINT32 x = 0; x < width; x += 16) |
524 | | { |
525 | | /* store 16 rgba pixels in 4 128 bit registers */ |
526 | | const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels |
527 | | const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels |
528 | | const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels |
529 | | const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels |
530 | | const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels |
531 | | const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels |
532 | | const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels |
533 | | const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels |
534 | | { |
535 | | /* Y: multiplications with subtotals and horizontal sums */ |
536 | | const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors), |
537 | | _mm_maddubs_epi16(xe2, y_factors)), |
538 | | Y_SHIFT); |
539 | | const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors), |
540 | | _mm_maddubs_epi16(xe4, y_factors)), |
541 | | Y_SHIFT); |
542 | | const __m128i ye = _mm_packus_epi16(ye1, ye2); |
543 | | const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors), |
544 | | _mm_maddubs_epi16(xo2, y_factors)), |
545 | | Y_SHIFT); |
546 | | const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors), |
547 | | _mm_maddubs_epi16(xo4, y_factors)), |
548 | | Y_SHIFT); |
549 | | const __m128i yo = _mm_packus_epi16(yo1, yo2); |
550 | | /* store y [b1] */ |
551 | | _mm_storeu_si128((__m128i*)b1Even, ye); |
552 | | b1Even += 16; |
553 | | |
554 | | if (b1Odd) |
555 | | { |
556 | | _mm_storeu_si128((__m128i*)b1Odd, yo); |
557 | | b1Odd += 16; |
558 | | } |
559 | | } |
560 | | { |
561 | | /* We have now |
562 | | * 16 even U values in ue |
563 | | * 16 odd U values in uo |
564 | | * |
565 | | * We need to split these according to |
566 | | * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */ |
567 | | __m128i ue; |
568 | | __m128i uo = { 0 }; |
569 | | { |
570 | | const __m128i ue1 = |
571 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors), |
572 | | _mm_maddubs_epi16(xe2, u_factors)), |
573 | | U_SHIFT); |
574 | | const __m128i ue2 = |
575 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors), |
576 | | _mm_maddubs_epi16(xe4, u_factors)), |
577 | | U_SHIFT); |
578 | | ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128); |
579 | | } |
580 | | |
581 | | if (b1Odd) |
582 | | { |
583 | | const __m128i uo1 = |
584 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors), |
585 | | _mm_maddubs_epi16(xo2, u_factors)), |
586 | | U_SHIFT); |
587 | | const __m128i uo2 = |
588 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors), |
589 | | _mm_maddubs_epi16(xo4, u_factors)), |
590 | | U_SHIFT); |
591 | | uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128); |
592 | | } |
593 | | |
594 | | /* Now we need the following storage distribution: |
595 | | * 2x 2y -> b2 |
596 | | * x 2y+1 -> b4 |
597 | | * 2x+1 2y -> b6 */ |
598 | | if (b1Odd) /* b2 */ |
599 | | { |
600 | | const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128()); |
601 | | const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128()); |
602 | | const __m128i hi = _mm_add_epi16(ueh, uoh); |
603 | | const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128()); |
604 | | const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128()); |
605 | | const __m128i lo = _mm_add_epi16(uel, uol); |
606 | | const __m128i added = _mm_hadd_epi16(lo, hi); |
607 | | const __m128i avg16 = _mm_srai_epi16(added, 2); |
608 | | const __m128i avg = _mm_packus_epi16(avg16, avg16); |
609 | | _mm_storel_epi64((__m128i*)b2, avg); |
610 | | } |
611 | | else |
612 | | { |
613 | | const __m128i mask = |
614 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
615 | | (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); |
616 | | const __m128i ud = _mm_shuffle_epi8(ue, mask); |
617 | | _mm_storel_epi64((__m128i*)b2, ud); |
618 | | } |
619 | | |
620 | | b2 += 8; |
621 | | |
622 | | if (b1Odd) /* b4 */ |
623 | | { |
624 | | _mm_store_si128((__m128i*)b4, uo); |
625 | | b4 += 16; |
626 | | } |
627 | | |
628 | | { |
629 | | /* b6 */ |
630 | | const __m128i mask = |
631 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
632 | | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
633 | | const __m128i ude = _mm_shuffle_epi8(ue, mask); |
634 | | _mm_storel_epi64((__m128i*)b6, ude); |
635 | | b6 += 8; |
636 | | } |
637 | | } |
638 | | { |
639 | | /* We have now |
640 | |  * 16 even V values in ve |
641 | |  * 16 odd V values in vo |
642 | | * |
643 | | * We need to split these according to |
644 | | * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */ |
645 | | __m128i ve; |
646 | | __m128i vo = { 0 }; |
647 | | { |
648 | | const __m128i ve1 = |
649 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors), |
650 | | _mm_maddubs_epi16(xe2, v_factors)), |
651 | | V_SHIFT); |
652 | | const __m128i ve2 = |
653 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors), |
654 | | _mm_maddubs_epi16(xe4, v_factors)), |
655 | | V_SHIFT); |
656 | | ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128); |
657 | | } |
658 | | |
659 | | if (b1Odd) |
660 | | { |
661 | | const __m128i vo1 = |
662 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors), |
663 | | _mm_maddubs_epi16(xo2, v_factors)), |
664 | | V_SHIFT); |
665 | | const __m128i vo2 = |
666 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors), |
667 | | _mm_maddubs_epi16(xo4, v_factors)), |
668 | | V_SHIFT); |
669 | | vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128); |
670 | | } |
671 | | |
672 | | /* Now we need the following storage distribution: |
673 | | * 2x 2y -> b3 |
674 | | * x 2y+1 -> b5 |
675 | | * 2x+1 2y -> b7 */ |
676 | | if (b1Odd) /* b3 */ |
677 | | { |
678 | | const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128()); |
679 | | const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128()); |
680 | | const __m128i hi = _mm_add_epi16(veh, voh); |
681 | | const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128()); |
682 | | const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128()); |
683 | | const __m128i lo = _mm_add_epi16(vel, vol); |
684 | | const __m128i added = _mm_hadd_epi16(lo, hi); |
685 | | const __m128i avg16 = _mm_srai_epi16(added, 2); |
686 | | const __m128i avg = _mm_packus_epi16(avg16, avg16); |
687 | | _mm_storel_epi64((__m128i*)b3, avg); |
688 | | } |
689 | | else |
690 | | { |
691 | | const __m128i mask = |
692 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
693 | | (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); |
694 | | const __m128i vd = _mm_shuffle_epi8(ve, mask); |
695 | | _mm_storel_epi64((__m128i*)b3, vd); |
696 | | } |
697 | | |
698 | | b3 += 8; |
699 | | |
700 | | if (b1Odd) /* b5 */ |
701 | | { |
702 | | _mm_store_si128((__m128i*)b5, vo); |
703 | | b5 += 16; |
704 | | } |
705 | | |
706 | | { |
707 | | /* b7 */ |
708 | | const __m128i mask = |
709 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
710 | | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
711 | | const __m128i vde = _mm_shuffle_epi8(ve, mask); |
712 | | _mm_storel_epi64((__m128i*)b7, vde); |
713 | | b7 += 8; |
714 | | } |
715 | | } |
716 | | } |
717 | | } |
718 | | |
719 | | static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
720 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
721 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
722 | | const UINT32 dst2Step[], |
723 | | const prim_size_t* WINPR_RESTRICT roi) |
724 | | { |
725 | | const BYTE* pMaxSrc = pSrc + 1ULL * (roi->height - 1) * srcStep; |
726 | | |
727 | | if (roi->height < 1 || roi->width < 1) |
728 | | return !PRIMITIVES_SUCCESS; |
729 | | |
730 | | if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16) |
731 | | return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, |
732 | | roi); |
733 | | |
734 | | for (size_t y = 0; y < roi->height; y += 2) |
735 | | { |
736 | | const BOOL last = (y >= (roi->height - 1)); |
737 | | const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc; |
738 | | const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc; |
739 | | const UINT32 i = y >> 1; |
740 | | const UINT32 n = (i & ~7) + i; |
741 | | BYTE* b1Even = pDst1[0] + y * dst1Step[0]; |
742 | | BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL; |
743 | | BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1]; |
744 | | BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2]; |
745 | | BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n; |
746 | | BYTE* b5 = b4 + 8ULL * dst2Step[0]; |
747 | | BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1]; |
748 | | BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2]; |
749 | | ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7, |
750 | | roi->width); |
751 | | } |
752 | | |
753 | | return PRIMITIVES_SUCCESS; |
754 | | } |
755 | | |
756 | | static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
757 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
758 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
759 | | const UINT32 dst2Step[], |
760 | | const prim_size_t* WINPR_RESTRICT roi) |
761 | | { |
762 | | switch (srcFormat) |
763 | | { |
764 | | case PIXEL_FORMAT_BGRX32: |
765 | | case PIXEL_FORMAT_BGRA32: |
766 | | return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
767 | | dst2Step, roi); |
768 | | |
769 | | default: |
770 | | return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
771 | | dst2Step, roi); |
772 | | } |
773 | | } |
774 | | |
775 | | /* Mapping of arguments: |
776 | | * |
777 | | * b1 [even lines] -> yLumaDstEven |
778 | | * b1 [odd lines] -> yLumaDstOdd |
779 | | * b2 -> uLumaDst |
780 | | * b3 -> vLumaDst |
781 | | * b4 -> yChromaDst1 |
782 | | * b5 -> yChromaDst2 |
783 | | * b6 -> uChromaDst1 |
784 | | * b7 -> uChromaDst2 |
785 | | * b8 -> vChromaDst1 |
786 | | * b9 -> vChromaDst2 |
787 | | */ |
788 | | static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW( |
789 | | const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, |
790 | | BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd, |
791 | | BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst, |
792 | | BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2, |
793 | | BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2, |
794 | | BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2, |
795 | | BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width) |
796 | | { |
797 | | const __m128i vector128 = CONST128_FACTORS; |
798 | | const __m128i* argbEven = (const __m128i*)srcEven; |
799 | | const __m128i* argbOdd = (const __m128i*)srcOdd; |
800 | | |
801 | | for (UINT32 x = 0; x < width; x += 16) |
802 | | { |
803 | | /* store 16 rgba pixels in 4 128 bit registers |
804 | | * for even and odd rows. |
805 | | */ |
806 | | const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */ |
807 | | const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */ |
808 | | const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */ |
809 | | const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */ |
810 | | const __m128i xo1 = _mm_load_si128(argbOdd++); /* 1st 4 pixels */ |
811 | | const __m128i xo2 = _mm_load_si128(argbOdd++); /* 2nd 4 pixels */ |
812 | | const __m128i xo3 = _mm_load_si128(argbOdd++); /* 3rd 4 pixels */ |
813 | | const __m128i xo4 = _mm_load_si128(argbOdd++); /* 4th 4 pixels */ |
814 | | { |
815 | | /* Y: multiplications with subtotals and horizontal sums */ |
816 | | const __m128i y_factors = BGRX_Y_FACTORS; |
817 | | const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors), |
818 | | _mm_maddubs_epi16(xe2, y_factors)), |
819 | | Y_SHIFT); |
820 | | const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors), |
821 | | _mm_maddubs_epi16(xe4, y_factors)), |
822 | | Y_SHIFT); |
823 | | const __m128i ye = _mm_packus_epi16(ye1, ye2); |
824 | | /* store y [b1] */ |
825 | | _mm_storeu_si128((__m128i*)yLumaDstEven, ye); |
826 | | yLumaDstEven += 16; |
827 | | } |
828 | | |
829 | | if (yLumaDstOdd) |
830 | | { |
831 | | const __m128i y_factors = BGRX_Y_FACTORS; |
832 | | const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors), |
833 | | _mm_maddubs_epi16(xo2, y_factors)), |
834 | | Y_SHIFT); |
835 | | const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors), |
836 | | _mm_maddubs_epi16(xo4, y_factors)), |
837 | | Y_SHIFT); |
838 | | const __m128i yo = _mm_packus_epi16(yo1, yo2); |
839 | | _mm_storeu_si128((__m128i*)yLumaDstOdd, yo); |
840 | | yLumaDstOdd += 16; |
841 | | } |
842 | | |
843 | | { |
844 | | /* We have now |
845 | | * 16 even U values in ue |
846 | | * 16 odd U values in uo |
847 | | * |
848 | | * We need to split these according to |
849 | | * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */ |
850 | | /* U: multiplications with subtotals and horizontal sums */ |
851 | | __m128i ue; |
852 | | __m128i uo; |
853 | | __m128i uavg; |
854 | | { |
855 | | const __m128i u_factors = BGRX_U_FACTORS; |
856 | | const __m128i ue1 = |
857 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors), |
858 | | _mm_maddubs_epi16(xe2, u_factors)), |
859 | | U_SHIFT); |
860 | | const __m128i ue2 = |
861 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors), |
862 | | _mm_maddubs_epi16(xe4, u_factors)), |
863 | | U_SHIFT); |
864 | | const __m128i ueavg = _mm_hadd_epi16(ue1, ue2); |
865 | | ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128); |
866 | | uavg = ueavg; |
867 | | } |
868 | | { |
869 | | const __m128i u_factors = BGRX_U_FACTORS; |
870 | | const __m128i uo1 = |
871 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors), |
872 | | _mm_maddubs_epi16(xo2, u_factors)), |
873 | | U_SHIFT); |
874 | | const __m128i uo2 = |
875 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors), |
876 | | _mm_maddubs_epi16(xo4, u_factors)), |
877 | | U_SHIFT); |
878 | | const __m128i uoavg = _mm_hadd_epi16(uo1, uo2); |
879 | | uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128); |
880 | | uavg = _mm_add_epi16(uavg, uoavg); |
881 | | uavg = _mm_srai_epi16(uavg, 2); |
882 | | uavg = _mm_packs_epi16(uavg, uoavg); |
883 | | uavg = _mm_sub_epi8(uavg, vector128); |
884 | | } |
885 | | /* Now we need the following storage distribution: |
886 | | * 2x 2y -> uLumaDst |
887 | | * 2x+1 y -> yChromaDst1 |
888 | | * 4x 2y+1 -> uChromaDst1 |
889 | | * 4x+2 2y+1 -> vChromaDst1 */ |
890 | | { |
891 | | const __m128i mask = |
892 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
893 | | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
894 | | const __m128i ude = _mm_shuffle_epi8(ue, mask); |
895 | | _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude); |
896 | | yEvenChromaDst1 += 8; |
897 | | } |
898 | | |
899 | | if (yLumaDstOdd) |
900 | | { |
901 | | const __m128i mask = |
902 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
903 | | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
904 | | const __m128i udo = _mm_shuffle_epi8(uo, mask); |
905 | | _mm_storel_epi64((__m128i*)yOddChromaDst1, udo); |
906 | | yOddChromaDst1 += 8; |
907 | | } |
908 | | |
909 | | if (yLumaDstOdd) |
910 | | { |
911 | | const __m128i mask = |
912 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
913 | | (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0); |
914 | | const __m128i ud = _mm_shuffle_epi8(uo, mask); |
915 | | int* uDst1 = (int*)uChromaDst1; |
916 | | int* vDst1 = (int*)vChromaDst1; |
917 | | const int* src = (const int*)&ud; |
918 | | _mm_stream_si32(uDst1, src[0]); |
919 | | _mm_stream_si32(vDst1, src[1]); |
920 | | uChromaDst1 += 4; |
921 | | vChromaDst1 += 4; |
922 | | } |
923 | | |
924 | | if (yLumaDstOdd) |
925 | | { |
926 | | _mm_storel_epi64((__m128i*)uLumaDst, uavg); |
927 | | uLumaDst += 8; |
928 | | } |
929 | | else |
930 | | { |
931 | | const __m128i mask = |
932 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
933 | | (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); |
934 | | const __m128i ud = _mm_shuffle_epi8(ue, mask); |
935 | | _mm_storel_epi64((__m128i*)uLumaDst, ud); |
936 | | uLumaDst += 8; |
937 | | } |
938 | | } |
939 | | |
940 | | { |
941 | | /* V: multiplications with subtotals and horizontal sums */ |
942 | | __m128i ve; |
943 | | __m128i vo; |
944 | | __m128i vavg; |
945 | | { |
946 | | const __m128i v_factors = BGRX_V_FACTORS; |
947 | | const __m128i ve1 = |
948 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors), |
949 | | _mm_maddubs_epi16(xe2, v_factors)), |
950 | | V_SHIFT); |
951 | | const __m128i ve2 = |
952 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors), |
953 | | _mm_maddubs_epi16(xe4, v_factors)), |
954 | | V_SHIFT); |
955 | | const __m128i veavg = _mm_hadd_epi16(ve1, ve2); |
956 | | ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128); |
957 | | vavg = veavg; |
958 | | } |
959 | | { |
960 | | const __m128i v_factors = BGRX_V_FACTORS; |
961 | | const __m128i vo1 = |
962 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors), |
963 | | _mm_maddubs_epi16(xo2, v_factors)), |
964 | | V_SHIFT); |
965 | | const __m128i vo2 = |
966 | | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors), |
967 | | _mm_maddubs_epi16(xo4, v_factors)), |
968 | | V_SHIFT); |
969 | | const __m128i voavg = _mm_hadd_epi16(vo1, vo2); |
970 | | vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128); |
971 | | vavg = _mm_add_epi16(vavg, voavg); |
972 | | vavg = _mm_srai_epi16(vavg, 2); |
973 | | vavg = _mm_packs_epi16(vavg, voavg); |
974 | | vavg = _mm_sub_epi8(vavg, vector128); |
975 | | } |
976 | | /* Now we need the following storage distribution: |
977 | | * 2x 2y -> vLumaDst |
978 | | * 2x+1 y -> yChromaDst2 |
979 | | * 4x 2y+1 -> uChromaDst2 |
980 | | * 4x+2 2y+1 -> vChromaDst2 */ |
981 | | { |
982 | | const __m128i mask = |
983 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
984 | | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
985 | | __m128i vde = _mm_shuffle_epi8(ve, mask); |
986 | | _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde); |
987 | | yEvenChromaDst2 += 8; |
988 | | } |
989 | | |
990 | | if (yLumaDstOdd) |
991 | | { |
992 | | const __m128i mask = |
993 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
994 | | (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); |
995 | | __m128i vdo = _mm_shuffle_epi8(vo, mask); |
996 | | _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo); |
997 | | yOddChromaDst2 += 8; |
998 | | } |
999 | | |
1000 | | if (yLumaDstOdd) |
1001 | | { |
1002 | | const __m128i mask = |
1003 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1004 | | (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0); |
1005 | | const __m128i vd = _mm_shuffle_epi8(vo, mask); |
1006 | | int* uDst2 = (int*)uChromaDst2; |
1007 | | int* vDst2 = (int*)vChromaDst2; |
1008 | | const int* src = (const int*)&vd; |
1009 | | _mm_stream_si32(uDst2, src[0]); |
1010 | | _mm_stream_si32(vDst2, src[1]); |
1011 | | uChromaDst2 += 4; |
1012 | | vChromaDst2 += 4; |
1013 | | } |
1014 | | |
1015 | | if (yLumaDstOdd) |
1016 | | { |
1017 | | _mm_storel_epi64((__m128i*)vLumaDst, vavg); |
1018 | | vLumaDst += 8; |
1019 | | } |
1020 | | else |
1021 | | { |
1022 | | const __m128i mask = |
1023 | | _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, |
1024 | | (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); |
1025 | | __m128i vd = _mm_shuffle_epi8(ve, mask); |
1026 | | _mm_storel_epi64((__m128i*)vLumaDst, vd); |
1027 | | vLumaDst += 8; |
1028 | | } |
1029 | | } |
1030 | | } |
1031 | | } |
1032 | | |
1033 | | static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
1034 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
1035 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
1036 | | const UINT32 dst2Step[], |
1037 | | const prim_size_t* WINPR_RESTRICT roi) |
1038 | | { |
1039 | | if (roi->height < 1 || roi->width < 1) |
1040 | | return !PRIMITIVES_SUCCESS; |
1041 | | |
1042 | | if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16) |
1043 | | return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, |
1044 | | roi); |
1045 | | |
1046 | | for (size_t y = 0; y < roi->height; y += 2) |
1047 | | { |
1048 | | const BYTE* srcEven = (pSrc + y * srcStep); |
1049 | | const BYTE* srcOdd = (srcEven + srcStep); |
1050 | | BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]); |
1051 | | BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL; |
1052 | | BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]); |
1053 | | BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]); |
1054 | | BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]); |
1055 | | BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2; |
1056 | | BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0]; |
1057 | | BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0]; |
1058 | | BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]); |
1059 | | BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]); |
1060 | | BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4; |
1061 | | BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4; |
1062 | | ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, |
1063 | | dstLumaV, dstEvenChromaY1, dstEvenChromaY2, |
1064 | | dstOddChromaY1, dstOddChromaY2, dstChromaU1, |
1065 | | dstChromaU2, dstChromaV1, dstChromaV2, roi->width); |
1066 | | } |
1067 | | |
1068 | | return PRIMITIVES_SUCCESS; |
1069 | | } |
1070 | | |
1071 | | static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
1072 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
1073 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
1074 | | const UINT32 dst2Step[], |
1075 | | const prim_size_t* WINPR_RESTRICT roi) |
1076 | | { |
1077 | | switch (srcFormat) |
1078 | | { |
1079 | | case PIXEL_FORMAT_BGRX32: |
1080 | | case PIXEL_FORMAT_BGRA32: |
1081 | | return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
1082 | | dst2Step, roi); |
1083 | | |
1084 | | default: |
1085 | | return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
1086 | | dst2Step, roi); |
1087 | | } |
1088 | | } |
1089 | | |
1090 | | static pstatus_t ssse3_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[], |
1091 | | const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDstRaw[], |
1092 | | const UINT32 dstStep[], const RECTANGLE_16* WINPR_RESTRICT roi) |
1093 | | { |
1094 | | const UINT32 nWidth = roi->right - roi->left; |
1095 | | const UINT32 nHeight = roi->bottom - roi->top; |
1096 | | const UINT32 halfWidth = (nWidth + 1) / 2; |
1097 | | const UINT32 halfPad = halfWidth % 16; |
1098 | | const UINT32 halfHeight = (nHeight + 1) / 2; |
1099 | | const UINT32 oddY = 1; |
1100 | | const UINT32 evenY = 0; |
1101 | | const UINT32 oddX = 1; |
1102 | | const UINT32 evenX = 0; |
1103 | | const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left, |
1104 | | pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2, |
1105 | | pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 }; |
1106 | | BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left, |
1107 | | pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left, |
1108 | | pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left }; |
1109 | | |
1110 | | /* Y data is already here... */ |
1111 | | /* B1 */ |
1112 | | for (size_t y = 0; y < nHeight; y++) |
1113 | | { |
1114 | | const BYTE* Ym = pSrc[0] + y * srcStep[0]; |
1115 | | BYTE* pY = pDst[0] + y * dstStep[0]; |
1116 | | memcpy(pY, Ym, nWidth); |
1117 | | } |
1118 | | |
1119 | | /* The first half of U and V is already part of this frame. */ |
1120 | | /* B2 and B3 */ |
1121 | | for (size_t y = 0; y < halfHeight; y++) |
1122 | | { |
1123 | | const size_t val2y = (2 * y + evenY); |
1124 | | const size_t val2y1 = val2y + oddY; |
1125 | | const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y; |
1126 | | const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y; |
1127 | | BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y; |
1128 | | BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y; |
1129 | | BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1; |
1130 | | BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1; |
1131 | | |
1132 | | size_t x = 0; |
1133 | | for (; x < halfWidth - halfPad; x += 16) |
1134 | | { |
1135 | | const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); |
1136 | | const __m128i unpackLow = |
1137 | | _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8); |
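| | /* unpackHigh duplicates source bytes 0-7, unpackLow bytes 8-15, doubling each chroma sample horizontally for the two 16-byte output stores */ |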
1138 | | { |
1139 | | const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]); |
1140 | | const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); |
1141 | | const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); |
1142 | | _mm_storeu_si128((__m128i*)&pU[2ULL * x], uHigh); |
1143 | | _mm_storeu_si128((__m128i*)&pU[2ULL * x + 16], uLow); |
1144 | | _mm_storeu_si128((__m128i*)&pU1[2ULL * x], uHigh); |
1145 | | _mm_storeu_si128((__m128i*)&pU1[2ULL * x + 16], uLow); |
1146 | | } |
1147 | | { |
1148 | | const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]); |
1149 | | const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); |
1150 | | const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); |
1151 | | _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh); |
1152 | | _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow); |
1153 | | _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh); |
1154 | | _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow); |
1155 | | } |
1156 | | } |
1157 | | |
1158 | | for (; x < halfWidth; x++) |
1159 | | { |
1160 | | const UINT32 val2x = 2 * x + evenX; |
1161 | | const UINT32 val2x1 = val2x + oddX; |
1162 | | pU[val2x] = Um[x]; |
1163 | | pV[val2x] = Vm[x]; |
1164 | | pU[val2x1] = Um[x]; |
1165 | | pV[val2x1] = Vm[x]; |
1166 | | pU1[val2x] = Um[x]; |
1167 | | pV1[val2x] = Vm[x]; |
1168 | | pU1[val2x1] = Um[x]; |
1169 | | pV1[val2x1] = Vm[x]; |
1170 | | } |
1171 | | } |
1172 | | |
1173 | | return PRIMITIVES_SUCCESS; |
1174 | | } |
1175 | | |
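| | /* |
| |  * Undo the chroma sub-sampling average for one 16-byte run: each even (2x) |
| |  * sample is replaced by 4 * even - odd - below-even - below-odd (the SIMD |
| |  * counterpart of the scalar u2020 / v2020 computation in ssse3_ChromaFilter), |
| |  * saturated to 0..255 by packus; the odd samples are passed through unchanged. |
| |  */ |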
1176 | | static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2) |
1177 | | { |
1178 | | const __m128i even = _mm_set_epi8((char)0x80, 14, (char)0x80, 12, (char)0x80, 10, (char)0x80, 8, |
1179 | | (char)0x80, 6, (char)0x80, 4, (char)0x80, 2, (char)0x80, 0); |
1180 | | const __m128i odd = _mm_set_epi8((char)0x80, 15, (char)0x80, 13, (char)0x80, 11, (char)0x80, 9, |
1181 | | (char)0x80, 7, (char)0x80, 5, (char)0x80, 3, (char)0x80, 1); |
1182 | | const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0); |
1183 | | const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst); |
1184 | | const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2); |
1185 | | const __m128i uEven = _mm_shuffle_epi8(u, even); |
1186 | | const __m128i uEven4 = _mm_slli_epi16(uEven, 2); |
1187 | | const __m128i uOdd = _mm_shuffle_epi8(u, odd); |
1188 | | const __m128i u1Even = _mm_shuffle_epi8(u1, even); |
1189 | | const __m128i u1Odd = _mm_shuffle_epi8(u1, odd); |
1190 | | const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even); |
1191 | | const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd); |
1192 | | const __m128i result = _mm_sub_epi16(uEven4, tmp2); |
1193 | | const __m128i packed = _mm_packus_epi16(result, uOdd); |
1194 | | const __m128i interleaved = _mm_shuffle_epi8(packed, interleave); |
1195 | | _mm_storeu_si128((__m128i*)pSrcDst, interleaved); |
1196 | | } |
1197 | | |
1198 | | static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[], |
1199 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1200 | | { |
1201 | | const UINT32 oddY = 1; |
1202 | | const UINT32 evenY = 0; |
1203 | | const UINT32 nWidth = roi->right - roi->left; |
1204 | | const UINT32 nHeight = roi->bottom - roi->top; |
1205 | | const UINT32 halfHeight = (nHeight + 1) / 2; |
1206 | | const UINT32 halfWidth = (nWidth + 1) / 2; |
1207 | | const UINT32 halfPad = halfWidth % 16; |
1208 | | |
1209 | | /* Filter */ |
1210 | | for (size_t y = roi->top; y < halfHeight + roi->top; y++) |
1211 | | { |
1212 | | size_t x = roi->left; |
1213 | | const UINT32 val2y = (y * 2 + evenY); |
1214 | | const UINT32 val2y1 = val2y + oddY; |
1215 | | BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1; |
1216 | | BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1; |
1217 | | BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y; |
1218 | | BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y; |
1219 | | |
1220 | | if (val2y1 > nHeight) |
1221 | | continue; |
1222 | | |
1223 | | for (; x < halfWidth + roi->left - halfPad; x += 16) |
1224 | | { |
1225 | | ssse3_filter(&pU[2 * x], &pU1[2 * x]); |
1226 | | ssse3_filter(&pV[2 * x], &pV1[2 * x]); |
1227 | | } |
1228 | | |
1229 | | for (; x < halfWidth + roi->left; x++) |
1230 | | { |
1231 | | const UINT32 val2x = (x * 2); |
1232 | | const UINT32 val2x1 = val2x + 1; |
1233 | | const BYTE inU = pU[val2x]; |
1234 | | const BYTE inV = pV[val2x]; |
1235 | | const INT32 up = inU * 4; |
1236 | | const INT32 vp = inV * 4; |
1237 | | INT32 u2020 = 0; |
1238 | | INT32 v2020 = 0; |
1239 | | |
1240 | | if (val2x1 > nWidth) |
1241 | | continue; |
1242 | | |
1243 | | u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1]; |
1244 | | v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1]; |
1245 | | pU[val2x] = CONDITIONAL_CLIP(u2020, inU); |
1246 | | pV[val2x] = CONDITIONAL_CLIP(v2020, inV); |
1247 | | } |
1248 | | } |
1249 | | |
1250 | | return PRIMITIVES_SUCCESS; |
1251 | | } |
1252 | | |
1253 | | static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3], |
1254 | | const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3], |
1255 | | const UINT32 dstStep[3], |
1256 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1257 | | { |
1258 | | const UINT32 mod = 16; |
1259 | | UINT32 uY = 0; |
1260 | | UINT32 vY = 0; |
1261 | | const UINT32 nWidth = roi->right - roi->left; |
1262 | | const UINT32 nHeight = roi->bottom - roi->top; |
1263 | | const UINT32 halfWidth = (nWidth + 1) / 2; |
1264 | | const UINT32 halfPad = halfWidth % 16; |
1265 | | const UINT32 halfHeight = (nHeight + 1) / 2; |
1266 | | const UINT32 oddY = 1; |
1267 | | const UINT32 evenY = 0; |
1268 | | const UINT32 oddX = 1; |
1269 | | /* The auxiliary frame is aligned to multiples of 16x16. |
1270 | | * We need the padded height for B4 and B5 conversion. */ |
1271 | | const UINT32 padHeigth = nHeight + 16 - nHeight % 16; |
1272 | | const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left, |
1273 | | pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2, |
1274 | | pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 }; |
1275 | | BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left, |
1276 | | pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left, |
1277 | | pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left }; |
1278 | | const __m128i zero = _mm_setzero_si128(); |
1279 | | const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, |
1280 | | (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80); |
1281 | | |
1282 | | /* The second half of U and V is a bit more tricky... */ |
1283 | | /* B4 and B5 */ |
1284 | | for (size_t y = 0; y < padHeigth; y++) |
1285 | | { |
1286 | | const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y; |
1287 | | BYTE* pX = NULL; |
1288 | | |
1289 | | if ((y) % mod < (mod + 1) / 2) |
1290 | | { |
1291 | | const UINT32 pos = (2 * uY++ + oddY); |
1292 | | |
1293 | | if (pos >= nHeight) |
1294 | | continue; |
1295 | | |
1296 | | pX = pDst[1] + 1ULL * dstStep[1] * pos; |
1297 | | } |
1298 | | else |
1299 | | { |
1300 | | const UINT32 pos = (2 * vY++ + oddY); |
1301 | | |
1302 | | if (pos >= nHeight) |
1303 | | continue; |
1304 | | |
1305 | | pX = pDst[2] + 1ULL * dstStep[2] * pos; |
1306 | | } |
1307 | | |
1308 | | memcpy(pX, Ya, nWidth); |
1309 | | } |
1310 | | |
1311 | | /* B6 and B7 */ |
1312 | | for (size_t y = 0; y < halfHeight; y++) |
1313 | | { |
1314 | | const size_t val2y = (y * 2 + evenY); |
1315 | | const BYTE* Ua = pSrc[1] + srcStep[1] * y; |
1316 | | const BYTE* Va = pSrc[2] + srcStep[2] * y; |
1317 | | BYTE* pU = pDst[1] + dstStep[1] * val2y; |
1318 | | BYTE* pV = pDst[2] + dstStep[2] * val2y; |
1319 | | |
1320 | | size_t x = 0; |
1321 | | for (; x < halfWidth - halfPad; x += 16) |
1322 | | { |
1323 | | { |
1324 | | const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]); |
1325 | | const __m128i u2 = _mm_unpackhi_epi8(u, zero); |
1326 | | const __m128i u1 = _mm_unpacklo_epi8(u, zero); |
1327 | | _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); |
1328 | | _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); |
1329 | | } |
1330 | | { |
1331 | | const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]); |
1332 | | const __m128i u2 = _mm_unpackhi_epi8(u, zero); |
1333 | | const __m128i u1 = _mm_unpacklo_epi8(u, zero); |
1334 | | _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]); |
1335 | | _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]); |
1336 | | } |
1337 | | } |
1338 | | |
1339 | | for (; x < halfWidth; x++) |
1340 | | { |
1341 | | const UINT32 val2x1 = (x * 2 + oddX); |
1342 | | pU[val2x1] = Ua[x]; |
1343 | | pV[val2x1] = Va[x]; |
1344 | | } |
1345 | | } |
1346 | | |
1347 | | /* Filter */ |
1348 | | return ssse3_ChromaFilter(pDst, dstStep, roi); |
1349 | | } |
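 | | /* For reference, a minimal scalar sketch (illustrative only, not part of the
 | |  * original file) of what one 16-byte masked store above does: bytes are copied
 | |  * only where the corresponding mask byte has its most significant bit set. */
 | | static void scalar_maskmove16(BYTE* dst, const BYTE* src, const BYTE* mask8)
 | | {
 | | 	for (size_t i = 0; i < 16; i++)
 | | 	{
 | | 		if (mask8[i] & 0x80) /* MSB set -> this byte is stored */
 | | 			dst[i] = src[i];
 | | 	}
 | | }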
1350 | | |
1351 | | static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3], |
1352 | | const UINT32 srcStep[3], UINT32 nTotalWidth, |
1353 | | UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3], |
1354 | | const UINT32 dstStep[3], |
1355 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1356 | | { |
1357 | | const UINT32 nWidth = roi->right - roi->left; |
1358 | | const UINT32 nHeight = roi->bottom - roi->top; |
1359 | | const UINT32 halfWidth = (nWidth + 1) / 2; |
1360 | | const UINT32 halfPad = halfWidth % 16; |
1361 | | const UINT32 halfHeight = (nHeight + 1) / 2; |
1362 | | const UINT32 quarterWidth = (nWidth + 3) / 4;
1363 | | const UINT32 quarterPad = quarterWidth % 16;
1364 | | const __m128i zero = _mm_setzero_si128(); |
1365 | | const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, |
1366 | | (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0); |
1367 | | const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, |
1368 | | 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80); |
1369 | | const __m128i shuffle1 = |
1370 | | _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11, |
1371 | | (char)0x80, 10, (char)0x80, 9, (char)0x80, 8); |
1372 | | const __m128i shuffle2 = |
1373 | | _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3, |
1374 | | (char)0x80, 2, (char)0x80, 1, (char)0x80, 0); |
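 | | /* mask writes the odd destination bytes (B4/B5 pass), mask2 the even ones (B6 - B9);
 | |  * shuffle2 spreads source bytes 0..7 to the even result bytes and shuffle1 does the
 | |  * same for source bytes 8..15, the 0x80 control bytes producing zeros in between. */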
1375 | | |
1376 | | /* B4 and B5: odd UV values for width/2, height */ |
1377 | | for (size_t y = 0; y < nHeight; y++) |
1378 | | { |
1379 | | const size_t yTop = y + roi->top; |
1380 | | const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; |
1381 | | const BYTE* pYaV = pYaU + nTotalWidth / 2; |
1382 | | BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left; |
1383 | | BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left; |
1384 | | |
1385 | | size_t x = 0; |
1386 | | for (; x < halfWidth - halfPad; x += 16) |
1387 | | { |
1388 | | { |
1389 | | const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]); |
1390 | | const __m128i u2 = _mm_unpackhi_epi8(zero, u); |
1391 | | const __m128i u1 = _mm_unpacklo_epi8(zero, u); |
1392 | | _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); |
1393 | | _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); |
1394 | | } |
1395 | | { |
1396 | | const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]); |
1397 | | const __m128i v2 = _mm_unpackhi_epi8(zero, v); |
1398 | | const __m128i v1 = _mm_unpacklo_epi8(zero, v); |
1399 | | _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]); |
1400 | | _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]); |
1401 | | } |
1402 | | } |
1403 | | |
1404 | | for (; x < halfWidth; x++) |
1405 | | { |
1406 | | const UINT32 odd = 2 * x + 1; |
1407 | | pU[odd] = pYaU[x]; |
1408 | | pV[odd] = pYaV[x]; |
1409 | | } |
1410 | | } |
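 | | /* In the v2 layout each row of the auxiliary luma plane holds the odd-column U
 | |  * samples in its left half and the odd-column V samples in its right half
 | |  * (pYaV = pYaU + nTotalWidth / 2); the loop above expands them for every ROI row. */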
1411 | | |
1412 | | /* B6 - B9 */ |
1413 | | for (size_t y = 0; y < halfHeight; y++) |
1414 | | { |
1415 | | const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; |
1416 | | const BYTE* pUaV = pUaU + nTotalWidth / 4; |
1417 | | const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; |
1418 | | const BYTE* pVaV = pVaU + nTotalWidth / 4; |
1419 | | BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; |
1420 | | BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; |
1421 | | |
1422 | | UINT32 x = 0; |
1423 | | for (; x < quarterWidth - quarterPad; x += 16)
1424 | | { |
1425 | | { |
1426 | | const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]); |
1427 | | const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]); |
1428 | | const __m128i uHigh = _mm_unpackhi_epi8(uU, uV); |
1429 | | const __m128i uLow = _mm_unpacklo_epi8(uU, uV); |
1430 | | const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2); |
1431 | | const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1); |
1432 | | const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2); |
1433 | | const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1); |
1434 | | _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]); |
1435 | | _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]); |
1436 | | _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]); |
1437 | | _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]); |
1438 | | } |
1439 | | { |
1440 | | const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]); |
1441 | | const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]); |
1442 | | const __m128i vHigh = _mm_unpackhi_epi8(vU, vV); |
1443 | | const __m128i vLow = _mm_unpacklo_epi8(vU, vV); |
1444 | | const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2); |
1445 | | const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1); |
1446 | | const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2); |
1447 | | const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1); |
1448 | | _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]); |
1449 | | _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]); |
1450 | | _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]); |
1451 | | _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]); |
1452 | | } |
1453 | | } |
1454 | | |
1455 | | for (; x < quarterWidth; x++)
1456 | | { |
1457 | | pU[4 * x + 0] = pUaU[x]; |
1458 | | pV[4 * x + 0] = pUaV[x]; |
1459 | | pU[4 * x + 2] = pVaU[x]; |
1460 | | pV[4 * x + 2] = pVaV[x]; |
1461 | | } |
1462 | | } |
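 | | /* Net effect of the B6 - B9 pass: for every odd destination row, the auxiliary U
 | |  * plane supplies the U/V samples at columns 4*x and the auxiliary V plane those at
 | |  * columns 4*x + 2, matching the scalar fallback loop above. */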
1463 | | |
1464 | | return ssse3_ChromaFilter(pDst, dstStep, roi); |
1465 | | } |
1466 | | |
1467 | | static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type, |
1468 | | const BYTE* const WINPR_RESTRICT pSrc[3], |
1469 | | const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight, |
1470 | | BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3], |
1471 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1472 | | { |
1473 | | if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) |
1474 | | return -1; |
1475 | | |
1476 | | if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) |
1477 | | return -1; |
1478 | | |
1479 | | if (!roi) |
1480 | | return -1; |
1481 | | |
1482 | | switch (type) |
1483 | | { |
1484 | | case AVC444_LUMA: |
1485 | | return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); |
1486 | | |
1487 | | case AVC444_CHROMAv1: |
1488 | | return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); |
1489 | | |
1490 | | case AVC444_CHROMAv2: |
1491 | | return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); |
1492 | | |
1493 | | default: |
1494 | | return -1; |
1495 | | } |
1496 | | } |
1497 | | #endif |
1498 | | |
1499 | | void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims) |
1500 | 0 | { |
1501 | | #if defined(SSE2_ENABLED) |
1502 | | generic = primitives_get_generic(); |
1503 | | primitives_init_YUV(prims); |
1504 | | |
1505 | | if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && |
1506 | | IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) |
1507 | | { |
1508 | | WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations"); |
1509 | | prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420; |
1510 | | prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV; |
1511 | | prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2; |
1512 | | prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB; |
1513 | | prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R; |
1514 | | prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444; |
1515 | | } |
1516 | | #else |
1517 | 0 | WLog_VRB(PRIM_TAG, "undefined WITH_SSE2"); |
1518 | 0 | WINPR_UNUSED(prims); |
1519 | 0 | #endif |
1520 | 0 | } |
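 | | /* A minimal caller-side sketch (illustrative, not part of this file): the entry
 | |  * points registered above are reached through the primitives_t table. The function
 | |  * name and the plane/stride/roi parameters are placeholders supplied by the caller. */
 | | static pstatus_t example_avc444_luma(const BYTE* const srcPlanes[3], const UINT32 srcSteps[3],
 | |                                      UINT32 width, UINT32 height, BYTE* dstPlanes[3],
 | |                                      const UINT32 dstSteps[3], const RECTANGLE_16* roi)
 | | {
 | | 	primitives_t* prims = primitives_get();
 | | 	return prims->YUV420CombineToYUV444(AVC444_LUMA, srcPlanes, srcSteps, width, height,
 | | 	                                    dstPlanes, dstSteps, roi);
 | | }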