/src/FreeRDP/libfreerdp/primitives/prim_YUV_ssse3.c
Line | Count | Source |
1 | | /** |
2 | | * FreeRDP: A Remote Desktop Protocol Implementation |
3 | | * Optimized YUV/RGB conversion operations |
4 | | * |
5 | | * Copyright 2014 Thomas Erbesdobler |
6 | | * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com> |
7 | | * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com> |
8 | | * Copyright 2016-2017 Thincast Technologies GmbH |
9 | | * |
10 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
11 | | * you may not use this file except in compliance with the License. |
12 | | * You may obtain a copy of the License at |
13 | | * |
14 | | * http://www.apache.org/licenses/LICENSE-2.0 |
15 | | * |
16 | | * Unless required by applicable law or agreed to in writing, software |
17 | | * distributed under the License is distributed on an "AS IS" BASIS, |
18 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
19 | | * See the License for the specific language governing permissions and |
20 | | * limitations under the License. |
21 | | */ |
22 | | |
23 | | #include <winpr/wtypes.h> |
24 | | #include <freerdp/config.h> |
25 | | |
26 | | #include <winpr/sysinfo.h> |
27 | | #include <winpr/crt.h> |
28 | | #include <freerdp/types.h> |
29 | | #include <freerdp/primitives.h> |
30 | | |
31 | | #include "prim_internal.h" |
32 | | |
33 | | #include <emmintrin.h> |
34 | | #include <tmmintrin.h> |
35 | | |
36 | | #if !defined(WITH_SSE2) |
37 | | #error "This file needs WITH_SSE2 enabled!" |
38 | | #endif |
39 | | |
40 | | static primitives_t* generic = NULL; |
41 | | |
42 | | /****************************************************************************/ |
43 | | /* SSSE3 YUV420 -> RGB conversion */ |
44 | | /****************************************************************************/ |
45 | | static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw, |
46 | | __m128i Vraw, UINT8 pos) |
47 | 0 | { |
48 | | /* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer lists */ |
49 | | /* Note: This also applies to Visual Studio 2013 before Update 4 */ |
50 | 0 | #if !defined(_MSC_VER) || (_MSC_VER > 1600) |
51 | 0 | const __m128i mapY[] = { _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), |
52 | 0 | _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480), |
53 | 0 | _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880), |
54 | 0 | _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) }; |
55 | 0 | const __m128i mapUV[] = { _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080), |
56 | 0 | _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080), |
57 | 0 | _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080), |
58 | 0 | _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) }; |
59 | 0 | const __m128i mask[] = { _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080), |
60 | 0 | _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), |
61 | 0 | _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) }; |
62 | | #else |
63 | | /* Note: must be in little-endian format ! */ |
64 | | const __m128i mapY[] = { { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, |
65 | | 0x80, 0x80, 0x03, 0x80, 0x80 }, |
66 | | { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80, |
67 | | 0x80, 0x80, 0x07, 0x80, 0x80 }, |
68 | | { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80, |
69 | | 0x80, 0x80, 0x0b, 0x80, 0x80 }, |
70 | | { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80, |
71 | | 0x80, 0x80, 0x0f, 0x80, 0x80 } |
72 | | |
73 | | }; |
74 | | const __m128i mapUV[] = { { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01, |
75 | | 0x80, 0x02, 0x80, 0x03, 0x80 }, |
76 | | { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05, |
77 | | 0x80, 0x06, 0x80, 0x07, 0x80 }, |
78 | | { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09, |
79 | | 0x80, 0x0a, 0x80, 0x0b, 0x80 }, |
80 | | { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d, |
81 | | 0x80, 0x0e, 0x80, 0x0f, 0x80 } }; |
82 | | const __m128i mask[] = { { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, |
83 | | 0x80, 0x80, 0x80, 0x03, 0x80 }, |
84 | | { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, |
85 | | 0x80, 0x80, 0x03, 0x80, 0x80 }, |
86 | | { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, |
87 | | 0x80, 0x03, 0x80, 0x80, 0x80 } }; |
88 | | #endif |
89 | 0 | const __m128i c128 = _mm_set1_epi16(128); |
90 | 0 | __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst), |
91 | 0 | _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)); |
92 | 0 | { |
93 | 0 | __m128i C, D, E; |
94 | | /* Load Y values and expand to 32 bit */ |
95 | 0 | { |
96 | 0 | C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */ |
97 | 0 | } |
98 | | /* Load U values and expand to 32 bit */ |
99 | 0 | { |
100 | 0 | const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */ |
101 | 0 | D = _mm_sub_epi16(U, c128); /* D = U - 128 */ |
102 | 0 | } |
103 | | /* Load V values and expand to 32 bit */ |
104 | 0 | { |
105 | 0 | const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */ |
106 | 0 | E = _mm_sub_epi16(V, c128); /* E = V - 128 */ |
107 | 0 | } |
108 | | /* Get the R value */ |
109 | 0 | { |
110 | 0 | const __m128i c403 = _mm_set1_epi16(403); |
111 | 0 | const __m128i e403 = |
112 | 0 | _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403)); |
113 | 0 | const __m128i Rs = _mm_add_epi32(C, e403); |
114 | 0 | const __m128i R32 = _mm_srai_epi32(Rs, 8); |
115 | 0 | const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128()); |
116 | 0 | const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128()); |
117 | 0 | const __m128i packed = _mm_shuffle_epi8(R, mask[0]); |
118 | 0 | BGRX = _mm_or_si128(BGRX, packed); |
119 | 0 | } |
120 | | /* Get the G value */ |
121 | 0 | { |
122 | 0 | const __m128i c48 = _mm_set1_epi16(48); |
123 | 0 | const __m128i d48 = |
124 | 0 | _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48)); |
125 | 0 | const __m128i c120 = _mm_set1_epi16(120); |
126 | 0 | const __m128i e120 = |
127 | 0 | _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120)); |
128 | 0 | const __m128i de = _mm_add_epi32(d48, e120); |
129 | 0 | const __m128i Gs = _mm_sub_epi32(C, de); |
130 | 0 | const __m128i G32 = _mm_srai_epi32(Gs, 8); |
131 | 0 | const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128()); |
132 | 0 | const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128()); |
133 | 0 | const __m128i packed = _mm_shuffle_epi8(G, mask[1]); |
134 | 0 | BGRX = _mm_or_si128(BGRX, packed); |
135 | 0 | } |
136 | | /* Get the B value */ |
137 | 0 | { |
138 | 0 | const __m128i c475 = _mm_set1_epi16(475); |
139 | 0 | const __m128i d475 = |
140 | 0 | _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475)); |
141 | 0 | const __m128i Bs = _mm_add_epi32(C, d475); |
142 | 0 | const __m128i B32 = _mm_srai_epi32(Bs, 8); |
143 | 0 | const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128()); |
144 | 0 | const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128()); |
145 | 0 | const __m128i packed = _mm_shuffle_epi8(B, mask[2]); |
146 | 0 | BGRX = _mm_or_si128(BGRX, packed); |
147 | 0 | } |
148 | 0 | } |
149 | 0 | _mm_storeu_si128(dst++, BGRX); |
150 | 0 | return dst; |
151 | 0 | } |
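
Editorial note: the fixed-point factors used above (403, 48, 120, 475) match the BT.709 inverse coefficients scaled by 256. The following scalar sketch shows what one lane of ssse3_YUV444Pixel computes; it is an illustration only (helper names are not part of the file), and the X/alpha byte of the destination is preserved by the SIMD path, so it is not touched here.

static INLINE BYTE yuv_clamp_u8(INT32 v)
{
    return (BYTE)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static INLINE void yuv_pixel_to_bgrx_scalar(BYTE Y, BYTE U, BYTE V, BYTE* bgrx)
{
    const INT32 C = (INT32)Y << 8;  /* the mapY shuffle effectively multiplies Y by 256 */
    const INT32 D = (INT32)U - 128;
    const INT32 E = (INT32)V - 128;
    bgrx[0] = yuv_clamp_u8((C + 475 * D) >> 8);          /* B */
    bgrx[1] = yuv_clamp_u8((C - 48 * D - 120 * E) >> 8); /* G */
    bgrx[2] = yuv_clamp_u8((C + 403 * E) >> 8);          /* R */
    /* bgrx[3] (X/alpha) is left unchanged, as in the SIMD path */
}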
152 | | |
153 | | static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* const WINPR_RESTRICT pSrc[], |
154 | | const UINT32* WINPR_RESTRICT srcStep, |
155 | | BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, |
156 | | const prim_size_t* WINPR_RESTRICT roi) |
157 | 0 | { |
158 | 0 | const UINT32 nWidth = roi->width; |
159 | 0 | const UINT32 nHeight = roi->height; |
160 | 0 | const UINT32 pad = roi->width % 16; |
161 | 0 | const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); |
162 | 0 | UINT32 y; |
163 | |
|
164 | 0 | for (y = 0; y < nHeight; y++) |
165 | 0 | { |
166 | 0 | UINT32 x; |
167 | 0 | __m128i* dst = (__m128i*)(pDst + dstStep * y); |
168 | 0 | const BYTE* YData = pSrc[0] + y * srcStep[0]; |
169 | 0 | const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1]; |
170 | 0 | const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2]; |
171 | |
|
172 | 0 | for (x = 0; x < nWidth - pad; x += 16) |
173 | 0 | { |
174 | 0 | const __m128i Y = _mm_loadu_si128((const __m128i*)YData); |
175 | 0 | const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData); |
176 | 0 | const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData); |
177 | 0 | const __m128i U = _mm_shuffle_epi8(uRaw, duplicate); |
178 | 0 | const __m128i V = _mm_shuffle_epi8(vRaw, duplicate); |
179 | 0 | YData += 16; |
180 | 0 | UData += 8; |
181 | 0 | VData += 8; |
182 | 0 | dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); |
183 | 0 | dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); |
184 | 0 | dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); |
185 | 0 | dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); |
186 | 0 | } |
187 | |
|
188 | 0 | for (x = 0; x < pad; x++) |
189 | 0 | { |
190 | 0 | const BYTE Y = *YData++; |
191 | 0 | const BYTE U = *UData; |
192 | 0 | const BYTE V = *VData; |
193 | 0 | const BYTE r = YUV2R(Y, U, V); |
194 | 0 | const BYTE g = YUV2G(Y, U, V); |
195 | 0 | const BYTE b = YUV2B(Y, U, V); |
196 | 0 | dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0); |
197 | |
|
198 | 0 | if (x % 2) |
199 | 0 | { |
200 | 0 | UData++; |
201 | 0 | VData++; |
202 | 0 | } |
203 | 0 | } |
204 | 0 | } |
205 | |
|
206 | 0 | return PRIMITIVES_SUCCESS; |
207 | 0 | } |
208 | | |
209 | | static pstatus_t ssse3_YUV420ToRGB(const BYTE* const WINPR_RESTRICT pSrc[3], |
210 | | const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, |
211 | | UINT32 dstStep, UINT32 DstFormat, |
212 | | const prim_size_t* WINPR_RESTRICT roi) |
213 | 0 | { |
214 | 0 | switch (DstFormat) |
215 | 0 | { |
216 | 0 | case PIXEL_FORMAT_BGRX32: |
217 | 0 | case PIXEL_FORMAT_BGRA32: |
218 | 0 | return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi); |
219 | | |
220 | 0 | default: |
221 | 0 | return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); |
222 | 0 | } |
223 | 0 | } |
224 | | |
225 | | static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* const WINPR_RESTRICT pSrc[], |
226 | | const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst, |
227 | | UINT32 dstStep, |
228 | | const prim_size_t* WINPR_RESTRICT roi) |
229 | 0 | { |
230 | 0 | const UINT32 nWidth = roi->width; |
231 | 0 | const UINT32 nHeight = roi->height; |
232 | 0 | const UINT32 pad = roi->width % 16; |
233 | 0 | UINT32 y; |
234 | |
|
235 | 0 | for (y = 0; y < nHeight; y++) |
236 | 0 | { |
237 | 0 | UINT32 x; |
238 | 0 | __m128i* dst = (__m128i*)(pDst + dstStep * y); |
239 | 0 | const BYTE* YData = pSrc[0] + y * srcStep[0]; |
240 | 0 | const BYTE* UData = pSrc[1] + y * srcStep[1]; |
241 | 0 | const BYTE* VData = pSrc[2] + y * srcStep[2]; |
242 | |
|
243 | 0 | for (x = 0; x < nWidth - pad; x += 16) |
244 | 0 | { |
245 | 0 | __m128i Y = _mm_load_si128((const __m128i*)YData); |
246 | 0 | __m128i U = _mm_load_si128((const __m128i*)UData); |
247 | 0 | __m128i V = _mm_load_si128((const __m128i*)VData); |
248 | 0 | YData += 16; |
249 | 0 | UData += 16; |
250 | 0 | VData += 16; |
251 | 0 | dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); |
252 | 0 | dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); |
253 | 0 | dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); |
254 | 0 | dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); |
255 | 0 | } |
256 | |
|
257 | 0 | for (x = 0; x < pad; x++) |
258 | 0 | { |
259 | 0 | const BYTE Y = *YData++; |
260 | 0 | const BYTE U = *UData++; |
261 | 0 | const BYTE V = *VData++; |
262 | 0 | const BYTE r = YUV2R(Y, U, V); |
263 | 0 | const BYTE g = YUV2G(Y, U, V); |
264 | 0 | const BYTE b = YUV2B(Y, U, V); |
265 | 0 | dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0); |
266 | 0 | } |
267 | 0 | } |
268 | |
|
269 | 0 | return PRIMITIVES_SUCCESS; |
270 | 0 | } |
271 | | |
272 | | static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[], |
273 | | const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst, |
274 | | UINT32 dstStep, UINT32 DstFormat, |
275 | | const prim_size_t* WINPR_RESTRICT roi) |
276 | 0 | { |
277 | 0 | if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 || |
278 | 0 | srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16) |
279 | 0 | return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); |
280 | | |
281 | 0 | switch (DstFormat) |
282 | 0 | { |
283 | 0 | case PIXEL_FORMAT_BGRX32: |
284 | 0 | case PIXEL_FORMAT_BGRA32: |
285 | 0 | return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi); |
286 | | |
287 | 0 | default: |
288 | 0 | return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); |
289 | 0 | } |
290 | 0 | } |
291 | | |
292 | | /****************************************************************************/ |
293 | | /* SSSE3 RGB -> YUV420 conversion **/ |
294 | | /****************************************************************************/ |
295 | | |
296 | | /** |
297 | | * Note (nfedera): |
298 | | * The used forward transformation factors from RGB to YUV are based on the |
299 | | * values specified in [Rec. ITU-R BT.709-6] Section 3: |
300 | | * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en |
301 | | * |
302 | | * Y = 0.21260 * R + 0.71520 * G + 0.07220 * B + 0; |
303 | | * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128; |
304 | | * V = 0.50000 * R - 0.45415 * G - 0.04585 * B + 128; |
305 | | * |
306 | | * The most accurate integer arithmetic approximation when using 8-bit signed |
307 | | * integer factors with 16-bit signed integer intermediate results is: |
308 | | * |
309 | | * Y = ( ( 27 * R + 92 * G + 9 * B) >> 7 ); |
310 | | * U = ( (-29 * R - 99 * G + 128 * B) >> 8 ) + 128; |
311 | | * V = ( ( 128 * R - 116 * G - 12 * B) >> 8 ) + 128; |
312 | | * |
313 | | * Because the signed 8-bit range is [-128,127], the U and V factors of 128 are |
314 | | * rounded down to 127. |
315 | | */ |
316 | | |
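Editorial note: a scalar sketch of the integer approximation quoted above, using the same factors and shifts as the macros that follow (including 127 in place of 128, as explained in the note). The helper name is illustrative, not part of the file.

static INLINE void bgrx_pixel_to_yuv_scalar(const BYTE* bgrx, BYTE* Y, BYTE* U, BYTE* V)
{
    const INT32 B = bgrx[0];
    const INT32 G = bgrx[1];
    const INT32 R = bgrx[2];
    *Y = (BYTE)((27 * R + 92 * G + 9 * B) >> 7);
    *U = (BYTE)(((-29 * R - 99 * G + 127 * B) >> 8) + 128);
    *V = (BYTE)(((127 * R - 116 * G - 12 * B) >> 8) + 128);
}
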
317 | 0 | #define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9) |
318 | | #define BGRX_U_FACTORS \ |
319 | 0 | _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127) |
320 | | #define BGRX_V_FACTORS \ |
321 | 0 | _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12) |
322 | 0 | #define CONST128_FACTORS _mm_set1_epi8(-128) |
323 | | |
324 | 0 | #define Y_SHIFT 7 |
325 | 0 | #define U_SHIFT 8 |
326 | 0 | #define V_SHIFT 8 |
327 | | |
328 | | /* |
329 | | TODO: |
330 | | RGB[AX] can simply be supported using the following factors. And instead of loading the |
331 | | globals directly the functions below could be passed pointers to the correct vectors |
332 | | depending on the source picture format. |
333 | | |
334 | | PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = { |
335 | | 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0 |
336 | | }; |
337 | | PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = { |
338 | | -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0 |
339 | | }; |
340 | | PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = { |
341 | | 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0 |
342 | | }; |
343 | | */ |
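
Editorial note: a minimal sketch of the parameterization the TODO above suggests. The luma routine below would take the factor vector as an argument instead of expanding BGRX_Y_FACTORS, so a caller could pass either the BGRX or the (hypothetical) RGBX factors. The function name is invented for illustration; it is not part of the file.

static INLINE void ssse3_RGBToYUV420_ANY_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst,
                                           UINT32 width, __m128i y_factors)
{
    const __m128i* px = (const __m128i*)src;
    __m128i* ydst = (__m128i*)dst;

    for (UINT32 x = 0; x < width; x += 16)
    {
        /* 16 pixels per iteration, same dataflow as ssse3_RGBToYUV420_BGRX_Y below */
        __m128i x0 = _mm_maddubs_epi16(_mm_load_si128(px++), y_factors);
        __m128i x1 = _mm_maddubs_epi16(_mm_load_si128(px++), y_factors);
        __m128i x2 = _mm_maddubs_epi16(_mm_load_si128(px++), y_factors);
        __m128i x3 = _mm_maddubs_epi16(_mm_load_si128(px++), y_factors);
        x0 = _mm_srli_epi16(_mm_hadd_epi16(x0, x1), Y_SHIFT);
        x2 = _mm_srli_epi16(_mm_hadd_epi16(x2, x3), Y_SHIFT);
        _mm_storeu_si128(ydst++, _mm_packus_epi16(x0, x2)); /* 16 Y bytes out */
    }
}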
344 | | |
345 | | /* compute the luma (Y) component from a single rgb source line */ |
346 | | |
347 | | static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width) |
348 | 0 | { |
349 | 0 | UINT32 x; |
350 | 0 | __m128i x0, x1, x2, x3; |
351 | 0 | const __m128i y_factors = BGRX_Y_FACTORS; |
352 | 0 | const __m128i* argb = (const __m128i*)src; |
353 | 0 | __m128i* ydst = (__m128i*)dst; |
354 | |
|
355 | 0 | for (x = 0; x < width; x += 16) |
356 | 0 | { |
357 | | /* store 16 rgba pixels in 4 128 bit registers */ |
358 | 0 | x0 = _mm_load_si128(argb++); // 1st 4 pixels |
359 | 0 | x1 = _mm_load_si128(argb++); // 2nd 4 pixels |
360 | 0 | x2 = _mm_load_si128(argb++); // 3rd 4 pixels |
361 | 0 | x3 = _mm_load_si128(argb++); // 4th 4 pixels |
362 | | /* multiplications and subtotals */ |
363 | 0 | x0 = _mm_maddubs_epi16(x0, y_factors); |
364 | 0 | x1 = _mm_maddubs_epi16(x1, y_factors); |
365 | 0 | x2 = _mm_maddubs_epi16(x2, y_factors); |
366 | 0 | x3 = _mm_maddubs_epi16(x3, y_factors); |
367 | | /* the total sums */ |
368 | 0 | x0 = _mm_hadd_epi16(x0, x1); |
369 | 0 | x2 = _mm_hadd_epi16(x2, x3); |
370 | | /* shift the results */ |
371 | 0 | x0 = _mm_srli_epi16(x0, Y_SHIFT); |
372 | 0 | x2 = _mm_srli_epi16(x2, Y_SHIFT); |
373 | | /* pack the 16 words into bytes */ |
374 | 0 | x0 = _mm_packus_epi16(x0, x2); |
375 | | /* save to y plane */ |
376 | 0 | _mm_storeu_si128(ydst++, x0); |
377 | 0 | } |
378 | 0 | } |
379 | | |
380 | | /* compute the chrominance (UV) components from two rgb source lines */ |
381 | | |
382 | | static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1, |
383 | | const BYTE* WINPR_RESTRICT src2, |
384 | | BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2, |
385 | | UINT32 width) |
386 | 0 | { |
387 | 0 | UINT32 x; |
388 | 0 | const __m128i u_factors = BGRX_U_FACTORS; |
389 | 0 | const __m128i v_factors = BGRX_V_FACTORS; |
390 | 0 | const __m128i vector128 = CONST128_FACTORS; |
391 | 0 | __m128i x0, x1, x2, x3, x4, x5; |
392 | 0 | const __m128i* rgb1 = (const __m128i*)src1; |
393 | 0 | const __m128i* rgb2 = (const __m128i*)src2; |
394 | 0 | __m64* udst = (__m64*)dst1; |
395 | 0 | __m64* vdst = (__m64*)dst2; |
396 | |
|
397 | 0 | for (x = 0; x < width; x += 16) |
398 | 0 | { |
399 | | /* subsample 16x2 pixels into 16x1 pixels */ |
400 | 0 | x0 = _mm_load_si128(rgb1++); |
401 | 0 | x4 = _mm_load_si128(rgb2++); |
402 | 0 | x0 = _mm_avg_epu8(x0, x4); |
403 | 0 | x1 = _mm_load_si128(rgb1++); |
404 | 0 | x4 = _mm_load_si128(rgb2++); |
405 | 0 | x1 = _mm_avg_epu8(x1, x4); |
406 | 0 | x2 = _mm_load_si128(rgb1++); |
407 | 0 | x4 = _mm_load_si128(rgb2++); |
408 | 0 | x2 = _mm_avg_epu8(x2, x4); |
409 | 0 | x3 = _mm_load_si128(rgb1++); |
410 | 0 | x4 = _mm_load_si128(rgb2++); |
411 | 0 | x3 = _mm_avg_epu8(x3, x4); |
412 | | /* subsample these 16x1 pixels into 8x1 pixels */ |
413 | | /** |
414 | | * shuffle controls |
415 | | * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88 |
416 | | * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd |
417 | | */ |
418 | 0 | x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88)); |
419 | 0 | x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd)); |
420 | 0 | x0 = _mm_avg_epu8(x0, x4); |
421 | 0 | x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88)); |
422 | 0 | x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd)); |
423 | 0 | x1 = _mm_avg_epu8(x1, x4); |
424 | | /* multiplications and subtotals */ |
425 | 0 | x2 = _mm_maddubs_epi16(x0, u_factors); |
426 | 0 | x3 = _mm_maddubs_epi16(x1, u_factors); |
427 | 0 | x4 = _mm_maddubs_epi16(x0, v_factors); |
428 | 0 | x5 = _mm_maddubs_epi16(x1, v_factors); |
429 | | /* the total sums */ |
430 | 0 | x0 = _mm_hadd_epi16(x2, x3); |
431 | 0 | x1 = _mm_hadd_epi16(x4, x5); |
432 | | /* shift the results */ |
433 | 0 | x0 = _mm_srai_epi16(x0, U_SHIFT); |
434 | 0 | x1 = _mm_srai_epi16(x1, V_SHIFT); |
435 | | /* pack the 16 words into bytes */ |
436 | 0 | x0 = _mm_packs_epi16(x0, x1); |
437 | | /* add 128 */ |
438 | 0 | x0 = _mm_sub_epi8(x0, vector128); |
439 | | /* the lower 8 bytes go to the u plane */ |
440 | 0 | _mm_storel_pi(udst++, _mm_castsi128_ps(x0)); |
441 | | /* the upper 8 bytes go to the v plane */ |
442 | 0 | _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0)); |
443 | 0 | } |
444 | 0 | } |
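
Editorial note: a scalar reference for the routine above; each 2x2 block of the two source lines is reduced to one averaged BGRX sample, which is then run through the same U/V factors and shifts. The SIMD path averages vertically and then horizontally with _mm_avg_epu8, which rounds at each step, so results can differ from the single rounded average below by one LSB. The helper name is illustrative only.

static INLINE void bgrx_2x2_to_uv_scalar(const BYTE* p1 /* 2 adjacent BGRX pixels, line 1 */,
                                         const BYTE* p2 /* 2 adjacent BGRX pixels, line 2 */,
                                         BYTE* u, BYTE* v)
{
    const INT32 B = (p1[0] + p1[4] + p2[0] + p2[4] + 2) / 4;
    const INT32 G = (p1[1] + p1[5] + p2[1] + p2[5] + 2) / 4;
    const INT32 R = (p1[2] + p1[6] + p2[2] + p2[6] + 2) / 4;
    *u = (BYTE)(((-29 * R - 99 * G + 127 * B) >> 8) + 128);
    *v = (BYTE)(((127 * R - 116 * G - 12 * B) >> 8) + 128);
}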
445 | | |
446 | | static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
447 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[], |
448 | | const UINT32 dstStep[], |
449 | | const prim_size_t* WINPR_RESTRICT roi) |
450 | 0 | { |
451 | 0 | UINT32 y; |
452 | 0 | const BYTE* argb = pSrc; |
453 | 0 | BYTE* ydst = pDst[0]; |
454 | 0 | BYTE* udst = pDst[1]; |
455 | 0 | BYTE* vdst = pDst[2]; |
456 | |
|
457 | 0 | if (roi->height < 1 || roi->width < 1) |
458 | 0 | { |
459 | 0 | return !PRIMITIVES_SUCCESS; |
460 | 0 | } |
461 | | |
462 | 0 | if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16) |
463 | 0 | { |
464 | 0 | return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); |
465 | 0 | } |
466 | | |
467 | 0 | for (y = 0; y < roi->height - 1; y += 2) |
468 | 0 | { |
469 | 0 | const BYTE* line1 = argb; |
470 | 0 | const BYTE* line2 = argb + srcStep; |
471 | 0 | ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width); |
472 | 0 | ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width); |
473 | 0 | ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width); |
474 | 0 | argb += 2 * srcStep; |
475 | 0 | ydst += 2 * dstStep[0]; |
476 | 0 | udst += 1 * dstStep[1]; |
477 | 0 | vdst += 1 * dstStep[2]; |
478 | 0 | } |
479 | |
|
480 | 0 | if (roi->height & 1) |
481 | 0 | { |
482 | | /* for an odd height, pass the same last line twice for the UV computation */ |
483 | 0 | ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width); |
484 | 0 | ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width); |
485 | 0 | } |
486 | |
|
487 | 0 | return PRIMITIVES_SUCCESS; |
488 | 0 | } |
489 | | |
490 | | static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
491 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[], |
492 | | const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi) |
493 | 0 | { |
494 | 0 | switch (srcFormat) |
495 | 0 | { |
496 | 0 | case PIXEL_FORMAT_BGRX32: |
497 | 0 | case PIXEL_FORMAT_BGRA32: |
498 | 0 | return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi); |
499 | | |
500 | 0 | default: |
501 | 0 | return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); |
502 | 0 | } |
503 | 0 | } |
504 | | |
505 | | /****************************************************************************/ |
506 | | /* SSSE3 RGB -> AVC444-YUV conversion **/ |
507 | | /****************************************************************************/ |
508 | | |
509 | | static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW( |
510 | | const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, |
511 | | BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2, |
512 | | BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5, |
513 | | BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width) |
514 | 0 | { |
515 | 0 | UINT32 x; |
516 | 0 | const __m128i* argbEven = (const __m128i*)srcEven; |
517 | 0 | const __m128i* argbOdd = (const __m128i*)srcOdd; |
518 | 0 | const __m128i y_factors = BGRX_Y_FACTORS; |
519 | 0 | const __m128i u_factors = BGRX_U_FACTORS; |
520 | 0 | const __m128i v_factors = BGRX_V_FACTORS; |
521 | 0 | const __m128i vector128 = CONST128_FACTORS; |
522 | |
|
523 | 0 | for (x = 0; x < width; x += 16) |
524 | 0 | { |
525 | | /* store 16 rgba pixels in 4 128 bit registers */ |
526 | 0 | const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels |
527 | 0 | const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels |
528 | 0 | const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels |
529 | 0 | const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels |
530 | 0 | const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels |
531 | 0 | const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels |
532 | 0 | const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels |
533 | 0 | const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels |
534 | 0 | { |
535 | | /* Y: multiplications with subtotals and horizontal sums */ |
536 | 0 | const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors), |
537 | 0 | _mm_maddubs_epi16(xe2, y_factors)), |
538 | 0 | Y_SHIFT); |
539 | 0 | const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors), |
540 | 0 | _mm_maddubs_epi16(xe4, y_factors)), |
541 | 0 | Y_SHIFT); |
542 | 0 | const __m128i ye = _mm_packus_epi16(ye1, ye2); |
543 | 0 | const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors), |
544 | 0 | _mm_maddubs_epi16(xo2, y_factors)), |
545 | 0 | Y_SHIFT); |
546 | 0 | const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors), |
547 | 0 | _mm_maddubs_epi16(xo4, y_factors)), |
548 | 0 | Y_SHIFT); |
549 | 0 | const __m128i yo = _mm_packus_epi16(yo1, yo2); |
550 | | /* store y [b1] */ |
551 | 0 | _mm_storeu_si128((__m128i*)b1Even, ye); |
552 | 0 | b1Even += 16; |
553 | |
|
554 | 0 | if (b1Odd) |
555 | 0 | { |
556 | 0 | _mm_storeu_si128((__m128i*)b1Odd, yo); |
557 | 0 | b1Odd += 16; |
558 | 0 | } |
559 | 0 | } |
560 | 0 | { |
561 | | /* We have now |
562 | | * 16 even U values in ue |
563 | | * 16 odd U values in uo |
564 | | * |
565 | | * We need to split these according to |
566 | | * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */ |
567 | 0 | __m128i ue, uo = { 0 }; |
568 | 0 | { |
569 | 0 | const __m128i ue1 = |
570 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors), |
571 | 0 | _mm_maddubs_epi16(xe2, u_factors)), |
572 | 0 | U_SHIFT); |
573 | 0 | const __m128i ue2 = |
574 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors), |
575 | 0 | _mm_maddubs_epi16(xe4, u_factors)), |
576 | 0 | U_SHIFT); |
577 | 0 | ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128); |
578 | 0 | } |
579 | |
|
580 | 0 | if (b1Odd) |
581 | 0 | { |
582 | 0 | const __m128i uo1 = |
583 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors), |
584 | 0 | _mm_maddubs_epi16(xo2, u_factors)), |
585 | 0 | U_SHIFT); |
586 | 0 | const __m128i uo2 = |
587 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors), |
588 | 0 | _mm_maddubs_epi16(xo4, u_factors)), |
589 | 0 | U_SHIFT); |
590 | 0 | uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128); |
591 | 0 | } |
592 | | |
593 | | /* Now we need the following storage distribution: |
594 | | * 2x 2y -> b2 |
595 | | * x 2y+1 -> b4 |
596 | | * 2x+1 2y -> b6 */ |
597 | 0 | if (b1Odd) /* b2 */ |
598 | 0 | { |
599 | 0 | const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128()); |
600 | 0 | const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128()); |
601 | 0 | const __m128i hi = _mm_add_epi16(ueh, uoh); |
602 | 0 | const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128()); |
603 | 0 | const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128()); |
604 | 0 | const __m128i lo = _mm_add_epi16(uel, uol); |
605 | 0 | const __m128i added = _mm_hadd_epi16(lo, hi); |
606 | 0 | const __m128i avg16 = _mm_srai_epi16(added, 2); |
607 | 0 | const __m128i avg = _mm_packus_epi16(avg16, avg16); |
608 | 0 | _mm_storel_epi64((__m128i*)b2, avg); |
609 | 0 | } |
610 | 0 | else |
611 | 0 | { |
612 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
613 | 0 | 14, 12, 10, 8, 6, 4, 2, 0); |
614 | 0 | const __m128i ud = _mm_shuffle_epi8(ue, mask); |
615 | 0 | _mm_storel_epi64((__m128i*)b2, ud); |
616 | 0 | } |
617 | |
|
618 | 0 | b2 += 8; |
619 | |
|
620 | 0 | if (b1Odd) /* b4 */ |
621 | 0 | { |
622 | 0 | _mm_store_si128((__m128i*)b4, uo); |
623 | 0 | b4 += 16; |
624 | 0 | } |
625 | |
|
626 | 0 | { |
627 | | /* b6 */ |
628 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
629 | 0 | 15, 13, 11, 9, 7, 5, 3, 1); |
630 | 0 | const __m128i ude = _mm_shuffle_epi8(ue, mask); |
631 | 0 | _mm_storel_epi64((__m128i*)b6, ude); |
632 | 0 | b6 += 8; |
633 | 0 | } |
634 | 0 | } |
635 | 0 | { |
636 | | /* We have now |
637 | | * 16 even V values in ve |
638 | | * 16 odd V values in vo |
639 | | * |
640 | | * We need to split these according to |
641 | | * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */ |
642 | 0 | __m128i ve, vo = { 0 }; |
643 | 0 | { |
644 | 0 | const __m128i ve1 = |
645 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors), |
646 | 0 | _mm_maddubs_epi16(xe2, v_factors)), |
647 | 0 | V_SHIFT); |
648 | 0 | const __m128i ve2 = |
649 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors), |
650 | 0 | _mm_maddubs_epi16(xe4, v_factors)), |
651 | 0 | V_SHIFT); |
652 | 0 | ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128); |
653 | 0 | } |
654 | |
|
655 | 0 | if (b1Odd) |
656 | 0 | { |
657 | 0 | const __m128i vo1 = |
658 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors), |
659 | 0 | _mm_maddubs_epi16(xo2, v_factors)), |
660 | 0 | V_SHIFT); |
661 | 0 | const __m128i vo2 = |
662 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors), |
663 | 0 | _mm_maddubs_epi16(xo4, v_factors)), |
664 | 0 | V_SHIFT); |
665 | 0 | vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128); |
666 | 0 | } |
667 | | |
668 | | /* Now we need the following storage distribution: |
669 | | * 2x 2y -> b3 |
670 | | * x 2y+1 -> b5 |
671 | | * 2x+1 2y -> b7 */ |
672 | 0 | if (b1Odd) /* b3 */ |
673 | 0 | { |
674 | 0 | const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128()); |
675 | 0 | const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128()); |
676 | 0 | const __m128i hi = _mm_add_epi16(veh, voh); |
677 | 0 | const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128()); |
678 | 0 | const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128()); |
679 | 0 | const __m128i lo = _mm_add_epi16(vel, vol); |
680 | 0 | const __m128i added = _mm_hadd_epi16(lo, hi); |
681 | 0 | const __m128i avg16 = _mm_srai_epi16(added, 2); |
682 | 0 | const __m128i avg = _mm_packus_epi16(avg16, avg16); |
683 | 0 | _mm_storel_epi64((__m128i*)b3, avg); |
684 | 0 | } |
685 | 0 | else |
686 | 0 | { |
687 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
688 | 0 | 14, 12, 10, 8, 6, 4, 2, 0); |
689 | 0 | const __m128i vd = _mm_shuffle_epi8(ve, mask); |
690 | 0 | _mm_storel_epi64((__m128i*)b3, vd); |
691 | 0 | } |
692 | |
|
693 | 0 | b3 += 8; |
694 | |
|
695 | 0 | if (b1Odd) /* b5 */ |
696 | 0 | { |
697 | 0 | _mm_store_si128((__m128i*)b5, vo); |
698 | 0 | b5 += 16; |
699 | 0 | } |
700 | |
|
701 | 0 | { |
702 | | /* b7 */ |
703 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
704 | 0 | 15, 13, 11, 9, 7, 5, 3, 1); |
705 | 0 | const __m128i vde = _mm_shuffle_epi8(ve, mask); |
706 | 0 | _mm_storel_epi64((__m128i*)b7, vde); |
707 | 0 | b7 += 8; |
708 | 0 | } |
709 | 0 | } |
710 | 0 | } |
711 | 0 | } |
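
Editorial note: a scalar sketch of the storage distribution described by the b2/b4/b6 comments above, for one 2x2 block of a (hypothetical) full-resolution U plane; V is split to b3/b5/b7 in the same way, and the real code derives the U/V samples from the BGRX pixels on the fly rather than reading a plane. For the odd-height tail row (b1Odd == NULL) only the even-column samples of the single row go to b2. Function and parameter names are invented for illustration.

static INLINE void avc444v1_split_u_2x2(const BYTE* U, UINT32 stride, UINT32 x, UINT32 y,
                                        BYTE* b2, BYTE* b4, BYTE* b6)
{
    const BYTE u00 = U[(2 * y + 0) * stride + 2 * x + 0];
    const BYTE u10 = U[(2 * y + 0) * stride + 2 * x + 1];
    const BYTE u01 = U[(2 * y + 1) * stride + 2 * x + 0];
    const BYTE u11 = U[(2 * y + 1) * stride + 2 * x + 1];

    b2[x] = (BYTE)((u00 + u10 + u01 + u11) >> 2); /* main view: 2x2 average (4:2:0 chroma) */
    b4[2 * x + 0] = u01;                          /* auxiliary: the odd row ...            */
    b4[2 * x + 1] = u11;                          /* ... at full horizontal resolution      */
    b6[x] = u10;                                  /* auxiliary: odd columns of the even row */
}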
712 | | |
713 | | static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
714 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
715 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
716 | | const UINT32 dst2Step[], |
717 | | const prim_size_t* WINPR_RESTRICT roi) |
718 | 0 | { |
719 | 0 | const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep; |
720 | |
|
721 | 0 | if (roi->height < 1 || roi->width < 1) |
722 | 0 | return !PRIMITIVES_SUCCESS; |
723 | | |
724 | 0 | if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16) |
725 | 0 | return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, |
726 | 0 | roi); |
727 | | |
728 | 0 | for (UINT32 y = 0; y < roi->height; y += 2) |
729 | 0 | { |
730 | 0 | const BOOL last = (y >= (roi->height - 1)); |
731 | 0 | const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc; |
732 | 0 | const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc; |
733 | 0 | const UINT32 i = y >> 1; |
734 | 0 | const UINT32 n = (i & ~7) + i; |
735 | 0 | BYTE* b1Even = pDst1[0] + y * dst1Step[0]; |
736 | 0 | BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL; |
737 | 0 | BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1]; |
738 | 0 | BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2]; |
739 | 0 | BYTE* b4 = pDst2[0] + dst2Step[0] * n; |
740 | 0 | BYTE* b5 = b4 + 8 * dst2Step[0]; |
741 | 0 | BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1]; |
742 | 0 | BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2]; |
743 | 0 | ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7, |
744 | 0 | roi->width); |
745 | 0 | } |
746 | |
|
747 | 0 | return PRIMITIVES_SUCCESS; |
748 | 0 | } |
749 | | |
750 | | static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
751 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
752 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
753 | | const UINT32 dst2Step[], |
754 | | const prim_size_t* WINPR_RESTRICT roi) |
755 | 0 | { |
756 | 0 | switch (srcFormat) |
757 | 0 | { |
758 | 0 | case PIXEL_FORMAT_BGRX32: |
759 | 0 | case PIXEL_FORMAT_BGRA32: |
760 | 0 | return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
761 | 0 | dst2Step, roi); |
762 | | |
763 | 0 | default: |
764 | 0 | return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
765 | 0 | dst2Step, roi); |
766 | 0 | } |
767 | 0 | } |
768 | | |
769 | | /* Mapping of arguments: |
770 | | * |
771 | | * b1 [even lines] -> yLumaDstEven |
772 | | * b1 [odd lines] -> yLumaDstOdd |
773 | | * b2 -> uLumaDst |
774 | | * b3 -> vLumaDst |
775 | | * b4 -> yChromaDst1 |
776 | | * b5 -> yChromaDst2 |
777 | | * b6 -> uChromaDst1 |
778 | | * b7 -> uChromaDst2 |
779 | | * b8 -> vChromaDst1 |
780 | | * b9 -> vChromaDst2 |
781 | | */ |
782 | | static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW( |
783 | | const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, |
784 | | BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd, |
785 | | BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst, |
786 | | BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2, |
787 | | BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2, |
788 | | BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2, |
789 | | BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width) |
790 | 0 | { |
791 | 0 | UINT32 x; |
792 | 0 | const __m128i vector128 = CONST128_FACTORS; |
793 | 0 | const __m128i* argbEven = (const __m128i*)srcEven; |
794 | 0 | const __m128i* argbOdd = (const __m128i*)srcOdd; |
795 | |
|
796 | 0 | for (x = 0; x < width; x += 16) |
797 | 0 | { |
798 | | /* store 16 rgba pixels in 4 128 bit registers |
799 | | * for even and odd rows. |
800 | | */ |
801 | 0 | const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */ |
802 | 0 | const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */ |
803 | 0 | const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */ |
804 | 0 | const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */ |
805 | 0 | const __m128i xo1 = _mm_load_si128(argbOdd++); /* 1st 4 pixels */ |
806 | 0 | const __m128i xo2 = _mm_load_si128(argbOdd++); /* 2nd 4 pixels */ |
807 | 0 | const __m128i xo3 = _mm_load_si128(argbOdd++); /* 3rd 4 pixels */ |
808 | 0 | const __m128i xo4 = _mm_load_si128(argbOdd++); /* 4th 4 pixels */ |
809 | 0 | { |
810 | | /* Y: multiplications with subtotals and horizontal sums */ |
811 | 0 | const __m128i y_factors = BGRX_Y_FACTORS; |
812 | 0 | const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors), |
813 | 0 | _mm_maddubs_epi16(xe2, y_factors)), |
814 | 0 | Y_SHIFT); |
815 | 0 | const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors), |
816 | 0 | _mm_maddubs_epi16(xe4, y_factors)), |
817 | 0 | Y_SHIFT); |
818 | 0 | const __m128i ye = _mm_packus_epi16(ye1, ye2); |
819 | | /* store y [b1] */ |
820 | 0 | _mm_storeu_si128((__m128i*)yLumaDstEven, ye); |
821 | 0 | yLumaDstEven += 16; |
822 | 0 | } |
823 | |
|
824 | 0 | if (yLumaDstOdd) |
825 | 0 | { |
826 | 0 | const __m128i y_factors = BGRX_Y_FACTORS; |
827 | 0 | const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors), |
828 | 0 | _mm_maddubs_epi16(xo2, y_factors)), |
829 | 0 | Y_SHIFT); |
830 | 0 | const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors), |
831 | 0 | _mm_maddubs_epi16(xo4, y_factors)), |
832 | 0 | Y_SHIFT); |
833 | 0 | const __m128i yo = _mm_packus_epi16(yo1, yo2); |
834 | 0 | _mm_storeu_si128((__m128i*)yLumaDstOdd, yo); |
835 | 0 | yLumaDstOdd += 16; |
836 | 0 | } |
837 | |
|
838 | 0 | { |
839 | | /* We have now |
840 | | * 16 even U values in ue |
841 | | * 16 odd U values in uo |
842 | | * |
843 | | * We need to split these according to |
844 | | * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */ |
845 | | /* U: multiplications with subtotals and horizontal sums */ |
846 | 0 | __m128i ue, uo, uavg; |
847 | 0 | { |
848 | 0 | const __m128i u_factors = BGRX_U_FACTORS; |
849 | 0 | const __m128i ue1 = |
850 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors), |
851 | 0 | _mm_maddubs_epi16(xe2, u_factors)), |
852 | 0 | U_SHIFT); |
853 | 0 | const __m128i ue2 = |
854 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors), |
855 | 0 | _mm_maddubs_epi16(xe4, u_factors)), |
856 | 0 | U_SHIFT); |
857 | 0 | const __m128i ueavg = _mm_hadd_epi16(ue1, ue2); |
858 | 0 | ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128); |
859 | 0 | uavg = ueavg; |
860 | 0 | } |
861 | 0 | { |
862 | 0 | const __m128i u_factors = BGRX_U_FACTORS; |
863 | 0 | const __m128i uo1 = |
864 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors), |
865 | 0 | _mm_maddubs_epi16(xo2, u_factors)), |
866 | 0 | U_SHIFT); |
867 | 0 | const __m128i uo2 = |
868 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors), |
869 | 0 | _mm_maddubs_epi16(xo4, u_factors)), |
870 | 0 | U_SHIFT); |
871 | 0 | const __m128i uoavg = _mm_hadd_epi16(uo1, uo2); |
872 | 0 | uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128); |
873 | 0 | uavg = _mm_add_epi16(uavg, uoavg); |
874 | 0 | uavg = _mm_srai_epi16(uavg, 2); |
875 | 0 | uavg = _mm_packs_epi16(uavg, uoavg); |
876 | 0 | uavg = _mm_sub_epi8(uavg, vector128); |
877 | 0 | } |
878 | | /* Now we need the following storage distribution: |
879 | | * 2x 2y -> uLumaDst |
880 | | * 2x+1 y -> yChromaDst1 |
881 | | * 4x 2y+1 -> uChromaDst1 |
882 | | * 4x+2 2y+1 -> vChromaDst1 */ |
883 | 0 | { |
884 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
885 | 0 | 15, 13, 11, 9, 7, 5, 3, 1); |
886 | 0 | const __m128i ude = _mm_shuffle_epi8(ue, mask); |
887 | 0 | _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude); |
888 | 0 | yEvenChromaDst1 += 8; |
889 | 0 | } |
890 | |
|
891 | 0 | if (yLumaDstOdd) |
892 | 0 | { |
893 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
894 | 0 | 15, 13, 11, 9, 7, 5, 3, 1); |
895 | 0 | const __m128i udo = _mm_shuffle_epi8(uo, mask); |
896 | 0 | _mm_storel_epi64((__m128i*)yOddChromaDst1, udo); |
897 | 0 | yOddChromaDst1 += 8; |
898 | 0 | } |
899 | |
|
900 | 0 | if (yLumaDstOdd) |
901 | 0 | { |
902 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
903 | 0 | 14, 10, 6, 2, 12, 8, 4, 0); |
904 | 0 | const __m128i ud = _mm_shuffle_epi8(uo, mask); |
905 | 0 | int* uDst1 = (int*)uChromaDst1; |
906 | 0 | int* vDst1 = (int*)vChromaDst1; |
907 | 0 | const int* src = (const int*)&ud; |
908 | 0 | _mm_stream_si32(uDst1, src[0]); |
909 | 0 | _mm_stream_si32(vDst1, src[1]); |
910 | 0 | uChromaDst1 += 4; |
911 | 0 | vChromaDst1 += 4; |
912 | 0 | } |
913 | |
|
914 | 0 | if (yLumaDstOdd) |
915 | 0 | { |
916 | 0 | _mm_storel_epi64((__m128i*)uLumaDst, uavg); |
917 | 0 | uLumaDst += 8; |
918 | 0 | } |
919 | 0 | else |
920 | 0 | { |
921 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
922 | 0 | 14, 12, 10, 8, 6, 4, 2, 0); |
923 | 0 | const __m128i ud = _mm_shuffle_epi8(ue, mask); |
924 | 0 | _mm_storel_epi64((__m128i*)uLumaDst, ud); |
925 | 0 | uLumaDst += 8; |
926 | 0 | } |
927 | 0 | } |
928 | |
|
929 | 0 | { |
930 | | /* V: multiplications with subtotals and horizontal sums */ |
931 | 0 | __m128i ve, vo, vavg; |
932 | 0 | { |
933 | 0 | const __m128i v_factors = BGRX_V_FACTORS; |
934 | 0 | const __m128i ve1 = |
935 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors), |
936 | 0 | _mm_maddubs_epi16(xe2, v_factors)), |
937 | 0 | V_SHIFT); |
938 | 0 | const __m128i ve2 = |
939 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors), |
940 | 0 | _mm_maddubs_epi16(xe4, v_factors)), |
941 | 0 | V_SHIFT); |
942 | 0 | const __m128i veavg = _mm_hadd_epi16(ve1, ve2); |
943 | 0 | ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128); |
944 | 0 | vavg = veavg; |
945 | 0 | } |
946 | 0 | { |
947 | 0 | const __m128i v_factors = BGRX_V_FACTORS; |
948 | 0 | const __m128i vo1 = |
949 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors), |
950 | 0 | _mm_maddubs_epi16(xo2, v_factors)), |
951 | 0 | V_SHIFT); |
952 | 0 | const __m128i vo2 = |
953 | 0 | _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors), |
954 | 0 | _mm_maddubs_epi16(xo4, v_factors)), |
955 | 0 | V_SHIFT); |
956 | 0 | const __m128i voavg = _mm_hadd_epi16(vo1, vo2); |
957 | 0 | vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128); |
958 | 0 | vavg = _mm_add_epi16(vavg, voavg); |
959 | 0 | vavg = _mm_srai_epi16(vavg, 2); |
960 | 0 | vavg = _mm_packs_epi16(vavg, voavg); |
961 | 0 | vavg = _mm_sub_epi8(vavg, vector128); |
962 | 0 | } |
963 | | /* Now we need the following storage distribution: |
964 | | * 2x 2y -> vLumaDst |
965 | | * 2x+1 y -> yChromaDst2 |
966 | | * 4x 2y+1 -> uChromaDst2 |
967 | | * 4x+2 2y+1 -> vChromaDst2 */ |
968 | 0 | { |
969 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
970 | 0 | 15, 13, 11, 9, 7, 5, 3, 1); |
971 | 0 | __m128i vde = _mm_shuffle_epi8(ve, mask); |
972 | 0 | _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde); |
973 | 0 | yEvenChromaDst2 += 8; |
974 | 0 | } |
975 | |
|
976 | 0 | if (yLumaDstOdd) |
977 | 0 | { |
978 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
979 | 0 | 15, 13, 11, 9, 7, 5, 3, 1); |
980 | 0 | __m128i vdo = _mm_shuffle_epi8(vo, mask); |
981 | 0 | _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo); |
982 | 0 | yOddChromaDst2 += 8; |
983 | 0 | } |
984 | |
|
985 | 0 | if (yLumaDstOdd) |
986 | 0 | { |
987 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
988 | 0 | 14, 10, 6, 2, 12, 8, 4, 0); |
989 | 0 | const __m128i vd = _mm_shuffle_epi8(vo, mask); |
990 | 0 | int* uDst2 = (int*)uChromaDst2; |
991 | 0 | int* vDst2 = (int*)vChromaDst2; |
992 | 0 | const int* src = (const int*)&vd; |
993 | 0 | _mm_stream_si32(uDst2, src[0]); |
994 | 0 | _mm_stream_si32(vDst2, src[1]); |
995 | 0 | uChromaDst2 += 4; |
996 | 0 | vChromaDst2 += 4; |
997 | 0 | } |
998 | |
|
999 | 0 | if (yLumaDstOdd) |
1000 | 0 | { |
1001 | 0 | _mm_storel_epi64((__m128i*)vLumaDst, vavg); |
1002 | 0 | vLumaDst += 8; |
1003 | 0 | } |
1004 | 0 | else |
1005 | 0 | { |
1006 | 0 | const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
1007 | 0 | 14, 12, 10, 8, 6, 4, 2, 0); |
1008 | 0 | __m128i vd = _mm_shuffle_epi8(ve, mask); |
1009 | 0 | _mm_storel_epi64((__m128i*)vLumaDst, vd); |
1010 | 0 | vLumaDst += 8; |
1011 | 0 | } |
1012 | 0 | } |
1013 | 0 | } |
1014 | 0 | } |
1015 | | |
1016 | | static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
1017 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
1018 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
1019 | | const UINT32 dst2Step[], |
1020 | | const prim_size_t* WINPR_RESTRICT roi) |
1021 | 0 | { |
1022 | 0 | if (roi->height < 1 || roi->width < 1) |
1023 | 0 | return !PRIMITIVES_SUCCESS; |
1024 | | |
1025 | 0 | if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16) |
1026 | 0 | return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, |
1027 | 0 | roi); |
1028 | | |
1029 | 0 | for (UINT32 y = 0; y < roi->height; y += 2) |
1030 | 0 | { |
1031 | 0 | const BYTE* srcEven = (pSrc + y * srcStep); |
1032 | 0 | const BYTE* srcOdd = (srcEven + srcStep); |
1033 | 0 | BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]); |
1034 | 0 | BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL; |
1035 | 0 | BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]); |
1036 | 0 | BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]); |
1037 | 0 | BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]); |
1038 | 0 | BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2; |
1039 | 0 | BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0]; |
1040 | 0 | BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0]; |
1041 | 0 | BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]); |
1042 | 0 | BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]); |
1043 | 0 | BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4; |
1044 | 0 | BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4; |
1045 | 0 | ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, |
1046 | 0 | dstLumaV, dstEvenChromaY1, dstEvenChromaY2, |
1047 | 0 | dstOddChromaY1, dstOddChromaY2, dstChromaU1, |
1048 | 0 | dstChromaU2, dstChromaV1, dstChromaV2, roi->width); |
1049 | 0 | } |
1050 | |
|
1051 | 0 | return PRIMITIVES_SUCCESS; |
1052 | 0 | } |
1053 | | |
1054 | | static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, |
1055 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], |
1056 | | const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], |
1057 | | const UINT32 dst2Step[], |
1058 | | const prim_size_t* WINPR_RESTRICT roi) |
1059 | 0 | { |
1060 | 0 | switch (srcFormat) |
1061 | 0 | { |
1062 | 0 | case PIXEL_FORMAT_BGRX32: |
1063 | 0 | case PIXEL_FORMAT_BGRA32: |
1064 | 0 | return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
1065 | 0 | dst2Step, roi); |
1066 | | |
1067 | 0 | default: |
1068 | 0 | return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, |
1069 | 0 | dst2Step, roi); |
1070 | 0 | } |
1071 | 0 | } |
1072 | | |
1073 | | static pstatus_t ssse3_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[], |
1074 | | const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDstRaw[], |
1075 | | const UINT32 dstStep[], const RECTANGLE_16* WINPR_RESTRICT roi) |
1076 | 0 | { |
1077 | 0 | UINT32 x, y; |
1078 | 0 | const UINT32 nWidth = roi->right - roi->left; |
1079 | 0 | const UINT32 nHeight = roi->bottom - roi->top; |
1080 | 0 | const UINT32 halfWidth = (nWidth + 1) / 2; |
1081 | 0 | const UINT32 halfPad = halfWidth % 16; |
1082 | 0 | const UINT32 halfHeight = (nHeight + 1) / 2; |
1083 | 0 | const UINT32 oddY = 1; |
1084 | 0 | const UINT32 evenY = 0; |
1085 | 0 | const UINT32 oddX = 1; |
1086 | 0 | const UINT32 evenX = 0; |
1087 | 0 | const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left, |
1088 | 0 | pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, |
1089 | 0 | pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 }; |
1090 | 0 | BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left, |
1091 | 0 | pDstRaw[1] + roi->top * dstStep[1] + roi->left, |
1092 | 0 | pDstRaw[2] + roi->top * dstStep[2] + roi->left }; |
1093 | | |
1094 | | /* Y data is already here... */ |
1095 | | /* B1 */ |
1096 | 0 | for (y = 0; y < nHeight; y++) |
1097 | 0 | { |
1098 | 0 | const BYTE* Ym = pSrc[0] + srcStep[0] * y; |
1099 | 0 | BYTE* pY = pDst[0] + dstStep[0] * y; |
1100 | 0 | memcpy(pY, Ym, nWidth); |
1101 | 0 | } |
1102 | | |
1103 | | /* The first half of U and V is already part of this frame. */ |
1104 | | /* B2 and B3 */ |
1105 | 0 | for (y = 0; y < halfHeight; y++) |
1106 | 0 | { |
1107 | 0 | const UINT32 val2y = (2 * y + evenY); |
1108 | 0 | const UINT32 val2y1 = val2y + oddY; |
1109 | 0 | const BYTE* Um = pSrc[1] + srcStep[1] * y; |
1110 | 0 | const BYTE* Vm = pSrc[2] + srcStep[2] * y; |
1111 | 0 | BYTE* pU = pDst[1] + dstStep[1] * val2y; |
1112 | 0 | BYTE* pV = pDst[2] + dstStep[2] * val2y; |
1113 | 0 | BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; |
1114 | 0 | BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; |
1115 | |
|
1116 | 0 | for (x = 0; x < halfWidth - halfPad; x += 16) |
1117 | 0 | { |
1118 | 0 | const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); |
1119 | 0 | const __m128i unpackLow = |
1120 | 0 | _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8); |
1121 | 0 | { |
1122 | 0 | const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]); |
1123 | 0 | const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); |
1124 | 0 | const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); |
1125 | 0 | _mm_storeu_si128((__m128i*)&pU[2 * x], uHigh); |
1126 | 0 | _mm_storeu_si128((__m128i*)&pU[2 * x + 16], uLow); |
1127 | 0 | _mm_storeu_si128((__m128i*)&pU1[2 * x], uHigh); |
1128 | 0 | _mm_storeu_si128((__m128i*)&pU1[2 * x + 16], uLow); |
1129 | 0 | } |
1130 | 0 | { |
1131 | 0 | const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]); |
1132 | 0 | const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); |
1133 | 0 | const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); |
1134 | 0 | _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh); |
1135 | 0 | _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow); |
1136 | 0 | _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh); |
1137 | 0 | _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow); |
1138 | 0 | } |
1139 | 0 | } |
1140 | |
|
1141 | 0 | for (; x < halfWidth; x++) |
1142 | 0 | { |
1143 | 0 | const UINT32 val2x = 2 * x + evenX; |
1144 | 0 | const UINT32 val2x1 = val2x + oddX; |
1145 | 0 | pU[val2x] = Um[x]; |
1146 | 0 | pV[val2x] = Vm[x]; |
1147 | 0 | pU[val2x1] = Um[x]; |
1148 | 0 | pV[val2x1] = Vm[x]; |
1149 | 0 | pU1[val2x] = Um[x]; |
1150 | 0 | pV1[val2x] = Vm[x]; |
1151 | 0 | pU1[val2x1] = Um[x]; |
1152 | 0 | pV1[val2x1] = Vm[x]; |
1153 | 0 | } |
1154 | 0 | } |
1155 | |
|
1156 | 0 | return PRIMITIVES_SUCCESS; |
1157 | 0 | } |
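
Editorial note: a scalar equivalent of the chroma upsampling above; every 4:2:0 U/V sample is replicated into a 2x2 block of the 4:4:4 planes (nearest-neighbour), which is exactly what the unpackHigh/unpackLow shuffles plus the duplicated row stores achieve. Sketch only, helper name is not part of the file.

static INLINE void upsample_chroma_2x_scalar(const BYTE* src, UINT32 srcStride, BYTE* dst,
                                             UINT32 dstStride, UINT32 halfWidth,
                                             UINT32 halfHeight)
{
    for (UINT32 y = 0; y < halfHeight; y++)
    {
        const BYTE* s = src + y * srcStride;
        BYTE* d0 = dst + (2 * y + 0) * dstStride;
        BYTE* d1 = dst + (2 * y + 1) * dstStride;

        for (UINT32 x = 0; x < halfWidth; x++)
        {
            d0[2 * x] = d0[2 * x + 1] = s[x];
            d1[2 * x] = d1[2 * x + 1] = s[x];
        }
    }
}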
1158 | | |
1159 | | static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2) |
1160 | 0 | { |
1161 | 0 | const __m128i even = |
1162 | 0 | _mm_set_epi8(0x80, 14, 0x80, 12, 0x80, 10, 0x80, 8, 0x80, 6, 0x80, 4, 0x80, 2, 0x80, 0); |
1163 | 0 | const __m128i odd = |
1164 | 0 | _mm_set_epi8(0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9, 0x80, 7, 0x80, 5, 0x80, 3, 0x80, 1); |
1165 | 0 | const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0); |
1166 | 0 | const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst); |
1167 | 0 | const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2); |
1168 | 0 | const __m128i uEven = _mm_shuffle_epi8(u, even); |
1169 | 0 | const __m128i uEven4 = _mm_slli_epi16(uEven, 2); |
1170 | 0 | const __m128i uOdd = _mm_shuffle_epi8(u, odd); |
1171 | 0 | const __m128i u1Even = _mm_shuffle_epi8(u1, even); |
1172 | 0 | const __m128i u1Odd = _mm_shuffle_epi8(u1, odd); |
1173 | 0 | const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even); |
1174 | 0 | const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd); |
1175 | 0 | const __m128i result = _mm_sub_epi16(uEven4, tmp2); |
1176 | 0 | const __m128i packed = _mm_packus_epi16(result, uOdd); |
1177 | 0 | const __m128i interleaved = _mm_shuffle_epi8(packed, interleave); |
1178 | 0 | _mm_storeu_si128((__m128i*)pSrcDst, interleaved); |
1179 | 0 | } |
1180 | | |
1181 | | static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[], |
1182 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1183 | 0 | { |
1184 | 0 | const UINT32 oddY = 1; |
1185 | 0 | const UINT32 evenY = 0; |
1186 | 0 | const UINT32 nWidth = roi->right - roi->left; |
1187 | 0 | const UINT32 nHeight = roi->bottom - roi->top; |
1188 | 0 | const UINT32 halfHeight = (nHeight + 1) / 2; |
1189 | 0 | const UINT32 halfWidth = (nWidth + 1) / 2; |
1190 | 0 | const UINT32 halfPad = halfWidth % 16; |
1191 | 0 | UINT32 x, y; |
1192 | | |
1193 | | /* Filter */ |
1194 | 0 | for (y = roi->top; y < halfHeight + roi->top; y++) |
1195 | 0 | { |
1196 | 0 | const UINT32 val2y = (y * 2 + evenY); |
1197 | 0 | const UINT32 val2y1 = val2y + oddY; |
1198 | 0 | BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; |
1199 | 0 | BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; |
1200 | 0 | BYTE* pU = pDst[1] + dstStep[1] * val2y; |
1201 | 0 | BYTE* pV = pDst[2] + dstStep[2] * val2y; |
1202 | |
|
1203 | 0 | if (val2y1 > nHeight) |
1204 | 0 | continue; |
1205 | | |
1206 | 0 | for (x = roi->left; x < halfWidth + roi->left - halfPad; x += 16) |
1207 | 0 | { |
1208 | 0 | ssse3_filter(&pU[2 * x], &pU1[2 * x]); |
1209 | 0 | ssse3_filter(&pV[2 * x], &pV1[2 * x]); |
1210 | 0 | } |
1211 | |
|
1212 | 0 | for (; x < halfWidth + roi->left; x++) |
1213 | 0 | { |
1214 | 0 | const UINT32 val2x = (x * 2); |
1215 | 0 | const UINT32 val2x1 = val2x + 1; |
1216 | 0 | const BYTE inU = pU[val2x]; |
1217 | 0 | const BYTE inV = pV[val2x]; |
1218 | 0 | const INT32 up = inU * 4; |
1219 | 0 | const INT32 vp = inV * 4; |
1220 | 0 | INT32 u2020; |
1221 | 0 | INT32 v2020; |
1222 | |
|
1223 | 0 | if (val2x1 > nWidth) |
1224 | 0 | continue; |
1225 | | |
1226 | 0 | u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1]; |
1227 | 0 | v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1]; |
1228 | 0 | pU[val2x] = CONDITIONAL_CLIP(u2020, inU); |
1229 | 0 | pV[val2x] = CONDITIONAL_CLIP(v2020, inV); |
1230 | 0 | } |
1231 | 0 | } |
1232 | |
|
1233 | 0 | return PRIMITIVES_SUCCESS; |
1234 | 0 | } |
1235 | | |
1236 | | static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3], |
1237 | | const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3], |
1238 | | const UINT32 dstStep[3], |
1239 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1240 | 0 | { |
1241 | 0 | const UINT32 mod = 16; |
1242 | 0 | UINT32 uY = 0; |
1243 | 0 | UINT32 vY = 0; |
1244 | 0 | UINT32 x, y; |
1245 | 0 | const UINT32 nWidth = roi->right - roi->left; |
1246 | 0 | const UINT32 nHeight = roi->bottom - roi->top; |
1247 | 0 | const UINT32 halfWidth = (nWidth + 1) / 2; |
1248 | 0 | const UINT32 halfPad = halfWidth % 16; |
1249 | 0 | const UINT32 halfHeight = (nHeight + 1) / 2; |
1250 | 0 | const UINT32 oddY = 1; |
1251 | 0 | const UINT32 evenY = 0; |
1252 | 0 | const UINT32 oddX = 1; |
1253 | | /* The auxiliary frame is aligned to multiples of 16x16. |
1254 | | * We need the padded height for B4 and B5 conversion. */ |
1255 | 0 | const UINT32 padHeigth = nHeight + 16 - nHeight % 16; |
1256 | 0 | const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left, |
1257 | 0 | pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, |
1258 | 0 | pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 }; |
1259 | 0 | BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left, |
1260 | 0 | pDstRaw[1] + roi->top * dstStep[1] + roi->left, |
1261 | 0 | pDstRaw[2] + roi->top * dstStep[2] + roi->left }; |
1262 | 0 | const __m128i zero = _mm_setzero_si128(); |
1263 | 0 | const __m128i mask = |
1264 | 0 | _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80); |
1265 | | |
1266 | | /* The second half of U and V is a bit more tricky... */ |
1267 | | /* B4 and B5 */ |
1268 | 0 | for (y = 0; y < padHeigth; y++) |
1269 | 0 | { |
1270 | 0 | const BYTE* Ya = pSrc[0] + srcStep[0] * y; |
1271 | 0 | BYTE* pX; |
1272 | |
|
1273 | 0 | if ((y) % mod < (mod + 1) / 2) |
1274 | 0 | { |
1275 | 0 | const UINT32 pos = (2 * uY++ + oddY); |
1276 | |
|
1277 | 0 | if (pos >= nHeight) |
1278 | 0 | continue; |
1279 | | |
1280 | 0 | pX = pDst[1] + dstStep[1] * pos; |
1281 | 0 | } |
1282 | 0 | else |
1283 | 0 | { |
1284 | 0 | const UINT32 pos = (2 * vY++ + oddY); |
1285 | |
|
1286 | 0 | if (pos >= nHeight) |
1287 | 0 | continue; |
1288 | | |
1289 | 0 | pX = pDst[2] + dstStep[2] * pos; |
1290 | 0 | } |
1291 | | |
1292 | 0 | memcpy(pX, Ya, nWidth); |
1293 | 0 | } |
1294 | | |
1295 | | /* B6 and B7 */ |
1296 | 0 | for (y = 0; y < halfHeight; y++) |
1297 | 0 | { |
1298 | 0 | const UINT32 val2y = (y * 2 + evenY); |
1299 | 0 | const BYTE* Ua = pSrc[1] + srcStep[1] * y; |
1300 | 0 | const BYTE* Va = pSrc[2] + srcStep[2] * y; |
1301 | 0 | BYTE* pU = pDst[1] + dstStep[1] * val2y; |
1302 | 0 | BYTE* pV = pDst[2] + dstStep[2] * val2y; |
1303 | |
|
1304 | 0 | for (x = 0; x < halfWidth - halfPad; x += 16) |
1305 | 0 | { |
1306 | 0 | { |
1307 | 0 | const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]); |
1308 | 0 | const __m128i u2 = _mm_unpackhi_epi8(u, zero); |
1309 | 0 | const __m128i u1 = _mm_unpacklo_epi8(u, zero); |
1310 | 0 | _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); |
1311 | 0 | _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); |
1312 | 0 | } |
1313 | 0 | { |
1314 | 0 | const __m128i v = _mm_loadu_si128((const __m128i*)&Va[x]);
1315 | 0 | const __m128i v2 = _mm_unpackhi_epi8(zero, v);
1316 | 0 | const __m128i v1 = _mm_unpacklo_epi8(zero, v);
1317 | 0 | _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
1318 | 0 | _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
1319 | 0 | } |
1320 | 0 | } |
1321 | |
1322 | 0 | for (; x < halfWidth; x++) |
1323 | 0 | { |
1324 | 0 | const UINT32 val2x1 = (x * 2 + oddX); |
1325 | 0 | pU[val2x1] = Ua[x]; |
1326 | 0 | pV[val2x1] = Va[x]; |
1327 | 0 | } |
1328 | 0 | } |
1329 | | |
1330 | | /* Filter */ |
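 | | /* ssse3_ChromaFilter reconstructs the remaining (even row, even column)
 | |  * samples as four times the stored average minus the three neighbouring
 | |  * samples written above. */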
1331 | 0 | return ssse3_ChromaFilter(pDst, dstStep, roi); |
1332 | 0 | } |
1333 | | |
1334 | | static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3], |
1335 | | const UINT32 srcStep[3], UINT32 nTotalWidth, |
1336 | | UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3], |
1337 | | const UINT32 dstStep[3], |
1338 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1339 | 0 | { |
1340 | 0 | UINT32 x, y; |
1341 | 0 | const UINT32 nWidth = roi->right - roi->left; |
1342 | 0 | const UINT32 nHeight = roi->bottom - roi->top; |
1343 | 0 | const UINT32 halfWidth = (nWidth + 1) / 2; |
1344 | 0 | const UINT32 halfPad = halfWidth % 16; |
1345 | 0 | const UINT32 halfHeight = (nHeight + 1) / 2; |
1346 | 0 | const UINT32 quarterWidth = (nWidth + 3) / 4;
1347 | 0 | const UINT32 quarterPad = quarterWidth % 16;
1348 | 0 | const __m128i zero = _mm_setzero_si128(); |
1349 | 0 | const __m128i mask = |
1350 | 0 | _mm_set_epi8(0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0); |
1351 | 0 | const __m128i mask2 = |
1352 | 0 | _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80); |
1353 | 0 | const __m128i shuffle1 = |
1354 | 0 | _mm_set_epi8(0x80, 15, 0x80, 14, 0x80, 13, 0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8); |
1355 | 0 | const __m128i shuffle2 = |
1356 | 0 | _mm_set_epi8(0x80, 7, 0x80, 6, 0x80, 5, 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0); |
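 | | /* shuffle2 spreads the low eight bytes of a register over the even byte
 | |  * positions and shuffle1 does the same for the high eight bytes (0x80
 | |  * control bytes make pshufb emit zero). Together with mask2 this places
 | |  * each quarter-width stream at every fourth destination byte. */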
1357 | | |
1358 | | /* B4 and B5: odd UV values for width/2, height */ |
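 | | /* In the v2 layout the auxiliary luma plane carries chroma at half
 | |  * horizontal resolution: its left half holds U and its right half holds V
 | |  * (pYaV starts nTotalWidth / 2 bytes after pYaU). Both are expanded into
 | |  * the odd columns of every destination row. */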
1359 | 0 | for (y = 0; y < nHeight; y++) |
1360 | 0 | { |
1361 | 0 | const UINT32 yTop = y + roi->top; |
1362 | 0 | const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; |
1363 | 0 | const BYTE* pYaV = pYaU + nTotalWidth / 2; |
1364 | 0 | BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left; |
1365 | 0 | BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left; |
1366 | |
1367 | 0 | for (x = 0; x < halfWidth - halfPad; x += 16) |
1368 | 0 | { |
1369 | 0 | { |
1370 | 0 | const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]); |
1371 | 0 | const __m128i u2 = _mm_unpackhi_epi8(zero, u); |
1372 | 0 | const __m128i u1 = _mm_unpacklo_epi8(zero, u); |
1373 | 0 | _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); |
1374 | 0 | _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); |
1375 | 0 | } |
1376 | 0 | { |
1377 | 0 | const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]); |
1378 | 0 | const __m128i v2 = _mm_unpackhi_epi8(zero, v); |
1379 | 0 | const __m128i v1 = _mm_unpacklo_epi8(zero, v); |
1380 | 0 | _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]); |
1381 | 0 | _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]); |
1382 | 0 | } |
1383 | 0 | } |
1384 | |
1385 | 0 | for (; x < halfWidth; x++) |
1386 | 0 | { |
1387 | 0 | const UINT32 odd = 2 * x + 1; |
1388 | 0 | pU[odd] = pYaU[x]; |
1389 | 0 | pV[odd] = pYaV[x]; |
1390 | 0 | } |
1391 | 0 | } |
1392 | | |
1393 | | /* B6 - B9 */ |
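 | | /* The auxiliary U and V planes each pack two quarter-width streams:
 | |  * pUaU/pUaV are the U and V samples stored in the auxiliary U plane,
 | |  * pVaU/pVaV those stored in the auxiliary V plane. They are written to
 | |  * the odd destination rows at columns 4 * x + 0 and 4 * x + 2. */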
1394 | 0 | for (y = 0; y < halfHeight; y++) |
1395 | 0 | { |
1396 | 0 | const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; |
1397 | 0 | const BYTE* pUaV = pUaU + nTotalWidth / 4; |
1398 | 0 | const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; |
1399 | 0 | const BYTE* pVaV = pVaU + nTotalWidth / 4; |
1400 | 0 | BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; |
1401 | 0 | BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; |
1402 | |
1403 | 0 | for (x = 0; x < quarterWidth - quarterPad; x += 16)
1404 | 0 | { |
1405 | 0 | { |
1406 | 0 | const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]); |
1407 | 0 | const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]); |
1408 | 0 | const __m128i uHigh = _mm_unpackhi_epi8(uU, uV); |
1409 | 0 | const __m128i uLow = _mm_unpacklo_epi8(uU, uV); |
1410 | 0 | const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2); |
1411 | 0 | const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1); |
1412 | 0 | const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2); |
1413 | 0 | const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1); |
1414 | 0 | _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]); |
1415 | 0 | _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]); |
1416 | 0 | _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]); |
1417 | 0 | _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]); |
1418 | 0 | } |
1419 | 0 | { |
1420 | 0 | const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]); |
1421 | 0 | const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]); |
1422 | 0 | const __m128i vHigh = _mm_unpackhi_epi8(vU, vV); |
1423 | 0 | const __m128i vLow = _mm_unpacklo_epi8(vU, vV); |
1424 | 0 | const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2); |
1425 | 0 | const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1); |
1426 | 0 | const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2); |
1427 | 0 | const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1); |
1428 | 0 | _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]); |
1429 | 0 | _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]); |
1430 | 0 | _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]); |
1431 | 0 | _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]); |
1432 | 0 | } |
1433 | 0 | } |
1434 | |
1435 | 0 | for (; x < quarterWidth; x++)
1436 | 0 | { |
1437 | 0 | pU[4 * x + 0] = pUaU[x]; |
1438 | 0 | pV[4 * x + 0] = pUaV[x]; |
1439 | 0 | pU[4 * x + 2] = pVaU[x]; |
1440 | 0 | pV[4 * x + 2] = pVaV[x]; |
1441 | 0 | } |
1442 | 0 | } |
1443 | |
1444 | 0 | return ssse3_ChromaFilter(pDst, dstStep, roi); |
1445 | 0 | } |
1446 | | |
1447 | | static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type, |
1448 | | const BYTE* const WINPR_RESTRICT pSrc[3], |
1449 | | const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight, |
1450 | | BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3], |
1451 | | const RECTANGLE_16* WINPR_RESTRICT roi) |
1452 | 0 | { |
1453 | 0 | if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) |
1454 | 0 | return -1; |
1455 | | |
1456 | 0 | if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) |
1457 | 0 | return -1; |
1458 | | |
1459 | 0 | if (!roi) |
1460 | 0 | return -1; |
1461 | | |
1462 | 0 | switch (type) |
1463 | 0 | { |
1464 | 0 | case AVC444_LUMA: |
1465 | 0 | return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); |
1466 | | |
1467 | 0 | case AVC444_CHROMAv1: |
1468 | 0 | return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); |
1469 | | |
1470 | 0 | case AVC444_CHROMAv2: |
1471 | 0 | return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); |
1472 | | |
1473 | 0 | default: |
1474 | 0 | return -1; |
1475 | 0 | } |
1476 | 0 | } |
1477 | | |
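 | | /* Register the SSSE3 code paths only when the CPU reports both SSE3 and
 | |  * SSSE3 support; otherwise the generic implementations installed by
 | |  * primitives_init_YUV() stay in place. */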
1478 | | void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims) |
1479 | 0 | { |
1480 | 0 | generic = primitives_get_generic(); |
1481 | 0 | primitives_init_YUV(prims); |
1482 | |
1483 | 0 | if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && |
1484 | 0 | IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) |
1485 | 0 | { |
1486 | 0 | prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420; |
1487 | 0 | prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV; |
1488 | 0 | prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2; |
1489 | 0 | prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB; |
1490 | 0 | prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R; |
1491 | 0 | prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444; |
1492 | 0 | } |
1493 | 0 | } |