Coverage Report

Created: 2023-09-25 06:56

/src/FreeRDP/libfreerdp/primitives/prim_YUV_ssse3.c
Every instrumented line below reports an execution count of 0 (the file was not exercised in this run).
/**
 * FreeRDP: A Remote Desktop Protocol Implementation
 * Optimized YUV/RGB conversion operations
 *
 * Copyright 2014 Thomas Erbesdobler
 * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
 * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
 * Copyright 2016-2017 Thincast Technologies GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <winpr/wtypes.h>
#include <freerdp/config.h>

#include <winpr/sysinfo.h>
#include <winpr/crt.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>

#include "prim_internal.h"

#include <emmintrin.h>
#include <tmmintrin.h>

#if !defined(WITH_SSE2)
#error "This file needs WITH_SSE2 enabled!"
#endif

static primitives_t* generic = NULL;

/****************************************************************************/
/* SSSE3 YUV420 -> RGB conversion                                           */
/****************************************************************************/
static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
                                  __m128i Vraw, UINT8 pos)
{
  /* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer list */
  /* Note: This also applies to Visual Studio 2013 before Update 4 */
#if !defined(_MSC_VER) || (_MSC_VER > 1600)
  const __m128i mapY[] = { _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
                           _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
                           _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
                           _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
  const __m128i mapUV[] = { _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
                            _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
                            _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
                            _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
  const __m128i mask[] = { _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
                           _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
                           _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
#else
  /* Note: must be in little-endian format! */
  const __m128i mapY[] = { { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
                             0x80, 0x80, 0x03, 0x80, 0x80 },
                           { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80,
                             0x80, 0x80, 0x07, 0x80, 0x80 },
                           { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80,
                             0x80, 0x80, 0x0b, 0x80, 0x80 },
                           { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80,
                             0x80, 0x80, 0x0f, 0x80, 0x80 }

  };
  const __m128i mapUV[] = { { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01,
                              0x80, 0x02, 0x80, 0x03, 0x80 },
                            { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05,
                              0x80, 0x06, 0x80, 0x07, 0x80 },
                            { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09,
                              0x80, 0x0a, 0x80, 0x0b, 0x80 },
                            { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d,
                              0x80, 0x0e, 0x80, 0x0f, 0x80 } };
  const __m128i mask[] = { { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02,
                             0x80, 0x80, 0x80, 0x03, 0x80 },
                           { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
                             0x80, 0x80, 0x03, 0x80, 0x80 },
                           { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80,
                             0x80, 0x03, 0x80, 0x80, 0x80 } };
#endif
  const __m128i c128 = _mm_set1_epi16(128);
  __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
                               _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
  {
    __m128i C, D, E;
    /* Load Y values and expand to 32 bit */
    {
      C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */
    }
    /* Load U values and expand to 32 bit */
    {
      const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */
      D = _mm_sub_epi16(U, c128);                           /* D = U - 128 */
    }
    /* Load V values and expand to 32 bit */
    {
      const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */
      E = _mm_sub_epi16(V, c128);                           /* E = V - 128 */
    }
    /* Get the R value */
    {
      const __m128i c403 = _mm_set1_epi16(403);
      const __m128i e403 =
          _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
      const __m128i Rs = _mm_add_epi32(C, e403);
      const __m128i R32 = _mm_srai_epi32(Rs, 8);
      const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
      const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
      const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
      BGRX = _mm_or_si128(BGRX, packed);
    }
    /* Get the G value */
    {
      const __m128i c48 = _mm_set1_epi16(48);
      const __m128i d48 =
          _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
      const __m128i c120 = _mm_set1_epi16(120);
      const __m128i e120 =
          _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
      const __m128i de = _mm_add_epi32(d48, e120);
      const __m128i Gs = _mm_sub_epi32(C, de);
      const __m128i G32 = _mm_srai_epi32(Gs, 8);
      const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
      const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
      const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
      BGRX = _mm_or_si128(BGRX, packed);
    }
    /* Get the B value */
    {
      const __m128i c475 = _mm_set1_epi16(475);
      const __m128i d475 =
          _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
      const __m128i Bs = _mm_add_epi32(C, d475);
      const __m128i B32 = _mm_srai_epi32(Bs, 8);
      const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
      const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
      const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
      BGRX = _mm_or_si128(BGRX, packed);
    }
  }
  _mm_storeu_si128(dst++, BGRX);
  return dst;
}
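
/* Illustrative scalar sketch (not part of the original file): the fixed-point
 * math performed per pixel above. The Y shuffle places each luma byte in bits
 * 8-15 of its 32-bit lane, i.e. C = Y << 8, and the constants are BT.709
 * inverse-transform factors with 8 fractional bits (403/256 ~ 1.5748,
 * 48/256 ~ 0.1873, 120/256 ~ 0.4681, 475/256 ~ 1.8556). Names prefixed
 * sketch_ are hypothetical; the X byte is preserved from the destination,
 * as the 0xFF000000 mask above does. */
static INLINE BYTE sketch_clamp(INT32 v)
{
  return (v < 0) ? 0 : (v > 255) ? 255 : (BYTE)v;
}

static INLINE void sketch_yuv_to_bgrx(BYTE Y, BYTE U, BYTE V, BYTE* bgrx)
{
  const INT32 C = (INT32)Y << 8;
  const INT32 D = (INT32)U - 128;
  const INT32 E = (INT32)V - 128;
  bgrx[0] = sketch_clamp((C + 475 * D) >> 8);          /* B = Y + 1.8556 * (U - 128) */
  bgrx[1] = sketch_clamp((C - 48 * D - 120 * E) >> 8); /* G = Y - 0.1873 * (U - 128) - 0.4681 * (V - 128) */
  bgrx[2] = sketch_clamp((C + 403 * E) >> 8);          /* R = Y + 1.5748 * (V - 128) */
  /* bgrx[3] (X) is left untouched */
}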

static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* const WINPR_RESTRICT pSrc[],
                                        const UINT32* WINPR_RESTRICT srcStep,
                                        BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                        const prim_size_t* WINPR_RESTRICT roi)
{
  const UINT32 nWidth = roi->width;
  const UINT32 nHeight = roi->height;
  const UINT32 pad = roi->width % 16;
  const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
  UINT32 y;

  for (y = 0; y < nHeight; y++)
  {
    UINT32 x;
    __m128i* dst = (__m128i*)(pDst + dstStep * y);
    const BYTE* YData = pSrc[0] + y * srcStep[0];
    const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
    const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];

    for (x = 0; x < nWidth - pad; x += 16)
    {
      const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
      const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
      const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
      const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
      const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
      YData += 16;
      UData += 8;
      VData += 8;
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
    }

    for (x = 0; x < pad; x++)
    {
      const BYTE Y = *YData++;
      const BYTE U = *UData;
      const BYTE V = *VData;
      const BYTE r = YUV2R(Y, U, V);
      const BYTE g = YUV2G(Y, U, V);
      const BYTE b = YUV2B(Y, U, V);
      dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);

      if (x % 2)
      {
        UData++;
        VData++;
      }
    }
  }

  return PRIMITIVES_SUCCESS;
}
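
/* Illustrative scalar sketch (not part of the original file): the `duplicate`
 * shuffle above widens 8 subsampled U (or V) bytes to 16 by repeating each
 * byte once, i.e. the horizontal half of the 4:2:0 -> 4:4:4 upsampling; the
 * vertical half comes from reusing each chroma row for two luma rows via the
 * (y / 2) row indexing. The name is hypothetical. */
static INLINE void sketch_duplicate_chroma(const BYTE* src8, BYTE* dst16)
{
  for (UINT32 i = 0; i < 8; i++)
    dst16[2 * i] = dst16[2 * i + 1] = src8[i];
}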

static pstatus_t ssse3_YUV420ToRGB(const BYTE* const WINPR_RESTRICT pSrc[3],
                                   const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
                                   UINT32 dstStep, UINT32 DstFormat,
                                   const prim_size_t* WINPR_RESTRICT roi)
{
  switch (DstFormat)
  {
    case PIXEL_FORMAT_BGRX32:
    case PIXEL_FORMAT_BGRA32:
      return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);

    default:
      return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
  }
}

static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* const WINPR_RESTRICT pSrc[],
                                                  const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
                                                  UINT32 dstStep,
                                                  const prim_size_t* WINPR_RESTRICT roi)
{
  const UINT32 nWidth = roi->width;
  const UINT32 nHeight = roi->height;
  const UINT32 pad = roi->width % 16;
  UINT32 y;

  for (y = 0; y < nHeight; y++)
  {
    UINT32 x;
    __m128i* dst = (__m128i*)(pDst + dstStep * y);
    const BYTE* YData = pSrc[0] + y * srcStep[0];
    const BYTE* UData = pSrc[1] + y * srcStep[1];
    const BYTE* VData = pSrc[2] + y * srcStep[2];

    for (x = 0; x < nWidth - pad; x += 16)
    {
      __m128i Y = _mm_load_si128((const __m128i*)YData);
      __m128i U = _mm_load_si128((const __m128i*)UData);
      __m128i V = _mm_load_si128((const __m128i*)VData);
      YData += 16;
      UData += 16;
      VData += 16;
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
    }

    for (x = 0; x < pad; x++)
    {
      const BYTE Y = *YData++;
      const BYTE U = *UData++;
      const BYTE V = *VData++;
      const BYTE r = YUV2R(Y, U, V);
      const BYTE g = YUV2G(Y, U, V);
      const BYTE b = YUV2B(Y, U, V);
      dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
    }
  }

  return PRIMITIVES_SUCCESS;
}

static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[],
                                             const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
                                             UINT32 dstStep, UINT32 DstFormat,
                                             const prim_size_t* WINPR_RESTRICT roi)
{
  if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
      srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
    return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

  switch (DstFormat)
  {
    case PIXEL_FORMAT_BGRX32:
    case PIXEL_FORMAT_BGRA32:
      return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

    default:
      return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
  }
}
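
/* Illustrative sketch (not part of the original file): the guard above falls
 * back to the generic C path because the BGRX variant uses aligned
 * _mm_load_si128 loads, which fault on addresses that are not 16-byte aligned;
 * both the plane base pointers and the strides must keep every row aligned.
 * A hypothetical predicate form of the same check: */
static INLINE BOOL sketch_plane_is_16_aligned(const BYTE* p, UINT32 step)
{
  return (((uintptr_t)p % 16) == 0) && ((step % 16) == 0);
}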

/****************************************************************************/
/* SSSE3 RGB -> YUV420 conversion                                          **/
/****************************************************************************/

/**
 * Note (nfedera):
 * The forward transformation factors used for RGB to YUV are based on the
 * values specified in [Rec. ITU-R BT.709-6] Section 3:
 * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en
 *
 * Y =  0.21260 * R + 0.71520 * G + 0.07220 * B +   0;
 * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128;
 * V =  0.50000 * R - 0.45415 * G - 0.04585 * B + 128;
 *
 * The most accurate integer arithmetic approximation when using 8-bit signed
 * integer factors with 16-bit signed integer intermediate results is:
 *
 * Y = ( (  27 * R +  92 * G +   9 * B) >> 7 );
 * U = ( ( -29 * R -  99 * G + 128 * B) >> 8 ) + 128;
 * V = ( ( 128 * R - 116 * G -  12 * B) >> 8 ) + 128;
 *
 * Because the signed 8-bit range is [-128,127], the U and V factors of 128
 * are rounded down to 127.
 */

#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
#define BGRX_U_FACTORS \
  _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
#define BGRX_V_FACTORS \
  _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
#define CONST128_FACTORS _mm_set1_epi8(-128)

#define Y_SHIFT 7
#define U_SHIFT 8
#define V_SHIFT 8
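
/* Illustrative scalar sketch (not part of the original file): the integer
 * approximation quoted in the note above, written out for one pixel. The Y
 * factors sum to 27 + 92 + 9 = 128 = 2^7, so full-range white
 * (R = G = B = 255) yields exactly (128 * 255) >> 7 = 255, and the worst-case
 * intermediate 32640 still fits a signed 16-bit lane, which is why these
 * factors are safe for _mm_maddubs_epi16. The name is hypothetical. */
static INLINE void sketch_rgb_to_yuv(BYTE r, BYTE g, BYTE b, BYTE* y, BYTE* u, BYTE* v)
{
  *y = (BYTE)((27 * r + 92 * g + 9 * b) >> 7);
  *u = (BYTE)(((-29 * r - 99 * g + 128 * b) >> 8) + 128);
  *v = (BYTE)(((128 * r - 116 * g - 12 * b) >> 8) + 128);
}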

/*
TODO:
RGB[AX] can simply be supported using the following factors. And instead of loading the
globals directly the functions below could be passed pointers to the correct vectors
depending on the source picture format.

PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
      27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0
};
PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
     -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0
};
PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
      64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0
};
*/

/* compute the luma (Y) component from a single rgb source line */

static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
{
  UINT32 x;
  __m128i x0, x1, x2, x3;
  const __m128i y_factors = BGRX_Y_FACTORS;
  const __m128i* argb = (const __m128i*)src;
  __m128i* ydst = (__m128i*)dst;

  for (x = 0; x < width; x += 16)
  {
    /* store 16 rgba pixels in 4 128 bit registers */
    x0 = _mm_load_si128(argb++); // 1st 4 pixels
    x1 = _mm_load_si128(argb++); // 2nd 4 pixels
    x2 = _mm_load_si128(argb++); // 3rd 4 pixels
    x3 = _mm_load_si128(argb++); // 4th 4 pixels
    /* multiplications and subtotals */
    x0 = _mm_maddubs_epi16(x0, y_factors);
    x1 = _mm_maddubs_epi16(x1, y_factors);
    x2 = _mm_maddubs_epi16(x2, y_factors);
    x3 = _mm_maddubs_epi16(x3, y_factors);
    /* the total sums */
    x0 = _mm_hadd_epi16(x0, x1);
    x2 = _mm_hadd_epi16(x2, x3);
    /* shift the results */
    x0 = _mm_srli_epi16(x0, Y_SHIFT);
    x2 = _mm_srli_epi16(x2, Y_SHIFT);
    /* pack the 16 words into bytes */
    x0 = _mm_packus_epi16(x0, x2);
    /* save to y plane */
    _mm_storeu_si128(ydst++, x0);
  }
}
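
/* Illustrative scalar sketch (not part of the original file): what one lane of
 * the _mm_maddubs_epi16 / _mm_hadd_epi16 / _mm_srli_epi16 sequence above
 * computes for a single BGRX pixel. maddubs multiplies the unsigned pixel
 * bytes by the signed factor bytes and adds adjacent pairs; hadd then adds the
 * two pairs belonging to one pixel. The name is hypothetical. */
static INLINE BYTE sketch_bgrx_to_y(const BYTE* px) /* px[0..3] = B, G, R, X */
{
  const INT32 pairBG = px[0] * 9 + px[1] * 92; /* maddubs: B/G byte pair */
  const INT32 pairRX = px[2] * 27 + px[3] * 0; /* maddubs: R/X byte pair */
  return (BYTE)((pairBG + pairRX) >> Y_SHIFT); /* hadd, then shift by 7 */
}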

/* compute the chrominance (UV) components from two rgb source lines */

static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
                                             const BYTE* WINPR_RESTRICT src2,
                                             BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
                                             UINT32 width)
{
  UINT32 x;
  const __m128i u_factors = BGRX_U_FACTORS;
  const __m128i v_factors = BGRX_V_FACTORS;
  const __m128i vector128 = CONST128_FACTORS;
  __m128i x0, x1, x2, x3, x4, x5;
  const __m128i* rgb1 = (const __m128i*)src1;
  const __m128i* rgb2 = (const __m128i*)src2;
  __m64* udst = (__m64*)dst1;
  __m64* vdst = (__m64*)dst2;

  for (x = 0; x < width; x += 16)
  {
    /* subsample 16x2 pixels into 16x1 pixels */
    x0 = _mm_load_si128(rgb1++);
    x4 = _mm_load_si128(rgb2++);
    x0 = _mm_avg_epu8(x0, x4);
    x1 = _mm_load_si128(rgb1++);
    x4 = _mm_load_si128(rgb2++);
    x1 = _mm_avg_epu8(x1, x4);
    x2 = _mm_load_si128(rgb1++);
    x4 = _mm_load_si128(rgb2++);
    x2 = _mm_avg_epu8(x2, x4);
    x3 = _mm_load_si128(rgb1++);
    x4 = _mm_load_si128(rgb2++);
    x3 = _mm_avg_epu8(x3, x4);
    /* subsample these 16x1 pixels into 8x1 pixels */
    /**
     * shuffle controls
     * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88
     * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd
     */
    x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
    x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
    x0 = _mm_avg_epu8(x0, x4);
    x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
    x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
    x1 = _mm_avg_epu8(x1, x4);
    /* multiplications and subtotals */
    x2 = _mm_maddubs_epi16(x0, u_factors);
    x3 = _mm_maddubs_epi16(x1, u_factors);
    x4 = _mm_maddubs_epi16(x0, v_factors);
    x5 = _mm_maddubs_epi16(x1, v_factors);
    /* the total sums */
    x0 = _mm_hadd_epi16(x2, x3);
    x1 = _mm_hadd_epi16(x4, x5);
    /* shift the results */
    x0 = _mm_srai_epi16(x0, U_SHIFT);
    x1 = _mm_srai_epi16(x1, V_SHIFT);
    /* pack the 16 words into bytes */
    x0 = _mm_packs_epi16(x0, x1);
    /* add 128 */
    x0 = _mm_sub_epi8(x0, vector128);
    /* the lower 8 bytes go to the u plane */
    _mm_storel_pi(udst++, _mm_castsi128_ps(x0));
    /* the upper 8 bytes go to the v plane */
    _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
  }
}
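
/* Illustrative scalar sketch (not part of the original file): the vertical
 * _mm_avg_epu8 plus the 0x88/0xdd shuffle-and-average above amount to a
 * rounding 2x2 box filter per channel, so each chroma sample is derived from
 * all four BGRX pixels of its 2x2 block. _mm_avg_epu8 rounds up:
 * avg(a, b) = (a + b + 1) >> 1. The name is hypothetical. */
static INLINE BYTE sketch_box2x2(BYTE tl, BYTE tr, BYTE bl, BYTE br)
{
  const BYTE left = (BYTE)((tl + bl + 1) >> 1);  /* row average, even column */
  const BYTE right = (BYTE)((tr + br + 1) >> 1); /* row average, odd column */
  return (BYTE)((left + right + 1) >> 1);        /* average of the two columns */
}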

static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
                                        const UINT32 dstStep[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
  UINT32 y;
  const BYTE* argb = pSrc;
  BYTE* ydst = pDst[0];
  BYTE* udst = pDst[1];
  BYTE* vdst = pDst[2];

  if (roi->height < 1 || roi->width < 1)
  {
    return !PRIMITIVES_SUCCESS;
  }

  if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
  {
    return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
  }

  for (y = 0; y < roi->height - 1; y += 2)
  {
    const BYTE* line1 = argb;
    const BYTE* line2 = argb + srcStep;
    ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
    ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
    ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
    argb += 2 * srcStep;
    ydst += 2 * dstStep[0];
    udst += 1 * dstStep[1];
    vdst += 1 * dstStep[2];
  }

  if (roi->height & 1)
  {
    /* pass the same last line of an odd height twice for UV */
    ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
    ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
  }

  return PRIMITIVES_SUCCESS;
}

static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                   UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
                                   const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
{
  switch (srcFormat)
  {
    case PIXEL_FORMAT_BGRX32:
    case PIXEL_FORMAT_BGRA32:
      return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);

    default:
      return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
  }
}

/****************************************************************************/
/* SSSE3 RGB -> AVC444-YUV conversion                                      **/
/****************************************************************************/

static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
    BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
    BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
    BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
{
  UINT32 x;
  const __m128i* argbEven = (const __m128i*)srcEven;
  const __m128i* argbOdd = (const __m128i*)srcOdd;
  const __m128i y_factors = BGRX_Y_FACTORS;
  const __m128i u_factors = BGRX_U_FACTORS;
  const __m128i v_factors = BGRX_V_FACTORS;
  const __m128i vector128 = CONST128_FACTORS;

  for (x = 0; x < width; x += 16)
  {
    /* store 16 rgba pixels in 4 128 bit registers */
    const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
    const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
    const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
    const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
    const __m128i xo1 = _mm_load_si128(argbOdd++);  // 1st 4 pixels
    const __m128i xo2 = _mm_load_si128(argbOdd++);  // 2nd 4 pixels
    const __m128i xo3 = _mm_load_si128(argbOdd++);  // 3rd 4 pixels
    const __m128i xo4 = _mm_load_si128(argbOdd++);  // 4th 4 pixels
    {
      /* Y: multiplications with subtotals and horizontal sums */
      const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
                                                        _mm_maddubs_epi16(xe2, y_factors)),
                                         Y_SHIFT);
      const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
                                                        _mm_maddubs_epi16(xe4, y_factors)),
                                         Y_SHIFT);
      const __m128i ye = _mm_packus_epi16(ye1, ye2);
      const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
                                                        _mm_maddubs_epi16(xo2, y_factors)),
                                         Y_SHIFT);
      const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
                                                        _mm_maddubs_epi16(xo4, y_factors)),
                                         Y_SHIFT);
      const __m128i yo = _mm_packus_epi16(yo1, yo2);
      /* store y [b1] */
      _mm_storeu_si128((__m128i*)b1Even, ye);
      b1Even += 16;

      if (b1Odd)
      {
        _mm_storeu_si128((__m128i*)b1Odd, yo);
        b1Odd += 16;
      }
    }
    {
      /* We have now
       * 16 even U values in ue
       * 16 odd U values in uo
       *
       * We need to split these according to
       * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
      __m128i ue, uo = { 0 };
      {
        const __m128i ue1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
                                          _mm_maddubs_epi16(xe2, u_factors)),
                           U_SHIFT);
        const __m128i ue2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
                                          _mm_maddubs_epi16(xe4, u_factors)),
                           U_SHIFT);
        ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
      }

      if (b1Odd)
      {
        const __m128i uo1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
                                          _mm_maddubs_epi16(xo2, u_factors)),
                           U_SHIFT);
        const __m128i uo2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
                                          _mm_maddubs_epi16(xo4, u_factors)),
                           U_SHIFT);
        uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
      }

      /* Now we need the following storage distribution:
       * 2x   2y    -> b2
       * x    2y+1  -> b4
       * 2x+1 2y    -> b6 */
      if (b1Odd) /* b2 */
      {
        const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
        const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
        const __m128i hi = _mm_add_epi16(ueh, uoh);
        const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
        const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
        const __m128i lo = _mm_add_epi16(uel, uol);
        const __m128i added = _mm_hadd_epi16(lo, hi);
        const __m128i avg16 = _mm_srai_epi16(added, 2);
        const __m128i avg = _mm_packus_epi16(avg16, avg16);
        _mm_storel_epi64((__m128i*)b2, avg);
      }
      else
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          14, 12, 10, 8, 6, 4, 2, 0);
        const __m128i ud = _mm_shuffle_epi8(ue, mask);
        _mm_storel_epi64((__m128i*)b2, ud);
      }

      b2 += 8;

      if (b1Odd) /* b4 */
      {
        _mm_store_si128((__m128i*)b4, uo);
        b4 += 16;
      }

      {
        /* b6 */
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          15, 13, 11, 9, 7, 5, 3, 1);
        const __m128i ude = _mm_shuffle_epi8(ue, mask);
        _mm_storel_epi64((__m128i*)b6, ude);
        b6 += 8;
      }
    }
    {
      /* We have now
       * 16 even V values in ve
       * 16 odd V values in vo
       *
       * We need to split these according to
       * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
      __m128i ve, vo = { 0 };
      {
        const __m128i ve1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
                                          _mm_maddubs_epi16(xe2, v_factors)),
                           V_SHIFT);
        const __m128i ve2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
                                          _mm_maddubs_epi16(xe4, v_factors)),
                           V_SHIFT);
        ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
      }

      if (b1Odd)
      {
        const __m128i vo1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
                                          _mm_maddubs_epi16(xo2, v_factors)),
                           V_SHIFT);
        const __m128i vo2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
                                          _mm_maddubs_epi16(xo4, v_factors)),
                           V_SHIFT);
        vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
      }

      /* Now we need the following storage distribution:
       * 2x   2y    -> b3
       * x    2y+1  -> b5
       * 2x+1 2y    -> b7 */
      if (b1Odd) /* b3 */
      {
        const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
        const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
        const __m128i hi = _mm_add_epi16(veh, voh);
        const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
        const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
        const __m128i lo = _mm_add_epi16(vel, vol);
        const __m128i added = _mm_hadd_epi16(lo, hi);
        const __m128i avg16 = _mm_srai_epi16(added, 2);
        const __m128i avg = _mm_packus_epi16(avg16, avg16);
        _mm_storel_epi64((__m128i*)b3, avg);
      }
      else
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          14, 12, 10, 8, 6, 4, 2, 0);
        const __m128i vd = _mm_shuffle_epi8(ve, mask);
        _mm_storel_epi64((__m128i*)b3, vd);
      }

      b3 += 8;

      if (b1Odd) /* b5 */
      {
        _mm_store_si128((__m128i*)b5, vo);
        b5 += 16;
      }

      {
        /* b7 */
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          15, 13, 11, 9, 7, 5, 3, 1);
        const __m128i vde = _mm_shuffle_epi8(ve, mask);
        _mm_storel_epi64((__m128i*)b7, vde);
        b7 += 8;
      }
    }
  }
}
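
/* Illustrative sketch (not part of the original file): where the full
 * resolution U samples end up under the distribution comments above
 * (MS-RDPEGFX 3.3.8.3.2); V follows the same pattern via b3/b5/b7. Names are
 * hypothetical. */
typedef enum
{
  SKETCH_MAIN_U_B2, /* even column, even row: box-averaged into the main U plane */
  SKETCH_AUX_Y_B4,  /* odd row: carried verbatim in the auxiliary luma plane */
  SKETCH_AUX_U_B6   /* odd column, even row: auxiliary frame's U plane */
} sketch_u_dest;

static INLINE sketch_u_dest sketch_u_destination(UINT32 x, UINT32 y)
{
  if (y & 1)
    return SKETCH_AUX_Y_B4; /* x    2y+1 -> b4 */
  return (x & 1) ? SKETCH_AUX_U_B6   /* 2x+1 2y -> b6 */
                 : SKETCH_MAIN_U_B2; /* 2x   2y -> b2 */
}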

static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                           UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                           const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                           const UINT32 dst2Step[],
                                           const prim_size_t* WINPR_RESTRICT roi)
{
  const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;

  if (roi->height < 1 || roi->width < 1)
    return !PRIMITIVES_SUCCESS;

  if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
    return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
                                   roi);

  for (UINT32 y = 0; y < roi->height; y += 2)
  {
    const BOOL last = (y >= (roi->height - 1));
    const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
    const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
    const UINT32 i = y >> 1;
    const UINT32 n = (i & ~7) + i;
    BYTE* b1Even = pDst1[0] + y * dst1Step[0];
    BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
    BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
    BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
    BYTE* b4 = pDst2[0] + dst2Step[0] * n;
    BYTE* b5 = b4 + 8 * dst2Step[0];
    BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
    BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
    ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
                                         roi->width);
  }

  return PRIMITIVES_SUCCESS;
}
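
/* Illustrative sketch (not part of the original file): the index n above obeys
 * n = (i & ~7) + i == 16 * (i / 8) + (i % 8), so the U rows written through b4
 * fill rows 0-7 of each 16-row tile of the auxiliary luma plane, while the V
 * rows written through b5 = b4 + 8 * dst2Step[0] fill rows 8-15. The name is
 * hypothetical. */
static INLINE UINT32 sketch_aux_tile_row(UINT32 i) /* i = y / 2 */
{
  return 16 * (i / 8) + (i % 8); /* == (i & ~7) + i */
}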

static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                      const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                      const UINT32 dst2Step[],
                                      const prim_size_t* WINPR_RESTRICT roi)
{
  switch (srcFormat)
  {
    case PIXEL_FORMAT_BGRX32:
    case PIXEL_FORMAT_BGRA32:
      return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                       dst2Step, roi);

    default:
      return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                     dst2Step, roi);
  }
}

/* Mapping of arguments:
 *
 * b1 [even lines] -> yLumaDstEven
 * b1 [odd lines]  -> yLumaDstOdd
 * b2              -> uLumaDst
 * b3              -> vLumaDst
 * b4              -> yChromaDst1
 * b5              -> yChromaDst2
 * b6              -> uChromaDst1
 * b7              -> uChromaDst2
 * b8              -> vChromaDst1
 * b9              -> vChromaDst2
 */
static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
    BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
    BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
    BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
    BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
    BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
    BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
{
  UINT32 x;
  const __m128i vector128 = CONST128_FACTORS;
  const __m128i* argbEven = (const __m128i*)srcEven;
  const __m128i* argbOdd = (const __m128i*)srcOdd;

  for (x = 0; x < width; x += 16)
  {
    /* store 16 rgba pixels in 4 128 bit registers
     * for even and odd rows.
     */
    const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */
    const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */
    const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */
    const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */
    const __m128i xo1 = _mm_load_si128(argbOdd++);  /* 1st 4 pixels */
    const __m128i xo2 = _mm_load_si128(argbOdd++);  /* 2nd 4 pixels */
    const __m128i xo3 = _mm_load_si128(argbOdd++);  /* 3rd 4 pixels */
    const __m128i xo4 = _mm_load_si128(argbOdd++);  /* 4th 4 pixels */
    {
      /* Y: multiplications with subtotals and horizontal sums */
      const __m128i y_factors = BGRX_Y_FACTORS;
      const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
                                                        _mm_maddubs_epi16(xe2, y_factors)),
                                         Y_SHIFT);
      const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
                                                        _mm_maddubs_epi16(xe4, y_factors)),
                                         Y_SHIFT);
      const __m128i ye = _mm_packus_epi16(ye1, ye2);
      /* store y [b1] */
      _mm_storeu_si128((__m128i*)yLumaDstEven, ye);
      yLumaDstEven += 16;
    }

    if (yLumaDstOdd)
    {
      const __m128i y_factors = BGRX_Y_FACTORS;
      const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
                                                        _mm_maddubs_epi16(xo2, y_factors)),
                                         Y_SHIFT);
      const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
                                                        _mm_maddubs_epi16(xo4, y_factors)),
                                         Y_SHIFT);
      const __m128i yo = _mm_packus_epi16(yo1, yo2);
      _mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
      yLumaDstOdd += 16;
    }

    {
      /* We have now
       * 16 even U values in ue
       * 16 odd U values in uo
       *
       * We need to split these according to
       * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */
      /* U: multiplications with subtotals and horizontal sums */
      __m128i ue, uo, uavg;
      {
        const __m128i u_factors = BGRX_U_FACTORS;
        const __m128i ue1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
                                          _mm_maddubs_epi16(xe2, u_factors)),
                           U_SHIFT);
        const __m128i ue2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
                                          _mm_maddubs_epi16(xe4, u_factors)),
                           U_SHIFT);
        const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
        ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
        uavg = ueavg;
      }
      {
        const __m128i u_factors = BGRX_U_FACTORS;
        const __m128i uo1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
                                          _mm_maddubs_epi16(xo2, u_factors)),
                           U_SHIFT);
        const __m128i uo2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
                                          _mm_maddubs_epi16(xo4, u_factors)),
                           U_SHIFT);
        const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
        uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
        uavg = _mm_add_epi16(uavg, uoavg);
        uavg = _mm_srai_epi16(uavg, 2);
        uavg = _mm_packs_epi16(uavg, uoavg);
        uavg = _mm_sub_epi8(uavg, vector128);
      }
      /* Now we need the following storage distribution:
       * 2x   2y    -> uLumaDst
       * 2x+1  y    -> yChromaDst1
       * 4x   2y+1  -> uChromaDst1
       * 4x+2 2y+1  -> vChromaDst1 */
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          15, 13, 11, 9, 7, 5, 3, 1);
        const __m128i ude = _mm_shuffle_epi8(ue, mask);
        _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
        yEvenChromaDst1 += 8;
      }

      if (yLumaDstOdd)
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          15, 13, 11, 9, 7, 5, 3, 1);
        const __m128i udo = _mm_shuffle_epi8(uo, mask);
        _mm_storel_epi64((__m128i*)yOddChromaDst1, udo);
        yOddChromaDst1 += 8;
      }

      if (yLumaDstOdd)
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          14, 10, 6, 2, 12, 8, 4, 0);
        const __m128i ud = _mm_shuffle_epi8(uo, mask);
        int* uDst1 = (int*)uChromaDst1;
        int* vDst1 = (int*)vChromaDst1;
        const int* src = (const int*)&ud;
        _mm_stream_si32(uDst1, src[0]);
        _mm_stream_si32(vDst1, src[1]);
        uChromaDst1 += 4;
        vChromaDst1 += 4;
      }

      if (yLumaDstOdd)
      {
        _mm_storel_epi64((__m128i*)uLumaDst, uavg);
        uLumaDst += 8;
      }
      else
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          14, 12, 10, 8, 6, 4, 2, 0);
        const __m128i ud = _mm_shuffle_epi8(ue, mask);
        _mm_storel_epi64((__m128i*)uLumaDst, ud);
        uLumaDst += 8;
      }
    }

    {
      /* V: multiplications with subtotals and horizontal sums */
      __m128i ve, vo, vavg;
      {
        const __m128i v_factors = BGRX_V_FACTORS;
        const __m128i ve1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
                                          _mm_maddubs_epi16(xe2, v_factors)),
                           V_SHIFT);
        const __m128i ve2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
                                          _mm_maddubs_epi16(xe4, v_factors)),
                           V_SHIFT);
        const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
        ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
        vavg = veavg;
      }
      {
        const __m128i v_factors = BGRX_V_FACTORS;
        const __m128i vo1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
                                          _mm_maddubs_epi16(xo2, v_factors)),
                           V_SHIFT);
        const __m128i vo2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
                                          _mm_maddubs_epi16(xo4, v_factors)),
                           V_SHIFT);
        const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
        vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
        vavg = _mm_add_epi16(vavg, voavg);
        vavg = _mm_srai_epi16(vavg, 2);
        vavg = _mm_packs_epi16(vavg, voavg);
        vavg = _mm_sub_epi8(vavg, vector128);
      }
      /* Now we need the following storage distribution:
       * 2x   2y    -> vLumaDst
       * 2x+1  y    -> yChromaDst2
       * 4x   2y+1  -> uChromaDst2
       * 4x+2 2y+1  -> vChromaDst2 */
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          15, 13, 11, 9, 7, 5, 3, 1);
        __m128i vde = _mm_shuffle_epi8(ve, mask);
        _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
        yEvenChromaDst2 += 8;
      }

      if (yLumaDstOdd)
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          15, 13, 11, 9, 7, 5, 3, 1);
        __m128i vdo = _mm_shuffle_epi8(vo, mask);
        _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
        yOddChromaDst2 += 8;
      }

      if (yLumaDstOdd)
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          14, 10, 6, 2, 12, 8, 4, 0);
        const __m128i vd = _mm_shuffle_epi8(vo, mask);
        int* uDst2 = (int*)uChromaDst2;
        int* vDst2 = (int*)vChromaDst2;
        const int* src = (const int*)&vd;
        _mm_stream_si32(uDst2, src[0]);
        _mm_stream_si32(vDst2, src[1]);
        uChromaDst2 += 4;
        vChromaDst2 += 4;
      }

      if (yLumaDstOdd)
      {
        _mm_storel_epi64((__m128i*)vLumaDst, vavg);
        vLumaDst += 8;
      }
      else
      {
        const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          14, 12, 10, 8, 6, 4, 2, 0);
        __m128i vd = _mm_shuffle_epi8(ve, mask);
        _mm_storel_epi64((__m128i*)vLumaDst, vd);
        vLumaDst += 8;
      }
    }
  }
}
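
/* Illustrative sketch (not part of the original file): the v2 analogue of the
 * destination mapping, per the distribution comments above (MS-RDPEGFX
 * 3.3.8.3.3). Odd columns go to the auxiliary luma plane for both row
 * parities, and odd-row even columns are split between the auxiliary U and V
 * planes in strides of four. Names are hypothetical. */
typedef enum
{
  SKETCH_V2_MAIN,  /* 2x, 2y: box-averaged into the main view's chroma plane */
  SKETCH_V2_AUX_Y, /* 2x+1, y: auxiliary luma plane */
  SKETCH_V2_AUX_U, /* 4x, 2y+1: auxiliary U plane */
  SKETCH_V2_AUX_V  /* 4x+2, 2y+1: auxiliary V plane */
} sketch_v2_dest;

static INLINE sketch_v2_dest sketch_u_destination_v2(UINT32 x, UINT32 y)
{
  if (x & 1)
    return SKETCH_V2_AUX_Y;
  if ((y & 1) == 0)
    return SKETCH_V2_MAIN;
  return ((x % 4) == 0) ? SKETCH_V2_AUX_U : SKETCH_V2_AUX_V;
}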

static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                             UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                             const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                             const UINT32 dst2Step[],
                                             const prim_size_t* WINPR_RESTRICT roi)
{
  if (roi->height < 1 || roi->width < 1)
    return !PRIMITIVES_SUCCESS;

  if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
    return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
                                     roi);

  for (UINT32 y = 0; y < roi->height; y += 2)
  {
    const BYTE* srcEven = (pSrc + y * srcStep);
    const BYTE* srcOdd = (srcEven + srcStep);
    BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
    BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL;
    BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
    BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
    BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
    BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
    BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
    BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
    BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
    BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
    BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
    BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
    ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
                                           dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
                                           dstOddChromaY1, dstOddChromaY2, dstChromaU1,
                                           dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
  }

  return PRIMITIVES_SUCCESS;
}

static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                        const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                        const UINT32 dst2Step[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
  switch (srcFormat)
  {
    case PIXEL_FORMAT_BGRX32:
    case PIXEL_FORMAT_BGRA32:
      return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                         dst2Step, roi);

    default:
      return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                       dst2Step, roi);
  }
}

static pstatus_t ssse3_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[],
                                    const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDstRaw[],
                                    const UINT32 dstStep[], const RECTANGLE_16* WINPR_RESTRICT roi)
{
  UINT32 x, y;
  const UINT32 nWidth = roi->right - roi->left;
  const UINT32 nHeight = roi->bottom - roi->top;
  const UINT32 halfWidth = (nWidth + 1) / 2;
  const UINT32 halfPad = halfWidth % 16;
  const UINT32 halfHeight = (nHeight + 1) / 2;
  const UINT32 oddY = 1;
  const UINT32 evenY = 0;
  const UINT32 oddX = 1;
  const UINT32 evenX = 0;
  const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
                          pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
                          pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
  BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
                    pDstRaw[1] + roi->top * dstStep[1] + roi->left,
                    pDstRaw[2] + roi->top * dstStep[2] + roi->left };

  /* Y data is already here... */
  /* B1 */
  for (y = 0; y < nHeight; y++)
  {
    const BYTE* Ym = pSrc[0] + srcStep[0] * y;
    BYTE* pY = pDst[0] + dstStep[0] * y;
    memcpy(pY, Ym, nWidth);
  }

  /* The first half of U and V is already part of this frame. */
  /* B2 and B3 */
  for (y = 0; y < halfHeight; y++)
  {
    const UINT32 val2y = (2 * y + evenY);
    const UINT32 val2y1 = val2y + oddY;
    const BYTE* Um = pSrc[1] + srcStep[1] * y;
    const BYTE* Vm = pSrc[2] + srcStep[2] * y;
    BYTE* pU = pDst[1] + dstStep[1] * val2y;
    BYTE* pV = pDst[2] + dstStep[2] * val2y;
    BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
    BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;

    for (x = 0; x < halfWidth - halfPad; x += 16)
    {
      const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
      const __m128i unpackLow =
          _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
      {
        const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
        const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
        const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
        _mm_storeu_si128((__m128i*)&pU[2 * x], uHigh);
        _mm_storeu_si128((__m128i*)&pU[2 * x + 16], uLow);
        _mm_storeu_si128((__m128i*)&pU1[2 * x], uHigh);
        _mm_storeu_si128((__m128i*)&pU1[2 * x + 16], uLow);
      }
      {
        const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]);
        const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
        const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
        _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
        _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
        _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
        _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
      }
    }

    for (; x < halfWidth; x++)
    {
      const UINT32 val2x = 2 * x + evenX;
      const UINT32 val2x1 = val2x + oddX;
      pU[val2x] = Um[x];
      pV[val2x] = Vm[x];
      pU[val2x1] = Um[x];
      pV[val2x1] = Vm[x];
      pU1[val2x] = Um[x];
      pV1[val2x] = Vm[x];
      pU1[val2x1] = Um[x];
      pV1[val2x1] = Vm[x];
    }
  }

  return PRIMITIVES_SUCCESS;
}

static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2)
{
  const __m128i even =
      _mm_set_epi8(0x80, 14, 0x80, 12, 0x80, 10, 0x80, 8, 0x80, 6, 0x80, 4, 0x80, 2, 0x80, 0);
  const __m128i odd =
      _mm_set_epi8(0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9, 0x80, 7, 0x80, 5, 0x80, 3, 0x80, 1);
  const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
  const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst);
  const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2);
  const __m128i uEven = _mm_shuffle_epi8(u, even);
  const __m128i uEven4 = _mm_slli_epi16(uEven, 2);
  const __m128i uOdd = _mm_shuffle_epi8(u, odd);
  const __m128i u1Even = _mm_shuffle_epi8(u1, even);
  const __m128i u1Odd = _mm_shuffle_epi8(u1, odd);
  const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even);
  const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd);
  const __m128i result = _mm_sub_epi16(uEven4, tmp2);
  const __m128i packed = _mm_packus_epi16(result, uOdd);
  const __m128i interleaved = _mm_shuffle_epi8(packed, interleave);
  _mm_storeu_si128((__m128i*)pSrcDst, interleaved);
}
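
/* Illustrative scalar sketch (not part of the original file): per column pair,
 * ssse3_filter reconstructs the even sample as 4 * even - odd - belowEven -
 * belowOdd and leaves the odd sample untouched. This is the same formula as
 * the scalar tail of ssse3_ChromaFilter below, with the _mm_packus_epi16
 * saturation standing in for CONDITIONAL_CLIP. The name is hypothetical. */
static INLINE BYTE sketch_chroma_even(BYTE even, BYTE odd, BYTE belowEven, BYTE belowOdd)
{
  const INT32 v = 4 * even - odd - belowEven - belowOdd;
  return (v < 0) ? 0 : (v > 255) ? 255 : (BYTE)v; /* packus-style saturation */
}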

static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
                                    const RECTANGLE_16* WINPR_RESTRICT roi)
{
  const UINT32 oddY = 1;
  const UINT32 evenY = 0;
  const UINT32 nWidth = roi->right - roi->left;
  const UINT32 nHeight = roi->bottom - roi->top;
  const UINT32 halfHeight = (nHeight + 1) / 2;
  const UINT32 halfWidth = (nWidth + 1) / 2;
  const UINT32 halfPad = halfWidth % 16;
  UINT32 x, y;

  /* Filter */
  for (y = roi->top; y < halfHeight + roi->top; y++)
  {
    const UINT32 val2y = (y * 2 + evenY);
    const UINT32 val2y1 = val2y + oddY;
    BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
    BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
    BYTE* pU = pDst[1] + dstStep[1] * val2y;
    BYTE* pV = pDst[2] + dstStep[2] * val2y;

    if (val2y1 > nHeight)
      continue;

    for (x = roi->left; x < halfWidth + roi->left - halfPad; x += 16)
    {
      ssse3_filter(&pU[2 * x], &pU1[2 * x]);
      ssse3_filter(&pV[2 * x], &pV1[2 * x]);
    }

    for (; x < halfWidth + roi->left; x++)
    {
      const UINT32 val2x = (x * 2);
      const UINT32 val2x1 = val2x + 1;
      const BYTE inU = pU[val2x];
      const BYTE inV = pV[val2x];
      const INT32 up = inU * 4;
      const INT32 vp = inV * 4;
      INT32 u2020;
      INT32 v2020;

      if (val2x1 > nWidth)
        continue;

      u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
      v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
      pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
      pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
    }
  }

  return PRIMITIVES_SUCCESS;
}
1235
1236
static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
1237
                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
1238
                                        const UINT32 dstStep[3],
1239
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
1240
0
{
1241
0
  const UINT32 mod = 16;
  UINT32 uY = 0;
  UINT32 vY = 0;
  UINT32 x, y;
  const UINT32 nWidth = roi->right - roi->left;
  const UINT32 nHeight = roi->bottom - roi->top;
  const UINT32 halfWidth = (nWidth + 1) / 2;
  const UINT32 halfPad = halfWidth % 16;
  const UINT32 halfHeight = (nHeight + 1) / 2;
  const UINT32 oddY = 1;
  const UINT32 evenY = 0;
  const UINT32 oddX = 1;
  /* The auxiliary frame is aligned to multiples of 16x16.
   * We need the padded height for B4 and B5 conversion. */
  const UINT32 padHeight = nHeight + 16 - nHeight % 16;
  const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
                          pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
                          pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
  BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
                    pDstRaw[1] + roi->top * dstStep[1] + roi->left,
                    pDstRaw[2] + roi->top * dstStep[2] + roi->left };
  const __m128i zero = _mm_setzero_si128();
  const __m128i mask =
      _mm_set_epi8(0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0);

  /* The second half of U and V is a bit more tricky... */
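  /* Auxiliary luma rows alternate between the two chroma planes in blocks of
   * eight: the first half of every 16-row block supplies U, the second half
   * supplies V. Rows whose target position falls outside the ROI height only
   * advance the counters. */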
  /* B4 and B5 */
  for (y = 0; y < padHeight; y++)
  {
    const BYTE* Ya = pSrc[0] + srcStep[0] * y;
    BYTE* pX;

    if ((y) % mod < (mod + 1) / 2)
    {
      const UINT32 pos = (2 * uY++ + oddY);

      if (pos >= nHeight)
        continue;

      pX = pDst[1] + dstStep[1] * pos;
    }
    else
    {
      const UINT32 pos = (2 * vY++ + oddY);

      if (pos >= nHeight)
        continue;

      pX = pDst[2] + dstStep[2] * pos;
    }

    memcpy(pX, Ya, nWidth);
  }

  /* B6 and B7 */
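  /* Expand each half-width auxiliary chroma row into the odd columns of the
   * corresponding even destination row: unpacking against zero interleaves
   * the samples with zero bytes, and the store mask only writes the lanes
   * that hold data, so the even (main-view) columns are left untouched. */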
  for (y = 0; y < halfHeight; y++)
  {
    const UINT32 val2y = (y * 2 + evenY);
    const BYTE* Ua = pSrc[1] + srcStep[1] * y;
    const BYTE* Va = pSrc[2] + srcStep[2] * y;
    BYTE* pU = pDst[1] + dstStep[1] * val2y;
    BYTE* pV = pDst[2] + dstStep[2] * val2y;

    for (x = 0; x < halfWidth - halfPad; x += 16)
    {
      {
        const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
        const __m128i u2 = _mm_unpackhi_epi8(zero, u);
        const __m128i u1 = _mm_unpacklo_epi8(zero, u);
        _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
        _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
      }
      {
        const __m128i v = _mm_loadu_si128((const __m128i*)&Va[x]);
        const __m128i v2 = _mm_unpackhi_epi8(zero, v);
        const __m128i v1 = _mm_unpacklo_epi8(zero, v);
        _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
        _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
      }
    }

    for (; x < halfWidth; x++)
    {
      const UINT32 val2x1 = (x * 2 + oddX);
      pU[val2x1] = Ua[x];
      pV[val2x1] = Va[x];
    }
  }

  /* Filter */
  return ssse3_ChromaFilter(pDst, dstStep, roi);
}

static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3],
                                        const UINT32 srcStep[3], UINT32 nTotalWidth,
                                        UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],
                                        const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
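  /* AVC444 v2 auxiliary frame layout: the left and right halves of the
   * auxiliary luma plane (B4/B5) hold the odd-column U respectively V
   * samples for every row. The auxiliary U plane holds the odd-row samples
   * for columns 4x, the auxiliary V plane those for columns 4x + 2, each
   * again split into a U half and a V half (B6 - B9). */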
  UINT32 x, y;
  const UINT32 nWidth = roi->right - roi->left;
  const UINT32 nHeight = roi->bottom - roi->top;
  const UINT32 halfWidth = (nWidth + 1) / 2;
  const UINT32 halfPad = halfWidth % 16;
  const UINT32 halfHeight = (nHeight + 1) / 2;
  const UINT32 quarterWidth = (nWidth + 3) / 4;
  const UINT32 quarterPad = quarterWidth % 16;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mask =
      _mm_set_epi8(0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0);
  const __m128i mask2 =
      _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80);
  const __m128i shuffle1 =
      _mm_set_epi8(0x80, 15, 0x80, 14, 0x80, 13, 0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8);
  const __m128i shuffle2 =
      _mm_set_epi8(0x80, 7, 0x80, 6, 0x80, 5, 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0);
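  /* mask selects the odd destination bytes and mask2 the even ones; shuffle1
   * and shuffle2 spread the high respectively low eight source bytes across
   * the even lanes of a vector, leaving zeros in between. */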

  /* B4 and B5: odd UV values for width/2, height */
  for (y = 0; y < nHeight; y++)
  {
    const UINT32 yTop = y + roi->top;
    const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
    const BYTE* pYaV = pYaU + nTotalWidth / 2;
    BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
    BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;

    for (x = 0; x < halfWidth - halfPad; x += 16)
    {
      {
        const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
        const __m128i u2 = _mm_unpackhi_epi8(zero, u);
        const __m128i u1 = _mm_unpacklo_epi8(zero, u);
        _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
        _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
      }
      {
        const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
        const __m128i v2 = _mm_unpackhi_epi8(zero, v);
        const __m128i v1 = _mm_unpacklo_epi8(zero, v);
        _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
        _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
      }
    }

    for (; x < halfWidth; x++)
    {
      const UINT32 odd = 2 * x + 1;
      pU[odd] = pYaU[x];
      pV[odd] = pYaV[x];
    }
  }

  /* B6 - B9 */
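  /* Interleave sixteen samples from the auxiliary U and V planes so that the
   * column-4x and column-4x+2 values alternate, then spread each half across
   * the even bytes of a vector; the masked stores skip the odd bytes, which
   * already hold the B4/B5 samples. */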
  for (y = 0; y < halfHeight; y++)
  {
    const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
    const BYTE* pUaV = pUaU + nTotalWidth / 4;
    const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
    const BYTE* pVaV = pVaU + nTotalWidth / 4;
    BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
    BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;

    for (x = 0; x < quarterWidth - quarterPad; x += 16)
    {
      {
        const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
        const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
        const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
        const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
        const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
        const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
        const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
        const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
        _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
        _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
        _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
        _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
      }
      {
        const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
        const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
        const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
        const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
        const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
        const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
        const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
        const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
        _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
        _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
        _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
        _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
      }
    }

    for (; x < quarterWidth; x++)
    {
      pU[4 * x + 0] = pUaU[x];
      pV[4 * x + 0] = pUaV[x];
      pU[4 * x + 2] = pVaU[x];
      pV[4 * x + 2] = pVaV[x];
    }
  }

  return ssse3_ChromaFilter(pDst, dstStep, roi);
}

static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
                                             const BYTE* const WINPR_RESTRICT pSrc[3],
                                             const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
                                             BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
                                             const RECTANGLE_16* WINPR_RESTRICT roi)
{
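  /* Validate the plane pointers and the ROI, then dispatch on the AVC444
   * stream type; -1 reports invalid arguments or an unknown type. */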
  if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
    return -1;

  if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
    return -1;

  if (!roi)
    return -1;

  switch (type)
  {
    case AVC444_LUMA:
      return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);

    case AVC444_CHROMAv1:
      return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);

    case AVC444_CHROMAv2:
      return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);

    default:
      return -1;
  }
}

void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
{
  generic = primitives_get_generic();
  primitives_init_YUV(prims);
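
  /* Install the SSSE3 kernels only when the CPU reports both SSE3 and SSSE3;
   * otherwise the generic C implementations set up above stay in place. */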
  if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
      IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
  {
    prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
    prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV;
    prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2;
    prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
    prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
    prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
  }
}
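
/*
 * Illustrative sketch (not part of the original file): a minimal example of
 * how a caller might reach the combine primitive through the primitives
 * table. The function name, buffer setup and ROI values are assumptions made
 * for the example, kept inside #if 0 so the file compiles unchanged.
 */
#if 0
static pstatus_t combine_example(const BYTE* const pMain[3], const UINT32 mainStep[3],
                                 BYTE* pYuv444[3], const UINT32 dstStep[3], UINT32 w, UINT32 h)
{
  /* primitives_get() returns the table populated by primitives_init_YUV_opt(). */
  primitives_t* prims = primitives_get();
  const RECTANGLE_16 roi = { 0, 0, (UINT16)w, (UINT16)h };

  /* First pass: expand the main (luma) view into the 4:4:4 buffers.
   * Subsequent AVC444_CHROMAv1/v2 passes would fill in the chroma detail. */
  return prims->YUV420CombineToYUV444(AVC444_LUMA, pMain, mainStep, w, h, pYuv444, dstStep,
                                      &roi);
}
#endif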