Coverage Report

Created: 2024-09-08 06:20

/src/FreeRDP/libfreerdp/primitives/sse/prim_YUV_ssse3.c
Line
Source
1
/**
2
 * FreeRDP: A Remote Desktop Protocol Implementation
3
 * Optimized YUV/RGB conversion operations
4
 *
5
 * Copyright 2014 Thomas Erbesdobler
6
 * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
7
 * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
8
 * Copyright 2016-2017 Thincast Technologies GmbH
9
 *
10
 * Licensed under the Apache License, Version 2.0 (the "License");
11
 * you may not use this file except in compliance with the License.
12
 * You may obtain a copy of the License at
13
 *
14
 *     http://www.apache.org/licenses/LICENSE-2.0
15
 *
16
 * Unless required by applicable law or agreed to in writing, software
17
 * distributed under the License is distributed on an "AS IS" BASIS,
18
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
 * See the License for the specific language governing permissions and
20
 * limitations under the License.
21
 */
22
23
#include <winpr/wtypes.h>
24
#include <freerdp/config.h>
25
26
#include <winpr/sysinfo.h>
27
#include <winpr/crt.h>
28
#include <freerdp/types.h>
29
#include <freerdp/primitives.h>
30
31
#include "prim_internal.h"
32
#include "prim_YUV.h"
33
34
#if defined(SSE2_ENABLED)
35
#include <emmintrin.h>
36
#include <tmmintrin.h>
37
38
static primitives_t* generic = NULL;
39
40
/****************************************************************************/
41
/* SSSE3 YUV420 -> RGB conversion                                           */
42
/****************************************************************************/
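/*
 * Note: the fixed-point constants used in ssse3_YUV444Pixel below (403, 48,
 * 120 and 475) are the BT.709 inverse transform coefficients scaled by 256
 * (1.5748, 0.1873, 0.4681 and 1.8556). The Y shuffle places each luma byte in
 * the second byte of its 32 bit lane (i.e. Y * 256), so the final >> 8 in
 * each colour channel removes the scaling again.
 */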
43
static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
44
                                  __m128i Vraw, UINT8 pos)
45
{
46
  /* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer list */
47
  /* Note: This also applies to Visual Studio 2013 before Update 4 */
48
#if !defined(_MSC_VER) || (_MSC_VER > 1600)
49
  const __m128i mapY[] = { _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
50
                         _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
51
                         _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
52
                         _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
53
  const __m128i mapUV[] = { _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
54
                          _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
55
                          _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
56
                          _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
57
  const __m128i mask[] = { _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
58
                         _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
59
                         _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
60
#else
61
  /* Note: must be in little-endian format! */
62
  const __m128i mapY[] = { { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
63
                           0x80, 0x80, 0x03, 0x80, 0x80 },
64
                         { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80,
65
                           0x80, 0x80, 0x07, 0x80, 0x80 },
66
                         { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80,
67
                           0x80, 0x80, 0x0b, 0x80, 0x80 },
68
                         { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80,
69
                           0x80, 0x80, 0x0f, 0x80, 0x80 }
70
71
  };
72
  const __m128i mapUV[] = { { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01,
73
                            0x80, 0x02, 0x80, 0x03, 0x80 },
74
                          { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05,
75
                            0x80, 0x06, 0x80, 0x07, 0x80 },
76
                          { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09,
77
                            0x80, 0x0a, 0x80, 0x0b, 0x80 },
78
                          { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d,
79
                            0x80, 0x0e, 0x80, 0x0f, 0x80 } };
80
  const __m128i mask[] = { { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02,
81
                           0x80, 0x80, 0x80, 0x03, 0x80 },
82
                         { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
83
                           0x80, 0x80, 0x03, 0x80, 0x80 },
84
                         { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80,
85
                           0x80, 0x03, 0x80, 0x80, 0x80 } };
86
#endif
87
  const __m128i c128 = _mm_set1_epi16(128);
88
  __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
89
                               _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
90
  {
91
    __m128i C;
92
    __m128i D;
93
    __m128i E;
94
    /* Load Y values and expand to 32 bit */
95
    {
96
      C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */
97
    }
98
    /* Load U values and expand to 32 bit */
99
    {
100
      const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */
101
      D = _mm_sub_epi16(U, c128);                           /* D = U - 128 */
102
    }
103
    /* Load V values and expand to 32 bit */
104
    {
105
      const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */
106
      E = _mm_sub_epi16(V, c128);                           /* E = V - 128 */
107
    }
108
    /* Get the R value */
109
    {
110
      const __m128i c403 = _mm_set1_epi16(403);
111
      const __m128i e403 =
112
          _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
113
      const __m128i Rs = _mm_add_epi32(C, e403);
114
      const __m128i R32 = _mm_srai_epi32(Rs, 8);
115
      const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
116
      const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
117
      const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
118
      BGRX = _mm_or_si128(BGRX, packed);
119
    }
120
    /* Get the G value */
121
    {
122
      const __m128i c48 = _mm_set1_epi16(48);
123
      const __m128i d48 =
124
          _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
125
      const __m128i c120 = _mm_set1_epi16(120);
126
      const __m128i e120 =
127
          _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
128
      const __m128i de = _mm_add_epi32(d48, e120);
129
      const __m128i Gs = _mm_sub_epi32(C, de);
130
      const __m128i G32 = _mm_srai_epi32(Gs, 8);
131
      const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
132
      const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
133
      const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
134
      BGRX = _mm_or_si128(BGRX, packed);
135
    }
136
    /* Get the B value */
137
    {
138
      const __m128i c475 = _mm_set1_epi16(475);
139
      const __m128i d475 =
140
          _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
141
      const __m128i Bs = _mm_add_epi32(C, d475);
142
      const __m128i B32 = _mm_srai_epi32(Bs, 8);
143
      const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
144
      const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
145
      const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
146
      BGRX = _mm_or_si128(BGRX, packed);
147
    }
148
  }
149
  _mm_storeu_si128(dst++, BGRX);
150
  return dst;
151
}
152
153
static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* const WINPR_RESTRICT pSrc[],
154
                                        const UINT32* WINPR_RESTRICT srcStep,
155
                                        BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
156
                                        const prim_size_t* WINPR_RESTRICT roi)
157
{
158
  const UINT32 nWidth = roi->width;
159
  const UINT32 nHeight = roi->height;
160
  const UINT32 pad = roi->width % 16;
161
  const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
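  /* The 'duplicate' control above repeats each of the 8 subsampled U/V bytes
   * twice, upsampling the chroma horizontally to match the 16 luma samples. */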
162
163
  for (size_t y = 0; y < nHeight; y++)
164
  {
165
    __m128i* dst = (__m128i*)(pDst + dstStep * y);
166
    const BYTE* YData = pSrc[0] + y * srcStep[0];
167
    const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
168
    const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];
169
170
    for (UINT32 x = 0; x < nWidth - pad; x += 16)
171
    {
172
      const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
173
      const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
174
      const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
175
      const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
176
      const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
177
      YData += 16;
178
      UData += 8;
179
      VData += 8;
180
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
181
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
182
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
183
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
184
    }
185
186
    for (UINT32 x = 0; x < pad; x++)
187
    {
188
      const BYTE Y = *YData++;
189
      const BYTE U = *UData;
190
      const BYTE V = *VData;
191
      const BYTE r = YUV2R(Y, U, V);
192
      const BYTE g = YUV2G(Y, U, V);
193
      const BYTE b = YUV2B(Y, U, V);
194
      dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
195
196
      if (x % 2)
197
      {
198
        UData++;
199
        VData++;
200
      }
201
    }
202
  }
203
204
  return PRIMITIVES_SUCCESS;
205
}
206
207
static pstatus_t ssse3_YUV420ToRGB(const BYTE* const WINPR_RESTRICT pSrc[3],
208
                                   const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
209
                                   UINT32 dstStep, UINT32 DstFormat,
210
                                   const prim_size_t* WINPR_RESTRICT roi)
211
{
212
  switch (DstFormat)
213
  {
214
    case PIXEL_FORMAT_BGRX32:
215
    case PIXEL_FORMAT_BGRA32:
216
      return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);
217
218
    default:
219
      return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
220
  }
221
}
222
223
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* const WINPR_RESTRICT pSrc[],
224
                                                  const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
225
                                                  UINT32 dstStep,
226
                                                  const prim_size_t* WINPR_RESTRICT roi)
227
{
228
  const UINT32 nWidth = roi->width;
229
  const UINT32 nHeight = roi->height;
230
  const UINT32 pad = roi->width % 16;
231
232
  for (size_t y = 0; y < nHeight; y++)
233
  {
234
    __m128i* dst = (__m128i*)(pDst + dstStep * y);
235
    const BYTE* YData = pSrc[0] + y * srcStep[0];
236
    const BYTE* UData = pSrc[1] + y * srcStep[1];
237
    const BYTE* VData = pSrc[2] + y * srcStep[2];
238
239
    for (size_t x = 0; x < nWidth - pad; x += 16)
240
    {
241
      __m128i Y = _mm_load_si128((const __m128i*)YData);
242
      __m128i U = _mm_load_si128((const __m128i*)UData);
243
      __m128i V = _mm_load_si128((const __m128i*)VData);
244
      YData += 16;
245
      UData += 16;
246
      VData += 16;
247
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
248
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
249
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
250
      dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
251
    }
252
253
    for (size_t x = 0; x < pad; x++)
254
    {
255
      const BYTE Y = *YData++;
256
      const BYTE U = *UData++;
257
      const BYTE V = *VData++;
258
      const BYTE r = YUV2R(Y, U, V);
259
      const BYTE g = YUV2G(Y, U, V);
260
      const BYTE b = YUV2B(Y, U, V);
261
      dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
262
    }
263
  }
264
265
  return PRIMITIVES_SUCCESS;
266
}
267
268
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[],
269
                                             const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
270
                                             UINT32 dstStep, UINT32 DstFormat,
271
                                             const prim_size_t* WINPR_RESTRICT roi)
272
{
273
  if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
274
      srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
275
    return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
276
277
  switch (DstFormat)
278
  {
279
    case PIXEL_FORMAT_BGRX32:
280
    case PIXEL_FORMAT_BGRA32:
281
      return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
282
283
    default:
284
      return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
285
  }
286
}
287
288
/****************************************************************************/
289
/* SSSE3 RGB -> YUV420 conversion                                          **/
290
/****************************************************************************/
291
292
/**
293
 * Note (nfedera):
294
 * The used forward transformation factors from RGB to YUV are based on the
295
 * values specified in [Rec. ITU-R BT.709-6] Section 3:
296
 * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en
297
 *
298
 * Y =  0.21260 * R + 0.71520 * G + 0.07220 * B +   0;
299
 * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128;
300
 * V =  0.50000 * R - 0.45415 * G - 0.04585 * B + 128;
301
 *
302
 * The most accurate integer arithmetic approximation when using 8-bit signed
303
 * integer factors with 16-bit signed integer intermediate results is:
304
 *
305
 * Y = ( ( 27 * R + 92 * G +  9 * B) >> 7 );
306
 * U = ( (-29 * R - 99 * G + 128 * B) >> 8 ) + 128;
307
 * V = ( ( 128 * R - 116 * G -  12 * B) >> 8 ) + 128;
308
 *
309
 * Due to the signed 8-bit range of [-128,127], the U and V factors of 128 are
310
 * rounded down to 127.
311
 */
312
313
#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
314
#define BGRX_U_FACTORS \
315
  _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
316
#define BGRX_V_FACTORS \
317
  _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
318
#define CONST128_FACTORS _mm_set1_epi8(-128)
319
320
#define Y_SHIFT 7
321
#define U_SHIFT 8
322
#define V_SHIFT 8
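/*
 * Illustrative scalar sketch of the integer approximation above. It is not
 * used by the SSSE3 code in this file; the helper names are hypothetical and
 * only spell out the per-pixel arithmetic that the vector code below performs
 * 16 pixels at a time.
 */
static INLINE BYTE scalar_rgb_to_Y(INT32 r, INT32 g, INT32 b)
{
  return (BYTE)((27 * r + 92 * g + 9 * b) >> Y_SHIFT);
}

static INLINE BYTE scalar_rgb_to_U(INT32 r, INT32 g, INT32 b)
{
  return (BYTE)(((-29 * r - 99 * g + 128 * b) >> U_SHIFT) + 128);
}

static INLINE BYTE scalar_rgb_to_V(INT32 r, INT32 g, INT32 b)
{
  return (BYTE)(((128 * r - 116 * g - 12 * b) >> V_SHIFT) + 128);
}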
323
324
/*
325
TODO:
326
RGB[AX] can simply be supported using the following factors. And instead of loading the
327
globals directly the functions below could be passed pointers to the correct vectors
328
depending on the source picture format.
329
330
PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
331
      27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0
332
};
333
PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
334
     -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0
335
};
336
PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
337
      64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0
338
};
339
*/
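/*
 * For illustration only: expressed with the _mm_set_epi8 convention used for
 * the BGRX factors above, the hypothetical RGBX luma factors from the TODO
 * would look like this (unused sketch):
 */
#define RGBX_Y_FACTORS _mm_set_epi8(0, 9, 92, 27, 0, 9, 92, 27, 0, 9, 92, 27, 0, 9, 92, 27)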
340
341
/* compute the luma (Y) component from a single rgb source line */
342
343
static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
344
{
345
  __m128i x0;
346
  __m128i x1;
347
  __m128i x2;
348
  __m128i x3;
349
  const __m128i y_factors = BGRX_Y_FACTORS;
350
  const __m128i* argb = (const __m128i*)src;
351
  __m128i* ydst = (__m128i*)dst;
352
353
  for (UINT32 x = 0; x < width; x += 16)
354
  {
355
    /* store 16 rgba pixels in 4 128 bit registers */
356
    x0 = _mm_load_si128(argb++); // 1st 4 pixels
357
    x1 = _mm_load_si128(argb++); // 2nd 4 pixels
358
    x2 = _mm_load_si128(argb++); // 3rd 4 pixels
359
    x3 = _mm_load_si128(argb++); // 4th 4 pixels
360
    /* multiplications and subtotals */
361
    x0 = _mm_maddubs_epi16(x0, y_factors);
362
    x1 = _mm_maddubs_epi16(x1, y_factors);
363
    x2 = _mm_maddubs_epi16(x2, y_factors);
364
    x3 = _mm_maddubs_epi16(x3, y_factors);
365
    /* the total sums */
366
    x0 = _mm_hadd_epi16(x0, x1);
367
    x2 = _mm_hadd_epi16(x2, x3);
368
    /* shift the results */
369
    x0 = _mm_srli_epi16(x0, Y_SHIFT);
370
    x2 = _mm_srli_epi16(x2, Y_SHIFT);
371
    /* pack the 16 words into bytes */
372
    x0 = _mm_packus_epi16(x0, x2);
373
    /* save to y plane */
374
    _mm_storeu_si128(ydst++, x0);
375
  }
376
}
377
378
/* compute the chrominance (UV) components from two rgb source lines */
379
380
static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
381
                                             const BYTE* WINPR_RESTRICT src2,
382
                                             BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
383
                                             UINT32 width)
384
{
385
  const __m128i u_factors = BGRX_U_FACTORS;
386
  const __m128i v_factors = BGRX_V_FACTORS;
387
  const __m128i vector128 = CONST128_FACTORS;
388
  __m128i x0;
389
  __m128i x1;
390
  __m128i x2;
391
  __m128i x3;
392
  __m128i x4;
393
  __m128i x5;
394
  const __m128i* rgb1 = (const __m128i*)src1;
395
  const __m128i* rgb2 = (const __m128i*)src2;
396
  __m64* udst = (__m64*)dst1;
397
  __m64* vdst = (__m64*)dst2;
398
399
  for (UINT32 x = 0; x < width; x += 16)
400
  {
401
    /* subsample 16x2 pixels into 16x1 pixels */
402
    x0 = _mm_load_si128(rgb1++);
403
    x4 = _mm_load_si128(rgb2++);
404
    x0 = _mm_avg_epu8(x0, x4);
405
    x1 = _mm_load_si128(rgb1++);
406
    x4 = _mm_load_si128(rgb2++);
407
    x1 = _mm_avg_epu8(x1, x4);
408
    x2 = _mm_load_si128(rgb1++);
409
    x4 = _mm_load_si128(rgb2++);
410
    x2 = _mm_avg_epu8(x2, x4);
411
    x3 = _mm_load_si128(rgb1++);
412
    x4 = _mm_load_si128(rgb2++);
413
    x3 = _mm_avg_epu8(x3, x4);
414
    /* subsample these 16x1 pixels into 8x1 pixels */
415
    /**
416
     * shuffle controls
417
     * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88
418
     * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd
419
     */
420
    x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
421
    x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
422
    x0 = _mm_avg_epu8(x0, x4);
423
    x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
424
    x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
425
    x1 = _mm_avg_epu8(x1, x4);
426
    /* multiplications and subtotals */
427
    x2 = _mm_maddubs_epi16(x0, u_factors);
428
    x3 = _mm_maddubs_epi16(x1, u_factors);
429
    x4 = _mm_maddubs_epi16(x0, v_factors);
430
    x5 = _mm_maddubs_epi16(x1, v_factors);
431
    /* the total sums */
432
    x0 = _mm_hadd_epi16(x2, x3);
433
    x1 = _mm_hadd_epi16(x4, x5);
434
    /* shift the results */
435
    x0 = _mm_srai_epi16(x0, U_SHIFT);
436
    x1 = _mm_srai_epi16(x1, V_SHIFT);
437
    /* pack the 16 words into bytes */
438
    x0 = _mm_packs_epi16(x0, x1);
439
    /* add 128 */
440
    x0 = _mm_sub_epi8(x0, vector128);
441
    /* the lower 8 bytes go to the u plane */
442
    _mm_storel_pi(udst++, _mm_castsi128_ps(x0));
443
    /* the upper 8 bytes go to the v plane */
444
    _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
445
  }
446
}
447
448
static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
449
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
450
                                        const UINT32 dstStep[],
451
                                        const prim_size_t* WINPR_RESTRICT roi)
452
{
453
  const BYTE* argb = pSrc;
454
  BYTE* ydst = pDst[0];
455
  BYTE* udst = pDst[1];
456
  BYTE* vdst = pDst[2];
457
458
  if (roi->height < 1 || roi->width < 1)
459
  {
460
    return !PRIMITIVES_SUCCESS;
461
  }
462
463
  if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
464
  {
465
    return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
466
  }
467
468
  for (UINT32 y = 0; y < roi->height - 1; y += 2)
469
  {
470
    const BYTE* line1 = argb;
471
    const BYTE* line2 = argb + srcStep;
472
    ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
473
    ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
474
    ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
475
    argb += 2ULL * srcStep;
476
    ydst += 2ULL * dstStep[0];
477
    udst += 1ULL * dstStep[1];
478
    vdst += 1ULL * dstStep[2];
479
  }
480
481
  if (roi->height & 1)
482
  {
483
    /* pass the same last line of an odd height twice for UV */
484
    ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
485
    ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
486
  }
487
488
  return PRIMITIVES_SUCCESS;
489
}
490
491
static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
492
                                   UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
493
                                   const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
494
{
495
  switch (srcFormat)
496
  {
497
    case PIXEL_FORMAT_BGRX32:
498
    case PIXEL_FORMAT_BGRA32:
499
      return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
500
501
    default:
502
      return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
503
  }
504
}
505
506
/****************************************************************************/
507
/* SSSE3 RGB -> AVC444-YUV conversion                                      **/
508
/****************************************************************************/
509
510
static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
511
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
512
    BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
513
    BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
514
    BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
515
{
516
  const __m128i* argbEven = (const __m128i*)srcEven;
517
  const __m128i* argbOdd = (const __m128i*)srcOdd;
518
  const __m128i y_factors = BGRX_Y_FACTORS;
519
  const __m128i u_factors = BGRX_U_FACTORS;
520
  const __m128i v_factors = BGRX_V_FACTORS;
521
  const __m128i vector128 = CONST128_FACTORS;
522
523
  for (UINT32 x = 0; x < width; x += 16)
524
  {
525
    /* store 16 rgba pixels in 4 128 bit registers */
526
    const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
527
    const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
528
    const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
529
    const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
530
    const __m128i xo1 = _mm_load_si128(argbOdd++);  // 1st 4 pixels
531
    const __m128i xo2 = _mm_load_si128(argbOdd++);  // 2nd 4 pixels
532
    const __m128i xo3 = _mm_load_si128(argbOdd++);  // 3rd 4 pixels
533
    const __m128i xo4 = _mm_load_si128(argbOdd++);  // 4th 4 pixels
534
    {
535
      /* Y: multiplications with subtotals and horizontal sums */
536
      const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
537
                                                        _mm_maddubs_epi16(xe2, y_factors)),
538
                                         Y_SHIFT);
539
      const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
540
                                                        _mm_maddubs_epi16(xe4, y_factors)),
541
                                         Y_SHIFT);
542
      const __m128i ye = _mm_packus_epi16(ye1, ye2);
543
      const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
544
                                                        _mm_maddubs_epi16(xo2, y_factors)),
545
                                         Y_SHIFT);
546
      const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
547
                                                        _mm_maddubs_epi16(xo4, y_factors)),
548
                                         Y_SHIFT);
549
      const __m128i yo = _mm_packus_epi16(yo1, yo2);
550
      /* store y [b1] */
551
      _mm_storeu_si128((__m128i*)b1Even, ye);
552
      b1Even += 16;
553
554
      if (b1Odd)
555
      {
556
        _mm_storeu_si128((__m128i*)b1Odd, yo);
557
        b1Odd += 16;
558
      }
559
    }
560
    {
561
      /* We have now
562
       * 16 even U values in ue
563
       * 16 odd U values in uo
564
       *
565
       * We need to split these according to
566
       * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
567
      __m128i ue;
568
      __m128i uo = { 0 };
569
      {
570
        const __m128i ue1 =
571
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
572
                                          _mm_maddubs_epi16(xe2, u_factors)),
573
                           U_SHIFT);
574
        const __m128i ue2 =
575
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
576
                                          _mm_maddubs_epi16(xe4, u_factors)),
577
                           U_SHIFT);
578
        ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
579
      }
580
581
      if (b1Odd)
582
      {
583
        const __m128i uo1 =
584
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
585
                                          _mm_maddubs_epi16(xo2, u_factors)),
586
                           U_SHIFT);
587
        const __m128i uo2 =
588
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
589
                                          _mm_maddubs_epi16(xo4, u_factors)),
590
                           U_SHIFT);
591
        uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
592
      }
593
594
      /* Now we need the following storage distribution:
595
       * 2x   2y    -> b2
596
       * x    2y+1  -> b4
597
       * 2x+1 2y    -> b6 */
598
      if (b1Odd) /* b2 */
599
      {
600
        const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
601
        const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
602
        const __m128i hi = _mm_add_epi16(ueh, uoh);
603
        const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
604
        const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
605
        const __m128i lo = _mm_add_epi16(uel, uol);
606
        const __m128i added = _mm_hadd_epi16(lo, hi);
607
        const __m128i avg16 = _mm_srai_epi16(added, 2);
608
        const __m128i avg = _mm_packus_epi16(avg16, avg16);
609
        _mm_storel_epi64((__m128i*)b2, avg);
610
      }
611
      else
612
      {
613
        const __m128i mask =
614
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
615
                         (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
616
        const __m128i ud = _mm_shuffle_epi8(ue, mask);
617
        _mm_storel_epi64((__m128i*)b2, ud);
618
      }
619
620
      b2 += 8;
621
622
      if (b1Odd) /* b4 */
623
      {
624
        _mm_store_si128((__m128i*)b4, uo);
625
        b4 += 16;
626
      }
627
628
      {
629
        /* b6 */
630
        const __m128i mask =
631
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
632
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
633
        const __m128i ude = _mm_shuffle_epi8(ue, mask);
634
        _mm_storel_epi64((__m128i*)b6, ude);
635
        b6 += 8;
636
      }
637
    }
638
    {
639
      /* We have now
640
       * 16 even V values in ue
641
       * 16 odd V values in uo
642
       *
643
       * We need to split these according to
644
       * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
645
      __m128i ve;
646
      __m128i vo = { 0 };
647
      {
648
        const __m128i ve1 =
649
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
650
                                          _mm_maddubs_epi16(xe2, v_factors)),
651
                           V_SHIFT);
652
        const __m128i ve2 =
653
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
654
                                          _mm_maddubs_epi16(xe4, v_factors)),
655
                           V_SHIFT);
656
        ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
657
      }
658
659
      if (b1Odd)
660
      {
661
        const __m128i vo1 =
662
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
663
                                          _mm_maddubs_epi16(xo2, v_factors)),
664
                           V_SHIFT);
665
        const __m128i vo2 =
666
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
667
                                          _mm_maddubs_epi16(xo4, v_factors)),
668
                           V_SHIFT);
669
        vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
670
      }
671
672
      /* Now we need the following storage distribution:
673
       * 2x   2y    -> b3
674
       * x    2y+1  -> b5
675
       * 2x+1 2y    -> b7 */
676
      if (b1Odd) /* b3 */
677
      {
678
        const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
679
        const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
680
        const __m128i hi = _mm_add_epi16(veh, voh);
681
        const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
682
        const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
683
        const __m128i lo = _mm_add_epi16(vel, vol);
684
        const __m128i added = _mm_hadd_epi16(lo, hi);
685
        const __m128i avg16 = _mm_srai_epi16(added, 2);
686
        const __m128i avg = _mm_packus_epi16(avg16, avg16);
687
        _mm_storel_epi64((__m128i*)b3, avg);
688
      }
689
      else
690
      {
691
        const __m128i mask =
692
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
693
                         (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
694
        const __m128i vd = _mm_shuffle_epi8(ve, mask);
695
        _mm_storel_epi64((__m128i*)b3, vd);
696
      }
697
698
      b3 += 8;
699
700
      if (b1Odd) /* b5 */
701
      {
702
        _mm_store_si128((__m128i*)b5, vo);
703
        b5 += 16;
704
      }
705
706
      {
707
        /* b7 */
708
        const __m128i mask =
709
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
710
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
711
        const __m128i vde = _mm_shuffle_epi8(ve, mask);
712
        _mm_storel_epi64((__m128i*)b7, vde);
713
        b7 += 8;
714
      }
715
    }
716
  }
717
}
718
719
static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
720
                                           UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
721
                                           const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
722
                                           const UINT32 dst2Step[],
723
                                           const prim_size_t* WINPR_RESTRICT roi)
724
{
725
  const BYTE* pMaxSrc = pSrc + 1ULL * (roi->height - 1) * srcStep;
726
727
  if (roi->height < 1 || roi->width < 1)
728
    return !PRIMITIVES_SUCCESS;
729
730
  if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
731
    return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
732
                                   roi);
733
734
  for (size_t y = 0; y < roi->height; y += 2)
735
  {
736
    const BOOL last = (y >= (roi->height - 1));
737
    const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
738
    const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
739
    const UINT32 i = y >> 1;
740
    const UINT32 n = (i & ~7) + i;
741
    BYTE* b1Even = pDst1[0] + y * dst1Step[0];
742
    BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
743
    BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
744
    BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
745
    BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n;
746
    BYTE* b5 = b4 + 8ULL * dst2Step[0];
747
    BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
748
    BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
749
    ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
750
                                         roi->width);
751
  }
752
753
  return PRIMITIVES_SUCCESS;
754
}
755
756
static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
757
                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
758
                                      const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
759
                                      const UINT32 dst2Step[],
760
                                      const prim_size_t* WINPR_RESTRICT roi)
761
{
762
  switch (srcFormat)
763
  {
764
    case PIXEL_FORMAT_BGRX32:
765
    case PIXEL_FORMAT_BGRA32:
766
      return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
767
                                       dst2Step, roi);
768
769
    default:
770
      return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
771
                                     dst2Step, roi);
772
  }
773
}
774
775
/* Mapping of arguments:
776
 *
777
 * b1 [even lines] -> yLumaDstEven
778
 * b1 [odd lines]  -> yLumaDstOdd
779
 * b2              -> uLumaDst
780
 * b3              -> vLumaDst
781
 * b4              -> yChromaDst1
782
 * b5              -> yChromaDst2
783
 * b6              -> uChromaDst1
784
 * b7              -> uChromaDst2
785
 * b8              -> vChromaDst1
786
 * b9              -> vChromaDst2
787
 */
788
static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
789
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
790
    BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
791
    BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
792
    BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
793
    BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
794
    BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
795
    BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
796
{
797
  const __m128i vector128 = CONST128_FACTORS;
798
  const __m128i* argbEven = (const __m128i*)srcEven;
799
  const __m128i* argbOdd = (const __m128i*)srcOdd;
800
801
  for (UINT32 x = 0; x < width; x += 16)
802
  {
803
    /* store 16 rgba pixels in 4 128 bit registers
804
     * for even and odd rows.
805
     */
806
    const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */
807
    const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */
808
    const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */
809
    const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */
810
    const __m128i xo1 = _mm_load_si128(argbOdd++);  /* 1st 4 pixels */
811
    const __m128i xo2 = _mm_load_si128(argbOdd++);  /* 2nd 4 pixels */
812
    const __m128i xo3 = _mm_load_si128(argbOdd++);  /* 3rd 4 pixels */
813
    const __m128i xo4 = _mm_load_si128(argbOdd++);  /* 4th 4 pixels */
814
    {
815
      /* Y: multiplications with subtotals and horizontal sums */
816
      const __m128i y_factors = BGRX_Y_FACTORS;
817
      const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
818
                                                        _mm_maddubs_epi16(xe2, y_factors)),
819
                                         Y_SHIFT);
820
      const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
821
                                                        _mm_maddubs_epi16(xe4, y_factors)),
822
                                         Y_SHIFT);
823
      const __m128i ye = _mm_packus_epi16(ye1, ye2);
824
      /* store y [b1] */
825
      _mm_storeu_si128((__m128i*)yLumaDstEven, ye);
826
      yLumaDstEven += 16;
827
    }
828
829
    if (yLumaDstOdd)
830
    {
831
      const __m128i y_factors = BGRX_Y_FACTORS;
832
      const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
833
                                                        _mm_maddubs_epi16(xo2, y_factors)),
834
                                         Y_SHIFT);
835
      const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
836
                                                        _mm_maddubs_epi16(xo4, y_factors)),
837
                                         Y_SHIFT);
838
      const __m128i yo = _mm_packus_epi16(yo1, yo2);
839
      _mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
840
      yLumaDstOdd += 16;
841
    }
842
843
    {
844
      /* We have now
845
       * 16 even U values in ue
846
       * 16 odd U values in uo
847
       *
848
       * We need to split these according to
849
       * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */
850
      /* U: multiplications with subtotals and horizontal sums */
851
      __m128i ue;
852
      __m128i uo;
853
      __m128i uavg;
854
      {
855
        const __m128i u_factors = BGRX_U_FACTORS;
856
        const __m128i ue1 =
857
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
858
                                          _mm_maddubs_epi16(xe2, u_factors)),
859
                           U_SHIFT);
860
        const __m128i ue2 =
861
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
862
                                          _mm_maddubs_epi16(xe4, u_factors)),
863
                           U_SHIFT);
864
        const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
865
        ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
866
        uavg = ueavg;
867
      }
868
      {
869
        const __m128i u_factors = BGRX_U_FACTORS;
870
        const __m128i uo1 =
871
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
872
                                          _mm_maddubs_epi16(xo2, u_factors)),
873
                           U_SHIFT);
874
        const __m128i uo2 =
875
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
876
                                          _mm_maddubs_epi16(xo4, u_factors)),
877
                           U_SHIFT);
878
        const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
879
        uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
880
        uavg = _mm_add_epi16(uavg, uoavg);
881
        uavg = _mm_srai_epi16(uavg, 2);
882
        uavg = _mm_packs_epi16(uavg, uoavg);
883
        uavg = _mm_sub_epi8(uavg, vector128);
884
      }
885
      /* Now we need the following storage distribution:
886
       * 2x   2y    -> uLumaDst
887
       * 2x+1  y    -> yChromaDst1
888
       * 4x   2y+1  -> uChromaDst1
889
       * 4x+2 2y+1  -> vChromaDst1 */
890
      {
891
        const __m128i mask =
892
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
893
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
894
        const __m128i ude = _mm_shuffle_epi8(ue, mask);
895
        _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
896
        yEvenChromaDst1 += 8;
897
      }
898
899
      if (yLumaDstOdd)
900
      {
901
        const __m128i mask =
902
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
903
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
904
        const __m128i udo = _mm_shuffle_epi8(uo, mask);
905
        _mm_storel_epi64((__m128i*)yOddChromaDst1, udo);
906
        yOddChromaDst1 += 8;
907
      }
908
909
      if (yLumaDstOdd)
910
      {
911
        const __m128i mask =
912
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
913
                         (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
914
        const __m128i ud = _mm_shuffle_epi8(uo, mask);
915
        int* uDst1 = (int*)uChromaDst1;
916
        int* vDst1 = (int*)vChromaDst1;
917
        const int* src = (const int*)&ud;
918
        _mm_stream_si32(uDst1, src[0]);
919
        _mm_stream_si32(vDst1, src[1]);
920
        uChromaDst1 += 4;
921
        vChromaDst1 += 4;
922
      }
923
924
      if (yLumaDstOdd)
925
      {
926
        _mm_storel_epi64((__m128i*)uLumaDst, uavg);
927
        uLumaDst += 8;
928
      }
929
      else
930
      {
931
        const __m128i mask =
932
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
933
                         (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
934
        const __m128i ud = _mm_shuffle_epi8(ue, mask);
935
        _mm_storel_epi64((__m128i*)uLumaDst, ud);
936
        uLumaDst += 8;
937
      }
938
    }
939
940
    {
941
      /* V: multiplications with subtotals and horizontal sums */
942
      __m128i ve;
943
      __m128i vo;
944
      __m128i vavg;
945
      {
946
        const __m128i v_factors = BGRX_V_FACTORS;
947
        const __m128i ve1 =
948
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
949
                                          _mm_maddubs_epi16(xe2, v_factors)),
950
                           V_SHIFT);
951
        const __m128i ve2 =
952
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
953
                                          _mm_maddubs_epi16(xe4, v_factors)),
954
                           V_SHIFT);
955
        const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
956
        ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
957
        vavg = veavg;
958
      }
959
      {
960
        const __m128i v_factors = BGRX_V_FACTORS;
961
        const __m128i vo1 =
962
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
963
                                          _mm_maddubs_epi16(xo2, v_factors)),
964
                           V_SHIFT);
965
        const __m128i vo2 =
966
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
967
                                          _mm_maddubs_epi16(xo4, v_factors)),
968
                           V_SHIFT);
969
        const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
970
        vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
971
        vavg = _mm_add_epi16(vavg, voavg);
972
        vavg = _mm_srai_epi16(vavg, 2);
973
        vavg = _mm_packs_epi16(vavg, voavg);
974
        vavg = _mm_sub_epi8(vavg, vector128);
975
      }
976
      /* Now we need the following storage distribution:
977
       * 2x   2y    -> vLumaDst
978
       * 2x+1  y    -> yChromaDst2
979
       * 4x   2y+1  -> uChromaDst2
980
       * 4x+2 2y+1  -> vChromaDst2 */
981
      {
982
        const __m128i mask =
983
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
984
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
985
        __m128i vde = _mm_shuffle_epi8(ve, mask);
986
        _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
987
        yEvenChromaDst2 += 8;
988
      }
989
990
      if (yLumaDstOdd)
991
      {
992
        const __m128i mask =
993
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
994
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
995
        __m128i vdo = _mm_shuffle_epi8(vo, mask);
996
        _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
997
        yOddChromaDst2 += 8;
998
      }
999
1000
      if (yLumaDstOdd)
1001
      {
1002
        const __m128i mask =
1003
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1004
                         (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
1005
        const __m128i vd = _mm_shuffle_epi8(vo, mask);
1006
        int* uDst2 = (int*)uChromaDst2;
1007
        int* vDst2 = (int*)vChromaDst2;
1008
        const int* src = (const int*)&vd;
1009
        _mm_stream_si32(uDst2, src[0]);
1010
        _mm_stream_si32(vDst2, src[1]);
1011
        uChromaDst2 += 4;
1012
        vChromaDst2 += 4;
1013
      }
1014
1015
      if (yLumaDstOdd)
1016
      {
1017
        _mm_storel_epi64((__m128i*)vLumaDst, vavg);
1018
        vLumaDst += 8;
1019
      }
1020
      else
1021
      {
1022
        const __m128i mask =
1023
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1024
                         (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
1025
        __m128i vd = _mm_shuffle_epi8(ve, mask);
1026
        _mm_storel_epi64((__m128i*)vLumaDst, vd);
1027
        vLumaDst += 8;
1028
      }
1029
    }
1030
  }
1031
}
1032
1033
static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
1034
                                             UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
1035
                                             const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
1036
                                             const UINT32 dst2Step[],
1037
                                             const prim_size_t* WINPR_RESTRICT roi)
1038
{
1039
  if (roi->height < 1 || roi->width < 1)
1040
    return !PRIMITIVES_SUCCESS;
1041
1042
  if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
1043
    return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
1044
                                     roi);
1045
1046
  for (size_t y = 0; y < roi->height; y += 2)
1047
  {
1048
    const BYTE* srcEven = (pSrc + y * srcStep);
1049
    const BYTE* srcOdd = (srcEven + srcStep);
1050
    BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
1051
    BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL;
1052
    BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
1053
    BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
1054
    BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
1055
    BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
1056
    BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
1057
    BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
1058
    BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
1059
    BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
1060
    BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
1061
    BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
1062
    ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
1063
                                           dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
1064
                                           dstOddChromaY1, dstOddChromaY2, dstChromaU1,
1065
                                           dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
1066
  }
1067
1068
  return PRIMITIVES_SUCCESS;
1069
}
1070
1071
static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
1072
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
1073
                                        const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
1074
                                        const UINT32 dst2Step[],
1075
                                        const prim_size_t* WINPR_RESTRICT roi)
1076
{
1077
  switch (srcFormat)
1078
  {
1079
    case PIXEL_FORMAT_BGRX32:
1080
    case PIXEL_FORMAT_BGRA32:
1081
      return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1082
                                         dst2Step, roi);
1083
1084
    default:
1085
      return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1086
                                       dst2Step, roi);
1087
  }
1088
}
1089
1090
static pstatus_t ssse3_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[],
1091
                                    const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDstRaw[],
1092
                                    const UINT32 dstStep[], const RECTANGLE_16* WINPR_RESTRICT roi)
1093
{
1094
  const UINT32 nWidth = roi->right - roi->left;
1095
  const UINT32 nHeight = roi->bottom - roi->top;
1096
  const UINT32 halfWidth = (nWidth + 1) / 2;
1097
  const UINT32 halfPad = halfWidth % 16;
1098
  const UINT32 halfHeight = (nHeight + 1) / 2;
1099
  const UINT32 oddY = 1;
1100
  const UINT32 evenY = 0;
1101
  const UINT32 oddX = 1;
1102
  const UINT32 evenX = 0;
1103
  const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
1104
                        pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
1105
                        pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
1106
  BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
1107
                  pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
1108
                  pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
1109
1110
  /* Y data is already here... */
1111
  /* B1 */
1112
  for (size_t y = 0; y < nHeight; y++)
1113
  {
1114
    const BYTE* Ym = pSrc[0] + y * srcStep[0];
1115
    BYTE* pY = pDst[0] + y * dstStep[0];
1116
    memcpy(pY, Ym, nWidth);
1117
  }
1118
1119
  /* The first half of U and V is already part of this frame. */
1120
  /* B2 and B3 */
1121
  for (size_t y = 0; y < halfHeight; y++)
1122
  {
1123
    const size_t val2y = (2 * y + evenY);
1124
    const size_t val2y1 = val2y + oddY;
1125
    const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y;
1126
    const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y;
1127
    BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
1128
    BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
1129
    BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
1130
    BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;
1131
1132
    size_t x = 0;
1133
    for (; x < halfWidth - halfPad; x += 16)
1134
    {
1135
      const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
1136
      const __m128i unpackLow =
1137
          _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
1138
      {
1139
        const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
1140
        const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
1141
        const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
1142
        _mm_storeu_si128((__m128i*)&pU[2ULL * x], uHigh);
1143
        _mm_storeu_si128((__m128i*)&pU[2ULL * x + 16], uLow);
1144
        _mm_storeu_si128((__m128i*)&pU1[2ULL * x], uHigh);
1145
        _mm_storeu_si128((__m128i*)&pU1[2ULL * x + 16], uLow);
1146
      }
1147
      {
1148
        const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]);
1149
        const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
1150
        const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
1151
        _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
1152
        _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
1153
        _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
1154
        _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
1155
      }
1156
    }
1157
1158
    for (; x < halfWidth; x++)
1159
    {
1160
      const UINT32 val2x = 2 * x + evenX;
1161
      const UINT32 val2x1 = val2x + oddX;
1162
      pU[val2x] = Um[x];
1163
      pV[val2x] = Vm[x];
1164
      pU[val2x1] = Um[x];
1165
      pV[val2x1] = Vm[x];
1166
      pU1[val2x] = Um[x];
1167
      pV1[val2x] = Vm[x];
1168
      pU1[val2x1] = Um[x];
1169
      pV1[val2x1] = Vm[x];
1170
    }
1171
  }
1172
1173
  return PRIMITIVES_SUCCESS;
1174
}
1175
1176
static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2)
1177
{
1178
  const __m128i even = _mm_set_epi8((char)0x80, 14, (char)0x80, 12, (char)0x80, 10, (char)0x80, 8,
1179
                                    (char)0x80, 6, (char)0x80, 4, (char)0x80, 2, (char)0x80, 0);
1180
  const __m128i odd = _mm_set_epi8((char)0x80, 15, (char)0x80, 13, (char)0x80, 11, (char)0x80, 9,
1181
                                   (char)0x80, 7, (char)0x80, 5, (char)0x80, 3, (char)0x80, 1);
1182
  const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
1183
  const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst);
1184
  const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2);
1185
  const __m128i uEven = _mm_shuffle_epi8(u, even);
1186
  const __m128i uEven4 = _mm_slli_epi16(uEven, 2);
1187
  const __m128i uOdd = _mm_shuffle_epi8(u, odd);
1188
  const __m128i u1Even = _mm_shuffle_epi8(u1, even);
1189
  const __m128i u1Odd = _mm_shuffle_epi8(u1, odd);
1190
  const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even);
1191
  const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd);
1192
  const __m128i result = _mm_sub_epi16(uEven4, tmp2);
1193
  const __m128i packed = _mm_packus_epi16(result, uOdd);
1194
  const __m128i interleaved = _mm_shuffle_epi8(packed, interleave);
1195
  _mm_storeu_si128((__m128i*)pSrcDst, interleaved);
1196
}
1197
1198
static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
1199
                                    const RECTANGLE_16* WINPR_RESTRICT roi)
1200
{
1201
  const UINT32 oddY = 1;
1202
  const UINT32 evenY = 0;
1203
  const UINT32 nWidth = roi->right - roi->left;
1204
  const UINT32 nHeight = roi->bottom - roi->top;
1205
  const UINT32 halfHeight = (nHeight + 1) / 2;
1206
  const UINT32 halfWidth = (nWidth + 1) / 2;
1207
  const UINT32 halfPad = halfWidth % 16;
1208
1209
  /* Filter */
1210
  for (size_t y = roi->top; y < halfHeight + roi->top; y++)
1211
  {
1212
    size_t x = roi->left;
1213
    const UINT32 val2y = (y * 2 + evenY);
1214
    const UINT32 val2y1 = val2y + oddY;
1215
    BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
1216
    BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;
1217
    BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
1218
    BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
1219
1220
    if (val2y1 > nHeight)
1221
      continue;
1222
1223
    for (; x < halfWidth + roi->left - halfPad; x += 16)
1224
    {
1225
      ssse3_filter(&pU[2 * x], &pU1[2 * x]);
1226
      ssse3_filter(&pV[2 * x], &pV1[2 * x]);
1227
    }
1228
1229
    for (; x < halfWidth + roi->left; x++)
1230
    {
1231
      const UINT32 val2x = (x * 2);
1232
      const UINT32 val2x1 = val2x + 1;
1233
      const BYTE inU = pU[val2x];
1234
      const BYTE inV = pV[val2x];
1235
      const INT32 up = inU * 4;
1236
      const INT32 vp = inV * 4;
1237
      INT32 u2020 = 0;
1238
      INT32 v2020 = 0;
1239
1240
      if (val2x1 > nWidth)
1241
        continue;
1242
1243
      u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
1244
      v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
1245
      pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
1246
      pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
1247
    }
1248
  }
1249
1250
  return PRIMITIVES_SUCCESS;
1251
}
1252
1253
static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
1254
                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
1255
                                        const UINT32 dstStep[3],
1256
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
1257
{
1258
  const UINT32 mod = 16;
1259
  UINT32 uY = 0;
1260
  UINT32 vY = 0;
1261
  const UINT32 nWidth = roi->right - roi->left;
1262
  const UINT32 nHeight = roi->bottom - roi->top;
1263
  const UINT32 halfWidth = (nWidth + 1) / 2;
1264
  const UINT32 halfPad = halfWidth % 16;
1265
  const UINT32 halfHeight = (nHeight + 1) / 2;
1266
  const UINT32 oddY = 1;
1267
  const UINT32 evenY = 0;
1268
  const UINT32 oddX = 1;
1269
  /* The auxiliary frame is aligned to multiples of 16x16.
1270
   * We need the padded height for B4 and B5 conversion. */
1271
  const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
1272
  const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
1273
                        pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
1274
                        pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
1275
  BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
1276
                  pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
1277
                  pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
1278
  const __m128i zero = _mm_setzero_si128();
1279
  const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
1280
                                    (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
1281
1282
  /* The second half of U and V is a bit more tricky... */
1283
  /* B4 and B5 */
1284
  for (size_t y = 0; y < padHeigth; y++)
1285
  {
1286
    const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y;
1287
    BYTE* pX = NULL;
1288
1289
    if ((y) % mod < (mod + 1) / 2)
1290
    {
1291
      const UINT32 pos = (2 * uY++ + oddY);
1292
1293
      if (pos >= nHeight)
1294
        continue;
1295
1296
      pX = pDst[1] + 1ULL * dstStep[1] * pos;
1297
    }
1298
    else
1299
    {
1300
      const UINT32 pos = (2 * vY++ + oddY);
1301
1302
      if (pos >= nHeight)
1303
        continue;
1304
1305
      pX = pDst[2] + 1ULL * dstStep[2] * pos;
1306
    }
1307
1308
    memcpy(pX, Ya, nWidth);
1309
  }
1310
1311
  /* B6 and B7 */
1312
  for (size_t y = 0; y < halfHeight; y++)
1313
  {
1314
    const size_t val2y = (y * 2 + evenY);
1315
    const BYTE* Ua = pSrc[1] + srcStep[1] * y;
1316
    const BYTE* Va = pSrc[2] + srcStep[2] * y;
1317
    BYTE* pU = pDst[1] + dstStep[1] * val2y;
1318
    BYTE* pV = pDst[2] + dstStep[2] * val2y;
1319
1320
    size_t x = 0;
1321
    for (; x < halfWidth - halfPad; x += 16)
1322
    {
1323
      {
1324
        const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
1325
        const __m128i u2 = _mm_unpackhi_epi8(u, zero);
1326
        const __m128i u1 = _mm_unpacklo_epi8(u, zero);
1327
        _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
1328
        _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
1329
      }
1330
      {
1331
        const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]);
1332
        const __m128i u2 = _mm_unpackhi_epi8(u, zero);
1333
        const __m128i u1 = _mm_unpacklo_epi8(u, zero);
1334
        _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
1335
        _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
1336
      }
1337
    }
1338
1339
    for (; x < halfWidth; x++)
1340
    {
1341
      const UINT32 val2x1 = (x * 2 + oddX);
1342
      pU[val2x1] = Ua[x];
1343
      pV[val2x1] = Va[x];
1344
    }
1345
  }
1346
1347
  /* Filter */
1348
  return ssse3_ChromaFilter(pDst, dstStep, roi);
1349
}
1350
1351
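/* Scalar sketch (hypothetical helper, not part of this file) mirroring the per-pixel tail of
 * the B6/B7 loop above: the auxiliary chroma samples are placed in the odd columns of the
 * even destination rows, while the even columns are reconstructed afterwards by
 * ssse3_ChromaFilter. */
static void chroma_v1_odd_column_sketch(BYTE* WINPR_RESTRICT dst, const BYTE* WINPR_RESTRICT src,
                                        size_t halfWidth)
{
  for (size_t x = 0; x < halfWidth; x++)
    dst[2 * x + 1] = src[x]; /* only the odd destination columns are touched */
}
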
static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3],
                                        const UINT32 srcStep[3], UINT32 nTotalWidth,
                                        UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],
                                        const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
  const UINT32 nWidth = roi->right - roi->left;
  const UINT32 nHeight = roi->bottom - roi->top;
  const UINT32 halfWidth = (nWidth + 1) / 2;
  const UINT32 halfPad = halfWidth % 16;
  const UINT32 halfHeight = (nHeight + 1) / 2;
  const UINT32 quaterWidth = (nWidth + 3) / 4;
  const UINT32 quaterPad = quaterWidth % 16;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
                                    (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0);
  const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
                                     0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
  const __m128i shuffle1 =
      _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11,
                   (char)0x80, 10, (char)0x80, 9, (char)0x80, 8);
  const __m128i shuffle2 =
      _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3,
                   (char)0x80, 2, (char)0x80, 1, (char)0x80, 0);

  /* B4 and B5: odd UV values for width/2, height */
  for (size_t y = 0; y < nHeight; y++)
  {
    const size_t yTop = y + roi->top;
    const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
    const BYTE* pYaV = pYaU + nTotalWidth / 2;
    BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left;
    BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left;

    size_t x = 0;
    for (; x < halfWidth - halfPad; x += 16)
    {
      {
        const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
        const __m128i u2 = _mm_unpackhi_epi8(zero, u);
        const __m128i u1 = _mm_unpacklo_epi8(zero, u);
        _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
        _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
      }
      {
        const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
        const __m128i v2 = _mm_unpackhi_epi8(zero, v);
        const __m128i v1 = _mm_unpacklo_epi8(zero, v);
        _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
        _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
      }
    }

    for (; x < halfWidth; x++)
    {
      const UINT32 odd = 2 * x + 1;
      pU[odd] = pYaU[x];
      pV[odd] = pYaV[x];
    }
  }

  /* B6 - B9 */
  for (size_t y = 0; y < halfHeight; y++)
  {
    const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
    const BYTE* pUaV = pUaU + nTotalWidth / 4;
    const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
    const BYTE* pVaV = pVaU + nTotalWidth / 4;
    BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
    BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;

    UINT32 x = 0;
    for (; x < quaterWidth - quaterPad; x += 16)
    {
      {
        const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
        const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
        const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
        const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
        const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
        const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
        const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
        const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
        _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
        _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
        _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
        _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
      }
      {
        const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
        const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
        const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
        const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
        const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
        const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
        const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
        const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
        _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
        _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
        _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
        _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
      }
    }

    for (; x < quaterWidth; x++)
    {
      pU[4 * x + 0] = pUaU[x];
      pV[4 * x + 0] = pUaV[x];
      pU[4 * x + 2] = pVaU[x];
      pV[4 * x + 2] = pVaV[x];
    }
  }

  return ssse3_ChromaFilter(pDst, dstStep, roi);
}

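/* Worked example of the SIMD path above for one iteration (x = 0, names taken from the code):
 * uLow = _mm_unpacklo_epi8(uU, uV) interleaves the two quarter-width U columns, and the
 * shuffle2/shuffle1 controls spread those bytes to every second byte position, so after the
 * masked store the destination row holds
 *   pU[0] = pUaU[0], pU[2] = pVaU[0], pU[4] = pUaU[1], pU[6] = pVaU[1], ...
 * which is exactly what the scalar tail loop writes one sample at a time. The odd columns
 * written by the B4/B5 pass above are left untouched because mask2 only selects even bytes. */
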
static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
                                             const BYTE* const WINPR_RESTRICT pSrc[3],
                                             const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
                                             BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
                                             const RECTANGLE_16* WINPR_RESTRICT roi)
{
  if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
    return -1;

  if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
    return -1;

  if (!roi)
    return -1;

  switch (type)
  {
    case AVC444_LUMA:
      return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);

    case AVC444_CHROMAv1:
      return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);

    case AVC444_CHROMAv2:
      return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);

    default:
      return -1;
  }
}
#endif

/* Coverage note: in this report the initializer body and the #else branch carry hit counts
 * of 0, while the SSE2_ENABLED branch shows no counts at all. */
void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE2_ENABLED)
  generic = primitives_get_generic();
  primitives_init_YUV(prims);

  if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
      IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
  {
    WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
    prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
    prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV;
    prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2;
    prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
    prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
    prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
  }
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
  WINPR_UNUSED(prims);
#endif
}
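
/* Minimal usage sketch of the combine primitive registered above (a hypothetical caller,
 * not part of this file): the consumer fetches the primitives table and feeds the decoded
 * AVC444 main and auxiliary frames through YUV420CombineToYUV444, first for luma, then for
 * chroma. Declarations are assumed to come from <freerdp/primitives.h>; only the v1 chroma
 * type is shown, AVC444_CHROMAv2 takes the same arguments. */
static pstatus_t combine_example(const BYTE* const pMain[3], const UINT32 mainStep[3],
                                 const BYTE* const pAux[3], const UINT32 auxStep[3],
                                 BYTE* pYUV444[3], const UINT32 dstStep[3], UINT32 width,
                                 UINT32 height, const RECTANGLE_16* roi)
{
  primitives_t* prims = primitives_get(); /* returns the optimized table when available */
  const pstatus_t status = prims->YUV420CombineToYUV444(AVC444_LUMA, pMain, mainStep, width,
                                                        height, pYUV444, dstStep, roi);

  if (status != PRIMITIVES_SUCCESS)
    return status;

  return prims->YUV420CombineToYUV444(AVC444_CHROMAv1, pAux, auxStep, width, height, pYUV444,
                                      dstStep, roi);
}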