Coverage Report

Created: 2025-07-01 06:46

/src/FreeRDP/libfreerdp/primitives/sse/prim_YUV_sse4.1.c
All instrumented lines in this file have an execution count of 0 (no line of this file was executed).

/**
 * FreeRDP: A Remote Desktop Protocol Implementation
 * Optimized YUV/RGB conversion operations
 *
 * Copyright 2014 Thomas Erbesdobler
 * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
 * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
 * Copyright 2016-2017 Thincast Technologies GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <winpr/wtypes.h>
#include <freerdp/config.h>

#include <winpr/sysinfo.h>
#include <winpr/crt.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>

#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_YUV.h"

#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>

static primitives_t* generic = NULL;

/****************************************************************************/
/* sse41 YUV420 -> RGB conversion                                           */
/****************************************************************************/
static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
                                         __m128i Vraw, UINT8 pos)
{
  const __m128i mapY[] = { mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
                           mm_set_epu32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
                           mm_set_epu32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
                           mm_set_epu32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
  const __m128i mapUV[] = { mm_set_epu32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
                            mm_set_epu32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
                            mm_set_epu32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
                            mm_set_epu32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
  const __m128i mask[] = { mm_set_epu32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
                           mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
                           mm_set_epu32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
  const __m128i c128 = _mm_set1_epi16(128);
  __m128i BGRX = _mm_and_si128(LOAD_SI128(dst),
                               mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
  {
    __m128i C;
    __m128i D;
    __m128i E;
    /* Load Y values and expand to 32 bit */
    {
      C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */
    }
    /* Load U values and expand to 32 bit */
    {
      const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */
      D = _mm_sub_epi16(U, c128);                           /* D = U - 128 */
    }
    /* Load V values and expand to 32 bit */
    {
      const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */
      E = _mm_sub_epi16(V, c128);                           /* E = V - 128 */
    }
    /* Get the R value */
    {
      const __m128i c403 = _mm_set1_epi16(403);
      const __m128i e403 =
          _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
      const __m128i Rs = _mm_add_epi32(C, e403);
      const __m128i R32 = _mm_srai_epi32(Rs, 8);
      const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
      const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
      const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
      BGRX = _mm_or_si128(BGRX, packed);
    }
    /* Get the G value */
    {
      const __m128i c48 = _mm_set1_epi16(48);
      const __m128i d48 =
          _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
      const __m128i c120 = _mm_set1_epi16(120);
      const __m128i e120 =
          _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
      const __m128i de = _mm_add_epi32(d48, e120);
      const __m128i Gs = _mm_sub_epi32(C, de);
      const __m128i G32 = _mm_srai_epi32(Gs, 8);
      const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
      const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
      const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
      BGRX = _mm_or_si128(BGRX, packed);
    }
    /* Get the B value */
    {
      const __m128i c475 = _mm_set1_epi16(475);
      const __m128i d475 =
          _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
      const __m128i Bs = _mm_add_epi32(C, d475);
      const __m128i B32 = _mm_srai_epi32(Bs, 8);
      const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
      const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
      const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
      BGRX = _mm_or_si128(BGRX, packed);
    }
  }
  STORE_SI128(dst++, BGRX);
  return dst;
}
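
/* The fixed-point factors used above match the scalar formulas quoted before
 * sse41_yuv2r/g/b below and appear to approximate the full-range BT.709
 * inverse transform:
 *   403/256 ~ 1.574 (Cr -> R, spec 1.5748)    475/256 ~ 1.855 (Cb -> B, spec 1.8556)
 *    48/256 ~ 0.188 (Cb -> G, spec 0.1873)    120/256 ~ 0.469 (Cr -> G, spec 0.4681)
 * Sanity check: for Y = U = V = 128, D = E = 0, so R = G = B = (128*256)>>8 = 128
 * and neutral gray passes through unchanged. */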

static inline pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
                                               const UINT32* WINPR_RESTRICT srcStep,
                                               BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                               const prim_size_t* WINPR_RESTRICT roi)
{
  const UINT32 nWidth = roi->width;
  const UINT32 nHeight = roi->height;
  const UINT32 pad = roi->width % 16;
  const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);

  for (size_t y = 0; y < nHeight; y++)
  {
    __m128i* dst = (__m128i*)(pDst + dstStep * y);
    const BYTE* YData = pSrc[0] + y * srcStep[0];
    const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
    const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];

    for (UINT32 x = 0; x < nWidth - pad; x += 16)
    {
      const __m128i Y = LOAD_SI128(YData);
      const __m128i uRaw = LOAD_SI128(UData);
      const __m128i vRaw = LOAD_SI128(VData);
      const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
      const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
      YData += 16;
      UData += 8;
      VData += 8;
      dst = sse41_YUV444Pixel(dst, Y, U, V, 0);
      dst = sse41_YUV444Pixel(dst, Y, U, V, 1);
      dst = sse41_YUV444Pixel(dst, Y, U, V, 2);
      dst = sse41_YUV444Pixel(dst, Y, U, V, 3);
    }

    for (UINT32 x = 0; x < pad; x++)
    {
      const BYTE Y = *YData++;
      const BYTE U = *UData;
      const BYTE V = *VData;
      dst = (__m128i*)writeYUVPixel((BYTE*)dst, PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX);

      if (x % 2)
      {
        UData++;
        VData++;
      }
    }
  }

  return PRIMITIVES_SUCCESS;
}

static pstatus_t sse41_YUV420ToRGB(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                   BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
                                   const prim_size_t* WINPR_RESTRICT roi)
{
  switch (DstFormat)
  {
    case PIXEL_FORMAT_BGRX32:
    case PIXEL_FORMAT_BGRA32:
      return sse41_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);

    default:
      return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
  }
}

static inline void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
                                const BYTE* WINPR_RESTRICT pY[2], const BYTE* WINPR_RESTRICT pU[2],
                                const BYTE* WINPR_RESTRICT pV[2], BOOL filter)
{
  WINPR_ASSERT(pRGB);
  WINPR_ASSERT(pY);
  WINPR_ASSERT(pU);
  WINPR_ASSERT(pV);

  const UINT32 DstFormat = PIXEL_FORMAT_BGRX32;
  const UINT32 bpp = 4;

  for (size_t i = 0; i < 2; i++)
  {
    for (size_t j = 0; j < 2; j++)
    {
      const BYTE Y = pY[i][offset + j];
      BYTE U = pU[i][offset + j];
      BYTE V = pV[i][offset + j];
      if ((i == 0) && (j == 0) && filter)
      {
        const INT32 avgU =
            4 * pU[0][offset] - pU[0][offset + 1] - pU[1][offset] - pU[1][offset + 1];
        const INT32 avgV =
            4 * pV[0][offset] - pV[0][offset + 1] - pV[1][offset] - pV[1][offset + 1];

        U = CONDITIONAL_CLIP(avgU, pU[0][offset]);
        V = CONDITIONAL_CLIP(avgV, pV[0][offset]);
      }

      writeYUVPixel(&pRGB[i][(j + offset) * bpp], DstFormat, Y, U, V, writePixelBGRX);
    }
  }
}
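
/* The avgU/avgV expressions above appear to undo the 2x2 chroma averaging of
 * the main view: if Uavg = (U00 + U01 + U10 + U11) / 4 was transmitted at the
 * even position and the other three full-resolution samples arrive via the
 * auxiliary view, then 4*Uavg - U01 - U10 - U11 recovers U00 up to rounding.
 * CONDITIONAL_CLIP presumably keeps the transmitted value whenever the
 * reconstruction differs from it by less than the threshold used by
 * diffmask() in the vectorized path below. */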

/* Inputs are uint16_t vectors */
static inline __m128i sse41_yuv2x_single(const __m128i Y, __m128i U, __m128i V, const short iMulU,
                                         const short iMulV)
{
  const __m128i zero = _mm_set1_epi8(0);

  __m128i Ylo = _mm_unpacklo_epi16(Y, zero);
  __m128i Yhi = _mm_unpackhi_epi16(Y, zero);
  if (iMulU != 0)
  {
    const __m128i addX = _mm_set1_epi16(128);
    const __m128i D = _mm_sub_epi16(U, addX);
    const __m128i mulU = _mm_set1_epi16(iMulU);
    const __m128i mulDlo = _mm_mullo_epi16(D, mulU);
    const __m128i mulDhi = _mm_mulhi_epi16(D, mulU);
    const __m128i Dlo = _mm_unpacklo_epi16(mulDlo, mulDhi);
    Ylo = _mm_add_epi32(Ylo, Dlo);

    const __m128i Dhi = _mm_unpackhi_epi16(mulDlo, mulDhi);
    Yhi = _mm_add_epi32(Yhi, Dhi);
  }
  if (iMulV != 0)
  {
    const __m128i addX = _mm_set1_epi16(128);
    const __m128i E = _mm_sub_epi16(V, addX);
    const __m128i mul = _mm_set1_epi16(iMulV);
    const __m128i mulElo = _mm_mullo_epi16(E, mul);
    const __m128i mulEhi = _mm_mulhi_epi16(E, mul);
    const __m128i Elo = _mm_unpacklo_epi16(mulElo, mulEhi);
    const __m128i esumlo = _mm_add_epi32(Ylo, Elo);

    const __m128i Ehi = _mm_unpackhi_epi16(mulElo, mulEhi);
    const __m128i esumhi = _mm_add_epi32(Yhi, Ehi);
    Ylo = esumlo;
    Yhi = esumhi;
  }

  const __m128i rYlo = _mm_srai_epi32(Ylo, 8);
  const __m128i rYhi = _mm_srai_epi32(Yhi, 8);
  const __m128i rY = _mm_packs_epi32(rYlo, rYhi);
  return rY;
}

/* Inputs are uint8_t vectors */
static inline __m128i sse41_yuv2x(const __m128i Y, __m128i U, __m128i V, const short iMulU,
                                  const short iMulV)
{
  const __m128i zero = _mm_set1_epi8(0);

  /* Ylo = Y * 256
   * Ulo = uint8_t -> uint16_t
   * Vlo = uint8_t -> uint16_t
   */
  const __m128i Ylo = _mm_unpacklo_epi8(zero, Y);
  const __m128i Ulo = _mm_unpacklo_epi8(U, zero);
  const __m128i Vlo = _mm_unpacklo_epi8(V, zero);
  const __m128i preslo = sse41_yuv2x_single(Ylo, Ulo, Vlo, iMulU, iMulV);

  const __m128i Yhi = _mm_unpackhi_epi8(zero, Y);
  const __m128i Uhi = _mm_unpackhi_epi8(U, zero);
  const __m128i Vhi = _mm_unpackhi_epi8(V, zero);
  const __m128i preshi = sse41_yuv2x_single(Yhi, Uhi, Vhi, iMulU, iMulV);
  const __m128i res = _mm_packus_epi16(preslo, preshi);

  return res;
}

/* const INT32 r = ((256L * C(Y) + 0L * D(U) + 403L * E(V))) >> 8; */
static inline __m128i sse41_yuv2r(const __m128i Y, __m128i U, __m128i V)
{
  return sse41_yuv2x(Y, U, V, 0, 403);
}

/* const INT32 g = ((256L * C(Y) - 48L * D(U) - 120L * E(V))) >> 8; */
static inline __m128i sse41_yuv2g(const __m128i Y, __m128i U, __m128i V)
{
  return sse41_yuv2x(Y, U, V, -48, -120);
}

/* const INT32 b = ((256L * C(Y) + 475L * D(U) + 0L * E(V))) >> 8; */
static inline __m128i sse41_yuv2b(const __m128i Y, __m128i U, __m128i V)
{
  return sse41_yuv2x(Y, U, V, 475, 0);
}

static inline void sse41_BGRX_fillRGB_pixel(BYTE* WINPR_RESTRICT pRGB, __m128i Y, __m128i U,
                                            __m128i V)
{
  const __m128i zero = _mm_set1_epi8(0);
  /* Y * 256 */
  const __m128i r = sse41_yuv2r(Y, U, V);
  const __m128i rx[2] = { _mm_unpackhi_epi8(r, zero), _mm_unpacklo_epi8(r, zero) };

  const __m128i g = sse41_yuv2g(Y, U, V);
  const __m128i b = sse41_yuv2b(Y, U, V);

  const __m128i bg[2] = { _mm_unpackhi_epi8(b, g), _mm_unpacklo_epi8(b, g) };

  const __m128i mask = mm_set_epu8(0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF,
                                   0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF);

  __m128i* rgb = (__m128i*)pRGB;
  const __m128i bgrx0 = _mm_unpacklo_epi16(bg[1], rx[1]);
  _mm_maskmoveu_si128(bgrx0, mask, (char*)&rgb[0]);
  const __m128i bgrx1 = _mm_unpackhi_epi16(bg[1], rx[1]);
  _mm_maskmoveu_si128(bgrx1, mask, (char*)&rgb[1]);
  const __m128i bgrx2 = _mm_unpacklo_epi16(bg[0], rx[0]);
  _mm_maskmoveu_si128(bgrx2, mask, (char*)&rgb[2]);
  const __m128i bgrx3 = _mm_unpackhi_epi16(bg[0], rx[0]);
  _mm_maskmoveu_si128(bgrx3, mask, (char*)&rgb[3]);
}

static inline __m128i odd1sum(__m128i u1)
{
  const __m128i zero = _mm_set1_epi8(0);
  const __m128i u1hi = _mm_unpackhi_epi8(u1, zero);
  const __m128i u1lo = _mm_unpacklo_epi8(u1, zero);
  return _mm_hadds_epi16(u1lo, u1hi);
}

static inline __m128i odd0sum(__m128i u0, __m128i u1sum)
{
  /* Mask out even bytes, extend uint8_t to uint16_t by filling in zero bytes,
   * horizontally add the values */
  const __m128i mask = mm_set_epu8(0x80, 0x0F, 0x80, 0x0D, 0x80, 0x0B, 0x80, 0x09, 0x80, 0x07,
                                   0x80, 0x05, 0x80, 0x03, 0x80, 0x01);
  const __m128i u0odd = _mm_shuffle_epi8(u0, mask);
  return _mm_adds_epi16(u1sum, u0odd);
}

static inline __m128i calcavg(__m128i u0even, __m128i sum)
{
  const __m128i u4zero = _mm_slli_epi16(u0even, 2);
  const __m128i uavg = _mm_sub_epi16(u4zero, sum);
  const __m128i zero = _mm_set1_epi8(0);
  const __m128i savg = _mm_packus_epi16(uavg, zero);
  const __m128i smask = mm_set_epu8(0x80, 0x07, 0x80, 0x06, 0x80, 0x05, 0x80, 0x04, 0x80, 0x03,
                                    0x80, 0x02, 0x80, 0x01, 0x80, 0x00);
  return _mm_shuffle_epi8(savg, smask);
}

static inline __m128i diffmask(__m128i avg, __m128i u0even)
{
  /* Differences >= 30 select the avg value.
   * Use int16 for the calculation to avoid issues with signed 8-bit integers. */
  const __m128i diff = _mm_subs_epi16(u0even, avg);
  const __m128i absdiff = _mm_abs_epi16(diff);
  const __m128i val30 = _mm_set1_epi16(30);
  return _mm_cmplt_epi16(absdiff, val30);
}

static inline void sse41_filter(__m128i pU[2])
{
  const __m128i u1sum = odd1sum(pU[1]);
  const __m128i sum = odd0sum(pU[0], u1sum);

  /* Mask out the odd bytes. Nothing more is needed to widen uint8_t to uint16_t */
  const __m128i emask = mm_set_epu8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
                                    0x00, 0xff, 0x00, 0xff, 0x00, 0xff);
  const __m128i u0even = _mm_and_si128(pU[0], emask);
  const __m128i avg = calcavg(u0even, sum);
  const __m128i umask = diffmask(avg, u0even);

  const __m128i u0orig = _mm_and_si128(u0even, umask);
  const __m128i u0avg = _mm_andnot_si128(umask, avg);
  const __m128i evenresult = _mm_or_si128(u0orig, u0avg);
  const __m128i omask = mm_set_epu8(0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
                                    0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00);
  const __m128i u0odd = _mm_and_si128(pU[0], omask);
  const __m128i result = _mm_or_si128(evenresult, u0odd);
  pU[0] = result;
}
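
/* Sketch of what sse41_filter() computes per 16-byte chroma vector, assuming
 * pU[0] is the current row and pU[1] the row below, with even bytes holding
 * the samples at column 2x and odd bytes those at 2x+1:
 *   sum = u0[2x+1] + u1[2x] + u1[2x+1]   (odd0sum/odd1sum)
 *   avg = 4*u0[2x] - sum                 (calcavg)
 * i.e. the vector form of the avgU/avgV reconstruction in BGRX_fillRGB().
 * diffmask() keeps the original even-column sample where |avg - orig| < 30
 * and substitutes avg otherwise; odd-column samples pass through unchanged. */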

static inline void sse41_BGRX_fillRGB(BYTE* WINPR_RESTRICT pRGB[2], const __m128i pY[2],
                                      __m128i pU[2], __m128i pV[2])
{
  WINPR_ASSERT(pRGB);
  WINPR_ASSERT(pY);
  WINPR_ASSERT(pU);
  WINPR_ASSERT(pV);

  sse41_filter(pU);
  sse41_filter(pV);

  for (size_t i = 0; i < 2; i++)
  {
    sse41_BGRX_fillRGB_pixel(pRGB[i], pY[i], pU[i], pV[i]);
  }
}

static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(
    BYTE* WINPR_RESTRICT pDst[2], const BYTE* WINPR_RESTRICT YData[2],
    const BYTE* WINPR_RESTRICT UData[2], const BYTE* WINPR_RESTRICT VData[2], UINT32 nWidth)
{
  WINPR_ASSERT((nWidth % 2) == 0);
  const UINT32 pad = nWidth % 16;

  size_t x = 0;
  for (; x < nWidth - pad; x += 16)
  {
    const __m128i Y[] = { LOAD_SI128(&YData[0][x]), LOAD_SI128(&YData[1][x]) };
    __m128i U[] = { LOAD_SI128(&UData[0][x]), LOAD_SI128(&UData[1][x]) };
    __m128i V[] = { LOAD_SI128(&VData[0][x]), LOAD_SI128(&VData[1][x]) };

    BYTE* dstp[] = { &pDst[0][x * 4], &pDst[1][x * 4] };
    sse41_BGRX_fillRGB(dstp, Y, U, V);
  }

  for (; x < nWidth; x += 2)
  {
    BGRX_fillRGB(x, pDst, YData, UData, VData, TRUE);
  }

  return PRIMITIVES_SUCCESS;
}

static inline void BGRX_fillRGB_single(size_t offset, BYTE* WINPR_RESTRICT pRGB,
                                       const BYTE* WINPR_RESTRICT pY, const BYTE* WINPR_RESTRICT pU,
                                       const BYTE* WINPR_RESTRICT pV, WINPR_ATTR_UNUSED BOOL filter)
{
  WINPR_ASSERT(pRGB);
  WINPR_ASSERT(pY);
  WINPR_ASSERT(pU);
  WINPR_ASSERT(pV);

  const UINT32 bpp = 4;

  for (size_t j = 0; j < 2; j++)
  {
    const BYTE Y = pY[offset + j];
    BYTE U = pU[offset + j];
    BYTE V = pV[offset + j];

    writeYUVPixel(&pRGB[(j + offset) * bpp], PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX);
  }
}

static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_SINGLE_ROW(
    BYTE* WINPR_RESTRICT pDst, const BYTE* WINPR_RESTRICT YData, const BYTE* WINPR_RESTRICT UData,
    const BYTE* WINPR_RESTRICT VData, UINT32 nWidth)
{
  WINPR_ASSERT((nWidth % 2) == 0);

  for (size_t x = 0; x < nWidth; x += 2)
  {
    BGRX_fillRGB_single(x, pDst, YData, UData, VData, TRUE);
  }

  return PRIMITIVES_SUCCESS;
}

static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
                                                         const UINT32 srcStep[],
                                                         BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                                         const prim_size_t* WINPR_RESTRICT roi)
{
  const UINT32 nWidth = roi->width;
  const UINT32 nHeight = roi->height;

  size_t y = 0;
  for (; y < nHeight - nHeight % 2; y += 2)
  {
    BYTE* dst[] = { (pDst + dstStep * y), (pDst + dstStep * (y + 1)) };
    const BYTE* YData[] = { pSrc[0] + y * srcStep[0], pSrc[0] + (y + 1) * srcStep[0] };
    const BYTE* UData[] = { pSrc[1] + y * srcStep[1], pSrc[1] + (y + 1) * srcStep[1] };
    const BYTE* VData[] = { pSrc[2] + y * srcStep[2], pSrc[2] + (y + 1) * srcStep[2] };

    const pstatus_t rc =
        sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(dst, YData, UData, VData, nWidth);
    if (rc != PRIMITIVES_SUCCESS)
      return rc;
  }
  for (; y < nHeight; y++)
  {
    BYTE* dst = (pDst + dstStep * y);
    const BYTE* YData = pSrc[0] + y * srcStep[0];
    const BYTE* UData = pSrc[1] + y * srcStep[1];
    const BYTE* VData = pSrc[2] + y * srcStep[2];
    const pstatus_t rc =
        sse41_YUV444ToRGB_8u_P3AC4R_BGRX_SINGLE_ROW(dst, YData, UData, VData, nWidth);
    if (rc != PRIMITIVES_SUCCESS)
      return rc;
  }

  return PRIMITIVES_SUCCESS;
}

static pstatus_t sse41_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[],
                                             const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
                                             UINT32 dstStep, UINT32 DstFormat,
                                             const prim_size_t* WINPR_RESTRICT roi)
{
  switch (DstFormat)
  {
    case PIXEL_FORMAT_BGRX32:
    case PIXEL_FORMAT_BGRA32:
      return sse41_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

    default:
      return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
  }
}

/****************************************************************************/
/* sse41 RGB -> YUV420 conversion                                          **/
/****************************************************************************/

/**
 * Note (nfedera):
 * The forward transformation factors used from RGB to YUV are based on the
 * values specified in [Rec. ITU-R BT.709-6] Section 3:
 * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en
 *
 * Y =  0.21260 * R + 0.71520 * G + 0.07220 * B +   0;
 * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128;
 * V =  0.50000 * R - 0.45415 * G - 0.04585 * B + 128;
 *
 * The most accurate integer arithmetic approximation when using 8-bit signed
 * integer factors with 16-bit signed integer intermediate results is:
 *
 * Y = ( (  27 * R +  92 * G +   9 * B) >> 7 );
 * U = ( ( -29 * R -  99 * G + 128 * B) >> 8 ) + 128;
 * V = ( ( 128 * R - 116 * G -  12 * B) >> 8 ) + 128;
 *
 * Because the signed 8-bit range is [-128,127], the U and V factors of 128
 * are rounded down to 127.
 */
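
/* Quick sanity check of the integer factors above: the Y factors sum to
 * 27 + 92 + 9 = 128 = 2^7, so white (R = G = B = 255) maps to
 * (128*255) >> 7 = 255. The U factors sum to -29 - 99 + 128 = 0 and the
 * V factors to 128 - 116 - 12 = 0, so any neutral gray maps to
 * U = V = 0 + 128 = 128, i.e. zero chroma, as expected. */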

#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
#define BGRX_U_FACTORS \
  _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
#define BGRX_V_FACTORS \
  _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
#define CONST128_FACTORS _mm_set1_epi8(-128)

#define Y_SHIFT 7
#define U_SHIFT 8
#define V_SHIFT 8

/*
TODO:
RGB[AX] could simply be supported using the following factors. Instead of loading the
globals directly, the functions below could be passed pointers to the correct vectors,
depending on the source picture format.

PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
      27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0
};
PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
     -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0
};
PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
      64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0
};
*/
583
static inline void sse41_BGRX_TO_YUV(const BYTE* WINPR_RESTRICT pLine1, BYTE* WINPR_RESTRICT pYLine,
584
                                     BYTE* WINPR_RESTRICT pULine, BYTE* WINPR_RESTRICT pVLine)
585
0
{
586
0
  const BYTE r1 = pLine1[2];
587
0
  const BYTE g1 = pLine1[1];
588
0
  const BYTE b1 = pLine1[0];
589
590
0
  if (pYLine)
591
0
    pYLine[0] = RGB2Y(r1, g1, b1);
592
0
  if (pULine)
593
0
    pULine[0] = RGB2U(r1, g1, b1);
594
0
  if (pVLine)
595
0
    pVLine[0] = RGB2V(r1, g1, b1);
596
0
}
597
598
/* compute the luma (Y) component from a single rgb source line */
599
600
static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
601
0
{
602
0
  const __m128i y_factors = BGRX_Y_FACTORS;
603
0
  const __m128i* argb = (const __m128i*)src;
604
0
  __m128i* ydst = (__m128i*)dst;
605
606
0
  UINT32 x = 0;
607
608
0
  for (; x < width - width % 16; x += 16)
609
0
  {
610
    /* store 16 rgba pixels in 4 128 bit registers */
611
0
    __m128i x0 = LOAD_SI128(argb++); // 1st 4 pixels
612
0
    {
613
0
      x0 = _mm_maddubs_epi16(x0, y_factors);
614
615
0
      __m128i x1 = LOAD_SI128(argb++); // 2nd 4 pixels
616
0
      x1 = _mm_maddubs_epi16(x1, y_factors);
617
0
      x0 = _mm_hadds_epi16(x0, x1);
618
0
      x0 = _mm_srli_epi16(x0, Y_SHIFT);
619
0
    }
620
621
0
    __m128i x2 = LOAD_SI128(argb++); // 3rd 4 pixels
622
0
    {
623
0
      x2 = _mm_maddubs_epi16(x2, y_factors);
624
625
0
      __m128i x3 = LOAD_SI128(argb++); // 4th 4 pixels
626
0
      x3 = _mm_maddubs_epi16(x3, y_factors);
627
0
      x2 = _mm_hadds_epi16(x2, x3);
628
0
      x2 = _mm_srli_epi16(x2, Y_SHIFT);
629
0
    }
630
631
0
    x0 = _mm_packus_epi16(x0, x2);
632
    /* save to y plane */
633
0
    STORE_SI128(ydst++, x0);
634
0
  }
635
636
0
  for (; x < width; x++)
637
0
  {
638
0
    sse41_BGRX_TO_YUV(&src[4ULL * x], &dst[x], NULL, NULL);
639
0
  }
640
0
}
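
/* Worked example for one white BGRX pixel (B = G = R = 255), assuming the
 * documented _mm_maddubs_epi16/_mm_hadds_epi16 semantics: maddubs forms the
 * saturated int16 pairs B*9 + G*92 = 25755 and R*27 + X*0 = 6885, hadds adds
 * them to 32640, and 32640 >> 7 = 255 -- the expected luma for white. */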

/* compute the chrominance (UV) components from two rgb source lines */

static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
                                             const BYTE* WINPR_RESTRICT src2,
                                             BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
                                             UINT32 width)
{
  const __m128i u_factors = BGRX_U_FACTORS;
  const __m128i v_factors = BGRX_V_FACTORS;
  const __m128i vector128 = CONST128_FACTORS;

  size_t x = 0;

  for (; x < width - width % 16; x += 16)
  {
    const __m128i* rgb1 = (const __m128i*)&src1[4ULL * x];
    const __m128i* rgb2 = (const __m128i*)&src2[4ULL * x];
    __m64* udst = (__m64*)&dst1[x / 2];
    __m64* vdst = (__m64*)&dst2[x / 2];

    /* subsample 16x2 pixels into 16x1 pixels */
    __m128i x0 = LOAD_SI128(&rgb1[0]);
    __m128i x4 = LOAD_SI128(&rgb2[0]);
    x0 = _mm_avg_epu8(x0, x4);

    __m128i x1 = LOAD_SI128(&rgb1[1]);
    x4 = LOAD_SI128(&rgb2[1]);
    x1 = _mm_avg_epu8(x1, x4);

    __m128i x2 = LOAD_SI128(&rgb1[2]);
    x4 = LOAD_SI128(&rgb2[2]);
    x2 = _mm_avg_epu8(x2, x4);

    __m128i x3 = LOAD_SI128(&rgb1[3]);
    x4 = LOAD_SI128(&rgb2[3]);
    x3 = _mm_avg_epu8(x3, x4);

    /* subsample these 16x1 pixels into 8x1 pixels */
    /**
     * shuffle controls
     * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88
     * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd
     */
    x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
    x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
    x0 = _mm_avg_epu8(x0, x4);
    x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
    x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
    x1 = _mm_avg_epu8(x1, x4);
    /* multiplications and subtotals */
    x2 = _mm_maddubs_epi16(x0, u_factors);
    x3 = _mm_maddubs_epi16(x1, u_factors);
    x4 = _mm_maddubs_epi16(x0, v_factors);
    __m128i x5 = _mm_maddubs_epi16(x1, v_factors);
    /* the total sums */
    x0 = _mm_hadd_epi16(x2, x3);
    x1 = _mm_hadd_epi16(x4, x5);
    /* shift the results */
    x0 = _mm_srai_epi16(x0, U_SHIFT);
    x1 = _mm_srai_epi16(x1, V_SHIFT);
    /* pack the 16 words into bytes */
    x0 = _mm_packs_epi16(x0, x1);
    /* add 128 (by subtracting -128) */
    x0 = _mm_sub_epi8(x0, vector128);
    /* the lower 8 bytes go to the u plane */
    _mm_storel_pi(udst, _mm_castsi128_ps(x0));
    /* the upper 8 bytes go to the v plane */
    _mm_storeh_pi(vdst, _mm_castsi128_ps(x0));
  }

  for (; x < width - width % 2; x += 2)
  {
    BYTE u[4] = { 0 };
    BYTE v[4] = { 0 };
    sse41_BGRX_TO_YUV(&src1[4ULL * x], NULL, &u[0], &v[0]);
    sse41_BGRX_TO_YUV(&src1[4ULL * (1ULL + x)], NULL, &u[1], &v[1]);
    sse41_BGRX_TO_YUV(&src2[4ULL * x], NULL, &u[2], &v[2]);
    sse41_BGRX_TO_YUV(&src2[4ULL * (1ULL + x)], NULL, &u[3], &v[3]);
    const INT16 u4 = WINPR_ASSERTING_INT_CAST(INT16, (INT16)u[0] + u[1] + u[2] + u[3]);
    const INT16 uu = WINPR_ASSERTING_INT_CAST(INT16, u4 / 4);
    const BYTE u8 = CLIP(uu);
    dst1[x / 2] = u8;

    const INT16 v4 = WINPR_ASSERTING_INT_CAST(INT16, (INT16)v[0] + v[1] + v[2] + v[3]);
    const INT16 vu = WINPR_ASSERTING_INT_CAST(INT16, v4 / 4);
    const BYTE v8 = CLIP(vu);
    dst2[x / 2] = v8;
  }
}
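
/* In the vector path above, the 2x2 subsampling is done in two steps: the
 * vertical _mm_avg_epu8 averages the two rows, then the 0x88/0xdd shuffle_ps
 * pair splits even- and odd-column pixels so a second _mm_avg_epu8 averages
 * horizontal neighbours. The remaining maddubs/hadd/shift/pack sequence is
 * the same fixed-point dot product used for the Y plane, with the 128 bias
 * added by subtracting CONST128_FACTORS. */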

static pstatus_t sse41_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                        BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
  if (roi->height < 1 || roi->width < 1)
  {
    return !PRIMITIVES_SUCCESS;
  }

  size_t y = 0;
  for (; y < roi->height - roi->height % 2; y += 2)
  {
    const BYTE* line1 = &pSrc[y * srcStep];
    const BYTE* line2 = &pSrc[(1ULL + y) * srcStep];
    BYTE* ydst1 = &pDst[0][y * dstStep[0]];
    BYTE* ydst2 = &pDst[0][(1ULL + y) * dstStep[0]];
    BYTE* udst = &pDst[1][y / 2 * dstStep[1]];
    BYTE* vdst = &pDst[2][y / 2 * dstStep[2]];

    sse41_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
    sse41_RGBToYUV420_BGRX_Y(line1, ydst1, roi->width);
    sse41_RGBToYUV420_BGRX_Y(line2, ydst2, roi->width);
  }

  for (; y < roi->height; y++)
  {
    const BYTE* line = &pSrc[y * srcStep];
    BYTE* ydst = &pDst[0][1ULL * y * dstStep[0]];
    sse41_RGBToYUV420_BGRX_Y(line, ydst, roi->width);
  }

  return PRIMITIVES_SUCCESS;
}

static pstatus_t sse41_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                   UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
                                   const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
{
  switch (srcFormat)
  {
    case PIXEL_FORMAT_BGRX32:
    case PIXEL_FORMAT_BGRA32:
      return sse41_RGBToYUV420_BGRX(pSrc, srcStep, pDst, dstStep, roi);

    default:
      return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
  }
}

/****************************************************************************/
/* sse41 RGB -> AVC444-YUV conversion                                      **/
/****************************************************************************/

static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
    BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
    BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
    BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
{
  const __m128i* argbEven = (const __m128i*)srcEven;
  const __m128i* argbOdd = (const __m128i*)srcOdd;
  const __m128i y_factors = BGRX_Y_FACTORS;
  const __m128i u_factors = BGRX_U_FACTORS;
  const __m128i v_factors = BGRX_V_FACTORS;
  const __m128i vector128 = CONST128_FACTORS;

  UINT32 x = 0;
  for (; x < width - width % 16; x += 16)
  {
    /* store 16 rgba pixels in 4 128 bit registers */
    const __m128i xe1 = LOAD_SI128(argbEven++); // 1st 4 pixels
    const __m128i xe2 = LOAD_SI128(argbEven++); // 2nd 4 pixels
    const __m128i xe3 = LOAD_SI128(argbEven++); // 3rd 4 pixels
    const __m128i xe4 = LOAD_SI128(argbEven++); // 4th 4 pixels
    const __m128i xo1 = LOAD_SI128(argbOdd++);  // 1st 4 pixels
    const __m128i xo2 = LOAD_SI128(argbOdd++);  // 2nd 4 pixels
    const __m128i xo3 = LOAD_SI128(argbOdd++);  // 3rd 4 pixels
    const __m128i xo4 = LOAD_SI128(argbOdd++);  // 4th 4 pixels
    {
      /* Y: multiplications with subtotals and horizontal sums */
      const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
                                                        _mm_maddubs_epi16(xe2, y_factors)),
                                         Y_SHIFT);
      const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
                                                        _mm_maddubs_epi16(xe4, y_factors)),
                                         Y_SHIFT);
      const __m128i ye = _mm_packus_epi16(ye1, ye2);
      const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
                                                        _mm_maddubs_epi16(xo2, y_factors)),
                                         Y_SHIFT);
      const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
                                                        _mm_maddubs_epi16(xo4, y_factors)),
                                         Y_SHIFT);
      const __m128i yo = _mm_packus_epi16(yo1, yo2);
      /* store y [b1] */
      STORE_SI128(b1Even, ye);
      b1Even += 16;

      if (b1Odd)
      {
        STORE_SI128(b1Odd, yo);
        b1Odd += 16;
      }
    }
    {
      /* We now have
       * 16 even U values in ue
       * 16 odd U values in uo
       *
       * We need to split these according to
       * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
      __m128i ue;
      __m128i uo = { 0 };
      {
        const __m128i ue1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
                                          _mm_maddubs_epi16(xe2, u_factors)),
                           U_SHIFT);
        const __m128i ue2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
                                          _mm_maddubs_epi16(xe4, u_factors)),
                           U_SHIFT);
        ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
      }

      if (b1Odd)
      {
        const __m128i uo1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
                                          _mm_maddubs_epi16(xo2, u_factors)),
                           U_SHIFT);
        const __m128i uo2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
                                          _mm_maddubs_epi16(xo4, u_factors)),
                           U_SHIFT);
        uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
      }

      /* Now we need the following storage distribution:
       * 2x   2y    -> b2
       * x    2y+1  -> b4
       * 2x+1 2y    -> b6 */
      if (b1Odd) /* b2 */
      {
        const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
        const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
        const __m128i hi = _mm_add_epi16(ueh, uoh);
        const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
        const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
        const __m128i lo = _mm_add_epi16(uel, uol);
        const __m128i added = _mm_hadd_epi16(lo, hi);
        const __m128i avg16 = _mm_srai_epi16(added, 2);
        const __m128i avg = _mm_packus_epi16(avg16, avg16);
        _mm_storel_epi64((__m128i*)b2, avg);
      }
      else
      {
        const __m128i mask =
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                         (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
        const __m128i ud = _mm_shuffle_epi8(ue, mask);
        _mm_storel_epi64((__m128i*)b2, ud);
      }

      b2 += 8;

      if (b1Odd) /* b4 */
      {
        STORE_SI128(b4, uo);
        b4 += 16;
      }

      {
        /* b6 */
        const __m128i mask =
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
        const __m128i ude = _mm_shuffle_epi8(ue, mask);
        _mm_storel_epi64((__m128i*)b6, ude);
        b6 += 8;
      }
    }
    {
      /* We now have
       * 16 even V values in ve
       * 16 odd V values in vo
       *
       * We need to split these according to
       * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
      __m128i ve;
      __m128i vo = { 0 };
      {
        const __m128i ve1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
                                          _mm_maddubs_epi16(xe2, v_factors)),
                           V_SHIFT);
        const __m128i ve2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
                                          _mm_maddubs_epi16(xe4, v_factors)),
                           V_SHIFT);
        ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
      }

      if (b1Odd)
      {
        const __m128i vo1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
                                          _mm_maddubs_epi16(xo2, v_factors)),
                           V_SHIFT);
        const __m128i vo2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
                                          _mm_maddubs_epi16(xo4, v_factors)),
                           V_SHIFT);
        vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
      }

      /* Now we need the following storage distribution:
       * 2x   2y    -> b3
       * x    2y+1  -> b5
       * 2x+1 2y    -> b7 */
      if (b1Odd) /* b3 */
      {
        const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
        const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
        const __m128i hi = _mm_add_epi16(veh, voh);
        const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
        const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
        const __m128i lo = _mm_add_epi16(vel, vol);
        const __m128i added = _mm_hadd_epi16(lo, hi);
        const __m128i avg16 = _mm_srai_epi16(added, 2);
        const __m128i avg = _mm_packus_epi16(avg16, avg16);
        _mm_storel_epi64((__m128i*)b3, avg);
      }
      else
      {
        const __m128i mask =
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                         (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
        const __m128i vd = _mm_shuffle_epi8(ve, mask);
        _mm_storel_epi64((__m128i*)b3, vd);
      }

      b3 += 8;

      if (b1Odd) /* b5 */
      {
        STORE_SI128(b5, vo);
        b5 += 16;
      }

      {
        /* b7 */
        const __m128i mask =
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
        const __m128i vde = _mm_shuffle_epi8(ve, mask);
        _mm_storel_epi64((__m128i*)b7, vde);
        b7 += 8;
      }
    }
  }

  general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6,
                                         b7, width);
}
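
/* Summarizing the storage-distribution comments above (3.3.8.3.2): the main
 * view receives the full-resolution luma (b1) and the 2x2-averaged chroma
 * (b2/b3), while the auxiliary view carries the full-resolution chroma of the
 * odd rows (b4/b5) and the odd-column chroma of the even rows (b6/b7), which
 * appears to let the decoder reassemble YUV444 chroma up to rounding. */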
997
998
static pstatus_t sse41_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc,
999
                                           WINPR_ATTR_UNUSED UINT32 srcFormat, UINT32 srcStep,
1000
                                           BYTE* WINPR_RESTRICT pDst1[], const UINT32 dst1Step[],
1001
                                           BYTE* WINPR_RESTRICT pDst2[], const UINT32 dst2Step[],
1002
                                           const prim_size_t* WINPR_RESTRICT roi)
1003
0
{
1004
0
  if (roi->height < 1 || roi->width < 1)
1005
0
    return !PRIMITIVES_SUCCESS;
1006
1007
0
  size_t y = 0;
1008
0
  for (; y < roi->height - roi->height % 2; y += 2)
1009
0
  {
1010
0
    const BYTE* srcEven = pSrc + y * srcStep;
1011
0
    const BYTE* srcOdd = pSrc + (y + 1) * srcStep;
1012
0
    const size_t i = y >> 1;
1013
0
    const size_t n = (i & (size_t)~7) + i;
1014
0
    BYTE* b1Even = pDst1[0] + y * dst1Step[0];
1015
0
    BYTE* b1Odd = (b1Even + dst1Step[0]);
1016
0
    BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
1017
0
    BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
1018
0
    BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n;
1019
0
    BYTE* b5 = b4 + 8ULL * dst2Step[0];
1020
0
    BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
1021
0
    BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
1022
0
    sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
1023
0
                                         roi->width);
1024
0
  }
1025
1026
0
  for (; y < roi->height; y++)
1027
0
  {
1028
0
    const BYTE* srcEven = pSrc + y * srcStep;
1029
0
    BYTE* b1Even = pDst1[0] + y * dst1Step[0];
1030
0
    BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
1031
0
    BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
1032
0
    BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
1033
0
    BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
1034
0
    general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(0, srcEven, NULL, b1Even, NULL, b2, b3, NULL, NULL,
1035
0
                                           b6, b7, roi->width);
1036
0
  }
1037
1038
0
  return PRIMITIVES_SUCCESS;
1039
0
}
1040
1041
static pstatus_t sse41_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
1042
                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
1043
                                      const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
1044
                                      const UINT32 dst2Step[],
1045
                                      const prim_size_t* WINPR_RESTRICT roi)
1046
0
{
1047
0
  switch (srcFormat)
1048
0
  {
1049
0
    case PIXEL_FORMAT_BGRX32:
1050
0
    case PIXEL_FORMAT_BGRA32:
1051
0
      return sse41_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1052
0
                                       dst2Step, roi);
1053
1054
0
    default:
1055
0
      return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1056
0
                                     dst2Step, roi);
1057
0
  }
1058
0
}
1059
1060
/* Mapping of arguments:
1061
 *
1062
 * b1 [even lines] -> yLumaDstEven
1063
 * b1 [odd lines]  -> yLumaDstOdd
1064
 * b2              -> uLumaDst
1065
 * b3              -> vLumaDst
1066
 * b4              -> yChromaDst1
1067
 * b5              -> yChromaDst2
1068
 * b6              -> uChromaDst1
1069
 * b7              -> uChromaDst2
1070
 * b8              -> vChromaDst1
1071
 * b9              -> vChromaDst2
1072
 */
1073
static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
1074
    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
1075
    BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
1076
    BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
1077
    BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
1078
    BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
1079
    BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
1080
    BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
1081
0
{
1082
0
  const __m128i vector128 = CONST128_FACTORS;
1083
0
  const __m128i* argbEven = (const __m128i*)srcEven;
1084
0
  const __m128i* argbOdd = (const __m128i*)srcOdd;
1085
1086
0
  UINT32 x = 0;
1087
0
  for (; x < width - width % 16; x += 16)
1088
0
  {
1089
    /* store 16 rgba pixels in 4 128 bit registers
1090
     * for even and odd rows.
1091
     */
1092
0
    const __m128i xe1 = LOAD_SI128(argbEven++); /* 1st 4 pixels */
1093
0
    const __m128i xe2 = LOAD_SI128(argbEven++); /* 2nd 4 pixels */
1094
0
    const __m128i xe3 = LOAD_SI128(argbEven++); /* 3rd 4 pixels */
1095
0
    const __m128i xe4 = LOAD_SI128(argbEven++); /* 4th 4 pixels */
1096
0
    const __m128i xo1 = LOAD_SI128(argbOdd++);  /* 1st 4 pixels */
1097
0
    const __m128i xo2 = LOAD_SI128(argbOdd++);  /* 2nd 4 pixels */
1098
0
    const __m128i xo3 = LOAD_SI128(argbOdd++);  /* 3rd 4 pixels */
1099
0
    const __m128i xo4 = LOAD_SI128(argbOdd++);  /* 4th 4 pixels */
1100
0
    {
1101
      /* Y: multiplications with subtotals and horizontal sums */
1102
0
      const __m128i y_factors = BGRX_Y_FACTORS;
1103
0
      const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
1104
0
                                                        _mm_maddubs_epi16(xe2, y_factors)),
1105
0
                                         Y_SHIFT);
1106
0
      const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
1107
0
                                                        _mm_maddubs_epi16(xe4, y_factors)),
1108
0
                                         Y_SHIFT);
1109
0
      const __m128i ye = _mm_packus_epi16(ye1, ye2);
1110
      /* store y [b1] */
1111
0
      STORE_SI128(yLumaDstEven, ye);
1112
0
      yLumaDstEven += 16;
1113
0
    }
1114
1115
0
    if (yLumaDstOdd)
1116
0
    {
1117
0
      const __m128i y_factors = BGRX_Y_FACTORS;
1118
0
      const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
1119
0
                                                        _mm_maddubs_epi16(xo2, y_factors)),
1120
0
                                         Y_SHIFT);
1121
0
      const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
1122
0
                                                        _mm_maddubs_epi16(xo4, y_factors)),
1123
0
                                         Y_SHIFT);
1124
0
      const __m128i yo = _mm_packus_epi16(yo1, yo2);
1125
0
      STORE_SI128(yLumaDstOdd, yo);
1126
0
      yLumaDstOdd += 16;
1127
0
    }
1128
1129
0
    {
1130
      /* We have now
1131
       * 16 even U values in ue
1132
       * 16 odd U values in uo
1133
       *
1134
       * We need to split these according to
1135
       * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */
1136
      /* U: multiplications with subtotals and horizontal sums */
1137
0
      __m128i ue;
1138
0
      __m128i uo;
1139
0
      __m128i uavg;
1140
0
      {
1141
0
        const __m128i u_factors = BGRX_U_FACTORS;
1142
0
        const __m128i ue1 =
1143
0
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
1144
0
                                          _mm_maddubs_epi16(xe2, u_factors)),
1145
0
                           U_SHIFT);
1146
0
        const __m128i ue2 =
1147
0
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
1148
0
                                          _mm_maddubs_epi16(xe4, u_factors)),
1149
0
                           U_SHIFT);
1150
0
        const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
1151
0
        ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
1152
0
        uavg = ueavg;
1153
0
      }
1154
0
      {
1155
0
        const __m128i u_factors = BGRX_U_FACTORS;
1156
0
        const __m128i uo1 =
1157
0
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
1158
0
                                          _mm_maddubs_epi16(xo2, u_factors)),
1159
0
                           U_SHIFT);
1160
0
        const __m128i uo2 =
1161
0
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
1162
0
                                          _mm_maddubs_epi16(xo4, u_factors)),
1163
0
                           U_SHIFT);
1164
0
        const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
1165
0
        uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
1166
0
        uavg = _mm_add_epi16(uavg, uoavg);
1167
0
        uavg = _mm_srai_epi16(uavg, 2);
1168
0
        uavg = _mm_packs_epi16(uavg, uoavg);
1169
0
        uavg = _mm_sub_epi8(uavg, vector128);
1170
0
      }
1171
      /* Now we need the following storage distribution:
1172
       * 2x   2y    -> uLumaDst
1173
       * 2x+1  y    -> yChromaDst1
1174
       * 4x   2y+1  -> uChromaDst1
1175
       * 4x+2 2y+1  -> vChromaDst1 */
1176
0
      {
1177
0
        const __m128i mask =
1178
0
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1179
0
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1180
0
        const __m128i ude = _mm_shuffle_epi8(ue, mask);
1181
0
        _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
1182
0
        yEvenChromaDst1 += 8;
1183
0
      }
1184
1185
0
      if (yLumaDstOdd)
1186
0
      {
1187
0
        const __m128i mask =
1188
0
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1189
0
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1190
0
        const __m128i udo /* codespell:ignore udo */ = _mm_shuffle_epi8(uo, mask);
1191
0
        _mm_storel_epi64((__m128i*)yOddChromaDst1, udo); // codespell:ignore udo
1192
0
        yOddChromaDst1 += 8;
1193
0
      }
1194
1195
0
      if (yLumaDstOdd)
1196
0
      {
1197
0
        const __m128i mask =
1198
0
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1199
0
                         (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
1200
0
        const __m128i ud = _mm_shuffle_epi8(uo, mask);
1201
0
        int* uDst1 = (int*)uChromaDst1;
1202
0
        int* vDst1 = (int*)vChromaDst1;
1203
0
        const int* src = (const int*)&ud;
1204
0
        _mm_stream_si32(uDst1, src[0]);
1205
0
        _mm_stream_si32(vDst1, src[1]);
1206
0
        uChromaDst1 += 4;
1207
0
        vChromaDst1 += 4;
1208
0
      }
1209
1210
0
      if (yLumaDstOdd)
1211
0
      {
1212
0
        _mm_storel_epi64((__m128i*)uLumaDst, uavg);
1213
0
        uLumaDst += 8;
1214
0
      }
1215
0
      else
1216
0
      {
1217
0
        const __m128i mask =
1218
0
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1219
0
                         (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
1220
0
        const __m128i ud = _mm_shuffle_epi8(ue, mask);
1221
0
        _mm_storel_epi64((__m128i*)uLumaDst, ud);
1222
0
        uLumaDst += 8;
1223
0
      }
1224
0
    }
1225
1226
0
    {
1227
      /* V: multiplications with subtotals and horizontal sums */
      __m128i ve;
      __m128i vo;
      __m128i vavg;
      {
        const __m128i v_factors = BGRX_V_FACTORS;
        const __m128i ve1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
                                          _mm_maddubs_epi16(xe2, v_factors)),
                           V_SHIFT);
        const __m128i ve2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
                                          _mm_maddubs_epi16(xe4, v_factors)),
                           V_SHIFT);
        const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
        ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
        vavg = veavg;
      }
      {
        const __m128i v_factors = BGRX_V_FACTORS;
        const __m128i vo1 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
                                          _mm_maddubs_epi16(xo2, v_factors)),
                           V_SHIFT);
        const __m128i vo2 =
            _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
                                          _mm_maddubs_epi16(xo4, v_factors)),
                           V_SHIFT);
        const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
        vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
        vavg = _mm_add_epi16(vavg, voavg);
        vavg = _mm_srai_epi16(vavg, 2);
        vavg = _mm_packs_epi16(vavg, voavg);
        vavg = _mm_sub_epi8(vavg, vector128);
      }
      /* Now we need the following storage distribution:
       * 2x   2y    -> vLumaDst
       * 2x+1  y    -> yChromaDst2
       * 4x   2y+1  -> uChromaDst2
       * 4x+2 2y+1  -> vChromaDst2 */
      {
        const __m128i mask =
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
        __m128i vde = _mm_shuffle_epi8(ve, mask);
        _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
        yEvenChromaDst2 += 8;
      }

      if (yLumaDstOdd)
      {
        const __m128i mask =
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                         (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
        __m128i vdo = _mm_shuffle_epi8(vo, mask);
        _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
        yOddChromaDst2 += 8;
      }

      if (yLumaDstOdd)
      {
        const __m128i mask =
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                         (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
        const __m128i vd = _mm_shuffle_epi8(vo, mask);
        int* uDst2 = (int*)uChromaDst2;
        int* vDst2 = (int*)vChromaDst2;
        const int* src = (const int*)&vd;
        _mm_stream_si32(uDst2, src[0]);
        _mm_stream_si32(vDst2, src[1]);
        uChromaDst2 += 4;
        vChromaDst2 += 4;
      }

      if (yLumaDstOdd)
      {
        _mm_storel_epi64((__m128i*)vLumaDst, vavg);
        vLumaDst += 8;
      }
      else
      {
        const __m128i mask =
            _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                         (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
        __m128i vd = _mm_shuffle_epi8(ve, mask);
        _mm_storel_epi64((__m128i*)vLumaDst, vd);
        vLumaDst += 8;
      }
    }
  }

  general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, yLumaDstEven, yLumaDstOdd,
                                           uLumaDst, vLumaDst, yEvenChromaDst1, yEvenChromaDst2,
                                           yOddChromaDst1, yOddChromaDst2, uChromaDst1,
                                           uChromaDst2, vChromaDst1, vChromaDst2, width);
}
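
/* Illustrative scalar sketch (not from the original source): roughly what the
 * shuffle masks above scatter per 16-pixel batch of full-resolution V samples,
 * assuming vEven/vOdd hold the even/odd source rows and the destination
 * pointers match the names used above. The SIMD path additionally converts to
 * the signed range by subtracting 128 (vector128). */
static inline void avc444v2_scatter_v_scalar(const BYTE* vEven, const BYTE* vOdd, BYTE* vLumaDst,
                                             BYTE* yEvenChromaDst2, BYTE* yOddChromaDst2,
                                             BYTE* uChromaDst2, BYTE* vChromaDst2, UINT32 width)
{
  for (UINT32 x = 0; x < width; x += 2)
  {
    /* 2x+1, y: odd columns of both rows -> auxiliary luma planes */
    *yEvenChromaDst2++ = vEven[x + 1];
    *yOddChromaDst2++ = vOdd[x + 1];
    /* 2x, 2y: 2x2 average -> subsampled V plane of the main view */
    *vLumaDst++ = (BYTE)((vEven[x] + vEven[x + 1] + vOdd[x] + vOdd[x + 1]) / 4);
  }
  for (UINT32 x = 0; x < width; x += 4)
  {
    /* 4x and 4x+2, 2y+1: remaining odd-row samples -> auxiliary chroma */
    *uChromaDst2++ = vOdd[x];
    *vChromaDst2++ = vOdd[x + 2];
  }
}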

static pstatus_t sse41_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc,
                                             WINPR_ATTR_UNUSED UINT32 srcFormat, UINT32 srcStep,
                                             BYTE* WINPR_RESTRICT pDst1[], const UINT32 dst1Step[],
                                             BYTE* WINPR_RESTRICT pDst2[], const UINT32 dst2Step[],
                                             const prim_size_t* WINPR_RESTRICT roi)
{
  if (roi->height < 1 || roi->width < 1)
    return !PRIMITIVES_SUCCESS;

  size_t y = 0;
  for (; y < roi->height - roi->height % 2; y += 2)
  {
    const BYTE* srcEven = (pSrc + y * srcStep);
    const BYTE* srcOdd = (srcEven + srcStep);
    BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
    BYTE* dstLumaYOdd = (dstLumaYEven + dst1Step[0]);
    BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
    BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
    BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
    BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
    BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
    BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
    BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
    BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
    BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
    BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
    sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
                                           dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
                                           dstOddChromaY1, dstOddChromaY2, dstChromaU1,
                                           dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
  }

  for (; y < roi->height; y++)
  {
    const BYTE* srcEven = (pSrc + y * srcStep);
    BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
    BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
    BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
    BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
    BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
    BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
    BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
    BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
    BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
    general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(0, srcEven, NULL, dstLumaYEven, NULL, dstLumaU,
                                             dstLumaV, dstEvenChromaY1, dstEvenChromaY2, NULL,
                                             NULL, dstChromaU1, dstChromaU2, dstChromaV1,
                                             dstChromaV2, roi->width);
  }

  return PRIMITIVES_SUCCESS;
}
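
/* Reference note (added for clarity; derived from the offsets computed above):
 * each auxiliary row in pDst2 is split into two packed halves,
 *   pDst2[0] row y:    [ ChromaY1 | ChromaY2 ]  split at width / 2
 *   pDst2[1] row y/2:  [ ChromaU1 | ChromaU2 ]  split at width / 4
 *   pDst2[2] row y/2:  [ ChromaV1 | ChromaV2 ]  split at width / 4
 * which is why dstEvenChromaY2, dstChromaU2 and dstChromaV2 are plain offsets
 * into the same rows rather than separate planes. */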

static pstatus_t sse41_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
                                        const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
                                        const UINT32 dst2Step[],
                                        const prim_size_t* WINPR_RESTRICT roi)
{
  switch (srcFormat)
  {
    case PIXEL_FORMAT_BGRX32:
    case PIXEL_FORMAT_BGRA32:
      return sse41_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                         dst2Step, roi);

    default:
      return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
                                       dst2Step, roi);
  }
}

static pstatus_t sse41_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const UINT32 srcStep[],
                                    BYTE* WINPR_RESTRICT pDstRaw[], const UINT32 dstStep[],
                                    const RECTANGLE_16* WINPR_RESTRICT roi)
{
  const UINT32 nWidth = roi->right - roi->left;
  const UINT32 nHeight = roi->bottom - roi->top;
  const UINT32 halfWidth = (nWidth + 1) / 2;
  const UINT32 halfPad = halfWidth % 16;
  const UINT32 halfHeight = (nHeight + 1) / 2;
  const UINT32 oddY = 1;
  const UINT32 evenY = 0;
  const UINT32 oddX = 1;
  const UINT32 evenX = 0;
  const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
                        pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
                        pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
  BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
                  pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
                  pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };

  /* Y data is already here... */
  /* B1 */
  for (size_t y = 0; y < nHeight; y++)
  {
    const BYTE* Ym = pSrc[0] + y * srcStep[0];
    BYTE* pY = pDst[0] + y * dstStep[0];
    memcpy(pY, Ym, nWidth);
  }

  /* The first half of U and V is already part of this frame. */
  /* B2 and B3 */
  for (size_t y = 0; y < halfHeight; y++)
  {
    const size_t val2y = (2 * y + evenY);
    const size_t val2y1 = val2y + oddY;
    const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y;
    const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y;
    BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
    BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
    BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
    BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;

    size_t x = 0;
    for (; x < halfWidth - halfPad; x += 16)
    {
      const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
      const __m128i unpackLow =
          _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
      {
        const __m128i u = LOAD_SI128(&Um[x]);
        const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
        const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
        STORE_SI128(&pU[2ULL * x], uHigh);
        STORE_SI128(&pU[2ULL * x + 16], uLow);
        STORE_SI128(&pU1[2ULL * x], uHigh);
        STORE_SI128(&pU1[2ULL * x + 16], uLow);
      }
      {
        const __m128i u = LOAD_SI128(&Vm[x]);
        const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
        const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
        STORE_SI128(&pV[2 * x], uHigh);
        STORE_SI128(&pV[2 * x + 16], uLow);
        STORE_SI128(&pV1[2 * x], uHigh);
        STORE_SI128(&pV1[2 * x + 16], uLow);
      }
    }

    for (; x < halfWidth; x++)
    {
      const size_t val2x = 2 * x + evenX;
      const size_t val2x1 = val2x + oddX;
      pU[val2x] = Um[x];
      pV[val2x] = Vm[x];
      pU[val2x1] = Um[x];
      pV[val2x1] = Vm[x];
      pU1[val2x] = Um[x];
      pV1[val2x] = Vm[x];
      pU1[val2x1] = Um[x];
      pV1[val2x1] = Vm[x];
    }
  }

  return PRIMITIVES_SUCCESS;
}
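
/* Scalar equivalent (illustrative, not from the original source) of the
 * unpackHigh/unpackLow shuffle pairs above: every half-resolution U/V sample is
 * replicated twice horizontally and written to two consecutive output rows,
 * exactly as the remainder loop does one byte at a time. */
static inline void luma_chroma_upsample_scalar(const BYTE* src, BYTE* dstRow0, BYTE* dstRow1,
                                               UINT32 halfWidth)
{
  for (UINT32 x = 0; x < halfWidth; x++)
  {
    const BYTE s = src[x];
    dstRow0[2 * x] = s;     /* even column, even row */
    dstRow0[2 * x + 1] = s; /* odd column, even row */
    dstRow1[2 * x] = s;     /* even column, odd row */
    dstRow1[2 * x + 1] = s; /* odd column, odd row */
  }
}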

static pstatus_t sse41_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
                                        const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
  const UINT32 mod = 16;
  UINT32 uY = 0;
  UINT32 vY = 0;
  const UINT32 nWidth = roi->right - roi->left;
  const UINT32 nHeight = roi->bottom - roi->top;
  const UINT32 halfWidth = (nWidth + 1) / 2;
  const UINT32 halfPad = halfWidth % 16;
  const UINT32 halfHeight = (nHeight + 1) / 2;
  const UINT32 oddY = 1;
  const UINT32 evenY = 0;
  const UINT32 oddX = 1;
  /* The auxiliary frame is aligned to multiples of 16x16.
   * We need the padded height for B4 and B5 conversion. */
  const UINT32 padHeight = nHeight + 16 - nHeight % 16;
  const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
                        pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
                        pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
  BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
                  pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
                  pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
  const __m128i zero = _mm_setzero_si128();
  const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
                                    (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);

  /* The second half of U and V is a bit more tricky... */
  /* B4 and B5 */
  for (size_t y = 0; y < padHeight; y++)
  {
    const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y;
    BYTE* pX = NULL;

    if (y % mod < (mod + 1) / 2)
    {
      const UINT32 pos = (2 * uY++ + oddY);

      if (pos >= nHeight)
        continue;

      pX = pDst[1] + 1ULL * dstStep[1] * pos;
    }
    else
    {
      const UINT32 pos = (2 * vY++ + oddY);

      if (pos >= nHeight)
        continue;

      pX = pDst[2] + 1ULL * dstStep[2] * pos;
    }

    memcpy(pX, Ya, nWidth);
  }

  /* B6 and B7 */
  for (size_t y = 0; y < halfHeight; y++)
  {
    const size_t val2y = (y * 2 + evenY);
    const BYTE* Ua = pSrc[1] + srcStep[1] * y;
    const BYTE* Va = pSrc[2] + srcStep[2] * y;
    BYTE* pU = pDst[1] + dstStep[1] * val2y;
    BYTE* pV = pDst[2] + dstStep[2] * val2y;

    size_t x = 0;
    for (; x < halfWidth - halfPad; x += 16)
    {
      {
        const __m128i u = LOAD_SI128(&Ua[x]);
        const __m128i u2 = _mm_unpackhi_epi8(u, zero);
        const __m128i u1 = _mm_unpacklo_epi8(u, zero);
        _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
        _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
      }
      {
        const __m128i u = LOAD_SI128(&Va[x]);
        const __m128i u2 = _mm_unpackhi_epi8(u, zero);
        const __m128i u1 = _mm_unpacklo_epi8(u, zero);
        _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
        _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
      }
    }

    for (; x < halfWidth; x++)
    {
      const size_t val2x1 = (x * 2ULL + oddX);
      pU[val2x1] = Ua[x];
      pV[val2x1] = Va[x];
    }
  }

  return PRIMITIVES_SUCCESS;
}
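
/* Illustrative helper (not from the original source) spelling out the B4/B5
 * row mapping used above: within each 16-row tile of the auxiliary luma plane
 * the first eight rows carry U data and the last eight carry V data, and both
 * fill the odd rows (2 * n + 1) of the 4:4:4 chroma planes. Returns NULL when
 * the padded tile extends past the ROI height. */
static inline BYTE* chroma_v1_dst_row(BYTE* pDstU, BYTE* pDstV, UINT32 dstStepU, UINT32 dstStepV,
                                      size_t srcRow, UINT32* uY, UINT32* vY, UINT32 nHeight)
{
  if (srcRow % 16 < 8) /* U half of the 16x16 tile */
  {
    const UINT32 pos = 2 * (*uY)++ + 1;
    return (pos < nHeight) ? pDstU + 1ULL * dstStepU * pos : NULL;
  }
  else /* V half */
  {
    const UINT32 pos = 2 * (*vY)++ + 1;
    return (pos < nHeight) ? pDstV + 1ULL * dstStepV * pos : NULL;
  }
}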

static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                        UINT32 nTotalWidth, WINPR_ATTR_UNUSED UINT32 nTotalHeight,
                                        BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
                                        const RECTANGLE_16* WINPR_RESTRICT roi)
{
  const UINT32 nWidth = roi->right - roi->left;
  const UINT32 nHeight = roi->bottom - roi->top;
  const UINT32 halfWidth = (nWidth + 1) / 2;
  const UINT32 halfPad = halfWidth % 16;
  const UINT32 halfHeight = (nHeight + 1) / 2;
  const UINT32 quarterWidth = (nWidth + 3) / 4;
  const UINT32 quarterPad = quarterWidth % 16;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
                                    (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0);
  const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
                                     0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
  const __m128i shuffle1 =
      _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11,
                   (char)0x80, 10, (char)0x80, 9, (char)0x80, 8);
  const __m128i shuffle2 =
      _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3,
                   (char)0x80, 2, (char)0x80, 1, (char)0x80, 0);

  /* B4 and B5: odd UV values for width/2, height */
  for (size_t y = 0; y < nHeight; y++)
  {
    const size_t yTop = y + roi->top;
    const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
    const BYTE* pYaV = pYaU + nTotalWidth / 2;
    BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left;
    BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left;

    size_t x = 0;
    for (; x < halfWidth - halfPad; x += 16)
    {
      {
        const __m128i u = LOAD_SI128(&pYaU[x]);
        const __m128i u2 = _mm_unpackhi_epi8(zero, u);
        const __m128i u1 = _mm_unpacklo_epi8(zero, u);
        _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
        _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
      }
      {
        const __m128i v = LOAD_SI128(&pYaV[x]);
        const __m128i v2 = _mm_unpackhi_epi8(zero, v);
        const __m128i v1 = _mm_unpacklo_epi8(zero, v);
        _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
        _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
      }
    }

    for (; x < halfWidth; x++)
    {
      const size_t odd = 2ULL * x + 1;
      pU[odd] = pYaU[x];
      pV[odd] = pYaV[x];
    }
  }

  /* B6 - B9 */
  for (size_t y = 0; y < halfHeight; y++)
  {
    const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
    const BYTE* pUaV = pUaU + nTotalWidth / 4;
    const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
    const BYTE* pVaV = pVaU + nTotalWidth / 4;
    BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
    BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;

    UINT32 x = 0;
    for (; x < quarterWidth - quarterPad; x += 16)
    {
      {
        const __m128i uU = LOAD_SI128(&pUaU[x]);
        const __m128i uV = LOAD_SI128(&pVaU[x]);
        const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
        const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
        const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
        const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
        const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
        const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
        _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
        _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
        _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
        _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
      }
      {
        const __m128i vU = LOAD_SI128(&pUaV[x]);
        const __m128i vV = LOAD_SI128(&pVaV[x]);
        const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
        const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
        const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
        const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
        const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
        const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
        _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
        _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
        _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
        _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
      }
    }

    for (; x < quarterWidth; x++)
    {
      pU[4 * x + 0] = pUaU[x];
      pV[4 * x + 0] = pUaV[x];
      pU[4 * x + 2] = pVaU[x];
      pV[4 * x + 2] = pVaV[x];
    }
  }

  return PRIMITIVES_SUCCESS;
}
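
/* Scalar equivalent (illustrative, not from the original source) of the B6-B9
 * shuffle/maskmove sequence above: quarter-width samples from the auxiliary U
 * and V planes land on columns 4x and 4x + 2 of the odd destination rows, as
 * the remainder loop shows one byte at a time. */
static inline void chroma_v2_interleave_scalar(const BYTE* pUaU, const BYTE* pUaV,
                                               const BYTE* pVaU, const BYTE* pVaV, BYTE* pU,
                                               BYTE* pV, UINT32 quarterWidth)
{
  for (UINT32 x = 0; x < quarterWidth; x++)
  {
    pU[4 * x + 0] = pUaU[x]; /* B6 */
    pV[4 * x + 0] = pUaV[x]; /* B7 */
    pU[4 * x + 2] = pVaU[x]; /* B8 */
    pV[4 * x + 2] = pVaV[x]; /* B9 */
  }
}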

static pstatus_t sse41_YUV420CombineToYUV444(avc444_frame_type type,
                                             const BYTE* WINPR_RESTRICT pSrc[3],
                                             const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
                                             BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
                                             const RECTANGLE_16* WINPR_RESTRICT roi)
{
  if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
    return -1;

  if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
    return -1;

  if (!roi)
    return -1;

  switch (type)
  {
    case AVC444_LUMA:
      return sse41_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);

    case AVC444_CHROMAv1:
      return sse41_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);

    case AVC444_CHROMAv2:
      return sse41_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);

    default:
      return -1;
  }
}
#endif

void primitives_init_YUV_sse41_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
  generic = primitives_get_generic();

  WLog_VRB(PRIM_TAG, "SSSE3/SSE4.1 optimizations");
  prims->RGBToYUV420_8u_P3AC4R = sse41_RGBToYUV420;
  prims->RGBToAVC444YUV = sse41_RGBToAVC444YUV;
  prims->RGBToAVC444YUVv2 = sse41_RGBToAVC444YUVv2;
  prims->YUV420ToRGB_8u_P3AC4R = sse41_YUV420ToRGB;
  prims->YUV444ToRGB_8u_P3AC4R = sse41_YUV444ToRGB_8u_P3AC4R;
  prims->YUV420CombineToYUV444 = sse41_YUV420CombineToYUV444;
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or sse41 intrinsics not available");
  WINPR_UNUSED(prims);
#endif
}
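
/* Hedged usage sketch (not part of the original file): callers never invoke
 * the sse41_* kernels directly. They fetch the dispatch table, which the init
 * routine above populates when SSE4.1 intrinsics are enabled; the plane
 * pointers, steps and ROI below are assumed to be set up by the caller. */
static pstatus_t example_combine(const BYTE* pSrc[3], const UINT32 srcStep[3], UINT32 w, UINT32 h,
                                 BYTE* pDst[3], const UINT32 dstStep[3],
                                 const RECTANGLE_16* roi)
{
  primitives_t* prims = primitives_get(); /* best backend for this CPU */
  return prims->YUV420CombineToYUV444(AVC444_LUMA, pSrc, srcStep, w, h, pDst, dstStep, roi);
}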