Coverage Report

Created: 2023-09-25 06:56

/src/FreeRDP/libfreerdp/primitives/prim_YCoCg_opt.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Optimized YCoCg<->RGB conversion operations.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
6
 *
7
 * Licensed under the Apache License, Version 2.0 (the "License");
8
 * you may not use this file except in compliance with the License.
9
 * You may obtain a copy of the License at
10
 *
11
 *     http://www.apache.org/licenses/LICENSE-2.0
12
 *
13
 * Unless required by applicable law or agreed to in writing, software
14
 * distributed under the License is distributed on an "AS IS" BASIS,
15
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 * See the License for the specific language governing permissions and
17
 * limitations under the License.
18
 */
19
20
#include <freerdp/config.h>
21
22
#include <freerdp/types.h>
23
#include <freerdp/primitives.h>
24
#include <winpr/sysinfo.h>
25
26
#ifdef WITH_SSE2
27
#include <emmintrin.h>
28
#include <tmmintrin.h>
29
#elif defined(WITH_NEON)
30
#include <arm_neon.h>
31
#endif /* WITH_SSE2 else WITH_NEON */
32
33
#include "prim_internal.h"
34
#include "prim_templates.h"
35
36
static primitives_t* generic = NULL;
37
38
#ifdef WITH_SSE2
39
/* ------------------------------------------------------------------------- */
40
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
41
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
42
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
43
                                                  UINT8 shift, BOOL withAlpha)
44
0
{
45
0
  const BYTE* sptr = pSrc;
46
0
  BYTE* dptr = (BYTE*)pDst;
47
0
  int sRowBump = srcStep - width * sizeof(UINT32);
48
0
  int dRowBump = dstStep - width * sizeof(UINT32);
49
0
  UINT32 h;
50
  /* Shift left by "shift" and divide by two is the same as shift
51
   * left by "shift-1".
52
   */
53
0
  int dataShift = shift - 1;
54
0
  BYTE mask = (BYTE)(0xFFU << dataShift);
55
56
  /* Let's say the data is of the form:
57
   * y0y0o0g0 a1y1o1g1 a2y2o2g2...
58
   * Apply:
59
   * |R|   | 1  1/2 -1/2 |   |y|
60
   * |G| = | 1  0    1/2 | * |o|
61
   * |B|   | 1 -1/2 -1/2 |   |g|
62
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
63
   */
64
65
0
  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
66
0
  {
67
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
68
0
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
69
0
                                       shift, withAlpha);
70
0
  }
71
72
0
  for (h = 0; h < height; h++)
73
0
  {
74
0
    UINT32 w = width;
75
0
    BOOL onStride;
76
77
    /* Get to a 16-byte destination boundary. */
78
0
    if ((ULONG_PTR)dptr & 0x0f)
79
0
    {
80
0
      pstatus_t status;
81
0
      UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
82
83
0
      if (startup > width)
84
0
        startup = width;
85
86
0
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
87
0
                                           1, shift, withAlpha);
88
89
0
      if (status != PRIMITIVES_SUCCESS)
90
0
        return status;
91
92
0
      sptr += startup * sizeof(UINT32);
93
0
      dptr += startup * sizeof(UINT32);
94
0
      w -= startup;
95
0
    }
96
97
    /* Each loop handles eight pixels at a time. */
98
0
    onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
99
100
0
    while (w >= 8)
101
0
    {
102
0
      __m128i R0, R1, R2, R3, R4, R5, R6, R7;
103
104
0
      if (onStride)
105
0
      {
106
        /* The faster path, 16-byte aligned load. */
107
0
        R0 = _mm_load_si128((const __m128i*)sptr);
108
0
        sptr += (128 / 8);
109
0
        R1 = _mm_load_si128((const __m128i*)sptr);
110
0
        sptr += (128 / 8);
111
0
      }
112
0
      else
113
0
      {
114
        /* Off-stride, slower LDDQU load. */
115
0
        R0 = _mm_lddqu_si128((const __m128i*)sptr);
116
0
        sptr += (128 / 8);
117
0
        R1 = _mm_lddqu_si128((const __m128i*)sptr);
118
0
        sptr += (128 / 8);
119
0
      }
120
121
      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
122
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
123
      /* Shuffle to pack all the like types together. */
124
0
      R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
125
0
      R3 = _mm_shuffle_epi8(R0, R2);
126
0
      R4 = _mm_shuffle_epi8(R1, R2);
127
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
128
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
129
0
      R5 = _mm_unpackhi_epi32(R3, R4);
130
0
      R6 = _mm_unpacklo_epi32(R3, R4);
131
132
      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
133
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
134
      /* Save alphas aside */
135
0
      if (withAlpha)
136
0
        R7 = _mm_unpackhi_epi64(R5, R5);
137
0
      else
138
0
        R7 = _mm_set1_epi32(0xFFFFFFFFU);
139
140
      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
141
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
142
0
      R1 = _mm_set1_epi32(0);
143
0
      R0 = _mm_unpacklo_epi8(R5, R1);
144
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
145
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
146
       * Note: this must be done before sign-conversion.
147
       * Note also there is no slli_epi8, so we have to use a 16-bit
148
       * version and then mask.
149
       */
150
0
      R6 = _mm_slli_epi16(R6, dataShift);
151
0
      R1 = _mm_set1_epi8(mask);
152
0
      R6 = _mm_and_si128(R6, R1);
153
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
154
      /* Expand Co's from 8-bit signed to 16-bit signed */
155
0
      R1 = _mm_unpackhi_epi8(R6, R6);
156
0
      R1 = _mm_srai_epi16(R1, 8);
157
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
158
      /* Expand Cg's from 8-bit signed to 16-bit signed */
159
0
      R2 = _mm_unpacklo_epi8(R6, R6);
160
0
      R2 = _mm_srai_epi16(R2, 8);
161
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
162
      /* Get Y - halfCg and save */
163
0
      R6 = _mm_subs_epi16(R0, R2);
164
      /* R = (Y-halfCg) + halfCo */
165
0
      R3 = _mm_adds_epi16(R6, R1);
166
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
167
      /* G = Y + Cg(/2) */
168
0
      R4 = _mm_adds_epi16(R0, R2);
169
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
170
      /* B = (Y-halfCg) - Co(/2) */
171
0
      R5 = _mm_subs_epi16(R6, R1);
172
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
173
      /* Repack R's & B's.  */
174
0
      R0 = _mm_packus_epi16(R3, R5);
175
      /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
176
      /* Repack G's. */
177
0
      R1 = _mm_packus_epi16(R4, R4);
178
      /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
179
      /* And add the A's. */
180
0
      R1 = _mm_unpackhi_epi64(R1, R7);
181
      /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
182
      /* Now do interleaving again. */
183
0
      R2 = _mm_unpacklo_epi8(R0, R1);
184
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
185
0
      R3 = _mm_unpackhi_epi8(R0, R1);
186
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
187
0
      R4 = _mm_unpacklo_epi16(R2, R3);
188
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
189
0
      R5 = _mm_unpackhi_epi16(R2, R3);
190
      /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
191
0
      _mm_store_si128((__m128i*)dptr, R4);
192
0
      dptr += (128 / 8);
193
0
      _mm_store_si128((__m128i*)dptr, R5);
194
0
      dptr += (128 / 8);
195
0
      w -= 8;
196
0
    }
197
198
    /* Handle any remainder pixels. */
199
0
    if (w > 0)
200
0
    {
201
0
      pstatus_t status;
202
0
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
203
0
                                           shift, withAlpha);
204
205
0
      if (status != PRIMITIVES_SUCCESS)
206
0
        return status;
207
208
0
      sptr += w * sizeof(UINT32);
209
0
      dptr += w * sizeof(UINT32);
210
0
    }
211
212
0
    sptr += sRowBump;
213
0
    dptr += dRowBump;
214
0
  }
215
216
0
  return PRIMITIVES_SUCCESS;
217
0
}
218
219
/* ------------------------------------------------------------------------- */
220
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
221
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
222
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
223
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
224
0
{
225
0
  const BYTE* sptr = pSrc;
226
0
  BYTE* dptr = (BYTE*)pDst;
227
0
  int sRowBump = srcStep - width * sizeof(UINT32);
228
0
  int dRowBump = dstStep - width * sizeof(UINT32);
229
0
  UINT32 h;
230
  /* Shift left by "shift" and divide by two is the same as shift
231
   * left by "shift-1".
232
   */
233
0
  int dataShift = shift - 1;
234
0
  BYTE mask = (BYTE)(0xFFU << dataShift);
235
236
  /* Let's say the data is of the form:
237
   * a0y0o0g0 a1y1o1g1 a2y2o2g2...
238
   * Apply:
239
   * |R|   | 1  1/2 -1/2 |   |y|
240
   * |G| = | 1  0    1/2 | * |o|
241
   * |B|   | 1 -1/2 -1/2 |   |g|
242
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
243
   */
244
245
0
  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
246
0
  {
247
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
248
0
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
249
0
                                       shift, withAlpha);
250
0
  }
251
252
0
  for (h = 0; h < height; h++)
253
0
  {
254
0
    int w = width;
255
0
    BOOL onStride;
256
257
    /* Get to a 16-byte destination boundary. */
258
0
    if ((ULONG_PTR)dptr & 0x0f)
259
0
    {
260
0
      pstatus_t status;
261
0
      UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
262
263
0
      if (startup > width)
264
0
        startup = width;
265
266
0
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
267
0
                                           1, shift, withAlpha);
268
269
0
      if (status != PRIMITIVES_SUCCESS)
270
0
        return status;
271
272
0
      sptr += startup * sizeof(UINT32);
273
0
      dptr += startup * sizeof(UINT32);
274
0
      w -= startup;
275
0
    }
276
277
    /* Each loop handles eight pixels at a time. */
278
0
    onStride = (((const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
279
280
0
    while (w >= 8)
281
0
    {
282
0
      __m128i R0, R1, R2, R3, R4, R5, R6, R7;
283
284
0
      if (onStride)
285
0
      {
286
        /* The faster path, 16-byte aligned load. */
287
0
        R0 = _mm_load_si128((const __m128i*)sptr);
288
0
        sptr += (128 / 8);
289
0
        R1 = _mm_load_si128((const __m128i*)sptr);
290
0
        sptr += (128 / 8);
291
0
      }
292
0
      else
293
0
      {
294
        /* Off-stride, slower LDDQU load. */
295
0
        R0 = _mm_lddqu_si128((const __m128i*)sptr);
296
0
        sptr += (128 / 8);
297
0
        R1 = _mm_lddqu_si128((const __m128i*)sptr);
298
0
        sptr += (128 / 8);
299
0
      }
300
301
      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
302
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
303
      /* Shuffle to pack all the like types together. */
304
0
      R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
305
0
      R3 = _mm_shuffle_epi8(R0, R2);
306
0
      R4 = _mm_shuffle_epi8(R1, R2);
307
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
308
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
309
0
      R5 = _mm_unpackhi_epi32(R3, R4);
310
0
      R6 = _mm_unpacklo_epi32(R3, R4);
311
312
      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
313
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
314
      /* Save alphas aside */
315
0
      if (withAlpha)
316
0
        R7 = _mm_unpackhi_epi64(R5, R5);
317
0
      else
318
0
        R7 = _mm_set1_epi32(0xFFFFFFFFU);
319
320
      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
321
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
322
0
      R1 = _mm_set1_epi32(0);
323
0
      R0 = _mm_unpacklo_epi8(R5, R1);
324
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
325
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
326
       * Note: this must be done before sign-conversion.
327
       * Note also there is no slli_epi8, so we have to use a 16-bit
328
       * version and then mask.
329
       */
330
0
      R6 = _mm_slli_epi16(R6, dataShift);
331
0
      R1 = _mm_set1_epi8(mask);
332
0
      R6 = _mm_and_si128(R6, R1);
333
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
334
      /* Expand Co's from 8-bit signed to 16-bit signed */
335
0
      R1 = _mm_unpackhi_epi8(R6, R6);
336
0
      R1 = _mm_srai_epi16(R1, 8);
337
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
338
      /* Expand Cg's from 8-bit signed to 16-bit signed */
339
0
      R2 = _mm_unpacklo_epi8(R6, R6);
340
0
      R2 = _mm_srai_epi16(R2, 8);
341
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
342
      /* Get Y - halfCg and save */
343
0
      R6 = _mm_subs_epi16(R0, R2);
344
      /* R = (Y-halfCg) + halfCo */
345
0
      R3 = _mm_adds_epi16(R6, R1);
346
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
347
      /* G = Y + Cg(/2) */
348
0
      R4 = _mm_adds_epi16(R0, R2);
349
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
350
      /* B = (Y-halfCg) - Co(/2) */
351
0
      R5 = _mm_subs_epi16(R6, R1);
352
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
353
      /* Repack R's & B's.  */
354
      /* This line is the only diff between inverted and non-inverted.
355
       * Unfortunately, it would be expensive to check "inverted"
356
       * every time through this loop.
357
       */
358
0
      R0 = _mm_packus_epi16(R5, R3);
359
      /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
360
      /* Repack G's. */
361
0
      R1 = _mm_packus_epi16(R4, R4);
362
      /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
363
      /* And add the A's. */
364
0
      R1 = _mm_unpackhi_epi64(R1, R7);
365
      /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
366
      /* Now do interleaving again. */
367
0
      R2 = _mm_unpacklo_epi8(R0, R1);
368
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
369
0
      R3 = _mm_unpackhi_epi8(R0, R1);
370
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
371
0
      R4 = _mm_unpacklo_epi16(R2, R3);
372
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
373
0
      R5 = _mm_unpackhi_epi16(R2, R3);
374
      /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
375
0
      _mm_store_si128((__m128i*)dptr, R4);
376
0
      dptr += (128 / 8);
377
0
      _mm_store_si128((__m128i*)dptr, R5);
378
0
      dptr += (128 / 8);
379
0
      w -= 8;
380
0
    }
381
382
    /* Handle any remainder pixels. */
383
0
    if (w > 0)
384
0
    {
385
0
      pstatus_t status;
386
0
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
387
0
                                           shift, withAlpha);
388
389
0
      if (status != PRIMITIVES_SUCCESS)
390
0
        return status;
391
392
0
      sptr += w * sizeof(UINT32);
393
0
      dptr += w * sizeof(UINT32);
394
0
    }
395
396
0
    sptr += sRowBump;
397
0
    dptr += dRowBump;
398
0
  }
399
400
0
  return PRIMITIVES_SUCCESS;
401
0
}
402
#endif /* WITH_SSE2 */
403
404
#ifdef WITH_SSE2
405
/* ------------------------------------------------------------------------- */
406
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
407
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
408
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
409
                                           BOOL withAlpha)
410
0
{
411
0
  switch (DstFormat)
412
0
  {
413
0
    case PIXEL_FORMAT_BGRX32:
414
0
    case PIXEL_FORMAT_BGRA32:
415
0
      return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
416
0
                                              height, shift, withAlpha);
417
418
0
    case PIXEL_FORMAT_RGBX32:
419
0
    case PIXEL_FORMAT_RGBA32:
420
0
      return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep,
421
0
                                                 width, height, shift, withAlpha);
422
423
0
    default:
424
0
      return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
425
0
                                         height, shift, withAlpha);
426
0
  }
427
0
}
428
#elif defined(WITH_NEON)
429
430
static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
431
                                      BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
432
                                      UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
433
                                      BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
434
{
435
  UINT32 y;
436
  BYTE* dptr = pDst;
437
  const BYTE* sptr = pSrc;
438
  const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
439
  const int8_t cll = shift - 1; /* -1 builds in the /2's */
440
  const UINT32 srcPad = srcStep - (width * 4);
441
  const UINT32 dstPad = dstStep - (width * formatSize);
442
  const UINT32 pad = width % 8;
443
  const uint8x8_t aVal = vdup_n_u8(0xFF);
444
  const int8x8_t cllv = vdup_n_s8(cll);
445
446
  for (y = 0; y < height; y++)
447
  {
448
    UINT32 x;
449
450
    for (x = 0; x < width - pad; x += 8)
451
    {
452
      /* Note: shifts must be done before sign-conversion. */
453
      const uint8x8x4_t raw = vld4_u8(sptr);
454
      const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
455
      const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
456
      const int16x8_t Cg = vmovl_s8(CgRaw);
457
      const int16x8_t Co = vmovl_s8(CoRaw);
458
      const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
459
      const int16x8_t T = vsubq_s16(Y, Cg);
460
      const int16x8_t R = vaddq_s16(T, Co);
461
      const int16x8_t G = vaddq_s16(Y, Cg);
462
      const int16x8_t B = vsubq_s16(T, Co);
463
      uint8x8x4_t bgrx;
464
      bgrx.val[bPos] = vqmovun_s16(B);
465
      bgrx.val[gPos] = vqmovun_s16(G);
466
      bgrx.val[rPos] = vqmovun_s16(R);
467
468
      if (alpha)
469
        bgrx.val[aPos] = raw.val[3];
470
      else
471
        bgrx.val[aPos] = aVal;
472
473
      vst4_u8(dptr, bgrx);
474
      sptr += sizeof(raw);
475
      dptr += sizeof(bgrx);
476
    }
477
478
    for (x = 0; x < pad; x++)
479
    {
480
      /* Note: shifts must be done before sign-conversion. */
481
      const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
482
      const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
483
      const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
484
      const INT16 T = Y - Cg;
485
      const INT16 R = T + Co;
486
      const INT16 G = Y + Cg;
487
      const INT16 B = T - Co;
488
      BYTE bgra[4];
489
      bgra[bPos] = CLIP(B);
490
      bgra[gPos] = CLIP(G);
491
      bgra[rPos] = CLIP(R);
492
      bgra[aPos] = *sptr++;
493
494
      if (!alpha)
495
        bgra[aPos] = 0xFF;
496
497
      *dptr++ = bgra[0];
498
      *dptr++ = bgra[1];
499
      *dptr++ = bgra[2];
500
      *dptr++ = bgra[3];
501
    }
502
503
    sptr += srcPad;
504
    dptr += dstPad;
505
  }
506
507
  return PRIMITIVES_SUCCESS;
508
}
509
510
static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
511
                                         BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
512
                                         UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
513
{
514
  switch (DstFormat)
515
  {
516
    case PIXEL_FORMAT_BGRA32:
517
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
518
                                  shift, 2, 1, 0, 3, withAlpha);
519
520
    case PIXEL_FORMAT_BGRX32:
521
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
522
                                  shift, 2, 1, 0, 3, withAlpha);
523
524
    case PIXEL_FORMAT_RGBA32:
525
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
526
                                  shift, 0, 1, 2, 3, withAlpha);
527
528
    case PIXEL_FORMAT_RGBX32:
529
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
530
                                  shift, 0, 1, 2, 3, withAlpha);
531
532
    case PIXEL_FORMAT_ARGB32:
533
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
534
                                  shift, 1, 2, 3, 0, withAlpha);
535
536
    case PIXEL_FORMAT_XRGB32:
537
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
538
                                  shift, 1, 2, 3, 0, withAlpha);
539
540
    case PIXEL_FORMAT_ABGR32:
541
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
542
                                  shift, 3, 2, 1, 0, withAlpha);
543
544
    case PIXEL_FORMAT_XBGR32:
545
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
546
                                  shift, 3, 2, 1, 0, withAlpha);
547
548
    default:
549
      return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
550
                                         height, shift, withAlpha);
551
  }
552
}
553
#endif /* WITH_SSE2 */
554
555
/* ------------------------------------------------------------------------- */
556
void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
557
0
{
558
0
  generic = primitives_get_generic();
559
0
  primitives_init_YCoCg(prims);
560
  /* While IPP acknowledges the existence of YCoCg-R, it doesn't currently
561
   * include any routines to work with it, especially with variable shift
562
   * width.
563
   */
564
0
#if defined(WITH_SSE2)
565
566
0
  if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
567
0
      IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
568
0
  {
569
0
    prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
570
0
  }
571
572
#elif defined(WITH_NEON)
573
574
  if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
575
  {
576
    prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
577
  }
578
579
#endif /* WITH_SSE2 */
580
0
}