Coverage Report

Created: 2024-09-08 06:20

/src/FreeRDP/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Optimized YCoCg<->RGB conversion operations.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
6
 *
7
 * Licensed under the Apache License, Version 2.0 (the "License");
8
 * you may not use this file except in compliance with the License.
9
 * You may obtain a copy of the License at
10
 *
11
 *     http://www.apache.org/licenses/LICENSE-2.0
12
 *
13
 * Unless required by applicable law or agreed to in writing, software
14
 * distributed under the License is distributed on an "AS IS" BASIS,
15
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 * See the License for the specific language governing permissions and
17
 * limitations under the License.
18
 */
19
20
#include <freerdp/config.h>
21
22
#include <freerdp/types.h>
23
#include <freerdp/primitives.h>
24
#include <winpr/sysinfo.h>
25
26
#include "prim_YCoCg.h"
27
28
#include "prim_internal.h"
29
#include "prim_templates.h"
30
31
#if defined(SSE2_ENABLED)
32
#include <emmintrin.h>
33
#include <tmmintrin.h>
34
35
static primitives_t* generic = NULL;
36
37
/* ------------------------------------------------------------------------- */
38
/**
 * Convert YCoCg-R pixels to a BGR-ordered ("inverted" channel order) 32bpp
 * image using SSSE3, eight pixels per inner-loop iteration.
 *
 * Rows are processed in three phases: a scalar prologue (via the generic
 * primitive) to reach a 16-byte-aligned destination, an 8-pixel SIMD main
 * loop, and a scalar epilogue for the remainder.
 *
 * @param pSrc       input YCoCg-R data, 4 bytes per pixel
 * @param srcStep    input stride in bytes
 * @param pDst       output buffer; must be at least 4-byte aligned or the
 *                   whole call is punted to the generic implementation
 * @param DstFormat  destination format, forwarded to the generic fallback
 * @param dstStep    output stride in bytes
 * @param width      pixels per row
 * @param height     number of rows
 * @param shift      chroma up-shift. NOTE(review): assumed >= 1 — shift == 0
 *                   makes dataShift negative, an undefined shift amount in
 *                   the expressions below; confirm callers never pass 0.
 * @param withAlpha  TRUE to keep source alpha, FALSE to force opaque 0xFF
 * @return PRIMITIVES_SUCCESS, or the status of a failing generic fallback
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
  const BYTE* sptr = pSrc;
  BYTE* dptr = pDst;
  /* Bytes to skip from the end of one row's pixels to the start of the next. */
  int sRowBump = srcStep - width * sizeof(UINT32);
  int dRowBump = dstStep - width * sizeof(UINT32);
  /* Shift left by "shift" and divide by two is the same as shift
   * left by "shift-1".
   */
  int dataShift = shift - 1;
  /* Mask of the bits that survive an 8-bit left shift by dataShift;
   * used because there is no 8-bit SIMD shift (see below). */
  BYTE mask = (BYTE)(0xFFU << dataShift);

  /* Let's say the data is of the form:
   * a0y0o0g0 a1y1o1g1 a2y2o2g2...
   * Apply:
   * |R|   | 1  1/2 -1/2 |   |y|
   * |G| = | 1  0    1/2 | * |o|
   * |B|   | 1 -1/2 -1/2 |   |g|
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
   */

  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
  {
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                       shift, withAlpha);
  }

  for (UINT32 h = 0; h < height; h++)
  {
    UINT32 w = width;
    BOOL onStride = 0;

    /* Get to a 16-byte destination boundary. */
    if ((ULONG_PTR)dptr & 0x0f)
    {
      pstatus_t status = 0;
      /* Pixels (4 bytes each) needed to reach the next 16-byte boundary. */
      UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;

      if (startup > width)
        startup = width;

      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
                                           1, shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += startup * sizeof(UINT32);
      dptr += startup * sizeof(UINT32);
      w -= startup;
    }

    /* Each loop handles eight pixels at a time. */
    onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;

    while (w >= 8)
    {
      __m128i R0;
      __m128i R1;
      __m128i R2;
      __m128i R3;
      __m128i R4;
      __m128i R5;
      __m128i R6;
      __m128i R7;

      if (onStride)
      {
        /* The faster path, 16-byte aligned load. */
        R0 = _mm_load_si128((const __m128i*)sptr);
        sptr += (128 / 8);
        R1 = _mm_load_si128((const __m128i*)sptr);
        sptr += (128 / 8);
      }
      else
      {
        /* Off-stride, slower LDDQU load. */
        R0 = _mm_lddqu_si128((const __m128i*)sptr);
        sptr += (128 / 8);
        R1 = _mm_lddqu_si128((const __m128i*)sptr);
        sptr += (128 / 8);
      }

      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
      /* Shuffle to pack all the like types together. */
      R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
      R3 = _mm_shuffle_epi8(R0, R2);
      R4 = _mm_shuffle_epi8(R1, R2);
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
      R5 = _mm_unpackhi_epi32(R3, R4);
      R6 = _mm_unpacklo_epi32(R3, R4);

      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Save alphas aside */
      if (withAlpha)
        R7 = _mm_unpackhi_epi64(R5, R5);
      else
        R7 = _mm_set1_epi32(0xFFFFFFFFU);

      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
      R1 = _mm_set1_epi32(0);
      R0 = _mm_unpacklo_epi8(R5, R1);
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
       * Note: this must be done before sign-conversion.
       * Note also there is no slli_epi8, so we have to use a 16-bit
       * version and then mask.
       */
      R6 = _mm_slli_epi16(R6, dataShift);
      R1 = _mm_set1_epi8(mask);
      R6 = _mm_and_si128(R6, R1);
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Expand Co's from 8-bit signed to 16-bit signed */
      R1 = _mm_unpackhi_epi8(R6, R6);
      R1 = _mm_srai_epi16(R1, 8);
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
      /* Expand Cg's form 8-bit signed to 16-bit signed */
      R2 = _mm_unpacklo_epi8(R6, R6);
      R2 = _mm_srai_epi16(R2, 8);
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
      /* Get Y - halfCg and save */
      R6 = _mm_subs_epi16(R0, R2);
      /* R = (Y-halfCg) + halfCo */
      R3 = _mm_adds_epi16(R6, R1);
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
      /* G = Y + Cg(/2) */
      R4 = _mm_adds_epi16(R0, R2);
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
      /* B = (Y-halfCg) - Co(/2) */
      R5 = _mm_subs_epi16(R6, R1);
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
      /* Repack R's & B's.  */
      R0 = _mm_packus_epi16(R3, R5);
      /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
      /* Repack G's. */
      R1 = _mm_packus_epi16(R4, R4);
      /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
      /* And add the A's. */
      R1 = _mm_unpackhi_epi64(R1, R7);
      /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
      /* Now do interleaving again. */
      R2 = _mm_unpacklo_epi8(R0, R1);
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
      R3 = _mm_unpackhi_epi8(R0, R1);
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
      R4 = _mm_unpacklo_epi16(R2, R3);
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
      R5 = _mm_unpackhi_epi16(R2, R3);
      /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
      _mm_store_si128((__m128i*)dptr, R4);
      dptr += (128 / 8);
      _mm_store_si128((__m128i*)dptr, R5);
      dptr += (128 / 8);
      w -= 8;
    }

    /* Handle any remainder pixels. */
    if (w > 0)
    {
      pstatus_t status = 0;
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
                                           shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += w * sizeof(UINT32);
      dptr += w * sizeof(UINT32);
    }

    sptr += sRowBump;
    dptr += dRowBump;
  }

  return PRIMITIVES_SUCCESS;
}
222
223
/* ------------------------------------------------------------------------- */
224
/**
 * Convert YCoCg-R pixels to an RGB-ordered 32bpp image using SSSE3, eight
 * pixels per inner-loop iteration.
 *
 * Identical to ssse3_YCoCgRToRGB_8u_AC4R_invert except for the single
 * _mm_packus_epi16 line (marked below) that swaps the R/B pack order.
 *
 * Fixes vs. the previous revision:
 *  - `w` is now UINT32 (was int), matching the _invert variant and avoiding
 *    signed/unsigned mixing with `width` and `startup`.
 *  - dropped the meaningless `const` qualifier on the scalar ULONG_PTR cast.
 *
 * @param pSrc       input YCoCg-R data, 4 bytes per pixel
 * @param srcStep    input stride in bytes
 * @param pDst       output buffer; must be at least 4-byte aligned or the
 *                   whole call is punted to the generic implementation
 * @param DstFormat  destination format, forwarded to the generic fallback
 * @param dstStep    output stride in bytes
 * @param width      pixels per row
 * @param height     number of rows
 * @param shift      chroma up-shift. NOTE(review): assumed >= 1 — shift == 0
 *                   makes dataShift negative (undefined shift); confirm callers.
 * @param withAlpha  TRUE to keep source alpha, FALSE to force opaque 0xFF
 * @return PRIMITIVES_SUCCESS, or the status of a failing generic fallback
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
  const BYTE* sptr = pSrc;
  BYTE* dptr = pDst;
  /* Bytes to skip from the end of one row's pixels to the start of the next. */
  int sRowBump = srcStep - width * sizeof(UINT32);
  int dRowBump = dstStep - width * sizeof(UINT32);
  /* Shift left by "shift" and divide by two is the same as shift
   * left by "shift-1".
   */
  int dataShift = shift - 1;
  BYTE mask = (BYTE)(0xFFU << dataShift);

  /* Let's say the data is of the form:
   * a0y0o0g0 a1y1o1g1 a2y2o2g2...
   * Apply:
   * |R|   | 1  1/2 -1/2 |   |y|
   * |G| = | 1  0    1/2 | * |o|
   * |B|   | 1 -1/2 -1/2 |   |g|
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
   */

  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
  {
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                       shift, withAlpha);
  }

  for (UINT32 h = 0; h < height; h++)
  {
    UINT32 w = width; /* UINT32 to match the _invert variant; was int */
    BOOL onStride = 0;

    /* Get to a 16-byte destination boundary. */
    if ((ULONG_PTR)dptr & 0x0f)
    {
      pstatus_t status = 0;
      UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;

      if (startup > width)
        startup = width;

      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
                                           1, shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += startup * sizeof(UINT32);
      dptr += startup * sizeof(UINT32);
      w -= startup;
    }

    /* Each loop handles eight pixels at a time. */
    onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;

    while (w >= 8)
    {
      __m128i R0;
      __m128i R1;
      __m128i R2;
      __m128i R3;
      __m128i R4;
      __m128i R5;
      __m128i R6;
      __m128i R7;

      if (onStride)
      {
        /* The faster path, 16-byte aligned load. */
        R0 = _mm_load_si128((const __m128i*)sptr);
        sptr += (128 / 8);
        R1 = _mm_load_si128((const __m128i*)sptr);
        sptr += (128 / 8);
      }
      else
      {
        /* Off-stride, slower LDDQU load. */
        R0 = _mm_lddqu_si128((const __m128i*)sptr);
        sptr += (128 / 8);
        R1 = _mm_lddqu_si128((const __m128i*)sptr);
        sptr += (128 / 8);
      }

      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
      /* Shuffle to pack all the like types together. */
      R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
      R3 = _mm_shuffle_epi8(R0, R2);
      R4 = _mm_shuffle_epi8(R1, R2);
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
      R5 = _mm_unpackhi_epi32(R3, R4);
      R6 = _mm_unpacklo_epi32(R3, R4);

      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Save alphas aside */
      if (withAlpha)
        R7 = _mm_unpackhi_epi64(R5, R5);
      else
        R7 = _mm_set1_epi32(0xFFFFFFFFU);

      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
      R1 = _mm_set1_epi32(0);
      R0 = _mm_unpacklo_epi8(R5, R1);
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
       * Note: this must be done before sign-conversion.
       * Note also there is no slli_epi8, so we have to use a 16-bit
       * version and then mask.
       */
      R6 = _mm_slli_epi16(R6, dataShift);
      R1 = _mm_set1_epi8(mask);
      R6 = _mm_and_si128(R6, R1);
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Expand Co's from 8-bit signed to 16-bit signed */
      R1 = _mm_unpackhi_epi8(R6, R6);
      R1 = _mm_srai_epi16(R1, 8);
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
      /* Expand Cg's form 8-bit signed to 16-bit signed */
      R2 = _mm_unpacklo_epi8(R6, R6);
      R2 = _mm_srai_epi16(R2, 8);
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
      /* Get Y - halfCg and save */
      R6 = _mm_subs_epi16(R0, R2);
      /* R = (Y-halfCg) + halfCo */
      R3 = _mm_adds_epi16(R6, R1);
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
      /* G = Y + Cg(/2) */
      R4 = _mm_adds_epi16(R0, R2);
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
      /* B = (Y-halfCg) - Co(/2) */
      R5 = _mm_subs_epi16(R6, R1);
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
      /* Repack R's & B's.  */
      /* This line is the only diff between inverted and non-inverted.
       * Unfortunately, it would be expensive to check "inverted"
       * every time through this loop.
       */
      R0 = _mm_packus_epi16(R5, R3);
      /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
      /* Repack G's. */
      R1 = _mm_packus_epi16(R4, R4);
      /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
      /* And add the A's. */
      R1 = _mm_unpackhi_epi64(R1, R7);
      /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
      /* Now do interleaving again. */
      R2 = _mm_unpacklo_epi8(R0, R1);
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
      R3 = _mm_unpackhi_epi8(R0, R1);
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
      R4 = _mm_unpacklo_epi16(R2, R3);
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
      R5 = _mm_unpackhi_epi16(R2, R3);
      /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
      _mm_store_si128((__m128i*)dptr, R4);
      dptr += (128 / 8);
      _mm_store_si128((__m128i*)dptr, R5);
      dptr += (128 / 8);
      w -= 8;
    }

    /* Handle any remainder pixels. */
    if (w > 0)
    {
      pstatus_t status = 0;
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
                                           shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += w * sizeof(UINT32);
      dptr += w * sizeof(UINT32);
    }

    sptr += sRowBump;
    dptr += dRowBump;
  }

  return PRIMITIVES_SUCCESS;
}
412
413
/* ------------------------------------------------------------------------- */
414
/**
 * Entry point for SSSE3 YCoCg-R -> RGB conversion: dispatch on the
 * destination layout.  BGR-ordered formats take the channel-inverting
 * kernel, RGB-ordered ones the straight kernel, and anything else falls
 * back to the generic implementation.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
  if ((DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32))
    return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
                                            height, shift, withAlpha);

  if ((DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32))
    return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
                                               height, shift, withAlpha);

  return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                     shift, withAlpha);
}
436
437
#endif
438
439
/* ------------------------------------------------------------------------- */
440
/* Register the SSSE3-accelerated YCoCg conversion when the build and the
 * running CPU support it; otherwise the generic entries stay in place. */
void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE2_ENABLED)
  generic = primitives_get_generic();
  primitives_init_YCoCg(prims);

  /* The kernel uses both SSE3 (lddqu) and SSSE3 (pshufb) instructions,
   * so require both CPU features before installing it. */
  const BOOL haveSsse3 = IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
                         IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE);

  if (haveSsse3)
  {
    WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
    prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
  }
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
  WINPR_UNUSED(prims);
#endif
}