Coverage Report

Created: 2025-07-01 06:46

/src/FreeRDP/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Optimized YCoCg<->RGB conversion operations.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
6
 *
7
 * Licensed under the Apache License, Version 2.0 (the "License");
8
 * you may not use this file except in compliance with the License.
9
 * You may obtain a copy of the License at
10
 *
11
 *     http://www.apache.org/licenses/LICENSE-2.0
12
 *
13
 * Unless required by applicable law or agreed to in writing, software
14
 * distributed under the License is distributed on an "AS IS" BASIS,
15
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 * See the License for the specific language governing permissions and
17
 * limitations under the License.
18
 */
19
20
#include <freerdp/config.h>
21
22
#include <freerdp/types.h>
23
#include <freerdp/primitives.h>
24
#include <winpr/sysinfo.h>
25
26
#include "prim_YCoCg.h"
27
28
#include "prim_internal.h"
29
#include "prim_templates.h"
30
31
#if defined(SSE_AVX_INTRINSICS_ENABLED)
32
#include <emmintrin.h>
33
#include <tmmintrin.h>
34
35
static primitives_t* generic = NULL;
36
37
/* ------------------------------------------------------------------------- */
/* YCoCg -> RGB conversion for destinations whose R and B byte positions are
 * swapped relative to the straight path (selected by the dispatcher for
 * BGRX32/BGRA32).  Processes 8 pixels per iteration with 128-bit SIMD;
 * frames that are too narrow or whose destination is not 4-byte aligned,
 * and any per-row remainder of fewer than 8 pixels, are delegated to the
 * generic C implementation.
 *
 * pSrc, srcStep  source pixels (4 bytes each) and row stride in bytes
 * pDst, dstStep  destination pixels and row stride in bytes
 * shift          chroma scale; Co/Cg are shifted left by (shift - 1),
 *                the -1 folding in the transform's division by two
 * withAlpha      TRUE: keep source alpha bytes; FALSE: force alpha to 0xFF
 *
 * NOTE(review): dataShift = shift - 1 assumes shift >= 1; shift == 0 would
 * yield a negative SIMD shift count — confirm callers never pass 0.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
  const BYTE* sptr = pSrc;
  BYTE* dptr = pDst;

  WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
  WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
  /* Byte distance from the end of one row's pixels to the start of the next. */
  const size_t sRowBump = srcStep - width * sizeof(UINT32);
  const size_t dRowBump = dstStep - width * sizeof(UINT32);
  /* Shift left by "shift" and divide by two is the same as shift
   * left by "shift-1".
   */
  int dataShift = shift - 1;
  /* Mask that clears the bits a per-byte left shift would have discarded
   * (applied after the 16-bit shift below, since there is no 8-bit shift). */
  BYTE mask = (BYTE)(0xFFU << dataShift);

  /* Let's say the data is of the form:
   * y0y0o0g0 a1y1o1g1 a2y2o2g2...
   * Apply:
   * |R|   | 1  1/2 -1/2 |   |y|
   * |G| = | 1  0    1/2 | * |o|
   * |B|   | 1 -1/2 -1/2 |   |g|
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
   */

  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
  {
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
    return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                       DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                       width, height, shift, withAlpha);
  }

  for (UINT32 h = 0; h < height; h++)
  {
    UINT32 w = width;

    while (w >= 8)
    {
      __m128i R0;
      __m128i R1;
      __m128i R2;
      __m128i R3;
      __m128i R4;
      __m128i R5;
      __m128i R6;
      __m128i R7;

      /* Load two 16-byte groups = 8 source pixels. */
      R0 = LOAD_SI128(sptr);
      sptr += (128 / 8);
      R1 = LOAD_SI128(sptr);
      sptr += (128 / 8);

      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
      /* Shuffle to pack all the like types together. */
      R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
      R3 = _mm_shuffle_epi8(R0, R2);
      R4 = _mm_shuffle_epi8(R1, R2);
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
      R5 = _mm_unpackhi_epi32(R3, R4);
      R6 = _mm_unpacklo_epi32(R3, R4);

      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Save alphas aside */
      if (withAlpha)
        R7 = _mm_unpackhi_epi64(R5, R5);
      else
        R7 = mm_set1_epu32(0xFFFFFFFFU); /* opaque: all alpha bytes 0xFF */

      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
      R1 = mm_set1_epu32(0);
      R0 = _mm_unpacklo_epi8(R5, R1);
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
       * Note: this must be done before sign-conversion.
       * Note also there is no slli_epi8, so we have to use a 16-bit
       * version and then mask.
       */
      R6 = _mm_slli_epi16(R6, dataShift);
      R1 = mm_set1_epu8(mask);
      R6 = _mm_and_si128(R6, R1);
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Expand Co's from 8-bit signed to 16-bit signed */
      R1 = _mm_unpackhi_epi8(R6, R6);
      R1 = _mm_srai_epi16(R1, 8);
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
      /* Expand Cg's form 8-bit signed to 16-bit signed */
      R2 = _mm_unpacklo_epi8(R6, R6);
      R2 = _mm_srai_epi16(R2, 8);
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
      /* Get Y - halfCg and save */
      R6 = _mm_subs_epi16(R0, R2);
      /* R = (Y-halfCg) + halfCo */
      R3 = _mm_adds_epi16(R6, R1);
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
      /* G = Y + Cg(/2) */
      R4 = _mm_adds_epi16(R0, R2);
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
      /* B = (Y-halfCg) - Co(/2) */
      R5 = _mm_subs_epi16(R6, R1);
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
      /* Repack R's & B's.  (R first: this is the "inverted" channel order;
       * the only difference from the no_invert variant.) */
      R0 = _mm_packus_epi16(R3, R5);
      /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
      /* Repack G's. */
      R1 = _mm_packus_epi16(R4, R4);
      /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
      /* And add the A's. */
      R1 = _mm_unpackhi_epi64(R1, R7);
      /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
      /* Now do interleaving again. */
      R2 = _mm_unpacklo_epi8(R0, R1);
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
      R3 = _mm_unpackhi_epi8(R0, R1);
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
      R4 = _mm_unpacklo_epi16(R2, R3);
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
      R5 = _mm_unpackhi_epi16(R2, R3);
      /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
      STORE_SI128(dptr, R4);
      dptr += (128 / 8);
      STORE_SI128(dptr, R5);
      dptr += (128 / 8);
      w -= 8;
    }

    /* Handle any remainder pixels. */
    if (w > 0)
    {
      pstatus_t status = 0;
      /* Convert the < 8 leftover pixels of this row as a 1-row image. */
      status = generic->YCoCgToRGB_8u_AC4R(
          sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
          WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += w * sizeof(UINT32);
      dptr += w * sizeof(UINT32);
    }

    /* Skip any stride padding to reach the next row. */
    sptr += sRowBump;
    dptr += dRowBump;
  }

  return PRIMITIVES_SUCCESS;
}
191
192
/* ------------------------------------------------------------------------- */
/* YCoCg -> RGB conversion with the straight (non-swapped) R/B channel
 * order (selected by the dispatcher for RGBX32/RGBA32).  Identical SIMD
 * pipeline to the _invert variant except for one pack instruction; see the
 * comment at that line.  Processes 8 pixels per iteration; narrow or
 * misaligned frames, and per-row remainders, fall back to the generic
 * implementation.
 *
 * pSrc, srcStep  source pixels (4 bytes each) and row stride in bytes
 * pDst, dstStep  destination pixels and row stride in bytes
 * shift          chroma scale; Co/Cg are shifted left by (shift - 1)
 * withAlpha      TRUE: keep source alpha bytes; FALSE: force alpha to 0xFF
 *
 * NOTE(review): dataShift = shift - 1 assumes shift >= 1; shift == 0 would
 * yield a negative SIMD shift count — confirm callers never pass 0.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
  const BYTE* sptr = pSrc;
  BYTE* dptr = pDst;
  /* Byte distance from the end of one row's pixels to the start of the next. */
  size_t sRowBump = srcStep - width * sizeof(UINT32);
  size_t dRowBump = dstStep - width * sizeof(UINT32);
  /* Shift left by "shift" and divide by two is the same as shift
   * left by "shift-1".
   */
  int dataShift = shift - 1;
  /* Mask clearing the bits a per-byte left shift would have discarded. */
  BYTE mask = (BYTE)(0xFFU << dataShift);

  /* Let's say the data is of the form:
   * y0y0o0g0 a1y1o1g1 a2y2o2g2...
   * Apply:
   * |R|   | 1  1/2 -1/2 |   |y|
   * |G| = | 1  0    1/2 | * |o|
   * |B|   | 1 -1/2 -1/2 |   |g|
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
   */

  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
  {
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
    return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                       DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                       width, height, shift, withAlpha);
  }

  for (UINT32 h = 0; h < height; h++)
  {
    UINT32 w = width;

    while (w >= 8)
    {
      __m128i R7;

      /* The faster path, 16-byte aligned load. */
      __m128i R0 = LOAD_SI128(sptr);
      sptr += (128 / 8);
      __m128i R1 = LOAD_SI128(sptr);
      sptr += (128 / 8);

      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
      /* Shuffle to pack all the like types together. */
      __m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
      __m128i R3 = _mm_shuffle_epi8(R0, R2);
      __m128i R4 = _mm_shuffle_epi8(R1, R2);
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
      __m128i R5 = _mm_unpackhi_epi32(R3, R4);
      __m128i R6 = _mm_unpacklo_epi32(R3, R4);

      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Save alphas aside */
      if (withAlpha)
        R7 = _mm_unpackhi_epi64(R5, R5);
      else
        R7 = mm_set1_epu32(0xFFFFFFFFU); /* opaque: all alpha bytes 0xFF */

      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
      R1 = mm_set1_epu32(0);
      R0 = _mm_unpacklo_epi8(R5, R1);
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
       * Note: this must be done before sign-conversion.
       * Note also there is no slli_epi8, so we have to use a 16-bit
       * version and then mask.
       */
      R6 = _mm_slli_epi16(R6, dataShift);
      R1 = mm_set1_epu8(mask);
      R6 = _mm_and_si128(R6, R1);
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Expand Co's from 8-bit signed to 16-bit signed */
      R1 = _mm_unpackhi_epi8(R6, R6);
      R1 = _mm_srai_epi16(R1, 8);
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
      /* Expand Cg's form 8-bit signed to 16-bit signed */
      R2 = _mm_unpacklo_epi8(R6, R6);
      R2 = _mm_srai_epi16(R2, 8);
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
      /* Get Y - halfCg and save */
      R6 = _mm_subs_epi16(R0, R2);
      /* R = (Y-halfCg) + halfCo */
      R3 = _mm_adds_epi16(R6, R1);
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
      /* G = Y + Cg(/2) */
      R4 = _mm_adds_epi16(R0, R2);
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
      /* B = (Y-halfCg) - Co(/2) */
      R5 = _mm_subs_epi16(R6, R1);
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
      /* Repack R's & B's.  */
      /* This line is the only diff between inverted and non-inverted.
       * Unfortunately, it would be expensive to check "inverted"
       * every time through this loop.
       */
      R0 = _mm_packus_epi16(R5, R3);
      /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
      /* Repack G's. */
      R1 = _mm_packus_epi16(R4, R4);
      /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
      /* And add the A's. */
      R1 = _mm_unpackhi_epi64(R1, R7);
      /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
      /* Now do interleaving again. */
      R2 = _mm_unpacklo_epi8(R0, R1);
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
      R3 = _mm_unpackhi_epi8(R0, R1);
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
      R4 = _mm_unpacklo_epi16(R2, R3);
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
      R5 = _mm_unpackhi_epi16(R2, R3);
      /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
      STORE_SI128(dptr, R4);
      dptr += (128 / 8);
      STORE_SI128(dptr, R5);
      dptr += (128 / 8);
      w -= 8;
    }

    /* Handle any remainder pixels. */
    if (w > 0)
    {
      pstatus_t status = 0;
      /* Convert the < 8 leftover pixels of this row as a 1-row image. */
      status = generic->YCoCgToRGB_8u_AC4R(
          sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
          WINPR_ASSERTING_INT_CAST(INT32, dstStep), WINPR_ASSERTING_INT_CAST(UINT32, w), 1,
          shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
      dptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
    }

    /* Skip any stride padding to reach the next row. */
    sptr += sRowBump;
    dptr += dRowBump;
  }

  return PRIMITIVES_SUCCESS;
}
342
343
/* ------------------------------------------------------------------------- */
344
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
345
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
346
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
347
                                           BOOL withAlpha)
348
0
{
349
0
  switch (DstFormat)
350
0
  {
351
0
    case PIXEL_FORMAT_BGRX32:
352
0
    case PIXEL_FORMAT_BGRA32:
353
0
      return ssse3_YCoCgRToRGB_8u_AC4R_invert(
354
0
          pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
355
0
          WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
356
357
0
    case PIXEL_FORMAT_RGBX32:
358
0
    case PIXEL_FORMAT_RGBA32:
359
0
      return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
360
0
          pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
361
0
          WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
362
363
0
    default:
364
0
      return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
365
0
                                         height, shift, withAlpha);
366
0
  }
367
0
}
368
369
#endif
370
371
/* ------------------------------------------------------------------------- */
/* Install the SSSE3-optimized YCoCg->RGB converter into the primitives
 * function table.  In builds without SSE/AVX intrinsics support this only
 * emits a verbose log message and leaves the table untouched. */
void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
  /* Cache the generic implementations used by the SIMD fallback paths. */
  generic = primitives_get_generic();

  WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
  prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
  WINPR_UNUSED(prims);
#endif
}