Coverage Report

Created: 2025-07-01 06:46

/src/FreeRDP/libfreerdp/codec/neon/rfx_neon.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
   FreeRDP: A Remote Desktop Protocol Implementation
3
   RemoteFX Codec Library - NEON Optimizations
4
5
   Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
6
7
   Licensed under the Apache License, Version 2.0 (the "License");
8
   you may not use this file except in compliance with the License.
9
   You may obtain a copy of the License at
10
11
       http://www.apache.org/licenses/LICENSE-2.0
12
13
   Unless required by applicable law or agreed to in writing, software
14
   distributed under the License is distributed on an "AS IS" BASIS,
15
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
   See the License for the specific language governing permissions and
17
   limitations under the License.
18
*/
19
20
#include <winpr/platform.h>
21
#include <freerdp/config.h>
22
#include <freerdp/log.h>
23
24
#include "../rfx_types.h"
25
#include "rfx_neon.h"
26
27
#include "../../core/simd.h"
28
29
#if defined(NEON_INTRINSICS_ENABLED)
30
31
#include <stdio.h>
32
#include <stdlib.h>
33
#include <string.h>
34
#include <arm_neon.h>
35
#include <winpr/sysinfo.h>
36
37
/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
38
39
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
40
rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
41
{
42
  int16x8_t quantFactors = vdupq_n_s16(factor);
43
  int16x8_t* buf = (int16x8_t*)buffer;
44
  int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
45
46
  do
47
  {
48
    int16x8_t val = vld1q_s16((INT16*)buf);
49
    val = vshlq_s16(val, quantFactors);
50
    vst1q_s16((INT16*)buf, val);
51
    buf++;
52
  } while (buf < buf_end);
53
}
54
55
/**
 * Dequantize all ten sub-bands of a coefficient buffer in place.
 *
 * Each sub-band is shifted by (quantVals[index] - 1) bits, where the index
 * used for every sub-band is listed in the table below.
 *
 * @param buffer    coefficient buffer, indexed up to element 4095
 * @param quantVals quantization values, indexed 0..9
 */
static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
{
  /* One entry per sub-band: start offset, coefficient count, quantVals index. */
  static const struct
  {
    size_t offset;
    size_t count;
    size_t quantIndex;
  } subbands[] = {
    { 0, 1024, 8 },    /* HL1 */
    { 1024, 1024, 7 }, /* LH1 */
    { 2048, 1024, 9 }, /* HH1 */
    { 3072, 256, 5 },  /* HL2 */
    { 3328, 256, 4 },  /* LH2 */
    { 3584, 256, 6 },  /* HH2 */
    { 3840, 64, 2 },   /* HL3 */
    { 3904, 64, 1 },   /* LH3 */
    { 3968, 64, 3 },   /* HH3 */
    { 4032, 64, 0 }    /* LL3 */
  };

  WINPR_ASSERT(buffer);
  WINPR_ASSERT(quantVals);

  for (size_t i = 0; i < sizeof(subbands) / sizeof(subbands[0]); i++)
  {
    const UINT32 shift = quantVals[subbands[i].quantIndex] - 1;
    rfx_quantization_decode_block_NEON(&buffer[subbands[i].offset], subbands[i].count, shift);
  }
}
71
72
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
73
rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
74
                                   INT16* WINPR_RESTRICT dst, size_t subband_width)
75
{
76
  INT16* l_ptr = l;
77
  INT16* h_ptr = h;
78
  INT16* dst_ptr = dst;
79
80
  for (size_t y = 0; y < subband_width; y++)
81
  {
82
    /* Even coefficients */
83
    for (size_t n = 0; n < subband_width; n += 8)
84
    {
85
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
86
      int16x8_t l_n = vld1q_s16(l_ptr);
87
      int16x8_t h_n = vld1q_s16(h_ptr);
88
      int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
89
90
      if (n == 0)
91
      {
92
        int16_t first = vgetq_lane_s16(h_n_m, 1);
93
        h_n_m = vsetq_lane_s16(first, h_n_m, 0);
94
      }
95
96
      int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
97
      tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
98
      tmp_n = vshrq_n_s16(tmp_n, 1);
99
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
100
      vst1q_s16(l_ptr, dst_n);
101
      l_ptr += 8;
102
      h_ptr += 8;
103
    }
104
105
    l_ptr -= subband_width;
106
    h_ptr -= subband_width;
107
108
    /* Odd coefficients */
109
    for (size_t n = 0; n < subband_width; n += 8)
110
    {
111
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
112
      int16x8_t h_n = vld1q_s16(h_ptr);
113
      h_n = vshlq_n_s16(h_n, 1);
114
      int16x8x2_t dst_n;
115
      dst_n.val[0] = vld1q_s16(l_ptr);
116
      int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
117
118
      if (n == subband_width - 8)
119
      {
120
        int16_t last = vgetq_lane_s16(dst_n_p, 6);
121
        dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
122
      }
123
124
      dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
125
      dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
126
      dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
127
      vst2q_s16(dst_ptr, dst_n);
128
      l_ptr += 8;
129
      h_ptr += 8;
130
      dst_ptr += 16;
131
    }
132
  }
133
}
134
135
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
136
rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
137
                                  INT16* WINPR_RESTRICT dst, size_t subband_width)
138
{
139
  INT16* l_ptr = l;
140
  INT16* h_ptr = h;
141
  INT16* dst_ptr = dst;
142
  const size_t total_width = subband_width + subband_width;
143
144
  /* Even coefficients */
145
  for (size_t n = 0; n < subband_width; n++)
146
  {
147
    for (size_t x = 0; x < total_width; x += 8)
148
    {
149
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
150
      int16x8_t l_n = vld1q_s16(l_ptr);
151
      int16x8_t h_n = vld1q_s16(h_ptr);
152
      int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
153
154
      if (n == 0)
155
        tmp_n = vaddq_s16(tmp_n, h_n);
156
      else
157
      {
158
        int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
159
        tmp_n = vaddq_s16(tmp_n, h_n_m);
160
      }
161
162
      tmp_n = vshrq_n_s16(tmp_n, 1);
163
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
164
      vst1q_s16(dst_ptr, dst_n);
165
      l_ptr += 8;
166
      h_ptr += 8;
167
      dst_ptr += 8;
168
    }
169
170
    dst_ptr += total_width;
171
  }
172
173
  h_ptr = h;
174
  dst_ptr = dst + total_width;
175
176
  /* Odd coefficients */
177
  for (size_t n = 0; n < subband_width; n++)
178
  {
179
    for (size_t x = 0; x < total_width; x += 8)
180
    {
181
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
182
      int16x8_t h_n = vld1q_s16(h_ptr);
183
      int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
184
      h_n = vshlq_n_s16(h_n, 1);
185
      int16x8_t tmp_n = dst_n_m;
186
187
      if (n == subband_width - 1)
188
        tmp_n = vaddq_s16(tmp_n, dst_n_m);
189
      else
190
      {
191
        int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
192
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
193
      }
194
195
      tmp_n = vshrq_n_s16(tmp_n, 1);
196
      int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
197
      vst1q_s16(dst_ptr, dst_n);
198
      h_ptr += 8;
199
      dst_ptr += 8;
200
    }
201
202
    dst_ptr += total_width;
203
  }
204
}
205
206
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207
rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
208
                             size_t subband_width)
209
{
210
  INT16 *hl, *lh, *hh, *ll;
211
  INT16 *l_dst, *h_dst;
212
  /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
213
   */
214
  /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
215
  /* The lower part L uses LL(3) and HL(0). */
216
  /* The higher part H uses LH(1) and HH(2). */
217
  ll = buffer + subband_width * subband_width * 3;
218
  hl = buffer;
219
  l_dst = idwt;
220
  rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
221
  lh = buffer + subband_width * subband_width;
222
  hh = buffer + subband_width * subband_width * 2;
223
  h_dst = idwt + subband_width * subband_width * 2;
224
  rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
225
  /* Inverse DWT in vertical direction, results are stored in original buffer. */
226
  rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
227
}
228
229
/**
 * Inverse 2D DWT over three decomposition levels, processed from the smallest
 * sub-bands (width 8, at offset 3840) to the largest (width 32, at offset 0).
 *
 * @param buffer     coefficient buffer, transformed in place
 * @param dwt_buffer scratch buffer used by every level
 */
static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
{
  /* Same pointer checks as the other decode entry points in this file. */
  WINPR_ASSERT(buffer);
  WINPR_ASSERT(dwt_buffer);

  rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
  rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
  rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
}
235
236
static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
237
                                                   const INT16* restrict pHighBand,
238
                                                   size_t nHighStep, INT16* restrict pDstBand,
239
                                                   size_t nDstStep, size_t nLowCount,
240
                                                   size_t nHighCount, size_t nDstCount)
241
{
242
  WINPR_ASSERT(pLowBand);
243
  WINPR_ASSERT(pHighBand);
244
  WINPR_ASSERT(pDstBand);
245
246
  INT16* l_ptr = pLowBand;
247
  const INT16* h_ptr = pHighBand;
248
  INT16* dst_ptr = pDstBand;
249
  size_t batchSize = (nLowCount + nHighCount) >> 1;
250
251
  for (size_t y = 0; y < nDstCount; y++)
252
  {
253
    /* Even coefficients */
254
    size_t n = 0;
255
    for (; n < batchSize; n += 8)
256
    {
257
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
258
      int16x8_t l_n = vld1q_s16(l_ptr);
259
      int16x8_t h_n = vld1q_s16(h_ptr);
260
      int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
261
262
      if (n == 0)
263
      {
264
        int16_t first = vgetq_lane_s16(h_n_m, 1);
265
        h_n_m = vsetq_lane_s16(first, h_n_m, 0);
266
      }
267
      else if (n == 24)
268
        h_n = vsetq_lane_s16(0, h_n, 7);
269
270
      int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
271
      tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
272
      tmp_n = vshrq_n_s16(tmp_n, 1);
273
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
274
      vst1q_s16(l_ptr, dst_n);
275
      l_ptr += 8;
276
      h_ptr += 8;
277
    }
278
    if (n < 32)
279
      *l_ptr -= *(h_ptr - 1);
280
281
    l_ptr -= batchSize;
282
    h_ptr -= batchSize;
283
284
    /* Odd coefficients */
285
    n = 0;
286
    for (; n < batchSize; n += 8)
287
    {
288
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
289
      int16x8_t h_n = vld1q_s16(h_ptr);
290
      h_n = vshlq_n_s16(h_n, 1);
291
      int16x8x2_t dst_n;
292
      dst_n.val[0] = vld1q_s16(l_ptr);
293
      int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
294
295
      if (n == 24)
296
        h_n = vsetq_lane_s16(0, h_n, 7);
297
298
      dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
299
      dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
300
      dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
301
      vst2q_s16(dst_ptr, dst_n);
302
      l_ptr += 8;
303
      h_ptr += 8;
304
      dst_ptr += 16;
305
    }
306
    if (n == 32)
307
    {
308
      h_ptr -= 1;
309
      l_ptr += 1;
310
    }
311
    else
312
    {
313
      *dst_ptr = *l_ptr;
314
      l_ptr += 1;
315
      dst_ptr += 1;
316
    }
317
  }
318
}
319
320
static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
321
                                                  const INT16* restrict pHighBand, size_t nHighStep,
322
                                                  INT16* restrict pDstBand, size_t nDstStep,
323
                                                  size_t nLowCount, size_t nHighCount,
324
                                                  size_t nDstCount)
325
{
326
  WINPR_ASSERT(pLowBand);
327
  WINPR_ASSERT(pHighBand);
328
  WINPR_ASSERT(pDstBand);
329
330
  const INT16* l_ptr = pLowBand;
331
  const INT16* h_ptr = pHighBand;
332
  INT16* dst_ptr = pDstBand;
333
  size_t batchSize = (nDstCount >> 3) << 3;
334
  size_t forceBandSize = (nLowCount + nHighCount) >> 1;
335
336
  /* Even coefficients */
337
  for (size_t n = 0; n < forceBandSize; n++)
338
  {
339
    for (size_t x = 0; x < batchSize; x += 8)
340
    {
341
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
342
      int16x8_t l_n = vld1q_s16(l_ptr);
343
      int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
344
      int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
345
346
      if (n == 0)
347
        tmp_n = vaddq_s16(tmp_n, h_n);
348
      else if (n < 31)
349
      {
350
        int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
351
        tmp_n = vaddq_s16(tmp_n, h_n_m);
352
      }
353
354
      tmp_n = vshrq_n_s16(tmp_n, 1);
355
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
356
      vst1q_s16(dst_ptr, dst_n);
357
      l_ptr += 8;
358
      h_ptr += 8;
359
      dst_ptr += 8;
360
    }
361
362
    if (nDstCount > batchSize)
363
    {
364
      int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
365
      int16_t tmp_n = h_n + 1;
366
      if (n == 0)
367
        tmp_n += h_n;
368
      else if (n < 31)
369
        tmp_n += *(h_ptr - nHighStep);
370
      tmp_n >>= 1;
371
      *dst_ptr = *l_ptr - tmp_n;
372
      l_ptr += 1;
373
      h_ptr += 1;
374
      dst_ptr += 1;
375
    }
376
377
    dst_ptr += nDstStep;
378
  }
379
380
  if (forceBandSize < 32)
381
  {
382
    for (size_t x = 0; x < batchSize; x += 8)
383
    {
384
      int16x8_t l_n = vld1q_s16(l_ptr);
385
      int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
386
      int16x8_t tmp_n = vsubq_s16(l_n, h_n);
387
      vst1q_s16(dst_ptr, tmp_n);
388
      l_ptr += 8;
389
      h_ptr += 8;
390
      dst_ptr += 8;
391
    }
392
393
    if (nDstCount > batchSize)
394
    {
395
      *dst_ptr = *l_ptr - *(h_ptr - nHighStep);
396
      l_ptr += 1;
397
      h_ptr += 1;
398
      dst_ptr += 1;
399
    }
400
  }
401
402
  h_ptr = pHighBand;
403
  dst_ptr = pDstBand + nDstStep;
404
405
  /* Odd coefficients */
406
  for (size_t n = 0; n < forceBandSize; n++)
407
  {
408
    for (size_t x = 0; x < batchSize; x += 8)
409
    {
410
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
411
      int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
412
      if (n == 31)
413
      {
414
        int16x8_t dst_n_p = vld1q_s16(l_ptr);
415
        l_ptr += 8;
416
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
417
        tmp_n = vshrq_n_s16(tmp_n, 1);
418
      }
419
      else
420
      {
421
        int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
422
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
423
        tmp_n = vshrq_n_s16(tmp_n, 1);
424
        int16x8_t h_n = vld1q_s16(h_ptr);
425
        h_n = vshlq_n_s16(h_n, 1);
426
        tmp_n = vaddq_s16(tmp_n, h_n);
427
      }
428
      vst1q_s16(dst_ptr, tmp_n);
429
      h_ptr += 8;
430
      dst_ptr += 8;
431
    }
432
433
    if (nDstCount > batchSize)
434
    {
435
      int16_t tmp_n = *(dst_ptr - nDstStep);
436
      if (n == 31)
437
      {
438
        int16_t dst_n_p = *l_ptr;
439
        l_ptr += 1;
440
        tmp_n += dst_n_p;
441
        tmp_n >>= 1;
442
      }
443
      else
444
      {
445
        int16_t dst_n_p = *(dst_ptr + nDstStep);
446
        tmp_n += dst_n_p;
447
        tmp_n >>= 1;
448
        int16_t h_n = *h_ptr;
449
        h_n <<= 1;
450
        tmp_n += h_n;
451
      }
452
      *dst_ptr = tmp_n;
453
      h_ptr += 1;
454
      dst_ptr += 1;
455
    }
456
457
    dst_ptr += nDstStep;
458
  }
459
}
460
461
/**
 * Size of the low band at the given level: (64 >> level) + 1.
 *
 * @param level decomposition level
 * @return low band count for that level
 */
static INLINE size_t prfx_get_band_l_count(size_t level)
{
  const int halved = 64 >> level;

  return halved + 1;
}
465
466
/**
 * Size of the high band at the given level.
 *
 * Level 1 is a special case, (64 >> 1) - 1; every other level yields
 * (64 + (1 << (level - 1))) >> level.
 *
 * @param level decomposition level
 * @return high band count for that level
 */
static INLINE size_t prfx_get_band_h_count(size_t level)
{
  if (level != 1)
  {
    const int rounding = 1 << (level - 1);
    return (64 + rounding) >> level;
  }

  return (64 >> 1) - 1;
}
473
474
/**
 * Inverse 2D DWT of one level using the extrapolated band layout.
 *
 * The sub-bands are stored back to back in `buffer` in the order HL, LH, HH,
 * LL, with sizes derived from prfx_get_band_l_count/prfx_get_band_h_count. Two
 * horizontal passes write the L and H parts into `temp`; the vertical pass
 * combines them and writes the result to the start of `buffer`.
 *
 * @param buffer sub-band input for this level, overwritten with the result
 * @param temp   scratch buffer for the horizontal pass output
 * @param level  decomposition level
 */
static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
                                                            size_t level)
{
  WINPR_ASSERT(buffer);
  WINPR_ASSERT(temp);

  const size_t nBandL = prfx_get_band_l_count(level);
  const size_t nBandH = prfx_get_band_h_count(level);
  const size_t nDstStepX = nBandL + nBandH;
  const size_t nDstStepY = nBandL + nBandH;

  /* Input sub-bands, laid out consecutively. */
  INT16* const HL = &buffer[0];
  INT16* const LH = HL + (nBandH * nBandL);
  INT16* const HH = LH + (nBandL * nBandH);
  INT16* const LL = HH + (nBandH * nBandH);

  /* Horizontal pass output inside the scratch buffer: L first, then H. */
  INT16* const L = &temp[0];
  INT16* const H = L + (nBandL * nDstStepX);

  /* The final result goes to the start of the buffer. */
  INT16* const LLx = &buffer[0];

  /* horizontal (LL + HL -> L) */
  rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);

  /* horizontal (LH + HH -> H) */
  rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);

  /* vertical (L + H -> LL) */
  rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
                                 nBandL + nBandH);
}
515
516
/**
 * Inverse 2D DWT over three levels using the extrapolated band layout,
 * processed from level 3 down to level 1.
 *
 * @param buffer coefficient buffer, transformed in place
 * @param temp   scratch buffer shared by all levels
 */
static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
{
  /* Start offset of each level's sub-bands inside the buffer. */
  static const struct
  {
    size_t offset;
    size_t level;
  } passes[] = { { 3807, 3 }, { 3007, 2 }, { 0, 1 } };

  WINPR_ASSERT(buffer);
  WINPR_ASSERT(temp);

  for (size_t i = 0; i < sizeof(passes) / sizeof(passes[0]); i++)
    rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[passes[i].offset], temp, passes[i].level);
}
524
#endif // NEON_INTRINSICS_ENABLED
525
526
/**
 * Install the NEON implementations into an RFX context.
 *
 * When NEON_INTRINSICS_ENABLED is defined, the quantization decode, DWT decode
 * and extrapolating DWT decode callbacks of the context are replaced and the
 * matching profilers are renamed. Otherwise only a log message is emitted and
 * the context is left untouched.
 *
 * @param context RFX context to update
 */
void rfx_init_neon_int(RFX_CONTEXT* WINPR_RESTRICT context)
{
#if defined(NEON_INTRINSICS_ENABLED)
  /* The context is dereferenced below; check it like the decode routines do. */
  WINPR_ASSERT(context);

  WLog_VRB(PRIM_TAG, "NEON optimizations");
  PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
  PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_NEON");
  PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
  context->quantization_decode = rfx_quantization_decode_NEON;
  context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
  context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or NEON intrinsics not available");
  WINPR_UNUSED(context);
#endif
}