Coverage Report

Created: 2024-09-08 06:18

/src/FreeRDP/libfreerdp/codec/neon/rfx_neon.c
Line
Count
Source
1
/*
2
   FreeRDP: A Remote Desktop Protocol Implementation
3
   RemoteFX Codec Library - NEON Optimizations
4
5
   Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
6
7
   Licensed under the Apache License, Version 2.0 (the "License");
8
   you may not use this file except in compliance with the License.
9
   You may obtain a copy of the License at
10
11
       http://www.apache.org/licenses/LICENSE-2.0
12
13
   Unless required by applicable law or agreed to in writing, software
14
   distributed under the License is distributed on an "AS IS" BASIS,
15
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
   See the License for the specific language governing permissions and
17
   limitations under the License.
18
*/
19
20
#include <winpr/platform.h>
21
#include <freerdp/config.h>
22
#include <freerdp/log.h>
23
24
#include "../rfx_types.h"
25
#include "rfx_neon.h"
26
27
#define TAG FREERDP_TAG("codec.rfx.neon")
28
29
#if defined(WITH_NEON)
30
#if defined(_M_ARM64) || defined(_M_ARM)
31
#define NEON_ENABLED
32
#endif
33
#endif
34
35
#if defined(NEON_ENABLED)
36
37
#include <stdio.h>
38
#include <stdlib.h>
39
#include <string.h>
40
#include <arm_neon.h>
41
#include <winpr/sysinfo.h>
42
43
/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
44
45
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
46
rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
47
{
48
  int16x8_t quantFactors = vdupq_n_s16(factor);
49
  int16x8_t* buf = (int16x8_t*)buffer;
50
  int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
51
52
  do
53
  {
54
    int16x8_t val = vld1q_s16((INT16*)buf);
55
    val = vshlq_s16(val, quantFactors);
56
    vst1q_s16((INT16*)buf, val);
57
    buf++;
58
  } while (buf < buf_end);
59
}
60
61
/**
 * De-quantize all ten sub-bands of one 4096-coefficient tile component.
 *
 * @param buffer     coefficient buffer, modified in place
 * @param quantVals  ten quantization values; each sub-band is shifted by
 *                   (its quantization value - 1)
 */
static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
{
  /* Position, length and quantVals index of each sub-band inside the buffer. */
  static const struct
  {
    size_t offset;
    size_t count;
    size_t quantIndex;
  } bands[] = {
    { 0, 1024, 8 },    /* HL1 */
    { 1024, 1024, 7 }, /* LH1 */
    { 2048, 1024, 9 }, /* HH1 */
    { 3072, 256, 5 },  /* HL2 */
    { 3328, 256, 4 },  /* LH2 */
    { 3584, 256, 6 },  /* HH2 */
    { 3840, 64, 2 },   /* HL3 */
    { 3904, 64, 1 },   /* LH3 */
    { 3968, 64, 3 },   /* HH3 */
    { 4032, 64, 0 }    /* LL3 */
  };

  WINPR_ASSERT(buffer);
  WINPR_ASSERT(quantVals);

  for (size_t i = 0; i < sizeof(bands) / sizeof(bands[0]); i++)
  {
    rfx_quantization_decode_block_NEON(&buffer[bands[i].offset], bands[i].count,
                                       quantVals[bands[i].quantIndex] - 1);
  }
}
77
78
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
79
rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
80
                                   INT16* WINPR_RESTRICT dst, size_t subband_width)
81
{
82
  INT16* l_ptr = l;
83
  INT16* h_ptr = h;
84
  INT16* dst_ptr = dst;
85
86
  for (size_t y = 0; y < subband_width; y++)
87
  {
88
    /* Even coefficients */
89
    for (size_t n = 0; n < subband_width; n += 8)
90
    {
91
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
92
      int16x8_t l_n = vld1q_s16(l_ptr);
93
      int16x8_t h_n = vld1q_s16(h_ptr);
94
      int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
95
96
      if (n == 0)
97
      {
98
        int16_t first = vgetq_lane_s16(h_n_m, 1);
99
        h_n_m = vsetq_lane_s16(first, h_n_m, 0);
100
      }
101
102
      int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
103
      tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
104
      tmp_n = vshrq_n_s16(tmp_n, 1);
105
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
106
      vst1q_s16(l_ptr, dst_n);
107
      l_ptr += 8;
108
      h_ptr += 8;
109
    }
110
111
    l_ptr -= subband_width;
112
    h_ptr -= subband_width;
113
114
    /* Odd coefficients */
115
    for (size_t n = 0; n < subband_width; n += 8)
116
    {
117
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
118
      int16x8_t h_n = vld1q_s16(h_ptr);
119
      h_n = vshlq_n_s16(h_n, 1);
120
      int16x8x2_t dst_n;
121
      dst_n.val[0] = vld1q_s16(l_ptr);
122
      int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
123
124
      if (n == subband_width - 8)
125
      {
126
        int16_t last = vgetq_lane_s16(dst_n_p, 6);
127
        dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
128
      }
129
130
      dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
131
      dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
132
      dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
133
      vst2q_s16(dst_ptr, dst_n);
134
      l_ptr += 8;
135
      h_ptr += 8;
136
      dst_ptr += 16;
137
    }
138
  }
139
}
140
141
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142
rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
143
                                  INT16* WINPR_RESTRICT dst, size_t subband_width)
144
{
145
  INT16* l_ptr = l;
146
  INT16* h_ptr = h;
147
  INT16* dst_ptr = dst;
148
  const size_t total_width = subband_width + subband_width;
149
150
  /* Even coefficients */
151
  for (size_t n = 0; n < subband_width; n++)
152
  {
153
    for (size_t x = 0; x < total_width; x += 8)
154
    {
155
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
156
      int16x8_t l_n = vld1q_s16(l_ptr);
157
      int16x8_t h_n = vld1q_s16(h_ptr);
158
      int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
159
160
      if (n == 0)
161
        tmp_n = vaddq_s16(tmp_n, h_n);
162
      else
163
      {
164
        int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
165
        tmp_n = vaddq_s16(tmp_n, h_n_m);
166
      }
167
168
      tmp_n = vshrq_n_s16(tmp_n, 1);
169
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
170
      vst1q_s16(dst_ptr, dst_n);
171
      l_ptr += 8;
172
      h_ptr += 8;
173
      dst_ptr += 8;
174
    }
175
176
    dst_ptr += total_width;
177
  }
178
179
  h_ptr = h;
180
  dst_ptr = dst + total_width;
181
182
  /* Odd coefficients */
183
  for (size_t n = 0; n < subband_width; n++)
184
  {
185
    for (size_t x = 0; x < total_width; x += 8)
186
    {
187
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
188
      int16x8_t h_n = vld1q_s16(h_ptr);
189
      int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
190
      h_n = vshlq_n_s16(h_n, 1);
191
      int16x8_t tmp_n = dst_n_m;
192
193
      if (n == subband_width - 1)
194
        tmp_n = vaddq_s16(tmp_n, dst_n_m);
195
      else
196
      {
197
        int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
198
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
199
      }
200
201
      tmp_n = vshrq_n_s16(tmp_n, 1);
202
      int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
203
      vst1q_s16(dst_ptr, dst_n);
204
      h_ptr += 8;
205
      dst_ptr += 8;
206
    }
207
208
    dst_ptr += total_width;
209
  }
210
}
211
212
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
213
rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
214
                             size_t subband_width)
215
{
216
  INT16 *hl, *lh, *hh, *ll;
217
  INT16 *l_dst, *h_dst;
218
  /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
219
   */
220
  /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
221
  /* The lower part L uses LL(3) and HL(0). */
222
  /* The higher part H uses LH(1) and HH(2). */
223
  ll = buffer + subband_width * subband_width * 3;
224
  hl = buffer;
225
  l_dst = idwt;
226
  rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
227
  lh = buffer + subband_width * subband_width;
228
  hh = buffer + subband_width * subband_width * 2;
229
  h_dst = idwt + subband_width * subband_width * 2;
230
  rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
231
  /* Inverse DWT in vertical direction, results are stored in original buffer. */
232
  rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
233
}
234
235
/**
 * Three-level 2D inverse DWT, smallest sub-bands first.
 *
 * @param buffer      coefficient buffer, transformed in place
 * @param dwt_buffer  scratch buffer shared by all three levels
 */
static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
{
  /* Start offset inside `buffer` and sub-band width of each level. */
  static const struct
  {
    size_t offset;
    size_t width;
  } levels[] = { { 3840, 8 }, { 3072, 16 }, { 0, 32 } };

  for (size_t i = 0; i < sizeof(levels) / sizeof(levels[0]); i++)
  {
    rfx_dwt_2d_decode_block_NEON(buffer + levels[i].offset, dwt_buffer, levels[i].width);
  }
}
241
242
static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
243
                                                   const INT16* restrict pHighBand,
244
                                                   size_t nHighStep, INT16* restrict pDstBand,
245
                                                   size_t nDstStep, size_t nLowCount,
246
                                                   size_t nHighCount, size_t nDstCount)
247
{
248
  WINPR_ASSERT(pLowBand);
249
  WINPR_ASSERT(pHighBand);
250
  WINPR_ASSERT(pDstBand);
251
252
  INT16* l_ptr = pLowBand;
253
  const INT16* h_ptr = pHighBand;
254
  INT16* dst_ptr = pDstBand;
255
  size_t batchSize = (nLowCount + nHighCount) >> 1;
256
257
  for (size_t y = 0; y < nDstCount; y++)
258
  {
259
    /* Even coefficients */
260
    size_t n = 0;
261
    for (; n < batchSize; n += 8)
262
    {
263
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
264
      int16x8_t l_n = vld1q_s16(l_ptr);
265
      int16x8_t h_n = vld1q_s16(h_ptr);
266
      int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
267
268
      if (n == 0)
269
      {
270
        int16_t first = vgetq_lane_s16(h_n_m, 1);
271
        h_n_m = vsetq_lane_s16(first, h_n_m, 0);
272
      }
273
      else if (n == 24)
274
        h_n = vsetq_lane_s16(0, h_n, 7);
275
276
      int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
277
      tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
278
      tmp_n = vshrq_n_s16(tmp_n, 1);
279
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
280
      vst1q_s16(l_ptr, dst_n);
281
      l_ptr += 8;
282
      h_ptr += 8;
283
    }
284
    if (n < 32)
285
      *l_ptr -= *(h_ptr - 1);
286
287
    l_ptr -= batchSize;
288
    h_ptr -= batchSize;
289
290
    /* Odd coefficients */
291
    n = 0;
292
    for (; n < batchSize; n += 8)
293
    {
294
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
295
      int16x8_t h_n = vld1q_s16(h_ptr);
296
      h_n = vshlq_n_s16(h_n, 1);
297
      int16x8x2_t dst_n;
298
      dst_n.val[0] = vld1q_s16(l_ptr);
299
      int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
300
301
      if (n == 24)
302
        h_n = vsetq_lane_s16(0, h_n, 7);
303
304
      dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
305
      dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
306
      dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
307
      vst2q_s16(dst_ptr, dst_n);
308
      l_ptr += 8;
309
      h_ptr += 8;
310
      dst_ptr += 16;
311
    }
312
    if (n == 32)
313
    {
314
      h_ptr -= 1;
315
      l_ptr += 1;
316
    }
317
    else
318
    {
319
      *dst_ptr = *l_ptr;
320
      l_ptr += 1;
321
      dst_ptr += 1;
322
    }
323
  }
324
}
325
326
static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
327
                                                  const INT16* restrict pHighBand, size_t nHighStep,
328
                                                  INT16* restrict pDstBand, size_t nDstStep,
329
                                                  size_t nLowCount, size_t nHighCount,
330
                                                  size_t nDstCount)
331
{
332
  WINPR_ASSERT(pLowBand);
333
  WINPR_ASSERT(pHighBand);
334
  WINPR_ASSERT(pDstBand);
335
336
  const INT16* l_ptr = pLowBand;
337
  const INT16* h_ptr = pHighBand;
338
  INT16* dst_ptr = pDstBand;
339
  size_t batchSize = (nDstCount >> 3) << 3;
340
  size_t forceBandSize = (nLowCount + nHighCount) >> 1;
341
342
  /* Even coefficients */
343
  for (size_t n = 0; n < forceBandSize; n++)
344
  {
345
    for (size_t x = 0; x < batchSize; x += 8)
346
    {
347
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
348
      int16x8_t l_n = vld1q_s16(l_ptr);
349
      int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
350
      int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
351
352
      if (n == 0)
353
        tmp_n = vaddq_s16(tmp_n, h_n);
354
      else if (n < 31)
355
      {
356
        int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
357
        tmp_n = vaddq_s16(tmp_n, h_n_m);
358
      }
359
360
      tmp_n = vshrq_n_s16(tmp_n, 1);
361
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
362
      vst1q_s16(dst_ptr, dst_n);
363
      l_ptr += 8;
364
      h_ptr += 8;
365
      dst_ptr += 8;
366
    }
367
368
    if (nDstCount > batchSize)
369
    {
370
      int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
371
      int16_t tmp_n = h_n + 1;
372
      if (n == 0)
373
        tmp_n += h_n;
374
      else if (n < 31)
375
        tmp_n += *(h_ptr - nHighStep);
376
      tmp_n >>= 1;
377
      *dst_ptr = *l_ptr - tmp_n;
378
      l_ptr += 1;
379
      h_ptr += 1;
380
      dst_ptr += 1;
381
    }
382
383
    dst_ptr += nDstStep;
384
  }
385
386
  if (forceBandSize < 32)
387
  {
388
    for (size_t x = 0; x < batchSize; x += 8)
389
    {
390
      int16x8_t l_n = vld1q_s16(l_ptr);
391
      int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
392
      int16x8_t tmp_n = vsubq_s16(l_n, h_n);
393
      vst1q_s16(dst_ptr, tmp_n);
394
      l_ptr += 8;
395
      h_ptr += 8;
396
      dst_ptr += 8;
397
    }
398
399
    if (nDstCount > batchSize)
400
    {
401
      *dst_ptr = *l_ptr - *(h_ptr - nHighStep);
402
      l_ptr += 1;
403
      h_ptr += 1;
404
      dst_ptr += 1;
405
    }
406
  }
407
408
  h_ptr = pHighBand;
409
  dst_ptr = pDstBand + nDstStep;
410
411
  /* Odd coefficients */
412
  for (size_t n = 0; n < forceBandSize; n++)
413
  {
414
    for (size_t x = 0; x < batchSize; x += 8)
415
    {
416
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
417
      int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
418
      if (n == 31)
419
      {
420
        int16x8_t dst_n_p = vld1q_s16(l_ptr);
421
        l_ptr += 8;
422
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
423
        tmp_n = vshrq_n_s16(tmp_n, 1);
424
      }
425
      else
426
      {
427
        int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
428
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
429
        tmp_n = vshrq_n_s16(tmp_n, 1);
430
        int16x8_t h_n = vld1q_s16(h_ptr);
431
        h_n = vshlq_n_s16(h_n, 1);
432
        tmp_n = vaddq_s16(tmp_n, h_n);
433
      }
434
      vst1q_s16(dst_ptr, tmp_n);
435
      h_ptr += 8;
436
      dst_ptr += 8;
437
    }
438
439
    if (nDstCount > batchSize)
440
    {
441
      int16_t tmp_n = *(dst_ptr - nDstStep);
442
      if (n == 31)
443
      {
444
        int16_t dst_n_p = *l_ptr;
445
        l_ptr += 1;
446
        tmp_n += dst_n_p;
447
        tmp_n >>= 1;
448
      }
449
      else
450
      {
451
        int16_t dst_n_p = *(dst_ptr + nDstStep);
452
        tmp_n += dst_n_p;
453
        tmp_n >>= 1;
454
        int16_t h_n = *h_ptr;
455
        h_n <<= 1;
456
        tmp_n += h_n;
457
      }
458
      *dst_ptr = tmp_n;
459
      h_ptr += 1;
460
      dst_ptr += 1;
461
    }
462
463
    dst_ptr += nDstStep;
464
  }
465
}
466
467
/**
 * Number of low-band coefficients per row/column at the given DWT level.
 *
 * @param level  decomposition level (this file uses 1, 2 and 3)
 * @return (64 >> level) + 1, i.e. 33, 17 and 9 for levels 1, 2 and 3
 */
static INLINE size_t prfx_get_band_l_count(size_t level)
{
  const int halved = 64 >> level;

  return halved + 1;
}
471
472
/**
 * Number of high-band coefficients per row/column at the given DWT level.
 *
 * @param level  decomposition level (this file uses 1, 2 and 3)
 * @return 31 for level 1; otherwise (64 + 2^(level - 1)) >> level,
 *         i.e. 16 for level 2 and 8 for level 3
 */
static INLINE size_t prfx_get_band_h_count(size_t level)
{
  /* Level 1 is the special case: one coefficient less than half of 64. */
  if (level == 1)
    return (64 >> 1) - 1;

  const int rounding = 1 << (level - 1);

  return (64 + rounding) >> level;
}
479
480
/**
 * Run one level of the 2D inverse DWT on the "extrapolate" band layout.
 *
 * `buffer` holds the sub-bands back to back in the order HL, LH, HH, LL,
 * with sizes derived from prfx_get_band_l_count() / prfx_get_band_h_count().
 * Two horizontal passes build the L and H halves in `temp`; the vertical
 * pass writes the combined result to the start of `buffer`.
 *
 * @param buffer  sub-band input, also receives the result
 * @param temp    scratch space for the intermediate L and H halves
 * @param level   decomposition level (this file uses 1, 2 and 3)
 */
static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
                                                            size_t level)
{
  WINPR_ASSERT(buffer);
  WINPR_ASSERT(temp);

  const size_t nBandL = prfx_get_band_l_count(level);
  const size_t nBandH = prfx_get_band_h_count(level);
  const size_t nDstStepX = nBandL + nBandH;
  const size_t nDstStepY = nBandL + nBandH;

  /* Sub-band positions inside buffer: HL | LH | HH | LL */
  INT16* const HL = &buffer[0];
  INT16* const LH = &buffer[nBandH * nBandL];
  INT16* const HH = &buffer[(nBandH * nBandL) + (nBandL * nBandH)];
  INT16* const LL = &buffer[(nBandH * nBandL) + (nBandL * nBandH) + (nBandH * nBandH)];

  /* Intermediate halves inside temp: L occupies nBandL rows, H follows it. */
  INT16* const L = &temp[0];
  INT16* const H = &temp[nBandL * nDstStepX];

  /* The final result overwrites the beginning of buffer. */
  INT16* const LLx = &buffer[0];

  /* horizontal (LL + HL -> L) */
  rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);

  /* horizontal (LH + HH -> H) */
  rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);

  /* vertical (L + H -> LL) */
  rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
                                 nBandL + nBandH);
}
521
522
/**
 * Three-level 2D inverse DWT for the "extrapolate" band layout.
 *
 * Levels are decoded from 3 down to 1; each level starts at a fixed offset
 * inside `buffer`.
 *
 * @param buffer  coefficient buffer, transformed in place
 * @param temp    scratch buffer shared by all levels
 */
static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
{
  /* Start offset of each level's sub-bands, indexed by (level - 1). */
  static const size_t levelOffsets[] = { 0, 3007, 3807 };

  WINPR_ASSERT(buffer);
  WINPR_ASSERT(temp);

  for (size_t level = 3; level > 0; level--)
  {
    rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[levelOffsets[level - 1]], temp, level);
  }
}
530
#endif // NEON_ENABLED
531
532
/**
 * Install the NEON implementations into an RFX context.
 *
 * When the file is built with NEON_ENABLED and the processor reports NEON
 * support, the quantization and both DWT decode callbacks of `context` are
 * replaced by the NEON variants defined in this file. In every other case
 * the context is left untouched.
 *
 * @param context  RFX context whose decode callbacks may be replaced
 */
void rfx_init_neon(RFX_CONTEXT* context)
{
#if defined(NEON_ENABLED)
  /* Nothing to do when the CPU does not report NEON support. */
  if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
    return;

  DEBUG_RFX("Using NEON optimizations");
  PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
  PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
                  "rfx_quantization_decode_NEON");
  PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");

  context->quantization_decode = rfx_quantization_decode_NEON;
  context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
  context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
#else
  WINPR_UNUSED(context);
#endif
}