/src/FreeRDP/libfreerdp/codec/neon/rfx_neon.c
Line | Count | Source |
1 | | /* |
2 | | FreeRDP: A Remote Desktop Protocol Implementation |
3 | | RemoteFX Codec Library - NEON Optimizations |
4 | | |
5 | | Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com> |
6 | | |
7 | | Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | you may not use this file except in compliance with the License. |
9 | | You may obtain a copy of the License at |
10 | | |
11 | | http://www.apache.org/licenses/LICENSE-2.0 |
12 | | |
13 | | Unless required by applicable law or agreed to in writing, software |
14 | | distributed under the License is distributed on an "AS IS" BASIS, |
15 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | See the License for the specific language governing permissions and |
17 | | limitations under the License. |
18 | | */ |
19 | | |
20 | | #include <winpr/platform.h> |
21 | | #include <freerdp/config.h> |
22 | | #include <freerdp/log.h> |
23 | | |
24 | | #include "../rfx_types.h" |
25 | | #include "../rfx_quantization.h" |
26 | | #include "rfx_neon.h" |
27 | | |
28 | | #include "../../core/simd.h" |
29 | | |
30 | | #if defined(NEON_INTRINSICS_ENABLED) |
31 | | |
32 | | #include <stdio.h> |
33 | | #include <stdlib.h> |
34 | | #include <string.h> |
35 | | #include <arm_neon.h> |
36 | | #include <winpr/sysinfo.h> |
37 | | |
38 | | /* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */ |
39 | | |
/* Undo RemoteFX quantization for one sub-band by left-shifting every
 * 16-bit coefficient by <factor> bits.
 *
 * buffer_size must be a non-zero multiple of 8; all callers pass
 * 64, 256 or 1024 coefficients per sub-band. */
static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
{
	/* Per-lane shift amount; vshlq_s16 shifts left by the value in each lane. */
	const int16x8_t shift = vdupq_n_s16((INT16)factor);

	/* Process eight coefficients per iteration. */
	for (size_t i = 0; i < buffer_size; i += 8)
	{
		const int16x8_t val = vshlq_s16(vld1q_s16(&buffer[i]), shift);
		vst1q_s16(&buffer[i], val);
	}
}
55 | | |
WINPR_ATTR_NODISCARD
static BOOL rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals,
                                         size_t nrQuantVals)
{
	/* Sub-band layout inside the 4096-coefficient tile: offset into buffer,
	 * number of coefficients, and the index of the quantization value that
	 * applies to that band. */
	static const struct
	{
		size_t offset;
		size_t size;
		size_t quantIdx;
	} bands[] = {
		{ 0, 1024, 8 },    /* HL1 */
		{ 1024, 1024, 7 }, /* LH1 */
		{ 2048, 1024, 9 }, /* HH1 */
		{ 3072, 256, 5 },  /* HL2 */
		{ 3328, 256, 4 },  /* LH2 */
		{ 3584, 256, 6 },  /* HH2 */
		{ 3840, 64, 2 },   /* HL3 */
		{ 3904, 64, 1 },   /* LH3 */
		{ 3968, 64, 3 },   /* HH3 */
		{ 4032, 64, 0 },   /* LL3 */
	};

	WINPR_ASSERT(buffer);
	WINPR_ASSERT(quantVals);
	WINPR_ASSERT(nrQuantVals == NR_QUANT_VALUES);

	/* A quantization value of 0 would make <quantVal - 1> underflow below. */
	for (size_t x = 0; x < nrQuantVals; x++)
	{
		if (quantVals[x] < 1)
			return FALSE;
	}

	for (size_t x = 0; x < sizeof(bands) / sizeof(bands[0]); x++)
		rfx_quantization_decode_block_NEON(&buffer[bands[x].offset], bands[x].size,
		                                   quantVals[bands[x].quantIdx] - 1);

	return TRUE;
}
83 | | |
/* Inverse horizontal DWT for one sub-band pair: combines the low-pass band l
 * and the high-pass band h into interleaved even/odd output rows in dst
 * (2 * subband_width coefficients per input row).
 *
 * NOTE(review): the even pass stores its results back into l in-place (l is
 * clobbered) so the odd pass can re-read them; subband_width is assumed to
 * be a multiple of 8 (callers pass 8/16/32) — confirm before reuse. */
static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                   INT16* WINPR_RESTRICT dst, size_t subband_width)
{
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;

	for (size_t y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (size_t n = 0; n < subband_width; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			/* Unaligned load one element back yields h[n-1] in each lane. */
			int16x8_t h_n_m = vld1q_s16(h_ptr - 1);

			if (n == 0)
			{
				/* Left edge: mirror h[0] into the out-of-range h[-1] lane. */
				int16_t first = vgetq_lane_s16(h_n_m, 1);
				h_n_m = vsetq_lane_s16(first, h_n_m, 0);
			}

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			/* Store evens back into l so the odd pass below can read them. */
			vst1q_s16(l_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
		}

		/* Rewind both cursors to the start of the current row. */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (size_t n = 0; n < subband_width; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);
			/* dst[2n + 2] corresponds to the next even coefficient, i.e. l[n + 1]. */
			int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

			if (n == subband_width - 8)
			{
				/* Right edge: mirror the last valid even coefficient into
				 * the lane that read past the end of the row. */
				int16_t last = vgetq_lane_s16(dst_n_p, 6);
				dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
			}

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			/* Interleaved store writes even/odd coefficient pairs to dst. */
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
146 | | |
/* Inverse vertical DWT for one sub-band pair: combines rows of the low-pass
 * image l and high-pass image h into dst, writing even output rows first and
 * odd output rows in a second pass.
 *
 * NOTE(review): rows are total_width (= 2 * subband_width) elements wide and
 * assumed to be a multiple of 8; edge rows (n == 0 and the last n) are
 * extrapolated by reusing the adjacent in-range row. */
static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                  INT16* WINPR_RESTRICT dst, size_t subband_width)
{
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	const size_t total_width = subband_width + subband_width;

	/* Even coefficients */
	for (size_t n = 0; n < subband_width; n++)
	{
		for (size_t x = 0; x < total_width; x += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

			if (n == 0)
				/* Top edge: substitute h[n] for the missing h[n-1] row. */
				tmp_n = vaddq_s16(tmp_n, h_n);
			else
			{
				/* h[n-1] lives exactly one full output row above h[n]. */
				int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
				tmp_n = vaddq_s16(tmp_n, h_n_m);
			}

			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			vst1q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Skip over the odd output row that the second pass fills in. */
		dst_ptr += total_width;
	}

	h_ptr = h;
	dst_ptr = dst + total_width;

	/* Odd coefficients */
	for (size_t n = 0; n < subband_width; n++)
	{
		for (size_t x = 0; x < total_width; x += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			/* dst[2n] was written by the even pass one row above. */
			int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8_t tmp_n = dst_n_m;

			if (n == subband_width - 1)
				/* Bottom edge: reuse dst[2n] for the missing dst[2n+2] row. */
				tmp_n = vaddq_s16(tmp_n, dst_n_m);
			else
			{
				int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
			}

			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
			vst1q_s16(dst_ptr, dst_n);
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Skip over the even output row already written by the first pass. */
		dst_ptr += total_width;
	}
}
217 | | |
/* One level of the 2D inverse DWT: horizontal pass into the temporary
 * buffer idwt, then vertical pass back into buffer. */
static inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
                             size_t subband_width)
{
	/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order, each
	 * holding subband_width * subband_width coefficients. */
	const size_t bandSize = subband_width * subband_width;
	INT16* const hl = buffer;
	INT16* const lh = &buffer[bandSize];
	INT16* const hh = &buffer[2 * bandSize];
	INT16* const ll = &buffer[3 * bandSize];

	/* Inverse DWT in horizontal direction, results in 2 sub-bands stored in
	 * L, H order in the temporary buffer idwt.
	 * The lower part L uses LL(3) and HL(0);
	 * the higher part H uses LH(1) and HH(2). */
	INT16* const l_dst = idwt;
	INT16* const h_dst = &idwt[2 * bandSize];
	rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
	rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);

	/* Inverse DWT in vertical direction, results are stored in the original
	 * buffer. */
	rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
}
240 | | |
/* Full 3-level inverse DWT of a 64x64 tile component, processing the
 * innermost (smallest) level first. */
static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
{
	static const struct
	{
		size_t offset; /* start of the level's sub-bands inside buffer */
		size_t width;  /* sub-band width at that level */
	} levels[] = { { 3840, 8 }, { 3072, 16 }, { 0, 32 } };

	for (size_t i = 0; i < sizeof(levels) / sizeof(levels[0]); i++)
		rfx_dwt_2d_decode_block_NEON(&buffer[levels[i].offset], dwt_buffer, levels[i].width);
}
247 | | |
/* Inverse horizontal DWT with extrapolation (RFX progressive layout):
 * combines low band (nLowCount wide) and high band (nHighCount wide) into
 * nDstCount output rows of interleaved even/odd coefficients.
 *
 * NOTE(review): nLowStep/nHighStep/nDstStep are unused here — rows appear to
 * be packed contiguously; confirm against the callers. The magic constants
 * 24 and 32 only trigger for the full-size level-1 bands (batchSize == 32),
 * where lane 7 of the n == 24 batch is the extrapolated column. The even
 * pass stores its results into pLowBand in-place (clobbered). */
static inline void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
                                                   const INT16* restrict pHighBand,
                                                   size_t nHighStep, INT16* restrict pDstBand,
                                                   size_t nDstStep, size_t nLowCount,
                                                   size_t nHighCount, size_t nDstCount)
{
	WINPR_ASSERT(pLowBand);
	WINPR_ASSERT(pHighBand);
	WINPR_ASSERT(pDstBand);

	INT16* l_ptr = pLowBand;
	const INT16* h_ptr = pHighBand;
	INT16* dst_ptr = pDstBand;
	/* Number of coefficient pairs processed vectorized per row. */
	size_t batchSize = (nLowCount + nHighCount) >> 1;

	for (size_t y = 0; y < nDstCount; y++)
	{
		/* Even coefficients */
		size_t n = 0;
		for (; n < batchSize; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			/* Unaligned load one element back yields h[n-1] in each lane. */
			int16x8_t h_n_m = vld1q_s16(h_ptr - 1);

			if (n == 0)
			{
				/* Left edge: mirror h[0] into the out-of-range h[-1] lane. */
				int16_t first = vgetq_lane_s16(h_n_m, 1);
				h_n_m = vsetq_lane_s16(first, h_n_m, 0);
			}
			else if (n == 24)
				/* Last batch of a 32-wide band: zero the extrapolated lane. */
				h_n = vsetq_lane_s16(0, h_n, 7);

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			/* Store evens back into the low band for the odd pass to read. */
			vst1q_s16(l_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
		}
		/* Narrow bands (< 32): handle the final odd low coefficient scalar. */
		if (n < 32)
			*l_ptr -= *(h_ptr - 1);

		l_ptr -= batchSize;
		h_ptr -= batchSize;

		/* Odd coefficients */
		n = 0;
		for (; n < batchSize; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);
			/* dst[2n + 2] corresponds to the next even value, i.e. l[n + 1]. */
			int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

			if (n == 24)
				/* Last batch of a 32-wide band: zero the extrapolated lane. */
				h_n = vsetq_lane_s16(0, h_n, 7);

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			/* Interleaved store writes even/odd coefficient pairs to dst. */
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
		if (n == 32)
		{
			/* Full-size row: rebalance cursors for the next input row. */
			h_ptr -= 1;
			l_ptr += 1;
		}
		else
		{
			/* Narrow row: emit the trailing even coefficient verbatim. */
			*dst_ptr = *l_ptr;
			l_ptr += 1;
			dst_ptr += 1;
		}
	}
}
331 | | |
/* Inverse vertical DWT with extrapolation: combines rows of the low image
 * (nLowCount rows) and high image (nHighCount rows) into nDstCount output
 * rows, even rows first, odd rows in a second pass.
 *
 * NOTE(review): nLowStep is unused — low rows appear packed at nDstCount
 * elements per row; confirm against callers. The magic 31 checks only fire
 * for the full-size level-1 bands (forceBandSize == 32), where row 31's
 * high band is extrapolated and read from the previous row instead. */
static inline void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
                                                  const INT16* restrict pHighBand, size_t nHighStep,
                                                  INT16* restrict pDstBand, size_t nDstStep,
                                                  size_t nLowCount, size_t nHighCount,
                                                  size_t nDstCount)
{
	WINPR_ASSERT(pLowBand);
	WINPR_ASSERT(pHighBand);
	WINPR_ASSERT(pDstBand);

	const INT16* l_ptr = pLowBand;
	const INT16* h_ptr = pHighBand;
	INT16* dst_ptr = pDstBand;
	/* Largest multiple of 8 columns handled vectorized; the remainder (at
	 * most 7, in practice 1 for odd widths) is handled scalar below. */
	size_t batchSize = (nDstCount >> 3) << 3;
	size_t forceBandSize = (nLowCount + nHighCount) >> 1;

	/* Even coefficients */
	for (size_t n = 0; n < forceBandSize; n++)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			/* Row 31 has no own high row; reuse the previous one. */
			int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
			int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

			if (n == 0)
				/* Top edge: substitute h[n] for the missing h[n-1] row. */
				tmp_n = vaddq_s16(tmp_n, h_n);
			else if (n < 31)
			{
				int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
				tmp_n = vaddq_s16(tmp_n, h_n_m);
			}

			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			vst1q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Scalar tail for the leftover column (odd destination width). */
		if (nDstCount > batchSize)
		{
			int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
			int16_t tmp_n = h_n + 1;
			if (n == 0)
				tmp_n += h_n;
			else if (n < 31)
				tmp_n += *(h_ptr - nHighStep);
			tmp_n >>= 1;
			*dst_ptr = *l_ptr - tmp_n;
			l_ptr += 1;
			h_ptr += 1;
			dst_ptr += 1;
		}

		/* Skip over the odd output row filled in by the second pass. */
		dst_ptr += nDstStep;
	}

	/* Narrow bands (< 32 rows): produce the final even row from the last low
	 * row minus the previous high row. */
	if (forceBandSize < 32)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
			int16x8_t tmp_n = vsubq_s16(l_n, h_n);
			vst1q_s16(dst_ptr, tmp_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		if (nDstCount > batchSize)
		{
			*dst_ptr = *l_ptr - *(h_ptr - nHighStep);
			l_ptr += 1;
			h_ptr += 1;
			dst_ptr += 1;
		}
	}

	h_ptr = pHighBand;
	dst_ptr = pDstBand + nDstStep;

	/* Odd coefficients */
	for (size_t n = 0; n < forceBandSize; n++)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			/* dst[2n] was written by the even pass one output row above. */
			int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
			if (n == 31)
			{
				/* Last full-size row: dst[2n+2] comes from the remaining low
				 * row; no high-band term is added here. */
				int16x8_t dst_n_p = vld1q_s16(l_ptr);
				l_ptr += 8;
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
				tmp_n = vshrq_n_s16(tmp_n, 1);
			}
			else
			{
				int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
				tmp_n = vshrq_n_s16(tmp_n, 1);
				int16x8_t h_n = vld1q_s16(h_ptr);
				h_n = vshlq_n_s16(h_n, 1);
				tmp_n = vaddq_s16(tmp_n, h_n);
			}
			vst1q_s16(dst_ptr, tmp_n);
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Scalar tail for the leftover column (odd destination width). */
		if (nDstCount > batchSize)
		{
			int16_t tmp_n = *(dst_ptr - nDstStep);
			if (n == 31)
			{
				int16_t dst_n_p = *l_ptr;
				l_ptr += 1;
				tmp_n += dst_n_p;
				tmp_n >>= 1;
			}
			else
			{
				int16_t dst_n_p = *(dst_ptr + nDstStep);
				tmp_n += dst_n_p;
				tmp_n >>= 1;
				int16_t h_n = *h_ptr;
				h_n <<= 1;
				tmp_n += h_n;
			}
			*dst_ptr = tmp_n;
			h_ptr += 1;
			dst_ptr += 1;
		}

		/* Skip over the even output row already written by the first pass. */
		dst_ptr += nDstStep;
	}
}
472 | | |
/* Low-pass coefficient count per row/column at the given DWT level for the
 * 64-wide extrapolated tile: (64 >> level) + 1. */
static inline size_t prfx_get_band_l_count(size_t level)
{
	const size_t width = (size_t)64 >> level;
	return width + 1;
}
477 | | |
/* High-pass coefficient count per row/column at the given DWT level for the
 * 64-wide extrapolated tile. Level 1 is special-cased to 31. */
static inline size_t prfx_get_band_h_count(size_t level)
{
	if (level == 1)
		return 31; /* (64 >> 1) - 1 */

	/* Round 64 / 2^level by adding half the divisor before shifting. */
	const size_t half = (size_t)1 << (level - 1);
	return (64 + half) >> level;
}
485 | | |
/* One level of the extrapolated 2D inverse DWT: two horizontal passes into
 * the temporary buffer, then one vertical pass back into buffer. */
static inline void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
                                                            size_t level)
{
	const size_t nBandL = prfx_get_band_l_count(level);
	const size_t nBandH = prfx_get_band_h_count(level);

	WINPR_ASSERT(buffer);
	WINPR_ASSERT(temp);

	/* The 4 sub-bands are packed back to back in HL, LH, HH, LL order. */
	INT16* const HL = &buffer[0];
	INT16* const LH = &buffer[nBandH * nBandL];
	INT16* const HH = &buffer[nBandH * nBandL + nBandL * nBandH];
	INT16* const LL = &buffer[nBandH * nBandL + nBandL * nBandH + nBandH * nBandH];

	/* Intermediate and final images are both (nBandL + nBandH) wide. */
	const size_t nDstStepX = nBandL + nBandH;
	const size_t nDstStepY = nBandL + nBandH;

	INT16* const L = &temp[0];
	INT16* const H = &temp[nBandL * nDstStepX];
	INT16* const LLx = &buffer[0];

	/* horizontal (LL + HL -> L) */
	rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);

	/* horizontal (LH + HH -> H) */
	rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);

	/* vertical (L + H -> LL) */
	rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
	                               nBandL + nBandH);
}
527 | | |
/* Full 3-level extrapolated inverse DWT of a tile component, decoding from
 * the innermost level (3) to the outermost (1). */
static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
{
	static const struct
	{
		size_t offset; /* start of the level's sub-bands inside buffer */
		size_t level;  /* DWT level to decode */
	} passes[] = { { 3807, 3 }, { 3007, 2 }, { 0, 1 } };

	WINPR_ASSERT(buffer);
	WINPR_ASSERT(temp);

	for (size_t i = 0; i < sizeof(passes) / sizeof(passes[0]); i++)
		rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[passes[i].offset], temp, passes[i].level);
}
536 | | #endif // NEON_INTRINSICS_ENABLED |
537 | | |
/* Install the NEON-optimized RemoteFX decode primitives into the codec
 * context; when built without NEON intrinsics, only log that they are
 * unavailable and leave the context untouched. */
void rfx_init_neon_int(RFX_CONTEXT* WINPR_RESTRICT context)
{
#if defined(NEON_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "NEON optimizations");
	/* Profiler labels only; the YCbCr->RGB conversion itself lives in the
	 * primitives library (see the note near the top of this file). */
	PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
	context->quantization_decode = rfx_quantization_decode_NEON;
	context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
	context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or NEON intrinsics not available");
	WINPR_UNUSED(context);
#endif
}