/src/FreeRDP/libfreerdp/codec/neon/rfx_neon.c

Source
/*
   FreeRDP: A Remote Desktop Protocol Implementation
   RemoteFX Codec Library - NEON Optimizations

   Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

#include <winpr/platform.h>
#include <freerdp/config.h>
#include <freerdp/log.h>

#include "../rfx_types.h"
#include "rfx_neon.h"

#define TAG FREERDP_TAG("codec.rfx.neon")

#if defined(WITH_NEON)
#if defined(_M_ARM64) || defined(_M_ARM)
#define NEON_ENABLED
#endif
#endif

#if defined(NEON_ENABLED)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arm_neon.h>
#include <winpr/sysinfo.h>

/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */

/**
 * Shift every coefficient of one sub-band by a common quantization shift count.
 *
 * The sub-band is walked in groups of eight INT16 values. The loop is bottom-tested, so the
 * first group is always processed, and whole groups of eight are loaded and stored until the
 * cursor reaches buffer + buffer_size.
 *
 * @param buffer      first coefficient of the sub-band, updated in place
 * @param buffer_size number of INT16 coefficients in the sub-band
 * @param factor      per-lane shift count handed to vshlq_s16 (narrowed to 16 bit)
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
{
  const int16x8_t shift = vdupq_n_s16((int16_t)factor);
  const INT16* const end = buffer + buffer_size;
  INT16* cursor = buffer;

  do
  {
    /* load eight coefficients, shift them, write them back to the same place */
    vst1q_s16(cursor, vshlq_s16(vld1q_s16(cursor), shift));
    cursor += 8;
  } while (cursor < end);
}

/**
 * Undo the quantization of all ten sub-bands of one tile component.
 *
 * Each sub-band is shifted by (quantVals[index] - 1); the table below lists, in processing
 * order, where each sub-band starts in the buffer, how many coefficients it holds and which
 * quantVals entry applies to it.
 *
 * @param buffer    4096 coefficients, updated in place
 * @param quantVals quantization values; indices 0..9 are read
 */
static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
{
  static const struct
  {
    size_t offset;     /* index of the first coefficient of the sub-band */
    size_t length;     /* number of coefficients in the sub-band */
    size_t quantIndex; /* entry of quantVals that applies */
  } subbands[] = {
    { 0, 1024, 8 },    /* HL1 */
    { 1024, 1024, 7 }, /* LH1 */
    { 2048, 1024, 9 }, /* HH1 */
    { 3072, 256, 5 },  /* HL2 */
    { 3328, 256, 4 },  /* LH2 */
    { 3584, 256, 6 },  /* HH2 */
    { 3840, 64, 2 },   /* HL3 */
    { 3904, 64, 1 },   /* LH3 */
    { 3968, 64, 3 },   /* HH3 */
    { 4032, 64, 0 }    /* LL3 */
  };

  WINPR_ASSERT(buffer);
  WINPR_ASSERT(quantVals);

  for (size_t i = 0; i < sizeof(subbands) / sizeof(subbands[0]); i++)
  {
    rfx_quantization_decode_block_NEON(&buffer[subbands[i].offset], subbands[i].length,
                                       quantVals[subbands[i].quantIndex] - 1);
  }
}

/**
 * Horizontal inverse DWT of one low/high sub-band pair, eight samples per step.
 *
 * Every row is handled in two passes:
 *  1. the low row is overwritten in place with the even output samples
 *     dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *  2. the odd output samples
 *     dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *     are computed from the rewritten low row and stored interleaved with the even ones.
 *
 * At the left edge h[n-1] is replaced by h[0], at the right edge dst[2n + 2] is replaced by
 * the last even sample. The vector loads that fetch these neighbours still touch one element
 * before the high row and one element after the low row; only the loaded lane is discarded.
 *
 * @param l             low-pass rows (subband_width x subband_width), clobbered
 * @param h             high-pass rows (subband_width x subband_width)
 * @param dst           output, subband_width rows of 2 * subband_width samples
 * @param subband_width edge length of the sub-bands; the loops step in units of 8
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                   INT16* WINPR_RESTRICT dst, size_t subband_width)
{
  const int16x8_t rounding = vdupq_n_s16(1);
  INT16* low = l;
  INT16* high = h;
  INT16* out = dst;

  for (size_t row = 0; row < subband_width; row++)
  {
    /* Pass 1: even samples, written back over the low row */
    for (size_t col = 0; col < subband_width; col += 8, low += 8, high += 8)
    {
      const int16x8_t lowVals = vld1q_s16(low);
      const int16x8_t highVals = vld1q_s16(high);
      int16x8_t highPrev = vld1q_s16(high - 1);

      /* left edge: there is no h[-1], reuse h[0] (lane 1 of the shifted load) */
      if (col == 0)
        highPrev = vsetq_lane_s16(vgetq_lane_s16(highPrev, 1), highPrev, 0);

      const int16x8_t sum = vaddq_s16(vaddq_s16(highVals, highPrev), rounding);
      vst1q_s16(low, vsubq_s16(lowVals, vshrq_n_s16(sum, 1)));
    }

    /* rewind both cursors to the start of the row for the second pass */
    low -= subband_width;
    high -= subband_width;

    /* Pass 2: odd samples, stored interleaved with the even ones */
    for (size_t col = 0; col < subband_width; col += 8, low += 8, high += 8, out += 16)
    {
      const int16x8_t highTimes2 = vshlq_n_s16(vld1q_s16(high), 1);
      int16x8x2_t pair;
      pair.val[0] = vld1q_s16(low);
      int16x8_t nextEven = vld1q_s16(low + 1);

      /* right edge: there is no following even sample, reuse the last one (lane 6) */
      if (col == subband_width - 8)
        nextEven = vsetq_lane_s16(vgetq_lane_s16(nextEven, 6), nextEven, 7);

      const int16x8_t average = vshrq_n_s16(vaddq_s16(nextEven, pair.val[0]), 1);
      pair.val[1] = vaddq_s16(average, highTimes2);
      vst2q_s16(out, pair);
    }
  }
}

/**
 * Vertical inverse DWT of the two intermediate bands produced by the horizontal pass.
 *
 * First all even output rows are produced
 *   dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 * then the odd rows in between
 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 * where n counts rows. For the first row h[n-1] is replaced by h[n]; for the last row
 * dst[2n + 2] is replaced by dst[2n].
 *
 * @param l             low band, subband_width rows of 2 * subband_width samples
 * @param h             high band, same layout as l
 * @param dst           output, 2 * subband_width rows of 2 * subband_width samples
 * @param subband_width edge length of the original sub-bands
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                  INT16* WINPR_RESTRICT dst, size_t subband_width)
{
  const size_t total_width = subband_width + subband_width;
  const int16x8_t rounding = vdupq_n_s16(1);
  const INT16* low = l;
  const INT16* high = h;
  INT16* out = dst;

  /* Even rows: every second output row is written, the rows in between are skipped */
  for (size_t row = 0; row < subband_width; row++)
  {
    for (size_t x = 0; x < total_width; x += 8, low += 8, high += 8, out += 8)
    {
      /* the first row has no predecessor, so the current high row is used twice */
      const INT16* prevSrc = (row == 0) ? high : (high - total_width);
      const int16x8_t lowVals = vld1q_s16(low);
      const int16x8_t highVals = vld1q_s16(high);
      const int16x8_t highPrev = vld1q_s16(prevSrc);
      const int16x8_t sum = vaddq_s16(vaddq_s16(highVals, rounding), highPrev);
      vst1q_s16(out, vsubq_s16(lowVals, vshrq_n_s16(sum, 1)));
    }

    out += total_width;
  }

  high = h;
  out = dst + total_width;

  /* Odd rows: average of the even rows above and below plus twice the high band */
  for (size_t row = 0; row < subband_width; row++)
  {
    const int isLastRow = (row == subband_width - 1);

    for (size_t x = 0; x < total_width; x += 8, high += 8, out += 8)
    {
      /* the last odd row has no even row below it, so the row above is used twice */
      const INT16* belowSrc = isLastRow ? (out - total_width) : (out + total_width);
      const int16x8_t highTimes2 = vshlq_n_s16(vld1q_s16(high), 1);
      const int16x8_t above = vld1q_s16(out - total_width);
      const int16x8_t below = vld1q_s16(belowSrc);
      const int16x8_t average = vshrq_n_s16(vaddq_s16(above, below), 1);
      vst1q_s16(out, vaddq_s16(average, highTimes2));
    }

    out += total_width;
  }
}

/**
 * Inverse 2D DWT of one decomposition level.
 *
 * The four sub-bands are stored back to back in buffer in HL, LH, HH, LL order, each holding
 * subband_width * subband_width coefficients. Two horizontal passes combine them into an
 * L half (from LL and HL) and an H half (from LH and HH) inside idwt; the vertical pass then
 * merges both halves and writes the result back over buffer.
 *
 * @param buffer        the four sub-bands on input, the reconstructed block on output
 * @param idwt          scratch space for 4 * subband_width * subband_width values
 * @param subband_width edge length of each sub-band
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
                             size_t subband_width)
{
  const size_t bandLen = subband_width * subband_width;

  /* sub-band layout inside buffer: HL(0), LH(1), HH(2), LL(3) */
  INT16* const hl = buffer;
  INT16* const lh = buffer + bandLen;
  INT16* const hh = buffer + bandLen * 2;
  INT16* const ll = buffer + bandLen * 3;

  /* intermediate layout inside idwt: L half first, H half right behind it */
  INT16* const l_dst = idwt;
  INT16* const h_dst = idwt + bandLen * 2;

  /* horizontal: LL + HL -> L, LH + HH -> H */
  rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
  rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);

  /* vertical: L + H -> buffer */
  rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
}

/**
 * Inverse 2D DWT of a whole tile component, coarsest level first.
 *
 * Each level writes its result to the place where the next finer level expects its LL
 * sub-band (3840 = 3072 + 3 * 16 * 16, 3072 = 0 + 3 * 32 * 32).
 *
 * @param buffer     the coefficients on input, the reconstructed samples on output
 * @param dwt_buffer scratch space shared by all three levels
 */
static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
{
  INT16* const level3 = &buffer[3840]; /* 8x8 sub-bands */
  INT16* const level2 = &buffer[3072]; /* 16x16 sub-bands */
  INT16* const level1 = &buffer[0];    /* 32x32 sub-bands */

  rfx_dwt_2d_decode_block_NEON(level3, dwt_buffer, 8);
  rfx_dwt_2d_decode_block_NEON(level2, dwt_buffer, 16);
  rfx_dwt_2d_decode_block_NEON(level1, dwt_buffer, 32);
}

/**
 * Horizontal inverse DWT pass of the "extrapolate" decoder for one low/high band pair.
 *
 * For each of the nDstCount rows the low row is first rewritten in place with the even output
 * samples; afterwards even and odd samples are interleaved into pDstBand. The vector loops
 * handle batchSize = (nLowCount + nHighCount) >> 1 coefficients per row in groups of eight;
 * the one low coefficient beyond batchSize is handled separately.
 *
 * Rows are assumed to be stored back to back: per row the cursors advance by batchSize + 1
 * (low), by 2 * batchSize + 1 (dst, or 2 * batchSize when batchSize is 32) and by batchSize
 * (high, or batchSize - 1 when batchSize is 32).
 *
 * NOTE(review): the literals 24 and 32 are hard-coded; they only take effect when batchSize
 * is 32, which is what the level 1 band counts in this file (33 + 31) produce.
 *
 * @param pLowBand   low-pass coefficients, overwritten with the even output samples
 * @param nLowStep   not referenced by this function
 * @param pHighBand  high-pass coefficients (read only)
 * @param nHighStep  not referenced by this function
 * @param pDstBand   receives the interleaved even/odd output rows
 * @param nDstStep   not referenced by this function
 * @param nLowCount  low coefficients per row (only used to derive batchSize)
 * @param nHighCount high coefficients per row (only used to derive batchSize)
 * @param nDstCount  number of rows to process
 */
static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
                                                   const INT16* restrict pHighBand,
                                                   size_t nHighStep, INT16* restrict pDstBand,
                                                   size_t nDstStep, size_t nLowCount,
                                                   size_t nHighCount, size_t nDstCount)
{
  WINPR_ASSERT(pLowBand);
  WINPR_ASSERT(pHighBand);
  WINPR_ASSERT(pDstBand);

  INT16* l_ptr = pLowBand;
  const INT16* h_ptr = pHighBand;
  INT16* dst_ptr = pDstBand;
  /* number of coefficients per row covered by the 8-wide vector loops */
  size_t batchSize = (nLowCount + nHighCount) >> 1;

  for (size_t y = 0; y < nDstCount; y++)
  {
    /* Even coefficients */
    size_t n = 0;
    for (; n < batchSize; n += 8)
    {
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
      int16x8_t l_n = vld1q_s16(l_ptr);
      int16x8_t h_n = vld1q_s16(h_ptr);
      /* this load starts one element before h_ptr; for n == 0 that lane is replaced below */
      int16x8_t h_n_m = vld1q_s16(h_ptr - 1);

      if (n == 0)
      {
        /* left edge: use h[0] (lane 1 of the shifted load) in place of h[-1] */
        int16_t first = vgetq_lane_s16(h_n_m, 1);
        h_n_m = vsetq_lane_s16(first, h_n_m, 0);
      }
      else if (n == 24)
        /* fourth group: lane 7 (index 31) is forced to zero instead of using the loaded value */
        h_n = vsetq_lane_s16(0, h_n, 7);

      int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
      tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
      tmp_n = vshrq_n_s16(tmp_n, 1);
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
      /* even samples are written back over the low row; the odd pass reads them from there */
      vst1q_s16(l_ptr, dst_n);
      l_ptr += 8;
      h_ptr += 8;
    }
    /* rows shorter than 32: the extra low coefficient l[batchSize] gets the last high
     * coefficient subtracted (no rounding, no shift) */
    if (n < 32)
      *l_ptr -= *(h_ptr - 1);

    /* rewind to the start of the row for the odd pass */
    l_ptr -= batchSize;
    h_ptr -= batchSize;

    /* Odd coefficients */
    n = 0;
    for (; n < batchSize; n += 8)
    {
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
      int16x8_t h_n = vld1q_s16(h_ptr);
      h_n = vshlq_n_s16(h_n, 1);
      int16x8x2_t dst_n;
      dst_n.val[0] = vld1q_s16(l_ptr);
      /* next even sample; in the last group lane 7 is the extra low coefficient l[batchSize] */
      int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

      if (n == 24)
        /* fourth group: the high contribution at index 31 is forced to zero */
        h_n = vsetq_lane_s16(0, h_n, 7);

      dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
      dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
      dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
      /* interleaving store: even samples from val[0], odd samples from val[1] */
      vst2q_s16(dst_ptr, dst_n);
      l_ptr += 8;
      h_ptr += 8;
      dst_ptr += 16;
    }
    if (n == 32)
    {
      /* batchSize 32: the high row is one element shorter than the vector loop stepped,
       * the low row one element longer; dst already sits at the start of the next row */
      h_ptr -= 1;
      l_ptr += 1;
    }
    else
    {
      /* shorter rows: copy the extra low coefficient as the final sample of the output row */
      *dst_ptr = *l_ptr;
      l_ptr += 1;
      dst_ptr += 1;
    }
  }
}

/**
 * Vertical inverse DWT pass of the "extrapolate" decoder.
 *
 * Combines the rows of the low and the high band into pDstBand. All even output rows are
 * written first, then the odd rows in between are computed from the even rows already stored
 * in pDstBand. Each row is processed in groups of eight samples, followed by a scalar step
 * for a single trailing sample when nDstCount is not a multiple of 8.
 *
 * Within a row all three cursors advance by one per processed sample, i.e. low and high rows
 * are assumed to be stored back to back. After a row dst_ptr additionally skips nDstStep
 * samples, which lands on the row after next when nDstStep equals the row length (the caller
 * in this file passes the same value for both).
 *
 * NOTE(review): the literals 31 and 32 are hard-coded; the n == 31 branches can only be
 * reached when forceBandSize is 32, which is what the level 1 band counts in this file
 * (33 + 31) produce.
 *
 * @param pLowBand   low band rows (read only)
 * @param nLowStep   not referenced by this function
 * @param pHighBand  high band rows (read only)
 * @param nHighStep  distance in samples between vertically adjacent high band samples
 * @param pDstBand   output rows
 * @param nDstStep   distance in samples between consecutive output rows
 * @param nLowCount  number of low band rows (only used to derive forceBandSize)
 * @param nHighCount number of high band rows (only used to derive forceBandSize)
 * @param nDstCount  number of samples per row
 */
static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
                                                  const INT16* restrict pHighBand, size_t nHighStep,
                                                  INT16* restrict pDstBand, size_t nDstStep,
                                                  size_t nLowCount, size_t nHighCount,
                                                  size_t nDstCount)
{
  WINPR_ASSERT(pLowBand);
  WINPR_ASSERT(pHighBand);
  WINPR_ASSERT(pDstBand);

  const INT16* l_ptr = pLowBand;
  const INT16* h_ptr = pHighBand;
  INT16* dst_ptr = pDstBand;
  /* samples per row handled by the vector loops: nDstCount rounded down to a multiple of 8 */
  size_t batchSize = (nDstCount >> 3) << 3;
  /* number of row pairs handled by the main even/odd loops */
  size_t forceBandSize = (nLowCount + nHighCount) >> 1;

  /* Even coefficients */
  for (size_t n = 0; n < forceBandSize; n++)
  {
    for (size_t x = 0; x < batchSize; x += 8)
    {
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
      int16x8_t l_n = vld1q_s16(l_ptr);
      /* row 31: the previous high row is read instead of the current one */
      int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
      int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

      if (n == 0)
        /* first row: no previous high row, the current one is counted twice */
        tmp_n = vaddq_s16(tmp_n, h_n);
      else if (n < 31)
      {
        int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
        tmp_n = vaddq_s16(tmp_n, h_n_m);
      }
      /* for n == 31 nothing else is added: the result is l - ((h[n-1] + 1) >> 1) */

      tmp_n = vshrq_n_s16(tmp_n, 1);
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
      vst1q_s16(dst_ptr, dst_n);
      l_ptr += 8;
      h_ptr += 8;
      dst_ptr += 8;
    }

    /* scalar version of the loop body above for one trailing sample */
    if (nDstCount > batchSize)
    {
      int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
      int16_t tmp_n = h_n + 1;
      if (n == 0)
        tmp_n += h_n;
      else if (n < 31)
        tmp_n += *(h_ptr - nHighStep);
      tmp_n >>= 1;
      *dst_ptr = *l_ptr - tmp_n;
      l_ptr += 1;
      h_ptr += 1;
      dst_ptr += 1;
    }

    /* skip the odd row that is filled in by the second pass */
    dst_ptr += nDstStep;
  }

  /* One more even row when fewer than 32 row pairs were processed: the next low row minus
   * the high row one step back (no rounding, no shift). */
  if (forceBandSize < 32)
  {
    for (size_t x = 0; x < batchSize; x += 8)
    {
      int16x8_t l_n = vld1q_s16(l_ptr);
      int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
      int16x8_t tmp_n = vsubq_s16(l_n, h_n);
      vst1q_s16(dst_ptr, tmp_n);
      l_ptr += 8;
      h_ptr += 8;
      dst_ptr += 8;
    }

    /* trailing sample of that extra row */
    if (nDstCount > batchSize)
    {
      *dst_ptr = *l_ptr - *(h_ptr - nHighStep);
      l_ptr += 1;
      h_ptr += 1;
      dst_ptr += 1;
    }
  }

  /* restart at the first high row and the first odd output row; l_ptr is deliberately not
   * reset, it keeps pointing at the low row following the ones consumed by the even pass */
  h_ptr = pHighBand;
  dst_ptr = pDstBand + nDstStep;

  /* Odd coefficients */
  for (size_t n = 0; n < forceBandSize; n++)
  {
    for (size_t x = 0; x < batchSize; x += 8)
    {
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
      /* even row directly above, written by the first pass */
      int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
      if (n == 31)
      {
        /* row 31: averaged with the remaining low row instead of an even row below;
         * no high band term is added */
        int16x8_t dst_n_p = vld1q_s16(l_ptr);
        l_ptr += 8;
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
        tmp_n = vshrq_n_s16(tmp_n, 1);
      }
      else
      {
        /* average of the even rows above and below plus twice the high band sample */
        int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
        tmp_n = vshrq_n_s16(tmp_n, 1);
        int16x8_t h_n = vld1q_s16(h_ptr);
        h_n = vshlq_n_s16(h_n, 1);
        tmp_n = vaddq_s16(tmp_n, h_n);
      }
      vst1q_s16(dst_ptr, tmp_n);
      h_ptr += 8;
      dst_ptr += 8;
    }

    /* scalar version of the loop body above for one trailing sample */
    if (nDstCount > batchSize)
    {
      int16_t tmp_n = *(dst_ptr - nDstStep);
      if (n == 31)
      {
        int16_t dst_n_p = *l_ptr;
        l_ptr += 1;
        tmp_n += dst_n_p;
        tmp_n >>= 1;
      }
      else
      {
        int16_t dst_n_p = *(dst_ptr + nDstStep);
        tmp_n += dst_n_p;
        tmp_n >>= 1;
        int16_t h_n = *h_ptr;
        h_n <<= 1;
        tmp_n += h_n;
      }
      *dst_ptr = tmp_n;
      h_ptr += 1;
      dst_ptr += 1;
    }

    /* skip the even row that follows */
    dst_ptr += nDstStep;
  }
}

/**
 * Number of low band coefficients per row/column at the given decomposition level.
 *
 * @param level decomposition level; the callers in this file pass 1, 2 or 3
 * @return (64 >> level) + 1, i.e. 33, 17 and 9 for levels 1, 2 and 3
 */
static INLINE size_t prfx_get_band_l_count(size_t level)
{
  const int halved = 64 >> level;
  return halved + 1;
}

/**
 * Number of high band coefficients per row/column at the given decomposition level.
 *
 * @param level decomposition level; the callers in this file pass 1, 2 or 3
 * @return 31 for level 1, otherwise (64 + (1 << (level - 1))) >> level,
 *         i.e. 16 and 8 for levels 2 and 3
 */
static INLINE size_t prfx_get_band_h_count(size_t level)
{
  /* level 1 is special-cased: the rounding formula below would yield 32 instead of 31 */
  if (level != 1)
    return (64 + (1 << (level - 1))) >> level;

  return (64 >> 1) - 1;
}

/**
 * Inverse 2D DWT of one decomposition level of the "extrapolate" decoder.
 *
 * The sub-bands are stored back to back in buffer in HL, LH, HH, LL order with the band
 * sizes given by prfx_get_band_l_count() / prfx_get_band_h_count(). Two horizontal passes
 * build the L and H halves in temp, the vertical pass merges them and stores the result at
 * the start of buffer.
 *
 * @param buffer sub-bands of this level on input, reconstructed block on output
 * @param temp   scratch space for the L and H halves
 * @param level  decomposition level
 */
static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
                                                            size_t level)
{
  WINPR_ASSERT(buffer);
  WINPR_ASSERT(temp);

  const size_t nBandL = prfx_get_band_l_count(level);
  const size_t nBandH = prfx_get_band_h_count(level);
  /* row length of the intermediate halves and of the final block */
  const size_t nDstStep = nBandL + nBandH;

  /* sub-band layout inside buffer */
  INT16* const HL = &buffer[0];
  INT16* const LH = HL + (nBandH * nBandL);
  INT16* const HH = LH + (nBandL * nBandH);
  INT16* const LL = HH + (nBandH * nBandH);

  /* intermediate layout inside temp: L half (nBandL rows), then H half */
  INT16* const L = &temp[0];
  INT16* const H = L + (nBandL * nDstStep);

  /* the final result replaces the sub-bands */
  INT16* const LLx = &buffer[0];

  /* horizontal (LL + HL -> L) */
  rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStep, nBandL, nBandH, nBandL);

  /* horizontal (LH + HH -> H) */
  rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStep, nBandL, nBandH, nBandH);

  /* vertical (L + H -> LL) */
  rfx_idwt_extrapolate_vert_neon(L, nDstStep, H, nDstStep, LLx, nDstStep, nBandL, nBandH,
                                 nBandL + nBandH);
}

/**
 * Inverse 2D DWT of a whole tile component for the "extrapolate" layout, level 3 first.
 *
 * @param buffer coefficients on input, reconstructed samples on output
 * @param temp   scratch space shared by all three levels
 */
static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
{
  /* start of each level's sub-band group inside buffer, indexed by (level - 1) */
  static const size_t levelStart[] = { 0, 3007, 3807 };

  WINPR_ASSERT(buffer);
  WINPR_ASSERT(temp);

  for (size_t level = 3; level >= 1; level--)
    rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[levelStart[level - 1]], temp, level);
}
#endif // NEON_ENABLED

/**
 * Install the NEON implementations of the RemoteFX decode steps into the codec context.
 *
 * When the file is built without NEON support the function does nothing. Otherwise the
 * quantization and DWT decode callbacks are replaced, provided the processor reports NEON
 * instructions at run time.
 *
 * @param context codec context whose decode callbacks are replaced
 */
void rfx_init_neon(RFX_CONTEXT* context)
{
#if !defined(NEON_ENABLED)
  WINPR_UNUSED(context);
#else
  /* leave the existing callbacks in place when the CPU lacks NEON */
  if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
    return;

  DEBUG_RFX("Using NEON optimizations");
  PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
  PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
                  "rfx_quantization_decode_NEON");
  PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");

  context->quantization_decode = rfx_quantization_decode_NEON;
  context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
  context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
#endif
}

Coverage Report

Created: 2024-09-08 06:18

Line	Count	Source
1		/*
2		FreeRDP: A Remote Desktop Protocol Implementation
3		RemoteFX Codec Library - NEON Optimizations
4
5		Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
6
7		Licensed under the Apache License, Version 2.0 (the "License");
8		you may not use this file except in compliance with the License.
9		You may obtain a copy of the License at
10
11		http://www.apache.org/licenses/LICENSE-2.0
12
13		Unless required by applicable law or agreed to in writing, software
14		distributed under the License is distributed on an "AS IS" BASIS,
15		WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16		See the License for the specific language governing permissions and
17		limitations under the License.
18		*/
19
20		#include <winpr/platform.h>
21		#include <freerdp/config.h>
22		#include <freerdp/log.h>
23
24		#include "../rfx_types.h"
25		#include "rfx_neon.h"
26
27		#define TAG FREERDP_TAG("codec.rfx.neon")
28
29		#if defined(WITH_NEON)
30		#if defined(_M_ARM64) \|\| defined(_M_ARM)
31		#define NEON_ENABLED
32		#endif
33		#endif
34
35		#if defined(NEON_ENABLED)
36
37		#include <stdio.h>
38		#include <stdlib.h>
39		#include <string.h>
40		#include <arm_neon.h>
41		#include <winpr/sysinfo.h>
42
43		/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
44
45		static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
46		rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
47		{
48		int16x8_t quantFactors = vdupq_n_s16(factor);
49		int16x8_t* buf = (int16x8_t*)buffer;
50		int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
51
52		do
53		{
54		int16x8_t val = vld1q_s16((INT16*)buf);
55		val = vshlq_s16(val, quantFactors);
56		vst1q_s16((INT16*)buf, val);
57		buf++;
58		} while (buf < buf_end);
59		}
60
61		static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
62		{
63		WINPR_ASSERT(buffer);
64		WINPR_ASSERT(quantVals);
65
66		rfx_quantization_decode_block_NEON(&buffer[0], 1024, quantVals[8] - 1); /* HL1 */
67		rfx_quantization_decode_block_NEON(&buffer[1024], 1024, quantVals[7] - 1); /* LH1 */
68		rfx_quantization_decode_block_NEON(&buffer[2048], 1024, quantVals[9] - 1); /* HH1 */
69		rfx_quantization_decode_block_NEON(&buffer[3072], 256, quantVals[5] - 1); /* HL2 */
70		rfx_quantization_decode_block_NEON(&buffer[3328], 256, quantVals[4] - 1); /* LH2 */
71		rfx_quantization_decode_block_NEON(&buffer[3584], 256, quantVals[6] - 1); /* HH2 */
72		rfx_quantization_decode_block_NEON(&buffer[3840], 64, quantVals[2] - 1); /* HL3 */
73		rfx_quantization_decode_block_NEON(&buffer[3904], 64, quantVals[1] - 1); /* LH3 */
74		rfx_quantization_decode_block_NEON(&buffer[3968], 64, quantVals[3] - 1); /* HH3 */
75		rfx_quantization_decode_block_NEON(&buffer[4032], 64, quantVals[0] - 1); /* LL3 */
76		}
77
78		static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
79		rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
80		INT16* WINPR_RESTRICT dst, size_t subband_width)
81		{
82		INT16* l_ptr = l;
83		INT16* h_ptr = h;
84		INT16* dst_ptr = dst;
85
86		for (size_t y = 0; y < subband_width; y++)
87		{
88		/* Even coefficients */
89		for (size_t n = 0; n < subband_width; n += 8)
90		{
91		// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
92		int16x8_t l_n = vld1q_s16(l_ptr);
93		int16x8_t h_n = vld1q_s16(h_ptr);
94		int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
95
96		if (n == 0)
97		{
98		int16_t first = vgetq_lane_s16(h_n_m, 1);
99		h_n_m = vsetq_lane_s16(first, h_n_m, 0);
100		}
101
102		int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
103		tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
104		tmp_n = vshrq_n_s16(tmp_n, 1);
105		int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
106		vst1q_s16(l_ptr, dst_n);
107		l_ptr += 8;
108		h_ptr += 8;
109		}
110
111		l_ptr -= subband_width;
112		h_ptr -= subband_width;
113
114		/* Odd coefficients */
115		for (size_t n = 0; n < subband_width; n += 8)
116		{
117		// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
118		int16x8_t h_n = vld1q_s16(h_ptr);
119		h_n = vshlq_n_s16(h_n, 1);
120		int16x8x2_t dst_n;
121		dst_n.val[0] = vld1q_s16(l_ptr);
122		int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
123
124		if (n == subband_width - 8)
125		{
126		int16_t last = vgetq_lane_s16(dst_n_p, 6);
127		dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
128		}
129
130		dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
131		dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
132		dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
133		vst2q_s16(dst_ptr, dst_n);
134		l_ptr += 8;
135		h_ptr += 8;
136		dst_ptr += 16;
137		}
138		}
139		}
140
141		static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142		rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
143		INT16* WINPR_RESTRICT dst, size_t subband_width)
144		{
145		INT16* l_ptr = l;
146		INT16* h_ptr = h;
147		INT16* dst_ptr = dst;
148		const size_t total_width = subband_width + subband_width;
149
150		/* Even coefficients */
151		for (size_t n = 0; n < subband_width; n++)
152		{
153		for (size_t x = 0; x < total_width; x += 8)
154		{
155		// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
156		int16x8_t l_n = vld1q_s16(l_ptr);
157		int16x8_t h_n = vld1q_s16(h_ptr);
158		int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
159
160		if (n == 0)
161		tmp_n = vaddq_s16(tmp_n, h_n);
162		else
163		{
164		int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
165		tmp_n = vaddq_s16(tmp_n, h_n_m);
166		}
167
168		tmp_n = vshrq_n_s16(tmp_n, 1);
169		int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
170		vst1q_s16(dst_ptr, dst_n);
171		l_ptr += 8;
172		h_ptr += 8;
173		dst_ptr += 8;
174		}
175
176		dst_ptr += total_width;
177		}
178
179		h_ptr = h;
180		dst_ptr = dst + total_width;
181
182		/* Odd coefficients */
183		for (size_t n = 0; n < subband_width; n++)
184		{
185		for (size_t x = 0; x < total_width; x += 8)
186		{
187		// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
188		int16x8_t h_n = vld1q_s16(h_ptr);
189		int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
190		h_n = vshlq_n_s16(h_n, 1);
191		int16x8_t tmp_n = dst_n_m;
192
193		if (n == subband_width - 1)
194		tmp_n = vaddq_s16(tmp_n, dst_n_m);
195		else
196		{
197		int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
198		tmp_n = vaddq_s16(tmp_n, dst_n_p);
199		}
200
201		tmp_n = vshrq_n_s16(tmp_n, 1);
202		int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
203		vst1q_s16(dst_ptr, dst_n);
204		h_ptr += 8;
205		dst_ptr += 8;
206		}
207
208		dst_ptr += total_width;
209		}
210		}
211
212		static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
213		rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
214		size_t subband_width)
215		{
216		INT16 hl, lh, hh, ll;
217		INT16 l_dst, h_dst;
218		/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
219		*/
220		/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
221		/* The lower part L uses LL(3) and HL(0). */
222		/* The higher part H uses LH(1) and HH(2). */
223		ll = buffer + subband_width * subband_width * 3;
224		hl = buffer;
225		l_dst = idwt;
226		rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
227		lh = buffer + subband_width * subband_width;
228		hh = buffer + subband_width * subband_width * 2;
229		h_dst = idwt + subband_width * subband_width * 2;
230		rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
231		/* Inverse DWT in vertical direction, results are stored in original buffer. */
232		rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
233		}
234
235		static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
236		{
237		rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
238		rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
239		rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
240		}
241
242		static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
243		const INT16* restrict pHighBand,
244		size_t nHighStep, INT16* restrict pDstBand,
245		size_t nDstStep, size_t nLowCount,
246		size_t nHighCount, size_t nDstCount)
247		{
248		WINPR_ASSERT(pLowBand);
249		WINPR_ASSERT(pHighBand);
250		WINPR_ASSERT(pDstBand);
251
252		INT16* l_ptr = pLowBand;
253		const INT16* h_ptr = pHighBand;
254		INT16* dst_ptr = pDstBand;
255		size_t batchSize = (nLowCount + nHighCount) >> 1;
256
257		for (size_t y = 0; y < nDstCount; y++)
258		{
259		/* Even coefficients */
260		size_t n = 0;
261		for (; n < batchSize; n += 8)
262		{
263		// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
264		int16x8_t l_n = vld1q_s16(l_ptr);
265		int16x8_t h_n = vld1q_s16(h_ptr);
266		int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
267
268		if (n == 0)
269		{
270		int16_t first = vgetq_lane_s16(h_n_m, 1);
271		h_n_m = vsetq_lane_s16(first, h_n_m, 0);
272		}
273		else if (n == 24)
274		h_n = vsetq_lane_s16(0, h_n, 7);
275
276		int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
277		tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
278		tmp_n = vshrq_n_s16(tmp_n, 1);
279		int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
280		vst1q_s16(l_ptr, dst_n);
281		l_ptr += 8;
282		h_ptr += 8;
283		}
284		if (n < 32)
285		l_ptr -= (h_ptr - 1);
286
287		l_ptr -= batchSize;
288		h_ptr -= batchSize;
289
290		/* Odd coefficients */
291		n = 0;
292		for (; n < batchSize; n += 8)
293		{
294		// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
295		int16x8_t h_n = vld1q_s16(h_ptr);
296		h_n = vshlq_n_s16(h_n, 1);
297		int16x8x2_t dst_n;
298		dst_n.val[0] = vld1q_s16(l_ptr);
299		int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
300
301		if (n == 24)
302		h_n = vsetq_lane_s16(0, h_n, 7);
303
304		dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
305		dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
306		dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
307		vst2q_s16(dst_ptr, dst_n);
308		l_ptr += 8;
309		h_ptr += 8;
310		dst_ptr += 16;
311		}
312		if (n == 32)
313		{
314		h_ptr -= 1;
315		l_ptr += 1;
316		}
317		else
318		{
319		dst_ptr = l_ptr;
320		l_ptr += 1;
321		dst_ptr += 1;
322		}
323		}
324		}
325
326		static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
327		const INT16* restrict pHighBand, size_t nHighStep,
328		INT16* restrict pDstBand, size_t nDstStep,
329		size_t nLowCount, size_t nHighCount,
330		size_t nDstCount)
331		{
332		WINPR_ASSERT(pLowBand);
333		WINPR_ASSERT(pHighBand);
334		WINPR_ASSERT(pDstBand);
335
336		const INT16* l_ptr = pLowBand;
337		const INT16* h_ptr = pHighBand;
338		INT16* dst_ptr = pDstBand;
339		size_t batchSize = (nDstCount >> 3) << 3;
340		size_t forceBandSize = (nLowCount + nHighCount) >> 1;
341
342		/* Even coefficients */
343		for (size_t n = 0; n < forceBandSize; n++)
344		{
345		for (size_t x = 0; x < batchSize; x += 8)
346		{
347		// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
348		int16x8_t l_n = vld1q_s16(l_ptr);
349		int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
350		int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
351
352		if (n == 0)
353		tmp_n = vaddq_s16(tmp_n, h_n);
354		else if (n < 31)
355		{
356		int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
357		tmp_n = vaddq_s16(tmp_n, h_n_m);
358		}
359
360		tmp_n = vshrq_n_s16(tmp_n, 1);
361		int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
362		vst1q_s16(dst_ptr, dst_n);
363		l_ptr += 8;
364		h_ptr += 8;
365		dst_ptr += 8;
366		}
367
368		if (nDstCount > batchSize)
369		{
370		int16_t h_n = (n == 31) ? (h_ptr - nHighStep) : h_ptr;
371		int16_t tmp_n = h_n + 1;
372		if (n == 0)
373		tmp_n += h_n;
374		else if (n < 31)
375		tmp_n += *(h_ptr - nHighStep);
376		tmp_n >>= 1;
377		dst_ptr = l_ptr - tmp_n;
378		l_ptr += 1;
379		h_ptr += 1;
380		dst_ptr += 1;
381		}
382
383		dst_ptr += nDstStep;
384		}
385
386		if (forceBandSize < 32)
387		{
388		for (size_t x = 0; x < batchSize; x += 8)
389		{
390		int16x8_t l_n = vld1q_s16(l_ptr);
391		int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
392		int16x8_t tmp_n = vsubq_s16(l_n, h_n);
393		vst1q_s16(dst_ptr, tmp_n);
394		l_ptr += 8;
395		h_ptr += 8;
396		dst_ptr += 8;
397		}
398
399		if (nDstCount > batchSize)
400		{
401		dst_ptr = l_ptr - *(h_ptr - nHighStep);
402		l_ptr += 1;
403		h_ptr += 1;
404		dst_ptr += 1;
405		}
406		}
407
408		h_ptr = pHighBand;
409		dst_ptr = pDstBand + nDstStep;
410
411		/* Odd coefficients */
412		for (size_t n = 0; n < forceBandSize; n++)
413		{
414		for (size_t x = 0; x < batchSize; x += 8)
415		{
416		// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
417		int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
418		if (n == 31)
419		{
420		int16x8_t dst_n_p = vld1q_s16(l_ptr);
421		l_ptr += 8;
422		tmp_n = vaddq_s16(tmp_n, dst_n_p);
423		tmp_n = vshrq_n_s16(tmp_n, 1);
424		}
425		else
426		{
427		int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
428		tmp_n = vaddq_s16(tmp_n, dst_n_p);
429		tmp_n = vshrq_n_s16(tmp_n, 1);
430		int16x8_t h_n = vld1q_s16(h_ptr);
431		h_n = vshlq_n_s16(h_n, 1);
432		tmp_n = vaddq_s16(tmp_n, h_n);
433		}
434		vst1q_s16(dst_ptr, tmp_n);
435		h_ptr += 8;
436		dst_ptr += 8;
437		}
438
439		if (nDstCount > batchSize)
440		{
441		int16_t tmp_n = *(dst_ptr - nDstStep);
442		if (n == 31)
443		{
444		int16_t dst_n_p = *l_ptr;
445		l_ptr += 1;
446		tmp_n += dst_n_p;
447		tmp_n >>= 1;
448		}
449		else
450		{
451		int16_t dst_n_p = *(dst_ptr + nDstStep);
452		tmp_n += dst_n_p;
453		tmp_n >>= 1;
454		int16_t h_n = *h_ptr;
455		h_n <<= 1;
456		tmp_n += h_n;
457		}
458		*dst_ptr = tmp_n;
459		h_ptr += 1;
460		dst_ptr += 1;
461		}
462
463		dst_ptr += nDstStep;
464		}
465		}
466
467		static INLINE size_t prfx_get_band_l_count(size_t level)
468		{
469		return (64 >> level) + 1;
470		}
471
472		static INLINE size_t prfx_get_band_h_count(size_t level)
473		{
474		if (level == 1)
475		return (64 >> 1) - 1;
476		else
477		return (64 + (1 << (level - 1))) >> level;
478		}
479
480		static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
481		size_t level)
482		{
483		size_t nDstStepX;
484		size_t nDstStepY;
485		INT16 HL, LH;
486		INT16 HH, LL;
487		INT16 L, H, *LLx;
488
489		const size_t nBandL = prfx_get_band_l_count(level);
490		const size_t nBandH = prfx_get_band_h_count(level);
491		size_t offset = 0;
492
493		WINPR_ASSERT(buffer);
494		WINPR_ASSERT(temp);
495
496		HL = &buffer[offset];
497		offset += (nBandH * nBandL);
498		LH = &buffer[offset];
499		offset += (nBandL * nBandH);
500		HH = &buffer[offset];
501		offset += (nBandH * nBandH);
502		LL = &buffer[offset];
503		nDstStepX = (nBandL + nBandH);
504		nDstStepY = (nBandL + nBandH);
505		offset = 0;
506		L = &temp[offset];
507		offset += (nBandL * nDstStepX);
508		H = &temp[offset];
509		LLx = &buffer[0];
510
511		/* horizontal (LL + HL -> L) */
512		rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);
513
514		/* horizontal (LH + HH -> H) */
515		rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);
516
517		/* vertical (L + H -> LL) */
518		rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
519		nBandL + nBandH);
520		}
521
522		static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
523		{
524		WINPR_ASSERT(buffer);
525		WINPR_ASSERT(temp);
526		rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
527		rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
528		rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
529		}
530		#endif // NEON_ENABLED
531
532		void rfx_init_neon(RFX_CONTEXT* context)
533	10.8k	{
534		#if defined(NEON_ENABLED)
535		if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
536		{
537		DEBUG_RFX("Using NEON optimizations");
538		PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
539		PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
540		"rfx_quantization_decode_NEON");
541		PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
542		context->quantization_decode = rfx_quantization_decode_NEON;
543		context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
544		context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
545		}
546		#else
547	10.8k	WINPR_UNUSED(context);
548	10.8k	#endif
549	10.8k	}