/src/FreeRDP/libfreerdp/codec/neon/rfx_neon.c

Source (jump to first uncovered line)
/*
   FreeRDP: A Remote Desktop Protocol Implementation
   RemoteFX Codec Library - NEON Optimizations

   Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

#include <winpr/platform.h>
#include <freerdp/config.h>
#include <freerdp/log.h>

#include "../rfx_types.h"
#include "rfx_neon.h"

#include "../../core/simd.h"

#if defined(NEON_INTRINSICS_ENABLED)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arm_neon.h>
#include <winpr/sysinfo.h>

/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */

/*
 * Dequantize one sub-band in place: every coefficient is shifted left by `factor` bits.
 *
 * buffer      - coefficients of the sub-band, modified in place
 * buffer_size - number of INT16 coefficients; they are processed in groups of 8, so a
 *               trailing partial group is still loaded and stored as a full vector
 * factor      - shift count, broadcast to all 8 lanes
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
{
  /* A shift by zero leaves every coefficient unchanged, so the pass can be skipped. */
  if (factor == 0)
    return;

  /* vshlq_s16 takes a per-lane signed shift count; make the narrowing explicit. */
  const int16x8_t quantFactors = vdupq_n_s16((int16_t)factor);
  const INT16* end = buffer + buffer_size;

  /* Pre-tested loop: an empty sub-band must not be read or written at all. */
  for (INT16* ptr = buffer; ptr < end; ptr += 8)
  {
    int16x8_t val = vld1q_s16(ptr);
    val = vshlq_s16(val, quantFactors);
    vst1q_s16(ptr, val);
  }
}

/*
 * Dequantize the ten sub-bands of one coefficient buffer in place.
 *
 * Each sub-band is shifted left by (quantVals[index] - 1) bits, where the index used for
 * every sub-band is listed in the table below.
 */
static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
{
  /* Sub-band layout: start offset in buffer, coefficient count, index into quantVals. */
  static const struct
  {
    size_t start;
    size_t length;
    size_t quantIndex;
  } subbands[] = {
    { 0, 1024, 8 },    /* HL1 */
    { 1024, 1024, 7 }, /* LH1 */
    { 2048, 1024, 9 }, /* HH1 */
    { 3072, 256, 5 },  /* HL2 */
    { 3328, 256, 4 },  /* LH2 */
    { 3584, 256, 6 },  /* HH2 */
    { 3840, 64, 2 },   /* HL3 */
    { 3904, 64, 1 },   /* LH3 */
    { 3968, 64, 3 },   /* HH3 */
    { 4032, 64, 0 }    /* LL3 */
  };

  WINPR_ASSERT(buffer);
  WINPR_ASSERT(quantVals);

  for (size_t i = 0; i < sizeof(subbands) / sizeof(subbands[0]); i++)
  {
    const UINT32 shift = quantVals[subbands[i].quantIndex] - 1;
    rfx_quantization_decode_block_NEON(&buffer[subbands[i].start], subbands[i].length, shift);
  }
}

/*
 * Horizontal inverse DWT of one square block.
 *
 * For each of the subband_width rows the even outputs are first computed in place in `l`,
 * then the odd outputs are derived from them and from `h`; even and odd values are written
 * interleaved to `dst` (2 * subband_width values per row).
 *
 * l             - low-pass rows, overwritten with the even outputs
 * h             - high-pass rows, only read
 * dst           - interleaved result
 * subband_width - row length and row count; rows are consumed in groups of 8
 *                 (the callers in this file pass 8, 16 and 32)
 *
 * The neighbour vectors at the row borders are built from registers that are already loaded
 * instead of loading from h - 1 / l + 1, so no element outside the current row is read.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                   INT16* WINPR_RESTRICT dst, size_t subband_width)
{
  INT16* l_ptr = l;
  INT16* h_ptr = h;
  INT16* dst_ptr = dst;

  for (size_t y = 0; y < subband_width; y++)
  {
    /* Even coefficients */
    for (size_t n = 0; n < subband_width; n += 8)
    {
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
      const int16x8_t l_n = vld1q_s16(l_ptr);
      const int16x8_t h_n = vld1q_s16(h_ptr);
      int16x8_t h_n_m;

      if (n == 0)
      {
        /* There is no h[-1] in this row: use { h[0], h[0], h[1], ..., h[6] }.
         * vextq_s16(h_n, h_n, 7) yields { h[7], h[0], ..., h[6] }; lane 0 is then fixed up. */
        h_n_m = vextq_s16(h_n, h_n, 7);
        h_n_m = vsetq_lane_s16(vgetq_lane_s16(h_n, 0), h_n_m, 0);
      }
      else
        h_n_m = vld1q_s16(h_ptr - 1);

      int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
      tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
      tmp_n = vshrq_n_s16(tmp_n, 1);
      const int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
      vst1q_s16(l_ptr, dst_n);
      l_ptr += 8;
      h_ptr += 8;
    }

    /* Back to the start of the row for the second pass. */
    l_ptr -= subband_width;
    h_ptr -= subband_width;

    /* Odd coefficients */
    for (size_t n = 0; n < subband_width; n += 8)
    {
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
      int16x8_t h_n = vld1q_s16(h_ptr);
      h_n = vshlq_n_s16(h_n, 1);
      int16x8x2_t dst_n;
      dst_n.val[0] = vld1q_s16(l_ptr);
      int16x8_t dst_n_p;

      if (n + 8 == subband_width)
      {
        /* Last group of the row: there is no even value to the right, so the last one is
         * repeated: { e[1], ..., e[7], e[7] }.
         * vextq_s16(e, e, 1) yields { e[1], ..., e[7], e[0] }; lane 7 is then fixed up. */
        dst_n_p = vextq_s16(dst_n.val[0], dst_n.val[0], 1);
        dst_n_p = vsetq_lane_s16(vgetq_lane_s16(dst_n.val[0], 7), dst_n_p, 7);
      }
      else
        dst_n_p = vld1q_s16(l_ptr + 1);

      dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
      dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
      dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
      /* Interleaving store: even0, odd0, even1, odd1, ... */
      vst2q_s16(dst_ptr, dst_n);
      l_ptr += 8;
      h_ptr += 8;
      dst_ptr += 16;
    }
  }
}

/*
 * Vertical inverse DWT of one block.
 *
 * `l` and `h` each hold subband_width rows of 2 * subband_width coefficients. The result
 * written to `dst` has 2 * subband_width rows: row 2n is the even output of input row n,
 * row 2n + 1 the odd output. All even rows are produced first because the odd rows are
 * computed from their even neighbours.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                  INT16* WINPR_RESTRICT dst, size_t subband_width)
{
  const size_t stride = subband_width + subband_width;

  /* Pass 1 - even rows: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1) */
  for (size_t row = 0; row < subband_width; row++)
  {
    INT16* lowRow = l + row * stride;
    INT16* highRow = h + row * stride;
    INT16* outRow = dst + (2 * row) * stride;

    for (size_t col = 0; col < stride; col += 8)
    {
      const int16x8_t low = vld1q_s16(lowRow + col);
      const int16x8_t high = vld1q_s16(highRow + col);
      /* The first row has no predecessor in h, so h[n] is counted twice. */
      const int16x8_t highPrev = (row == 0) ? high : vld1q_s16(highRow + col - stride);

      int16x8_t sum = vaddq_s16(high, vdupq_n_s16(1));
      sum = vaddq_s16(sum, highPrev);
      sum = vshrq_n_s16(sum, 1);
      vst1q_s16(outRow + col, vsubq_s16(low, sum));
    }
  }

  /* Pass 2 - odd rows: dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1) */
  for (size_t row = 0; row < subband_width; row++)
  {
    INT16* highRow = h + row * stride;
    INT16* outRow = dst + (2 * row + 1) * stride;

    for (size_t col = 0; col < stride; col += 8)
    {
      const int16x8_t highTwice = vshlq_n_s16(vld1q_s16(highRow + col), 1);
      const int16x8_t above = vld1q_s16(outRow + col - stride);
      /* The last odd row has no even row below it, so the row above is counted twice. */
      const int16x8_t below =
          (row + 1 == subband_width) ? above : vld1q_s16(outRow + col + stride);

      const int16x8_t mean = vshrq_n_s16(vaddq_s16(above, below), 1);
      vst1q_s16(outRow + col, vaddq_s16(mean, highTwice));
    }
  }
}

/*
 * Inverse 2D DWT of one decomposition level.
 *
 * `buffer` holds four square sub-bands of subband_width * subband_width coefficients each,
 * stored in HL, LH, HH, LL order. Two horizontal passes write their output to the scratch
 * buffer `idwt` (L part first, H part behind it); the vertical pass then writes the final
 * result back to `buffer`.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
                             size_t subband_width)
{
  const size_t bandSize = subband_width * subband_width;

  INT16* const bandHL = buffer;
  INT16* const bandLH = buffer + bandSize;
  INT16* const bandHH = buffer + bandSize * 2;
  INT16* const bandLL = buffer + bandSize * 3;

  INT16* const lowHalf = idwt;
  INT16* const highHalf = idwt + bandSize * 2;

  /* Horizontal: the lower part L is built from LL and HL ... */
  rfx_dwt_2d_decode_block_horiz_NEON(bandLL, bandHL, lowHalf, subband_width);
  /* ... and the higher part H from LH and HH. */
  rfx_dwt_2d_decode_block_horiz_NEON(bandLH, bandHH, highHalf, subband_width);

  /* Vertical: L and H are merged back into the original buffer. */
  rfx_dwt_2d_decode_block_vert_NEON(lowHalf, highHalf, buffer, subband_width);
}

/*
 * Inverse 2D DWT of a full coefficient buffer, done in place.
 *
 * The three levels are decoded from the smallest block to the largest: sub-band width 8 at
 * offset 3840, width 16 at offset 3072 and width 32 at offset 0. `dwt_buffer` is used as
 * scratch space by every level.
 */
static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
{
  /* Same argument checks as the other decode entry points in this file. */
  WINPR_ASSERT(buffer);
  WINPR_ASSERT(dwt_buffer);

  rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
  rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
  rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
}

/*
 * Horizontal inverse DWT pass of the "extrapolate" decoder for one low/high band pair.
 *
 * nDstCount rows are processed. For every row the even outputs are computed in place in
 * pLowBand, then the odd outputs are derived from them and from pHighBand; even and odd
 * values are written interleaved to pDstBand.
 *
 * The vector loops cover batchSize = (nLowCount + nHighCount) / 2 coefficients in groups
 * of 8. With the band sizes produced by prfx_get_band_l_count/prfx_get_band_h_count for
 * levels 1, 2 and 3 this is 32, 16 and 8. The literals 24 and 32 below single out the
 * batchSize == 32 case.
 *
 * nLowStep, nHighStep and nDstStep are accepted but not used by this function.
 */
static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
                                                   const INT16* restrict pHighBand,
                                                   size_t nHighStep, INT16* restrict pDstBand,
                                                   size_t nDstStep, size_t nLowCount,
                                                   size_t nHighCount, size_t nDstCount)
{
  WINPR_ASSERT(pLowBand);
  WINPR_ASSERT(pHighBand);
  WINPR_ASSERT(pDstBand);

  INT16* l_ptr = pLowBand;
  const INT16* h_ptr = pHighBand;
  INT16* dst_ptr = pDstBand;
  size_t batchSize = (nLowCount + nHighCount) >> 1;

  for (size_t y = 0; y < nDstCount; y++)
  {
    /* Even coefficients */
    size_t n = 0;
    for (; n < batchSize; n += 8)
    {
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
      int16x8_t l_n = vld1q_s16(l_ptr);
      int16x8_t h_n = vld1q_s16(h_ptr);
      /* NOTE(review): for y == 0, n == 0 this load starts one element before pHighBand;
       * that lane is overwritten below, but the read itself still happens. */
      int16x8_t h_n_m = vld1q_s16(h_ptr - 1);

      if (n == 0)
      {
        /* No h[-1] at the start of a row: lane 0 is replaced by lane 1, i.e. h[0]. */
        int16_t first = vgetq_lane_s16(h_n_m, 1);
        h_n_m = vsetq_lane_s16(first, h_n_m, 0);
      }
      else if (n == 24)
        /* Last group of a 32-wide batch: lane 7 of h is forced to 0 - presumably because
         * the high row is one coefficient shorter than the batch (see h_ptr -= 1 below). */
        h_n = vsetq_lane_s16(0, h_n, 7);

      int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
      tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
      tmp_n = vshrq_n_s16(tmp_n, 1);
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
      /* Even results overwrite the low band; the odd pass below reads them back. */
      vst1q_s16(l_ptr, dst_n);
      l_ptr += 8;
      h_ptr += 8;
    }
    /* Batches shorter than 32: the low coefficient following the batch is handled as a
     * scalar, subtracting the last high coefficient of the batch. */
    if (n < 32)
      *l_ptr -= *(h_ptr - 1);

    /* Rewind both pointers to the start of the row for the odd pass. */
    l_ptr -= batchSize;
    h_ptr -= batchSize;

    /* Odd coefficients */
    n = 0;
    for (; n < batchSize; n += 8)
    {
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
      int16x8_t h_n = vld1q_s16(h_ptr);
      h_n = vshlq_n_s16(h_n, 1);
      int16x8x2_t dst_n;
      dst_n.val[0] = vld1q_s16(l_ptr);
      int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

      /* Same special case as in the even pass: drop the high term of the last lane. */
      if (n == 24)
        h_n = vsetq_lane_s16(0, h_n, 7);

      dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
      dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
      dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
      /* Interleaving store: even0, odd0, even1, odd1, ... (16 values). */
      vst2q_s16(dst_ptr, dst_n);
      l_ptr += 8;
      h_ptr += 8;
      dst_ptr += 16;
    }
    if (n == 32)
    {
      /* 32-wide batch: step the high pointer back by one and the low pointer forward by
       * one so both point at the start of their next row; nothing more is written. */
      h_ptr -= 1;
      l_ptr += 1;
    }
    else
    {
      /* Shorter batches: the trailing low coefficient is copied to the output as the last
       * value of the row. */
      *dst_ptr = *l_ptr;
      l_ptr += 1;
      dst_ptr += 1;
    }
  }
}

/*
 * Vertical inverse DWT pass of the "extrapolate" decoder.
 *
 * pLowBand and pHighBand are read row by row (each row nDstCount coefficients wide) and the
 * result is written to pDstBand, whose rows are nDstStep apart. All even output rows are
 * produced first; the odd rows are then computed from their even neighbours.
 *
 * Each row is processed in vector groups of 8 up to batchSize (nDstCount rounded down to a
 * multiple of 8), followed by at most one scalar element when nDstCount > batchSize.
 * forceBandSize = (nLowCount + nHighCount) / 2 is the number of row pairs handled by the
 * main loops; the literals 31 and 32 single out the forceBandSize == 32 case.
 *
 * nLowStep is accepted but not used. nHighStep is used as the distance between two high
 * band rows.
 */
static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
                                                  const INT16* restrict pHighBand, size_t nHighStep,
                                                  INT16* restrict pDstBand, size_t nDstStep,
                                                  size_t nLowCount, size_t nHighCount,
                                                  size_t nDstCount)
{
  WINPR_ASSERT(pLowBand);
  WINPR_ASSERT(pHighBand);
  WINPR_ASSERT(pDstBand);

  const INT16* l_ptr = pLowBand;
  const INT16* h_ptr = pHighBand;
  INT16* dst_ptr = pDstBand;
  size_t batchSize = (nDstCount >> 3) << 3;
  size_t forceBandSize = (nLowCount + nHighCount) >> 1;

  /* Even coefficients */
  for (size_t n = 0; n < forceBandSize; n++)
  {
    for (size_t x = 0; x < batchSize; x += 8)
    {
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
      int16x8_t l_n = vld1q_s16(l_ptr);
      /* Row 31 takes its high input from the previous high row instead of the current
       * position - presumably because no 32nd high row exists; TODO confirm. */
      int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
      int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

      /* Row 0 has no predecessor: h[n] is counted twice. Row 31 adds no second term. */
      if (n == 0)
        tmp_n = vaddq_s16(tmp_n, h_n);
      else if (n < 31)
      {
        int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
        tmp_n = vaddq_s16(tmp_n, h_n_m);
      }

      tmp_n = vshrq_n_s16(tmp_n, 1);
      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
      vst1q_s16(dst_ptr, dst_n);
      l_ptr += 8;
      h_ptr += 8;
      dst_ptr += 8;
    }

    /* Scalar tail: same computation for the one element beyond the vector groups. */
    if (nDstCount > batchSize)
    {
      int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
      int16_t tmp_n = h_n + 1;
      if (n == 0)
        tmp_n += h_n;
      else if (n < 31)
        tmp_n += *(h_ptr - nHighStep);
      tmp_n >>= 1;
      *dst_ptr = *l_ptr - tmp_n;
      l_ptr += 1;
      h_ptr += 1;
      dst_ptr += 1;
    }

    /* Skip the odd output row that lies between two even rows. */
    dst_ptr += nDstStep;
  }

  /* Fewer than 32 row pairs: one additional even row is produced from the next low row
   * minus the last high row (no rounding term, no shift). */
  if (forceBandSize < 32)
  {
    for (size_t x = 0; x < batchSize; x += 8)
    {
      int16x8_t l_n = vld1q_s16(l_ptr);
      int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
      int16x8_t tmp_n = vsubq_s16(l_n, h_n);
      vst1q_s16(dst_ptr, tmp_n);
      l_ptr += 8;
      h_ptr += 8;
      dst_ptr += 8;
    }

    if (nDstCount > batchSize)
    {
      *dst_ptr = *l_ptr - *(h_ptr - nHighStep);
      l_ptr += 1;
      h_ptr += 1;
      dst_ptr += 1;
    }
  }

  /* Restart at the first high row and the first odd output row. l_ptr is intentionally
   * NOT reset: row 31 of the odd pass reads the low row that follows the ones consumed
   * above. */
  h_ptr = pHighBand;
  dst_ptr = pDstBand + nDstStep;

  /* Odd coefficients */
  for (size_t n = 0; n < forceBandSize; n++)
  {
    for (size_t x = 0; x < batchSize; x += 8)
    {
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
      int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
      if (n == 31)
      {
        /* Last row of a 32-row band: the row "below" comes from the low band and no high
         * term is added. */
        int16x8_t dst_n_p = vld1q_s16(l_ptr);
        l_ptr += 8;
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
        tmp_n = vshrq_n_s16(tmp_n, 1);
      }
      else
      {
        /* Regular row: mean of the even rows above and below plus twice the high row. */
        int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
        tmp_n = vshrq_n_s16(tmp_n, 1);
        int16x8_t h_n = vld1q_s16(h_ptr);
        h_n = vshlq_n_s16(h_n, 1);
        tmp_n = vaddq_s16(tmp_n, h_n);
      }
      vst1q_s16(dst_ptr, tmp_n);
      h_ptr += 8;
      dst_ptr += 8;
    }

    /* Scalar tail: same computation for the one element beyond the vector groups. */
    if (nDstCount > batchSize)
    {
      int16_t tmp_n = *(dst_ptr - nDstStep);
      if (n == 31)
      {
        int16_t dst_n_p = *l_ptr;
        l_ptr += 1;
        tmp_n += dst_n_p;
        tmp_n >>= 1;
      }
      else
      {
        int16_t dst_n_p = *(dst_ptr + nDstStep);
        tmp_n += dst_n_p;
        tmp_n >>= 1;
        int16_t h_n = *h_ptr;
        h_n <<= 1;
        tmp_n += h_n;
      }
      *dst_ptr = tmp_n;
      h_ptr += 1;
      dst_ptr += 1;
    }

    /* Skip the even output row that was written in the first pass. */
    dst_ptr += nDstStep;
  }
}

/* Size of the L band at the given level: 64 halved `level` times, plus one. */
static INLINE size_t prfx_get_band_l_count(size_t level)
{
  const int halved = 64 >> level;
  return halved + 1;
}

/*
 * Size of the H band at the given level.
 *
 * Level 1 is a special case (64 / 2 - 1); every other level uses
 * (64 + 2^(level - 1)) >> level.
 */
static INLINE size_t prfx_get_band_h_count(size_t level)
{
  return (level == 1) ? ((64 >> 1) - 1) : ((64 + (1 << (level - 1))) >> level);
}

/*
 * Inverse 2D DWT of one level of the "extrapolate" layout.
 *
 * `buffer` holds the sub-bands of this level back to back in HL, LH, HH, LL order, sized
 * from the L/H band counts of `level`. Two horizontal passes write the intermediate L and
 * H halves to `temp`; the vertical pass writes the merged result to the start of `buffer`.
 */
static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
                                                            size_t level)
{
  WINPR_ASSERT(buffer);
  WINPR_ASSERT(temp);

  const size_t lowCount = prfx_get_band_l_count(level);
  const size_t highCount = prfx_get_band_h_count(level);
  /* Length of one reconstructed row; used as step for both intermediate and final rows. */
  const size_t rowLength = lowCount + highCount;

  /* Sub-band start positions inside buffer. */
  INT16* const bandHL = buffer;
  INT16* const bandLH = bandHL + highCount * lowCount;
  INT16* const bandHH = bandLH + lowCount * highCount;
  INT16* const bandLL = bandHH + highCount * highCount;

  /* Intermediate halves inside temp: lowCount rows of L followed by the H rows. */
  INT16* const lowRows = temp;
  INT16* const highRows = temp + lowCount * rowLength;

  /* horizontal (LL + HL -> L) */
  rfx_idwt_extrapolate_horiz_neon(bandLL, lowCount, bandHL, highCount, lowRows, rowLength,
                                  lowCount, highCount, lowCount);

  /* horizontal (LH + HH -> H) */
  rfx_idwt_extrapolate_horiz_neon(bandLH, lowCount, bandHH, highCount, highRows, rowLength,
                                  lowCount, highCount, highCount);

  /* vertical (L + H -> LL) */
  rfx_idwt_extrapolate_vert_neon(lowRows, rowLength, highRows, rowLength, buffer, rowLength,
                                 lowCount, highCount, lowCount + highCount);
}

/*
 * Inverse 2D DWT of a full coefficient buffer in the "extrapolate" layout, done in place.
 *
 * The levels are decoded from 3 down to 1; `temp` is used as scratch space by every level.
 */
static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
{
  /* Start offset of each level's sub-bands inside buffer, indexed by (level - 1). */
  static const size_t levelStart[] = { 0, 3007, 3807 };

  WINPR_ASSERT(buffer);
  WINPR_ASSERT(temp);

  for (size_t level = 3; level > 0; level--)
    rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[levelStart[level - 1]], temp, level);
}
#endif // NEON_INTRINSICS_ENABLED

/*
 * Install the NEON implementations into an RFX context.
 *
 * When NEON_INTRINSICS_ENABLED is defined, the quantization decode, 2D DWT decode and
 * extrapolating 2D DWT decode callbacks of `context` are replaced by the routines of this
 * file and the matching profilers are renamed. Otherwise only a log message is emitted and
 * `context` is left untouched.
 */
void rfx_init_neon_int(RFX_CONTEXT* WINPR_RESTRICT context)
{
#if defined(NEON_INTRINSICS_ENABLED)
  /* context is dereferenced below; check it like the other entry points of this file do. */
  WINPR_ASSERT(context);

  WLog_VRB(PRIM_TAG, "NEON optimizations");
  PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
  PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_NEON");
  PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
  context->quantization_decode = rfx_quantization_decode_NEON;
  context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
  context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or NEON intrinsics not available");
  WINPR_UNUSED(context);
#endif
}

Coverage Report

Created: 2025-07-01 06:46

Line	Count	Source (jump to first uncovered line)
1		/*
2		FreeRDP: A Remote Desktop Protocol Implementation
3		RemoteFX Codec Library - NEON Optimizations
4
5		Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
6
7		Licensed under the Apache License, Version 2.0 (the "License");
8		you may not use this file except in compliance with the License.
9		You may obtain a copy of the License at
10
11		http://www.apache.org/licenses/LICENSE-2.0
12
13		Unless required by applicable law or agreed to in writing, software
14		distributed under the License is distributed on an "AS IS" BASIS,
15		WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16		See the License for the specific language governing permissions and
17		limitations under the License.
18		*/
19
20		#include <winpr/platform.h>
21		#include <freerdp/config.h>
22		#include <freerdp/log.h>
23
24		#include "../rfx_types.h"
25		#include "rfx_neon.h"
26
27		#include "../../core/simd.h"
28
29		#if defined(NEON_INTRINSICS_ENABLED)
30
31		#include <stdio.h>
32		#include <stdlib.h>
33		#include <string.h>
34		#include <arm_neon.h>
35		#include <winpr/sysinfo.h>
36
37		/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
38
39		static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
40		rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
41		{
42		int16x8_t quantFactors = vdupq_n_s16(factor);
43		int16x8_t* buf = (int16x8_t*)buffer;
44		int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
45
46		do
47		{
48		int16x8_t val = vld1q_s16((INT16*)buf);
49		val = vshlq_s16(val, quantFactors);
50		vst1q_s16((INT16*)buf, val);
51		buf++;
52		} while (buf < buf_end);
53		}
54
55		static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
56		{
57		WINPR_ASSERT(buffer);
58		WINPR_ASSERT(quantVals);
59
60		rfx_quantization_decode_block_NEON(&buffer[0], 1024, quantVals[8] - 1); /* HL1 */
61		rfx_quantization_decode_block_NEON(&buffer[1024], 1024, quantVals[7] - 1); /* LH1 */
62		rfx_quantization_decode_block_NEON(&buffer[2048], 1024, quantVals[9] - 1); /* HH1 */
63		rfx_quantization_decode_block_NEON(&buffer[3072], 256, quantVals[5] - 1); /* HL2 */
64		rfx_quantization_decode_block_NEON(&buffer[3328], 256, quantVals[4] - 1); /* LH2 */
65		rfx_quantization_decode_block_NEON(&buffer[3584], 256, quantVals[6] - 1); /* HH2 */
66		rfx_quantization_decode_block_NEON(&buffer[3840], 64, quantVals[2] - 1); /* HL3 */
67		rfx_quantization_decode_block_NEON(&buffer[3904], 64, quantVals[1] - 1); /* LH3 */
68		rfx_quantization_decode_block_NEON(&buffer[3968], 64, quantVals[3] - 1); /* HH3 */
69		rfx_quantization_decode_block_NEON(&buffer[4032], 64, quantVals[0] - 1); /* LL3 */
70		}
71
72		static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
73		rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
74		INT16* WINPR_RESTRICT dst, size_t subband_width)
75		{
76		INT16* l_ptr = l;
77		INT16* h_ptr = h;
78		INT16* dst_ptr = dst;
79
80		for (size_t y = 0; y < subband_width; y++)
81		{
82		/* Even coefficients */
83		for (size_t n = 0; n < subband_width; n += 8)
84		{
85		// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
86		int16x8_t l_n = vld1q_s16(l_ptr);
87		int16x8_t h_n = vld1q_s16(h_ptr);
88		int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
89
90		if (n == 0)
91		{
92		int16_t first = vgetq_lane_s16(h_n_m, 1);
93		h_n_m = vsetq_lane_s16(first, h_n_m, 0);
94		}
95
96		int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
97		tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
98		tmp_n = vshrq_n_s16(tmp_n, 1);
99		int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
100		vst1q_s16(l_ptr, dst_n);
101		l_ptr += 8;
102		h_ptr += 8;
103		}
104
105		l_ptr -= subband_width;
106		h_ptr -= subband_width;
107
108		/* Odd coefficients */
109		for (size_t n = 0; n < subband_width; n += 8)
110		{
111		// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
112		int16x8_t h_n = vld1q_s16(h_ptr);
113		h_n = vshlq_n_s16(h_n, 1);
114		int16x8x2_t dst_n;
115		dst_n.val[0] = vld1q_s16(l_ptr);
116		int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
117
118		if (n == subband_width - 8)
119		{
120		int16_t last = vgetq_lane_s16(dst_n_p, 6);
121		dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
122		}
123
124		dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
125		dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
126		dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
127		vst2q_s16(dst_ptr, dst_n);
128		l_ptr += 8;
129		h_ptr += 8;
130		dst_ptr += 16;
131		}
132		}
133		}
134
135		static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
136		rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
137		INT16* WINPR_RESTRICT dst, size_t subband_width)
138		{
139		INT16* l_ptr = l;
140		INT16* h_ptr = h;
141		INT16* dst_ptr = dst;
142		const size_t total_width = subband_width + subband_width;
143
144		/* Even coefficients */
145		for (size_t n = 0; n < subband_width; n++)
146		{
147		for (size_t x = 0; x < total_width; x += 8)
148		{
149		// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
150		int16x8_t l_n = vld1q_s16(l_ptr);
151		int16x8_t h_n = vld1q_s16(h_ptr);
152		int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
153
154		if (n == 0)
155		tmp_n = vaddq_s16(tmp_n, h_n);
156		else
157		{
158		int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
159		tmp_n = vaddq_s16(tmp_n, h_n_m);
160		}
161
162		tmp_n = vshrq_n_s16(tmp_n, 1);
163		int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
164		vst1q_s16(dst_ptr, dst_n);
165		l_ptr += 8;
166		h_ptr += 8;
167		dst_ptr += 8;
168		}
169
170		dst_ptr += total_width;
171		}
172
173		h_ptr = h;
174		dst_ptr = dst + total_width;
175
176		/* Odd coefficients */
177		for (size_t n = 0; n < subband_width; n++)
178		{
179		for (size_t x = 0; x < total_width; x += 8)
180		{
181		// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
182		int16x8_t h_n = vld1q_s16(h_ptr);
183		int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
184		h_n = vshlq_n_s16(h_n, 1);
185		int16x8_t tmp_n = dst_n_m;
186
187		if (n == subband_width - 1)
188		tmp_n = vaddq_s16(tmp_n, dst_n_m);
189		else
190		{
191		int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
192		tmp_n = vaddq_s16(tmp_n, dst_n_p);
193		}
194
195		tmp_n = vshrq_n_s16(tmp_n, 1);
196		int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
197		vst1q_s16(dst_ptr, dst_n);
198		h_ptr += 8;
199		dst_ptr += 8;
200		}
201
202		dst_ptr += total_width;
203		}
204		}
205
206		static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207		rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
208		size_t subband_width)
209		{
210		INT16 *hl, *lh, *hh, *ll;
211		INT16 *l_dst, *h_dst;
212		/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
213		*/
214		/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
215		/* The lower part L uses LL(3) and HL(0). */
216		/* The higher part H uses LH(1) and HH(2). */
217		ll = buffer + subband_width * subband_width * 3;
218		hl = buffer;
219		l_dst = idwt;
220		rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
221		lh = buffer + subband_width * subband_width;
222		hh = buffer + subband_width * subband_width * 2;
223		h_dst = idwt + subband_width * subband_width * 2;
224		rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
225		/* Inverse DWT in vertical direction, results are stored in original buffer. */
226		rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
227		}
228
229		static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
230		{
231		rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
232		rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
233		rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
234		}
235
236		static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
237		const INT16* restrict pHighBand,
238		size_t nHighStep, INT16* restrict pDstBand,
239		size_t nDstStep, size_t nLowCount,
240		size_t nHighCount, size_t nDstCount)
241		{
242		WINPR_ASSERT(pLowBand);
243		WINPR_ASSERT(pHighBand);
244		WINPR_ASSERT(pDstBand);
245
246		INT16* l_ptr = pLowBand;
247		const INT16* h_ptr = pHighBand;
248		INT16* dst_ptr = pDstBand;
249		size_t batchSize = (nLowCount + nHighCount) >> 1;
250
251		for (size_t y = 0; y < nDstCount; y++)
252		{
253		/* Even coefficients */
254		size_t n = 0;
255		for (; n < batchSize; n += 8)
256		{
257		// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
258		int16x8_t l_n = vld1q_s16(l_ptr);
259		int16x8_t h_n = vld1q_s16(h_ptr);
260		int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
261
262		if (n == 0)
263		{
264		int16_t first = vgetq_lane_s16(h_n_m, 1);
265		h_n_m = vsetq_lane_s16(first, h_n_m, 0);
266		}
267		else if (n == 24)
268		h_n = vsetq_lane_s16(0, h_n, 7);
269
270		int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
271		tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
272		tmp_n = vshrq_n_s16(tmp_n, 1);
273		int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
274		vst1q_s16(l_ptr, dst_n);
275		l_ptr += 8;
276		h_ptr += 8;
277		}
278		if (n < 32)
279		*l_ptr -= *(h_ptr - 1);
280
281		l_ptr -= batchSize;
282		h_ptr -= batchSize;
283
284		/* Odd coefficients */
285		n = 0;
286		for (; n < batchSize; n += 8)
287		{
288		// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
289		int16x8_t h_n = vld1q_s16(h_ptr);
290		h_n = vshlq_n_s16(h_n, 1);
291		int16x8x2_t dst_n;
292		dst_n.val[0] = vld1q_s16(l_ptr);
293		int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
294
295		if (n == 24)
296		h_n = vsetq_lane_s16(0, h_n, 7);
297
298		dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
299		dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
300		dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
301		vst2q_s16(dst_ptr, dst_n);
302		l_ptr += 8;
303		h_ptr += 8;
304		dst_ptr += 16;
305		}
306		if (n == 32)
307		{
308		h_ptr -= 1;
309		l_ptr += 1;
310		}
311		else
312		{
313		*dst_ptr = *l_ptr;
314		l_ptr += 1;
315		dst_ptr += 1;
316		}
317		}
318		}
319
320		static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
321		const INT16* restrict pHighBand, size_t nHighStep,
322		INT16* restrict pDstBand, size_t nDstStep,
323		size_t nLowCount, size_t nHighCount,
324		size_t nDstCount)
325		{
326		WINPR_ASSERT(pLowBand);
327		WINPR_ASSERT(pHighBand);
328		WINPR_ASSERT(pDstBand);
329
330		const INT16* l_ptr = pLowBand;
331		const INT16* h_ptr = pHighBand;
332		INT16* dst_ptr = pDstBand;
333		size_t batchSize = (nDstCount >> 3) << 3;
334		size_t forceBandSize = (nLowCount + nHighCount) >> 1;
335
336		/* Even coefficients */
337		for (size_t n = 0; n < forceBandSize; n++)
338		{
339		for (size_t x = 0; x < batchSize; x += 8)
340		{
341		// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
342		int16x8_t l_n = vld1q_s16(l_ptr);
343		int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
344		int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
345
346		if (n == 0)
347		tmp_n = vaddq_s16(tmp_n, h_n);
348		else if (n < 31)
349		{
350		int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
351		tmp_n = vaddq_s16(tmp_n, h_n_m);
352		}
353
354		tmp_n = vshrq_n_s16(tmp_n, 1);
355		int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
356		vst1q_s16(dst_ptr, dst_n);
357		l_ptr += 8;
358		h_ptr += 8;
359		dst_ptr += 8;
360		}
361
362		if (nDstCount > batchSize)
363		{
364		int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
365		int16_t tmp_n = h_n + 1;
366		if (n == 0)
367		tmp_n += h_n;
368		else if (n < 31)
369		tmp_n += *(h_ptr - nHighStep);
370		tmp_n >>= 1;
371		*dst_ptr = *l_ptr - tmp_n;
372		l_ptr += 1;
373		h_ptr += 1;
374		dst_ptr += 1;
375		}
376
377		dst_ptr += nDstStep;
378		}
379
380		if (forceBandSize < 32)
381		{
382		for (size_t x = 0; x < batchSize; x += 8)
383		{
384		int16x8_t l_n = vld1q_s16(l_ptr);
385		int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
386		int16x8_t tmp_n = vsubq_s16(l_n, h_n);
387		vst1q_s16(dst_ptr, tmp_n);
388		l_ptr += 8;
389		h_ptr += 8;
390		dst_ptr += 8;
391		}
392
393		if (nDstCount > batchSize)
394		{
395		*dst_ptr = *l_ptr - *(h_ptr - nHighStep);
396		l_ptr += 1;
397		h_ptr += 1;
398		dst_ptr += 1;
399		}
400		}
401
402		h_ptr = pHighBand;
403		dst_ptr = pDstBand + nDstStep;
404
405		/* Odd coefficients */
406		for (size_t n = 0; n < forceBandSize; n++)
407		{
408		for (size_t x = 0; x < batchSize; x += 8)
409		{
410		// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
411		int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
412		if (n == 31)
413		{
414		int16x8_t dst_n_p = vld1q_s16(l_ptr);
415		l_ptr += 8;
416		tmp_n = vaddq_s16(tmp_n, dst_n_p);
417		tmp_n = vshrq_n_s16(tmp_n, 1);
418		}
419		else
420		{
421		int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
422		tmp_n = vaddq_s16(tmp_n, dst_n_p);
423		tmp_n = vshrq_n_s16(tmp_n, 1);
424		int16x8_t h_n = vld1q_s16(h_ptr);
425		h_n = vshlq_n_s16(h_n, 1);
426		tmp_n = vaddq_s16(tmp_n, h_n);
427		}
428		vst1q_s16(dst_ptr, tmp_n);
429		h_ptr += 8;
430		dst_ptr += 8;
431		}
432
433		if (nDstCount > batchSize)
434		{
435		int16_t tmp_n = *(dst_ptr - nDstStep);
436		if (n == 31)
437		{
438		int16_t dst_n_p = *l_ptr;
439		l_ptr += 1;
440		tmp_n += dst_n_p;
441		tmp_n >>= 1;
442		}
443		else
444		{
445		int16_t dst_n_p = *(dst_ptr + nDstStep);
446		tmp_n += dst_n_p;
447		tmp_n >>= 1;
448		int16_t h_n = *h_ptr;
449		h_n <<= 1;
450		tmp_n += h_n;
451		}
452		*dst_ptr = tmp_n;
453		h_ptr += 1;
454		dst_ptr += 1;
455		}
456
457		dst_ptr += nDstStep;
458		}
459		}
460
461		static INLINE size_t prfx_get_band_l_count(size_t level)
462		{
463		return (64 >> level) + 1;
464		}
465
466		static INLINE size_t prfx_get_band_h_count(size_t level)
467		{
468		if (level == 1)
469		return (64 >> 1) - 1;
470		else
471		return (64 + (1 << (level - 1))) >> level;
472		}
473
474		static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
475		size_t level)
476		{
477		size_t nDstStepX;
478		size_t nDstStepY;
479		INT16 *HL, *LH;
480		INT16 *HH, *LL;
481		INT16 *L, *H, *LLx;
482
483		const size_t nBandL = prfx_get_band_l_count(level);
484		const size_t nBandH = prfx_get_band_h_count(level);
485		size_t offset = 0;
486
487		WINPR_ASSERT(buffer);
488		WINPR_ASSERT(temp);
489
490		HL = &buffer[offset];
491		offset += (nBandH * nBandL);
492		LH = &buffer[offset];
493		offset += (nBandL * nBandH);
494		HH = &buffer[offset];
495		offset += (nBandH * nBandH);
496		LL = &buffer[offset];
497		nDstStepX = (nBandL + nBandH);
498		nDstStepY = (nBandL + nBandH);
499		offset = 0;
500		L = &temp[offset];
501		offset += (nBandL * nDstStepX);
502		H = &temp[offset];
503		LLx = &buffer[0];
504
505		/* horizontal (LL + HL -> L) */
506		rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);
507
508		/* horizontal (LH + HH -> H) */
509		rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);
510
511		/* vertical (L + H -> LL) */
512		rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
513		nBandL + nBandH);
514		}
515
516		static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
517		{
518		WINPR_ASSERT(buffer);
519		WINPR_ASSERT(temp);
520		rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
521		rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
522		rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
523		}
524		#endif // NEON_INTRINSICS_ENABLED
525
526		void rfx_init_neon_int(RFX_CONTEXT* WINPR_RESTRICT context)
527	0	{
528		#if defined(NEON_INTRINSICS_ENABLED)
529		WLog_VRB(PRIM_TAG, "NEON optimizations");
530		PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
531		PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_NEON");
532		PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
533		context->quantization_decode = rfx_quantization_decode_NEON;
534		context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
535		context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
536		#else
537	0	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or NEON intrinsics not available");
538	0	WINPR_UNUSED(context);
539	0	#endif
540	0	}