Coverage Report

Created: 2023-09-25 06:56

/src/FreeRDP/libfreerdp/primitives/prim_YCoCg_opt.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Optimized YCoCg<->RGB conversion operations.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
6
 *
7
 * Licensed under the Apache License, Version 2.0 (the "License");
8
 * you may not use this file except in compliance with the License.
9
 * You may obtain a copy of the License at
10
 *
11
 *     http://www.apache.org/licenses/LICENSE-2.0
12
 *
13
 * Unless required by applicable law or agreed to in writing, software
14
 * distributed under the License is distributed on an "AS IS" BASIS,
15
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 * See the License for the specific language governing permissions and
17
 * limitations under the License.
18
 */
19
20
#include <freerdp/config.h>
21
22
#include <freerdp/types.h>
23
#include <freerdp/primitives.h>
24
#include <winpr/sysinfo.h>
25
26
#ifdef WITH_SSE2
27
#include <emmintrin.h>
28
#include <tmmintrin.h>
29
#elif defined(WITH_NEON)
30
#include <arm_neon.h>
31
#endif /* WITH_SSE2 else WITH_NEON */
32
33
#include "prim_internal.h"
34
#include "prim_templates.h"
35
36
static primitives_t* generic = NULL;
37
38
#ifdef WITH_SSE2
39
/* ------------------------------------------------------------------------- */
40
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
41
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
42
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
43
                                                  UINT8 shift, BOOL withAlpha)
44
0
{
45
0
  const BYTE* sptr = pSrc;
46
0
  BYTE* dptr = (BYTE*)pDst;
47
0
  int sRowBump = srcStep - width * sizeof(UINT32);
48
0
  int dRowBump = dstStep - width * sizeof(UINT32);
49
0
  UINT32 h;
50
  /* Shift left by "shift" and divide by two is the same as shift
51
   * left by "shift-1".
52
   */
53
0
  int dataShift = shift - 1;
54
0
  BYTE mask = (BYTE)(0xFFU << dataShift);
55
56
  /* Let's say the data is of the form:
57
   * y0y0o0g0 a1y1o1g1 a2y2o2g2...
58
   * Apply:
59
   * |R|   | 1  1/2 -1/2 |   |y|
60
   * |G| = | 1  0    1/2 | * |o|
61
   * |B|   | 1 -1/2 -1/2 |   |g|
62
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
63
   */
64
65
0
  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
66
0
  {
67
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
68
0
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
69
0
                                       shift, withAlpha);
70
0
  }
71
72
0
  for (h = 0; h < height; h++)
73
0
  {
74
0
    UINT32 w = width;
75
0
    BOOL onStride;
76
77
    /* Get to a 16-byte destination boundary. */
78
0
    if ((ULONG_PTR)dptr & 0x0f)
79
0
    {
80
0
      pstatus_t status;
81
0
      UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
82
83
0
      if (startup > width)
84
0
        startup = width;
85
86
0
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
87
0
                                           1, shift, withAlpha);
88
89
0
      if (status != PRIMITIVES_SUCCESS)
90
0
        return status;
91
92
0
      sptr += startup * sizeof(UINT32);
93
0
      dptr += startup * sizeof(UINT32);
94
0
      w -= startup;
95
0
    }
96
97
    /* Each loop handles eight pixels at a time. */
98
0
    onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
99
100
0
    while (w >= 8)
101
0
    {
102
0
      __m128i R0, R1, R2, R3, R4, R5, R6, R7;
103
104
0
      if (onStride)
105
0
      {
106
        /* The faster path, 16-byte aligned load. */
107
0
        R0 = _mm_load_si128((const __m128i*)sptr);
108
0
        sptr += (128 / 8);
109
0
        R1 = _mm_load_si128((const __m128i*)sptr);
110
0
        sptr += (128 / 8);
111
0
      }
112
0
      else
113
0
      {
114
        /* Off-stride, slower LDDQU load. */
115
0
        R0 = _mm_lddqu_si128((const __m128i*)sptr);
116
0
        sptr += (128 / 8);
117
0
        R1 = _mm_lddqu_si128((const __m128i*)sptr);
118
0
        sptr += (128 / 8);
119
0
      }
120
121
      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
122
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
123
      /* Shuffle to pack all the like types together. */
124
0
      R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
125
0
      R3 = _mm_shuffle_epi8(R0, R2);
126
0
      R4 = _mm_shuffle_epi8(R1, R2);
127
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
128
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
129
0
      R5 = _mm_unpackhi_epi32(R3, R4);
130
0
      R6 = _mm_unpacklo_epi32(R3, R4);
131
132
      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
133
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
134
      /* Save alphas aside */
135
0
      if (withAlpha)
136
0
        R7 = _mm_unpackhi_epi64(R5, R5);
137
0
      else
138
0
        R7 = _mm_set1_epi32(0xFFFFFFFFU);
139
140
      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
141
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
142
0
      R1 = _mm_set1_epi32(0);
143
0
      R0 = _mm_unpacklo_epi8(R5, R1);
144
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
145
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
146
       * Note: this must be done before sign-conversion.
147
       * Note also there is no slli_epi8, so we have to use a 16-bit
148
       * version and then mask.
149
       */
150
0
      R6 = _mm_slli_epi16(R6, dataShift);
151
0
      R1 = _mm_set1_epi8(mask);
152
0
      R6 = _mm_and_si128(R6, R1);
153
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
154
      /* Expand Co's from 8-bit signed to 16-bit signed */
155
0
      R1 = _mm_unpackhi_epi8(R6, R6);
156
0
      R1 = _mm_srai_epi16(R1, 8);
157
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
158
      /* Expand Cg's from 8-bit signed to 16-bit signed */
159
0
      R2 = _mm_unpacklo_epi8(R6, R6);
160
0
      R2 = _mm_srai_epi16(R2, 8);
161
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
162
      /* Get Y - halfCg and save */
163
0
      R6 = _mm_subs_epi16(R0, R2);
164
      /* R = (Y-halfCg) + halfCo */
165
0
      R3 = _mm_adds_epi16(R6, R1);
166
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
167
      /* G = Y + Cg(/2) */
168
0
      R4 = _mm_adds_epi16(R0, R2);
169
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
170
      /* B = (Y-halfCg) - Co(/2) */
171
0
      R5 = _mm_subs_epi16(R6, R1);
172
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
173
      /* Repack R's & B's.  */
174
0
      R0 = _mm_packus_epi16(R3, R5);
175
      /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
176
      /* Repack G's. */
177
0
      R1 = _mm_packus_epi16(R4, R4);
178
      /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
179
      /* And add the A's. */
180
0
      R1 = _mm_unpackhi_epi64(R1, R7);
181
      /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
182
      /* Now do interleaving again. */
183
0
      R2 = _mm_unpacklo_epi8(R0, R1);
184
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
185
0
      R3 = _mm_unpackhi_epi8(R0, R1);
186
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
187
0
      R4 = _mm_unpacklo_epi16(R2, R3);
188
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
189
0
      R5 = _mm_unpackhi_epi16(R2, R3);
190
      /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
191
0
      _mm_store_si128((__m128i*)dptr, R4);
192
0
      dptr += (128 / 8);
193
0
      _mm_store_si128((__m128i*)dptr, R5);
194
0
      dptr += (128 / 8);
195
0
      w -= 8;
196
0
    }
197
198
    /* Handle any remainder pixels. */
199
0
    if (w > 0)
200
0
    {
201
0
      pstatus_t status;
202
0
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
203
0
                                           shift, withAlpha);
204
205
0
      if (status != PRIMITIVES_SUCCESS)
206
0
        return status;
207
208
0
      sptr += w * sizeof(UINT32);
209
0
      dptr += w * sizeof(UINT32);
210
0
    }
211
212
0
    sptr += sRowBump;
213
0
    dptr += dRowBump;
214
0
  }
215
216
0
  return PRIMITIVES_SUCCESS;
217
0
}
218
219
/* ------------------------------------------------------------------------- */
220
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
221
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
222
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
223
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
224
0
{
225
0
  const BYTE* sptr = pSrc;
226
0
  BYTE* dptr = (BYTE*)pDst;
227
0
  int sRowBump = srcStep - width * sizeof(UINT32);
228
0
  int dRowBump = dstStep - width * sizeof(UINT32);
229
0
  UINT32 h;
230
  /* Shift left by "shift" and divide by two is the same as shift
231
   * left by "shift-1".
232
   */
233
0
  int dataShift = shift - 1;
234
0
  BYTE mask = (BYTE)(0xFFU << dataShift);
235
236
  /* Let's say the data is of the form:
237
   * a0y0o0g0 a1y1o1g1 a2y2o2g2...
238
   * Apply:
239
   * |R|   | 1  1/2 -1/2 |   |y|
240
   * |G| = | 1  0    1/2 | * |o|
241
   * |B|   | 1 -1/2 -1/2 |   |g|
242
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
243
   */
244
245
0
  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
246
0
  {
247
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
248
0
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
249
0
                                       shift, withAlpha);
250
0
  }
251
252
0
  for (h = 0; h < height; h++)
253
0
  {
254
0
    int w = width;
255
0
    BOOL onStride;
256
257
    /* Get to a 16-byte destination boundary. */
258
0
    if ((ULONG_PTR)dptr & 0x0f)
259
0
    {
260
0
      pstatus_t status;
261
0
      UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
262
263
0
      if (startup > width)
264
0
        startup = width;
265
266
0
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
267
0
                                           1, shift, withAlpha);
268
269
0
      if (status != PRIMITIVES_SUCCESS)
270
0
        return status;
271
272
0
      sptr += startup * sizeof(UINT32);
273
0
      dptr += startup * sizeof(UINT32);
274
0
      w -= startup;
275
0
    }
276
277
    /* Each loop handles eight pixels at a time. */
278
0
    onStride = (((const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
279
280
0
    while (w >= 8)
281
0
    {
282
0
      __m128i R0, R1, R2, R3, R4, R5, R6, R7;
283
284
0
      if (onStride)
285
0
      {
286
        /* The faster path, 16-byte aligned load. */
287
0
        R0 = _mm_load_si128((const __m128i*)sptr);
288
0
        sptr += (128 / 8);
289
0
        R1 = _mm_load_si128((const __m128i*)sptr);
290
0
        sptr += (128 / 8);
291
0
      }
292
0
      else
293
0
      {
294
        /* Off-stride, slower LDDQU load. */
295
0
        R0 = _mm_lddqu_si128((const __m128i*)sptr);
296
0
        sptr += (128 / 8);
297
0
        R1 = _mm_lddqu_si128((const __m128i*)sptr);
298
0
        sptr += (128 / 8);
299
0
      }
300
301
      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
302
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
303
      /* Shuffle to pack all the like types together. */
304
0
      R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
305
0
      R3 = _mm_shuffle_epi8(R0, R2);
306
0
      R4 = _mm_shuffle_epi8(R1, R2);
307
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
308
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
309
0
      R5 = _mm_unpackhi_epi32(R3, R4);
310
0
      R6 = _mm_unpacklo_epi32(R3, R4);
311
312
      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
313
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
314
      /* Save alphas aside */
315
0
      if (withAlpha)
316
0
        R7 = _mm_unpackhi_epi64(R5, R5);
317
0
      else
318
0
        R7 = _mm_set1_epi32(0xFFFFFFFFU);
319
320
      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
321
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
322
0
      R1 = _mm_set1_epi32(0);
323
0
      R0 = _mm_unpacklo_epi8(R5, R1);
324
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
325
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
326
       * Note: this must be done before sign-conversion.
327
       * Note also there is no slli_epi8, so we have to use a 16-bit
328
       * version and then mask.
329
       */
330
0
      R6 = _mm_slli_epi16(R6, dataShift);
331
0
      R1 = _mm_set1_epi8(mask);
332
0
      R6 = _mm_and_si128(R6, R1);
333
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
334
      /* Expand Co's from 8-bit signed to 16-bit signed */
335
0
      R1 = _mm_unpackhi_epi8(R6, R6);
336
0
      R1 = _mm_srai_epi16(R1, 8);
337
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
338
      /* Expand Cg's from 8-bit signed to 16-bit signed */
339
0
      R2 = _mm_unpacklo_epi8(R6, R6);
340
0
      R2 = _mm_srai_epi16(R2, 8);
341
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
342
      /* Get Y - halfCg and save */
343
0
      R6 = _mm_subs_epi16(R0, R2);
344
      /* R = (Y-halfCg) + halfCo */
345
0
      R3 = _mm_adds_epi16(R6, R1);
346
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
347
      /* G = Y + Cg(/2) */
348
0
      R4 = _mm_adds_epi16(R0, R2);
349
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
350
      /* B = (Y-halfCg) - Co(/2) */
351
0
      R5 = _mm_subs_epi16(R6, R1);
352
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
353
      /* Repack R's & B's.  */
354
      /* This line is the only diff between inverted and non-inverted.
355
       * Unfortunately, it would be expensive to check "inverted"
356
       * every time through this loop.
357
       */
358
0
      R0 = _mm_packus_epi16(R5, R3);
359
      /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
360
      /* Repack G's. */
361
0
      R1 = _mm_packus_epi16(R4, R4);
362
      /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
363
      /* And add the A's. */
364
0
      R1 = _mm_unpackhi_epi64(R1, R7);
365
      /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
366
      /* Now do interleaving again. */
367
0
      R2 = _mm_unpacklo_epi8(R0, R1);
368
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
369
0
      R3 = _mm_unpackhi_epi8(R0, R1);
370
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
371
0
      R4 = _mm_unpacklo_epi16(R2, R3);
372
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
373
0
      R5 = _mm_unpackhi_epi16(R2, R3);
374
      /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
375
0
      _mm_store_si128((__m128i*)dptr, R4);
376
0
      dptr += (128 / 8);
377
0
      _mm_store_si128((__m128i*)dptr, R5);
378
0
      dptr += (128 / 8);
379
0
      w -= 8;
380
0
    }
381
382
    /* Handle any remainder pixels. */
383
0
    if (w > 0)
384
0
    {
385
0
      pstatus_t status;
386
0
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
387
0
                                           shift, withAlpha);
388
389
0
      if (status != PRIMITIVES_SUCCESS)
390
0
        return status;
391
392
0
      sptr += w * sizeof(UINT32);
393
0
      dptr += w * sizeof(UINT32);
394
0
    }
395
396
0
    sptr += sRowBump;
397
0
    dptr += dRowBump;
398
0
  }
399
400
0
  return PRIMITIVES_SUCCESS;
401
0
}
402
#endif /* WITH_SSE2 */
403
404
#ifdef WITH_SSE2
405
/* ------------------------------------------------------------------------- */
406
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
407
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
408
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
409
                                           BOOL withAlpha)
410
0
{
411
0
  switch (DstFormat)
412
0
  {
413
0
    case PIXEL_FORMAT_BGRX32:
414
0
    case PIXEL_FORMAT_BGRA32:
415
0
      return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
416
0
                                              height, shift, withAlpha);
417
418
0
    case PIXEL_FORMAT_RGBX32:
419
0
    case PIXEL_FORMAT_RGBA32:
420
0
      return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep,
421
0
                                                 width, height, shift, withAlpha);
422
423
0
    default:
424
0
      return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
425
0
                                         height, shift, withAlpha);
426
0
  }
427
0
}
428
#elif defined(WITH_NEON)
429
430
static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
431
                                      BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
432
                                      UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
433
                                      BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
434
{
435
  UINT32 y;
436
  BYTE* dptr = pDst;
437
  const BYTE* sptr = pSrc;
438
  const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
439
  const int8_t cll = shift - 1; /* -1 builds in the /2's */
440
  const UINT32 srcPad = srcStep - (width * 4);
441
  const UINT32 dstPad = dstStep - (width * formatSize);
442
  const UINT32 pad = width % 8;
443
  const uint8x8_t aVal = vdup_n_u8(0xFF);
444
  const int8x8_t cllv = vdup_n_s8(cll);
445
446
  for (y = 0; y < height; y++)
447
  {
448
    UINT32 x;
449
450
    for (x = 0; x < width - pad; x += 8)
451
    {
452
      /* Note: shifts must be done before sign-conversion. */
453
      const uint8x8x4_t raw = vld4_u8(sptr);
454
      const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
455
      const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
456
      const int16x8_t Cg = vmovl_s8(CgRaw);
457
      const int16x8_t Co = vmovl_s8(CoRaw);
458
      const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
459
      const int16x8_t T = vsubq_s16(Y, Cg);
460
      const int16x8_t R = vaddq_s16(T, Co);
461
      const int16x8_t G = vaddq_s16(Y, Cg);
462
      const int16x8_t B = vsubq_s16(T, Co);
463
      uint8x8x4_t bgrx;
464
      bgrx.val[bPos] = vqmovun_s16(B);
465
      bgrx.val[gPos] = vqmovun_s16(G);
466
      bgrx.val[rPos] = vqmovun_s16(R);
467
468
      if (alpha)
469
        bgrx.val[aPos] = raw.val[3];
470
      else
471
        bgrx.val[aPos] = aVal;
472
473
      vst4_u8(dptr, bgrx);
474
      sptr += sizeof(raw);
475
      dptr += sizeof(bgrx);
476
    }
477
478
    for (x = 0; x < pad; x++)
479
    {
480
      /* Note: shifts must be done before sign-conversion. */
481
      const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
482
      const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
483
      const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
484
      const INT16 T = Y - Cg;
485
      const INT16 R = T + Co;
486
      const INT16 G = Y + Cg;
487
      const INT16 B = T - Co;
488
      BYTE bgra[4];
489
      bgra[bPos] = CLIP(B);
490
      bgra[gPos] = CLIP(G);
491
      bgra[rPos] = CLIP(R);
492
      bgra[aPos] = *sptr++;
493
494
      if (!alpha)
495
        bgra[aPos] = 0xFF;
496
497
      *dptr++ = bgra[0];
498
      *dptr++ = bgra[1];
499
      *dptr++ = bgra[2];
500
      *dptr++ = bgra[3];
501
    }
502
503
    sptr += srcPad;
504
    dptr += dstPad;
505
  }
506
507
  return PRIMITIVES_SUCCESS;
508
}
509
510
static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
511
                                         BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
512
                                         UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
513
{
514
  switch (DstFormat)
515
  {
516
    case PIXEL_FORMAT_BGRA32:
517
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
518
                                  shift, 2, 1, 0, 3, withAlpha);
519
520
    case PIXEL_FORMAT_BGRX32:
521
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
522
                                  shift, 2, 1, 0, 3, withAlpha);
523
524
    case PIXEL_FORMAT_RGBA32:
525
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
526
                                  shift, 0, 1, 2, 3, withAlpha);
527
528
    case PIXEL_FORMAT_RGBX32:
529
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
530
                                  shift, 0, 1, 2, 3, withAlpha);
531
532
    case PIXEL_FORMAT_ARGB32:
533
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
534
                                  shift, 1, 2, 3, 0, withAlpha);
535
536
    case PIXEL_FORMAT_XRGB32:
537
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
538
                                  shift, 1, 2, 3, 0, withAlpha);
539
540
    case PIXEL_FORMAT_ABGR32:
541
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
542
                                  shift, 3, 2, 1, 0, withAlpha);
543
544
    case PIXEL_FORMAT_XBGR32:
545
      return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
546
                                  shift, 3, 2, 1, 0, withAlpha);
547
548
    default:
549
      return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
550
                                         height, shift, withAlpha);
551
  }
552
}
553
#endif /* WITH_SSE2 */
554
555
/* ------------------------------------------------------------------------- */
556
void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
557
0
{
558
0
  generic = primitives_get_generic();
559
0
  primitives_init_YCoCg(prims);
560
  /* While IPP acknowledges the existence of YCoCg-R, it doesn't currently
561
   * include any routines to work with it, especially with variable shift
562
   * width.
563
   */
564
0
#if defined(WITH_SSE2)
565
566
0
  if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
567
0
      IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
568
0
  {
569
0
    prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
570
0
  }
571
572
#elif defined(WITH_NEON)
573
574
  if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
575
  {
576
    prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
577
  }
578
579
#endif /* WITH_SSE2 */
580
0
}