Coverage Report

Created: 2024-09-08 06:20

/src/FreeRDP/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Optimized YCoCg<->RGB conversion operations.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
6
 *
7
 * Licensed under the Apache License, Version 2.0 (the "License");
8
 * you may not use this file except in compliance with the License.
9
 * You may obtain a copy of the License at
10
 *
11
 *     http://www.apache.org/licenses/LICENSE-2.0
12
 *
13
 * Unless required by applicable law or agreed to in writing, software
14
 * distributed under the License is distributed on an "AS IS" BASIS,
15
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 * See the License for the specific language governing permissions and
17
 * limitations under the License.
18
 */
19
20
#include <freerdp/config.h>
21
22
#include <freerdp/types.h>
23
#include <freerdp/primitives.h>
24
#include <winpr/sysinfo.h>
25
26
#include "prim_YCoCg.h"
27
28
#include "prim_internal.h"
29
#include "prim_templates.h"
30
31
#if defined(SSE2_ENABLED)
32
#include <emmintrin.h>
33
#include <tmmintrin.h>
34
35
static primitives_t* generic = NULL;
36
37
/* ------------------------------------------------------------------------- */
38
/**
 * Convert YCoCg-R pixels to a BGR-ordered ("inverted" channel order) 32bpp
 * image using SSSE3, eight pixels per inner-loop iteration.
 *
 * Rows are processed in three phases: a scalar prologue (via the generic
 * primitive) to reach a 16-byte-aligned destination, an 8-pixel SIMD main
 * loop, and a scalar epilogue for the remainder.
 *
 * @param pSrc       input YCoCg-R data, 4 bytes per pixel
 * @param srcStep    input stride in bytes
 * @param pDst       output buffer; must be at least 4-byte aligned or the
 *                   whole call is punted to the generic implementation
 * @param DstFormat  destination format, forwarded to the generic fallback
 * @param dstStep    output stride in bytes
 * @param width      pixels per row
 * @param height     number of rows
 * @param shift      chroma up-shift. NOTE(review): assumed >= 1 — shift == 0
 *                   makes dataShift negative, an undefined shift amount in
 *                   the expressions below; confirm callers never pass 0.
 * @param withAlpha  TRUE to keep source alpha, FALSE to force opaque 0xFF
 * @return PRIMITIVES_SUCCESS, or the status of a failing generic fallback
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
  const BYTE* sptr = pSrc;
  BYTE* dptr = pDst;
  /* Bytes to skip from the end of one row's pixels to the start of the next. */
  int sRowBump = srcStep - width * sizeof(UINT32);
  int dRowBump = dstStep - width * sizeof(UINT32);
  /* Shift left by "shift" and divide by two is the same as shift
   * left by "shift-1".
   */
  int dataShift = shift - 1;
  /* Mask of the bits that survive an 8-bit left shift by dataShift;
   * used because there is no 8-bit SIMD shift (see below). */
  BYTE mask = (BYTE)(0xFFU << dataShift);

  /* Let's say the data is of the form:
   * a0y0o0g0 a1y1o1g1 a2y2o2g2...
   * Apply:
   * |R|   | 1  1/2 -1/2 |   |y|
   * |G| = | 1  0    1/2 | * |o|
   * |B|   | 1 -1/2 -1/2 |   |g|
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
   */

  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
  {
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                       shift, withAlpha);
  }

  for (UINT32 h = 0; h < height; h++)
  {
    UINT32 w = width;
    BOOL onStride = 0;

    /* Get to a 16-byte destination boundary. */
    if ((ULONG_PTR)dptr & 0x0f)
    {
      pstatus_t status = 0;
      /* Pixels (4 bytes each) needed to reach the next 16-byte boundary. */
      UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;

      if (startup > width)
        startup = width;

      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
                                           1, shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += startup * sizeof(UINT32);
      dptr += startup * sizeof(UINT32);
      w -= startup;
    }

    /* Each loop handles eight pixels at a time. */
    onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;

    while (w >= 8)
    {
      __m128i R0;
      __m128i R1;
      __m128i R2;
      __m128i R3;
      __m128i R4;
      __m128i R5;
      __m128i R6;
      __m128i R7;

      if (onStride)
      {
        /* The faster path, 16-byte aligned load. */
        R0 = _mm_load_si128((const __m128i*)sptr);
        sptr += (128 / 8);
        R1 = _mm_load_si128((const __m128i*)sptr);
        sptr += (128 / 8);
      }
      else
      {
        /* Off-stride, slower LDDQU load. */
        R0 = _mm_lddqu_si128((const __m128i*)sptr);
        sptr += (128 / 8);
        R1 = _mm_lddqu_si128((const __m128i*)sptr);
        sptr += (128 / 8);
      }

      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
      /* Shuffle to pack all the like types together. */
      R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
      R3 = _mm_shuffle_epi8(R0, R2);
      R4 = _mm_shuffle_epi8(R1, R2);
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
      R5 = _mm_unpackhi_epi32(R3, R4);
      R6 = _mm_unpacklo_epi32(R3, R4);

      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Save alphas aside */
      if (withAlpha)
        R7 = _mm_unpackhi_epi64(R5, R5);
      else
        R7 = _mm_set1_epi32(0xFFFFFFFFU);

      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
      R1 = _mm_set1_epi32(0);
      R0 = _mm_unpacklo_epi8(R5, R1);
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
       * Note: this must be done before sign-conversion.
       * Note also there is no slli_epi8, so we have to use a 16-bit
       * version and then mask.
       */
      R6 = _mm_slli_epi16(R6, dataShift);
      R1 = _mm_set1_epi8(mask);
      R6 = _mm_and_si128(R6, R1);
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Expand Co's from 8-bit signed to 16-bit signed */
      R1 = _mm_unpackhi_epi8(R6, R6);
      R1 = _mm_srai_epi16(R1, 8);
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
      /* Expand Cg's form 8-bit signed to 16-bit signed */
      R2 = _mm_unpacklo_epi8(R6, R6);
      R2 = _mm_srai_epi16(R2, 8);
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
      /* Get Y - halfCg and save */
      R6 = _mm_subs_epi16(R0, R2);
      /* R = (Y-halfCg) + halfCo */
      R3 = _mm_adds_epi16(R6, R1);
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
      /* G = Y + Cg(/2) */
      R4 = _mm_adds_epi16(R0, R2);
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
      /* B = (Y-halfCg) - Co(/2) */
      R5 = _mm_subs_epi16(R6, R1);
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
      /* Repack R's & B's.  */
      R0 = _mm_packus_epi16(R3, R5);
      /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
      /* Repack G's. */
      R1 = _mm_packus_epi16(R4, R4);
      /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
      /* And add the A's. */
      R1 = _mm_unpackhi_epi64(R1, R7);
      /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
      /* Now do interleaving again. */
      R2 = _mm_unpacklo_epi8(R0, R1);
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
      R3 = _mm_unpackhi_epi8(R0, R1);
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
      R4 = _mm_unpacklo_epi16(R2, R3);
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
      R5 = _mm_unpackhi_epi16(R2, R3);
      /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
      _mm_store_si128((__m128i*)dptr, R4);
      dptr += (128 / 8);
      _mm_store_si128((__m128i*)dptr, R5);
      dptr += (128 / 8);
      w -= 8;
    }

    /* Handle any remainder pixels. */
    if (w > 0)
    {
      pstatus_t status = 0;
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
                                           shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += w * sizeof(UINT32);
      dptr += w * sizeof(UINT32);
    }

    sptr += sRowBump;
    dptr += dRowBump;
  }

  return PRIMITIVES_SUCCESS;
}
222
223
/* ------------------------------------------------------------------------- */
224
/**
 * Convert YCoCg-R pixels to an RGB-ordered 32bpp image using SSSE3, eight
 * pixels per inner-loop iteration.
 *
 * Identical to ssse3_YCoCgRToRGB_8u_AC4R_invert except for the single
 * _mm_packus_epi16 line (marked below) that swaps the R/B pack order.
 *
 * Fixes vs. the previous revision:
 *  - `w` is now UINT32 (was int), matching the _invert variant and avoiding
 *    signed/unsigned mixing with `width` and `startup`.
 *  - dropped the meaningless `const` qualifier on the scalar ULONG_PTR cast.
 *
 * @param pSrc       input YCoCg-R data, 4 bytes per pixel
 * @param srcStep    input stride in bytes
 * @param pDst       output buffer; must be at least 4-byte aligned or the
 *                   whole call is punted to the generic implementation
 * @param DstFormat  destination format, forwarded to the generic fallback
 * @param dstStep    output stride in bytes
 * @param width      pixels per row
 * @param height     number of rows
 * @param shift      chroma up-shift. NOTE(review): assumed >= 1 — shift == 0
 *                   makes dataShift negative (undefined shift); confirm callers.
 * @param withAlpha  TRUE to keep source alpha, FALSE to force opaque 0xFF
 * @return PRIMITIVES_SUCCESS, or the status of a failing generic fallback
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
  const BYTE* sptr = pSrc;
  BYTE* dptr = pDst;
  /* Bytes to skip from the end of one row's pixels to the start of the next. */
  int sRowBump = srcStep - width * sizeof(UINT32);
  int dRowBump = dstStep - width * sizeof(UINT32);
  /* Shift left by "shift" and divide by two is the same as shift
   * left by "shift-1".
   */
  int dataShift = shift - 1;
  BYTE mask = (BYTE)(0xFFU << dataShift);

  /* Let's say the data is of the form:
   * a0y0o0g0 a1y1o1g1 a2y2o2g2...
   * Apply:
   * |R|   | 1  1/2 -1/2 |   |y|
   * |G| = | 1  0    1/2 | * |o|
   * |B|   | 1 -1/2 -1/2 |   |g|
   * where Y is 8-bit unsigned and o & g are 8-bit signed.
   */

  if ((width < 8) || (ULONG_PTR)dptr & 0x03)
  {
    /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                       shift, withAlpha);
  }

  for (UINT32 h = 0; h < height; h++)
  {
    UINT32 w = width; /* UINT32 to match the _invert variant; was int */
    BOOL onStride = 0;

    /* Get to a 16-byte destination boundary. */
    if ((ULONG_PTR)dptr & 0x0f)
    {
      pstatus_t status = 0;
      UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;

      if (startup > width)
        startup = width;

      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
                                           1, shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += startup * sizeof(UINT32);
      dptr += startup * sizeof(UINT32);
      w -= startup;
    }

    /* Each loop handles eight pixels at a time. */
    onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;

    while (w >= 8)
    {
      __m128i R0;
      __m128i R1;
      __m128i R2;
      __m128i R3;
      __m128i R4;
      __m128i R5;
      __m128i R6;
      __m128i R7;

      if (onStride)
      {
        /* The faster path, 16-byte aligned load. */
        R0 = _mm_load_si128((const __m128i*)sptr);
        sptr += (128 / 8);
        R1 = _mm_load_si128((const __m128i*)sptr);
        sptr += (128 / 8);
      }
      else
      {
        /* Off-stride, slower LDDQU load. */
        R0 = _mm_lddqu_si128((const __m128i*)sptr);
        sptr += (128 / 8);
        R1 = _mm_lddqu_si128((const __m128i*)sptr);
        sptr += (128 / 8);
      }

      /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
      /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
      /* Shuffle to pack all the like types together. */
      R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
      R3 = _mm_shuffle_epi8(R0, R2);
      R4 = _mm_shuffle_epi8(R1, R2);
      /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
      /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
      R5 = _mm_unpackhi_epi32(R3, R4);
      R6 = _mm_unpacklo_epi32(R3, R4);

      /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
      /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Save alphas aside */
      if (withAlpha)
        R7 = _mm_unpackhi_epi64(R5, R5);
      else
        R7 = _mm_set1_epi32(0xFFFFFFFFU);

      /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
      /* Expand Y's from 8-bit unsigned to 16-bit signed. */
      R1 = _mm_set1_epi32(0);
      R0 = _mm_unpacklo_epi8(R5, R1);
      /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
      /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
       * Note: this must be done before sign-conversion.
       * Note also there is no slli_epi8, so we have to use a 16-bit
       * version and then mask.
       */
      R6 = _mm_slli_epi16(R6, dataShift);
      R1 = _mm_set1_epi8(mask);
      R6 = _mm_and_si128(R6, R1);
      /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
      /* Expand Co's from 8-bit signed to 16-bit signed */
      R1 = _mm_unpackhi_epi8(R6, R6);
      R1 = _mm_srai_epi16(R1, 8);
      /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
      /* Expand Cg's form 8-bit signed to 16-bit signed */
      R2 = _mm_unpacklo_epi8(R6, R6);
      R2 = _mm_srai_epi16(R2, 8);
      /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
      /* Get Y - halfCg and save */
      R6 = _mm_subs_epi16(R0, R2);
      /* R = (Y-halfCg) + halfCo */
      R3 = _mm_adds_epi16(R6, R1);
      /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
      /* G = Y + Cg(/2) */
      R4 = _mm_adds_epi16(R0, R2);
      /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
      /* B = (Y-halfCg) - Co(/2) */
      R5 = _mm_subs_epi16(R6, R1);
      /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
      /* Repack R's & B's.  */
      /* This line is the only diff between inverted and non-inverted.
       * Unfortunately, it would be expensive to check "inverted"
       * every time through this loop.
       */
      R0 = _mm_packus_epi16(R5, R3);
      /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
      /* Repack G's. */
      R1 = _mm_packus_epi16(R4, R4);
      /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
      /* And add the A's. */
      R1 = _mm_unpackhi_epi64(R1, R7);
      /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
      /* Now do interleaving again. */
      R2 = _mm_unpacklo_epi8(R0, R1);
      /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
      R3 = _mm_unpackhi_epi8(R0, R1);
      /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
      R4 = _mm_unpacklo_epi16(R2, R3);
      /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
      R5 = _mm_unpackhi_epi16(R2, R3);
      /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
      _mm_store_si128((__m128i*)dptr, R4);
      dptr += (128 / 8);
      _mm_store_si128((__m128i*)dptr, R5);
      dptr += (128 / 8);
      w -= 8;
    }

    /* Handle any remainder pixels. */
    if (w > 0)
    {
      pstatus_t status = 0;
      status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
                                           shift, withAlpha);

      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr += w * sizeof(UINT32);
      dptr += w * sizeof(UINT32);
    }

    sptr += sRowBump;
    dptr += dRowBump;
  }

  return PRIMITIVES_SUCCESS;
}
412
413
/* ------------------------------------------------------------------------- */
414
/**
 * Entry point for SSSE3 YCoCg-R -> RGB conversion: dispatch on the
 * destination layout.  BGR-ordered formats take the channel-inverting
 * kernel, RGB-ordered ones the straight kernel, and anything else falls
 * back to the generic implementation.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
  if ((DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32))
    return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
                                            height, shift, withAlpha);

  if ((DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32))
    return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
                                               height, shift, withAlpha);

  return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                     shift, withAlpha);
}
436
437
#endif
438
439
/* ------------------------------------------------------------------------- */
440
/* Register the SSSE3-accelerated YCoCg conversion when the build and the
 * running CPU support it; otherwise the generic entries stay in place. */
void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE2_ENABLED)
  generic = primitives_get_generic();
  primitives_init_YCoCg(prims);

  /* The kernel uses both SSE3 (lddqu) and SSSE3 (pshufb) instructions,
   * so require both CPU features before installing it. */
  const BOOL haveSsse3 = IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
                         IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE);

  if (haveSsse3)
  {
    WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
    prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
  }
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
  WINPR_UNUSED(prims);
#endif
}