/src/FreeRDP/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Optimized YCoCg<->RGB conversion operations. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2014 Hewlett-Packard Development Company, L.P. |
6 | | * |
7 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | * you may not use this file except in compliance with the License. |
9 | | * You may obtain a copy of the License at |
10 | | * |
11 | | * http://www.apache.org/licenses/LICENSE-2.0 |
12 | | * |
13 | | * Unless required by applicable law or agreed to in writing, software |
14 | | * distributed under the License is distributed on an "AS IS" BASIS, |
15 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | * See the License for the specific language governing permissions and |
17 | | * limitations under the License. |
18 | | */ |
19 | | |
20 | | #include <freerdp/config.h> |
21 | | |
22 | | #include <freerdp/types.h> |
23 | | #include <freerdp/primitives.h> |
24 | | #include <winpr/sysinfo.h> |
25 | | |
26 | | #include "prim_YCoCg.h" |
27 | | |
28 | | #include "prim_internal.h" |
29 | | #include "prim_templates.h" |
30 | | |
31 | | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
32 | | #include <emmintrin.h> |
33 | | #include <tmmintrin.h> |
34 | | |
/* Generic (non-SIMD) primitives table, cached by
 * primitives_init_YCoCg_ssse3_int(). Used as the fallback for small widths,
 * insufficiently aligned destinations, per-row remainder pixels, and
 * destination formats the SSSE3 paths do not handle. */
static primitives_t* generic = NULL;
36 | | |
/* ------------------------------------------------------------------------- */
/* YCoCg-R -> RGB conversion (SSSE3), storing pixels with R and B swapped
 * relative to the _no_invert variant (used for PIXEL_FORMAT_BGRX32 /
 * PIXEL_FORMAT_BGRA32 destinations).
 *
 * Processes 8 pixels (two 128-bit loads/stores) per inner-loop iteration;
 * widths below 8, destinations not 4-byte aligned, and per-row remainders
 * are delegated to the generic (non-SIMD) implementation.
 *
 * pSrc/srcStep   - source YCoCg pixels (4 bytes/pixel) and row stride in bytes
 * pDst/dstStep   - destination buffer (4 bytes/pixel) and row stride in bytes
 * DstFormat      - forwarded to the generic fallback only
 * shift          - chroma upscale shift.
 *                  NOTE(review): dataShift = shift - 1 assumes shift >= 1;
 *                  shift == 0 would yield a negative shift count — confirm
 *                  callers never pass 0.
 * withAlpha      - TRUE keeps source alpha, FALSE forces opaque 0xFF
 *
 * Returns PRIMITIVES_SUCCESS, or the status of the generic fallback. */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
	const BYTE* sptr = pSrc;
	BYTE* dptr = pDst;

	WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
	WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
	/* Bytes of row padding to skip after processing width pixels. */
	const size_t sRowBump = srcStep - width * sizeof(UINT32);
	const size_t dRowBump = dstStep - width * sizeof(UINT32);
	/* Shift left by "shift" and divide by two is the same as shift
	 * left by "shift-1".
	 */
	int dataShift = shift - 1;
	BYTE mask = (BYTE)(0xFFU << dataShift);

	/* Let's say the data is of the form:
	 * a0y0o0g0 a1y1o1g1 a2y2o2g2...
	 * Apply:
	 * |R|   | 1  1/2 -1/2 |   |y|
	 * |G| = | 1  0    1/2 | * |o|
	 * |B|   | 1 -1/2 -1/2 |   |g|
	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
	 */

	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
	{
		/* Too small, or we'll never hit a 16-byte boundary. Punt. */
		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
		                                   width, height, shift, withAlpha);
	}

	for (UINT32 h = 0; h < height; h++)
	{
		UINT32 w = width;

		while (w >= 8)
		{
			__m128i R0;
			__m128i R1;
			__m128i R2;
			__m128i R3;
			__m128i R4;
			__m128i R5;
			__m128i R6;
			__m128i R7;

			R0 = LOAD_SI128(sptr);
			sptr += (128 / 8);
			R1 = LOAD_SI128(sptr);
			sptr += (128 / 8);

			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
			/* Shuffle to pack all the like types together. */
			R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
			R3 = _mm_shuffle_epi8(R0, R2);
			R4 = _mm_shuffle_epi8(R1, R2);
			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
			R5 = _mm_unpackhi_epi32(R3, R4);
			R6 = _mm_unpacklo_epi32(R3, R4);

			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Save alphas aside */
			if (withAlpha)
				R7 = _mm_unpackhi_epi64(R5, R5);
			else
				R7 = mm_set1_epu32(0xFFFFFFFFU);

			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
			R1 = mm_set1_epu32(0);
			R0 = _mm_unpacklo_epi8(R5, R1);
			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
			/* Shift Co's and Cg's by (shift-1). -1 covers division by two.
			 * Note: this must be done before sign-conversion.
			 * Note also there is no slli_epi8, so we have to use a 16-bit
			 * version and then mask.
			 */
			R6 = _mm_slli_epi16(R6, dataShift);
			R1 = mm_set1_epu8(mask);
			R6 = _mm_and_si128(R6, R1);
			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Expand Co's from 8-bit signed to 16-bit signed */
			R1 = _mm_unpackhi_epi8(R6, R6);
			R1 = _mm_srai_epi16(R1, 8);
			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
			/* Expand Cg's from 8-bit signed to 16-bit signed */
			R2 = _mm_unpacklo_epi8(R6, R6);
			R2 = _mm_srai_epi16(R2, 8);
			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
			/* Get Y - halfCg and save */
			R6 = _mm_subs_epi16(R0, R2);
			/* R = (Y-halfCg) + halfCo */
			R3 = _mm_adds_epi16(R6, R1);
			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
			/* G = Y + Cg(/2) */
			R4 = _mm_adds_epi16(R0, R2);
			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
			/* B = (Y-halfCg) - Co(/2) */
			R5 = _mm_subs_epi16(R6, R1);
			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
			/* Repack R's & B's. */
			R0 = _mm_packus_epi16(R3, R5);
			/* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
			/* Repack G's. */
			R1 = _mm_packus_epi16(R4, R4);
			/* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
			/* And add the A's. */
			R1 = _mm_unpackhi_epi64(R1, R7);
			/* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
			/* Now do interleaving again. */
			R2 = _mm_unpacklo_epi8(R0, R1);
			/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
			R3 = _mm_unpackhi_epi8(R0, R1);
			/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
			R4 = _mm_unpacklo_epi16(R2, R3);
			/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
			R5 = _mm_unpackhi_epi16(R2, R3);
			/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
			STORE_SI128(dptr, R4);
			dptr += (128 / 8);
			STORE_SI128(dptr, R5);
			dptr += (128 / 8);
			w -= 8;
		}

		/* Handle any remainder pixels. */
		if (w > 0)
		{
			pstatus_t status = 0;
			status = generic->YCoCgToRGB_8u_AC4R(
			    sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr += w * sizeof(UINT32);
			dptr += w * sizeof(UINT32);
		}

		sptr += sRowBump;
		dptr += dRowBump;
	}

	return PRIMITIVES_SUCCESS;
}
191 | | |
/* ------------------------------------------------------------------------- */
/* YCoCg-R -> RGB conversion (SSSE3), natural channel order (used for
 * PIXEL_FORMAT_RGBX32 / PIXEL_FORMAT_RGBA32 destinations). Identical to
 * the _invert variant except for the single R/B repacking line below.
 *
 * Processes 8 pixels (two 128-bit loads/stores) per inner-loop iteration;
 * widths below 8, destinations not 4-byte aligned, and per-row remainders
 * are delegated to the generic (non-SIMD) implementation.
 *
 * NOTE(review): unlike the _invert variant this function has no
 * WINPR_ASSERT checks that srcStep/dstStep cover width pixels — confirm
 * whether that asymmetry is intentional.
 * NOTE(review): dataShift = shift - 1 assumes shift >= 1; shift == 0 would
 * yield a negative shift count — confirm callers never pass 0. */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
	const BYTE* sptr = pSrc;
	BYTE* dptr = pDst;
	/* Bytes of row padding to skip after processing width pixels. */
	size_t sRowBump = srcStep - width * sizeof(UINT32);
	size_t dRowBump = dstStep - width * sizeof(UINT32);
	/* Shift left by "shift" and divide by two is the same as shift
	 * left by "shift-1".
	 */
	int dataShift = shift - 1;
	BYTE mask = (BYTE)(0xFFU << dataShift);

	/* Let's say the data is of the form:
	 * a0y0o0g0 a1y1o1g1 a2y2o2g2...
	 * Apply:
	 * |R|   | 1  1/2 -1/2 |   |y|
	 * |G| = | 1  0    1/2 | * |o|
	 * |B|   | 1 -1/2 -1/2 |   |g|
	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
	 */

	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
	{
		/* Too small, or we'll never hit a 16-byte boundary. Punt. */
		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
		                                   width, height, shift, withAlpha);
	}

	for (UINT32 h = 0; h < height; h++)
	{
		UINT32 w = width;

		while (w >= 8)
		{
			__m128i R7;

			/* The faster path, 16-byte aligned load. */
			__m128i R0 = LOAD_SI128(sptr);
			sptr += (128 / 8);
			__m128i R1 = LOAD_SI128(sptr);
			sptr += (128 / 8);

			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
			/* Shuffle to pack all the like types together. */
			__m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
			__m128i R3 = _mm_shuffle_epi8(R0, R2);
			__m128i R4 = _mm_shuffle_epi8(R1, R2);
			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
			__m128i R5 = _mm_unpackhi_epi32(R3, R4);
			__m128i R6 = _mm_unpacklo_epi32(R3, R4);

			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Save alphas aside */
			if (withAlpha)
				R7 = _mm_unpackhi_epi64(R5, R5);
			else
				R7 = mm_set1_epu32(0xFFFFFFFFU);

			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
			R1 = mm_set1_epu32(0);
			R0 = _mm_unpacklo_epi8(R5, R1);
			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
			/* Shift Co's and Cg's by (shift-1). -1 covers division by two.
			 * Note: this must be done before sign-conversion.
			 * Note also there is no slli_epi8, so we have to use a 16-bit
			 * version and then mask.
			 */
			R6 = _mm_slli_epi16(R6, dataShift);
			R1 = mm_set1_epu8(mask);
			R6 = _mm_and_si128(R6, R1);
			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Expand Co's from 8-bit signed to 16-bit signed */
			R1 = _mm_unpackhi_epi8(R6, R6);
			R1 = _mm_srai_epi16(R1, 8);
			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
			/* Expand Cg's from 8-bit signed to 16-bit signed */
			R2 = _mm_unpacklo_epi8(R6, R6);
			R2 = _mm_srai_epi16(R2, 8);
			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
			/* Get Y - halfCg and save */
			R6 = _mm_subs_epi16(R0, R2);
			/* R = (Y-halfCg) + halfCo */
			R3 = _mm_adds_epi16(R6, R1);
			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
			/* G = Y + Cg(/2) */
			R4 = _mm_adds_epi16(R0, R2);
			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
			/* B = (Y-halfCg) - Co(/2) */
			R5 = _mm_subs_epi16(R6, R1);
			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
			/* Repack R's & B's. */
			/* This line is the only diff between inverted and non-inverted.
			 * Unfortunately, it would be expensive to check "inverted"
			 * every time through this loop.
			 */
			R0 = _mm_packus_epi16(R5, R3);
			/* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
			/* Repack G's. */
			R1 = _mm_packus_epi16(R4, R4);
			/* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
			/* And add the A's. */
			R1 = _mm_unpackhi_epi64(R1, R7);
			/* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
			/* Now do interleaving again. */
			R2 = _mm_unpacklo_epi8(R0, R1);
			/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
			R3 = _mm_unpackhi_epi8(R0, R1);
			/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
			R4 = _mm_unpacklo_epi16(R2, R3);
			/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
			R5 = _mm_unpackhi_epi16(R2, R3);
			/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
			STORE_SI128(dptr, R4);
			dptr += (128 / 8);
			STORE_SI128(dptr, R5);
			dptr += (128 / 8);
			w -= 8;
		}

		/* Handle any remainder pixels. */
		if (w > 0)
		{
			pstatus_t status = 0;
			status = generic->YCoCgToRGB_8u_AC4R(
			    sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), WINPR_ASSERTING_INT_CAST(UINT32, w), 1,
			    shift, withAlpha);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
			dptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
		}

		sptr += sRowBump;
		dptr += dRowBump;
	}

	return PRIMITIVES_SUCCESS;
}
342 | | |
343 | | /* ------------------------------------------------------------------------- */ |
344 | | static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, |
345 | | BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, |
346 | | INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, |
347 | | BOOL withAlpha) |
348 | 0 | { |
349 | 0 | switch (DstFormat) |
350 | 0 | { |
351 | 0 | case PIXEL_FORMAT_BGRX32: |
352 | 0 | case PIXEL_FORMAT_BGRA32: |
353 | 0 | return ssse3_YCoCgRToRGB_8u_AC4R_invert( |
354 | 0 | pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat, |
355 | 0 | WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha); |
356 | | |
357 | 0 | case PIXEL_FORMAT_RGBX32: |
358 | 0 | case PIXEL_FORMAT_RGBA32: |
359 | 0 | return ssse3_YCoCgRToRGB_8u_AC4R_no_invert( |
360 | 0 | pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat, |
361 | 0 | WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha); |
362 | | |
363 | 0 | default: |
364 | 0 | return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, |
365 | 0 | height, shift, withAlpha); |
366 | 0 | } |
367 | 0 | } |
368 | | |
369 | | #endif |
370 | | |
371 | | /* ------------------------------------------------------------------------- */ |
372 | | void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims) |
373 | 0 | { |
374 | 0 | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
375 | 0 | generic = primitives_get_generic(); |
376 | |
|
377 | 0 | WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations"); |
378 | 0 | prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R; |
379 | | #else |
380 | | WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available"); |
381 | | WINPR_UNUSED(prims); |
382 | | #endif |
383 | 0 | } |