/src/FreeRDP/libfreerdp/primitives/prim_alphaComp_opt.c

Source (jump to first uncovered line)
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized alpha blending routines.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 * Note: this code assumes the second operand is fully opaque,
 * e.g.
 *   newval = alpha1*val1 + (1-alpha1)*val2
 * rather than
 *   newval = alpha1*val1 + (1-alpha1)*alpha2*val2
 * The IPP gives other options.
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */

#ifdef WITH_IPP
#include <ippi.h>
#endif /* WITH_IPP */

#include "prim_internal.h"

/* Primitives table returned by primitives_get_generic(). It is assigned in
 * primitives_init_alphaComp_opt() and used by sse2_alphaComp_argb() for the
 * spans it does not process with SSE registers. NULL until that init runs. */
static primitives_t* generic = NULL;

/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)

/**
 * SSE2 alpha composition of two 32-bit-per-pixel images into a destination.
 *
 * For every byte lane of every pixel the vector path computes
 *   dst = ((((alpha1 + 1) * (src1 - src2)) >> 8) + src2) & 0xff
 * where alpha1 is the fourth byte (16-bit lane 3) of the corresponding pSrc1
 * pixel. The second operand is treated as fully opaque (see the file header).
 *
 * Each row is split into three spans:
 *   1. a lead-in of 0..3 pixels handled by generic->alphaComp_argb so that the
 *      destination pointer reaches a 16-byte boundary,
 *   2. groups of 4 pixels handled with SSE2 registers,
 *   3. a tail of 0..3 pixels handled by generic->alphaComp_argb.
 * If the destination row is not 4-byte aligned it can never reach a 16-byte
 * boundary, so the whole row goes through the generic routine.
 *
 * Row starts are derived from the byte strides directly, so strides that are
 * not a multiple of sizeof(UINT32) are honoured exactly.
 *
 * @param pSrc1    first (blended-over) source image
 * @param src1Step byte distance between consecutive rows of pSrc1
 * @param pSrc2    second source image
 * @param src2Step byte distance between consecutive rows of pSrc2
 * @param pDst     destination image
 * @param dstStep  byte distance between consecutive rows of pDst
 * @param width    number of pixels per row
 * @param height   number of rows
 *
 * @return PRIMITIVES_SUCCESS, or the first non-success status reported by the
 *         generic routine.
 */
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                     const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                     BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                     UINT32 height)
{
  const BYTE* row1 = pSrc1;
  const BYTE* row2 = pSrc2;
  BYTE* rowDst = pDst;
  __m128i zero, one, byteMask;
  UINT32 y;

  /* Nothing to draw. */
  if ((width == 0) || (height == 0))
    return PRIMITIVES_SUCCESS;

  if (width < 4) /* pointless if too small */
  {
    return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
                                   height);
  }

  /* Loop-invariant constants. */
  zero = _mm_set1_epi32(0);
  one = _mm_set1_epi16(1);
  byteMask = _mm_set1_epi16(0x00ffU);

  for (y = 0; y < height; ++y)
  {
    const UINT32* sptr1 = (const UINT32*)row1;
    const UINT32* sptr2 = (const UINT32*)row2;
    UINT32* dptr = (UINT32*)rowDst;
    UINT32 pixels = width;
    UINT32 leadIn = 0;
    UINT32 count;

    /* Number of pixels needed to bring dptr to a 16-byte boundary. */
    switch ((ULONG_PTR)dptr & 0x0f)
    {
      case 0:
        leadIn = 0;
        break;

      case 4:
        leadIn = 3;
        break;

      case 8:
        leadIn = 2;
        break;

      case 12:
        leadIn = 1;
        break;

      default:
        /* We'll never hit a 16-byte boundary, so do the whole
         * row the slow way.
         */
        leadIn = width;
        break;
    }

    /* leadIn is either <= 3 (and width >= 4) or exactly width, so it never
     * exceeds the pixels available in this row. */
    if (leadIn)
    {
      pstatus_t status;
      status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
                                       src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr1 += leadIn;
      sptr2 += leadIn;
      dptr += leadIn;
      pixels -= leadIn;
    }

    /* Use SSE registers to do 4 pixels at a time. */
    count = pixels >> 2;
    pixels &= 3;

    while (count--)
    {
      __m128i src1, src2, hi, hiSrc2, hiDiff, lo, loSrc2, loDiff;

      /* Four pixels from each source.
       * NOTE(review): LOAD_SI128 is defined outside this file; only the
       * destination is aligned here, so it presumably tolerates unaligned
       * source addresses - confirm. */
      src1 = LOAD_SI128(sptr1);
      sptr1 += 4;
      src2 = LOAD_SI128(sptr2);
      sptr2 += 4;

      /* Upper 8 bytes (two pixels): widen every byte to a 16-bit lane. */
      hi = _mm_unpackhi_epi8(src1, zero);
      hiSrc2 = _mm_unpackhi_epi8(src2, zero);
      /* src1 - src2 per lane */
      hiDiff = _mm_subs_epi16(hi, hiSrc2);
      /* Replicate lane 3 of each pixel (its alpha) over that pixel's 4 lanes. */
      hi = _mm_shufflelo_epi16(hi, 0xff);
      hi = _mm_shufflehi_epi16(hi, 0xff);
      /* alpha + 1 */
      hi = _mm_adds_epi16(hi, one);
      /* Multiply and keep the low 16 bits of each product. */
      hi = _mm_mullo_epi16(hi, hiDiff);
      /* Divide by 256 (arithmetic shift). */
      hi = _mm_srai_epi16(hi, 8);
      /* Add src2 back. */
      hi = _mm_adds_epi16(hi, hiSrc2);

      /* Lower 8 bytes (two pixels): same sequence. */
      lo = _mm_unpacklo_epi8(src1, zero);
      loSrc2 = _mm_unpacklo_epi8(src2, zero);
      loDiff = _mm_subs_epi16(lo, loSrc2);
      lo = _mm_shufflelo_epi16(lo, 0xff);
      lo = _mm_shufflehi_epi16(lo, 0xff);
      lo = _mm_adds_epi16(lo, one);
      lo = _mm_mullo_epi16(lo, loDiff);
      lo = _mm_srai_epi16(lo, 8);
      lo = _mm_adds_epi16(lo, loSrc2);

      /* Keep only the low byte of every lane: the pack below saturates, so
       * any leftover high bits would otherwise be clamped instead of
       * discarded. */
      hi = _mm_and_si128(hi, byteMask);
      lo = _mm_and_si128(lo, byteMask);
      /* Narrow back to 4 pixels (lower pair first) and store aligned. */
      _mm_store_si128((__m128i*)dptr, _mm_packus_epi16(lo, hi));
      dptr += 4;
    }

    /* Finish off the remainder. */
    if (pixels)
    {
      pstatus_t status;
      status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
                                       src2Step, (BYTE*)dptr, dstStep, pixels, 1);
      if (status != PRIMITIVES_SUCCESS)
        return status;
    }

    /* Jump to next row using the byte strides as given. */
    row1 += src1Step;
    row2 += src2Step;
    rowDst += dstStep;
  }

  return PRIMITIVES_SUCCESS;
}
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif

#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
static pstatus_t ipp_alphaComp_argb(const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc2,
                                    INT32 src2Step, BYTE* pDst, INT32 dstStep, INT32 width,
                                    INT32 height)
{
  IppiSize sz;
  sz.width = width;
  sz.height = height;
  return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, sz, ippAlphaOver);
}
#endif

/* ------------------------------------------------------------------------- */
/**
 * Install the alpha-composition primitive into the given table.
 *
 * Caches the table returned by primitives_get_generic() in the file-level
 * `generic` pointer, lets primitives_init_alphaComp() fill in `prims`, and
 * then overrides prims->alphaComp_argb with an optimized routine:
 *   - the IPP wrapper when built WITH_IPP,
 *   - otherwise, when built WITH_SSE2, the SSE2 routine, but only if the CPU
 *     reports both SSE2 and SSE3 (the latter for LDDQU).
 *
 * @param prims primitives table to initialise
 */
void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
{
  generic = primitives_get_generic();
  primitives_init_alphaComp(prims);

#if defined(WITH_IPP)
  prims->alphaComp_argb = ipp_alphaComp_argb;
#elif defined(WITH_SSE2)
  /* Without SSE2 the second feature query is not made at all. */
  if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
    return;

  /* SSE3 is required for LDDQU. */
  if (!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    return;

  prims->alphaComp_argb = sse2_alphaComp_argb;
#endif
}

Coverage Report

Created: 2023-09-25 06:56

Line	Count	Source (jump to first uncovered line)
1		/* FreeRDP: A Remote Desktop Protocol Client
2		* Optimized alpha blending routines.
3		* vi:ts=4 sw=4:
4		*
5		* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6		* Licensed under the Apache License, Version 2.0 (the "License"); you may
7		* not use this file except in compliance with the License. You may obtain
8		* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9		* Unless required by applicable law or agreed to in writing, software
10		* distributed under the License is distributed on an "AS IS" BASIS,
11		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12		* or implied. See the License for the specific language governing
13		* permissions and limitations under the License.
14		*
15		* Note: this code assumes the second operand is fully opaque,
16		* e.g.
17		* newval = alpha1*val1 + (1-alpha1)*val2
18		* rather than
19		* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
20		* The IPP gives other options.
21		*/
22
23		#include <freerdp/config.h>
24
25		#include <freerdp/types.h>
26		#include <freerdp/primitives.h>
27		#include <winpr/sysinfo.h>
28
29		#ifdef WITH_SSE2
30		#include <emmintrin.h>
31		#include <pmmintrin.h>
32		#endif /* WITH_SSE2 */
33
34		#ifdef WITH_IPP
35		#include <ippi.h>
36		#endif /* WITH_IPP */
37
38		#include "prim_internal.h"
39
40		static primitives_t* generic = NULL;
41
42		/* ------------------------------------------------------------------------- */
43		#ifdef WITH_SSE2
44		#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
45
46		static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
47		const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
48		BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
49		UINT32 height)
50	0	{
51	0	const UINT32* sptr1 = (const UINT32*)pSrc1;
52	0	const UINT32* sptr2 = (const UINT32*)pSrc2;
53	0	UINT32* dptr;
54	0	int linebytes, src1Jump, src2Jump, dstJump;
55	0	UINT32 y;
56	0	__m128i xmm0, xmm1;
57
58	0	if ((width <= 0) || (height <= 0))
59	0	return PRIMITIVES_SUCCESS;
60
61	0	if (width < 4) /* pointless if too small */
62	0	{
63	0	return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
64	0	height);
65	0	}
66
67	0	dptr = (UINT32*)pDst;
68	0	linebytes = width * sizeof(UINT32);
69	0	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
70	0	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
71	0	dstJump = (dstStep - linebytes) / sizeof(UINT32);
72	0	xmm0 = _mm_set1_epi32(0);
73	0	xmm1 = _mm_set1_epi16(1);
74
75	0	for (y = 0; y < height; ++y)
76	0	{
77	0	int pixels = width;
78	0	int count;
79		/* Get to the 16-byte boundary now. */
80	0	int leadIn = 0;
81
82	0	switch ((ULONG_PTR)dptr & 0x0f)
83	0	{
84	0	case 0:
85	0	leadIn = 0;
86	0	break;
87
88	0	case 4:
89	0	leadIn = 3;
90	0	break;
91
92	0	case 8:
93	0	leadIn = 2;
94	0	break;
95
96	0	case 12:
97	0	leadIn = 1;
98	0	break;
99
100	0	default:
101		/* We'll never hit a 16-byte boundary, so do the whole
102		* thing the slow way.
103		*/
104	0	leadIn = width;
105	0	break;
106	0	}
107
108	0	if (leadIn)
109	0	{
110	0	pstatus_t status;
111	0	status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
112	0	src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
113	0	if (status != PRIMITIVES_SUCCESS)
114	0	return status;
115
116	0	sptr1 += leadIn;
117	0	sptr2 += leadIn;
118	0	dptr += leadIn;
119	0	pixels -= leadIn;
120	0	}
121
122		/* Use SSE registers to do 4 pixels at a time. */
123	0	count = pixels >> 2;
124	0	pixels -= count << 2;
125
126	0	while (count--)
127	0	{
128	0	__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
129		/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
130	0	xmm2 = LOAD_SI128(sptr1);
131	0	sptr1 += 4;
132		/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
133	0	xmm3 = LOAD_SI128(sptr2);
134	0	sptr2 += 4;
135		/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
136	0	xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
137		/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
138	0	xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
139		/* subtract */
140	0	xmm6 = _mm_subs_epi16(xmm4, xmm5);
141		/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
142	0	xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
143		/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
144	0	xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
145		/* Add one to alphas */
146	0	xmm4 = _mm_adds_epi16(xmm4, xmm1);
147		/* Multiply and take low word */
148	0	xmm4 = _mm_mullo_epi16(xmm4, xmm6);
149		/* Shift 8 right */
150	0	xmm4 = _mm_srai_epi16(xmm4, 8);
151		/* Add xmm5 */
152	0	xmm4 = _mm_adds_epi16(xmm4, xmm5);
153		/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
154		/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
155	0	xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
156		/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
157	0	xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
158		/* subtract */
159	0	xmm7 = _mm_subs_epi16(xmm5, xmm6);
160		/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
161	0	xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
162		/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
163	0	xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
164		/* Add one to alphas */
165	0	xmm5 = _mm_adds_epi16(xmm5, xmm1);
166		/* Multiply and take low word */
167	0	xmm5 = _mm_mullo_epi16(xmm5, xmm7);
168		/* Shift 8 right */
169	0	xmm5 = _mm_srai_epi16(xmm5, 8);
170		/* Add xmm6 */
171	0	xmm5 = _mm_adds_epi16(xmm5, xmm6);
172		/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
173		/* Must mask off remainders or pack gets confused */
174	0	xmm3 = _mm_set1_epi16(0x00ffU);
175	0	xmm4 = _mm_and_si128(xmm4, xmm3);
176	0	xmm5 = _mm_and_si128(xmm5, xmm3);
177		/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
178	0	xmm5 = _mm_packus_epi16(xmm5, xmm4);
179	0	_mm_store_si128((__m128i*)dptr, xmm5);
180	0	dptr += 4;
181	0	}
182
183		/* Finish off the remainder. */
184	0	if (pixels)
185	0	{
186	0	pstatus_t status;
187	0	status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
188	0	src2Step, (BYTE*)dptr, dstStep, pixels, 1);
189	0	if (status != PRIMITIVES_SUCCESS)
190	0	return status;
191
192	0	sptr1 += pixels;
193	0	sptr2 += pixels;
194	0	dptr += pixels;
195	0	}
196
197		/* Jump to next row. */
198	0	sptr1 += src1Jump;
199	0	sptr2 += src2Jump;
200	0	dptr += dstJump;
201	0	}
202
203	0	return PRIMITIVES_SUCCESS;
204	0	}
205		#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
206		#endif
207
208		#ifdef WITH_IPP
209		/* ------------------------------------------------------------------------- */
210		static pstatus_t ipp_alphaComp_argb(const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc2,
211		INT32 src2Step, BYTE* pDst, INT32 dstStep, INT32 width,
212		INT32 height)
213		{
214		IppiSize sz;
215		sz.width = width;
216		sz.height = height;
217		return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, sz, ippAlphaOver);
218		}
219		#endif
220
221		/* ------------------------------------------------------------------------- */
222		void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
223	0	{
224	0	generic = primitives_get_generic();
225	0	primitives_init_alphaComp(prims);
226		#ifdef WITH_IPP
227		prims->alphaComp_argb = ipp_alphaComp_argb;
228		#elif defined(WITH_SSE2)
229
230	0	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
231	0	IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
232	0	{
233	0	prims->alphaComp_argb = sse2_alphaComp_argb;
234	0	}
235
236	0	#endif
237	0	}