Coverage Report

Created: 2023-09-25 06:56

/src/FreeRDP/libfreerdp/primitives/prim_alphaComp_opt.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Optimized alpha blending routines.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7
 * not use this file except in compliance with the License. You may obtain
8
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9
 * Unless required by applicable law or agreed to in writing, software
10
 * distributed under the License is distributed on an "AS IS" BASIS,
11
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12
 * or implied. See the License for the specific language governing
13
 * permissions and limitations under the License.
14
 *
15
 * Note: this code assumes the second operand is fully opaque,
16
 * e.g.
17
 *   newval = alpha1*val1 + (1-alpha1)*val2
18
 * rather than
19
 *   newval = alpha1*val1 + (1-alpha1)*alpha2*val2
20
 * The IPP gives other options.
21
 */
22
23
#include <freerdp/config.h>
24
25
#include <freerdp/types.h>
26
#include <freerdp/primitives.h>
27
#include <winpr/sysinfo.h>
28
29
#ifdef WITH_SSE2
30
#include <emmintrin.h>
31
#include <pmmintrin.h>
32
#endif /* WITH_SSE2 */
33
34
#ifdef WITH_IPP
35
#include <ippi.h>
36
#endif /* WITH_IPP */
37
38
#include "prim_internal.h"
39
40
static primitives_t* generic = NULL;
41
42
/* ------------------------------------------------------------------------- */
43
#ifdef WITH_SSE2
44
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
45
46
/* Blend two widened pixels of src1 over src2.
 *
 * Each argument holds two pixels, one per 64-bit half, with every 8-bit
 * channel zero-extended to a 16-bit word. Word 3 of each src1 pixel (the top
 * byte of the original 32-bit value) is used as the alpha. Per channel:
 *
 *   result = (((alpha1 + 1) * (src1 - src2)) >> 8) + src2
 *
 * Only the low 16 bits of the product are kept, so the upper byte of each
 * result word may hold garbage; the caller has to mask every word down to
 * 8 bits before packing.
 */
static inline __m128i sse2_alphaComp_blend2(__m128i src1, __m128i src2, __m128i one)
{
  /* Channels are 0..255, so the signed difference cannot saturate. */
  const __m128i delta = _mm_subs_epi16(src1, src2);
  /* Broadcast the alpha word across the low pixel, then the high pixel. */
  __m128i alpha = _mm_shufflelo_epi16(src1, 0xff);
  alpha = _mm_shufflehi_epi16(alpha, 0xff);
  /* alpha + 1, so that a fully opaque source scales by 256. */
  alpha = _mm_adds_epi16(alpha, one);
  /* (alpha * delta) >> 8, then add the second operand back. */
  return _mm_adds_epi16(_mm_srai_epi16(_mm_mullo_epi16(alpha, delta), 8), src2);
}

/* SSE2 alpha composition of pSrc1 over pSrc2 into pDst, 32 bits per pixel.
 *
 * pSrc1/src1Step  first operand and its row stride in bytes
 * pSrc2/src2Step  second operand (treated as fully opaque) and its stride
 * pDst/dstStep    destination and its row stride in bytes
 * width/height    size of the region in pixels
 *
 * Returns PRIMITIVES_SUCCESS, or whatever status the generic implementation
 * reports for the parts delegated to it.
 *
 * Every row is split into three parts: a lead-in of up to three pixels that
 * brings the destination pointer to a 16-byte boundary, a run of four-pixel
 * groups handled with SSE registers, and a tail of up to three pixels. The
 * lead-in and the tail are handed to the generic routine as one-row calls.
 * Regions narrower than four pixels go to the generic routine entirely.
 */
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                     const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                     BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                     UINT32 height)
{
  const UINT32* fg = (const UINT32*)pSrc1;
  const UINT32* bg = (const UINT32*)pSrc2;
  UINT32* out = (UINT32*)pDst;
  int rowBytes, fgSkip, bgSkip, outSkip;
  UINT32 rowsLeft;
  __m128i zero, one, lowByte;

  /* Nothing to do for an empty region. */
  if ((width == 0) || (height == 0))
    return PRIMITIVES_SUCCESS;

  /* Too narrow for a single SSE group: not worth the setup. */
  if (width < 4)
  {
    return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
                                   height);
  }

  /* Distance, in pixels, from the end of one row to the start of the next. */
  rowBytes = width * sizeof(UINT32);
  fgSkip = (src1Step - rowBytes) / sizeof(UINT32);
  bgSkip = (src2Step - rowBytes) / sizeof(UINT32);
  outSkip = (dstStep - rowBytes) / sizeof(UINT32);

  zero = _mm_setzero_si128();
  one = _mm_set1_epi16(1);
  /* Keeps the low byte of every 16-bit word. */
  lowByte = _mm_set1_epi16(0x00ff);

  for (rowsLeft = height; rowsLeft != 0; --rowsLeft)
  {
    const ULONG_PTR misalign = (ULONG_PTR)out & 0x0f;
    int remaining = width;
    int quads;
    int head;

    /* Number of pixels needed to reach the next 16-byte boundary. If the
     * destination is not even 4-byte aligned no boundary can ever be reached,
     * so the whole row goes the slow way.
     */
    if ((misalign & 0x03) != 0)
      head = width;
    else
      head = (int)(((16 - misalign) >> 2) & 0x03);

    if (head != 0)
    {
      const pstatus_t rc =
          generic->alphaComp_argb((const BYTE*)fg, src1Step, (const BYTE*)bg, src2Step,
                                  (BYTE*)out, dstStep, head, 1);

      if (rc != PRIMITIVES_SUCCESS)
        return rc;

      fg += head;
      bg += head;
      out += head;
      remaining -= head;
    }

    /* Four pixels per SSE iteration; whatever is left over forms the tail. */
    quads = remaining >> 2;
    remaining -= quads << 2;

    for (; quads != 0; --quads)
    {
      /* Four pixels from each operand. */
      const __m128i px1 = LOAD_SI128(fg);
      const __m128i px2 = LOAD_SI128(bg);
      /* Upper two pixels, channels widened to 16 bits, blended. */
      __m128i hi = sse2_alphaComp_blend2(_mm_unpackhi_epi8(px1, zero),
                                         _mm_unpackhi_epi8(px2, zero), one);
      /* Lower two pixels, same treatment. */
      __m128i lo = sse2_alphaComp_blend2(_mm_unpacklo_epi8(px1, zero),
                                         _mm_unpacklo_epi8(px2, zero), one);
      /* Must mask off remainders or the saturating pack gets confused. */
      hi = _mm_and_si128(hi, lowByte);
      lo = _mm_and_si128(lo, lowByte);
      /* Narrow back to bytes and write with an aligned store. */
      _mm_store_si128((__m128i*)out, _mm_packus_epi16(lo, hi));
      fg += 4;
      bg += 4;
      out += 4;
    }

    /* Finish off the remainder of the row. */
    if (remaining != 0)
    {
      const pstatus_t rc =
          generic->alphaComp_argb((const BYTE*)fg, src1Step, (const BYTE*)bg, src2Step,
                                  (BYTE*)out, dstStep, remaining, 1);

      if (rc != PRIMITIVES_SUCCESS)
        return rc;

      fg += remaining;
      bg += remaining;
      out += remaining;
    }

    /* Jump to the next row. */
    fg += fgSkip;
    bg += bgSkip;
    out += outSkip;
  }

  return PRIMITIVES_SUCCESS;
}
205
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
206
#endif
207
208
#ifdef WITH_IPP
209
/* ------------------------------------------------------------------------- */
210
static pstatus_t ipp_alphaComp_argb(const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc2,
211
                                    INT32 src2Step, BYTE* pDst, INT32 dstStep, INT32 width,
212
                                    INT32 height)
213
{
214
  IppiSize sz;
215
  sz.width = width;
216
  sz.height = height;
217
  return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, sz, ippAlphaOver);
218
}
219
#endif
220
221
/* ------------------------------------------------------------------------- */
222
/* Set up the alphaComp primitive in prims, preferring an optimized variant.
 *
 * The generic primitives table is cached in the file-scope 'generic' pointer
 * and the generic alphaComp implementation is installed first. It is then
 * replaced by the IPP version when built WITH_IPP, or otherwise by the SSE2
 * version when built WITH_SSE2 and the processor reports both SSE2 and SSE3.
 */
void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
{
  generic = primitives_get_generic();
  primitives_init_alphaComp(prims);
#if defined(WITH_IPP)
  prims->alphaComp_argb = ipp_alphaComp_argb;
#elif defined(WITH_SSE2)

  /* Keep the generic routine unless the CPU supports everything needed. */
  if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
    return;

  if (!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
    return;

  prims->alphaComp_argb = sse2_alphaComp_argb;
#endif
}