Coverage Report

Created: 2024-09-08 06:20

/src/FreeRDP/libfreerdp/primitives/sse/prim_alphaComp_sse3.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Optimized alpha blending routines.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7
 * not use this file except in compliance with the License. You may obtain
8
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9
 * Unless required by applicable law or agreed to in writing, software
10
 * distributed under the License is distributed on an "AS IS" BASIS,
11
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12
 * or implied. See the License for the specific language governing
13
 * permissions and limitations under the License.
14
 *
15
 * Note: this code assumes the second operand is fully opaque,
16
 * e.g.
17
 *   newval = alpha1*val1 + (1-alpha1)*val2
18
 * rather than
19
 *   newval = alpha1*val1 + (1-alpha1)*alpha2*val2
20
 * The IPP gives other options.
21
 */
22
23
#include <freerdp/config.h>
24
25
#include <freerdp/types.h>
26
#include <freerdp/primitives.h>
27
#include <winpr/sysinfo.h>
28
29
#include "prim_alphaComp.h"
30
31
#include "prim_internal.h"
32
33
/* ------------------------------------------------------------------------- */
34
#if defined(SSE2_ENABLED)
35
#include <emmintrin.h>
36
#include <pmmintrin.h>
37
38
/* Primitives table returned by primitives_get_generic(); assigned in
 * primitives_init_alphaComp_sse3(). sse2_alphaComp_argb() calls its
 * alphaComp_argb entry for every span it does not process with SSE
 * instructions. Stays NULL until the init function has run. */
static primitives_t* generic = NULL;
39
40
/**
 * Blend two pixels that were widened to one 16-bit lane per channel.
 *
 * Computes, per lane, src2 + (((alpha1 + 1) * (src1 - src2)) >> 8), where
 * alpha1 is lane 3 of the src1 pixel the lane belongs to.
 *
 * The 16-bit product may wrap around; that is harmless because only the low
 * byte of every returned lane is meaningful. The caller has to mask the lanes
 * with 0x00ff before packing them back to bytes.
 *
 * @param src1 two pixels of the first operand, one channel per 16-bit lane
 * @param src2 two pixels of the second operand, one channel per 16-bit lane
 * @param one  the value 1 replicated into all eight 16-bit lanes
 *
 * @return the blended lanes; only the low 8 bits of each lane are valid
 */
static __m128i sse2_alphaComp_blend_epi16(__m128i src1, __m128i src2, __m128i one)
{
  /* Per channel difference, always within -255..255 */
  const __m128i diff = _mm_subs_epi16(src1, src2);
  /* Replicate lane 3 (alpha) of each pixel into all four lanes of that pixel */
  __m128i alpha = _mm_shufflelo_epi16(src1, 0xff);
  alpha = _mm_shufflehi_epi16(alpha, 0xff);
  /* Add one to alphas so that the following shift by 8 acts as a division */
  alpha = _mm_adds_epi16(alpha, one);
  /* Multiply and take low word */
  __m128i blended = _mm_mullo_epi16(alpha, diff);
  /* Shift 8 right */
  blended = _mm_srai_epi16(blended, 8);
  /* Add the second operand back */
  return _mm_adds_epi16(blended, src2);
}

/**
 * Alpha composite two images of 32-bit pixels, processing 4 pixels per SSE
 * iteration.
 *
 * Every channel of every pixel is computed as
 *   dst = src2 + (((alpha1 + 1) * (src1 - src2)) >> 8)
 * with alpha1 taken from byte 3 of the src1 pixel, i.e. the second operand is
 * treated as fully opaque.
 *
 * Each row is split into three spans: the pixels in front of the first
 * 16-byte aligned destination address, the aligned groups of 4 pixels and the
 * remaining pixels. The first and the last span are handed to the generic
 * implementation. A row whose destination is not 4-byte aligned is handed to
 * the generic implementation as a whole, and so is the complete image if it
 * is less than 4 pixels wide.
 *
 * @param pSrc1    first source image
 * @param src1Step distance in bytes between two rows of pSrc1
 * @param pSrc2    second source image
 * @param src2Step distance in bytes between two rows of pSrc2
 * @param pDst     destination image
 * @param dstStep  distance in bytes between two rows of pDst
 * @param width    number of pixels per row
 * @param height   number of rows
 *
 * @return PRIMITIVES_SUCCESS, or the status of the generic implementation if
 *         one of its calls failed
 */
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                     const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                     BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                     UINT32 height)
{
  const UINT32* sptr1 = (const UINT32*)pSrc1;
  const UINT32* sptr2 = (const UINT32*)pSrc2;
  UINT32* dptr = (UINT32*)pDst;

  if ((width == 0) || (height == 0))
    return PRIMITIVES_SUCCESS;

  if (width < 4) /* pointless if too small */
  {
    return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
                                   height);
  }

  /* Number of pixels to skip at the end of a row to reach the next one */
  const UINT32 linebytes = width * (UINT32)sizeof(UINT32);
  const UINT32 src1Jump = (src1Step - linebytes) / (UINT32)sizeof(UINT32);
  const UINT32 src2Jump = (src2Step - linebytes) / (UINT32)sizeof(UINT32);
  const UINT32 dstJump = (dstStep - linebytes) / (UINT32)sizeof(UINT32);

  /* Loop invariant constants */
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i byteMask = _mm_set1_epi16(0x00ff);

  for (UINT32 y = 0; y < height; ++y)
  {
    UINT32 pixels = width;
    UINT32 leadIn = 0;
    const ULONG_PTR misalign = (ULONG_PTR)dptr & 0x0f;

    /* Get to the 16-byte boundary now. */
    if ((misalign & 0x03) != 0)
    {
      /* We'll never hit a 16-byte boundary, so do the whole
       * thing the slow way.
       */
      leadIn = width;
    }
    else
    {
      /* 0, 4, 8, 12 bytes past the boundary -> 0, 3, 2, 1 pixels to go */
      leadIn = (UINT32)(((16 - misalign) & 0x0f) / sizeof(UINT32));
    }

    if (leadIn != 0)
    {
      const pstatus_t status =
          generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2, src2Step,
                                  (BYTE*)dptr, dstStep, leadIn, 1);
      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr1 += leadIn;
      sptr2 += leadIn;
      dptr += leadIn;
      pixels -= leadIn;
    }

    /* Use SSE registers to do 4 pixels at a time. */
    const UINT32 count = pixels >> 2;
    pixels &= 0x03;

    for (UINT32 x = 0; x < count; ++x)
    {
      /* 4 pixels of each operand; the sources may be unaligned */
      const __m128i src1 = LOAD_SI128(sptr1);
      sptr1 += 4;
      const __m128i src2 = LOAD_SI128(sptr2);
      sptr2 += 4;

      /* Widen the upper two pixels to 16 bit per channel and blend them */
      __m128i hi = sse2_alphaComp_blend_epi16(_mm_unpackhi_epi8(src1, zero),
                                              _mm_unpackhi_epi8(src2, zero), one);
      /* Same for the lower two pixels */
      __m128i lo = sse2_alphaComp_blend_epi16(_mm_unpacklo_epi8(src1, zero),
                                              _mm_unpacklo_epi8(src2, zero), one);

      /* Must mask off remainders or pack gets confused */
      hi = _mm_and_si128(hi, byteMask);
      lo = _mm_and_si128(lo, byteMask);

      /* Narrow back to 8 bit per channel; dptr is 16-byte aligned here */
      _mm_store_si128((__m128i*)dptr, _mm_packus_epi16(lo, hi));
      dptr += 4;
    }

    /* Finish off the remainder. */
    if (pixels != 0)
    {
      const pstatus_t status =
          generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2, src2Step,
                                  (BYTE*)dptr, dstStep, pixels, 1);
      if (status != PRIMITIVES_SUCCESS)
        return status;

      sptr1 += pixels;
      sptr2 += pixels;
      dptr += pixels;
    }

    /* Jump to next row. */
    sptr1 += src1Jump;
    sptr2 += src2Jump;
    dptr += dstJump;
  }

  return PRIMITIVES_SUCCESS;
}
207
#endif
208
209
/* ------------------------------------------------------------------------- */
210
/**
 * Set up the alpha compositing primitive of the given table.
 *
 * Without SSE2_ENABLED the function only emits a log message. Otherwise it
 * stores the generic primitives table for later fallback calls, runs
 * primitives_init_alphaComp() on prims and, if the CPU reports both SSE2 and
 * SSE3, replaces prims->alphaComp_argb with the SSE implementation.
 *
 * @param prims primitives table to initialise
 */
void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE2_ENABLED)
  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
  WINPR_UNUSED(prims);
#else
  generic = primitives_get_generic();
  primitives_init_alphaComp(prims);

  if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
    return;

  /* SSE3 is required as well, for LDDQU */
  if (!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    return;

  WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
  prims->alphaComp_argb = sse2_alphaComp_argb;
#endif
}