/src/FreeRDP/libfreerdp/primitives/sse/prim_alphaComp_sse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Optimized alpha blending routines. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
6 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
7 | | * not use this file except in compliance with the License. You may obtain |
8 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
9 | | * Unless required by applicable law or agreed to in writing, software |
10 | | * distributed under the License is distributed on an "AS IS" BASIS, |
11 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
12 | | * or implied. See the License for the specific language governing |
13 | | * permissions and limitations under the License. |
14 | | * |
15 | | * Note: this code assumes the second operand is fully opaque, |
16 | | * e.g. |
17 | | * newval = alpha1*val1 + (1-alpha1)*val2 |
18 | | * rather than |
19 | | * newval = alpha1*val1 + (1-alpha1)*alpha2*val2 |
20 | | * The IPP gives other options. |
21 | | */ |
22 | | |
23 | | #include <freerdp/config.h> |
24 | | |
25 | | #include <freerdp/types.h> |
26 | | #include <freerdp/primitives.h> |
27 | | #include <winpr/sysinfo.h> |
28 | | |
29 | | #include "prim_alphaComp.h" |
30 | | |
31 | | #include "prim_internal.h" |
32 | | #include "prim_avxsse.h" |
33 | | |
34 | | /* ------------------------------------------------------------------------- */ |
35 | | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
36 | | #include <emmintrin.h> |
37 | | #include <pmmintrin.h> |
38 | | |
/* Generic (non-SIMD) primitives table, used as a fallback for widths < 4 and
 * for unaligned lead-in/remainder pixels; set in
 * primitives_init_alphaComp_sse3_int. */
static primitives_t* generic = NULL;
40 | | |
/* Alpha-composite an ARGB image (pSrc1) over a second image (pSrc2) that is
 * assumed to be fully opaque:
 *     dst = alpha1 * src1 + (1 - alpha1) * src2
 * Each channel is computed as src2 + (((alpha1 + 1) * (src1 - src2)) >> 8),
 * four 32-bit pixels per iteration using SSE2 integer intrinsics.
 *
 * pSrc1, src1Step  blended-over source image and its stride in bytes
 * pSrc2, src2Step  opaque background image and its stride in bytes
 * pDst, dstStep    destination image and its stride in bytes
 * width, height    image dimensions in pixels
 *
 * Returns PRIMITIVES_SUCCESS, or the status of the generic fallback if it
 * fails. Images narrower than 4 pixels are delegated entirely to the generic
 * implementation; per row, unaligned lead-in and trailing remainder pixels
 * are also handled by the generic code.
 *
 * NOTE(review): only the destination pointer's 16-byte alignment is checked;
 * presumably LOAD_SI128 tolerates unaligned source loads — confirm in
 * prim_avxsse.h.
 */
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                     const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                     BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                     UINT32 height)
{
	const UINT32* sptr1 = (const UINT32*)pSrc1;
	const UINT32* sptr2 = (const UINT32*)pSrc2;

	/* Nothing to do for an empty image. */
	if ((width <= 0) || (height <= 0))
		return PRIMITIVES_SUCCESS;

	if (width < 4) /* pointless if too small */
	{
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
		                               height);
	}

	UINT32* dptr = (UINT32*)pDst;
	/* Per-row jumps (in 32-bit pixels) from the end of one row's pixel data
	 * to the start of the next row; steps are byte strides. */
	const size_t linebytes = width * sizeof(UINT32);
	const size_t src1Jump = (src1Step - linebytes) / sizeof(UINT32);
	const size_t src2Jump = (src2Step - linebytes) / sizeof(UINT32);
	const size_t dstJump = (dstStep - linebytes) / sizeof(UINT32);
	__m128i xmm0 = mm_set1_epu32(0);   /* all-zero: used to widen bytes to words */
	__m128i xmm1 = _mm_set1_epi16(1);  /* +1 added to alpha so 255 maps to 256 */

	for (UINT32 y = 0; y < height; ++y)
	{
		uint32_t pixels = width;
		uint32_t count = 0;
		/* Get to the 16-byte boundary now. */
		uint32_t leadIn = 0;

		/* leadIn = number of pixels to process scalar-wise until dptr
		 * reaches a 16-byte boundary (each pixel advances 4 bytes). */
		switch ((ULONG_PTR)dptr & 0x0f)
		{
			case 0:
				leadIn = 0;
				break;

			case 4:
				leadIn = 3;
				break;

			case 8:
				leadIn = 2;
				break;

			case 12:
				leadIn = 1;
				break;

			default:
				/* We'll never hit a 16-byte boundary, so do the whole
				 * thing the slow way.
				 */
				leadIn = width;
				break;
		}

		if (leadIn)
		{
			/* Let the generic implementation handle the unaligned prefix
			 * (one row of leadIn pixels). */
			pstatus_t status = 0;
			status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
			                                 src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += leadIn;
			sptr2 += leadIn;
			dptr += leadIn;
			pixels -= leadIn;
		}

		/* Use SSE registers to do 4 pixels at a time. */
		count = pixels >> 2;
		pixels -= count << 2;

		while (count--)
		{
			__m128i xmm2;
			__m128i xmm3;
			__m128i xmm4;
			__m128i xmm5;
			__m128i xmm6;
			__m128i xmm7;
			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
			xmm2 = LOAD_SI128(sptr1);
			sptr1 += 4;
			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
			xmm3 = LOAD_SI128(sptr2);
			sptr2 += 4;
			/* High half: widen bytes to 16-bit words.
			 * 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
			/* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
			/* subtract: (src1 - src2), signed 16-bit */
			xmm6 = _mm_subs_epi16(xmm4, xmm5);
			/* Broadcast each pixel's alpha word across its 4 channel words.
			 * 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
			/* Add one to alphas (so alpha 255 scales by 256/256) */
			xmm4 = _mm_adds_epi16(xmm4, xmm1);
			/* Multiply and take low word */
			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
			/* Shift 8 right (arithmetic: the difference may be negative) */
			xmm4 = _mm_srai_epi16(xmm4, 8);
			/* Add xmm5: result = src2 + ((alpha+1)*(src1-src2))>>8 */
			xmm4 = _mm_adds_epi16(xmm4, xmm5);
			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
			/* Low half, same computation.
			 * 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
			/* subtract */
			xmm7 = _mm_subs_epi16(xmm5, xmm6);
			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
			/* Add one to alphas */
			xmm5 = _mm_adds_epi16(xmm5, xmm1);
			/* Multiply and take low word */
			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
			/* Shift 8 right */
			xmm5 = _mm_srai_epi16(xmm5, 8);
			/* Add xmm6 */
			xmm5 = _mm_adds_epi16(xmm5, xmm6);
			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
			/* Must mask off remainders or pack gets confused */
			xmm3 = _mm_set1_epi16(0x00ffU);
			xmm4 = _mm_and_si128(xmm4, xmm3);
			xmm5 = _mm_and_si128(xmm5, xmm3);
			/* Narrow back to bytes and store 4 finished pixels.
			 * BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
			xmm5 = _mm_packus_epi16(xmm5, xmm4);
			STORE_SI128(dptr, xmm5);
			dptr += 4;
		}

		/* Finish off the remainder (0-3 trailing pixels) generically. */
		if (pixels)
		{
			pstatus_t status = 0;
			status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
			                                 src2Step, (BYTE*)dptr, dstStep, pixels, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += pixels;
			sptr2 += pixels;
			dptr += pixels;
		}

		/* Jump to next row. */
		sptr1 += src1Jump;
		sptr2 += src2Jump;
		dptr += dstJump;
	}

	return PRIMITIVES_SUCCESS;
}
201 | | #endif |
202 | | |
203 | | /* ------------------------------------------------------------------------- */ |
204 | | void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims) |
205 | 0 | { |
206 | 0 | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
207 | 0 | generic = primitives_get_generic(); |
208 | 0 | WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations"); |
209 | 0 | prims->alphaComp_argb = sse2_alphaComp_argb; |
210 | |
|
211 | | #else |
212 | | WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available"); |
213 | | WINPR_UNUSED(prims); |
214 | | #endif |
215 | 0 | } |