/src/FreeRDP/libfreerdp/primitives/sse/prim_alphaComp_sse3.c
Line | Count | Source |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Optimized alpha blending routines. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
6 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
7 | | * not use this file except in compliance with the License. You may obtain |
8 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
9 | | * Unless required by applicable law or agreed to in writing, software |
10 | | * distributed under the License is distributed on an "AS IS" BASIS, |
11 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
12 | | * or implied. See the License for the specific language governing |
13 | | * permissions and limitations under the License. |
14 | | * |
15 | | * Note: this code assumes the second operand is fully opaque, |
16 | | * i.e. |
17 | | * newval = alpha1*val1 + (1-alpha1)*val2 |
18 | | * rather than |
19 | | * newval = alpha1*val1 + (1-alpha1)*alpha2*val2 |
20 | | * The Intel IPP library offers other alpha-composition options. |
21 | | */ |
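| | /* Worked example of the formula above (alpha1 written as a 0..1 fraction): |
| | * alpha1 = 0.25, val1 = 200, val2 = 100 gives |
| | * newval = 0.25*200 + 0.75*100 = 125; the alpha of the second operand |
| | * never enters the computation. |
| | */ |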
22 | | |
23 | | #include <freerdp/config.h> |
24 | | |
25 | | #include <freerdp/types.h> |
26 | | #include <freerdp/primitives.h> |
27 | | #include <winpr/sysinfo.h> |
28 | | |
29 | | #include "prim_alphaComp.h" |
30 | | |
31 | | #include "prim_internal.h" |
32 | | |
33 | | /* ------------------------------------------------------------------------- */ |
34 | | #if defined(SSE2_ENABLED) |
35 | | #include <emmintrin.h> |
36 | | #include <pmmintrin.h> |
37 | | |
38 | | static primitives_t* generic = NULL; |
39 | | |
40 | | static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step, |
41 | | const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step, |
42 | | BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width, |
43 | | UINT32 height) |
44 | | { |
45 | | const UINT32* sptr1 = (const UINT32*)pSrc1; |
46 | | const UINT32* sptr2 = (const UINT32*)pSrc2; |
47 | | UINT32* dptr = NULL; |
48 | | int linebytes = 0; |
49 | | int src1Jump = 0; |
50 | | int src2Jump = 0; |
51 | | int dstJump = 0; |
52 | | __m128i xmm0; |
53 | | __m128i xmm1; |
54 | | |
55 | | if ((width == 0) || (height == 0)) |
56 | | return PRIMITIVES_SUCCESS; |
57 | | |
58 | | if (width < 4) /* pointless if too small */ |
59 | | { |
60 | | return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width, |
61 | | height); |
62 | | } |
63 | | |
64 | | dptr = (UINT32*)pDst; |
65 | | linebytes = width * sizeof(UINT32); |
66 | | src1Jump = (src1Step - linebytes) / sizeof(UINT32); |
67 | | src2Jump = (src2Step - linebytes) / sizeof(UINT32); |
68 | | dstJump = (dstStep - linebytes) / sizeof(UINT32); |
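| | /* xmm0 supplies the zero bytes used to widen pixels to 16-bit lanes; |
| | * xmm1 is the +1 bias added to each alpha before the multiply below. */ |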
69 | | xmm0 = _mm_set1_epi32(0); |
70 | | xmm1 = _mm_set1_epi16(1); |
71 | | |
72 | | for (UINT32 y = 0; y < height; ++y) |
73 | | { |
74 | | int pixels = width; |
75 | | int count = 0; |
76 | | /* Get to the 16-byte boundary now. */ |
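| | /* leadIn pixels are handed to the generic routine until dptr is 16-byte |
| | * aligned: the _mm_store_si128() below needs an aligned destination, while |
| | * the sources are read with LOAD_SI128 (lddqu) and may stay unaligned. */ |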
77 | | int leadIn = 0; |
78 | | |
79 | | switch ((ULONG_PTR)dptr & 0x0f) |
80 | | { |
81 | | case 0: |
82 | | leadIn = 0; |
83 | | break; |
84 | | |
85 | | case 4: |
86 | | leadIn = 3; |
87 | | break; |
88 | | |
89 | | case 8: |
90 | | leadIn = 2; |
91 | | break; |
92 | | |
93 | | case 12: |
94 | | leadIn = 1; |
95 | | break; |
96 | | |
97 | | default: |
98 | | /* We'll never hit a 16-byte boundary, so do the whole |
99 | | * thing the slow way. |
100 | | */ |
101 | | leadIn = width; |
102 | | break; |
103 | | } |
104 | | |
105 | | if (leadIn) |
106 | | { |
107 | | pstatus_t status = 0; |
108 | | status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2, |
109 | | src2Step, (BYTE*)dptr, dstStep, leadIn, 1); |
110 | | if (status != PRIMITIVES_SUCCESS) |
111 | | return status; |
112 | | |
113 | | sptr1 += leadIn; |
114 | | sptr2 += leadIn; |
115 | | dptr += leadIn; |
116 | | pixels -= leadIn; |
117 | | } |
118 | | |
119 | | /* Use SSE registers to do 4 pixels at a time. */ |
120 | | count = pixels >> 2; |
121 | | pixels -= count << 2; |
122 | | |
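| | /* Per channel the vector code computes |
| | * dst = src2 + (((alpha + 1) * (src1 - src2)) >> 8), |
| | * a division-free approximation of alpha1*val1 + (1 - alpha1)*val2 from |
| | * the header note; it is exact for alpha = 0xff. */ |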
123 | | while (count--) |
124 | | { |
125 | | __m128i xmm2; |
126 | | __m128i xmm3; |
127 | | __m128i xmm4; |
128 | | __m128i xmm5; |
129 | | __m128i xmm6; |
130 | | __m128i xmm7; |
131 | | /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */ |
132 | | xmm2 = LOAD_SI128(sptr1); |
133 | | sptr1 += 4; |
134 | | /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */ |
135 | | xmm3 = LOAD_SI128(sptr2); |
136 | | sptr2 += 4; |
137 | | /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */ |
138 | | xmm4 = _mm_unpackhi_epi8(xmm2, xmm0); |
139 | | /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */ |
140 | | xmm5 = _mm_unpackhi_epi8(xmm3, xmm0); |
141 | | /* subtract */ |
142 | | xmm6 = _mm_subs_epi16(xmm4, xmm5); |
143 | | /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */ |
144 | | xmm4 = _mm_shufflelo_epi16(xmm4, 0xff); |
145 | | /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */ |
146 | | xmm4 = _mm_shufflehi_epi16(xmm4, 0xff); |
147 | | /* Add one to alphas */ |
148 | | xmm4 = _mm_adds_epi16(xmm4, xmm1); |
149 | | /* Multiply and take low word */ |
150 | | xmm4 = _mm_mullo_epi16(xmm4, xmm6); |
151 | | /* Shift 8 right */ |
152 | | xmm4 = _mm_srai_epi16(xmm4, 8); |
153 | | /* Add xmm5 */ |
154 | | xmm4 = _mm_adds_epi16(xmm4, xmm5); |
155 | | /* xmm4 now holds the blended pixels: 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */ |
156 | | /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */ |
157 | | xmm5 = _mm_unpacklo_epi8(xmm2, xmm0); |
158 | | /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */ |
159 | | xmm6 = _mm_unpacklo_epi8(xmm3, xmm0); |
160 | | /* subtract */ |
161 | | xmm7 = _mm_subs_epi16(xmm5, xmm6); |
162 | | /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */ |
163 | | xmm5 = _mm_shufflelo_epi16(xmm5, 0xff); |
164 | | /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */ |
165 | | xmm5 = _mm_shufflehi_epi16(xmm5, 0xff); |
166 | | /* Add one to alphas */ |
167 | | xmm5 = _mm_adds_epi16(xmm5, xmm1); |
168 | | /* Multiply and take low word */ |
169 | | xmm5 = _mm_mullo_epi16(xmm5, xmm7); |
170 | | /* Shift 8 right */ |
171 | | xmm5 = _mm_srai_epi16(xmm5, 8); |
172 | | /* Add xmm6 */ |
173 | | xmm5 = _mm_adds_epi16(xmm5, xmm6); |
174 | | /* xmm5 now holds the blended pixels: 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */ |
175 | | /* Must mask off remainders or pack gets confused */ |
176 | | xmm3 = _mm_set1_epi16(0x00ffU); |
177 | | xmm4 = _mm_and_si128(xmm4, xmm3); |
178 | | xmm5 = _mm_and_si128(xmm5, xmm3); |
179 | | /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */ |
180 | | xmm5 = _mm_packus_epi16(xmm5, xmm4); |
181 | | _mm_store_si128((__m128i*)dptr, xmm5); |
182 | | dptr += 4; |
183 | | } |
184 | | |
185 | | /* Finish off the remainder. */ |
186 | | if (pixels) |
187 | | { |
188 | | pstatus_t status = 0; |
189 | | status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2, |
190 | | src2Step, (BYTE*)dptr, dstStep, pixels, 1); |
191 | | if (status != PRIMITIVES_SUCCESS) |
192 | | return status; |
193 | | |
194 | | sptr1 += pixels; |
195 | | sptr2 += pixels; |
196 | | dptr += pixels; |
197 | | } |
198 | | |
199 | | /* Jump to next row. */ |
200 | | sptr1 += src1Jump; |
201 | | sptr2 += src2Jump; |
202 | | dptr += dstJump; |
203 | | } |
204 | | |
205 | | return PRIMITIVES_SUCCESS; |
206 | | } |
207 | | #endif |
208 | | |
209 | | /* ------------------------------------------------------------------------- */ |
210 | | void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims) |
211 | 0 | { |
212 | | #if defined(SSE2_ENABLED) |
213 | | generic = primitives_get_generic(); |
214 | | primitives_init_alphaComp(prims); |
215 | | |
216 | | if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && |
217 | | IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */ |
218 | | { |
219 | | WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations"); |
220 | | prims->alphaComp_argb = sse2_alphaComp_argb; |
221 | | } |
222 | | |
223 | | #else |
224 | 0 | WLog_VRB(PRIM_TAG, "undefined WITH_SSE2"); |
225 | 0 | WINPR_UNUSED(prims); |
226 | 0 | #endif |
227 | 0 | } |
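| | /* Usage sketch (not part of this file): callers normally reach this code |
| | * through the primitives table rather than calling it directly, e.g. |
| | * |
| | * primitives_t* prims = primitives_get(); |
| | * prims->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, |
| | * pDst, dstStep, width, height); |
| | * |
| | * where the step arguments are byte strides of 32-bit ARGB surfaces. |
| | */ |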