/src/FreeRDP/libfreerdp/primitives/prim_alphaComp_opt.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Optimized alpha blending routines. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
6 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
7 | | * not use this file except in compliance with the License. You may obtain |
8 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
9 | | * Unless required by applicable law or agreed to in writing, software |
10 | | * distributed under the License is distributed on an "AS IS" BASIS, |
11 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
12 | | * or implied. See the License for the specific language governing |
13 | | * permissions and limitations under the License. |
14 | | * |
15 | | * Note: this code assumes the second operand is fully opaque, |
16 | | * e.g. |
17 | | * newval = alpha1*val1 + (1-alpha1)*val2 |
18 | | * rather than |
19 | | * newval = alpha1*val1 + (1-alpha1)*alpha2*val2 |
20 | | * The IPP gives other options. |
21 | | */ |
22 | | |
23 | | #include <freerdp/config.h> |
24 | | |
25 | | #include <freerdp/types.h> |
26 | | #include <freerdp/primitives.h> |
27 | | #include <winpr/sysinfo.h> |
28 | | |
29 | | #ifdef WITH_SSE2 |
30 | | #include <emmintrin.h> |
31 | | #include <pmmintrin.h> |
32 | | #endif /* WITH_SSE2 */ |
33 | | |
34 | | #ifdef WITH_IPP |
35 | | #include <ippi.h> |
36 | | #endif /* WITH_IPP */ |
37 | | |
38 | | #include "prim_internal.h" |
39 | | |
40 | | static primitives_t* generic = NULL; /* Primitives table obtained from primitives_get_generic() in primitives_init_alphaComp_opt(); the optimized routine calls generic->alphaComp_argb for narrow images and for the unaligned head/tail of each row. */ |
41 | | |
42 | | /* ------------------------------------------------------------------------- */ |
43 | | #ifdef WITH_SSE2 |
44 | | #if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) |
45 | | |
46 | | static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step, |
47 | | const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step, |
48 | | BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width, |
49 | | UINT32 height) |
50 | 0 | { |
51 | 0 | const UINT32* sptr1 = (const UINT32*)pSrc1; |
52 | 0 | const UINT32* sptr2 = (const UINT32*)pSrc2; |
53 | 0 | UINT32* dptr; |
54 | 0 | int linebytes, src1Jump, src2Jump, dstJump; |
55 | 0 | UINT32 y; |
56 | 0 | __m128i xmm0, xmm1; |
57 | |
|
58 | 0 | if ((width <= 0) || (height <= 0)) |
59 | 0 | return PRIMITIVES_SUCCESS; |
60 | | |
61 | 0 | if (width < 4) /* pointless if too small */ |
62 | 0 | { |
63 | 0 | return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width, |
64 | 0 | height); |
65 | 0 | } |
66 | | |
67 | 0 | dptr = (UINT32*)pDst; |
68 | 0 | linebytes = width * sizeof(UINT32); |
69 | 0 | src1Jump = (src1Step - linebytes) / sizeof(UINT32); |
70 | 0 | src2Jump = (src2Step - linebytes) / sizeof(UINT32); |
71 | 0 | dstJump = (dstStep - linebytes) / sizeof(UINT32); |
72 | 0 | xmm0 = _mm_set1_epi32(0); |
73 | 0 | xmm1 = _mm_set1_epi16(1); |
74 | |
|
75 | 0 | for (y = 0; y < height; ++y) |
76 | 0 | { |
77 | 0 | int pixels = width; |
78 | 0 | int count; |
79 | | /* Get to the 16-byte boundary now. */ |
80 | 0 | int leadIn = 0; |
81 | |
|
82 | 0 | switch ((ULONG_PTR)dptr & 0x0f) |
83 | 0 | { |
84 | 0 | case 0: |
85 | 0 | leadIn = 0; |
86 | 0 | break; |
87 | | |
88 | 0 | case 4: |
89 | 0 | leadIn = 3; |
90 | 0 | break; |
91 | | |
92 | 0 | case 8: |
93 | 0 | leadIn = 2; |
94 | 0 | break; |
95 | | |
96 | 0 | case 12: |
97 | 0 | leadIn = 1; |
98 | 0 | break; |
99 | | |
100 | 0 | default: |
101 | | /* We'll never hit a 16-byte boundary, so do the whole |
102 | | * thing the slow way. |
103 | | */ |
104 | 0 | leadIn = width; |
105 | 0 | break; |
106 | 0 | } |
107 | | |
108 | 0 | if (leadIn) |
109 | 0 | { |
110 | 0 | pstatus_t status; |
111 | 0 | status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2, |
112 | 0 | src2Step, (BYTE*)dptr, dstStep, leadIn, 1); |
113 | 0 | if (status != PRIMITIVES_SUCCESS) |
114 | 0 | return status; |
115 | | |
116 | 0 | sptr1 += leadIn; |
117 | 0 | sptr2 += leadIn; |
118 | 0 | dptr += leadIn; |
119 | 0 | pixels -= leadIn; |
120 | 0 | } |
121 | | |
122 | | /* Use SSE registers to do 4 pixels at a time. */ |
123 | 0 | count = pixels >> 2; |
124 | 0 | pixels -= count << 2; |
125 | |
|
126 | 0 | while (count--) |
127 | 0 | { |
128 | 0 | __m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; |
129 | | /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */ |
130 | 0 | xmm2 = LOAD_SI128(sptr1); |
131 | 0 | sptr1 += 4; |
132 | | /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */ |
133 | 0 | xmm3 = LOAD_SI128(sptr2); |
134 | 0 | sptr2 += 4; |
135 | | /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */ |
136 | 0 | xmm4 = _mm_unpackhi_epi8(xmm2, xmm0); |
137 | | /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */ |
138 | 0 | xmm5 = _mm_unpackhi_epi8(xmm3, xmm0); |
139 | | /* subtract */ |
140 | 0 | xmm6 = _mm_subs_epi16(xmm4, xmm5); |
141 | | /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */ |
142 | 0 | xmm4 = _mm_shufflelo_epi16(xmm4, 0xff); |
143 | | /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */ |
144 | 0 | xmm4 = _mm_shufflehi_epi16(xmm4, 0xff); |
145 | | /* Add one to alphas */ |
146 | 0 | xmm4 = _mm_adds_epi16(xmm4, xmm1); |
147 | | /* Multiply and take low word */ |
148 | 0 | xmm4 = _mm_mullo_epi16(xmm4, xmm6); |
149 | | /* Shift 8 right */ |
150 | 0 | xmm4 = _mm_srai_epi16(xmm4, 8); |
151 | | /* Add xmm5 */ |
152 | 0 | xmm4 = _mm_adds_epi16(xmm4, xmm5); |
153 | | /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */ |
154 | | /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */ |
155 | 0 | xmm5 = _mm_unpacklo_epi8(xmm2, xmm0); |
156 | | /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */ |
157 | 0 | xmm6 = _mm_unpacklo_epi8(xmm3, xmm0); |
158 | | /* subtract */ |
159 | 0 | xmm7 = _mm_subs_epi16(xmm5, xmm6); |
160 | | /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */ |
161 | 0 | xmm5 = _mm_shufflelo_epi16(xmm5, 0xff); |
162 | | /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */ |
163 | 0 | xmm5 = _mm_shufflehi_epi16(xmm5, 0xff); |
164 | | /* Add one to alphas */ |
165 | 0 | xmm5 = _mm_adds_epi16(xmm5, xmm1); |
166 | | /* Multiply and take low word */ |
167 | 0 | xmm5 = _mm_mullo_epi16(xmm5, xmm7); |
168 | | /* Shift 8 right */ |
169 | 0 | xmm5 = _mm_srai_epi16(xmm5, 8); |
170 | | /* Add xmm6 */ |
171 | 0 | xmm5 = _mm_adds_epi16(xmm5, xmm6); |
172 | | /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */ |
173 | | /* Must mask off remainders or pack gets confused */ |
174 | 0 | xmm3 = _mm_set1_epi16(0x00ffU); |
175 | 0 | xmm4 = _mm_and_si128(xmm4, xmm3); |
176 | 0 | xmm5 = _mm_and_si128(xmm5, xmm3); |
177 | | /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */ |
178 | 0 | xmm5 = _mm_packus_epi16(xmm5, xmm4); |
179 | 0 | _mm_store_si128((__m128i*)dptr, xmm5); |
180 | 0 | dptr += 4; |
181 | 0 | } |
182 | | |
183 | | /* Finish off the remainder. */ |
184 | 0 | if (pixels) |
185 | 0 | { |
186 | 0 | pstatus_t status; |
187 | 0 | status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2, |
188 | 0 | src2Step, (BYTE*)dptr, dstStep, pixels, 1); |
189 | 0 | if (status != PRIMITIVES_SUCCESS) |
190 | 0 | return status; |
191 | | |
192 | 0 | sptr1 += pixels; |
193 | 0 | sptr2 += pixels; |
194 | 0 | dptr += pixels; |
195 | 0 | } |
196 | | |
197 | | /* Jump to next row. */ |
198 | 0 | sptr1 += src1Jump; |
199 | 0 | sptr2 += src2Jump; |
200 | 0 | dptr += dstJump; |
201 | 0 | } |
202 | | |
203 | 0 | return PRIMITIVES_SUCCESS; |
204 | 0 | } |
205 | | #endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ |
206 | | #endif |
207 | | |
208 | | #ifdef WITH_IPP |
209 | | /* ------------------------------------------------------------------------- */ |
210 | | static pstatus_t ipp_alphaComp_argb(const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc2, |
211 | | INT32 src2Step, BYTE* pDst, INT32 dstStep, INT32 width, |
212 | | INT32 height) |
213 | | { |
214 | | IppiSize sz; |
215 | | sz.width = width; |
216 | | sz.height = height; |
217 | | return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, sz, ippAlphaOver); |
218 | | } |
219 | | #endif |
220 | | |
221 | | /* ------------------------------------------------------------------------- */ |
222 | | void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims) |
223 | 0 | { |
224 | 0 | generic = primitives_get_generic(); |
225 | 0 | primitives_init_alphaComp(prims); |
226 | | #ifdef WITH_IPP |
227 | | prims->alphaComp_argb = ipp_alphaComp_argb; |
228 | | #elif defined(WITH_SSE2) |
229 | |
|
230 | 0 | if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && |
231 | 0 | IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */ |
232 | 0 | { |
233 | 0 | prims->alphaComp_argb = sse2_alphaComp_argb; |
234 | 0 | } |
235 | |
|
236 | 0 | #endif |
237 | 0 | } |