/src/FreeRDP/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Optimized YCoCg<->RGB conversion operations. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2014 Hewlett-Packard Development Company, L.P. |
6 | | * |
7 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | * you may not use this file except in compliance with the License. |
9 | | * You may obtain a copy of the License at |
10 | | * |
11 | | * http://www.apache.org/licenses/LICENSE-2.0 |
12 | | * |
13 | | * Unless required by applicable law or agreed to in writing, software |
14 | | * distributed under the License is distributed on an "AS IS" BASIS, |
15 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | * See the License for the specific language governing permissions and |
17 | | * limitations under the License. |
18 | | */ |
19 | | |
20 | | #include <freerdp/config.h> |
21 | | |
22 | | #include <freerdp/types.h> |
23 | | #include <freerdp/primitives.h> |
24 | | #include <winpr/sysinfo.h> |
25 | | |
26 | | #include "prim_YCoCg.h" |
27 | | |
28 | | #include "prim_internal.h" |
29 | | #include "prim_templates.h" |
30 | | |
31 | | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
32 | | #include <emmintrin.h> |
33 | | #include <tmmintrin.h> |
34 | | |
/* Generic (non-SIMD) primitives table, cached by
 * primitives_init_YCoCg_ssse3_int(). Used as the fallback for small widths,
 * insufficiently aligned destinations, per-row remainder pixels, and
 * destination formats the SSSE3 paths do not handle. */
static primitives_t* generic = NULL;
36 | | |
/* ------------------------------------------------------------------------- */
/* YCoCg-R -> RGB conversion (SSSE3), storing pixels with R and B swapped
 * relative to the _no_invert variant (used for PIXEL_FORMAT_BGRX32 /
 * PIXEL_FORMAT_BGRA32 destinations).
 *
 * Processes 8 pixels (two 128-bit loads/stores) per inner-loop iteration;
 * widths below 8, destinations not 4-byte aligned, and per-row remainders
 * are delegated to the generic (non-SIMD) implementation.
 *
 * pSrc/srcStep   - source YCoCg pixels (4 bytes/pixel) and row stride in bytes
 * pDst/dstStep   - destination buffer (4 bytes/pixel) and row stride in bytes
 * DstFormat      - forwarded to the generic fallback only
 * shift          - chroma upscale shift.
 *                  NOTE(review): dataShift = shift - 1 assumes shift >= 1;
 *                  shift == 0 would yield a negative shift count — confirm
 *                  callers never pass 0.
 * withAlpha      - TRUE keeps source alpha, FALSE forces opaque 0xFF
 *
 * Returns PRIMITIVES_SUCCESS, or the status of the generic fallback. */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
	const BYTE* sptr = pSrc;
	BYTE* dptr = pDst;

	WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
	WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
	/* Bytes of row padding to skip after processing width pixels. */
	const size_t sRowBump = srcStep - width * sizeof(UINT32);
	const size_t dRowBump = dstStep - width * sizeof(UINT32);
	/* Shift left by "shift" and divide by two is the same as shift
	 * left by "shift-1".
	 */
	int dataShift = shift - 1;
	BYTE mask = (BYTE)(0xFFU << dataShift);

	/* Let's say the data is of the form:
	 * a0y0o0g0 a1y1o1g1 a2y2o2g2...
	 * Apply:
	 * |R|   | 1  1/2 -1/2 |   |y|
	 * |G| = | 1  0    1/2 | * |o|
	 * |B|   | 1 -1/2 -1/2 |   |g|
	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
	 */

	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
	{
		/* Too small, or we'll never hit a 16-byte boundary. Punt. */
		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
		                                   width, height, shift, withAlpha);
	}

	for (UINT32 h = 0; h < height; h++)
	{
		UINT32 w = width;

		while (w >= 8)
		{
			__m128i R0;
			__m128i R1;
			__m128i R2;
			__m128i R3;
			__m128i R4;
			__m128i R5;
			__m128i R6;
			__m128i R7;

			R0 = LOAD_SI128(sptr);
			sptr += (128 / 8);
			R1 = LOAD_SI128(sptr);
			sptr += (128 / 8);

			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
			/* Shuffle to pack all the like types together. */
			R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
			R3 = _mm_shuffle_epi8(R0, R2);
			R4 = _mm_shuffle_epi8(R1, R2);
			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
			R5 = _mm_unpackhi_epi32(R3, R4);
			R6 = _mm_unpacklo_epi32(R3, R4);

			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Save alphas aside */
			if (withAlpha)
				R7 = _mm_unpackhi_epi64(R5, R5);
			else
				R7 = mm_set1_epu32(0xFFFFFFFFU);

			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
			R1 = mm_set1_epu32(0);
			R0 = _mm_unpacklo_epi8(R5, R1);
			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
			/* Shift Co's and Cg's by (shift-1). -1 covers division by two.
			 * Note: this must be done before sign-conversion.
			 * Note also there is no slli_epi8, so we have to use a 16-bit
			 * version and then mask.
			 */
			R6 = _mm_slli_epi16(R6, dataShift);
			R1 = mm_set1_epu8(mask);
			R6 = _mm_and_si128(R6, R1);
			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Expand Co's from 8-bit signed to 16-bit signed */
			R1 = _mm_unpackhi_epi8(R6, R6);
			R1 = _mm_srai_epi16(R1, 8);
			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
			/* Expand Cg's from 8-bit signed to 16-bit signed */
			R2 = _mm_unpacklo_epi8(R6, R6);
			R2 = _mm_srai_epi16(R2, 8);
			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
			/* Get Y - halfCg and save */
			R6 = _mm_subs_epi16(R0, R2);
			/* R = (Y-halfCg) + halfCo */
			R3 = _mm_adds_epi16(R6, R1);
			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
			/* G = Y + Cg(/2) */
			R4 = _mm_adds_epi16(R0, R2);
			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
			/* B = (Y-halfCg) - Co(/2) */
			R5 = _mm_subs_epi16(R6, R1);
			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
			/* Repack R's & B's. */
			R0 = _mm_packus_epi16(R3, R5);
			/* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
			/* Repack G's. */
			R1 = _mm_packus_epi16(R4, R4);
			/* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
			/* And add the A's. */
			R1 = _mm_unpackhi_epi64(R1, R7);
			/* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
			/* Now do interleaving again. */
			R2 = _mm_unpacklo_epi8(R0, R1);
			/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
			R3 = _mm_unpackhi_epi8(R0, R1);
			/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
			R4 = _mm_unpacklo_epi16(R2, R3);
			/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
			R5 = _mm_unpackhi_epi16(R2, R3);
			/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
			STORE_SI128(dptr, R4);
			dptr += (128 / 8);
			STORE_SI128(dptr, R5);
			dptr += (128 / 8);
			w -= 8;
		}

		/* Handle any remainder pixels. */
		if (w > 0)
		{
			pstatus_t status = 0;
			status = generic->YCoCgToRGB_8u_AC4R(
			    sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr += w * sizeof(UINT32);
			dptr += w * sizeof(UINT32);
		}

		sptr += sRowBump;
		dptr += dRowBump;
	}

	return PRIMITIVES_SUCCESS;
}
191 | | |
/* ------------------------------------------------------------------------- */
/* YCoCg-R -> RGB conversion (SSSE3), natural channel order (used for
 * PIXEL_FORMAT_RGBX32 / PIXEL_FORMAT_RGBA32 destinations). Identical to
 * the _invert variant except for the single R/B repacking line below.
 *
 * Processes 8 pixels (two 128-bit loads/stores) per inner-loop iteration;
 * widths below 8, destinations not 4-byte aligned, and per-row remainders
 * are delegated to the generic (non-SIMD) implementation.
 *
 * NOTE(review): unlike the _invert variant this function has no
 * WINPR_ASSERT checks that srcStep/dstStep cover width pixels — confirm
 * whether that asymmetry is intentional.
 * NOTE(review): dataShift = shift - 1 assumes shift >= 1; shift == 0 would
 * yield a negative shift count — confirm callers never pass 0. */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
	const BYTE* sptr = pSrc;
	BYTE* dptr = pDst;
	/* Bytes of row padding to skip after processing width pixels. */
	size_t sRowBump = srcStep - width * sizeof(UINT32);
	size_t dRowBump = dstStep - width * sizeof(UINT32);
	/* Shift left by "shift" and divide by two is the same as shift
	 * left by "shift-1".
	 */
	int dataShift = shift - 1;
	BYTE mask = (BYTE)(0xFFU << dataShift);

	/* Let's say the data is of the form:
	 * a0y0o0g0 a1y1o1g1 a2y2o2g2...
	 * Apply:
	 * |R|   | 1  1/2 -1/2 |   |y|
	 * |G| = | 1  0    1/2 | * |o|
	 * |B|   | 1 -1/2 -1/2 |   |g|
	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
	 */

	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
	{
		/* Too small, or we'll never hit a 16-byte boundary. Punt. */
		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
		                                   width, height, shift, withAlpha);
	}

	for (UINT32 h = 0; h < height; h++)
	{
		UINT32 w = width;

		while (w >= 8)
		{
			__m128i R7;

			/* The faster path, 16-byte aligned load. */
			__m128i R0 = LOAD_SI128(sptr);
			sptr += (128 / 8);
			__m128i R1 = LOAD_SI128(sptr);
			sptr += (128 / 8);

			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
			/* Shuffle to pack all the like types together. */
			__m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
			__m128i R3 = _mm_shuffle_epi8(R0, R2);
			__m128i R4 = _mm_shuffle_epi8(R1, R2);
			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
			__m128i R5 = _mm_unpackhi_epi32(R3, R4);
			__m128i R6 = _mm_unpacklo_epi32(R3, R4);

			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Save alphas aside */
			if (withAlpha)
				R7 = _mm_unpackhi_epi64(R5, R5);
			else
				R7 = mm_set1_epu32(0xFFFFFFFFU);

			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
			R1 = mm_set1_epu32(0);
			R0 = _mm_unpacklo_epi8(R5, R1);
			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
			/* Shift Co's and Cg's by (shift-1). -1 covers division by two.
			 * Note: this must be done before sign-conversion.
			 * Note also there is no slli_epi8, so we have to use a 16-bit
			 * version and then mask.
			 */
			R6 = _mm_slli_epi16(R6, dataShift);
			R1 = mm_set1_epu8(mask);
			R6 = _mm_and_si128(R6, R1);
			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Expand Co's from 8-bit signed to 16-bit signed */
			R1 = _mm_unpackhi_epi8(R6, R6);
			R1 = _mm_srai_epi16(R1, 8);
			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
			/* Expand Cg's from 8-bit signed to 16-bit signed */
			R2 = _mm_unpacklo_epi8(R6, R6);
			R2 = _mm_srai_epi16(R2, 8);
			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
			/* Get Y - halfCg and save */
			R6 = _mm_subs_epi16(R0, R2);
			/* R = (Y-halfCg) + halfCo */
			R3 = _mm_adds_epi16(R6, R1);
			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
			/* G = Y + Cg(/2) */
			R4 = _mm_adds_epi16(R0, R2);
			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
			/* B = (Y-halfCg) - Co(/2) */
			R5 = _mm_subs_epi16(R6, R1);
			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
			/* Repack R's & B's. */
			/* This line is the only diff between inverted and non-inverted.
			 * Unfortunately, it would be expensive to check "inverted"
			 * every time through this loop.
			 */
			R0 = _mm_packus_epi16(R5, R3);
			/* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
			/* Repack G's. */
			R1 = _mm_packus_epi16(R4, R4);
			/* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
			/* And add the A's. */
			R1 = _mm_unpackhi_epi64(R1, R7);
			/* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
			/* Now do interleaving again. */
			R2 = _mm_unpacklo_epi8(R0, R1);
			/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
			R3 = _mm_unpackhi_epi8(R0, R1);
			/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
			R4 = _mm_unpacklo_epi16(R2, R3);
			/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
			R5 = _mm_unpackhi_epi16(R2, R3);
			/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
			STORE_SI128(dptr, R4);
			dptr += (128 / 8);
			STORE_SI128(dptr, R5);
			dptr += (128 / 8);
			w -= 8;
		}

		/* Handle any remainder pixels. */
		if (w > 0)
		{
			pstatus_t status = 0;
			status = generic->YCoCgToRGB_8u_AC4R(
			    sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), WINPR_ASSERTING_INT_CAST(UINT32, w), 1,
			    shift, withAlpha);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
			dptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
		}

		sptr += sRowBump;
		dptr += dRowBump;
	}

	return PRIMITIVES_SUCCESS;
}
342 | | |
343 | | /* ------------------------------------------------------------------------- */ |
344 | | static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, |
345 | | BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, |
346 | | INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, |
347 | | BOOL withAlpha) |
348 | 0 | { |
349 | 0 | switch (DstFormat) |
350 | 0 | { |
351 | 0 | case PIXEL_FORMAT_BGRX32: |
352 | 0 | case PIXEL_FORMAT_BGRA32: |
353 | 0 | return ssse3_YCoCgRToRGB_8u_AC4R_invert( |
354 | 0 | pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat, |
355 | 0 | WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha); |
356 | | |
357 | 0 | case PIXEL_FORMAT_RGBX32: |
358 | 0 | case PIXEL_FORMAT_RGBA32: |
359 | 0 | return ssse3_YCoCgRToRGB_8u_AC4R_no_invert( |
360 | 0 | pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat, |
361 | 0 | WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha); |
362 | | |
363 | 0 | default: |
364 | 0 | return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, |
365 | 0 | height, shift, withAlpha); |
366 | 0 | } |
367 | 0 | } |
368 | | |
369 | | #endif |
370 | | |
371 | | /* ------------------------------------------------------------------------- */ |
372 | | void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims) |
373 | 0 | { |
374 | 0 | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
375 | 0 | generic = primitives_get_generic(); |
376 | |
|
377 | 0 | WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations"); |
378 | 0 | prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R; |
379 | | #else |
380 | | WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available"); |
381 | | WINPR_UNUSED(prims); |
382 | | #endif |
383 | 0 | } |