/src/FreeRDP/libfreerdp/primitives/prim_YCoCg_opt.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Optimized YCoCg<->RGB conversion operations. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2014 Hewlett-Packard Development Company, L.P. |
6 | | * |
7 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | * you may not use this file except in compliance with the License. |
9 | | * You may obtain a copy of the License at |
10 | | * |
11 | | * http://www.apache.org/licenses/LICENSE-2.0 |
12 | | * |
13 | | * Unless required by applicable law or agreed to in writing, software |
14 | | * distributed under the License is distributed on an "AS IS" BASIS, |
15 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | * See the License for the specific language governing permissions and |
17 | | * limitations under the License. |
18 | | */ |
19 | | |
20 | | #include <freerdp/config.h> |
21 | | |
22 | | #include <freerdp/types.h> |
23 | | #include <freerdp/primitives.h> |
24 | | #include <winpr/sysinfo.h> |
25 | | |
26 | | #ifdef WITH_SSE2 |
27 | | #include <emmintrin.h> |
28 | | #include <tmmintrin.h> |
29 | | #elif defined(WITH_NEON) |
30 | | #include <arm_neon.h> |
31 | | #endif /* WITH_SSE2 else WITH_NEON */ |
32 | | |
33 | | #include "prim_internal.h" |
34 | | #include "prim_templates.h" |
35 | | |
36 | | static primitives_t* generic = NULL; /* Fallback primitives table; assigned from primitives_get_generic() in primitives_init_YCoCg_opt() and used whenever the SIMD paths punt. */ |
37 | | |
38 | | #ifdef WITH_SSE2 |
39 | | /* SSSE3 YCoCg-R -> RGB conversion for 4-byte pixels, "invert" channel order; ssse3_YCoCgRToRGB_8u_AC4R selects it for PIXEL_FORMAT_BGRX32/BGRA32. Converts 8 pixels per SIMD iteration and hands everything else (width < 8, pDst not 4-byte aligned, per-row head and tail pixels) to generic->YCoCgToRGB_8u_AC4R. Returns PRIMITIVES_SUCCESS, or the first non-success status reported by the generic routine. */ |
40 | | static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep, |
41 | | BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, |
42 | | UINT32 dstStep, UINT32 width, UINT32 height, |
43 | | UINT8 shift, BOOL withAlpha) |
44 | 0 | { |
45 | 0 | const BYTE* sptr = pSrc; |
46 | 0 | BYTE* dptr = (BYTE*)pDst; |
47 | 0 | int sRowBump = srcStep - width * sizeof(UINT32); /* source bytes between the end of one row's pixels and the next row */ |
48 | 0 | int dRowBump = dstStep - width * sizeof(UINT32); /* same for the destination */ |
49 | 0 | UINT32 h; |
50 | | /* Shift left by "shift" and divide by two is the same as shift |
51 | | * left by "shift-1". |
52 | | * NOTE(review): shift == 0 makes dataShift -1, so the shift computing mask is undefined; presumably callers always pass shift >= 1 - confirm. */ |
53 | 0 | int dataShift = shift - 1; |
54 | 0 | BYTE mask = (BYTE)(0xFFU << dataShift); |
55 | | |
56 | | /* Let's say the data is of the form: |
57 | | * a0y0o0g0 a1y1o1g1 a2y2o2g2... |
58 | | * Apply: |
59 | | * |R| | 1 1/2 -1/2 | |y| |
60 | | * |G| = | 1 0 1/2 | * |o| |
61 | | * |B| | 1 -1/2 -1/2 | |g| |
62 | | * where Y is 8-bit unsigned and o & g are 8-bit signed. |
63 | | */ |
64 | |
|
65 | 0 | if ((width < 8) || (ULONG_PTR)dptr & 0x03) |
66 | 0 | { |
67 | | /* Too small, or we'll never hit a 16-byte boundary. Punt. NOTE(review): only the first row's pointer is tested; later rows advance by dstStep, so the aligned stores below presumably rely on dstStep being a multiple of 4 - confirm. */ |
68 | 0 | return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
69 | 0 | shift, withAlpha); |
70 | 0 | } |
71 | | |
72 | 0 | for (h = 0; h < height; h++) |
73 | 0 | { |
74 | 0 | UINT32 w = width; |
75 | 0 | BOOL onStride; |
76 | | |
77 | | /* Get to a 16-byte destination boundary: the leading pixels of the row (1 to 3 when dptr is 4-byte aligned) are converted by the generic routine as a one-row call. */ |
78 | 0 | if ((ULONG_PTR)dptr & 0x0f) |
79 | 0 | { |
80 | 0 | pstatus_t status; |
81 | 0 | UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4; |
82 | |
|
83 | 0 | if (startup > width) |
84 | 0 | startup = width; |
85 | |
|
86 | 0 | status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup, |
87 | 0 | 1, shift, withAlpha); |
88 | |
|
89 | 0 | if (status != PRIMITIVES_SUCCESS) |
90 | 0 | return status; |
91 | | |
92 | 0 | sptr += startup * sizeof(UINT32); |
93 | 0 | dptr += startup * sizeof(UINT32); |
94 | 0 | w -= startup; |
95 | 0 | } |
96 | | |
97 | | /* Each loop handles eight pixels at a time. onStride records whether the source pointer is also 16-byte aligned, which decides between the aligned and the LDDQU load for this row. */ |
98 | 0 | onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE; |
99 | |
|
100 | 0 | while (w >= 8) |
101 | 0 | { |
102 | 0 | __m128i R0, R1, R2, R3, R4, R5, R6, R7; |
103 | |
|
104 | 0 | if (onStride) |
105 | 0 | { |
106 | | /* The faster path, 16-byte aligned load. */ |
107 | 0 | R0 = _mm_load_si128((const __m128i*)sptr); |
108 | 0 | sptr += (128 / 8); |
109 | 0 | R1 = _mm_load_si128((const __m128i*)sptr); |
110 | 0 | sptr += (128 / 8); |
111 | 0 | } |
112 | 0 | else |
113 | 0 | { |
114 | | /* Off-stride, slower LDDQU load. */ |
115 | 0 | R0 = _mm_lddqu_si128((const __m128i*)sptr); |
116 | 0 | sptr += (128 / 8); |
117 | 0 | R1 = _mm_lddqu_si128((const __m128i*)sptr); |
118 | 0 | sptr += (128 / 8); |
119 | 0 | } |
120 | | |
121 | | /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */ |
122 | | /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */ |
123 | | /* Shuffle to pack all the like types together. */ |
124 | 0 | R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); |
125 | 0 | R3 = _mm_shuffle_epi8(R0, R2); |
126 | 0 | R4 = _mm_shuffle_epi8(R1, R2); |
127 | | /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */ |
128 | | /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */ |
129 | 0 | R5 = _mm_unpackhi_epi32(R3, R4); |
130 | 0 | R6 = _mm_unpacklo_epi32(R3, R4); |
131 | | |
132 | | /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */ |
133 | | /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */ |
134 | | /* Save alphas aside */ |
135 | 0 | if (withAlpha) |
136 | 0 | R7 = _mm_unpackhi_epi64(R5, R5); |
137 | 0 | else |
138 | 0 | R7 = _mm_set1_epi32(0xFFFFFFFFU); |
139 | | |
140 | | /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 (every byte 0xFF when !withAlpha) */ |
141 | | /* Expand Y's from 8-bit unsigned to 16-bit signed. */ |
142 | 0 | R1 = _mm_set1_epi32(0); |
143 | 0 | R0 = _mm_unpacklo_epi8(R5, R1); |
144 | | /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */ |
145 | | /* Shift Co's and Cg's by (shift-1). -1 covers division by two. |
146 | | * Note: this must be done before sign-conversion. |
147 | | * Note also there is no slli_epi8, so we have to use a 16-bit |
148 | | * version and then mask. |
149 | | * The mask clears, in each byte, the low dataShift bits that the 16-bit shift carried in from the neighbouring byte. */ |
150 | 0 | R6 = _mm_slli_epi16(R6, dataShift); |
151 | 0 | R1 = _mm_set1_epi8(mask); |
152 | 0 | R6 = _mm_and_si128(R6, R1); |
153 | | /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */ |
154 | | /* Expand Co's from 8-bit signed to 16-bit signed */ |
155 | 0 | R1 = _mm_unpackhi_epi8(R6, R6); |
156 | 0 | R1 = _mm_srai_epi16(R1, 8); |
157 | | /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */ |
158 | | /* Expand Cg's from 8-bit signed to 16-bit signed */ |
159 | 0 | R2 = _mm_unpacklo_epi8(R6, R6); |
160 | 0 | R2 = _mm_srai_epi16(R2, 8); |
161 | | /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */ |
162 | | /* Get Y - halfCg and save */ |
163 | 0 | R6 = _mm_subs_epi16(R0, R2); |
164 | | /* R = (Y-halfCg) + halfCo */ |
165 | 0 | R3 = _mm_adds_epi16(R6, R1); |
166 | | /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */ |
167 | | /* G = Y + Cg(/2) */ |
168 | 0 | R4 = _mm_adds_epi16(R0, R2); |
169 | | /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */ |
170 | | /* B = (Y-halfCg) - Co(/2) */ |
171 | 0 | R5 = _mm_subs_epi16(R6, R1); |
172 | | /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */ |
173 | | /* Repack R's & B's. */ |
174 | 0 | R0 = _mm_packus_epi16(R3, R5); |
175 | | /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */ |
176 | | /* Repack G's. */ |
177 | 0 | R1 = _mm_packus_epi16(R4, R4); |
178 | | /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */ |
179 | | /* And add the A's. */ |
180 | 0 | R1 = _mm_unpackhi_epi64(R1, R7); |
181 | | /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */ |
182 | | /* Now do interleaving again. */ |
183 | 0 | R2 = _mm_unpacklo_epi8(R0, R1); |
184 | | /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */ |
185 | 0 | R3 = _mm_unpackhi_epi8(R0, R1); |
186 | | /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */ |
187 | 0 | R4 = _mm_unpacklo_epi16(R2, R3); |
188 | | /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */ |
189 | 0 | R5 = _mm_unpackhi_epi16(R2, R3); |
190 | | /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */ |
191 | 0 | _mm_store_si128((__m128i*)dptr, R4); |
192 | 0 | dptr += (128 / 8); |
193 | 0 | _mm_store_si128((__m128i*)dptr, R5); |
194 | 0 | dptr += (128 / 8); |
195 | 0 | w -= 8; |
196 | 0 | } |
197 | | |
198 | | /* Handle any remainder pixels (fewer than 8 left in this row) with the generic routine as a one-row call. */ |
199 | 0 | if (w > 0) |
200 | 0 | { |
201 | 0 | pstatus_t status; |
202 | 0 | status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1, |
203 | 0 | shift, withAlpha); |
204 | |
|
205 | 0 | if (status != PRIMITIVES_SUCCESS) |
206 | 0 | return status; |
207 | | |
208 | 0 | sptr += w * sizeof(UINT32); |
209 | 0 | dptr += w * sizeof(UINT32); |
210 | 0 | } |
211 | | |
212 | 0 | sptr += sRowBump; /* skip the row padding so both pointers start the next row */ |
213 | 0 | dptr += dRowBump; |
214 | 0 | } |
215 | | |
216 | 0 | return PRIMITIVES_SUCCESS; |
217 | 0 | } |
218 | | |
219 | | /* SSSE3 YCoCg-R -> RGB conversion for 4-byte pixels, "no invert" channel order; ssse3_YCoCgRToRGB_8u_AC4R selects it for PIXEL_FORMAT_RGBX32/RGBA32. Differs from the invert variant only in the operand order of the R/B repack. Converts 8 pixels per SIMD iteration and hands everything else (width < 8, pDst not 4-byte aligned, per-row head and tail pixels) to generic->YCoCgToRGB_8u_AC4R. Returns PRIMITIVES_SUCCESS, or the first non-success status reported by the generic routine. */ |
220 | | static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc, |
221 | | UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, |
222 | | UINT32 DstFormat, UINT32 dstStep, UINT32 width, |
223 | | UINT32 height, UINT8 shift, BOOL withAlpha) |
224 | 0 | { |
225 | 0 | const BYTE* sptr = pSrc; |
226 | 0 | BYTE* dptr = (BYTE*)pDst; |
227 | 0 | int sRowBump = srcStep - width * sizeof(UINT32); /* source bytes between the end of one row's pixels and the next row */ |
228 | 0 | int dRowBump = dstStep - width * sizeof(UINT32); /* same for the destination */ |
229 | 0 | UINT32 h; |
230 | | /* Shift left by "shift" and divide by two is the same as shift |
231 | | * left by "shift-1". |
232 | | * NOTE(review): shift == 0 makes dataShift -1, so the shift computing mask is undefined; presumably callers always pass shift >= 1 - confirm. */ |
233 | 0 | int dataShift = shift - 1; |
234 | 0 | BYTE mask = (BYTE)(0xFFU << dataShift); |
235 | | |
236 | | /* Let's say the data is of the form: |
237 | | * a0y0o0g0 a1y1o1g1 a2y2o2g2... |
238 | | * Apply: |
239 | | * |R| | 1 1/2 -1/2 | |y| |
240 | | * |G| = | 1 0 1/2 | * |o| |
241 | | * |B| | 1 -1/2 -1/2 | |g| |
242 | | * where Y is 8-bit unsigned and o & g are 8-bit signed. |
243 | | */ |
244 | |
|
245 | 0 | if ((width < 8) || (ULONG_PTR)dptr & 0x03) |
246 | 0 | { |
247 | | /* Too small, or we'll never hit a 16-byte boundary. Punt. NOTE(review): only the first row's pointer is tested; later rows advance by dstStep, so the aligned stores below presumably rely on dstStep being a multiple of 4 - confirm. */ |
248 | 0 | return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
249 | 0 | shift, withAlpha); |
250 | 0 | } |
251 | | |
252 | 0 | for (h = 0; h < height; h++) |
253 | 0 | { |
254 | 0 | int w = width; /* NOTE(review): signed here, UINT32 in the invert variant */ |
255 | 0 | BOOL onStride; |
256 | | |
257 | | /* Get to a 16-byte destination boundary: the leading pixels of the row (1 to 3 when dptr is 4-byte aligned) are converted by the generic routine as a one-row call. */ |
258 | 0 | if ((ULONG_PTR)dptr & 0x0f) |
259 | 0 | { |
260 | 0 | pstatus_t status; |
261 | 0 | UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4; |
262 | |
|
263 | 0 | if (startup > width) |
264 | 0 | startup = width; |
265 | |
|
266 | 0 | status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup, |
267 | 0 | 1, shift, withAlpha); |
268 | |
|
269 | 0 | if (status != PRIMITIVES_SUCCESS) |
270 | 0 | return status; |
271 | | |
272 | 0 | sptr += startup * sizeof(UINT32); |
273 | 0 | dptr += startup * sizeof(UINT32); |
274 | 0 | w -= startup; |
275 | 0 | } |
276 | | |
277 | | /* Each loop handles eight pixels at a time. onStride records whether the source pointer is also 16-byte aligned, which decides between the aligned and the LDDQU load for this row. */ |
278 | 0 | onStride = (((const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE; |
279 | |
|
280 | 0 | while (w >= 8) |
281 | 0 | { |
282 | 0 | __m128i R0, R1, R2, R3, R4, R5, R6, R7; |
283 | |
|
284 | 0 | if (onStride) |
285 | 0 | { |
286 | | /* The faster path, 16-byte aligned load. */ |
287 | 0 | R0 = _mm_load_si128((const __m128i*)sptr); |
288 | 0 | sptr += (128 / 8); |
289 | 0 | R1 = _mm_load_si128((const __m128i*)sptr); |
290 | 0 | sptr += (128 / 8); |
291 | 0 | } |
292 | 0 | else |
293 | 0 | { |
294 | | /* Off-stride, slower LDDQU load. */ |
295 | 0 | R0 = _mm_lddqu_si128((const __m128i*)sptr); |
296 | 0 | sptr += (128 / 8); |
297 | 0 | R1 = _mm_lddqu_si128((const __m128i*)sptr); |
298 | 0 | sptr += (128 / 8); |
299 | 0 | } |
300 | | |
301 | | /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */ |
302 | | /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */ |
303 | | /* Shuffle to pack all the like types together. */ |
304 | 0 | R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); |
305 | 0 | R3 = _mm_shuffle_epi8(R0, R2); |
306 | 0 | R4 = _mm_shuffle_epi8(R1, R2); |
307 | | /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */ |
308 | | /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */ |
309 | 0 | R5 = _mm_unpackhi_epi32(R3, R4); |
310 | 0 | R6 = _mm_unpacklo_epi32(R3, R4); |
311 | | |
312 | | /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */ |
313 | | /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */ |
314 | | /* Save alphas aside */ |
315 | 0 | if (withAlpha) |
316 | 0 | R7 = _mm_unpackhi_epi64(R5, R5); |
317 | 0 | else |
318 | 0 | R7 = _mm_set1_epi32(0xFFFFFFFFU); |
319 | | |
320 | | /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 (every byte 0xFF when !withAlpha) */ |
321 | | /* Expand Y's from 8-bit unsigned to 16-bit signed. */ |
322 | 0 | R1 = _mm_set1_epi32(0); |
323 | 0 | R0 = _mm_unpacklo_epi8(R5, R1); |
324 | | /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */ |
325 | | /* Shift Co's and Cg's by (shift-1). -1 covers division by two. |
326 | | * Note: this must be done before sign-conversion. |
327 | | * Note also there is no slli_epi8, so we have to use a 16-bit |
328 | | * version and then mask. |
329 | | * The mask clears, in each byte, the low dataShift bits that the 16-bit shift carried in from the neighbouring byte. */ |
330 | 0 | R6 = _mm_slli_epi16(R6, dataShift); |
331 | 0 | R1 = _mm_set1_epi8(mask); |
332 | 0 | R6 = _mm_and_si128(R6, R1); |
333 | | /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */ |
334 | | /* Expand Co's from 8-bit signed to 16-bit signed */ |
335 | 0 | R1 = _mm_unpackhi_epi8(R6, R6); |
336 | 0 | R1 = _mm_srai_epi16(R1, 8); |
337 | | /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */ |
338 | | /* Expand Cg's from 8-bit signed to 16-bit signed */ |
339 | 0 | R2 = _mm_unpacklo_epi8(R6, R6); |
340 | 0 | R2 = _mm_srai_epi16(R2, 8); |
341 | | /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */ |
342 | | /* Get Y - halfCg and save */ |
343 | 0 | R6 = _mm_subs_epi16(R0, R2); |
344 | | /* R = (Y-halfCg) + halfCo */ |
345 | 0 | R3 = _mm_adds_epi16(R6, R1); |
346 | | /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */ |
347 | | /* G = Y + Cg(/2) */ |
348 | 0 | R4 = _mm_adds_epi16(R0, R2); |
349 | | /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */ |
350 | | /* B = (Y-halfCg) - Co(/2) */ |
351 | 0 | R5 = _mm_subs_epi16(R6, R1); |
352 | | /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */ |
353 | | /* Repack R's & B's. */ |
354 | | /* This line is the only diff between inverted and non-inverted. |
355 | | * Unfortunately, it would be expensive to check "inverted" |
356 | | * every time through this loop. |
357 | | */ |
358 | 0 | R0 = _mm_packus_epi16(R5, R3); |
359 | | /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */ |
360 | | /* Repack G's. */ |
361 | 0 | R1 = _mm_packus_epi16(R4, R4); |
362 | | /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */ |
363 | | /* And add the A's. */ |
364 | 0 | R1 = _mm_unpackhi_epi64(R1, R7); |
365 | | /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */ |
366 | | /* Now do interleaving again (R/B labels follow the R0 comment above, so they are swapped relative to the invert variant). */ |
367 | 0 | R2 = _mm_unpacklo_epi8(R0, R1); |
368 | | /* R2 = G7R7G6R6 G5R5G4R4 G3R3G2R2 G1R1G0R0 */ |
369 | 0 | R3 = _mm_unpackhi_epi8(R0, R1); |
370 | | /* R3 = A7B7A6B6 A5B5A4B4 A3B3A2B2 A1B1A0B0 */ |
371 | 0 | R4 = _mm_unpacklo_epi16(R2, R3); |
372 | | /* R4 = A3B3G3R3 A2B2G2R2 A1B1G1R1 A0B0G0R0 */ |
373 | 0 | R5 = _mm_unpackhi_epi16(R2, R3); |
374 | | /* R5 = A7B7G7R7 A6B6G6R6 A5B5G5R5 A4B4G4R4 */ |
375 | 0 | _mm_store_si128((__m128i*)dptr, R4); |
376 | 0 | dptr += (128 / 8); |
377 | 0 | _mm_store_si128((__m128i*)dptr, R5); |
378 | 0 | dptr += (128 / 8); |
379 | 0 | w -= 8; |
380 | 0 | } |
381 | | |
382 | | /* Handle any remainder pixels (fewer than 8 left in this row) with the generic routine as a one-row call. */ |
383 | 0 | if (w > 0) |
384 | 0 | { |
385 | 0 | pstatus_t status; |
386 | 0 | status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1, |
387 | 0 | shift, withAlpha); |
388 | |
|
389 | 0 | if (status != PRIMITIVES_SUCCESS) |
390 | 0 | return status; |
391 | | |
392 | 0 | sptr += w * sizeof(UINT32); |
393 | 0 | dptr += w * sizeof(UINT32); |
394 | 0 | } |
395 | | |
396 | 0 | sptr += sRowBump; /* skip the row padding so both pointers start the next row */ |
397 | 0 | dptr += dRowBump; |
398 | 0 | } |
399 | | |
400 | 0 | return PRIMITIVES_SUCCESS; |
401 | 0 | } |
402 | | #endif /* WITH_SSE2 */ |
403 | | |
404 | | #ifdef WITH_SSE2 |
405 | | /* SSSE3 entry point installed as prims->YCoCgToRGB_8u_AC4R. Dispatches on DstFormat: BGRX32/BGRA32 go to the _invert variant, RGBX32/RGBA32 to the _no_invert variant, and every other format to generic->YCoCgToRGB_8u_AC4R. The callee's status is returned unchanged. Note that srcStep/dstStep are INT32 here and are converted implicitly to the UINT32 parameters of the two variants. */ |
406 | | static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, |
407 | | BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, |
408 | | INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, |
409 | | BOOL withAlpha) |
410 | 0 | { |
411 | 0 | switch (DstFormat) |
412 | 0 | { |
413 | 0 | case PIXEL_FORMAT_BGRX32: |
414 | 0 | case PIXEL_FORMAT_BGRA32: |
415 | 0 | return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width, |
416 | 0 | height, shift, withAlpha); |
417 | | |
418 | 0 | case PIXEL_FORMAT_RGBX32: |
419 | 0 | case PIXEL_FORMAT_RGBA32: |
420 | 0 | return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep, |
421 | 0 | width, height, shift, withAlpha); |
422 | | |
423 | 0 | default: |
424 | 0 | return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, |
425 | 0 | height, shift, withAlpha); |
426 | 0 | } |
427 | 0 | } |
428 | | #elif defined(WITH_NEON) |
429 | | /* NEON YCoCg-R -> RGB conversion with a caller-chosen output byte layout. Each source pixel is 4 bytes read in the order Cg, Co, Y, alpha. bPos/gPos/rPos/aPos are the byte indices (0..3) inside each output pixel that receive B, G, R and alpha; when alpha is FALSE the alpha byte is written as 0xFF. Rows are processed 8 pixels at a time, followed by a scalar loop for the width % 8 leftover pixels. Always returns PRIMITIVES_SUCCESS. */ |
430 | | static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, |
431 | | BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep, |
432 | | UINT32 width, UINT32 height, UINT8 shift, BYTE bPos, |
433 | | BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha) |
434 | | { |
435 | | UINT32 y; |
436 | | BYTE* dptr = pDst; |
437 | | const BYTE* sptr = pSrc; |
438 | | const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); |
439 | | const int8_t cll = shift - 1; /* -1 builds in the /2's */ |
440 | | const UINT32 srcPad = srcStep - (width * 4); /* source bytes left in the row after its pixels */ |
441 | | const UINT32 dstPad = dstStep - (width * formatSize); /* NOTE(review): the stores below always advance 4 bytes per pixel, so this presumably expects formatSize == 4 (true for the 32-bit formats the visible caller passes) - confirm */ |
442 | | const UINT32 pad = width % 8; /* pixels per row left for the scalar loop */ |
443 | | const uint8x8_t aVal = vdup_n_u8(0xFF); |
444 | | const int8x8_t cllv = vdup_n_s8(cll); |
445 | | |
446 | | for (y = 0; y < height; y++) |
447 | | { |
448 | | UINT32 x; |
449 | | |
450 | | for (x = 0; x < width - pad; x += 8) |
451 | | { |
452 | | /* Note: shifts must be done before sign-conversion. vld4_u8 de-interleaves 8 pixels so that val[0..3] hold the eight Cg, Co, Y and alpha bytes respectively. */ |
453 | | const uint8x8x4_t raw = vld4_u8(sptr); |
454 | | const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv)); |
455 | | const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv)); |
456 | | const int16x8_t Cg = vmovl_s8(CgRaw); |
457 | | const int16x8_t Co = vmovl_s8(CoRaw); |
458 | | const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */ |
459 | | const int16x8_t T = vsubq_s16(Y, Cg); |
460 | | const int16x8_t R = vaddq_s16(T, Co); |
461 | | const int16x8_t G = vaddq_s16(Y, Cg); |
462 | | const int16x8_t B = vsubq_s16(T, Co); |
463 | | uint8x8x4_t bgrx; |
464 | | bgrx.val[bPos] = vqmovun_s16(B); /* saturating narrow back to unsigned 8-bit */ |
465 | | bgrx.val[gPos] = vqmovun_s16(G); |
466 | | bgrx.val[rPos] = vqmovun_s16(R); |
467 | | |
468 | | if (alpha) |
469 | | bgrx.val[aPos] = raw.val[3]; |
470 | | else |
471 | | bgrx.val[aPos] = aVal; |
472 | | |
473 | | vst4_u8(dptr, bgrx); |
474 | | sptr += sizeof(raw); |
475 | | dptr += sizeof(bgrx); |
476 | | } |
477 | | |
478 | | for (x = 0; x < pad; x++) |
479 | | { |
480 | | /* Note: shifts must be done before sign-conversion. Scalar version of the vector loop above for the leftover pixels of the row. */ |
481 | | const INT16 Cg = (INT16)((INT8)((*sptr++) << cll)); |
482 | | const INT16 Co = (INT16)((INT8)((*sptr++) << cll)); |
483 | | const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */ |
484 | | const INT16 T = Y - Cg; |
485 | | const INT16 R = T + Co; |
486 | | const INT16 G = Y + Cg; |
487 | | const INT16 B = T - Co; |
488 | | BYTE bgra[4]; |
489 | | bgra[bPos] = CLIP(B); |
490 | | bgra[gPos] = CLIP(G); |
491 | | bgra[rPos] = CLIP(R); |
492 | | bgra[aPos] = *sptr++; /* always consumes the source alpha byte; overwritten below when !alpha */ |
493 | | |
494 | | if (!alpha) |
495 | | bgra[aPos] = 0xFF; |
496 | | |
497 | | *dptr++ = bgra[0]; |
498 | | *dptr++ = bgra[1]; |
499 | | *dptr++ = bgra[2]; |
500 | | *dptr++ = bgra[3]; |
501 | | } |
502 | | |
503 | | sptr += srcPad; |
504 | | dptr += dstPad; |
505 | | } |
506 | | |
507 | | return PRIMITIVES_SUCCESS; |
508 | | } |
509 | | /* NEON entry point installed as prims->YCoCgToRGB_8u_AC4R. Maps each supported 32-bit DstFormat to the byte positions passed to neon_YCoCgToRGB_8u_X (the four numeric arguments are bPos, gPos, rPos, aPos, in that order); the A and X flavours of a layout share the same positions. Any other format is handed to generic->YCoCgToRGB_8u_AC4R. The callee's status is returned unchanged. */ |
510 | | static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, |
511 | | BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep, |
512 | | UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha) |
513 | | { |
514 | | switch (DstFormat) |
515 | | { |
516 | | case PIXEL_FORMAT_BGRA32: |
517 | | return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
518 | | shift, 2, 1, 0, 3, withAlpha); |
519 | | |
520 | | case PIXEL_FORMAT_BGRX32: |
521 | | return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
522 | | shift, 2, 1, 0, 3, withAlpha); |
523 | | |
524 | | case PIXEL_FORMAT_RGBA32: |
525 | | return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
526 | | shift, 0, 1, 2, 3, withAlpha); |
527 | | |
528 | | case PIXEL_FORMAT_RGBX32: |
529 | | return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
530 | | shift, 0, 1, 2, 3, withAlpha); |
531 | | |
532 | | case PIXEL_FORMAT_ARGB32: |
533 | | return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
534 | | shift, 1, 2, 3, 0, withAlpha); |
535 | | |
536 | | case PIXEL_FORMAT_XRGB32: |
537 | | return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
538 | | shift, 1, 2, 3, 0, withAlpha); |
539 | | |
540 | | case PIXEL_FORMAT_ABGR32: |
541 | | return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
542 | | shift, 3, 2, 1, 0, withAlpha); |
543 | | |
544 | | case PIXEL_FORMAT_XBGR32: |
545 | | return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, |
546 | | shift, 3, 2, 1, 0, withAlpha); |
547 | | |
548 | | default: |
549 | | return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, |
550 | | height, shift, withAlpha); |
551 | | } |
552 | | } |
553 | | #endif /* WITH_SSE2 */ |
554 | | |
555 | | /* Initializes the optimized YCoCg primitives: caches the generic primitives table in the file-scope "generic" pointer, runs primitives_init_YCoCg(prims), then replaces prims->YCoCgToRGB_8u_AC4R with the SSSE3 or NEON implementation when that path was compiled in and the runtime CPU feature checks below pass. */ |
556 | | void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims) |
557 | 0 | { |
558 | 0 | generic = primitives_get_generic(); |
559 | 0 | primitives_init_YCoCg(prims); |
560 | | /* While IPP acknowledges the existence of YCoCg-R, it doesn't currently |
561 | | * include any routines to work with it, especially with variable shift |
562 | | * width. |
563 | | */ |
564 | 0 | #if defined(WITH_SSE2) |
565 | |
|
566 | 0 | if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && |
567 | 0 | IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) |
568 | 0 | { |
569 | 0 | prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R; |
570 | 0 | } |
571 | |
|
572 | | #elif defined(WITH_NEON) |
573 | | |
574 | | if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) |
575 | | { |
576 | | prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R; |
577 | | } |
578 | | |
579 | | #endif /* WITH_SSE2 else WITH_NEON */ |
580 | 0 | } |