/src/cairo/subprojects/pixman-0.44.2/pixman/pixman-sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright © 2008 Rodrigo Kumpera |
3 | | * Copyright © 2008 André Tupinambá |
4 | | * |
5 | | * Permission to use, copy, modify, distribute, and sell this software and its |
6 | | * documentation for any purpose is hereby granted without fee, provided that |
7 | | * the above copyright notice appear in all copies and that both that |
8 | | * copyright notice and this permission notice appear in supporting |
9 | | * documentation, and that the name of Red Hat not be used in advertising or |
10 | | * publicity pertaining to distribution of the software without specific, |
11 | | * written prior permission. Red Hat makes no representations about the |
12 | | * suitability of this software for any purpose. It is provided "as is" |
13 | | * without express or implied warranty. |
14 | | * |
15 | | * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS |
16 | | * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
17 | | * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY |
18 | | * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
19 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN |
20 | | * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
21 | | * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS |
22 | | * SOFTWARE. |
23 | | * |
24 | | * Author: Rodrigo Kumpera (kumpera@gmail.com) |
25 | | * André Tupinambá (andrelrt@gmail.com) |
26 | | * |
27 | | * Based on work by Owen Taylor and Søren Sandmann |
28 | | */ |
29 | | #ifdef HAVE_CONFIG_H |
30 | | #include <pixman-config.h> |
31 | | #endif |
32 | | |
33 | | /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */ |
34 | | #define PSHUFD_IS_FAST 0 |
35 | | |
36 | | #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ |
37 | | #include <emmintrin.h> /* for SSE2 intrinsics */ |
38 | | #include "pixman-private.h" |
39 | | #include "pixman-combine32.h" |
40 | | #include "pixman-inlines.h" |
41 | | |
42 | | static __m128i mask_0080; |
43 | | static __m128i mask_00ff; |
44 | | static __m128i mask_0101; |
45 | | static __m128i mask_ffff; |
46 | | static __m128i mask_ff000000; |
47 | | static __m128i mask_alpha; |
48 | | |
49 | | static __m128i mask_565_r; |
50 | | static __m128i mask_565_g1, mask_565_g2; |
51 | | static __m128i mask_565_b; |
52 | | static __m128i mask_red; |
53 | | static __m128i mask_green; |
54 | | static __m128i mask_blue; |
55 | | |
56 | | static __m128i mask_565_fix_rb; |
57 | | static __m128i mask_565_fix_g; |
58 | | |
59 | | static __m128i mask_565_rb; |
60 | | static __m128i mask_565_pack_multiplier; |
61 | | |
62 | | static force_inline __m128i |
63 | | unpack_32_1x128 (uint32_t data) |
64 | 739 | { |
65 | 739 | return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); |
66 | 739 | } |
67 | | |
68 | | static force_inline void |
69 | | unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) |
70 | 1.38k | { |
71 | 1.38k | *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); |
72 | 1.38k | *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); |
73 | 1.38k | } |
74 | | |
75 | | static force_inline __m128i |
76 | | unpack_565_to_8888 (__m128i lo) |
77 | 0 | { |
78 | 0 | __m128i r, g, b, rb, t; |
79 | |
|
80 | 0 | r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); |
81 | 0 | g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); |
82 | 0 | b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); |
83 | |
|
84 | 0 | rb = _mm_or_si128 (r, b); |
85 | 0 | t = _mm_and_si128 (rb, mask_565_fix_rb); |
86 | 0 | t = _mm_srli_epi32 (t, 5); |
87 | 0 | rb = _mm_or_si128 (rb, t); |
88 | |
|
89 | 0 | t = _mm_and_si128 (g, mask_565_fix_g); |
90 | 0 | t = _mm_srli_epi32 (t, 6); |
91 | 0 | g = _mm_or_si128 (g, t); |
92 | |
|
93 | 0 | return _mm_or_si128 (rb, g); |
94 | 0 | } |
95 | | |
96 | | static force_inline void |
97 | | unpack_565_128_4x128 (__m128i data, |
98 | | __m128i* data0, |
99 | | __m128i* data1, |
100 | | __m128i* data2, |
101 | | __m128i* data3) |
102 | 0 | { |
103 | 0 | __m128i lo, hi; |
104 | |
|
105 | 0 | lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); |
106 | 0 | hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); |
107 | |
|
108 | 0 | lo = unpack_565_to_8888 (lo); |
109 | 0 | hi = unpack_565_to_8888 (hi); |
110 | |
|
111 | 0 | unpack_128_2x128 (lo, data0, data1); |
112 | 0 | unpack_128_2x128 (hi, data2, data3); |
113 | 0 | } |
114 | | |
115 | | static force_inline uint16_t |
116 | | pack_565_32_16 (uint32_t pixel) |
117 | 0 | { |
118 | 0 | return (uint16_t) (((pixel >> 8) & 0xf800) | |
119 | 0 | ((pixel >> 5) & 0x07e0) | |
120 | 0 | ((pixel >> 3) & 0x001f)); |
121 | 0 | } |
122 | | |
123 | | static force_inline __m128i |
124 | | pack_2x128_128 (__m128i lo, __m128i hi) |
125 | 685 | { |
126 | 685 | return _mm_packus_epi16 (lo, hi); |
127 | 685 | } |
128 | | |
129 | | static force_inline __m128i |
130 | | pack_565_2packedx128_128 (__m128i lo, __m128i hi) |
131 | 0 | { |
132 | 0 | __m128i rb0 = _mm_and_si128 (lo, mask_565_rb); |
133 | 0 | __m128i rb1 = _mm_and_si128 (hi, mask_565_rb); |
134 | |
|
135 | 0 | __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier); |
136 | 0 | __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier); |
137 | |
|
138 | 0 | __m128i g0 = _mm_and_si128 (lo, mask_green); |
139 | 0 | __m128i g1 = _mm_and_si128 (hi, mask_green); |
140 | |
|
141 | 0 | t0 = _mm_or_si128 (t0, g0); |
142 | 0 | t1 = _mm_or_si128 (t1, g1); |
143 | | |
144 | | /* Simulates _mm_packus_epi32 */ |
145 | 0 | t0 = _mm_slli_epi32 (t0, 16 - 5); |
146 | 0 | t1 = _mm_slli_epi32 (t1, 16 - 5); |
147 | 0 | t0 = _mm_srai_epi32 (t0, 16); |
148 | 0 | t1 = _mm_srai_epi32 (t1, 16); |
149 | 0 | return _mm_packs_epi32 (t0, t1); |
150 | 0 | } |
151 | | |
152 | | static force_inline __m128i |
153 | | pack_565_2x128_128 (__m128i lo, __m128i hi) |
154 | 0 | { |
155 | 0 | __m128i data; |
156 | 0 | __m128i r, g1, g2, b; |
157 | |
|
158 | 0 | data = pack_2x128_128 (lo, hi); |
159 | |
|
160 | 0 | r = _mm_and_si128 (data, mask_565_r); |
161 | 0 | g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); |
162 | 0 | g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); |
163 | 0 | b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); |
164 | |
|
165 | 0 | return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); |
166 | 0 | } |
167 | | |
168 | | static force_inline __m128i |
169 | | pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) |
170 | 0 | { |
171 | 0 | return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), |
172 | 0 | pack_565_2x128_128 (*xmm2, *xmm3)); |
173 | 0 | } |
174 | | |
175 | | static force_inline int |
176 | | is_opaque (__m128i x) |
177 | 1.65k | { |
178 | 1.65k | __m128i ffs = _mm_cmpeq_epi8 (x, x); |
179 | | |
180 | 1.65k | return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; |
181 | 1.65k | } |
182 | | |
183 | | static force_inline int |
184 | | is_zero (__m128i x) |
185 | 4.43k | { |
186 | 4.43k | return _mm_movemask_epi8 ( |
187 | 4.43k | _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; |
188 | 4.43k | } |
189 | | |
190 | | static force_inline int |
191 | | is_transparent (__m128i x) |
192 | 0 | { |
193 | 0 | return (_mm_movemask_epi8 ( |
194 | 0 | _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; |
195 | 0 | } |
196 | | |
197 | | static force_inline __m128i |
198 | | expand_pixel_32_1x128 (uint32_t data) |
199 | 2 | { |
200 | 2 | return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); |
201 | 2 | } |
202 | | |
203 | | static force_inline __m128i |
204 | | expand_alpha_1x128 (__m128i data) |
205 | 230 | { |
206 | 230 | return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, |
207 | 230 | _MM_SHUFFLE (3, 3, 3, 3)), |
208 | 230 | _MM_SHUFFLE (3, 3, 3, 3)); |
209 | 230 | } |
210 | | |
211 | | static force_inline void |
212 | | expand_alpha_2x128 (__m128i data_lo, |
213 | | __m128i data_hi, |
214 | | __m128i* alpha_lo, |
215 | | __m128i* alpha_hi) |
216 | 558 | { |
217 | 558 | __m128i lo, hi; |
218 | | |
219 | 558 | lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); |
220 | 558 | hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); |
221 | | |
222 | 558 | *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); |
223 | 558 | *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); |
224 | 558 | } |
225 | | |
226 | | static force_inline void |
227 | | expand_alpha_rev_2x128 (__m128i data_lo, |
228 | | __m128i data_hi, |
229 | | __m128i* alpha_lo, |
230 | | __m128i* alpha_hi) |
231 | 137 | { |
232 | 137 | __m128i lo, hi; |
233 | | |
234 | 137 | lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); |
235 | 137 | hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); |
236 | 137 | *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); |
237 | 137 | *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); |
238 | 137 | } |
239 | | |
240 | | static force_inline void |
241 | | pix_multiply_2x128 (__m128i* data_lo, |
242 | | __m128i* data_hi, |
243 | | __m128i* alpha_lo, |
244 | | __m128i* alpha_hi, |
245 | | __m128i* ret_lo, |
246 | | __m128i* ret_hi) |
247 | 959 | { |
248 | 959 | __m128i lo, hi; |
249 | | |
250 | 959 | lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); |
251 | 959 | hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); |
252 | 959 | lo = _mm_adds_epu16 (lo, mask_0080); |
253 | 959 | hi = _mm_adds_epu16 (hi, mask_0080); |
254 | 959 | *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); |
255 | 959 | *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); |
256 | 959 | } |
257 | | |
258 | | static force_inline void |
259 | | pix_add_multiply_2x128 (__m128i* src_lo, |
260 | | __m128i* src_hi, |
261 | | __m128i* alpha_dst_lo, |
262 | | __m128i* alpha_dst_hi, |
263 | | __m128i* dst_lo, |
264 | | __m128i* dst_hi, |
265 | | __m128i* alpha_src_lo, |
266 | | __m128i* alpha_src_hi, |
267 | | __m128i* ret_lo, |
268 | | __m128i* ret_hi) |
269 | 0 | { |
270 | 0 | __m128i t1_lo, t1_hi; |
271 | 0 | __m128i t2_lo, t2_hi; |
272 | |
|
273 | 0 | pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); |
274 | 0 | pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); |
275 | |
|
276 | 0 | *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); |
277 | 0 | *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); |
278 | 0 | } |
279 | | |
280 | | static force_inline void |
281 | | negate_2x128 (__m128i data_lo, |
282 | | __m128i data_hi, |
283 | | __m128i* neg_lo, |
284 | | __m128i* neg_hi) |
285 | 685 | { |
286 | 685 | *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); |
287 | 685 | *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); |
288 | 685 | } |
289 | | |
290 | | static force_inline void |
291 | | invert_colors_2x128 (__m128i data_lo, |
292 | | __m128i data_hi, |
293 | | __m128i* inv_lo, |
294 | | __m128i* inv_hi) |
295 | 0 | { |
296 | 0 | __m128i lo, hi; |
297 | |
|
298 | 0 | lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); |
299 | 0 | hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); |
300 | 0 | *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); |
301 | 0 | *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); |
302 | 0 | } |
303 | | |
304 | | static force_inline void |
305 | | over_2x128 (__m128i* src_lo, |
306 | | __m128i* src_hi, |
307 | | __m128i* alpha_lo, |
308 | | __m128i* alpha_hi, |
309 | | __m128i* dst_lo, |
310 | | __m128i* dst_hi) |
311 | 685 | { |
312 | 685 | __m128i t1, t2; |
313 | | |
314 | 685 | negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); |
315 | | |
316 | 685 | pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); |
317 | | |
318 | 685 | *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); |
319 | 685 | *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); |
320 | 685 | } |
321 | | |
322 | | static force_inline void |
323 | | over_rev_non_pre_2x128 (__m128i src_lo, |
324 | | __m128i src_hi, |
325 | | __m128i* dst_lo, |
326 | | __m128i* dst_hi) |
327 | 0 | { |
328 | 0 | __m128i lo, hi; |
329 | 0 | __m128i alpha_lo, alpha_hi; |
330 | |
|
331 | 0 | expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); |
332 | |
|
333 | 0 | lo = _mm_or_si128 (alpha_lo, mask_alpha); |
334 | 0 | hi = _mm_or_si128 (alpha_hi, mask_alpha); |
335 | |
|
336 | 0 | invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); |
337 | |
|
338 | 0 | pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); |
339 | |
|
340 | 0 | over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); |
341 | 0 | } |
342 | | |
343 | | static force_inline void |
344 | | in_over_2x128 (__m128i* src_lo, |
345 | | __m128i* src_hi, |
346 | | __m128i* alpha_lo, |
347 | | __m128i* alpha_hi, |
348 | | __m128i* mask_lo, |
349 | | __m128i* mask_hi, |
350 | | __m128i* dst_lo, |
351 | | __m128i* dst_hi) |
352 | 137 | { |
353 | 137 | __m128i s_lo, s_hi; |
354 | 137 | __m128i a_lo, a_hi; |
355 | | |
356 | 137 | pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); |
357 | 137 | pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); |
358 | | |
359 | 137 | over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); |
360 | 137 | } |
361 | | |
362 | | /* load 4 pixels from a 16-byte boundary aligned address */ |
363 | | static force_inline __m128i |
364 | | load_128_aligned (__m128i* src) |
365 | 685 | { |
366 | 685 | return _mm_load_si128 (src); |
367 | 685 | } |
368 | | |
369 | | /* load 4 pixels from a unaligned address */ |
370 | | static force_inline __m128i |
371 | | load_128_unaligned (const __m128i* src) |
372 | 4.44k | { |
373 | 4.44k | return _mm_loadu_si128 (src); |
374 | 4.44k | } |
375 | | |
376 | | /* save 4 pixels on a 16-byte boundary aligned address */ |
377 | | static force_inline void |
378 | | save_128_aligned (__m128i* dst, |
379 | | __m128i data) |
380 | 95.0k | { |
381 | 95.0k | _mm_store_si128 (dst, data); |
382 | 95.0k | } |
383 | | |
384 | | static force_inline __m128i |
385 | | load_32_1x128 (uint32_t data) |
386 | 78 | { |
387 | 78 | return _mm_cvtsi32_si128 (data); |
388 | 78 | } |
389 | | |
390 | | static force_inline __m128i |
391 | | expand_alpha_rev_1x128 (__m128i data) |
392 | 78 | { |
393 | 78 | return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); |
394 | 78 | } |
395 | | |
396 | | static force_inline __m128i |
397 | | expand_pixel_8_1x128 (uint8_t data) |
398 | 33 | { |
399 | 33 | return _mm_shufflelo_epi16 ( |
400 | 33 | unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); |
401 | 33 | } |
402 | | |
403 | | static force_inline __m128i |
404 | | pix_multiply_1x128 (__m128i data, |
405 | | __m128i alpha) |
406 | 444 | { |
407 | 444 | return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), |
408 | 444 | mask_0080), |
409 | 444 | mask_0101); |
410 | 444 | } |
411 | | |
412 | | static force_inline __m128i |
413 | | pix_add_multiply_1x128 (__m128i* src, |
414 | | __m128i* alpha_dst, |
415 | | __m128i* dst, |
416 | | __m128i* alpha_src) |
417 | 0 | { |
418 | 0 | __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); |
419 | 0 | __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); |
420 | |
|
421 | 0 | return _mm_adds_epu8 (t1, t2); |
422 | 0 | } |
423 | | |
424 | | static force_inline __m128i |
425 | | negate_1x128 (__m128i data) |
426 | 300 | { |
427 | 300 | return _mm_xor_si128 (data, mask_00ff); |
428 | 300 | } |
429 | | |
430 | | static force_inline __m128i |
431 | | invert_colors_1x128 (__m128i data) |
432 | 0 | { |
433 | 0 | return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); |
434 | 0 | } |
435 | | |
436 | | static force_inline __m128i |
437 | | over_1x128 (__m128i src, __m128i alpha, __m128i dst) |
438 | 300 | { |
439 | 300 | return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); |
440 | 300 | } |
441 | | |
442 | | static force_inline __m128i |
443 | | in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) |
444 | 72 | { |
445 | 72 | return over_1x128 (pix_multiply_1x128 (*src, *mask), |
446 | 72 | pix_multiply_1x128 (*alpha, *mask), |
447 | 72 | *dst); |
448 | 72 | } |
449 | | |
450 | | static force_inline __m128i |
451 | | over_rev_non_pre_1x128 (__m128i src, __m128i dst) |
452 | 0 | { |
453 | 0 | __m128i alpha = expand_alpha_1x128 (src); |
454 | |
|
455 | 0 | return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), |
456 | 0 | _mm_or_si128 (alpha, mask_alpha)), |
457 | 0 | alpha, |
458 | 0 | dst); |
459 | 0 | } |
460 | | |
461 | | static force_inline uint32_t |
462 | | pack_1x128_32 (__m128i data) |
463 | 300 | { |
464 | 300 | return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); |
465 | 300 | } |
466 | | |
467 | | static force_inline __m128i |
468 | | expand565_16_1x128 (uint16_t pixel) |
469 | 0 | { |
470 | 0 | __m128i m = _mm_cvtsi32_si128 (pixel); |
471 | |
|
472 | 0 | m = unpack_565_to_8888 (m); |
473 | |
|
474 | 0 | return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); |
475 | 0 | } |
476 | | |
477 | | static force_inline uint32_t |
478 | | core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) |
479 | 341 | { |
480 | 341 | uint8_t a; |
481 | 341 | __m128i xmms; |
482 | | |
483 | 341 | a = src >> 24; |
484 | | |
485 | 341 | if (a == 0xff) |
486 | 113 | { |
487 | 113 | return src; |
488 | 113 | } |
489 | 228 | else if (src) |
490 | 228 | { |
491 | 228 | xmms = unpack_32_1x128 (src); |
492 | 228 | return pack_1x128_32 ( |
493 | 228 | over_1x128 (xmms, expand_alpha_1x128 (xmms), |
494 | 228 | unpack_32_1x128 (dst))); |
495 | 228 | } |
496 | | |
497 | 0 | return dst; |
498 | 341 | } |
499 | | |
500 | | static force_inline uint32_t |
501 | | combine1 (const uint32_t *ps, const uint32_t *pm) |
502 | 0 | { |
503 | 0 | uint32_t s; |
504 | 0 | memcpy(&s, ps, sizeof(uint32_t)); |
505 | |
|
506 | 0 | if (pm) |
507 | 0 | { |
508 | 0 | __m128i ms, mm; |
509 | |
|
510 | 0 | mm = unpack_32_1x128 (*pm); |
511 | 0 | mm = expand_alpha_1x128 (mm); |
512 | |
|
513 | 0 | ms = unpack_32_1x128 (s); |
514 | 0 | ms = pix_multiply_1x128 (ms, mm); |
515 | |
|
516 | 0 | s = pack_1x128_32 (ms); |
517 | 0 | } |
518 | |
|
519 | 0 | return s; |
520 | 0 | } |
521 | | |
522 | | static force_inline __m128i |
523 | | combine4 (const __m128i *ps, const __m128i *pm) |
524 | 0 | { |
525 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
526 | 0 | __m128i xmm_msk_lo, xmm_msk_hi; |
527 | 0 | __m128i s; |
528 | |
|
529 | 0 | if (pm) |
530 | 0 | { |
531 | 0 | xmm_msk_lo = load_128_unaligned (pm); |
532 | |
|
533 | 0 | if (is_transparent (xmm_msk_lo)) |
534 | 0 | return _mm_setzero_si128 (); |
535 | 0 | } |
536 | | |
537 | 0 | s = load_128_unaligned (ps); |
538 | |
|
539 | 0 | if (pm) |
540 | 0 | { |
541 | 0 | unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); |
542 | 0 | unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); |
543 | |
|
544 | 0 | expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); |
545 | |
|
546 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
547 | 0 | &xmm_msk_lo, &xmm_msk_hi, |
548 | 0 | &xmm_src_lo, &xmm_src_hi); |
549 | |
|
550 | 0 | s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); |
551 | 0 | } |
552 | |
|
553 | 0 | return s; |
554 | 0 | } |
555 | | |
556 | | static force_inline void |
557 | | core_combine_over_u_sse2_mask (uint32_t * pd, |
558 | | const uint32_t* ps, |
559 | | const uint32_t* pm, |
560 | | int w) |
561 | 0 | { |
562 | 0 | uint32_t s, d; |
563 | | |
564 | | /* Align dst on a 16-byte boundary */ |
565 | 0 | while (w && ((uintptr_t)pd & 15)) |
566 | 0 | { |
567 | 0 | d = *pd; |
568 | 0 | s = combine1 (ps, pm); |
569 | |
|
570 | 0 | if (s) |
571 | 0 | *pd = core_combine_over_u_pixel_sse2 (s, d); |
572 | 0 | pd++; |
573 | 0 | ps++; |
574 | 0 | pm++; |
575 | 0 | w--; |
576 | 0 | } |
577 | |
|
578 | 0 | while (w >= 4) |
579 | 0 | { |
580 | 0 | __m128i mask = load_128_unaligned ((__m128i *)pm); |
581 | |
|
582 | 0 | if (!is_zero (mask)) |
583 | 0 | { |
584 | 0 | __m128i src; |
585 | 0 | __m128i src_hi, src_lo; |
586 | 0 | __m128i mask_hi, mask_lo; |
587 | 0 | __m128i alpha_hi, alpha_lo; |
588 | |
|
589 | 0 | src = load_128_unaligned ((__m128i *)ps); |
590 | |
|
591 | 0 | if (is_opaque (_mm_and_si128 (src, mask))) |
592 | 0 | { |
593 | 0 | save_128_aligned ((__m128i *)pd, src); |
594 | 0 | } |
595 | 0 | else |
596 | 0 | { |
597 | 0 | __m128i dst = load_128_aligned ((__m128i *)pd); |
598 | 0 | __m128i dst_hi, dst_lo; |
599 | |
|
600 | 0 | unpack_128_2x128 (mask, &mask_lo, &mask_hi); |
601 | 0 | unpack_128_2x128 (src, &src_lo, &src_hi); |
602 | |
|
603 | 0 | expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); |
604 | 0 | pix_multiply_2x128 (&src_lo, &src_hi, |
605 | 0 | &mask_lo, &mask_hi, |
606 | 0 | &src_lo, &src_hi); |
607 | |
|
608 | 0 | unpack_128_2x128 (dst, &dst_lo, &dst_hi); |
609 | |
|
610 | 0 | expand_alpha_2x128 (src_lo, src_hi, |
611 | 0 | &alpha_lo, &alpha_hi); |
612 | |
|
613 | 0 | over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, |
614 | 0 | &dst_lo, &dst_hi); |
615 | |
|
616 | 0 | save_128_aligned ( |
617 | 0 | (__m128i *)pd, |
618 | 0 | pack_2x128_128 (dst_lo, dst_hi)); |
619 | 0 | } |
620 | 0 | } |
621 | |
|
622 | 0 | pm += 4; |
623 | 0 | ps += 4; |
624 | 0 | pd += 4; |
625 | 0 | w -= 4; |
626 | 0 | } |
627 | 0 | while (w) |
628 | 0 | { |
629 | 0 | d = *pd; |
630 | 0 | s = combine1 (ps, pm); |
631 | |
|
632 | 0 | if (s) |
633 | 0 | *pd = core_combine_over_u_pixel_sse2 (s, d); |
634 | 0 | pd++; |
635 | 0 | ps++; |
636 | 0 | pm++; |
637 | |
|
638 | 0 | w--; |
639 | 0 | } |
640 | 0 | } |
641 | | |
642 | | static force_inline void |
643 | | core_combine_over_u_sse2_no_mask (uint32_t * pd, |
644 | | const uint32_t* ps, |
645 | | int w) |
646 | 249 | { |
647 | 249 | uint32_t s, d; |
648 | | |
649 | | /* Align dst on a 16-byte boundary */ |
650 | 621 | while (w && ((uintptr_t)pd & 15)) |
651 | 372 | { |
652 | 372 | d = *pd; |
653 | 372 | s = *ps; |
654 | | |
655 | 372 | if (s) |
656 | 152 | *pd = core_combine_over_u_pixel_sse2 (s, d); |
657 | 372 | pd++; |
658 | 372 | ps++; |
659 | 372 | w--; |
660 | 372 | } |
661 | | |
662 | 4.68k | while (w >= 4) |
663 | 4.43k | { |
664 | 4.43k | __m128i src; |
665 | 4.43k | __m128i src_hi, src_lo, dst_hi, dst_lo; |
666 | 4.43k | __m128i alpha_hi, alpha_lo; |
667 | | |
668 | 4.43k | src = load_128_unaligned ((__m128i *)ps); |
669 | | |
670 | 4.43k | if (!is_zero (src)) |
671 | 1.65k | { |
672 | 1.65k | if (is_opaque (src)) |
673 | 1.11k | { |
674 | 1.11k | save_128_aligned ((__m128i *)pd, src); |
675 | 1.11k | } |
676 | 548 | else |
677 | 548 | { |
678 | 548 | __m128i dst = load_128_aligned ((__m128i *)pd); |
679 | | |
680 | 548 | unpack_128_2x128 (src, &src_lo, &src_hi); |
681 | 548 | unpack_128_2x128 (dst, &dst_lo, &dst_hi); |
682 | | |
683 | 548 | expand_alpha_2x128 (src_lo, src_hi, |
684 | 548 | &alpha_lo, &alpha_hi); |
685 | 548 | over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, |
686 | 548 | &dst_lo, &dst_hi); |
687 | | |
688 | 548 | save_128_aligned ( |
689 | 548 | (__m128i *)pd, |
690 | 548 | pack_2x128_128 (dst_lo, dst_hi)); |
691 | 548 | } |
692 | 1.65k | } |
693 | | |
694 | 4.43k | ps += 4; |
695 | 4.43k | pd += 4; |
696 | 4.43k | w -= 4; |
697 | 4.43k | } |
698 | 691 | while (w) |
699 | 442 | { |
700 | 442 | d = *pd; |
701 | 442 | s = *ps; |
702 | | |
703 | 442 | if (s) |
704 | 189 | *pd = core_combine_over_u_pixel_sse2 (s, d); |
705 | 442 | pd++; |
706 | 442 | ps++; |
707 | | |
708 | 442 | w--; |
709 | 442 | } |
710 | 249 | } |
711 | | |
712 | | static force_inline void |
713 | | sse2_combine_over_u (pixman_implementation_t *imp, |
714 | | pixman_op_t op, |
715 | | uint32_t * pd, |
716 | | const uint32_t * ps, |
717 | | const uint32_t * pm, |
718 | | int w) |
719 | 249 | { |
720 | 249 | if (pm) |
721 | 0 | core_combine_over_u_sse2_mask (pd, ps, pm, w); |
722 | 249 | else |
723 | 249 | core_combine_over_u_sse2_no_mask (pd, ps, w); |
724 | 249 | } |
725 | | |
726 | | static void |
727 | | sse2_combine_over_reverse_u (pixman_implementation_t *imp, |
728 | | pixman_op_t op, |
729 | | uint32_t * pd, |
730 | | const uint32_t * ps, |
731 | | const uint32_t * pm, |
732 | | int w) |
733 | 0 | { |
734 | 0 | uint32_t s, d; |
735 | |
|
736 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
737 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
738 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
739 | | |
740 | | /* Align dst on a 16-byte boundary */ |
741 | 0 | while (w && |
742 | 0 | ((uintptr_t)pd & 15)) |
743 | 0 | { |
744 | 0 | d = *pd; |
745 | 0 | s = combine1 (ps, pm); |
746 | |
|
747 | 0 | *pd++ = core_combine_over_u_pixel_sse2 (d, s); |
748 | 0 | w--; |
749 | 0 | ps++; |
750 | 0 | if (pm) |
751 | 0 | pm++; |
752 | 0 | } |
753 | |
|
754 | 0 | while (w >= 4) |
755 | 0 | { |
756 | | /* I'm loading unaligned because I'm not sure |
757 | | * about the address alignment. |
758 | | */ |
759 | 0 | xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); |
760 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
761 | |
|
762 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
763 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
764 | |
|
765 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
766 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
767 | |
|
768 | 0 | over_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
769 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
770 | 0 | &xmm_src_lo, &xmm_src_hi); |
771 | | |
772 | | /* rebuid the 4 pixel data and save*/ |
773 | 0 | save_128_aligned ((__m128i*)pd, |
774 | 0 | pack_2x128_128 (xmm_src_lo, xmm_src_hi)); |
775 | |
|
776 | 0 | w -= 4; |
777 | 0 | ps += 4; |
778 | 0 | pd += 4; |
779 | |
|
780 | 0 | if (pm) |
781 | 0 | pm += 4; |
782 | 0 | } |
783 | |
|
784 | 0 | while (w) |
785 | 0 | { |
786 | 0 | d = *pd; |
787 | 0 | s = combine1 (ps, pm); |
788 | |
|
789 | 0 | *pd++ = core_combine_over_u_pixel_sse2 (d, s); |
790 | 0 | ps++; |
791 | 0 | w--; |
792 | 0 | if (pm) |
793 | 0 | pm++; |
794 | 0 | } |
795 | 0 | } |
796 | | |
797 | | static force_inline uint32_t |
798 | | core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) |
799 | 0 | { |
800 | 0 | uint32_t maska = src >> 24; |
801 | |
|
802 | 0 | if (maska == 0) |
803 | 0 | { |
804 | 0 | return 0; |
805 | 0 | } |
806 | 0 | else if (maska != 0xff) |
807 | 0 | { |
808 | 0 | return pack_1x128_32 ( |
809 | 0 | pix_multiply_1x128 (unpack_32_1x128 (dst), |
810 | 0 | expand_alpha_1x128 (unpack_32_1x128 (src)))); |
811 | 0 | } |
812 | | |
813 | 0 | return dst; |
814 | 0 | } |
815 | | |
816 | | static void |
817 | | sse2_combine_in_u (pixman_implementation_t *imp, |
818 | | pixman_op_t op, |
819 | | uint32_t * pd, |
820 | | const uint32_t * ps, |
821 | | const uint32_t * pm, |
822 | | int w) |
823 | 0 | { |
824 | 0 | uint32_t s, d; |
825 | |
|
826 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
827 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
828 | |
|
829 | 0 | while (w && ((uintptr_t)pd & 15)) |
830 | 0 | { |
831 | 0 | s = combine1 (ps, pm); |
832 | 0 | d = *pd; |
833 | |
|
834 | 0 | *pd++ = core_combine_in_u_pixel_sse2 (d, s); |
835 | 0 | w--; |
836 | 0 | ps++; |
837 | 0 | if (pm) |
838 | 0 | pm++; |
839 | 0 | } |
840 | |
|
841 | 0 | while (w >= 4) |
842 | 0 | { |
843 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
844 | 0 | xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); |
845 | |
|
846 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
847 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
848 | |
|
849 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
850 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
851 | 0 | &xmm_dst_lo, &xmm_dst_hi, |
852 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
853 | |
|
854 | 0 | save_128_aligned ((__m128i*)pd, |
855 | 0 | pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
856 | |
|
857 | 0 | ps += 4; |
858 | 0 | pd += 4; |
859 | 0 | w -= 4; |
860 | 0 | if (pm) |
861 | 0 | pm += 4; |
862 | 0 | } |
863 | |
|
864 | 0 | while (w) |
865 | 0 | { |
866 | 0 | s = combine1 (ps, pm); |
867 | 0 | d = *pd; |
868 | |
|
869 | 0 | *pd++ = core_combine_in_u_pixel_sse2 (d, s); |
870 | 0 | w--; |
871 | 0 | ps++; |
872 | 0 | if (pm) |
873 | 0 | pm++; |
874 | 0 | } |
875 | 0 | } |
876 | | |
877 | | static void |
878 | | sse2_combine_in_reverse_u (pixman_implementation_t *imp, |
879 | | pixman_op_t op, |
880 | | uint32_t * pd, |
881 | | const uint32_t * ps, |
882 | | const uint32_t * pm, |
883 | | int w) |
884 | 0 | { |
885 | 0 | uint32_t s, d; |
886 | |
|
887 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
888 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
889 | |
|
890 | 0 | while (w && ((uintptr_t)pd & 15)) |
891 | 0 | { |
892 | 0 | s = combine1 (ps, pm); |
893 | 0 | d = *pd; |
894 | |
|
895 | 0 | *pd++ = core_combine_in_u_pixel_sse2 (s, d); |
896 | 0 | ps++; |
897 | 0 | w--; |
898 | 0 | if (pm) |
899 | 0 | pm++; |
900 | 0 | } |
901 | |
|
902 | 0 | while (w >= 4) |
903 | 0 | { |
904 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
905 | 0 | xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); |
906 | |
|
907 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
908 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
909 | |
|
910 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
911 | 0 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
912 | 0 | &xmm_src_lo, &xmm_src_hi, |
913 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
914 | |
|
915 | 0 | save_128_aligned ( |
916 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
917 | |
|
918 | 0 | ps += 4; |
919 | 0 | pd += 4; |
920 | 0 | w -= 4; |
921 | 0 | if (pm) |
922 | 0 | pm += 4; |
923 | 0 | } |
924 | |
|
925 | 0 | while (w) |
926 | 0 | { |
927 | 0 | s = combine1 (ps, pm); |
928 | 0 | d = *pd; |
929 | |
|
930 | 0 | *pd++ = core_combine_in_u_pixel_sse2 (s, d); |
931 | 0 | w--; |
932 | 0 | ps++; |
933 | 0 | if (pm) |
934 | 0 | pm++; |
935 | 0 | } |
936 | 0 | } |
937 | | |
938 | | static void |
939 | | sse2_combine_out_reverse_u (pixman_implementation_t *imp, |
940 | | pixman_op_t op, |
941 | | uint32_t * pd, |
942 | | const uint32_t * ps, |
943 | | const uint32_t * pm, |
944 | | int w) |
945 | 0 | { |
946 | 0 | while (w && ((uintptr_t)pd & 15)) |
947 | 0 | { |
948 | 0 | uint32_t s = combine1 (ps, pm); |
949 | 0 | uint32_t d = *pd; |
950 | |
|
951 | 0 | *pd++ = pack_1x128_32 ( |
952 | 0 | pix_multiply_1x128 ( |
953 | 0 | unpack_32_1x128 (d), negate_1x128 ( |
954 | 0 | expand_alpha_1x128 (unpack_32_1x128 (s))))); |
955 | |
|
956 | 0 | if (pm) |
957 | 0 | pm++; |
958 | 0 | ps++; |
959 | 0 | w--; |
960 | 0 | } |
961 | |
|
962 | 0 | while (w >= 4) |
963 | 0 | { |
964 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
965 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
966 | |
|
967 | 0 | xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); |
968 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
969 | |
|
970 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
971 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
972 | |
|
973 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
974 | 0 | negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
975 | |
|
976 | 0 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
977 | 0 | &xmm_src_lo, &xmm_src_hi, |
978 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
979 | |
|
980 | 0 | save_128_aligned ( |
981 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
982 | |
|
983 | 0 | ps += 4; |
984 | 0 | pd += 4; |
985 | 0 | if (pm) |
986 | 0 | pm += 4; |
987 | |
|
988 | 0 | w -= 4; |
989 | 0 | } |
990 | |
|
991 | 0 | while (w) |
992 | 0 | { |
993 | 0 | uint32_t s = combine1 (ps, pm); |
994 | 0 | uint32_t d = *pd; |
995 | |
|
996 | 0 | *pd++ = pack_1x128_32 ( |
997 | 0 | pix_multiply_1x128 ( |
998 | 0 | unpack_32_1x128 (d), negate_1x128 ( |
999 | 0 | expand_alpha_1x128 (unpack_32_1x128 (s))))); |
1000 | 0 | ps++; |
1001 | 0 | if (pm) |
1002 | 0 | pm++; |
1003 | 0 | w--; |
1004 | 0 | } |
1005 | 0 | } |
1006 | | |
1007 | | static void |
1008 | | sse2_combine_out_u (pixman_implementation_t *imp, |
1009 | | pixman_op_t op, |
1010 | | uint32_t * pd, |
1011 | | const uint32_t * ps, |
1012 | | const uint32_t * pm, |
1013 | | int w) |
1014 | 0 | { |
1015 | 0 | while (w && ((uintptr_t)pd & 15)) |
1016 | 0 | { |
1017 | 0 | uint32_t s = combine1 (ps, pm); |
1018 | 0 | uint32_t d = *pd; |
1019 | |
|
1020 | 0 | *pd++ = pack_1x128_32 ( |
1021 | 0 | pix_multiply_1x128 ( |
1022 | 0 | unpack_32_1x128 (s), negate_1x128 ( |
1023 | 0 | expand_alpha_1x128 (unpack_32_1x128 (d))))); |
1024 | 0 | w--; |
1025 | 0 | ps++; |
1026 | 0 | if (pm) |
1027 | 0 | pm++; |
1028 | 0 | } |
1029 | |
|
1030 | 0 | while (w >= 4) |
1031 | 0 | { |
1032 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1033 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1034 | |
|
1035 | 0 | xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); |
1036 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
1037 | |
|
1038 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1039 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1040 | |
|
1041 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1042 | 0 | negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1043 | |
|
1044 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
1045 | 0 | &xmm_dst_lo, &xmm_dst_hi, |
1046 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1047 | |
|
1048 | 0 | save_128_aligned ( |
1049 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1050 | |
|
1051 | 0 | ps += 4; |
1052 | 0 | pd += 4; |
1053 | 0 | w -= 4; |
1054 | 0 | if (pm) |
1055 | 0 | pm += 4; |
1056 | 0 | } |
1057 | |
|
1058 | 0 | while (w) |
1059 | 0 | { |
1060 | 0 | uint32_t s = combine1 (ps, pm); |
1061 | 0 | uint32_t d = *pd; |
1062 | |
|
1063 | 0 | *pd++ = pack_1x128_32 ( |
1064 | 0 | pix_multiply_1x128 ( |
1065 | 0 | unpack_32_1x128 (s), negate_1x128 ( |
1066 | 0 | expand_alpha_1x128 (unpack_32_1x128 (d))))); |
1067 | 0 | w--; |
1068 | 0 | ps++; |
1069 | 0 | if (pm) |
1070 | 0 | pm++; |
1071 | 0 | } |
1072 | 0 | } |
1073 | | |
1074 | | static force_inline uint32_t |
1075 | | core_combine_atop_u_pixel_sse2 (uint32_t src, |
1076 | | uint32_t dst) |
1077 | 0 | { |
1078 | 0 | __m128i s = unpack_32_1x128 (src); |
1079 | 0 | __m128i d = unpack_32_1x128 (dst); |
1080 | |
|
1081 | 0 | __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); |
1082 | 0 | __m128i da = expand_alpha_1x128 (d); |
1083 | |
|
1084 | 0 | return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); |
1085 | 0 | } |
1086 | | |
1087 | | static void |
1088 | | sse2_combine_atop_u (pixman_implementation_t *imp, |
1089 | | pixman_op_t op, |
1090 | | uint32_t * pd, |
1091 | | const uint32_t * ps, |
1092 | | const uint32_t * pm, |
1093 | | int w) |
1094 | 0 | { |
1095 | 0 | uint32_t s, d; |
1096 | |
|
1097 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1098 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1099 | 0 | __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
1100 | 0 | __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
1101 | |
|
1102 | 0 | while (w && ((uintptr_t)pd & 15)) |
1103 | 0 | { |
1104 | 0 | s = combine1 (ps, pm); |
1105 | 0 | d = *pd; |
1106 | |
|
1107 | 0 | *pd++ = core_combine_atop_u_pixel_sse2 (s, d); |
1108 | 0 | w--; |
1109 | 0 | ps++; |
1110 | 0 | if (pm) |
1111 | 0 | pm++; |
1112 | 0 | } |
1113 | |
|
1114 | 0 | while (w >= 4) |
1115 | 0 | { |
1116 | 0 | xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); |
1117 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
1118 | |
|
1119 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1120 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1121 | |
|
1122 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
1123 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
1124 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
1125 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
1126 | |
|
1127 | 0 | negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, |
1128 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
1129 | |
|
1130 | 0 | pix_add_multiply_2x128 ( |
1131 | 0 | &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
1132 | 0 | &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
1133 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1134 | |
|
1135 | 0 | save_128_aligned ( |
1136 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1137 | |
|
1138 | 0 | ps += 4; |
1139 | 0 | pd += 4; |
1140 | 0 | w -= 4; |
1141 | 0 | if (pm) |
1142 | 0 | pm += 4; |
1143 | 0 | } |
1144 | |
|
1145 | 0 | while (w) |
1146 | 0 | { |
1147 | 0 | s = combine1 (ps, pm); |
1148 | 0 | d = *pd; |
1149 | |
|
1150 | 0 | *pd++ = core_combine_atop_u_pixel_sse2 (s, d); |
1151 | 0 | w--; |
1152 | 0 | ps++; |
1153 | 0 | if (pm) |
1154 | 0 | pm++; |
1155 | 0 | } |
1156 | 0 | } |
1157 | | |
1158 | | static force_inline uint32_t |
1159 | | core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, |
1160 | | uint32_t dst) |
1161 | 0 | { |
1162 | 0 | __m128i s = unpack_32_1x128 (src); |
1163 | 0 | __m128i d = unpack_32_1x128 (dst); |
1164 | |
|
1165 | 0 | __m128i sa = expand_alpha_1x128 (s); |
1166 | 0 | __m128i da = negate_1x128 (expand_alpha_1x128 (d)); |
1167 | |
|
1168 | 0 | return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); |
1169 | 0 | } |
1170 | | |
1171 | | static void |
1172 | | sse2_combine_atop_reverse_u (pixman_implementation_t *imp, |
1173 | | pixman_op_t op, |
1174 | | uint32_t * pd, |
1175 | | const uint32_t * ps, |
1176 | | const uint32_t * pm, |
1177 | | int w) |
1178 | 0 | { |
1179 | 0 | uint32_t s, d; |
1180 | |
|
1181 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1182 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1183 | 0 | __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
1184 | 0 | __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
1185 | |
|
1186 | 0 | while (w && ((uintptr_t)pd & 15)) |
1187 | 0 | { |
1188 | 0 | s = combine1 (ps, pm); |
1189 | 0 | d = *pd; |
1190 | |
|
1191 | 0 | *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); |
1192 | 0 | ps++; |
1193 | 0 | w--; |
1194 | 0 | if (pm) |
1195 | 0 | pm++; |
1196 | 0 | } |
1197 | |
|
1198 | 0 | while (w >= 4) |
1199 | 0 | { |
1200 | 0 | xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); |
1201 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
1202 | |
|
1203 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1204 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1205 | |
|
1206 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
1207 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
1208 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
1209 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
1210 | |
|
1211 | 0 | negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, |
1212 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
1213 | |
|
1214 | 0 | pix_add_multiply_2x128 ( |
1215 | 0 | &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
1216 | 0 | &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
1217 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1218 | |
|
1219 | 0 | save_128_aligned ( |
1220 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1221 | |
|
1222 | 0 | ps += 4; |
1223 | 0 | pd += 4; |
1224 | 0 | w -= 4; |
1225 | 0 | if (pm) |
1226 | 0 | pm += 4; |
1227 | 0 | } |
1228 | |
|
1229 | 0 | while (w) |
1230 | 0 | { |
1231 | 0 | s = combine1 (ps, pm); |
1232 | 0 | d = *pd; |
1233 | |
|
1234 | 0 | *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); |
1235 | 0 | ps++; |
1236 | 0 | w--; |
1237 | 0 | if (pm) |
1238 | 0 | pm++; |
1239 | 0 | } |
1240 | 0 | } |
1241 | | |
1242 | | static force_inline uint32_t |
1243 | | core_combine_xor_u_pixel_sse2 (uint32_t src, |
1244 | | uint32_t dst) |
1245 | 0 | { |
1246 | 0 | __m128i s = unpack_32_1x128 (src); |
1247 | 0 | __m128i d = unpack_32_1x128 (dst); |
1248 | |
|
1249 | 0 | __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); |
1250 | 0 | __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); |
1251 | |
|
1252 | 0 | return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); |
1253 | 0 | } |
1254 | | |
1255 | | static void |
1256 | | sse2_combine_xor_u (pixman_implementation_t *imp, |
1257 | | pixman_op_t op, |
1258 | | uint32_t * dst, |
1259 | | const uint32_t * src, |
1260 | | const uint32_t * mask, |
1261 | | int width) |
1262 | 0 | { |
1263 | 0 | int w = width; |
1264 | 0 | uint32_t s, d; |
1265 | 0 | uint32_t* pd = dst; |
1266 | 0 | const uint32_t* ps = src; |
1267 | 0 | const uint32_t* pm = mask; |
1268 | |
|
1269 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
1270 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
1271 | 0 | __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
1272 | 0 | __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
1273 | |
|
1274 | 0 | while (w && ((uintptr_t)pd & 15)) |
1275 | 0 | { |
1276 | 0 | s = combine1 (ps, pm); |
1277 | 0 | d = *pd; |
1278 | |
|
1279 | 0 | *pd++ = core_combine_xor_u_pixel_sse2 (s, d); |
1280 | 0 | w--; |
1281 | 0 | ps++; |
1282 | 0 | if (pm) |
1283 | 0 | pm++; |
1284 | 0 | } |
1285 | |
|
1286 | 0 | while (w >= 4) |
1287 | 0 | { |
1288 | 0 | xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); |
1289 | 0 | xmm_dst = load_128_aligned ((__m128i*) pd); |
1290 | |
|
1291 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
1292 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
1293 | |
|
1294 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
1295 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
1296 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
1297 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
1298 | |
|
1299 | 0 | negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, |
1300 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
1301 | 0 | negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, |
1302 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
1303 | |
|
1304 | 0 | pix_add_multiply_2x128 ( |
1305 | 0 | &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
1306 | 0 | &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
1307 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1308 | |
|
1309 | 0 | save_128_aligned ( |
1310 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1311 | |
|
1312 | 0 | ps += 4; |
1313 | 0 | pd += 4; |
1314 | 0 | w -= 4; |
1315 | 0 | if (pm) |
1316 | 0 | pm += 4; |
1317 | 0 | } |
1318 | |
|
1319 | 0 | while (w) |
1320 | 0 | { |
1321 | 0 | s = combine1 (ps, pm); |
1322 | 0 | d = *pd; |
1323 | |
|
1324 | 0 | *pd++ = core_combine_xor_u_pixel_sse2 (s, d); |
1325 | 0 | w--; |
1326 | 0 | ps++; |
1327 | 0 | if (pm) |
1328 | 0 | pm++; |
1329 | 0 | } |
1330 | 0 | } |
1331 | | |
1332 | | static force_inline void |
1333 | | sse2_combine_add_u (pixman_implementation_t *imp, |
1334 | | pixman_op_t op, |
1335 | | uint32_t * dst, |
1336 | | const uint32_t * src, |
1337 | | const uint32_t * mask, |
1338 | | int width) |
1339 | 0 | { |
1340 | 0 | int w = width; |
1341 | 0 | uint32_t s, d; |
1342 | 0 | uint32_t* pd = dst; |
1343 | 0 | const uint32_t* ps = src; |
1344 | 0 | const uint32_t* pm = mask; |
1345 | |
|
1346 | 0 | while (w && (uintptr_t)pd & 15) |
1347 | 0 | { |
1348 | 0 | s = combine1 (ps, pm); |
1349 | 0 | d = *pd; |
1350 | |
|
1351 | 0 | ps++; |
1352 | 0 | if (pm) |
1353 | 0 | pm++; |
1354 | 0 | *pd++ = _mm_cvtsi128_si32 ( |
1355 | 0 | _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); |
1356 | 0 | w--; |
1357 | 0 | } |
1358 | |
|
1359 | 0 | while (w >= 4) |
1360 | 0 | { |
1361 | 0 | __m128i s; |
1362 | |
|
1363 | 0 | s = combine4 ((__m128i*)ps, (__m128i*)pm); |
1364 | |
|
1365 | 0 | save_128_aligned ( |
1366 | 0 | (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); |
1367 | |
|
1368 | 0 | pd += 4; |
1369 | 0 | ps += 4; |
1370 | 0 | if (pm) |
1371 | 0 | pm += 4; |
1372 | 0 | w -= 4; |
1373 | 0 | } |
1374 | |
|
1375 | 0 | while (w--) |
1376 | 0 | { |
1377 | 0 | s = combine1 (ps, pm); |
1378 | 0 | d = *pd; |
1379 | |
|
1380 | 0 | ps++; |
1381 | 0 | *pd++ = _mm_cvtsi128_si32 ( |
1382 | 0 | _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); |
1383 | 0 | if (pm) |
1384 | 0 | pm++; |
1385 | 0 | } |
1386 | 0 | } |
1387 | | |
1388 | | static force_inline uint32_t |
1389 | | core_combine_saturate_u_pixel_sse2 (uint32_t src, |
1390 | | uint32_t dst) |
1391 | 0 | { |
1392 | 0 | __m128i ms = unpack_32_1x128 (src); |
1393 | 0 | __m128i md = unpack_32_1x128 (dst); |
1394 | 0 | uint32_t sa = src >> 24; |
1395 | 0 | uint32_t da = ~dst >> 24; |
1396 | |
|
1397 | 0 | if (sa > da) |
1398 | 0 | { |
1399 | 0 | ms = pix_multiply_1x128 ( |
1400 | 0 | ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); |
1401 | 0 | } |
1402 | |
|
1403 | 0 | return pack_1x128_32 (_mm_adds_epu16 (md, ms)); |
1404 | 0 | } |
1405 | | |
1406 | | static void |
1407 | | sse2_combine_saturate_u (pixman_implementation_t *imp, |
1408 | | pixman_op_t op, |
1409 | | uint32_t * pd, |
1410 | | const uint32_t * ps, |
1411 | | const uint32_t * pm, |
1412 | | int w) |
1413 | 0 | { |
1414 | 0 | uint32_t s, d; |
1415 | |
|
1416 | 0 | uint32_t pack_cmp; |
1417 | 0 | __m128i xmm_src, xmm_dst; |
1418 | |
|
1419 | 0 | while (w && (uintptr_t)pd & 15) |
1420 | 0 | { |
1421 | 0 | s = combine1 (ps, pm); |
1422 | 0 | d = *pd; |
1423 | |
|
1424 | 0 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
1425 | 0 | w--; |
1426 | 0 | ps++; |
1427 | 0 | if (pm) |
1428 | 0 | pm++; |
1429 | 0 | } |
1430 | |
|
1431 | 0 | while (w >= 4) |
1432 | 0 | { |
1433 | 0 | xmm_dst = load_128_aligned ((__m128i*)pd); |
1434 | 0 | xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); |
1435 | |
|
1436 | 0 | pack_cmp = _mm_movemask_epi8 ( |
1437 | 0 | _mm_cmpgt_epi32 ( |
1438 | 0 | _mm_srli_epi32 (xmm_src, 24), |
1439 | 0 | _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); |
1440 | | |
1441 | | /* if some alpha src is grater than respective ~alpha dst */ |
1442 | 0 | if (pack_cmp) |
1443 | 0 | { |
1444 | 0 | s = combine1 (ps++, pm); |
1445 | 0 | d = *pd; |
1446 | 0 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
1447 | 0 | if (pm) |
1448 | 0 | pm++; |
1449 | |
|
1450 | 0 | s = combine1 (ps++, pm); |
1451 | 0 | d = *pd; |
1452 | 0 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
1453 | 0 | if (pm) |
1454 | 0 | pm++; |
1455 | |
|
1456 | 0 | s = combine1 (ps++, pm); |
1457 | 0 | d = *pd; |
1458 | 0 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
1459 | 0 | if (pm) |
1460 | 0 | pm++; |
1461 | |
|
1462 | 0 | s = combine1 (ps++, pm); |
1463 | 0 | d = *pd; |
1464 | 0 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
1465 | 0 | if (pm) |
1466 | 0 | pm++; |
1467 | 0 | } |
1468 | 0 | else |
1469 | 0 | { |
1470 | 0 | save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); |
1471 | |
|
1472 | 0 | pd += 4; |
1473 | 0 | ps += 4; |
1474 | 0 | if (pm) |
1475 | 0 | pm += 4; |
1476 | 0 | } |
1477 | |
|
1478 | 0 | w -= 4; |
1479 | 0 | } |
1480 | |
|
1481 | 0 | while (w--) |
1482 | 0 | { |
1483 | 0 | s = combine1 (ps, pm); |
1484 | 0 | d = *pd; |
1485 | |
|
1486 | 0 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
1487 | 0 | ps++; |
1488 | 0 | if (pm) |
1489 | 0 | pm++; |
1490 | 0 | } |
1491 | 0 | } |
1492 | | |
1493 | | static void |
1494 | | sse2_combine_src_ca (pixman_implementation_t *imp, |
1495 | | pixman_op_t op, |
1496 | | uint32_t * pd, |
1497 | | const uint32_t * ps, |
1498 | | const uint32_t * pm, |
1499 | | int w) |
1500 | 0 | { |
1501 | 0 | uint32_t s, m; |
1502 | |
|
1503 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1504 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
1505 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1506 | |
|
1507 | 0 | while (w && (uintptr_t)pd & 15) |
1508 | 0 | { |
1509 | 0 | s = *ps++; |
1510 | 0 | m = *pm++; |
1511 | 0 | *pd++ = pack_1x128_32 ( |
1512 | 0 | pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); |
1513 | 0 | w--; |
1514 | 0 | } |
1515 | |
|
1516 | 0 | while (w >= 4) |
1517 | 0 | { |
1518 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
1519 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
1520 | |
|
1521 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1522 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
1523 | |
|
1524 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
1525 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
1526 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1527 | |
|
1528 | 0 | save_128_aligned ( |
1529 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1530 | |
|
1531 | 0 | ps += 4; |
1532 | 0 | pd += 4; |
1533 | 0 | pm += 4; |
1534 | 0 | w -= 4; |
1535 | 0 | } |
1536 | |
|
1537 | 0 | while (w) |
1538 | 0 | { |
1539 | 0 | s = *ps++; |
1540 | 0 | m = *pm++; |
1541 | 0 | *pd++ = pack_1x128_32 ( |
1542 | 0 | pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); |
1543 | 0 | w--; |
1544 | 0 | } |
1545 | 0 | } |
1546 | | |
1547 | | static force_inline uint32_t |
1548 | | core_combine_over_ca_pixel_sse2 (uint32_t src, |
1549 | | uint32_t mask, |
1550 | | uint32_t dst) |
1551 | 0 | { |
1552 | 0 | __m128i s = unpack_32_1x128 (src); |
1553 | 0 | __m128i expAlpha = expand_alpha_1x128 (s); |
1554 | 0 | __m128i unpk_mask = unpack_32_1x128 (mask); |
1555 | 0 | __m128i unpk_dst = unpack_32_1x128 (dst); |
1556 | |
|
1557 | 0 | return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); |
1558 | 0 | } |
1559 | | |
1560 | | static void |
1561 | | sse2_combine_over_ca (pixman_implementation_t *imp, |
1562 | | pixman_op_t op, |
1563 | | uint32_t * pd, |
1564 | | const uint32_t * ps, |
1565 | | const uint32_t * pm, |
1566 | | int w) |
1567 | 0 | { |
1568 | 0 | uint32_t s, m, d; |
1569 | |
|
1570 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
1571 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1572 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1573 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
1574 | |
|
1575 | 0 | while (w && (uintptr_t)pd & 15) |
1576 | 0 | { |
1577 | 0 | s = *ps++; |
1578 | 0 | m = *pm++; |
1579 | 0 | d = *pd; |
1580 | |
|
1581 | 0 | *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); |
1582 | 0 | w--; |
1583 | 0 | } |
1584 | |
|
1585 | 0 | while (w >= 4) |
1586 | 0 | { |
1587 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
1588 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
1589 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
1590 | |
|
1591 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1592 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1593 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
1594 | |
|
1595 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
1596 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
1597 | |
|
1598 | 0 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
1599 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
1600 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
1601 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1602 | |
|
1603 | 0 | save_128_aligned ( |
1604 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1605 | |
|
1606 | 0 | ps += 4; |
1607 | 0 | pd += 4; |
1608 | 0 | pm += 4; |
1609 | 0 | w -= 4; |
1610 | 0 | } |
1611 | |
|
1612 | 0 | while (w) |
1613 | 0 | { |
1614 | 0 | s = *ps++; |
1615 | 0 | m = *pm++; |
1616 | 0 | d = *pd; |
1617 | |
|
1618 | 0 | *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); |
1619 | 0 | w--; |
1620 | 0 | } |
1621 | 0 | } |
1622 | | |
1623 | | static force_inline uint32_t |
1624 | | core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, |
1625 | | uint32_t mask, |
1626 | | uint32_t dst) |
1627 | 0 | { |
1628 | 0 | __m128i d = unpack_32_1x128 (dst); |
1629 | |
|
1630 | 0 | return pack_1x128_32 ( |
1631 | 0 | over_1x128 (d, expand_alpha_1x128 (d), |
1632 | 0 | pix_multiply_1x128 (unpack_32_1x128 (src), |
1633 | 0 | unpack_32_1x128 (mask)))); |
1634 | 0 | } |
1635 | | |
1636 | | static void |
1637 | | sse2_combine_over_reverse_ca (pixman_implementation_t *imp, |
1638 | | pixman_op_t op, |
1639 | | uint32_t * pd, |
1640 | | const uint32_t * ps, |
1641 | | const uint32_t * pm, |
1642 | | int w) |
1643 | 0 | { |
1644 | 0 | uint32_t s, m, d; |
1645 | |
|
1646 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
1647 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1648 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1649 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
1650 | |
|
1651 | 0 | while (w && (uintptr_t)pd & 15) |
1652 | 0 | { |
1653 | 0 | s = *ps++; |
1654 | 0 | m = *pm++; |
1655 | 0 | d = *pd; |
1656 | |
|
1657 | 0 | *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); |
1658 | 0 | w--; |
1659 | 0 | } |
1660 | |
|
1661 | 0 | while (w >= 4) |
1662 | 0 | { |
1663 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
1664 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
1665 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
1666 | |
|
1667 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1668 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1669 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
1670 | |
|
1671 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
1672 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
1673 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
1674 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
1675 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
1676 | |
|
1677 | 0 | over_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
1678 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
1679 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
1680 | |
|
1681 | 0 | save_128_aligned ( |
1682 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); |
1683 | |
|
1684 | 0 | ps += 4; |
1685 | 0 | pd += 4; |
1686 | 0 | pm += 4; |
1687 | 0 | w -= 4; |
1688 | 0 | } |
1689 | |
|
1690 | 0 | while (w) |
1691 | 0 | { |
1692 | 0 | s = *ps++; |
1693 | 0 | m = *pm++; |
1694 | 0 | d = *pd; |
1695 | |
|
1696 | 0 | *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); |
1697 | 0 | w--; |
1698 | 0 | } |
1699 | 0 | } |
1700 | | |
1701 | | static void |
1702 | | sse2_combine_in_ca (pixman_implementation_t *imp, |
1703 | | pixman_op_t op, |
1704 | | uint32_t * pd, |
1705 | | const uint32_t * ps, |
1706 | | const uint32_t * pm, |
1707 | | int w) |
1708 | 0 | { |
1709 | 0 | uint32_t s, m, d; |
1710 | |
|
1711 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
1712 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1713 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1714 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
1715 | |
|
1716 | 0 | while (w && (uintptr_t)pd & 15) |
1717 | 0 | { |
1718 | 0 | s = *ps++; |
1719 | 0 | m = *pm++; |
1720 | 0 | d = *pd; |
1721 | |
|
1722 | 0 | *pd++ = pack_1x128_32 ( |
1723 | 0 | pix_multiply_1x128 ( |
1724 | 0 | pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), |
1725 | 0 | expand_alpha_1x128 (unpack_32_1x128 (d)))); |
1726 | |
|
1727 | 0 | w--; |
1728 | 0 | } |
1729 | |
|
1730 | 0 | while (w >= 4) |
1731 | 0 | { |
1732 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
1733 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
1734 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
1735 | |
|
1736 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1737 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1738 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
1739 | |
|
1740 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
1741 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
1742 | |
|
1743 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
1744 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
1745 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1746 | |
|
1747 | 0 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
1748 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
1749 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1750 | |
|
1751 | 0 | save_128_aligned ( |
1752 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1753 | |
|
1754 | 0 | ps += 4; |
1755 | 0 | pd += 4; |
1756 | 0 | pm += 4; |
1757 | 0 | w -= 4; |
1758 | 0 | } |
1759 | |
|
1760 | 0 | while (w) |
1761 | 0 | { |
1762 | 0 | s = *ps++; |
1763 | 0 | m = *pm++; |
1764 | 0 | d = *pd; |
1765 | |
|
1766 | 0 | *pd++ = pack_1x128_32 ( |
1767 | 0 | pix_multiply_1x128 ( |
1768 | 0 | pix_multiply_1x128 ( |
1769 | 0 | unpack_32_1x128 (s), unpack_32_1x128 (m)), |
1770 | 0 | expand_alpha_1x128 (unpack_32_1x128 (d)))); |
1771 | |
|
1772 | 0 | w--; |
1773 | 0 | } |
1774 | 0 | } |
1775 | | |
1776 | | static void |
1777 | | sse2_combine_in_reverse_ca (pixman_implementation_t *imp, |
1778 | | pixman_op_t op, |
1779 | | uint32_t * pd, |
1780 | | const uint32_t * ps, |
1781 | | const uint32_t * pm, |
1782 | | int w) |
1783 | 0 | { |
1784 | 0 | uint32_t s, m, d; |
1785 | |
|
1786 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
1787 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1788 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1789 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
1790 | |
|
1791 | 0 | while (w && (uintptr_t)pd & 15) |
1792 | 0 | { |
1793 | 0 | s = *ps++; |
1794 | 0 | m = *pm++; |
1795 | 0 | d = *pd; |
1796 | |
|
1797 | 0 | *pd++ = pack_1x128_32 ( |
1798 | 0 | pix_multiply_1x128 ( |
1799 | 0 | unpack_32_1x128 (d), |
1800 | 0 | pix_multiply_1x128 (unpack_32_1x128 (m), |
1801 | 0 | expand_alpha_1x128 (unpack_32_1x128 (s))))); |
1802 | 0 | w--; |
1803 | 0 | } |
1804 | |
|
1805 | 0 | while (w >= 4) |
1806 | 0 | { |
1807 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
1808 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
1809 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
1810 | |
|
1811 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1812 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1813 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
1814 | |
|
1815 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
1816 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
1817 | 0 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
1818 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
1819 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
1820 | |
|
1821 | 0 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
1822 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
1823 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1824 | |
|
1825 | 0 | save_128_aligned ( |
1826 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1827 | |
|
1828 | 0 | ps += 4; |
1829 | 0 | pd += 4; |
1830 | 0 | pm += 4; |
1831 | 0 | w -= 4; |
1832 | 0 | } |
1833 | |
|
1834 | 0 | while (w) |
1835 | 0 | { |
1836 | 0 | s = *ps++; |
1837 | 0 | m = *pm++; |
1838 | 0 | d = *pd; |
1839 | |
|
1840 | 0 | *pd++ = pack_1x128_32 ( |
1841 | 0 | pix_multiply_1x128 ( |
1842 | 0 | unpack_32_1x128 (d), |
1843 | 0 | pix_multiply_1x128 (unpack_32_1x128 (m), |
1844 | 0 | expand_alpha_1x128 (unpack_32_1x128 (s))))); |
1845 | 0 | w--; |
1846 | 0 | } |
1847 | 0 | } |
1848 | | |
1849 | | static void |
1850 | | sse2_combine_out_ca (pixman_implementation_t *imp, |
1851 | | pixman_op_t op, |
1852 | | uint32_t * pd, |
1853 | | const uint32_t * ps, |
1854 | | const uint32_t * pm, |
1855 | | int w) |
1856 | 0 | { |
1857 | 0 | uint32_t s, m, d; |
1858 | |
|
1859 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
1860 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1861 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1862 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
1863 | |
|
1864 | 0 | while (w && (uintptr_t)pd & 15) |
1865 | 0 | { |
1866 | 0 | s = *ps++; |
1867 | 0 | m = *pm++; |
1868 | 0 | d = *pd; |
1869 | |
|
1870 | 0 | *pd++ = pack_1x128_32 ( |
1871 | 0 | pix_multiply_1x128 ( |
1872 | 0 | pix_multiply_1x128 ( |
1873 | 0 | unpack_32_1x128 (s), unpack_32_1x128 (m)), |
1874 | 0 | negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); |
1875 | 0 | w--; |
1876 | 0 | } |
1877 | |
|
1878 | 0 | while (w >= 4) |
1879 | 0 | { |
1880 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
1881 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
1882 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
1883 | |
|
1884 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1885 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1886 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
1887 | |
|
1888 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
1889 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
1890 | 0 | negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, |
1891 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
1892 | |
|
1893 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
1894 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
1895 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1896 | 0 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
1897 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
1898 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1899 | |
|
1900 | 0 | save_128_aligned ( |
1901 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1902 | |
|
1903 | 0 | ps += 4; |
1904 | 0 | pd += 4; |
1905 | 0 | pm += 4; |
1906 | 0 | w -= 4; |
1907 | 0 | } |
1908 | |
|
1909 | 0 | while (w) |
1910 | 0 | { |
1911 | 0 | s = *ps++; |
1912 | 0 | m = *pm++; |
1913 | 0 | d = *pd; |
1914 | |
|
1915 | 0 | *pd++ = pack_1x128_32 ( |
1916 | 0 | pix_multiply_1x128 ( |
1917 | 0 | pix_multiply_1x128 ( |
1918 | 0 | unpack_32_1x128 (s), unpack_32_1x128 (m)), |
1919 | 0 | negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); |
1920 | |
|
1921 | 0 | w--; |
1922 | 0 | } |
1923 | 0 | } |
1924 | | |
1925 | | static void |
1926 | | sse2_combine_out_reverse_ca (pixman_implementation_t *imp, |
1927 | | pixman_op_t op, |
1928 | | uint32_t * pd, |
1929 | | const uint32_t * ps, |
1930 | | const uint32_t * pm, |
1931 | | int w) |
1932 | 0 | { |
1933 | 0 | uint32_t s, m, d; |
1934 | |
|
1935 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
1936 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
1937 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
1938 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
1939 | |
|
1940 | 0 | while (w && (uintptr_t)pd & 15) |
1941 | 0 | { |
1942 | 0 | s = *ps++; |
1943 | 0 | m = *pm++; |
1944 | 0 | d = *pd; |
1945 | |
|
1946 | 0 | *pd++ = pack_1x128_32 ( |
1947 | 0 | pix_multiply_1x128 ( |
1948 | 0 | unpack_32_1x128 (d), |
1949 | 0 | negate_1x128 (pix_multiply_1x128 ( |
1950 | 0 | unpack_32_1x128 (m), |
1951 | 0 | expand_alpha_1x128 (unpack_32_1x128 (s)))))); |
1952 | 0 | w--; |
1953 | 0 | } |
1954 | |
|
1955 | 0 | while (w >= 4) |
1956 | 0 | { |
1957 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
1958 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
1959 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
1960 | |
|
1961 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
1962 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
1963 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
1964 | |
|
1965 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
1966 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
1967 | |
|
1968 | 0 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
1969 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
1970 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
1971 | |
|
1972 | 0 | negate_2x128 (xmm_mask_lo, xmm_mask_hi, |
1973 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
1974 | |
|
1975 | 0 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
1976 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
1977 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
1978 | |
|
1979 | 0 | save_128_aligned ( |
1980 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
1981 | |
|
1982 | 0 | ps += 4; |
1983 | 0 | pd += 4; |
1984 | 0 | pm += 4; |
1985 | 0 | w -= 4; |
1986 | 0 | } |
1987 | |
|
1988 | 0 | while (w) |
1989 | 0 | { |
1990 | 0 | s = *ps++; |
1991 | 0 | m = *pm++; |
1992 | 0 | d = *pd; |
1993 | |
|
1994 | 0 | *pd++ = pack_1x128_32 ( |
1995 | 0 | pix_multiply_1x128 ( |
1996 | 0 | unpack_32_1x128 (d), |
1997 | 0 | negate_1x128 (pix_multiply_1x128 ( |
1998 | 0 | unpack_32_1x128 (m), |
1999 | 0 | expand_alpha_1x128 (unpack_32_1x128 (s)))))); |
2000 | 0 | w--; |
2001 | 0 | } |
2002 | 0 | } |
2003 | | |
2004 | | static force_inline uint32_t |
2005 | | core_combine_atop_ca_pixel_sse2 (uint32_t src, |
2006 | | uint32_t mask, |
2007 | | uint32_t dst) |
2008 | 0 | { |
2009 | 0 | __m128i m = unpack_32_1x128 (mask); |
2010 | 0 | __m128i s = unpack_32_1x128 (src); |
2011 | 0 | __m128i d = unpack_32_1x128 (dst); |
2012 | 0 | __m128i sa = expand_alpha_1x128 (s); |
2013 | 0 | __m128i da = expand_alpha_1x128 (d); |
2014 | |
|
2015 | 0 | s = pix_multiply_1x128 (s, m); |
2016 | 0 | m = negate_1x128 (pix_multiply_1x128 (m, sa)); |
2017 | |
|
2018 | 0 | return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); |
2019 | 0 | } |
2020 | | |
2021 | | static void |
2022 | | sse2_combine_atop_ca (pixman_implementation_t *imp, |
2023 | | pixman_op_t op, |
2024 | | uint32_t * pd, |
2025 | | const uint32_t * ps, |
2026 | | const uint32_t * pm, |
2027 | | int w) |
2028 | 0 | { |
2029 | 0 | uint32_t s, m, d; |
2030 | |
|
2031 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
2032 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
2033 | 0 | __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
2034 | 0 | __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
2035 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
2036 | |
|
2037 | 0 | while (w && (uintptr_t)pd & 15) |
2038 | 0 | { |
2039 | 0 | s = *ps++; |
2040 | 0 | m = *pm++; |
2041 | 0 | d = *pd; |
2042 | |
|
2043 | 0 | *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); |
2044 | 0 | w--; |
2045 | 0 | } |
2046 | |
|
2047 | 0 | while (w >= 4) |
2048 | 0 | { |
2049 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
2050 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
2051 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
2052 | |
|
2053 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
2054 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
2055 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
2056 | |
|
2057 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
2058 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
2059 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
2060 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
2061 | |
|
2062 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
2063 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
2064 | 0 | &xmm_src_lo, &xmm_src_hi); |
2065 | 0 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
2066 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
2067 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
2068 | |
|
2069 | 0 | negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
2070 | |
|
2071 | 0 | pix_add_multiply_2x128 ( |
2072 | 0 | &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, |
2073 | 0 | &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
2074 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
2075 | |
|
2076 | 0 | save_128_aligned ( |
2077 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
2078 | |
|
2079 | 0 | ps += 4; |
2080 | 0 | pd += 4; |
2081 | 0 | pm += 4; |
2082 | 0 | w -= 4; |
2083 | 0 | } |
2084 | |
|
2085 | 0 | while (w) |
2086 | 0 | { |
2087 | 0 | s = *ps++; |
2088 | 0 | m = *pm++; |
2089 | 0 | d = *pd; |
2090 | |
|
2091 | 0 | *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); |
2092 | 0 | w--; |
2093 | 0 | } |
2094 | 0 | } |
2095 | | |
2096 | | static force_inline uint32_t |
2097 | | core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, |
2098 | | uint32_t mask, |
2099 | | uint32_t dst) |
2100 | 0 | { |
2101 | 0 | __m128i m = unpack_32_1x128 (mask); |
2102 | 0 | __m128i s = unpack_32_1x128 (src); |
2103 | 0 | __m128i d = unpack_32_1x128 (dst); |
2104 | |
|
2105 | 0 | __m128i da = negate_1x128 (expand_alpha_1x128 (d)); |
2106 | 0 | __m128i sa = expand_alpha_1x128 (s); |
2107 | |
|
2108 | 0 | s = pix_multiply_1x128 (s, m); |
2109 | 0 | m = pix_multiply_1x128 (m, sa); |
2110 | |
|
2111 | 0 | return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); |
2112 | 0 | } |
2113 | | |
2114 | | static void |
2115 | | sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, |
2116 | | pixman_op_t op, |
2117 | | uint32_t * pd, |
2118 | | const uint32_t * ps, |
2119 | | const uint32_t * pm, |
2120 | | int w) |
2121 | 0 | { |
2122 | 0 | uint32_t s, m, d; |
2123 | |
|
2124 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
2125 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
2126 | 0 | __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
2127 | 0 | __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
2128 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
2129 | |
|
2130 | 0 | while (w && (uintptr_t)pd & 15) |
2131 | 0 | { |
2132 | 0 | s = *ps++; |
2133 | 0 | m = *pm++; |
2134 | 0 | d = *pd; |
2135 | |
|
2136 | 0 | *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); |
2137 | 0 | w--; |
2138 | 0 | } |
2139 | |
|
2140 | 0 | while (w >= 4) |
2141 | 0 | { |
2142 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
2143 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
2144 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
2145 | |
|
2146 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
2147 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
2148 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
2149 | |
|
2150 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
2151 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
2152 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
2153 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
2154 | |
|
2155 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
2156 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
2157 | 0 | &xmm_src_lo, &xmm_src_hi); |
2158 | 0 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
2159 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
2160 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
2161 | |
|
2162 | 0 | negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, |
2163 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
2164 | |
|
2165 | 0 | pix_add_multiply_2x128 ( |
2166 | 0 | &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, |
2167 | 0 | &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
2168 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
2169 | |
|
2170 | 0 | save_128_aligned ( |
2171 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
2172 | |
|
2173 | 0 | ps += 4; |
2174 | 0 | pd += 4; |
2175 | 0 | pm += 4; |
2176 | 0 | w -= 4; |
2177 | 0 | } |
2178 | |
|
2179 | 0 | while (w) |
2180 | 0 | { |
2181 | 0 | s = *ps++; |
2182 | 0 | m = *pm++; |
2183 | 0 | d = *pd; |
2184 | |
|
2185 | 0 | *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); |
2186 | 0 | w--; |
2187 | 0 | } |
2188 | 0 | } |
2189 | | |
2190 | | static force_inline uint32_t |
2191 | | core_combine_xor_ca_pixel_sse2 (uint32_t src, |
2192 | | uint32_t mask, |
2193 | | uint32_t dst) |
2194 | 0 | { |
2195 | 0 | __m128i a = unpack_32_1x128 (mask); |
2196 | 0 | __m128i s = unpack_32_1x128 (src); |
2197 | 0 | __m128i d = unpack_32_1x128 (dst); |
2198 | |
|
2199 | 0 | __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( |
2200 | 0 | a, expand_alpha_1x128 (s))); |
2201 | 0 | __m128i dest = pix_multiply_1x128 (s, a); |
2202 | 0 | __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); |
2203 | |
|
2204 | 0 | return pack_1x128_32 (pix_add_multiply_1x128 (&d, |
2205 | 0 | &alpha_dst, |
2206 | 0 | &dest, |
2207 | 0 | &alpha_src)); |
2208 | 0 | } |
2209 | | |
2210 | | static void |
2211 | | sse2_combine_xor_ca (pixman_implementation_t *imp, |
2212 | | pixman_op_t op, |
2213 | | uint32_t * pd, |
2214 | | const uint32_t * ps, |
2215 | | const uint32_t * pm, |
2216 | | int w) |
2217 | 0 | { |
2218 | 0 | uint32_t s, m, d; |
2219 | |
|
2220 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
2221 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
2222 | 0 | __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
2223 | 0 | __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
2224 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
2225 | |
|
2226 | 0 | while (w && (uintptr_t)pd & 15) |
2227 | 0 | { |
2228 | 0 | s = *ps++; |
2229 | 0 | m = *pm++; |
2230 | 0 | d = *pd; |
2231 | |
|
2232 | 0 | *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); |
2233 | 0 | w--; |
2234 | 0 | } |
2235 | |
|
2236 | 0 | while (w >= 4) |
2237 | 0 | { |
2238 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
2239 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
2240 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
2241 | |
|
2242 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
2243 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
2244 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
2245 | |
|
2246 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
2247 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
2248 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
2249 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
2250 | |
|
2251 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
2252 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
2253 | 0 | &xmm_src_lo, &xmm_src_hi); |
2254 | 0 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
2255 | 0 | &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
2256 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
2257 | |
|
2258 | 0 | negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, |
2259 | 0 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
2260 | 0 | negate_2x128 (xmm_mask_lo, xmm_mask_hi, |
2261 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
2262 | |
|
2263 | 0 | pix_add_multiply_2x128 ( |
2264 | 0 | &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, |
2265 | 0 | &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
2266 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
2267 | |
|
2268 | 0 | save_128_aligned ( |
2269 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
2270 | |
|
2271 | 0 | ps += 4; |
2272 | 0 | pd += 4; |
2273 | 0 | pm += 4; |
2274 | 0 | w -= 4; |
2275 | 0 | } |
2276 | |
|
2277 | 0 | while (w) |
2278 | 0 | { |
2279 | 0 | s = *ps++; |
2280 | 0 | m = *pm++; |
2281 | 0 | d = *pd; |
2282 | |
|
2283 | 0 | *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); |
2284 | 0 | w--; |
2285 | 0 | } |
2286 | 0 | } |
2287 | | |
2288 | | static void |
2289 | | sse2_combine_add_ca (pixman_implementation_t *imp, |
2290 | | pixman_op_t op, |
2291 | | uint32_t * pd, |
2292 | | const uint32_t * ps, |
2293 | | const uint32_t * pm, |
2294 | | int w) |
2295 | 0 | { |
2296 | 0 | uint32_t s, m, d; |
2297 | |
|
2298 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
2299 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
2300 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
2301 | |
|
2302 | 0 | while (w && (uintptr_t)pd & 15) |
2303 | 0 | { |
2304 | 0 | s = *ps++; |
2305 | 0 | m = *pm++; |
2306 | 0 | d = *pd; |
2307 | |
|
2308 | 0 | *pd++ = pack_1x128_32 ( |
2309 | 0 | _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), |
2310 | 0 | unpack_32_1x128 (m)), |
2311 | 0 | unpack_32_1x128 (d))); |
2312 | 0 | w--; |
2313 | 0 | } |
2314 | |
|
2315 | 0 | while (w >= 4) |
2316 | 0 | { |
2317 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
2318 | 0 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
2319 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
2320 | |
|
2321 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
2322 | 0 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
2323 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
2324 | |
|
2325 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
2326 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
2327 | 0 | &xmm_src_lo, &xmm_src_hi); |
2328 | |
|
2329 | 0 | save_128_aligned ( |
2330 | 0 | (__m128i*)pd, pack_2x128_128 ( |
2331 | 0 | _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), |
2332 | 0 | _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); |
2333 | |
|
2334 | 0 | ps += 4; |
2335 | 0 | pd += 4; |
2336 | 0 | pm += 4; |
2337 | 0 | w -= 4; |
2338 | 0 | } |
2339 | |
|
2340 | 0 | while (w) |
2341 | 0 | { |
2342 | 0 | s = *ps++; |
2343 | 0 | m = *pm++; |
2344 | 0 | d = *pd; |
2345 | |
|
2346 | 0 | *pd++ = pack_1x128_32 ( |
2347 | 0 | _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), |
2348 | 0 | unpack_32_1x128 (m)), |
2349 | 0 | unpack_32_1x128 (d))); |
2350 | 0 | w--; |
2351 | 0 | } |
2352 | 0 | } |
2353 | | |
2354 | | static force_inline __m128i |
2355 | | create_mask_16_128 (uint16_t mask) |
2356 | 48 | { |
2357 | 48 | return _mm_set1_epi16 (mask); |
2358 | 48 | } |
2359 | | |
2360 | | /* Work around a code generation bug in Sun Studio 12. */ |
2361 | | #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) |
2362 | | # define create_mask_2x32_128(mask0, mask1) \ |
2363 | | (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) |
2364 | | #else |
2365 | | static force_inline __m128i |
2366 | | create_mask_2x32_128 (uint32_t mask0, |
2367 | | uint32_t mask1) |
2368 | 166 | { |
2369 | 166 | return _mm_set_epi32 (mask0, mask1, mask0, mask1); |
2370 | 166 | } |
2371 | | #endif |
2372 | | |
2373 | | static void |
2374 | | sse2_composite_over_n_8888 (pixman_implementation_t *imp, |
2375 | | pixman_composite_info_t *info) |
2376 | 0 | { |
2377 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2378 | 0 | uint32_t src; |
2379 | 0 | uint32_t *dst_line, *dst, d; |
2380 | 0 | int32_t w; |
2381 | 0 | int dst_stride; |
2382 | 0 | __m128i xmm_src, xmm_alpha; |
2383 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
2384 | |
|
2385 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2386 | |
|
2387 | 0 | if (src == 0) |
2388 | 0 | return; |
2389 | | |
2390 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2391 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2392 | |
|
2393 | 0 | xmm_src = expand_pixel_32_1x128 (src); |
2394 | 0 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
2395 | |
|
2396 | 0 | while (height--) |
2397 | 0 | { |
2398 | 0 | dst = dst_line; |
2399 | |
|
2400 | 0 | dst_line += dst_stride; |
2401 | 0 | w = width; |
2402 | |
|
2403 | 0 | while (w && (uintptr_t)dst & 15) |
2404 | 0 | { |
2405 | 0 | d = *dst; |
2406 | 0 | *dst++ = pack_1x128_32 (over_1x128 (xmm_src, |
2407 | 0 | xmm_alpha, |
2408 | 0 | unpack_32_1x128 (d))); |
2409 | 0 | w--; |
2410 | 0 | } |
2411 | |
|
2412 | 0 | while (w >= 4) |
2413 | 0 | { |
2414 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
2415 | |
|
2416 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
2417 | |
|
2418 | 0 | over_2x128 (&xmm_src, &xmm_src, |
2419 | 0 | &xmm_alpha, &xmm_alpha, |
2420 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
2421 | | |
2422 | | /* rebuid the 4 pixel data and save*/ |
2423 | 0 | save_128_aligned ( |
2424 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
2425 | |
|
2426 | 0 | w -= 4; |
2427 | 0 | dst += 4; |
2428 | 0 | } |
2429 | |
|
2430 | 0 | while (w) |
2431 | 0 | { |
2432 | 0 | d = *dst; |
2433 | 0 | *dst++ = pack_1x128_32 (over_1x128 (xmm_src, |
2434 | 0 | xmm_alpha, |
2435 | 0 | unpack_32_1x128 (d))); |
2436 | 0 | w--; |
2437 | 0 | } |
2438 | |
|
2439 | 0 | } |
2440 | 0 | } |
2441 | | |
2442 | | static void |
2443 | | sse2_composite_over_n_0565 (pixman_implementation_t *imp, |
2444 | | pixman_composite_info_t *info) |
2445 | 0 | { |
2446 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2447 | 0 | uint32_t src; |
2448 | 0 | uint16_t *dst_line, *dst, d; |
2449 | 0 | int32_t w; |
2450 | 0 | int dst_stride; |
2451 | 0 | __m128i xmm_src, xmm_alpha; |
2452 | 0 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
2453 | |
|
2454 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2455 | |
|
2456 | 0 | if (src == 0) |
2457 | 0 | return; |
2458 | | |
2459 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2460 | 0 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2461 | |
|
2462 | 0 | xmm_src = expand_pixel_32_1x128 (src); |
2463 | 0 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
2464 | |
|
2465 | 0 | while (height--) |
2466 | 0 | { |
2467 | 0 | dst = dst_line; |
2468 | |
|
2469 | 0 | dst_line += dst_stride; |
2470 | 0 | w = width; |
2471 | |
|
2472 | 0 | while (w && (uintptr_t)dst & 15) |
2473 | 0 | { |
2474 | 0 | d = *dst; |
2475 | |
|
2476 | 0 | *dst++ = pack_565_32_16 ( |
2477 | 0 | pack_1x128_32 (over_1x128 (xmm_src, |
2478 | 0 | xmm_alpha, |
2479 | 0 | expand565_16_1x128 (d)))); |
2480 | 0 | w--; |
2481 | 0 | } |
2482 | |
|
2483 | 0 | while (w >= 8) |
2484 | 0 | { |
2485 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
2486 | |
|
2487 | 0 | unpack_565_128_4x128 (xmm_dst, |
2488 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
2489 | |
|
2490 | 0 | over_2x128 (&xmm_src, &xmm_src, |
2491 | 0 | &xmm_alpha, &xmm_alpha, |
2492 | 0 | &xmm_dst0, &xmm_dst1); |
2493 | 0 | over_2x128 (&xmm_src, &xmm_src, |
2494 | 0 | &xmm_alpha, &xmm_alpha, |
2495 | 0 | &xmm_dst2, &xmm_dst3); |
2496 | |
|
2497 | 0 | xmm_dst = pack_565_4x128_128 ( |
2498 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
2499 | |
|
2500 | 0 | save_128_aligned ((__m128i*)dst, xmm_dst); |
2501 | |
|
2502 | 0 | dst += 8; |
2503 | 0 | w -= 8; |
2504 | 0 | } |
2505 | |
|
2506 | 0 | while (w--) |
2507 | 0 | { |
2508 | 0 | d = *dst; |
2509 | 0 | *dst++ = pack_565_32_16 ( |
2510 | 0 | pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, |
2511 | 0 | expand565_16_1x128 (d)))); |
2512 | 0 | } |
2513 | 0 | } |
2514 | |
|
2515 | 0 | } |
2516 | | |
2517 | | static void |
2518 | | sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, |
2519 | | pixman_composite_info_t *info) |
2520 | 0 | { |
2521 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2522 | 0 | uint32_t src; |
2523 | 0 | uint32_t *dst_line, d; |
2524 | 0 | uint32_t *mask_line, m; |
2525 | 0 | uint32_t pack_cmp; |
2526 | 0 | int dst_stride, mask_stride; |
2527 | |
|
2528 | 0 | __m128i xmm_src; |
2529 | 0 | __m128i xmm_dst; |
2530 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
2531 | |
|
2532 | 0 | __m128i mmx_src, mmx_mask, mmx_dest; |
2533 | |
|
2534 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2535 | |
|
2536 | 0 | if (src == 0) |
2537 | 0 | return; |
2538 | | |
2539 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2540 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2541 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2542 | 0 | mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
2543 | |
|
2544 | 0 | xmm_src = _mm_unpacklo_epi8 ( |
2545 | 0 | create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); |
2546 | 0 | mmx_src = xmm_src; |
2547 | |
|
2548 | 0 | while (height--) |
2549 | 0 | { |
2550 | 0 | int w = width; |
2551 | 0 | const uint32_t *pm = (uint32_t *)mask_line; |
2552 | 0 | uint32_t *pd = (uint32_t *)dst_line; |
2553 | |
|
2554 | 0 | dst_line += dst_stride; |
2555 | 0 | mask_line += mask_stride; |
2556 | |
|
2557 | 0 | while (w && (uintptr_t)pd & 15) |
2558 | 0 | { |
2559 | 0 | m = *pm++; |
2560 | |
|
2561 | 0 | if (m) |
2562 | 0 | { |
2563 | 0 | d = *pd; |
2564 | |
|
2565 | 0 | mmx_mask = unpack_32_1x128 (m); |
2566 | 0 | mmx_dest = unpack_32_1x128 (d); |
2567 | |
|
2568 | 0 | *pd = pack_1x128_32 ( |
2569 | 0 | _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), |
2570 | 0 | mmx_dest)); |
2571 | 0 | } |
2572 | |
|
2573 | 0 | pd++; |
2574 | 0 | w--; |
2575 | 0 | } |
2576 | |
|
2577 | 0 | while (w >= 4) |
2578 | 0 | { |
2579 | 0 | xmm_mask = load_128_unaligned ((__m128i*)pm); |
2580 | |
|
2581 | 0 | pack_cmp = |
2582 | 0 | _mm_movemask_epi8 ( |
2583 | 0 | _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
2584 | | |
2585 | | /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ |
2586 | 0 | if (pack_cmp != 0xffff) |
2587 | 0 | { |
2588 | 0 | xmm_dst = load_128_aligned ((__m128i*)pd); |
2589 | |
|
2590 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
2591 | |
|
2592 | 0 | pix_multiply_2x128 (&xmm_src, &xmm_src, |
2593 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
2594 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
2595 | 0 | xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); |
2596 | |
|
2597 | 0 | save_128_aligned ( |
2598 | 0 | (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); |
2599 | 0 | } |
2600 | |
|
2601 | 0 | pd += 4; |
2602 | 0 | pm += 4; |
2603 | 0 | w -= 4; |
2604 | 0 | } |
2605 | |
|
2606 | 0 | while (w) |
2607 | 0 | { |
2608 | 0 | m = *pm++; |
2609 | |
|
2610 | 0 | if (m) |
2611 | 0 | { |
2612 | 0 | d = *pd; |
2613 | |
|
2614 | 0 | mmx_mask = unpack_32_1x128 (m); |
2615 | 0 | mmx_dest = unpack_32_1x128 (d); |
2616 | |
|
2617 | 0 | *pd = pack_1x128_32 ( |
2618 | 0 | _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), |
2619 | 0 | mmx_dest)); |
2620 | 0 | } |
2621 | |
|
2622 | 0 | pd++; |
2623 | 0 | w--; |
2624 | 0 | } |
2625 | 0 | } |
2626 | |
|
2627 | 0 | } |
2628 | | |
2629 | | static void |
2630 | | sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, |
2631 | | pixman_composite_info_t *info) |
2632 | 0 | { |
2633 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2634 | 0 | uint32_t src; |
2635 | 0 | uint32_t *dst_line, d; |
2636 | 0 | uint32_t *mask_line, m; |
2637 | 0 | uint32_t pack_cmp; |
2638 | 0 | int dst_stride, mask_stride; |
2639 | |
|
2640 | 0 | __m128i xmm_src, xmm_alpha; |
2641 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
2642 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
2643 | |
|
2644 | 0 | __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
2645 | |
|
2646 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2647 | |
|
2648 | 0 | if (src == 0) |
2649 | 0 | return; |
2650 | | |
2651 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2652 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2653 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2654 | 0 | mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
2655 | |
|
2656 | 0 | xmm_src = _mm_unpacklo_epi8 ( |
2657 | 0 | create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); |
2658 | 0 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
2659 | 0 | mmx_src = xmm_src; |
2660 | 0 | mmx_alpha = xmm_alpha; |
2661 | |
|
2662 | 0 | while (height--) |
2663 | 0 | { |
2664 | 0 | int w = width; |
2665 | 0 | const uint32_t *pm = (uint32_t *)mask_line; |
2666 | 0 | uint32_t *pd = (uint32_t *)dst_line; |
2667 | |
|
2668 | 0 | dst_line += dst_stride; |
2669 | 0 | mask_line += mask_stride; |
2670 | |
|
2671 | 0 | while (w && (uintptr_t)pd & 15) |
2672 | 0 | { |
2673 | 0 | m = *pm++; |
2674 | |
|
2675 | 0 | if (m) |
2676 | 0 | { |
2677 | 0 | d = *pd; |
2678 | 0 | mmx_mask = unpack_32_1x128 (m); |
2679 | 0 | mmx_dest = unpack_32_1x128 (d); |
2680 | |
|
2681 | 0 | *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, |
2682 | 0 | &mmx_alpha, |
2683 | 0 | &mmx_mask, |
2684 | 0 | &mmx_dest)); |
2685 | 0 | } |
2686 | |
|
2687 | 0 | pd++; |
2688 | 0 | w--; |
2689 | 0 | } |
2690 | |
|
2691 | 0 | while (w >= 4) |
2692 | 0 | { |
2693 | 0 | xmm_mask = load_128_unaligned ((__m128i*)pm); |
2694 | |
|
2695 | 0 | pack_cmp = |
2696 | 0 | _mm_movemask_epi8 ( |
2697 | 0 | _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
2698 | | |
2699 | | /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ |
2700 | 0 | if (pack_cmp != 0xffff) |
2701 | 0 | { |
2702 | 0 | xmm_dst = load_128_aligned ((__m128i*)pd); |
2703 | |
|
2704 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
2705 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
2706 | |
|
2707 | 0 | in_over_2x128 (&xmm_src, &xmm_src, |
2708 | 0 | &xmm_alpha, &xmm_alpha, |
2709 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
2710 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
2711 | |
|
2712 | 0 | save_128_aligned ( |
2713 | 0 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
2714 | 0 | } |
2715 | |
|
2716 | 0 | pd += 4; |
2717 | 0 | pm += 4; |
2718 | 0 | w -= 4; |
2719 | 0 | } |
2720 | |
|
2721 | 0 | while (w) |
2722 | 0 | { |
2723 | 0 | m = *pm++; |
2724 | |
|
2725 | 0 | if (m) |
2726 | 0 | { |
2727 | 0 | d = *pd; |
2728 | 0 | mmx_mask = unpack_32_1x128 (m); |
2729 | 0 | mmx_dest = unpack_32_1x128 (d); |
2730 | |
|
2731 | 0 | *pd = pack_1x128_32 ( |
2732 | 0 | in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); |
2733 | 0 | } |
2734 | |
|
2735 | 0 | pd++; |
2736 | 0 | w--; |
2737 | 0 | } |
2738 | 0 | } |
2739 | |
|
2740 | 0 | } |
2741 | | |
2742 | | static void |
2743 | | sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, |
2744 | | pixman_composite_info_t *info) |
2745 | 0 | { |
2746 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2747 | 0 | uint32_t *dst_line, *dst; |
2748 | 0 | uint32_t *src_line, *src; |
2749 | 0 | uint32_t mask; |
2750 | 0 | int32_t w; |
2751 | 0 | int dst_stride, src_stride; |
2752 | |
|
2753 | 0 | __m128i xmm_mask; |
2754 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
2755 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
2756 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
2757 | |
|
2758 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2759 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2760 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2761 | 0 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2762 | |
|
2763 | 0 | mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); |
2764 | |
|
2765 | 0 | xmm_mask = create_mask_16_128 (mask >> 24); |
2766 | |
|
2767 | 0 | while (height--) |
2768 | 0 | { |
2769 | 0 | dst = dst_line; |
2770 | 0 | dst_line += dst_stride; |
2771 | 0 | src = src_line; |
2772 | 0 | src_line += src_stride; |
2773 | 0 | w = width; |
2774 | |
|
2775 | 0 | while (w && (uintptr_t)dst & 15) |
2776 | 0 | { |
2777 | 0 | uint32_t s = *src++; |
2778 | |
|
2779 | 0 | if (s) |
2780 | 0 | { |
2781 | 0 | uint32_t d = *dst; |
2782 | | |
2783 | 0 | __m128i ms = unpack_32_1x128 (s); |
2784 | 0 | __m128i alpha = expand_alpha_1x128 (ms); |
2785 | 0 | __m128i dest = xmm_mask; |
2786 | 0 | __m128i alpha_dst = unpack_32_1x128 (d); |
2787 | | |
2788 | 0 | *dst = pack_1x128_32 ( |
2789 | 0 | in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
2790 | 0 | } |
2791 | 0 | dst++; |
2792 | 0 | w--; |
2793 | 0 | } |
2794 | |
|
2795 | 0 | while (w >= 4) |
2796 | 0 | { |
2797 | 0 | xmm_src = load_128_unaligned ((__m128i*)src); |
2798 | |
|
2799 | 0 | if (!is_zero (xmm_src)) |
2800 | 0 | { |
2801 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
2802 | | |
2803 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
2804 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
2805 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
2806 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
2807 | | |
2808 | 0 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
2809 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
2810 | 0 | &xmm_mask, &xmm_mask, |
2811 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
2812 | | |
2813 | 0 | save_128_aligned ( |
2814 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
2815 | 0 | } |
2816 | | |
2817 | 0 | dst += 4; |
2818 | 0 | src += 4; |
2819 | 0 | w -= 4; |
2820 | 0 | } |
2821 | |
|
2822 | 0 | while (w) |
2823 | 0 | { |
2824 | 0 | uint32_t s = *src++; |
2825 | |
|
2826 | 0 | if (s) |
2827 | 0 | { |
2828 | 0 | uint32_t d = *dst; |
2829 | | |
2830 | 0 | __m128i ms = unpack_32_1x128 (s); |
2831 | 0 | __m128i alpha = expand_alpha_1x128 (ms); |
2832 | 0 | __m128i mask = xmm_mask; |
2833 | 0 | __m128i dest = unpack_32_1x128 (d); |
2834 | | |
2835 | 0 | *dst = pack_1x128_32 ( |
2836 | 0 | in_over_1x128 (&ms, &alpha, &mask, &dest)); |
2837 | 0 | } |
2838 | |
|
2839 | 0 | dst++; |
2840 | 0 | w--; |
2841 | 0 | } |
2842 | 0 | } |
2843 | |
|
2844 | 0 | } |
2845 | | |
2846 | | static void |
2847 | | sse2_composite_src_x888_0565 (pixman_implementation_t *imp, |
2848 | | pixman_composite_info_t *info) |
2849 | 0 | { |
2850 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2851 | 0 | uint16_t *dst_line, *dst; |
2852 | 0 | uint32_t *src_line, *src, s; |
2853 | 0 | int dst_stride, src_stride; |
2854 | 0 | int32_t w; |
2855 | |
|
2856 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2857 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2858 | |
|
2859 | 0 | while (height--) |
2860 | 0 | { |
2861 | 0 | dst = dst_line; |
2862 | 0 | dst_line += dst_stride; |
2863 | 0 | src = src_line; |
2864 | 0 | src_line += src_stride; |
2865 | 0 | w = width; |
2866 | |
|
2867 | 0 | while (w && (uintptr_t)dst & 15) |
2868 | 0 | { |
2869 | 0 | s = *src++; |
2870 | 0 | *dst = convert_8888_to_0565 (s); |
2871 | 0 | dst++; |
2872 | 0 | w--; |
2873 | 0 | } |
2874 | |
|
2875 | 0 | while (w >= 8) |
2876 | 0 | { |
2877 | 0 | __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0); |
2878 | 0 | __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); |
2879 | |
|
2880 | 0 | save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1)); |
2881 | |
|
2882 | 0 | w -= 8; |
2883 | 0 | src += 8; |
2884 | 0 | dst += 8; |
2885 | 0 | } |
2886 | |
|
2887 | 0 | while (w) |
2888 | 0 | { |
2889 | 0 | s = *src++; |
2890 | 0 | *dst = convert_8888_to_0565 (s); |
2891 | 0 | dst++; |
2892 | 0 | w--; |
2893 | 0 | } |
2894 | 0 | } |
2895 | 0 | } |
2896 | | |
2897 | | static void |
2898 | | sse2_composite_src_x888_8888 (pixman_implementation_t *imp, |
2899 | | pixman_composite_info_t *info) |
2900 | 0 | { |
2901 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2902 | 0 | uint32_t *dst_line, *dst; |
2903 | 0 | uint32_t *src_line, *src; |
2904 | 0 | int32_t w; |
2905 | 0 | int dst_stride, src_stride; |
2906 | | |
2907 | |
|
2908 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2909 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2910 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2911 | 0 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2912 | |
|
2913 | 0 | while (height--) |
2914 | 0 | { |
2915 | 0 | dst = dst_line; |
2916 | 0 | dst_line += dst_stride; |
2917 | 0 | src = src_line; |
2918 | 0 | src_line += src_stride; |
2919 | 0 | w = width; |
2920 | |
|
2921 | 0 | while (w && (uintptr_t)dst & 15) |
2922 | 0 | { |
2923 | 0 | *dst++ = *src++ | 0xff000000; |
2924 | 0 | w--; |
2925 | 0 | } |
2926 | |
|
2927 | 0 | while (w >= 16) |
2928 | 0 | { |
2929 | 0 | __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; |
2930 | | |
2931 | 0 | xmm_src1 = load_128_unaligned ((__m128i*)src + 0); |
2932 | 0 | xmm_src2 = load_128_unaligned ((__m128i*)src + 1); |
2933 | 0 | xmm_src3 = load_128_unaligned ((__m128i*)src + 2); |
2934 | 0 | xmm_src4 = load_128_unaligned ((__m128i*)src + 3); |
2935 | | |
2936 | 0 | save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); |
2937 | 0 | save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); |
2938 | 0 | save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); |
2939 | 0 | save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); |
2940 | | |
2941 | 0 | dst += 16; |
2942 | 0 | src += 16; |
2943 | 0 | w -= 16; |
2944 | 0 | } |
2945 | |
|
2946 | 0 | while (w) |
2947 | 0 | { |
2948 | 0 | *dst++ = *src++ | 0xff000000; |
2949 | 0 | w--; |
2950 | 0 | } |
2951 | 0 | } |
2952 | |
|
2953 | 0 | } |
2954 | | |
2955 | | static void |
2956 | | sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, |
2957 | | pixman_composite_info_t *info) |
2958 | 0 | { |
2959 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2960 | 0 | uint32_t *dst_line, *dst; |
2961 | 0 | uint32_t *src_line, *src; |
2962 | 0 | uint32_t mask; |
2963 | 0 | int dst_stride, src_stride; |
2964 | 0 | int32_t w; |
2965 | |
|
2966 | 0 | __m128i xmm_mask, xmm_alpha; |
2967 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
2968 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
2969 | |
|
2970 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2971 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2972 | 0 | PIXMAN_IMAGE_GET_LINE ( |
2973 | 0 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2974 | |
|
2975 | 0 | mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); |
2976 | |
|
2977 | 0 | xmm_mask = create_mask_16_128 (mask >> 24); |
2978 | 0 | xmm_alpha = mask_00ff; |
2979 | |
|
2980 | 0 | while (height--) |
2981 | 0 | { |
2982 | 0 | dst = dst_line; |
2983 | 0 | dst_line += dst_stride; |
2984 | 0 | src = src_line; |
2985 | 0 | src_line += src_stride; |
2986 | 0 | w = width; |
2987 | |
|
2988 | 0 | while (w && (uintptr_t)dst & 15) |
2989 | 0 | { |
2990 | 0 | uint32_t s = (*src++) | 0xff000000; |
2991 | 0 | uint32_t d = *dst; |
2992 | |
|
2993 | 0 | __m128i src = unpack_32_1x128 (s); |
2994 | 0 | __m128i alpha = xmm_alpha; |
2995 | 0 | __m128i mask = xmm_mask; |
2996 | 0 | __m128i dest = unpack_32_1x128 (d); |
2997 | |
|
2998 | 0 | *dst++ = pack_1x128_32 ( |
2999 | 0 | in_over_1x128 (&src, &alpha, &mask, &dest)); |
3000 | |
|
3001 | 0 | w--; |
3002 | 0 | } |
3003 | |
|
3004 | 0 | while (w >= 4) |
3005 | 0 | { |
3006 | 0 | xmm_src = _mm_or_si128 ( |
3007 | 0 | load_128_unaligned ((__m128i*)src), mask_ff000000); |
3008 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
3009 | |
|
3010 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
3011 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
3012 | |
|
3013 | 0 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
3014 | 0 | &xmm_alpha, &xmm_alpha, |
3015 | 0 | &xmm_mask, &xmm_mask, |
3016 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
3017 | |
|
3018 | 0 | save_128_aligned ( |
3019 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
3020 | |
|
3021 | 0 | dst += 4; |
3022 | 0 | src += 4; |
3023 | 0 | w -= 4; |
3024 | |
|
3025 | 0 | } |
3026 | |
|
3027 | 0 | while (w) |
3028 | 0 | { |
3029 | 0 | uint32_t s = (*src++) | 0xff000000; |
3030 | 0 | uint32_t d = *dst; |
3031 | |
|
3032 | 0 | __m128i src = unpack_32_1x128 (s); |
3033 | 0 | __m128i alpha = xmm_alpha; |
3034 | 0 | __m128i mask = xmm_mask; |
3035 | 0 | __m128i dest = unpack_32_1x128 (d); |
3036 | |
|
3037 | 0 | *dst++ = pack_1x128_32 ( |
3038 | 0 | in_over_1x128 (&src, &alpha, &mask, &dest)); |
3039 | |
|
3040 | 0 | w--; |
3041 | 0 | } |
3042 | 0 | } |
3043 | |
|
3044 | 0 | } |
3045 | | |
3046 | | static void |
3047 | | sse2_composite_over_8888_8888 (pixman_implementation_t *imp, |
3048 | | pixman_composite_info_t *info) |
3049 | 8 | { |
3050 | 8 | PIXMAN_COMPOSITE_ARGS (info); |
3051 | 8 | int dst_stride, src_stride; |
3052 | 8 | uint32_t *dst_line, *dst; |
3053 | 8 | uint32_t *src_line, *src; |
3054 | | |
3055 | 8 | PIXMAN_IMAGE_GET_LINE ( |
3056 | 8 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3057 | 8 | PIXMAN_IMAGE_GET_LINE ( |
3058 | 8 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
3059 | | |
3060 | 8 | dst = dst_line; |
3061 | 8 | src = src_line; |
3062 | | |
3063 | 257 | while (height--) |
3064 | 249 | { |
3065 | 249 | sse2_combine_over_u (imp, op, dst, src, NULL, width); |
3066 | | |
3067 | 249 | dst += dst_stride; |
3068 | 249 | src += src_stride; |
3069 | 249 | } |
3070 | 8 | } |
3071 | | |
3072 | | static force_inline uint16_t |
3073 | | composite_over_8888_0565pixel (uint32_t src, uint16_t dst) |
3074 | 0 | { |
3075 | 0 | __m128i ms; |
3076 | |
|
3077 | 0 | ms = unpack_32_1x128 (src); |
3078 | 0 | return pack_565_32_16 ( |
3079 | 0 | pack_1x128_32 ( |
3080 | 0 | over_1x128 ( |
3081 | 0 | ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); |
3082 | 0 | } |
3083 | | |
3084 | | static void |
3085 | | sse2_composite_over_8888_0565 (pixman_implementation_t *imp, |
3086 | | pixman_composite_info_t *info) |
3087 | 0 | { |
3088 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3089 | 0 | uint16_t *dst_line, *dst, d; |
3090 | 0 | uint32_t *src_line, *src, s; |
3091 | 0 | int dst_stride, src_stride; |
3092 | 0 | int32_t w; |
3093 | |
|
3094 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
3095 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
3096 | 0 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
3097 | |
|
3098 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3099 | 0 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
3100 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3101 | 0 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
3102 | |
|
3103 | 0 | while (height--) |
3104 | 0 | { |
3105 | 0 | dst = dst_line; |
3106 | 0 | src = src_line; |
3107 | |
|
3108 | 0 | dst_line += dst_stride; |
3109 | 0 | src_line += src_stride; |
3110 | 0 | w = width; |
3111 | | |
3112 | | /* Align dst on a 16-byte boundary */ |
3113 | 0 | while (w && |
3114 | 0 | ((uintptr_t)dst & 15)) |
3115 | 0 | { |
3116 | 0 | s = *src++; |
3117 | 0 | d = *dst; |
3118 | |
|
3119 | 0 | *dst++ = composite_over_8888_0565pixel (s, d); |
3120 | 0 | w--; |
3121 | 0 | } |
3122 | | |
3123 | | /* It's a 8 pixel loop */ |
3124 | 0 | while (w >= 8) |
3125 | 0 | { |
3126 | | /* I'm loading unaligned because I'm not sure |
3127 | | * about the address alignment. |
3128 | | */ |
3129 | 0 | xmm_src = load_128_unaligned ((__m128i*) src); |
3130 | 0 | xmm_dst = load_128_aligned ((__m128i*) dst); |
3131 | | |
3132 | | /* Unpacking */ |
3133 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
3134 | 0 | unpack_565_128_4x128 (xmm_dst, |
3135 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
3136 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
3137 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
3138 | | |
3139 | | /* I'm loading next 4 pixels from memory |
3140 | | * before to optimze the memory read. |
3141 | | */ |
3142 | 0 | xmm_src = load_128_unaligned ((__m128i*) (src + 4)); |
3143 | |
|
3144 | 0 | over_2x128 (&xmm_src_lo, &xmm_src_hi, |
3145 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
3146 | 0 | &xmm_dst0, &xmm_dst1); |
3147 | | |
3148 | | /* Unpacking */ |
3149 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
3150 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
3151 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
3152 | |
|
3153 | 0 | over_2x128 (&xmm_src_lo, &xmm_src_hi, |
3154 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
3155 | 0 | &xmm_dst2, &xmm_dst3); |
3156 | |
|
3157 | 0 | save_128_aligned ( |
3158 | 0 | (__m128i*)dst, pack_565_4x128_128 ( |
3159 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
3160 | |
|
3161 | 0 | w -= 8; |
3162 | 0 | dst += 8; |
3163 | 0 | src += 8; |
3164 | 0 | } |
3165 | |
|
3166 | 0 | while (w--) |
3167 | 0 | { |
3168 | 0 | s = *src++; |
3169 | 0 | d = *dst; |
3170 | |
|
3171 | 0 | *dst++ = composite_over_8888_0565pixel (s, d); |
3172 | 0 | } |
3173 | 0 | } |
3174 | |
|
3175 | 0 | } |
3176 | | |
3177 | | static void |
3178 | | sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, |
3179 | | pixman_composite_info_t *info) |
3180 | 2 | { |
3181 | 2 | PIXMAN_COMPOSITE_ARGS (info); |
3182 | 2 | uint32_t src, srca; |
3183 | 2 | uint32_t *dst_line, *dst; |
3184 | 2 | uint8_t *mask_line, *mask; |
3185 | 2 | int dst_stride, mask_stride; |
3186 | 2 | int32_t w; |
3187 | 2 | uint32_t d; |
3188 | | |
3189 | 2 | __m128i xmm_src, xmm_alpha, xmm_def; |
3190 | 2 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
3191 | 2 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
3192 | | |
3193 | 2 | __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
3194 | | |
3195 | 2 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
3196 | | |
3197 | 2 | srca = src >> 24; |
3198 | 2 | if (src == 0) |
3199 | 0 | return; |
3200 | | |
3201 | 2 | PIXMAN_IMAGE_GET_LINE ( |
3202 | 2 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3203 | 2 | PIXMAN_IMAGE_GET_LINE ( |
3204 | 2 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
3205 | | |
3206 | 2 | xmm_def = create_mask_2x32_128 (src, src); |
3207 | 2 | xmm_src = expand_pixel_32_1x128 (src); |
3208 | 2 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
3209 | 2 | mmx_src = xmm_src; |
3210 | 2 | mmx_alpha = xmm_alpha; |
3211 | | |
3212 | 37 | while (height--) |
3213 | 35 | { |
3214 | 35 | dst = dst_line; |
3215 | 35 | dst_line += dst_stride; |
3216 | 35 | mask = mask_line; |
3217 | 35 | mask_line += mask_stride; |
3218 | 35 | w = width; |
3219 | | |
3220 | 84 | while (w && (uintptr_t)dst & 15) |
3221 | 49 | { |
3222 | 49 | uint8_t m = *mask++; |
3223 | | |
3224 | 49 | if (m) |
3225 | 16 | { |
3226 | 16 | d = *dst; |
3227 | 16 | mmx_mask = expand_pixel_8_1x128 (m); |
3228 | 16 | mmx_dest = unpack_32_1x128 (d); |
3229 | | |
3230 | 16 | *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, |
3231 | 16 | &mmx_alpha, |
3232 | 16 | &mmx_mask, |
3233 | 16 | &mmx_dest)); |
3234 | 16 | } |
3235 | | |
3236 | 49 | w--; |
3237 | 49 | dst++; |
3238 | 49 | } |
3239 | | |
3240 | 210 | while (w >= 4) |
3241 | 175 | { |
3242 | 175 | uint32_t m; |
3243 | 175 | memcpy(&m, mask, sizeof(uint32_t)); |
3244 | | |
3245 | 175 | if (srca == 0xff && m == 0xffffffff) |
3246 | 0 | { |
3247 | 0 | save_128_aligned ((__m128i*)dst, xmm_def); |
3248 | 0 | } |
3249 | 175 | else if (m) |
3250 | 127 | { |
3251 | 127 | xmm_dst = load_128_aligned ((__m128i*) dst); |
3252 | 127 | xmm_mask = unpack_32_1x128 (m); |
3253 | 127 | xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
3254 | | |
3255 | | /* Unpacking */ |
3256 | 127 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
3257 | 127 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
3258 | | |
3259 | 127 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
3260 | 127 | &xmm_mask_lo, &xmm_mask_hi); |
3261 | | |
3262 | 127 | in_over_2x128 (&xmm_src, &xmm_src, |
3263 | 127 | &xmm_alpha, &xmm_alpha, |
3264 | 127 | &xmm_mask_lo, &xmm_mask_hi, |
3265 | 127 | &xmm_dst_lo, &xmm_dst_hi); |
3266 | | |
3267 | 127 | save_128_aligned ( |
3268 | 127 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
3269 | 127 | } |
3270 | | |
3271 | 175 | w -= 4; |
3272 | 175 | dst += 4; |
3273 | 175 | mask += 4; |
3274 | 175 | } |
3275 | | |
3276 | 91 | while (w) |
3277 | 56 | { |
3278 | 56 | uint8_t m = *mask++; |
3279 | | |
3280 | 56 | if (m) |
3281 | 17 | { |
3282 | 17 | d = *dst; |
3283 | 17 | mmx_mask = expand_pixel_8_1x128 (m); |
3284 | 17 | mmx_dest = unpack_32_1x128 (d); |
3285 | | |
3286 | 17 | *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, |
3287 | 17 | &mmx_alpha, |
3288 | 17 | &mmx_mask, |
3289 | 17 | &mmx_dest)); |
3290 | 17 | } |
3291 | | |
3292 | 56 | w--; |
3293 | 56 | dst++; |
3294 | 56 | } |
3295 | 35 | } |
3296 | | |
3297 | 2 | } |
3298 | | |
3299 | | #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) |
3300 | | __attribute__((__force_align_arg_pointer__)) |
3301 | | #endif |
3302 | | static pixman_bool_t |
3303 | | sse2_fill (pixman_implementation_t *imp, |
3304 | | uint32_t * bits, |
3305 | | int stride, |
3306 | | int bpp, |
3307 | | int x, |
3308 | | int y, |
3309 | | int width, |
3310 | | int height, |
3311 | | uint32_t filler) |
3312 | 8 | { |
3313 | 8 | uint32_t byte_width; |
3314 | 8 | uint8_t *byte_line; |
3315 | | |
3316 | 8 | __m128i xmm_def; |
3317 | | |
3318 | 8 | if (bpp == 8) |
3319 | 0 | { |
3320 | 0 | uint32_t b; |
3321 | 0 | uint32_t w; |
3322 | |
|
3323 | 0 | stride = stride * (int) sizeof (uint32_t) / 1; |
3324 | 0 | byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); |
3325 | 0 | byte_width = width; |
3326 | 0 | stride *= 1; |
3327 | |
|
3328 | 0 | b = filler & 0xff; |
3329 | 0 | w = (b << 8) | b; |
3330 | 0 | filler = (w << 16) | w; |
3331 | 0 | } |
3332 | 8 | else if (bpp == 16) |
3333 | 0 | { |
3334 | 0 | stride = stride * (int) sizeof (uint32_t) / 2; |
3335 | 0 | byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); |
3336 | 0 | byte_width = 2 * width; |
3337 | 0 | stride *= 2; |
3338 | |
|
3339 | 0 | filler = (filler & 0xffff) * 0x00010001; |
3340 | 0 | } |
3341 | 8 | else if (bpp == 32) |
3342 | 8 | { |
3343 | 8 | stride = stride * (int) sizeof (uint32_t) / 4; |
3344 | 8 | byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); |
3345 | 8 | byte_width = 4 * width; |
3346 | 8 | stride *= 4; |
3347 | 8 | } |
3348 | 0 | else |
3349 | 0 | { |
3350 | 0 | return FALSE; |
3351 | 0 | } |
3352 | | |
3353 | 8 | xmm_def = create_mask_2x32_128 (filler, filler); |
3354 | | |
3355 | 969 | while (height--) |
3356 | 961 | { |
3357 | 961 | int w; |
3358 | 961 | uint8_t *d = byte_line; |
3359 | 961 | byte_line += stride; |
3360 | 961 | w = byte_width; |
3361 | | |
3362 | 961 | if (w >= 1 && ((uintptr_t)d & 1)) |
3363 | 0 | { |
3364 | 0 | *(uint8_t *)d = filler; |
3365 | 0 | w -= 1; |
3366 | 0 | d += 1; |
3367 | 0 | } |
3368 | | |
3369 | 961 | while (w >= 2 && ((uintptr_t)d & 3)) |
3370 | 0 | { |
3371 | 0 | *(uint16_t *)d = filler; |
3372 | 0 | w -= 2; |
3373 | 0 | d += 2; |
3374 | 0 | } |
3375 | | |
3376 | 1.28k | while (w >= 4 && ((uintptr_t)d & 15)) |
3377 | 326 | { |
3378 | 326 | *(uint32_t *)d = filler; |
3379 | | |
3380 | 326 | w -= 4; |
3381 | 326 | d += 4; |
3382 | 326 | } |
3383 | | |
3384 | 11.8k | while (w >= 128) |
3385 | 10.9k | { |
3386 | 10.9k | save_128_aligned ((__m128i*)(d), xmm_def); |
3387 | 10.9k | save_128_aligned ((__m128i*)(d + 16), xmm_def); |
3388 | 10.9k | save_128_aligned ((__m128i*)(d + 32), xmm_def); |
3389 | 10.9k | save_128_aligned ((__m128i*)(d + 48), xmm_def); |
3390 | 10.9k | save_128_aligned ((__m128i*)(d + 64), xmm_def); |
3391 | 10.9k | save_128_aligned ((__m128i*)(d + 80), xmm_def); |
3392 | 10.9k | save_128_aligned ((__m128i*)(d + 96), xmm_def); |
3393 | 10.9k | save_128_aligned ((__m128i*)(d + 112), xmm_def); |
3394 | | |
3395 | 10.9k | d += 128; |
3396 | 10.9k | w -= 128; |
3397 | 10.9k | } |
3398 | | |
3399 | 961 | if (w >= 64) |
3400 | 813 | { |
3401 | 813 | save_128_aligned ((__m128i*)(d), xmm_def); |
3402 | 813 | save_128_aligned ((__m128i*)(d + 16), xmm_def); |
3403 | 813 | save_128_aligned ((__m128i*)(d + 32), xmm_def); |
3404 | 813 | save_128_aligned ((__m128i*)(d + 48), xmm_def); |
3405 | | |
3406 | 813 | d += 64; |
3407 | 813 | w -= 64; |
3408 | 813 | } |
3409 | | |
3410 | 961 | if (w >= 32) |
3411 | 880 | { |
3412 | 880 | save_128_aligned ((__m128i*)(d), xmm_def); |
3413 | 880 | save_128_aligned ((__m128i*)(d + 16), xmm_def); |
3414 | | |
3415 | 880 | d += 32; |
3416 | 880 | w -= 32; |
3417 | 880 | } |
3418 | | |
3419 | 961 | if (w >= 16) |
3420 | 849 | { |
3421 | 849 | save_128_aligned ((__m128i*)(d), xmm_def); |
3422 | | |
3423 | 849 | d += 16; |
3424 | 849 | w -= 16; |
3425 | 849 | } |
3426 | | |
3427 | 1.29k | while (w >= 4) |
3428 | 337 | { |
3429 | 337 | *(uint32_t *)d = filler; |
3430 | | |
3431 | 337 | w -= 4; |
3432 | 337 | d += 4; |
3433 | 337 | } |
3434 | | |
3435 | 961 | if (w >= 2) |
3436 | 0 | { |
3437 | 0 | *(uint16_t *)d = filler; |
3438 | 0 | w -= 2; |
3439 | 0 | d += 2; |
3440 | 0 | } |
3441 | | |
3442 | 961 | if (w >= 1) |
3443 | 0 | { |
3444 | 0 | *(uint8_t *)d = filler; |
3445 | 0 | w -= 1; |
3446 | 0 | d += 1; |
3447 | 0 | } |
3448 | 961 | } |
3449 | | |
3450 | 8 | return TRUE; |
3451 | 8 | } |
3452 | | |
3453 | | static void |
3454 | | sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, |
3455 | | pixman_composite_info_t *info) |
3456 | 0 | { |
3457 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3458 | 0 | uint32_t src, srca; |
3459 | 0 | uint32_t *dst_line, *dst; |
3460 | 0 | uint8_t *mask_line, *mask; |
3461 | 0 | int dst_stride, mask_stride; |
3462 | 0 | int32_t w; |
3463 | |
|
3464 | 0 | __m128i xmm_src, xmm_def; |
3465 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
3466 | |
|
3467 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
3468 | |
|
3469 | 0 | srca = src >> 24; |
3470 | 0 | if (src == 0) |
3471 | 0 | { |
3472 | 0 | sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, |
3473 | 0 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
3474 | 0 | dest_x, dest_y, width, height, 0); |
3475 | 0 | return; |
3476 | 0 | } |
3477 | | |
3478 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3479 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3480 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3481 | 0 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
3482 | |
|
3483 | 0 | xmm_def = create_mask_2x32_128 (src, src); |
3484 | 0 | xmm_src = expand_pixel_32_1x128 (src); |
3485 | |
|
3486 | 0 | while (height--) |
3487 | 0 | { |
3488 | 0 | dst = dst_line; |
3489 | 0 | dst_line += dst_stride; |
3490 | 0 | mask = mask_line; |
3491 | 0 | mask_line += mask_stride; |
3492 | 0 | w = width; |
3493 | |
|
3494 | 0 | while (w && (uintptr_t)dst & 15) |
3495 | 0 | { |
3496 | 0 | uint8_t m = *mask++; |
3497 | |
|
3498 | 0 | if (m) |
3499 | 0 | { |
3500 | 0 | *dst = pack_1x128_32 ( |
3501 | 0 | pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); |
3502 | 0 | } |
3503 | 0 | else |
3504 | 0 | { |
3505 | 0 | *dst = 0; |
3506 | 0 | } |
3507 | |
|
3508 | 0 | w--; |
3509 | 0 | dst++; |
3510 | 0 | } |
3511 | |
|
3512 | 0 | while (w >= 4) |
3513 | 0 | { |
3514 | 0 | uint32_t m; |
3515 | 0 | memcpy(&m, mask, sizeof(uint32_t)); |
3516 | |
|
3517 | 0 | if (srca == 0xff && m == 0xffffffff) |
3518 | 0 | { |
3519 | 0 | save_128_aligned ((__m128i*)dst, xmm_def); |
3520 | 0 | } |
3521 | 0 | else if (m) |
3522 | 0 | { |
3523 | 0 | xmm_mask = unpack_32_1x128 (m); |
3524 | 0 | xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
3525 | | |
3526 | | /* Unpacking */ |
3527 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
3528 | |
|
3529 | 0 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
3530 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
3531 | |
|
3532 | 0 | pix_multiply_2x128 (&xmm_src, &xmm_src, |
3533 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
3534 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
3535 | |
|
3536 | 0 | save_128_aligned ( |
3537 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); |
3538 | 0 | } |
3539 | 0 | else |
3540 | 0 | { |
3541 | 0 | save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); |
3542 | 0 | } |
3543 | |
|
3544 | 0 | w -= 4; |
3545 | 0 | dst += 4; |
3546 | 0 | mask += 4; |
3547 | 0 | } |
3548 | |
|
3549 | 0 | while (w) |
3550 | 0 | { |
3551 | 0 | uint8_t m = *mask++; |
3552 | |
|
3553 | 0 | if (m) |
3554 | 0 | { |
3555 | 0 | *dst = pack_1x128_32 ( |
3556 | 0 | pix_multiply_1x128 ( |
3557 | 0 | xmm_src, expand_pixel_8_1x128 (m))); |
3558 | 0 | } |
3559 | 0 | else |
3560 | 0 | { |
3561 | 0 | *dst = 0; |
3562 | 0 | } |
3563 | |
|
3564 | 0 | w--; |
3565 | 0 | dst++; |
3566 | 0 | } |
3567 | 0 | } |
3568 | |
|
3569 | 0 | } |
3570 | | |
3571 | | static void |
3572 | | sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, |
3573 | | pixman_composite_info_t *info) |
3574 | 0 | { |
3575 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3576 | 0 | uint32_t src; |
3577 | 0 | uint16_t *dst_line, *dst, d; |
3578 | 0 | uint8_t *mask_line, *mask; |
3579 | 0 | int dst_stride, mask_stride; |
3580 | 0 | int32_t w; |
3581 | 0 | __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
3582 | |
|
3583 | 0 | __m128i xmm_src, xmm_alpha; |
3584 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
3585 | 0 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
3586 | |
|
3587 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
3588 | |
|
3589 | 0 | if (src == 0) |
3590 | 0 | return; |
3591 | | |
3592 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3593 | 0 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
3594 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3595 | 0 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
3596 | |
|
3597 | 0 | xmm_src = expand_pixel_32_1x128 (src); |
3598 | 0 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
3599 | 0 | mmx_src = xmm_src; |
3600 | 0 | mmx_alpha = xmm_alpha; |
3601 | |
|
3602 | 0 | while (height--) |
3603 | 0 | { |
3604 | 0 | dst = dst_line; |
3605 | 0 | dst_line += dst_stride; |
3606 | 0 | mask = mask_line; |
3607 | 0 | mask_line += mask_stride; |
3608 | 0 | w = width; |
3609 | |
|
3610 | 0 | while (w && (uintptr_t)dst & 15) |
3611 | 0 | { |
3612 | 0 | uint8_t m = *mask++; |
3613 | |
|
3614 | 0 | if (m) |
3615 | 0 | { |
3616 | 0 | d = *dst; |
3617 | 0 | mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
3618 | 0 | mmx_dest = expand565_16_1x128 (d); |
3619 | |
|
3620 | 0 | *dst = pack_565_32_16 ( |
3621 | 0 | pack_1x128_32 ( |
3622 | 0 | in_over_1x128 ( |
3623 | 0 | &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
3624 | 0 | } |
3625 | |
|
3626 | 0 | w--; |
3627 | 0 | dst++; |
3628 | 0 | } |
3629 | |
|
3630 | 0 | while (w >= 8) |
3631 | 0 | { |
3632 | 0 | uint32_t m; |
3633 | |
|
3634 | 0 | xmm_dst = load_128_aligned ((__m128i*) dst); |
3635 | 0 | unpack_565_128_4x128 (xmm_dst, |
3636 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
3637 | |
|
3638 | 0 | memcpy(&m, mask, sizeof(uint32_t)); |
3639 | 0 | mask += 4; |
3640 | |
|
3641 | 0 | if (m) |
3642 | 0 | { |
3643 | 0 | xmm_mask = unpack_32_1x128 (m); |
3644 | 0 | xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
3645 | | |
3646 | | /* Unpacking */ |
3647 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
3648 | |
|
3649 | 0 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
3650 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
3651 | |
|
3652 | 0 | in_over_2x128 (&xmm_src, &xmm_src, |
3653 | 0 | &xmm_alpha, &xmm_alpha, |
3654 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
3655 | 0 | &xmm_dst0, &xmm_dst1); |
3656 | 0 | } |
3657 | |
|
3658 | 0 | memcpy(&m, mask, sizeof(uint32_t)); |
3659 | 0 | mask += 4; |
3660 | |
|
3661 | 0 | if (m) |
3662 | 0 | { |
3663 | 0 | xmm_mask = unpack_32_1x128 (m); |
3664 | 0 | xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
3665 | | |
3666 | | /* Unpacking */ |
3667 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
3668 | |
|
3669 | 0 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
3670 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
3671 | 0 | in_over_2x128 (&xmm_src, &xmm_src, |
3672 | 0 | &xmm_alpha, &xmm_alpha, |
3673 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
3674 | 0 | &xmm_dst2, &xmm_dst3); |
3675 | 0 | } |
3676 | |
|
3677 | 0 | save_128_aligned ( |
3678 | 0 | (__m128i*)dst, pack_565_4x128_128 ( |
3679 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
3680 | |
|
3681 | 0 | w -= 8; |
3682 | 0 | dst += 8; |
3683 | 0 | } |
3684 | |
|
3685 | 0 | while (w) |
3686 | 0 | { |
3687 | 0 | uint8_t m = *mask++; |
3688 | |
|
3689 | 0 | if (m) |
3690 | 0 | { |
3691 | 0 | d = *dst; |
3692 | 0 | mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
3693 | 0 | mmx_dest = expand565_16_1x128 (d); |
3694 | |
|
3695 | 0 | *dst = pack_565_32_16 ( |
3696 | 0 | pack_1x128_32 ( |
3697 | 0 | in_over_1x128 ( |
3698 | 0 | &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
3699 | 0 | } |
3700 | |
|
3701 | 0 | w--; |
3702 | 0 | dst++; |
3703 | 0 | } |
3704 | 0 | } |
3705 | |
|
3706 | 0 | } |
3707 | | |
3708 | | static void |
3709 | | sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, |
3710 | | pixman_composite_info_t *info) |
3711 | 0 | { |
3712 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3713 | 0 | uint16_t *dst_line, *dst, d; |
3714 | 0 | uint32_t *src_line, *src, s; |
3715 | 0 | int dst_stride, src_stride; |
3716 | 0 | int32_t w; |
3717 | 0 | uint32_t opaque, zero; |
3718 | |
|
3719 | 0 | __m128i ms; |
3720 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
3721 | 0 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
3722 | |
|
3723 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3724 | 0 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
3725 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3726 | 0 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
3727 | |
|
3728 | 0 | while (height--) |
3729 | 0 | { |
3730 | 0 | dst = dst_line; |
3731 | 0 | dst_line += dst_stride; |
3732 | 0 | src = src_line; |
3733 | 0 | src_line += src_stride; |
3734 | 0 | w = width; |
3735 | |
|
3736 | 0 | while (w && (uintptr_t)dst & 15) |
3737 | 0 | { |
3738 | 0 | s = *src++; |
3739 | 0 | d = *dst; |
3740 | |
|
3741 | 0 | ms = unpack_32_1x128 (s); |
3742 | |
|
3743 | 0 | *dst++ = pack_565_32_16 ( |
3744 | 0 | pack_1x128_32 ( |
3745 | 0 | over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); |
3746 | 0 | w--; |
3747 | 0 | } |
3748 | |
|
3749 | 0 | while (w >= 8) |
3750 | 0 | { |
3751 | | /* First round */ |
3752 | 0 | xmm_src = load_128_unaligned ((__m128i*)src); |
3753 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
3754 | |
|
3755 | 0 | opaque = is_opaque (xmm_src); |
3756 | 0 | zero = is_zero (xmm_src); |
3757 | |
|
3758 | 0 | unpack_565_128_4x128 (xmm_dst, |
3759 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
3760 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
3761 | | |
3762 | | /* preload next round*/ |
3763 | 0 | xmm_src = load_128_unaligned ((__m128i*)(src + 4)); |
3764 | |
|
3765 | 0 | if (opaque) |
3766 | 0 | { |
3767 | 0 | invert_colors_2x128 (xmm_src_lo, xmm_src_hi, |
3768 | 0 | &xmm_dst0, &xmm_dst1); |
3769 | 0 | } |
3770 | 0 | else if (!zero) |
3771 | 0 | { |
3772 | 0 | over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, |
3773 | 0 | &xmm_dst0, &xmm_dst1); |
3774 | 0 | } |
3775 | | |
3776 | | /* Second round */ |
3777 | 0 | opaque = is_opaque (xmm_src); |
3778 | 0 | zero = is_zero (xmm_src); |
3779 | |
|
3780 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
3781 | |
|
3782 | 0 | if (opaque) |
3783 | 0 | { |
3784 | 0 | invert_colors_2x128 (xmm_src_lo, xmm_src_hi, |
3785 | 0 | &xmm_dst2, &xmm_dst3); |
3786 | 0 | } |
3787 | 0 | else if (!zero) |
3788 | 0 | { |
3789 | 0 | over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, |
3790 | 0 | &xmm_dst2, &xmm_dst3); |
3791 | 0 | } |
3792 | |
|
3793 | 0 | save_128_aligned ( |
3794 | 0 | (__m128i*)dst, pack_565_4x128_128 ( |
3795 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
3796 | |
|
3797 | 0 | w -= 8; |
3798 | 0 | src += 8; |
3799 | 0 | dst += 8; |
3800 | 0 | } |
3801 | |
|
3802 | 0 | while (w) |
3803 | 0 | { |
3804 | 0 | s = *src++; |
3805 | 0 | d = *dst; |
3806 | |
|
3807 | 0 | ms = unpack_32_1x128 (s); |
3808 | |
|
3809 | 0 | *dst++ = pack_565_32_16 ( |
3810 | 0 | pack_1x128_32 ( |
3811 | 0 | over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); |
3812 | 0 | w--; |
3813 | 0 | } |
3814 | 0 | } |
3815 | |
|
3816 | 0 | } |
3817 | | |
3818 | | static void |
3819 | | sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, |
3820 | | pixman_composite_info_t *info) |
3821 | 0 | { |
3822 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3823 | 0 | uint32_t *dst_line, *dst, d; |
3824 | 0 | uint32_t *src_line, *src, s; |
3825 | 0 | int dst_stride, src_stride; |
3826 | 0 | int32_t w; |
3827 | 0 | uint32_t opaque, zero; |
3828 | |
|
3829 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
3830 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
3831 | |
|
3832 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3833 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3834 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3835 | 0 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
3836 | |
|
3837 | 0 | while (height--) |
3838 | 0 | { |
3839 | 0 | dst = dst_line; |
3840 | 0 | dst_line += dst_stride; |
3841 | 0 | src = src_line; |
3842 | 0 | src_line += src_stride; |
3843 | 0 | w = width; |
3844 | |
|
3845 | 0 | while (w && (uintptr_t)dst & 15) |
3846 | 0 | { |
3847 | 0 | s = *src++; |
3848 | 0 | d = *dst; |
3849 | |
|
3850 | 0 | *dst++ = pack_1x128_32 ( |
3851 | 0 | over_rev_non_pre_1x128 ( |
3852 | 0 | unpack_32_1x128 (s), unpack_32_1x128 (d))); |
3853 | |
|
3854 | 0 | w--; |
3855 | 0 | } |
3856 | |
|
3857 | 0 | while (w >= 4) |
3858 | 0 | { |
3859 | 0 | xmm_src_hi = load_128_unaligned ((__m128i*)src); |
3860 | |
|
3861 | 0 | opaque = is_opaque (xmm_src_hi); |
3862 | 0 | zero = is_zero (xmm_src_hi); |
3863 | |
|
3864 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
3865 | |
|
3866 | 0 | if (opaque) |
3867 | 0 | { |
3868 | 0 | invert_colors_2x128 (xmm_src_lo, xmm_src_hi, |
3869 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
3870 | |
|
3871 | 0 | save_128_aligned ( |
3872 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
3873 | 0 | } |
3874 | 0 | else if (!zero) |
3875 | 0 | { |
3876 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*)dst); |
3877 | |
|
3878 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
3879 | |
|
3880 | 0 | over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, |
3881 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
3882 | |
|
3883 | 0 | save_128_aligned ( |
3884 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
3885 | 0 | } |
3886 | |
|
3887 | 0 | w -= 4; |
3888 | 0 | dst += 4; |
3889 | 0 | src += 4; |
3890 | 0 | } |
3891 | |
|
3892 | 0 | while (w) |
3893 | 0 | { |
3894 | 0 | s = *src++; |
3895 | 0 | d = *dst; |
3896 | |
|
3897 | 0 | *dst++ = pack_1x128_32 ( |
3898 | 0 | over_rev_non_pre_1x128 ( |
3899 | 0 | unpack_32_1x128 (s), unpack_32_1x128 (d))); |
3900 | |
|
3901 | 0 | w--; |
3902 | 0 | } |
3903 | 0 | } |
3904 | |
|
3905 | 0 | } |
3906 | | |
3907 | | static void |
3908 | | sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, |
3909 | | pixman_composite_info_t *info) |
3910 | 0 | { |
3911 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3912 | 0 | uint32_t src; |
3913 | 0 | uint16_t *dst_line, *dst, d; |
3914 | 0 | uint32_t *mask_line, *mask, m; |
3915 | 0 | int dst_stride, mask_stride; |
3916 | 0 | int w; |
3917 | 0 | uint32_t pack_cmp; |
3918 | |
|
3919 | 0 | __m128i xmm_src, xmm_alpha; |
3920 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
3921 | 0 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
3922 | |
|
3923 | 0 | __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
3924 | |
|
3925 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
3926 | |
|
3927 | 0 | if (src == 0) |
3928 | 0 | return; |
3929 | | |
3930 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3931 | 0 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
3932 | 0 | PIXMAN_IMAGE_GET_LINE ( |
3933 | 0 | mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
3934 | |
|
3935 | 0 | xmm_src = expand_pixel_32_1x128 (src); |
3936 | 0 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
3937 | 0 | mmx_src = xmm_src; |
3938 | 0 | mmx_alpha = xmm_alpha; |
3939 | |
|
3940 | 0 | while (height--) |
3941 | 0 | { |
3942 | 0 | w = width; |
3943 | 0 | mask = mask_line; |
3944 | 0 | dst = dst_line; |
3945 | 0 | mask_line += mask_stride; |
3946 | 0 | dst_line += dst_stride; |
3947 | |
|
3948 | 0 | while (w && ((uintptr_t)dst & 15)) |
3949 | 0 | { |
3950 | 0 | m = *(uint32_t *) mask; |
3951 | |
|
3952 | 0 | if (m) |
3953 | 0 | { |
3954 | 0 | d = *dst; |
3955 | 0 | mmx_mask = unpack_32_1x128 (m); |
3956 | 0 | mmx_dest = expand565_16_1x128 (d); |
3957 | |
|
3958 | 0 | *dst = pack_565_32_16 ( |
3959 | 0 | pack_1x128_32 ( |
3960 | 0 | in_over_1x128 ( |
3961 | 0 | &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
3962 | 0 | } |
3963 | |
|
3964 | 0 | w--; |
3965 | 0 | dst++; |
3966 | 0 | mask++; |
3967 | 0 | } |
3968 | |
|
3969 | 0 | while (w >= 8) |
3970 | 0 | { |
3971 | | /* First round */ |
3972 | 0 | xmm_mask = load_128_unaligned ((__m128i*)mask); |
3973 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
3974 | |
|
3975 | 0 | pack_cmp = _mm_movemask_epi8 ( |
3976 | 0 | _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
3977 | |
|
3978 | 0 | unpack_565_128_4x128 (xmm_dst, |
3979 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
3980 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
3981 | | |
3982 | | /* preload next round */ |
3983 | 0 | xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); |
3984 | | |
3985 | | /* preload next round */ |
3986 | 0 | if (pack_cmp != 0xffff) |
3987 | 0 | { |
3988 | 0 | in_over_2x128 (&xmm_src, &xmm_src, |
3989 | 0 | &xmm_alpha, &xmm_alpha, |
3990 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
3991 | 0 | &xmm_dst0, &xmm_dst1); |
3992 | 0 | } |
3993 | | |
3994 | | /* Second round */ |
3995 | 0 | pack_cmp = _mm_movemask_epi8 ( |
3996 | 0 | _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
3997 | |
|
3998 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
3999 | |
|
4000 | 0 | if (pack_cmp != 0xffff) |
4001 | 0 | { |
4002 | 0 | in_over_2x128 (&xmm_src, &xmm_src, |
4003 | 0 | &xmm_alpha, &xmm_alpha, |
4004 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
4005 | 0 | &xmm_dst2, &xmm_dst3); |
4006 | 0 | } |
4007 | |
|
4008 | 0 | save_128_aligned ( |
4009 | 0 | (__m128i*)dst, pack_565_4x128_128 ( |
4010 | 0 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
4011 | |
|
4012 | 0 | w -= 8; |
4013 | 0 | dst += 8; |
4014 | 0 | mask += 8; |
4015 | 0 | } |
4016 | |
|
4017 | 0 | while (w) |
4018 | 0 | { |
4019 | 0 | m = *(uint32_t *) mask; |
4020 | |
|
4021 | 0 | if (m) |
4022 | 0 | { |
4023 | 0 | d = *dst; |
4024 | 0 | mmx_mask = unpack_32_1x128 (m); |
4025 | 0 | mmx_dest = expand565_16_1x128 (d); |
4026 | |
|
4027 | 0 | *dst = pack_565_32_16 ( |
4028 | 0 | pack_1x128_32 ( |
4029 | 0 | in_over_1x128 ( |
4030 | 0 | &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
4031 | 0 | } |
4032 | |
|
4033 | 0 | w--; |
4034 | 0 | dst++; |
4035 | 0 | mask++; |
4036 | 0 | } |
4037 | 0 | } |
4038 | |
|
4039 | 0 | } |
4040 | | |
4041 | | static void |
4042 | | sse2_composite_in_n_8_8 (pixman_implementation_t *imp, |
4043 | | pixman_composite_info_t *info) |
4044 | 0 | { |
4045 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4046 | 0 | uint8_t *dst_line, *dst; |
4047 | 0 | uint8_t *mask_line, *mask; |
4048 | 0 | int dst_stride, mask_stride; |
4049 | 0 | uint32_t d; |
4050 | 0 | uint32_t src; |
4051 | 0 | int32_t w; |
4052 | |
|
4053 | 0 | __m128i xmm_alpha; |
4054 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
4055 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
4056 | |
|
4057 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4058 | 0 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
4059 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4060 | 0 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
4061 | |
|
4062 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
4063 | |
|
4064 | 0 | xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); |
4065 | |
|
4066 | 0 | while (height--) |
4067 | 0 | { |
4068 | 0 | dst = dst_line; |
4069 | 0 | dst_line += dst_stride; |
4070 | 0 | mask = mask_line; |
4071 | 0 | mask_line += mask_stride; |
4072 | 0 | w = width; |
4073 | |
|
4074 | 0 | while (w && ((uintptr_t)dst & 15)) |
4075 | 0 | { |
4076 | 0 | uint8_t m = *mask++; |
4077 | 0 | d = (uint32_t) *dst; |
4078 | |
|
4079 | 0 | *dst++ = (uint8_t) pack_1x128_32 ( |
4080 | 0 | pix_multiply_1x128 ( |
4081 | 0 | pix_multiply_1x128 (xmm_alpha, |
4082 | 0 | unpack_32_1x128 (m)), |
4083 | 0 | unpack_32_1x128 (d))); |
4084 | 0 | w--; |
4085 | 0 | } |
4086 | |
|
4087 | 0 | while (w >= 16) |
4088 | 0 | { |
4089 | 0 | xmm_mask = load_128_unaligned ((__m128i*)mask); |
4090 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
4091 | |
|
4092 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
4093 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
4094 | |
|
4095 | 0 | pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, |
4096 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
4097 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
4098 | |
|
4099 | 0 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
4100 | 0 | &xmm_dst_lo, &xmm_dst_hi, |
4101 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
4102 | |
|
4103 | 0 | save_128_aligned ( |
4104 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
4105 | |
|
4106 | 0 | mask += 16; |
4107 | 0 | dst += 16; |
4108 | 0 | w -= 16; |
4109 | 0 | } |
4110 | |
|
4111 | 0 | while (w) |
4112 | 0 | { |
4113 | 0 | uint8_t m = *mask++; |
4114 | 0 | d = (uint32_t) *dst; |
4115 | |
|
4116 | 0 | *dst++ = (uint8_t) pack_1x128_32 ( |
4117 | 0 | pix_multiply_1x128 ( |
4118 | 0 | pix_multiply_1x128 ( |
4119 | 0 | xmm_alpha, unpack_32_1x128 (m)), |
4120 | 0 | unpack_32_1x128 (d))); |
4121 | 0 | w--; |
4122 | 0 | } |
4123 | 0 | } |
4124 | |
|
4125 | 0 | } |
4126 | | |
4127 | | static void |
4128 | | sse2_composite_in_n_8 (pixman_implementation_t *imp, |
4129 | | pixman_composite_info_t *info) |
4130 | 0 | { |
4131 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4132 | 0 | uint8_t *dst_line, *dst; |
4133 | 0 | int dst_stride; |
4134 | 0 | uint32_t d; |
4135 | 0 | uint32_t src; |
4136 | 0 | int32_t w; |
4137 | |
|
4138 | 0 | __m128i xmm_alpha; |
4139 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
4140 | |
|
4141 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4142 | 0 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
4143 | |
|
4144 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
4145 | |
|
4146 | 0 | xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); |
4147 | |
|
4148 | 0 | src = src >> 24; |
4149 | |
|
4150 | 0 | if (src == 0xff) |
4151 | 0 | return; |
4152 | | |
4153 | 0 | if (src == 0x00) |
4154 | 0 | { |
4155 | 0 | pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, |
4156 | 0 | 8, dest_x, dest_y, width, height, src); |
4157 | |
|
4158 | 0 | return; |
4159 | 0 | } |
4160 | | |
4161 | 0 | while (height--) |
4162 | 0 | { |
4163 | 0 | dst = dst_line; |
4164 | 0 | dst_line += dst_stride; |
4165 | 0 | w = width; |
4166 | |
|
4167 | 0 | while (w && ((uintptr_t)dst & 15)) |
4168 | 0 | { |
4169 | 0 | d = (uint32_t) *dst; |
4170 | |
|
4171 | 0 | *dst++ = (uint8_t) pack_1x128_32 ( |
4172 | 0 | pix_multiply_1x128 ( |
4173 | 0 | xmm_alpha, |
4174 | 0 | unpack_32_1x128 (d))); |
4175 | 0 | w--; |
4176 | 0 | } |
4177 | |
|
4178 | 0 | while (w >= 16) |
4179 | 0 | { |
4180 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
4181 | |
|
4182 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
4183 | | |
4184 | 0 | pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, |
4185 | 0 | &xmm_dst_lo, &xmm_dst_hi, |
4186 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
4187 | |
|
4188 | 0 | save_128_aligned ( |
4189 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
4190 | |
|
4191 | 0 | dst += 16; |
4192 | 0 | w -= 16; |
4193 | 0 | } |
4194 | |
|
4195 | 0 | while (w) |
4196 | 0 | { |
4197 | 0 | d = (uint32_t) *dst; |
4198 | |
|
4199 | 0 | *dst++ = (uint8_t) pack_1x128_32 ( |
4200 | 0 | pix_multiply_1x128 ( |
4201 | 0 | xmm_alpha, |
4202 | 0 | unpack_32_1x128 (d))); |
4203 | 0 | w--; |
4204 | 0 | } |
4205 | 0 | } |
4206 | |
|
4207 | 0 | } |
4208 | | |
4209 | | static void |
4210 | | sse2_composite_in_8_8 (pixman_implementation_t *imp, |
4211 | | pixman_composite_info_t *info) |
4212 | 0 | { |
4213 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4214 | 0 | uint8_t *dst_line, *dst; |
4215 | 0 | uint8_t *src_line, *src; |
4216 | 0 | int src_stride, dst_stride; |
4217 | 0 | int32_t w; |
4218 | 0 | uint32_t s, d; |
4219 | |
|
4220 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
4221 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
4222 | |
|
4223 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4224 | 0 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
4225 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4226 | 0 | src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
4227 | |
|
4228 | 0 | while (height--) |
4229 | 0 | { |
4230 | 0 | dst = dst_line; |
4231 | 0 | dst_line += dst_stride; |
4232 | 0 | src = src_line; |
4233 | 0 | src_line += src_stride; |
4234 | 0 | w = width; |
4235 | |
|
4236 | 0 | while (w && ((uintptr_t)dst & 15)) |
4237 | 0 | { |
4238 | 0 | s = (uint32_t) *src++; |
4239 | 0 | d = (uint32_t) *dst; |
4240 | |
|
4241 | 0 | *dst++ = (uint8_t) pack_1x128_32 ( |
4242 | 0 | pix_multiply_1x128 ( |
4243 | 0 | unpack_32_1x128 (s), unpack_32_1x128 (d))); |
4244 | 0 | w--; |
4245 | 0 | } |
4246 | |
|
4247 | 0 | while (w >= 16) |
4248 | 0 | { |
4249 | 0 | xmm_src = load_128_unaligned ((__m128i*)src); |
4250 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
4251 | |
|
4252 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
4253 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
4254 | |
|
4255 | 0 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
4256 | 0 | &xmm_dst_lo, &xmm_dst_hi, |
4257 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
4258 | |
|
4259 | 0 | save_128_aligned ( |
4260 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
4261 | |
|
4262 | 0 | src += 16; |
4263 | 0 | dst += 16; |
4264 | 0 | w -= 16; |
4265 | 0 | } |
4266 | |
|
4267 | 0 | while (w) |
4268 | 0 | { |
4269 | 0 | s = (uint32_t) *src++; |
4270 | 0 | d = (uint32_t) *dst; |
4271 | |
|
4272 | 0 | *dst++ = (uint8_t) pack_1x128_32 ( |
4273 | 0 | pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); |
4274 | 0 | w--; |
4275 | 0 | } |
4276 | 0 | } |
4277 | |
|
4278 | 0 | } |
4279 | | |
4280 | | static void |
4281 | | sse2_composite_add_n_8_8 (pixman_implementation_t *imp, |
4282 | | pixman_composite_info_t *info) |
4283 | 0 | { |
4284 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4285 | 0 | uint8_t *dst_line, *dst; |
4286 | 0 | uint8_t *mask_line, *mask; |
4287 | 0 | int dst_stride, mask_stride; |
4288 | 0 | int32_t w; |
4289 | 0 | uint32_t src; |
4290 | 0 | uint32_t d; |
4291 | |
|
4292 | 0 | __m128i xmm_alpha; |
4293 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
4294 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
4295 | |
|
4296 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4297 | 0 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
4298 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4299 | 0 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
4300 | |
|
4301 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
4302 | |
|
4303 | 0 | xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); |
4304 | |
|
4305 | 0 | while (height--) |
4306 | 0 | { |
4307 | 0 | dst = dst_line; |
4308 | 0 | dst_line += dst_stride; |
4309 | 0 | mask = mask_line; |
4310 | 0 | mask_line += mask_stride; |
4311 | 0 | w = width; |
4312 | |
|
4313 | 0 | while (w && ((uintptr_t)dst & 15)) |
4314 | 0 | { |
4315 | 0 | uint8_t m = *mask++; |
4316 | 0 | d = (uint32_t) *dst; |
4317 | |
|
4318 | 0 | *dst++ = (uint8_t) pack_1x128_32 ( |
4319 | 0 | _mm_adds_epu16 ( |
4320 | 0 | pix_multiply_1x128 ( |
4321 | 0 | xmm_alpha, unpack_32_1x128 (m)), |
4322 | 0 | unpack_32_1x128 (d))); |
4323 | 0 | w--; |
4324 | 0 | } |
4325 | |
|
4326 | 0 | while (w >= 16) |
4327 | 0 | { |
4328 | 0 | xmm_mask = load_128_unaligned ((__m128i*)mask); |
4329 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
4330 | |
|
4331 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
4332 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
4333 | |
|
4334 | 0 | pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, |
4335 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
4336 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
4337 | |
|
4338 | 0 | xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); |
4339 | 0 | xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); |
4340 | |
|
4341 | 0 | save_128_aligned ( |
4342 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
4343 | |
|
4344 | 0 | mask += 16; |
4345 | 0 | dst += 16; |
4346 | 0 | w -= 16; |
4347 | 0 | } |
4348 | |
|
4349 | 0 | while (w) |
4350 | 0 | { |
4351 | 0 | uint8_t m = (uint32_t) *mask++; |
4352 | 0 | d = (uint32_t) *dst; |
4353 | |
|
4354 | 0 | *dst++ = (uint8_t) pack_1x128_32 ( |
4355 | 0 | _mm_adds_epu16 ( |
4356 | 0 | pix_multiply_1x128 ( |
4357 | 0 | xmm_alpha, unpack_32_1x128 (m)), |
4358 | 0 | unpack_32_1x128 (d))); |
4359 | |
|
4360 | 0 | w--; |
4361 | 0 | } |
4362 | 0 | } |
4363 | |
|
4364 | 0 | } |
4365 | | |
4366 | | static void |
4367 | | sse2_composite_add_n_8 (pixman_implementation_t *imp, |
4368 | | pixman_composite_info_t *info) |
4369 | 0 | { |
4370 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4371 | 0 | uint8_t *dst_line, *dst; |
4372 | 0 | int dst_stride; |
4373 | 0 | int32_t w; |
4374 | 0 | uint32_t src; |
4375 | |
|
4376 | 0 | __m128i xmm_src; |
4377 | |
|
4378 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4379 | 0 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
4380 | |
|
4381 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
4382 | |
|
4383 | 0 | src >>= 24; |
4384 | |
|
4385 | 0 | if (src == 0x00) |
4386 | 0 | return; |
4387 | | |
4388 | 0 | if (src == 0xff) |
4389 | 0 | { |
4390 | 0 | pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, |
4391 | 0 | 8, dest_x, dest_y, width, height, 0xff); |
4392 | |
|
4393 | 0 | return; |
4394 | 0 | } |
4395 | | |
4396 | 0 | src = (src << 24) | (src << 16) | (src << 8) | src; |
4397 | 0 | xmm_src = _mm_set_epi32 (src, src, src, src); |
4398 | |
|
4399 | 0 | while (height--) |
4400 | 0 | { |
4401 | 0 | dst = dst_line; |
4402 | 0 | dst_line += dst_stride; |
4403 | 0 | w = width; |
4404 | |
|
4405 | 0 | while (w && ((uintptr_t)dst & 15)) |
4406 | 0 | { |
4407 | 0 | *dst = (uint8_t)_mm_cvtsi128_si32 ( |
4408 | 0 | _mm_adds_epu8 ( |
4409 | 0 | xmm_src, |
4410 | 0 | _mm_cvtsi32_si128 (*dst))); |
4411 | |
|
4412 | 0 | w--; |
4413 | 0 | dst++; |
4414 | 0 | } |
4415 | |
|
4416 | 0 | while (w >= 16) |
4417 | 0 | { |
4418 | 0 | save_128_aligned ( |
4419 | 0 | (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); |
4420 | |
|
4421 | 0 | dst += 16; |
4422 | 0 | w -= 16; |
4423 | 0 | } |
4424 | |
|
4425 | 0 | while (w) |
4426 | 0 | { |
4427 | 0 | *dst = (uint8_t)_mm_cvtsi128_si32 ( |
4428 | 0 | _mm_adds_epu8 ( |
4429 | 0 | xmm_src, |
4430 | 0 | _mm_cvtsi32_si128 (*dst))); |
4431 | |
|
4432 | 0 | w--; |
4433 | 0 | dst++; |
4434 | 0 | } |
4435 | 0 | } |
4436 | |
|
4437 | 0 | } |
4438 | | |
4439 | | static void |
4440 | | sse2_composite_add_8_8 (pixman_implementation_t *imp, |
4441 | | pixman_composite_info_t *info) |
4442 | 0 | { |
4443 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4444 | 0 | uint8_t *dst_line, *dst; |
4445 | 0 | uint8_t *src_line, *src; |
4446 | 0 | int dst_stride, src_stride; |
4447 | 0 | int32_t w; |
4448 | 0 | uint16_t t; |
4449 | |
|
4450 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4451 | 0 | src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
4452 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4453 | 0 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
4454 | |
|
4455 | 0 | while (height--) |
4456 | 0 | { |
4457 | 0 | dst = dst_line; |
4458 | 0 | src = src_line; |
4459 | |
|
4460 | 0 | dst_line += dst_stride; |
4461 | 0 | src_line += src_stride; |
4462 | 0 | w = width; |
4463 | | |
4464 | | /* Small head */ |
4465 | 0 | while (w && (uintptr_t)dst & 3) |
4466 | 0 | { |
4467 | 0 | t = (*dst) + (*src++); |
4468 | 0 | *dst++ = t | (0 - (t >> 8)); |
4469 | 0 | w--; |
4470 | 0 | } |
4471 | |
|
4472 | 0 | sse2_combine_add_u (imp, op, |
4473 | 0 | (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); |
4474 | | |
4475 | | /* Small tail */ |
4476 | 0 | dst += w & 0xfffc; |
4477 | 0 | src += w & 0xfffc; |
4478 | |
|
4479 | 0 | w &= 3; |
4480 | |
|
4481 | 0 | while (w) |
4482 | 0 | { |
4483 | 0 | t = (*dst) + (*src++); |
4484 | 0 | *dst++ = t | (0 - (t >> 8)); |
4485 | 0 | w--; |
4486 | 0 | } |
4487 | 0 | } |
4488 | |
|
4489 | 0 | } |
4490 | | |
4491 | | static void |
4492 | | sse2_composite_add_8888_8888 (pixman_implementation_t *imp, |
4493 | | pixman_composite_info_t *info) |
4494 | 0 | { |
4495 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4496 | 0 | uint32_t *dst_line, *dst; |
4497 | 0 | uint32_t *src_line, *src; |
4498 | 0 | int dst_stride, src_stride; |
4499 | |
|
4500 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4501 | 0 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
4502 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4503 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
4504 | |
|
4505 | 0 | while (height--) |
4506 | 0 | { |
4507 | 0 | dst = dst_line; |
4508 | 0 | dst_line += dst_stride; |
4509 | 0 | src = src_line; |
4510 | 0 | src_line += src_stride; |
4511 | |
|
4512 | 0 | sse2_combine_add_u (imp, op, dst, src, NULL, width); |
4513 | 0 | } |
4514 | 0 | } |
4515 | | |
4516 | | static void |
4517 | | sse2_composite_add_n_8888 (pixman_implementation_t *imp, |
4518 | | pixman_composite_info_t *info) |
4519 | 0 | { |
4520 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4521 | 0 | uint32_t *dst_line, *dst, src; |
4522 | 0 | int dst_stride; |
4523 | |
|
4524 | 0 | __m128i xmm_src; |
4525 | |
|
4526 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
4527 | |
|
4528 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
4529 | 0 | if (src == 0) |
4530 | 0 | return; |
4531 | | |
4532 | 0 | if (src == ~0) |
4533 | 0 | { |
4534 | 0 | pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32, |
4535 | 0 | dest_x, dest_y, width, height, ~0); |
4536 | |
|
4537 | 0 | return; |
4538 | 0 | } |
4539 | | |
4540 | 0 | xmm_src = _mm_set_epi32 (src, src, src, src); |
4541 | 0 | while (height--) |
4542 | 0 | { |
4543 | 0 | int w = width; |
4544 | 0 | uint32_t d; |
4545 | |
|
4546 | 0 | dst = dst_line; |
4547 | 0 | dst_line += dst_stride; |
4548 | |
|
4549 | 0 | while (w && (uintptr_t)dst & 15) |
4550 | 0 | { |
4551 | 0 | d = *dst; |
4552 | 0 | *dst++ = |
4553 | 0 | _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d))); |
4554 | 0 | w--; |
4555 | 0 | } |
4556 | |
|
4557 | 0 | while (w >= 4) |
4558 | 0 | { |
4559 | 0 | save_128_aligned |
4560 | 0 | ((__m128i*)dst, |
4561 | 0 | _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); |
4562 | |
|
4563 | 0 | dst += 4; |
4564 | 0 | w -= 4; |
4565 | 0 | } |
4566 | |
|
4567 | 0 | while (w--) |
4568 | 0 | { |
4569 | 0 | d = *dst; |
4570 | 0 | *dst++ = |
4571 | 0 | _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, |
4572 | 0 | _mm_cvtsi32_si128 (d))); |
4573 | 0 | } |
4574 | 0 | } |
4575 | 0 | } |
4576 | | |
4577 | | static void |
4578 | | sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, |
4579 | | pixman_composite_info_t *info) |
4580 | 0 | { |
4581 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4582 | 0 | uint32_t *dst_line, *dst; |
4583 | 0 | uint8_t *mask_line, *mask; |
4584 | 0 | int dst_stride, mask_stride; |
4585 | 0 | int32_t w; |
4586 | 0 | uint32_t src; |
4587 | |
|
4588 | 0 | __m128i xmm_src; |
4589 | |
|
4590 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
4591 | 0 | if (src == 0) |
4592 | 0 | return; |
4593 | 0 | xmm_src = expand_pixel_32_1x128 (src); |
4594 | |
|
4595 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4596 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
4597 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4598 | 0 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
4599 | |
|
4600 | 0 | while (height--) |
4601 | 0 | { |
4602 | 0 | dst = dst_line; |
4603 | 0 | dst_line += dst_stride; |
4604 | 0 | mask = mask_line; |
4605 | 0 | mask_line += mask_stride; |
4606 | 0 | w = width; |
4607 | |
|
4608 | 0 | while (w && ((uintptr_t)dst & 15)) |
4609 | 0 | { |
4610 | 0 | uint8_t m = *mask++; |
4611 | 0 | if (m) |
4612 | 0 | { |
4613 | 0 | *dst = pack_1x128_32 |
4614 | 0 | (_mm_adds_epu16 |
4615 | 0 | (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), |
4616 | 0 | unpack_32_1x128 (*dst))); |
4617 | 0 | } |
4618 | 0 | dst++; |
4619 | 0 | w--; |
4620 | 0 | } |
4621 | |
|
4622 | 0 | while (w >= 4) |
4623 | 0 | { |
4624 | 0 | uint32_t m; |
4625 | 0 | memcpy(&m, mask, sizeof(uint32_t)); |
4626 | |
|
4627 | 0 | if (m) |
4628 | 0 | { |
4629 | 0 | __m128i xmm_mask_lo, xmm_mask_hi; |
4630 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
4631 | |
|
4632 | 0 | __m128i xmm_dst = load_128_aligned ((__m128i*)dst); |
4633 | 0 | __m128i xmm_mask = |
4634 | 0 | _mm_unpacklo_epi8 (unpack_32_1x128(m), |
4635 | 0 | _mm_setzero_si128 ()); |
4636 | |
|
4637 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
4638 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
4639 | |
|
4640 | 0 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
4641 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
4642 | |
|
4643 | 0 | pix_multiply_2x128 (&xmm_src, &xmm_src, |
4644 | 0 | &xmm_mask_lo, &xmm_mask_hi, |
4645 | 0 | &xmm_mask_lo, &xmm_mask_hi); |
4646 | |
|
4647 | 0 | xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); |
4648 | 0 | xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); |
4649 | |
|
4650 | 0 | save_128_aligned ( |
4651 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
4652 | 0 | } |
4653 | |
|
4654 | 0 | w -= 4; |
4655 | 0 | dst += 4; |
4656 | 0 | mask += 4; |
4657 | 0 | } |
4658 | |
|
4659 | 0 | while (w) |
4660 | 0 | { |
4661 | 0 | uint8_t m = *mask++; |
4662 | 0 | if (m) |
4663 | 0 | { |
4664 | 0 | *dst = pack_1x128_32 |
4665 | 0 | (_mm_adds_epu16 |
4666 | 0 | (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), |
4667 | 0 | unpack_32_1x128 (*dst))); |
4668 | 0 | } |
4669 | 0 | dst++; |
4670 | 0 | w--; |
4671 | 0 | } |
4672 | 0 | } |
4673 | 0 | } |
4674 | | |
4675 | | static pixman_bool_t |
4676 | | sse2_blt (pixman_implementation_t *imp, |
4677 | | uint32_t * src_bits, |
4678 | | uint32_t * dst_bits, |
4679 | | int src_stride, |
4680 | | int dst_stride, |
4681 | | int src_bpp, |
4682 | | int dst_bpp, |
4683 | | int src_x, |
4684 | | int src_y, |
4685 | | int dest_x, |
4686 | | int dest_y, |
4687 | | int width, |
4688 | | int height) |
4689 | 0 | { |
4690 | 0 | uint8_t * src_bytes; |
4691 | 0 | uint8_t * dst_bytes; |
4692 | 0 | int byte_width; |
4693 | |
|
4694 | 0 | if (src_bpp != dst_bpp) |
4695 | 0 | return FALSE; |
4696 | | |
4697 | 0 | if (src_bpp == 16) |
4698 | 0 | { |
4699 | 0 | src_stride = src_stride * (int) sizeof (uint32_t) / 2; |
4700 | 0 | dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; |
4701 | 0 | src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); |
4702 | 0 | dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
4703 | 0 | byte_width = 2 * width; |
4704 | 0 | src_stride *= 2; |
4705 | 0 | dst_stride *= 2; |
4706 | 0 | } |
4707 | 0 | else if (src_bpp == 32) |
4708 | 0 | { |
4709 | 0 | src_stride = src_stride * (int) sizeof (uint32_t) / 4; |
4710 | 0 | dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; |
4711 | 0 | src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); |
4712 | 0 | dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
4713 | 0 | byte_width = 4 * width; |
4714 | 0 | src_stride *= 4; |
4715 | 0 | dst_stride *= 4; |
4716 | 0 | } |
4717 | 0 | else |
4718 | 0 | { |
4719 | 0 | return FALSE; |
4720 | 0 | } |
4721 | | |
4722 | 0 | while (height--) |
4723 | 0 | { |
4724 | 0 | int w; |
4725 | 0 | uint8_t *s = src_bytes; |
4726 | 0 | uint8_t *d = dst_bytes; |
4727 | 0 | src_bytes += src_stride; |
4728 | 0 | dst_bytes += dst_stride; |
4729 | 0 | w = byte_width; |
4730 | |
|
4731 | 0 | while (w >= 2 && ((uintptr_t)d & 3)) |
4732 | 0 | { |
4733 | 0 | memmove(d, s, 2); |
4734 | 0 | w -= 2; |
4735 | 0 | s += 2; |
4736 | 0 | d += 2; |
4737 | 0 | } |
4738 | |
|
4739 | 0 | while (w >= 4 && ((uintptr_t)d & 15)) |
4740 | 0 | { |
4741 | 0 | memmove(d, s, 4); |
4742 | |
|
4743 | 0 | w -= 4; |
4744 | 0 | s += 4; |
4745 | 0 | d += 4; |
4746 | 0 | } |
4747 | |
|
4748 | 0 | while (w >= 64) |
4749 | 0 | { |
4750 | 0 | __m128i xmm0, xmm1, xmm2, xmm3; |
4751 | |
|
4752 | 0 | xmm0 = load_128_unaligned ((__m128i*)(s)); |
4753 | 0 | xmm1 = load_128_unaligned ((__m128i*)(s + 16)); |
4754 | 0 | xmm2 = load_128_unaligned ((__m128i*)(s + 32)); |
4755 | 0 | xmm3 = load_128_unaligned ((__m128i*)(s + 48)); |
4756 | |
|
4757 | 0 | save_128_aligned ((__m128i*)(d), xmm0); |
4758 | 0 | save_128_aligned ((__m128i*)(d + 16), xmm1); |
4759 | 0 | save_128_aligned ((__m128i*)(d + 32), xmm2); |
4760 | 0 | save_128_aligned ((__m128i*)(d + 48), xmm3); |
4761 | |
|
4762 | 0 | s += 64; |
4763 | 0 | d += 64; |
4764 | 0 | w -= 64; |
4765 | 0 | } |
4766 | |
|
4767 | 0 | while (w >= 16) |
4768 | 0 | { |
4769 | 0 | save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); |
4770 | |
|
4771 | 0 | w -= 16; |
4772 | 0 | d += 16; |
4773 | 0 | s += 16; |
4774 | 0 | } |
4775 | |
|
4776 | 0 | while (w >= 4) |
4777 | 0 | { |
4778 | 0 | memmove(d, s, 4); |
4779 | |
|
4780 | 0 | w -= 4; |
4781 | 0 | s += 4; |
4782 | 0 | d += 4; |
4783 | 0 | } |
4784 | |
|
4785 | 0 | if (w >= 2) |
4786 | 0 | { |
4787 | 0 | memmove(d, s, 2); |
4788 | 0 | w -= 2; |
4789 | 0 | s += 2; |
4790 | 0 | d += 2; |
4791 | 0 | } |
4792 | 0 | } |
4793 | |
|
4794 | 0 | return TRUE; |
4795 | 0 | } |
4796 | | |
4797 | | static void |
4798 | | sse2_composite_copy_area (pixman_implementation_t *imp, |
4799 | | pixman_composite_info_t *info) |
4800 | 0 | { |
4801 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4802 | 0 | sse2_blt (imp, src_image->bits.bits, |
4803 | 0 | dest_image->bits.bits, |
4804 | 0 | src_image->bits.rowstride, |
4805 | 0 | dest_image->bits.rowstride, |
4806 | 0 | PIXMAN_FORMAT_BPP (src_image->bits.format), |
4807 | 0 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
4808 | 0 | src_x, src_y, dest_x, dest_y, width, height); |
4809 | 0 | } |
4810 | | |
4811 | | static void |
4812 | | sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, |
4813 | | pixman_composite_info_t *info) |
4814 | 0 | { |
4815 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
4816 | 0 | uint32_t *src, *src_line, s; |
4817 | 0 | uint32_t *dst, *dst_line, d; |
4818 | 0 | uint8_t *mask, *mask_line; |
4819 | 0 | int src_stride, mask_stride, dst_stride; |
4820 | 0 | int32_t w; |
4821 | 0 | __m128i ms; |
4822 | |
|
4823 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
4824 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
4825 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
4826 | |
|
4827 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4828 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
4829 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4830 | 0 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
4831 | 0 | PIXMAN_IMAGE_GET_LINE ( |
4832 | 0 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
4833 | |
|
4834 | 0 | while (height--) |
4835 | 0 | { |
4836 | 0 | src = src_line; |
4837 | 0 | src_line += src_stride; |
4838 | 0 | dst = dst_line; |
4839 | 0 | dst_line += dst_stride; |
4840 | 0 | mask = mask_line; |
4841 | 0 | mask_line += mask_stride; |
4842 | |
|
4843 | 0 | w = width; |
4844 | |
|
4845 | 0 | while (w && (uintptr_t)dst & 15) |
4846 | 0 | { |
4847 | 0 | uint8_t m = *mask++; |
4848 | 0 | s = 0xff000000 | *src++; |
4849 | 0 | d = *dst; |
4850 | 0 | ms = unpack_32_1x128 (s); |
4851 | |
|
4852 | 0 | if (m != 0xff) |
4853 | 0 | { |
4854 | 0 | __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
4855 | 0 | __m128i md = unpack_32_1x128 (d); |
4856 | |
|
4857 | 0 | ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); |
4858 | 0 | } |
4859 | |
|
4860 | 0 | *dst++ = pack_1x128_32 (ms); |
4861 | 0 | w--; |
4862 | 0 | } |
4863 | |
|
4864 | 0 | while (w >= 4) |
4865 | 0 | { |
4866 | 0 | uint32_t m; |
4867 | 0 | memcpy(&m, mask, sizeof(uint32_t)); |
4868 | 0 | xmm_src = _mm_or_si128 ( |
4869 | 0 | load_128_unaligned ((__m128i*)src), mask_ff000000); |
4870 | |
|
4871 | 0 | if (m == 0xffffffff) |
4872 | 0 | { |
4873 | 0 | save_128_aligned ((__m128i*)dst, xmm_src); |
4874 | 0 | } |
4875 | 0 | else |
4876 | 0 | { |
4877 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
4878 | |
|
4879 | 0 | xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
4880 | |
|
4881 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
4882 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
4883 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
4884 | |
|
4885 | 0 | expand_alpha_rev_2x128 ( |
4886 | 0 | xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
4887 | |
|
4888 | 0 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
4889 | 0 | &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, |
4890 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
4891 | |
|
4892 | 0 | save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
4893 | 0 | } |
4894 | |
|
4895 | 0 | src += 4; |
4896 | 0 | dst += 4; |
4897 | 0 | mask += 4; |
4898 | 0 | w -= 4; |
4899 | 0 | } |
4900 | |
|
4901 | 0 | while (w) |
4902 | 0 | { |
4903 | 0 | uint8_t m = *mask++; |
4904 | |
|
4905 | 0 | if (m) |
4906 | 0 | { |
4907 | 0 | s = 0xff000000 | *src; |
4908 | |
|
4909 | 0 | if (m == 0xff) |
4910 | 0 | { |
4911 | 0 | *dst = s; |
4912 | 0 | } |
4913 | 0 | else |
4914 | 0 | { |
4915 | 0 | __m128i ma, md, ms; |
4916 | |
|
4917 | 0 | d = *dst; |
4918 | |
|
4919 | 0 | ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
4920 | 0 | md = unpack_32_1x128 (d); |
4921 | 0 | ms = unpack_32_1x128 (s); |
4922 | |
|
4923 | 0 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); |
4924 | 0 | } |
4925 | |
|
4926 | 0 | } |
4927 | |
|
4928 | 0 | src++; |
4929 | 0 | dst++; |
4930 | 0 | w--; |
4931 | 0 | } |
4932 | 0 | } |
4933 | |
|
4934 | 0 | } |
4935 | | |
4936 | | static void |
4937 | | sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, |
4938 | | pixman_composite_info_t *info) |
4939 | 4 | { |
4940 | 4 | PIXMAN_COMPOSITE_ARGS (info); |
4941 | 4 | uint32_t *src, *src_line, s; |
4942 | 4 | uint32_t *dst, *dst_line, d; |
4943 | 4 | uint8_t *mask, *mask_line; |
4944 | 4 | int src_stride, mask_stride, dst_stride; |
4945 | 4 | int32_t w; |
4946 | | |
4947 | 4 | __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; |
4948 | 4 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
4949 | 4 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
4950 | | |
4951 | 4 | PIXMAN_IMAGE_GET_LINE ( |
4952 | 4 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
4953 | 4 | PIXMAN_IMAGE_GET_LINE ( |
4954 | 4 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
4955 | 4 | PIXMAN_IMAGE_GET_LINE ( |
4956 | 4 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
4957 | | |
4958 | 39 | while (height--) |
4959 | 35 | { |
4960 | 35 | src = src_line; |
4961 | 35 | src_line += src_stride; |
4962 | 35 | dst = dst_line; |
4963 | 35 | dst_line += dst_stride; |
4964 | 35 | mask = mask_line; |
4965 | 35 | mask_line += mask_stride; |
4966 | | |
4967 | 35 | w = width; |
4968 | | |
4969 | 67 | while (w && (uintptr_t)dst & 15) |
4970 | 32 | { |
4971 | 32 | uint32_t sa; |
4972 | 32 | uint8_t m = *mask++; |
4973 | | |
4974 | 32 | s = *src++; |
4975 | 32 | d = *dst; |
4976 | | |
4977 | 32 | sa = s >> 24; |
4978 | | |
4979 | 32 | if (m) |
4980 | 32 | { |
4981 | 32 | if (sa == 0xff && m == 0xff) |
4982 | 0 | { |
4983 | 0 | *dst = s; |
4984 | 0 | } |
4985 | 32 | else |
4986 | 32 | { |
4987 | 32 | __m128i ms, md, ma, msa; |
4988 | | |
4989 | 32 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
4990 | 32 | ms = unpack_32_1x128 (s); |
4991 | 32 | md = unpack_32_1x128 (d); |
4992 | | |
4993 | 32 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
4994 | | |
4995 | 32 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
4996 | 32 | } |
4997 | 32 | } |
4998 | | |
4999 | 32 | dst++; |
5000 | 32 | w--; |
5001 | 32 | } |
5002 | | |
5003 | 45 | while (w >= 4) |
5004 | 10 | { |
5005 | 10 | uint32_t m; |
5006 | 10 | memcpy(&m, mask, sizeof(uint32_t)); |
5007 | | |
5008 | 10 | if (m) |
5009 | 10 | { |
5010 | 10 | xmm_src = load_128_unaligned ((__m128i*)src); |
5011 | | |
5012 | 10 | if (m == 0xffffffff && is_opaque (xmm_src)) |
5013 | 0 | { |
5014 | 0 | save_128_aligned ((__m128i *)dst, xmm_src); |
5015 | 0 | } |
5016 | 10 | else |
5017 | 10 | { |
5018 | 10 | xmm_dst = load_128_aligned ((__m128i *)dst); |
5019 | | |
5020 | 10 | xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
5021 | | |
5022 | 10 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
5023 | 10 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
5024 | 10 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
5025 | | |
5026 | 10 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); |
5027 | 10 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
5028 | | |
5029 | 10 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, |
5030 | 10 | &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); |
5031 | | |
5032 | 10 | save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
5033 | 10 | } |
5034 | 10 | } |
5035 | | |
5036 | 10 | src += 4; |
5037 | 10 | dst += 4; |
5038 | 10 | mask += 4; |
5039 | 10 | w -= 4; |
5040 | 10 | } |
5041 | | |
5042 | 42 | while (w) |
5043 | 7 | { |
5044 | 7 | uint32_t sa; |
5045 | 7 | uint8_t m = *mask++; |
5046 | | |
5047 | 7 | s = *src++; |
5048 | 7 | d = *dst; |
5049 | | |
5050 | 7 | sa = s >> 24; |
5051 | | |
5052 | 7 | if (m) |
5053 | 7 | { |
5054 | 7 | if (sa == 0xff && m == 0xff) |
5055 | 0 | { |
5056 | 0 | *dst = s; |
5057 | 0 | } |
5058 | 7 | else |
5059 | 7 | { |
5060 | 7 | __m128i ms, md, ma, msa; |
5061 | | |
5062 | 7 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
5063 | 7 | ms = unpack_32_1x128 (s); |
5064 | 7 | md = unpack_32_1x128 (d); |
5065 | | |
5066 | 7 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
5067 | | |
5068 | 7 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
5069 | 7 | } |
5070 | 7 | } |
5071 | | |
5072 | 7 | dst++; |
5073 | 7 | w--; |
5074 | 7 | } |
5075 | 35 | } |
5076 | | |
5077 | 4 | } |
5078 | | |
5079 | | static void |
5080 | | sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, |
5081 | | pixman_composite_info_t *info) |
5082 | 0 | { |
5083 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
5084 | 0 | uint32_t src; |
5085 | 0 | uint32_t *dst_line, *dst; |
5086 | 0 | __m128i xmm_src; |
5087 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
5088 | 0 | __m128i xmm_dsta_hi, xmm_dsta_lo; |
5089 | 0 | int dst_stride; |
5090 | 0 | int32_t w; |
5091 | |
|
5092 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
5093 | |
|
5094 | 0 | if (src == 0) |
5095 | 0 | return; |
5096 | | |
5097 | 0 | PIXMAN_IMAGE_GET_LINE ( |
5098 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
5099 | |
|
5100 | 0 | xmm_src = expand_pixel_32_1x128 (src); |
5101 | |
|
5102 | 0 | while (height--) |
5103 | 0 | { |
5104 | 0 | dst = dst_line; |
5105 | |
|
5106 | 0 | dst_line += dst_stride; |
5107 | 0 | w = width; |
5108 | |
|
5109 | 0 | while (w && (uintptr_t)dst & 15) |
5110 | 0 | { |
5111 | 0 | __m128i vd; |
5112 | |
|
5113 | 0 | vd = unpack_32_1x128 (*dst); |
5114 | |
|
5115 | 0 | *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), |
5116 | 0 | xmm_src)); |
5117 | 0 | w--; |
5118 | 0 | dst++; |
5119 | 0 | } |
5120 | |
|
5121 | 0 | while (w >= 4) |
5122 | 0 | { |
5123 | 0 | __m128i tmp_lo, tmp_hi; |
5124 | |
|
5125 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
5126 | |
|
5127 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
5128 | 0 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); |
5129 | |
|
5130 | 0 | tmp_lo = xmm_src; |
5131 | 0 | tmp_hi = xmm_src; |
5132 | |
|
5133 | 0 | over_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
5134 | 0 | &xmm_dsta_lo, &xmm_dsta_hi, |
5135 | 0 | &tmp_lo, &tmp_hi); |
5136 | |
|
5137 | 0 | save_128_aligned ( |
5138 | 0 | (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); |
5139 | |
|
5140 | 0 | w -= 4; |
5141 | 0 | dst += 4; |
5142 | 0 | } |
5143 | |
|
5144 | 0 | while (w) |
5145 | 0 | { |
5146 | 0 | __m128i vd; |
5147 | |
|
5148 | 0 | vd = unpack_32_1x128 (*dst); |
5149 | |
|
5150 | 0 | *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), |
5151 | 0 | xmm_src)); |
5152 | 0 | w--; |
5153 | 0 | dst++; |
5154 | 0 | } |
5155 | |
|
5156 | 0 | } |
5157 | |
|
5158 | 0 | } |
5159 | | |
5160 | | static void |
5161 | | sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, |
5162 | | pixman_composite_info_t *info) |
5163 | 0 | { |
5164 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
5165 | 0 | uint32_t *src, *src_line, s; |
5166 | 0 | uint32_t *dst, *dst_line, d; |
5167 | 0 | uint32_t *mask, *mask_line; |
5168 | 0 | uint32_t m; |
5169 | 0 | int src_stride, mask_stride, dst_stride; |
5170 | 0 | int32_t w; |
5171 | |
|
5172 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; |
5173 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
5174 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
5175 | |
|
5176 | 0 | PIXMAN_IMAGE_GET_LINE ( |
5177 | 0 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
5178 | 0 | PIXMAN_IMAGE_GET_LINE ( |
5179 | 0 | mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
5180 | 0 | PIXMAN_IMAGE_GET_LINE ( |
5181 | 0 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
5182 | |
|
5183 | 0 | while (height--) |
5184 | 0 | { |
5185 | 0 | src = src_line; |
5186 | 0 | src_line += src_stride; |
5187 | 0 | dst = dst_line; |
5188 | 0 | dst_line += dst_stride; |
5189 | 0 | mask = mask_line; |
5190 | 0 | mask_line += mask_stride; |
5191 | |
|
5192 | 0 | w = width; |
5193 | |
|
5194 | 0 | while (w && (uintptr_t)dst & 15) |
5195 | 0 | { |
5196 | 0 | uint32_t sa; |
5197 | |
|
5198 | 0 | s = *src++; |
5199 | 0 | m = (*mask++) >> 24; |
5200 | 0 | d = *dst; |
5201 | |
|
5202 | 0 | sa = s >> 24; |
5203 | |
|
5204 | 0 | if (m) |
5205 | 0 | { |
5206 | 0 | if (sa == 0xff && m == 0xff) |
5207 | 0 | { |
5208 | 0 | *dst = s; |
5209 | 0 | } |
5210 | 0 | else |
5211 | 0 | { |
5212 | 0 | __m128i ms, md, ma, msa; |
5213 | |
|
5214 | 0 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
5215 | 0 | ms = unpack_32_1x128 (s); |
5216 | 0 | md = unpack_32_1x128 (d); |
5217 | |
|
5218 | 0 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
5219 | |
|
5220 | 0 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
5221 | 0 | } |
5222 | 0 | } |
5223 | |
|
5224 | 0 | dst++; |
5225 | 0 | w--; |
5226 | 0 | } |
5227 | |
|
5228 | 0 | while (w >= 4) |
5229 | 0 | { |
5230 | 0 | xmm_mask = load_128_unaligned ((__m128i*)mask); |
5231 | |
|
5232 | 0 | if (!is_transparent (xmm_mask)) |
5233 | 0 | { |
5234 | 0 | xmm_src = load_128_unaligned ((__m128i*)src); |
5235 | |
|
5236 | 0 | if (is_opaque (xmm_mask) && is_opaque (xmm_src)) |
5237 | 0 | { |
5238 | 0 | save_128_aligned ((__m128i *)dst, xmm_src); |
5239 | 0 | } |
5240 | 0 | else |
5241 | 0 | { |
5242 | 0 | xmm_dst = load_128_aligned ((__m128i *)dst); |
5243 | |
|
5244 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
5245 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
5246 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
5247 | |
|
5248 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); |
5249 | 0 | expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
5250 | |
|
5251 | 0 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, |
5252 | 0 | &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); |
5253 | |
|
5254 | 0 | save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
5255 | 0 | } |
5256 | 0 | } |
5257 | |
|
5258 | 0 | src += 4; |
5259 | 0 | dst += 4; |
5260 | 0 | mask += 4; |
5261 | 0 | w -= 4; |
5262 | 0 | } |
5263 | |
|
5264 | 0 | while (w) |
5265 | 0 | { |
5266 | 0 | uint32_t sa; |
5267 | |
|
5268 | 0 | s = *src++; |
5269 | 0 | m = (*mask++) >> 24; |
5270 | 0 | d = *dst; |
5271 | |
|
5272 | 0 | sa = s >> 24; |
5273 | |
|
5274 | 0 | if (m) |
5275 | 0 | { |
5276 | 0 | if (sa == 0xff && m == 0xff) |
5277 | 0 | { |
5278 | 0 | *dst = s; |
5279 | 0 | } |
5280 | 0 | else |
5281 | 0 | { |
5282 | 0 | __m128i ms, md, ma, msa; |
5283 | |
|
5284 | 0 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
5285 | 0 | ms = unpack_32_1x128 (s); |
5286 | 0 | md = unpack_32_1x128 (d); |
5287 | |
|
5288 | 0 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
5289 | |
|
5290 | 0 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
5291 | 0 | } |
5292 | 0 | } |
5293 | |
|
5294 | 0 | dst++; |
5295 | 0 | w--; |
5296 | 0 | } |
5297 | 0 | } |
5298 | |
|
5299 | 0 | } |
5300 | | |
5301 | | /* A variant of 'sse2_combine_over_u' with minor tweaks */ |
5302 | | static force_inline void |
5303 | | scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, |
5304 | | const uint32_t* ps, |
5305 | | int32_t w, |
5306 | | pixman_fixed_t vx, |
5307 | | pixman_fixed_t unit_x, |
5308 | | pixman_fixed_t src_width_fixed, |
5309 | | pixman_bool_t fully_transparent_src) |
5310 | 0 | { |
5311 | 0 | uint32_t s, d; |
5312 | 0 | const uint32_t* pm = NULL; |
5313 | |
|
5314 | 0 | __m128i xmm_dst_lo, xmm_dst_hi; |
5315 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
5316 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
5317 | |
|
5318 | 0 | if (fully_transparent_src) |
5319 | 0 | return; |
5320 | | |
5321 | | /* Align dst on a 16-byte boundary */ |
5322 | 0 | while (w && ((uintptr_t)pd & 15)) |
5323 | 0 | { |
5324 | 0 | d = *pd; |
5325 | 0 | s = combine1 (ps + pixman_fixed_to_int (vx), pm); |
5326 | 0 | vx += unit_x; |
5327 | 0 | while (vx >= 0) |
5328 | 0 | vx -= src_width_fixed; |
5329 | |
|
5330 | 0 | *pd++ = core_combine_over_u_pixel_sse2 (s, d); |
5331 | 0 | if (pm) |
5332 | 0 | pm++; |
5333 | 0 | w--; |
5334 | 0 | } |
5335 | |
|
5336 | 0 | while (w >= 4) |
5337 | 0 | { |
5338 | 0 | __m128i tmp; |
5339 | 0 | uint32_t tmp1, tmp2, tmp3, tmp4; |
5340 | |
|
5341 | 0 | tmp1 = *(ps + pixman_fixed_to_int (vx)); |
5342 | 0 | vx += unit_x; |
5343 | 0 | while (vx >= 0) |
5344 | 0 | vx -= src_width_fixed; |
5345 | 0 | tmp2 = *(ps + pixman_fixed_to_int (vx)); |
5346 | 0 | vx += unit_x; |
5347 | 0 | while (vx >= 0) |
5348 | 0 | vx -= src_width_fixed; |
5349 | 0 | tmp3 = *(ps + pixman_fixed_to_int (vx)); |
5350 | 0 | vx += unit_x; |
5351 | 0 | while (vx >= 0) |
5352 | 0 | vx -= src_width_fixed; |
5353 | 0 | tmp4 = *(ps + pixman_fixed_to_int (vx)); |
5354 | 0 | vx += unit_x; |
5355 | 0 | while (vx >= 0) |
5356 | 0 | vx -= src_width_fixed; |
5357 | |
|
5358 | 0 | tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); |
5359 | |
|
5360 | 0 | xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); |
5361 | |
|
5362 | 0 | if (is_opaque (xmm_src_hi)) |
5363 | 0 | { |
5364 | 0 | save_128_aligned ((__m128i*)pd, xmm_src_hi); |
5365 | 0 | } |
5366 | 0 | else if (!is_zero (xmm_src_hi)) |
5367 | 0 | { |
5368 | 0 | xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
5369 | |
|
5370 | 0 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
5371 | 0 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
5372 | |
|
5373 | 0 | expand_alpha_2x128 ( |
5374 | 0 | xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); |
5375 | |
|
5376 | 0 | over_2x128 (&xmm_src_lo, &xmm_src_hi, |
5377 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
5378 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
5379 | | |
5380 | | /* rebuid the 4 pixel data and save*/ |
5381 | 0 | save_128_aligned ((__m128i*)pd, |
5382 | 0 | pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
5383 | 0 | } |
5384 | |
|
5385 | 0 | w -= 4; |
5386 | 0 | pd += 4; |
5387 | 0 | if (pm) |
5388 | 0 | pm += 4; |
5389 | 0 | } |
5390 | |
|
5391 | 0 | while (w) |
5392 | 0 | { |
5393 | 0 | d = *pd; |
5394 | 0 | s = combine1 (ps + pixman_fixed_to_int (vx), pm); |
5395 | 0 | vx += unit_x; |
5396 | 0 | while (vx >= 0) |
5397 | 0 | vx -= src_width_fixed; |
5398 | |
|
5399 | 0 | *pd++ = core_combine_over_u_pixel_sse2 (s, d); |
5400 | 0 | if (pm) |
5401 | 0 | pm++; |
5402 | |
|
5403 | 0 | w--; |
5404 | 0 | } |
5405 | 0 | } |
5406 | | |
5407 | | FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, |
5408 | | scaled_nearest_scanline_sse2_8888_8888_OVER, |
5409 | | uint32_t, uint32_t, COVER) |
5410 | | FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, |
5411 | | scaled_nearest_scanline_sse2_8888_8888_OVER, |
5412 | | uint32_t, uint32_t, NONE) |
5413 | | FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, |
5414 | | scaled_nearest_scanline_sse2_8888_8888_OVER, |
5415 | | uint32_t, uint32_t, PAD) |
5416 | | FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, |
5417 | | scaled_nearest_scanline_sse2_8888_8888_OVER, |
5418 | | uint32_t, uint32_t, NORMAL) |
5419 | | |
5420 | | static force_inline void |
5421 | | scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, |
5422 | | uint32_t * dst, |
5423 | | const uint32_t * src, |
5424 | | int32_t w, |
5425 | | pixman_fixed_t vx, |
5426 | | pixman_fixed_t unit_x, |
5427 | | pixman_fixed_t src_width_fixed, |
5428 | | pixman_bool_t zero_src) |
5429 | 0 | { |
5430 | 0 | __m128i xmm_mask; |
5431 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
5432 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
5433 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
5434 | |
|
5435 | 0 | if (zero_src || (*mask >> 24) == 0) |
5436 | 0 | return; |
5437 | | |
5438 | 0 | xmm_mask = create_mask_16_128 (*mask >> 24); |
5439 | |
|
5440 | 0 | while (w && (uintptr_t)dst & 15) |
5441 | 0 | { |
5442 | 0 | uint32_t s = *(src + pixman_fixed_to_int (vx)); |
5443 | 0 | vx += unit_x; |
5444 | 0 | while (vx >= 0) |
5445 | 0 | vx -= src_width_fixed; |
5446 | |
|
5447 | 0 | if (s) |
5448 | 0 | { |
5449 | 0 | uint32_t d = *dst; |
5450 | |
|
5451 | 0 | __m128i ms = unpack_32_1x128 (s); |
5452 | 0 | __m128i alpha = expand_alpha_1x128 (ms); |
5453 | 0 | __m128i dest = xmm_mask; |
5454 | 0 | __m128i alpha_dst = unpack_32_1x128 (d); |
5455 | |
|
5456 | 0 | *dst = pack_1x128_32 ( |
5457 | 0 | in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
5458 | 0 | } |
5459 | 0 | dst++; |
5460 | 0 | w--; |
5461 | 0 | } |
5462 | |
|
5463 | 0 | while (w >= 4) |
5464 | 0 | { |
5465 | 0 | uint32_t tmp1, tmp2, tmp3, tmp4; |
5466 | |
|
5467 | 0 | tmp1 = *(src + pixman_fixed_to_int (vx)); |
5468 | 0 | vx += unit_x; |
5469 | 0 | while (vx >= 0) |
5470 | 0 | vx -= src_width_fixed; |
5471 | 0 | tmp2 = *(src + pixman_fixed_to_int (vx)); |
5472 | 0 | vx += unit_x; |
5473 | 0 | while (vx >= 0) |
5474 | 0 | vx -= src_width_fixed; |
5475 | 0 | tmp3 = *(src + pixman_fixed_to_int (vx)); |
5476 | 0 | vx += unit_x; |
5477 | 0 | while (vx >= 0) |
5478 | 0 | vx -= src_width_fixed; |
5479 | 0 | tmp4 = *(src + pixman_fixed_to_int (vx)); |
5480 | 0 | vx += unit_x; |
5481 | 0 | while (vx >= 0) |
5482 | 0 | vx -= src_width_fixed; |
5483 | |
|
5484 | 0 | xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); |
5485 | |
|
5486 | 0 | if (!is_zero (xmm_src)) |
5487 | 0 | { |
5488 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
5489 | |
|
5490 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
5491 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
5492 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
5493 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
5494 | |
|
5495 | 0 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
5496 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
5497 | 0 | &xmm_mask, &xmm_mask, |
5498 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
5499 | |
|
5500 | 0 | save_128_aligned ( |
5501 | 0 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
5502 | 0 | } |
5503 | |
|
5504 | 0 | dst += 4; |
5505 | 0 | w -= 4; |
5506 | 0 | } |
5507 | |
|
5508 | 0 | while (w) |
5509 | 0 | { |
5510 | 0 | uint32_t s = *(src + pixman_fixed_to_int (vx)); |
5511 | 0 | vx += unit_x; |
5512 | 0 | while (vx >= 0) |
5513 | 0 | vx -= src_width_fixed; |
5514 | |
|
5515 | 0 | if (s) |
5516 | 0 | { |
5517 | 0 | uint32_t d = *dst; |
5518 | |
|
5519 | 0 | __m128i ms = unpack_32_1x128 (s); |
5520 | 0 | __m128i alpha = expand_alpha_1x128 (ms); |
5521 | 0 | __m128i mask = xmm_mask; |
5522 | 0 | __m128i dest = unpack_32_1x128 (d); |
5523 | |
|
5524 | 0 | *dst = pack_1x128_32 ( |
5525 | 0 | in_over_1x128 (&ms, &alpha, &mask, &dest)); |
5526 | 0 | } |
5527 | |
|
5528 | 0 | dst++; |
5529 | 0 | w--; |
5530 | 0 | } |
5531 | |
|
5532 | 0 | } |
5533 | | |
5534 | | FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, |
5535 | | scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
5536 | | uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) |
5537 | | FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, |
5538 | | scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
5539 | | uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) |
5540 | | FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, |
5541 | | scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
5542 | | uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) |
5543 | | FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, |
5544 | | scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
5545 | | uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) |
5546 | | |
5547 | | #if PSHUFD_IS_FAST |
5548 | | |
5549 | | /***********************************************************************************/ |
5550 | | |
5551 | | # define BILINEAR_DECLARE_VARIABLES \ |
5552 | | const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ |
5553 | | const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ |
5554 | | const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ |
5555 | | const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \ |
5556 | | unit_x, -unit_x, unit_x, -unit_x); \ |
5557 | | const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \ |
5558 | | unit_x * 4, -unit_x * 4, \ |
5559 | | unit_x * 4, -unit_x * 4, \ |
5560 | | unit_x * 4, -unit_x * 4); \ |
5561 | | const __m128i xmm_zero = _mm_setzero_si128 (); \ |
5562 | | __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \ |
5563 | | vx + unit_x * 2, -(vx + 1) - unit_x * 2, \ |
5564 | | vx + unit_x * 1, -(vx + 1) - unit_x * 1, \ |
5565 | | vx + unit_x * 0, -(vx + 1) - unit_x * 0); \ |
5566 | | __m128i xmm_wh_state; |
5567 | | |
5568 | | #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \ |
5569 | | do { \ |
5570 | | int phase = phase_; \ |
5571 | | __m128i xmm_wh, xmm_a, xmm_b; \ |
5572 | | /* fetch 2x2 pixel block into sse2 registers */ \ |
5573 | | __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \ |
5574 | | __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \ |
5575 | | vx += unit_x; \ |
5576 | | /* vertical interpolation */ \ |
5577 | | xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \ |
5578 | | xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \ |
5579 | | xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \ |
5580 | | /* calculate horizontal weights */ \ |
5581 | | if (phase <= 0) \ |
5582 | | { \ |
5583 | | xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \ |
5584 | | 16 - BILINEAR_INTERPOLATION_BITS)); \ |
5585 | | xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \ |
5586 | | phase = 0; \ |
5587 | | } \ |
5588 | | xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \ |
5589 | | phase, phase)); \ |
5590 | | /* horizontal interpolation */ \ |
5591 | | xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ |
5592 | | xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \ |
5593 | | /* shift the result */ \ |
5594 | | pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \ |
5595 | | } while (0) |
5596 | | |
5597 | | #else /************************************************************************/ |
5598 | | |
5599 | | # define BILINEAR_DECLARE_VARIABLES \ |
5600 | 0 | const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ |
5601 | 0 | const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ |
5602 | 0 | const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ |
5603 | 0 | const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \ |
5604 | 0 | unit_x, -unit_x, unit_x, -unit_x); \ |
5605 | 0 | const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \ |
5606 | 0 | unit_x * 4, -unit_x * 4, \ |
5607 | 0 | unit_x * 4, -unit_x * 4, \ |
5608 | 0 | unit_x * 4, -unit_x * 4); \ |
5609 | 0 | const __m128i xmm_zero = _mm_setzero_si128 (); \ |
5610 | 0 | __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \ |
5611 | 0 | vx, -(vx + 1), vx, -(vx + 1)) |
5612 | | |
5613 | 0 | #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \ |
5614 | 0 | do { \ |
5615 | 0 | __m128i xmm_wh, xmm_a, xmm_b; \ |
5616 | 0 | /* fetch 2x2 pixel block into sse2 registers */ \ |
5617 | 0 | __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \ |
5618 | 0 | __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \ |
5619 | 0 | (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \ |
5620 | 0 | vx += unit_x; \ |
5621 | 0 | /* vertical interpolation */ \ |
5622 | 0 | xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \ |
5623 | 0 | xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \ |
5624 | 0 | xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \ |
5625 | 0 | /* calculate horizontal weights */ \ |
5626 | 0 | xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \ |
5627 | 0 | 16 - BILINEAR_INTERPOLATION_BITS)); \ |
5628 | 0 | xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \ |
5629 | 0 | /* horizontal interpolation */ \ |
5630 | 0 | xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \ |
5631 | 0 | xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \ |
5632 | 0 | /* shift the result */ \ |
5633 | 0 | pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \ |
5634 | 0 | } while (0) |
5635 | | |
5636 | | /***********************************************************************************/ |
5637 | | |
5638 | | #endif |
5639 | | |
5640 | 0 | #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix); \ |
5641 | 0 | do { \ |
5642 | 0 | __m128i xmm_pix; \ |
5643 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \ |
5644 | 0 | xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \ |
5645 | 0 | xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \ |
5646 | 0 | pix = _mm_cvtsi128_si32 (xmm_pix); \ |
5647 | 0 | } while(0) |
5648 | | |
5649 | 0 | #define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix); \ |
5650 | 0 | do { \ |
5651 | 0 | __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \ |
5652 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \ |
5653 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \ |
5654 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \ |
5655 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \ |
5656 | 0 | xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \ |
5657 | 0 | xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \ |
5658 | 0 | pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \ |
5659 | 0 | } while(0) |
5660 | | |
5661 | 0 | #define BILINEAR_SKIP_ONE_PIXEL() \ |
5662 | 0 | do { \ |
5663 | 0 | vx += unit_x; \ |
5664 | 0 | xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \ |
5665 | 0 | } while(0) |
5666 | | |
5667 | 0 | #define BILINEAR_SKIP_FOUR_PIXELS() \ |
5668 | 0 | do { \ |
5669 | 0 | vx += unit_x * 4; \ |
5670 | 0 | xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \ |
5671 | 0 | } while(0) |
5672 | | |
5673 | | /***********************************************************************************/ |
5674 | | |
5675 | | static force_inline void |
5676 | | scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, |
5677 | | const uint32_t * mask, |
5678 | | const uint32_t * src_top, |
5679 | | const uint32_t * src_bottom, |
5680 | | int32_t w, |
5681 | | int wt, |
5682 | | int wb, |
5683 | | pixman_fixed_t vx_, |
5684 | | pixman_fixed_t unit_x_, |
5685 | | pixman_fixed_t max_vx, |
5686 | | pixman_bool_t zero_src) |
5687 | 0 | { |
5688 | 0 | intptr_t vx = vx_; |
5689 | 0 | intptr_t unit_x = unit_x_; |
5690 | 0 | BILINEAR_DECLARE_VARIABLES; |
5691 | 0 | uint32_t pix1, pix2; |
5692 | |
|
5693 | 0 | while (w && ((uintptr_t)dst & 15)) |
5694 | 0 | { |
5695 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
5696 | 0 | *dst++ = pix1; |
5697 | 0 | w--; |
5698 | 0 | } |
5699 | |
|
5700 | 0 | while ((w -= 4) >= 0) { |
5701 | 0 | __m128i xmm_src; |
5702 | 0 | BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); |
5703 | 0 | _mm_store_si128 ((__m128i *)dst, xmm_src); |
5704 | 0 | dst += 4; |
5705 | 0 | } |
5706 | |
|
5707 | 0 | if (w & 2) |
5708 | 0 | { |
5709 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
5710 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
5711 | 0 | *dst++ = pix1; |
5712 | 0 | *dst++ = pix2; |
5713 | 0 | } |
5714 | |
|
5715 | 0 | if (w & 1) |
5716 | 0 | { |
5717 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
5718 | 0 | *dst = pix1; |
5719 | 0 | } |
5720 | |
|
5721 | 0 | } |
5722 | | |
5723 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, |
5724 | | scaled_bilinear_scanline_sse2_8888_8888_SRC, |
5725 | | uint32_t, uint32_t, uint32_t, |
5726 | | COVER, FLAG_NONE) |
5727 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, |
5728 | | scaled_bilinear_scanline_sse2_8888_8888_SRC, |
5729 | | uint32_t, uint32_t, uint32_t, |
5730 | | PAD, FLAG_NONE) |
5731 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, |
5732 | | scaled_bilinear_scanline_sse2_8888_8888_SRC, |
5733 | | uint32_t, uint32_t, uint32_t, |
5734 | | NONE, FLAG_NONE) |
5735 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, |
5736 | | scaled_bilinear_scanline_sse2_8888_8888_SRC, |
5737 | | uint32_t, uint32_t, uint32_t, |
5738 | | NORMAL, FLAG_NONE) |
5739 | | |
5740 | | static force_inline void |
5741 | | scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t * dst, |
5742 | | const uint32_t * mask, |
5743 | | const uint32_t * src_top, |
5744 | | const uint32_t * src_bottom, |
5745 | | int32_t w, |
5746 | | int wt, |
5747 | | int wb, |
5748 | | pixman_fixed_t vx_, |
5749 | | pixman_fixed_t unit_x_, |
5750 | | pixman_fixed_t max_vx, |
5751 | | pixman_bool_t zero_src) |
5752 | 0 | { |
5753 | 0 | intptr_t vx = vx_; |
5754 | 0 | intptr_t unit_x = unit_x_; |
5755 | 0 | BILINEAR_DECLARE_VARIABLES; |
5756 | 0 | uint32_t pix1, pix2; |
5757 | |
|
5758 | 0 | while (w && ((uintptr_t)dst & 15)) |
5759 | 0 | { |
5760 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
5761 | 0 | *dst++ = pix1 | 0xFF000000; |
5762 | 0 | w--; |
5763 | 0 | } |
5764 | |
|
5765 | 0 | while ((w -= 4) >= 0) { |
5766 | 0 | __m128i xmm_src; |
5767 | 0 | BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); |
5768 | 0 | _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000)); |
5769 | 0 | dst += 4; |
5770 | 0 | } |
5771 | |
|
5772 | 0 | if (w & 2) |
5773 | 0 | { |
5774 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
5775 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
5776 | 0 | *dst++ = pix1 | 0xFF000000; |
5777 | 0 | *dst++ = pix2 | 0xFF000000; |
5778 | 0 | } |
5779 | |
|
5780 | 0 | if (w & 1) |
5781 | 0 | { |
5782 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
5783 | 0 | *dst = pix1 | 0xFF000000; |
5784 | 0 | } |
5785 | 0 | } |
5786 | | |
5787 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC, |
5788 | | scaled_bilinear_scanline_sse2_x888_8888_SRC, |
5789 | | uint32_t, uint32_t, uint32_t, |
5790 | | COVER, FLAG_NONE) |
5791 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC, |
5792 | | scaled_bilinear_scanline_sse2_x888_8888_SRC, |
5793 | | uint32_t, uint32_t, uint32_t, |
5794 | | PAD, FLAG_NONE) |
5795 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC, |
5796 | | scaled_bilinear_scanline_sse2_x888_8888_SRC, |
5797 | | uint32_t, uint32_t, uint32_t, |
5798 | | NORMAL, FLAG_NONE) |
5799 | | |
5800 | | static force_inline void |
5801 | | scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, |
5802 | | const uint32_t * mask, |
5803 | | const uint32_t * src_top, |
5804 | | const uint32_t * src_bottom, |
5805 | | int32_t w, |
5806 | | int wt, |
5807 | | int wb, |
5808 | | pixman_fixed_t vx_, |
5809 | | pixman_fixed_t unit_x_, |
5810 | | pixman_fixed_t max_vx, |
5811 | | pixman_bool_t zero_src) |
5812 | 0 | { |
5813 | 0 | intptr_t vx = vx_; |
5814 | 0 | intptr_t unit_x = unit_x_; |
5815 | 0 | BILINEAR_DECLARE_VARIABLES; |
5816 | 0 | uint32_t pix1, pix2; |
5817 | |
|
5818 | 0 | while (w && ((uintptr_t)dst & 15)) |
5819 | 0 | { |
5820 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
5821 | |
|
5822 | 0 | if (pix1) |
5823 | 0 | { |
5824 | 0 | pix2 = *dst; |
5825 | 0 | *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); |
5826 | 0 | } |
5827 | |
|
5828 | 0 | w--; |
5829 | 0 | dst++; |
5830 | 0 | } |
5831 | |
|
5832 | 0 | while (w >= 4) |
5833 | 0 | { |
5834 | 0 | __m128i xmm_src; |
5835 | 0 | __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; |
5836 | 0 | __m128i xmm_alpha_hi, xmm_alpha_lo; |
5837 | |
|
5838 | 0 | BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); |
5839 | |
|
5840 | 0 | if (!is_zero (xmm_src)) |
5841 | 0 | { |
5842 | 0 | if (is_opaque (xmm_src)) |
5843 | 0 | { |
5844 | 0 | save_128_aligned ((__m128i *)dst, xmm_src); |
5845 | 0 | } |
5846 | 0 | else |
5847 | 0 | { |
5848 | 0 | __m128i xmm_dst = load_128_aligned ((__m128i *)dst); |
5849 | |
|
5850 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
5851 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
5852 | |
|
5853 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); |
5854 | 0 | over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, |
5855 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
5856 | |
|
5857 | 0 | save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
5858 | 0 | } |
5859 | 0 | } |
5860 | |
|
5861 | 0 | w -= 4; |
5862 | 0 | dst += 4; |
5863 | 0 | } |
5864 | |
|
5865 | 0 | while (w) |
5866 | 0 | { |
5867 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
5868 | |
|
5869 | 0 | if (pix1) |
5870 | 0 | { |
5871 | 0 | pix2 = *dst; |
5872 | 0 | *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); |
5873 | 0 | } |
5874 | |
|
5875 | 0 | w--; |
5876 | 0 | dst++; |
5877 | 0 | } |
5878 | 0 | } |
5879 | | |
5880 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, |
5881 | | scaled_bilinear_scanline_sse2_8888_8888_OVER, |
5882 | | uint32_t, uint32_t, uint32_t, |
5883 | | COVER, FLAG_NONE) |
5884 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, |
5885 | | scaled_bilinear_scanline_sse2_8888_8888_OVER, |
5886 | | uint32_t, uint32_t, uint32_t, |
5887 | | PAD, FLAG_NONE) |
5888 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, |
5889 | | scaled_bilinear_scanline_sse2_8888_8888_OVER, |
5890 | | uint32_t, uint32_t, uint32_t, |
5891 | | NONE, FLAG_NONE) |
5892 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, |
5893 | | scaled_bilinear_scanline_sse2_8888_8888_OVER, |
5894 | | uint32_t, uint32_t, uint32_t, |
5895 | | NORMAL, FLAG_NONE) |
5896 | | |
5897 | | static force_inline void |
5898 | | scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, |
5899 | | const uint8_t * mask, |
5900 | | const uint32_t * src_top, |
5901 | | const uint32_t * src_bottom, |
5902 | | int32_t w, |
5903 | | int wt, |
5904 | | int wb, |
5905 | | pixman_fixed_t vx_, |
5906 | | pixman_fixed_t unit_x_, |
5907 | | pixman_fixed_t max_vx, |
5908 | | pixman_bool_t zero_src) |
5909 | 0 | { |
5910 | 0 | intptr_t vx = vx_; |
5911 | 0 | intptr_t unit_x = unit_x_; |
5912 | 0 | BILINEAR_DECLARE_VARIABLES; |
5913 | 0 | uint32_t pix1, pix2; |
5914 | |
|
5915 | 0 | while (w && ((uintptr_t)dst & 15)) |
5916 | 0 | { |
5917 | 0 | uint32_t sa; |
5918 | 0 | uint8_t m = *mask++; |
5919 | |
|
5920 | 0 | if (m) |
5921 | 0 | { |
5922 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
5923 | 0 | sa = pix1 >> 24; |
5924 | |
|
5925 | 0 | if (sa == 0xff && m == 0xff) |
5926 | 0 | { |
5927 | 0 | *dst = pix1; |
5928 | 0 | } |
5929 | 0 | else |
5930 | 0 | { |
5931 | 0 | __m128i ms, md, ma, msa; |
5932 | |
|
5933 | 0 | pix2 = *dst; |
5934 | 0 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
5935 | 0 | ms = unpack_32_1x128 (pix1); |
5936 | 0 | md = unpack_32_1x128 (pix2); |
5937 | |
|
5938 | 0 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
5939 | |
|
5940 | 0 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
5941 | 0 | } |
5942 | 0 | } |
5943 | 0 | else |
5944 | 0 | { |
5945 | 0 | BILINEAR_SKIP_ONE_PIXEL (); |
5946 | 0 | } |
5947 | |
|
5948 | 0 | w--; |
5949 | 0 | dst++; |
5950 | 0 | } |
5951 | |
|
5952 | 0 | while (w >= 4) |
5953 | 0 | { |
5954 | 0 | uint32_t m; |
5955 | |
|
5956 | 0 | __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; |
5957 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
5958 | 0 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
5959 | |
|
5960 | 0 | memcpy(&m, mask, sizeof(uint32_t)); |
5961 | |
|
5962 | 0 | if (m) |
5963 | 0 | { |
5964 | 0 | BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); |
5965 | |
|
5966 | 0 | if (m == 0xffffffff && is_opaque (xmm_src)) |
5967 | 0 | { |
5968 | 0 | save_128_aligned ((__m128i *)dst, xmm_src); |
5969 | 0 | } |
5970 | 0 | else |
5971 | 0 | { |
5972 | 0 | xmm_dst = load_128_aligned ((__m128i *)dst); |
5973 | |
|
5974 | 0 | xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
5975 | |
|
5976 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
5977 | 0 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
5978 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
5979 | |
|
5980 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); |
5981 | 0 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
5982 | |
|
5983 | 0 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, |
5984 | 0 | &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); |
5985 | |
|
5986 | 0 | save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
5987 | 0 | } |
5988 | 0 | } |
5989 | 0 | else |
5990 | 0 | { |
5991 | 0 | BILINEAR_SKIP_FOUR_PIXELS (); |
5992 | 0 | } |
5993 | |
|
5994 | 0 | w -= 4; |
5995 | 0 | dst += 4; |
5996 | 0 | mask += 4; |
5997 | 0 | } |
5998 | |
|
5999 | 0 | while (w) |
6000 | 0 | { |
6001 | 0 | uint32_t sa; |
6002 | 0 | uint8_t m = *mask++; |
6003 | |
|
6004 | 0 | if (m) |
6005 | 0 | { |
6006 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
6007 | 0 | sa = pix1 >> 24; |
6008 | |
|
6009 | 0 | if (sa == 0xff && m == 0xff) |
6010 | 0 | { |
6011 | 0 | *dst = pix1; |
6012 | 0 | } |
6013 | 0 | else |
6014 | 0 | { |
6015 | 0 | __m128i ms, md, ma, msa; |
6016 | |
|
6017 | 0 | pix2 = *dst; |
6018 | 0 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
6019 | 0 | ms = unpack_32_1x128 (pix1); |
6020 | 0 | md = unpack_32_1x128 (pix2); |
6021 | |
|
6022 | 0 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
6023 | |
|
6024 | 0 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
6025 | 0 | } |
6026 | 0 | } |
6027 | 0 | else |
6028 | 0 | { |
6029 | 0 | BILINEAR_SKIP_ONE_PIXEL (); |
6030 | 0 | } |
6031 | |
|
6032 | 0 | w--; |
6033 | 0 | dst++; |
6034 | 0 | } |
6035 | 0 | } |
6036 | | |
6037 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, |
6038 | | scaled_bilinear_scanline_sse2_8888_8_8888_OVER, |
6039 | | uint32_t, uint8_t, uint32_t, |
6040 | | COVER, FLAG_HAVE_NON_SOLID_MASK) |
6041 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, |
6042 | | scaled_bilinear_scanline_sse2_8888_8_8888_OVER, |
6043 | | uint32_t, uint8_t, uint32_t, |
6044 | | PAD, FLAG_HAVE_NON_SOLID_MASK) |
6045 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, |
6046 | | scaled_bilinear_scanline_sse2_8888_8_8888_OVER, |
6047 | | uint32_t, uint8_t, uint32_t, |
6048 | | NONE, FLAG_HAVE_NON_SOLID_MASK) |
6049 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, |
6050 | | scaled_bilinear_scanline_sse2_8888_8_8888_OVER, |
6051 | | uint32_t, uint8_t, uint32_t, |
6052 | | NORMAL, FLAG_HAVE_NON_SOLID_MASK) |
6053 | | |
6054 | | static force_inline void |
6055 | | scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst, |
6056 | | const uint32_t * mask, |
6057 | | const uint32_t * src_top, |
6058 | | const uint32_t * src_bottom, |
6059 | | int32_t w, |
6060 | | int wt, |
6061 | | int wb, |
6062 | | pixman_fixed_t vx_, |
6063 | | pixman_fixed_t unit_x_, |
6064 | | pixman_fixed_t max_vx, |
6065 | | pixman_bool_t zero_src) |
6066 | 0 | { |
6067 | 0 | intptr_t vx = vx_; |
6068 | 0 | intptr_t unit_x = unit_x_; |
6069 | 0 | BILINEAR_DECLARE_VARIABLES; |
6070 | 0 | uint32_t pix1; |
6071 | 0 | __m128i xmm_mask; |
6072 | |
|
6073 | 0 | if (zero_src || (*mask >> 24) == 0) |
6074 | 0 | return; |
6075 | | |
6076 | 0 | xmm_mask = create_mask_16_128 (*mask >> 24); |
6077 | |
|
6078 | 0 | while (w && ((uintptr_t)dst & 15)) |
6079 | 0 | { |
6080 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
6081 | 0 | if (pix1) |
6082 | 0 | { |
6083 | 0 | uint32_t d = *dst; |
6084 | |
|
6085 | 0 | __m128i ms = unpack_32_1x128 (pix1); |
6086 | 0 | __m128i alpha = expand_alpha_1x128 (ms); |
6087 | 0 | __m128i dest = xmm_mask; |
6088 | 0 | __m128i alpha_dst = unpack_32_1x128 (d); |
6089 | |
|
6090 | 0 | *dst = pack_1x128_32 |
6091 | 0 | (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
6092 | 0 | } |
6093 | |
|
6094 | 0 | dst++; |
6095 | 0 | w--; |
6096 | 0 | } |
6097 | |
|
6098 | 0 | while (w >= 4) |
6099 | 0 | { |
6100 | 0 | __m128i xmm_src; |
6101 | 0 | BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); |
6102 | |
|
6103 | 0 | if (!is_zero (xmm_src)) |
6104 | 0 | { |
6105 | 0 | __m128i xmm_src_lo, xmm_src_hi; |
6106 | 0 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
6107 | 0 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
6108 | |
|
6109 | 0 | xmm_dst = load_128_aligned ((__m128i*)dst); |
6110 | |
|
6111 | 0 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
6112 | 0 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
6113 | 0 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
6114 | 0 | &xmm_alpha_lo, &xmm_alpha_hi); |
6115 | |
|
6116 | 0 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
6117 | 0 | &xmm_alpha_lo, &xmm_alpha_hi, |
6118 | 0 | &xmm_mask, &xmm_mask, |
6119 | 0 | &xmm_dst_lo, &xmm_dst_hi); |
6120 | |
|
6121 | 0 | save_128_aligned |
6122 | 0 | ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
6123 | 0 | } |
6124 | |
|
6125 | 0 | dst += 4; |
6126 | 0 | w -= 4; |
6127 | 0 | } |
6128 | |
|
6129 | 0 | while (w) |
6130 | 0 | { |
6131 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
6132 | 0 | if (pix1) |
6133 | 0 | { |
6134 | 0 | uint32_t d = *dst; |
6135 | |
|
6136 | 0 | __m128i ms = unpack_32_1x128 (pix1); |
6137 | 0 | __m128i alpha = expand_alpha_1x128 (ms); |
6138 | 0 | __m128i dest = xmm_mask; |
6139 | 0 | __m128i alpha_dst = unpack_32_1x128 (d); |
6140 | |
|
6141 | 0 | *dst = pack_1x128_32 |
6142 | 0 | (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
6143 | 0 | } |
6144 | |
|
6145 | 0 | dst++; |
6146 | 0 | w--; |
6147 | 0 | } |
6148 | 0 | } |
6149 | | |
6150 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, |
6151 | | scaled_bilinear_scanline_sse2_8888_n_8888_OVER, |
6152 | | uint32_t, uint32_t, uint32_t, |
6153 | | COVER, FLAG_HAVE_SOLID_MASK) |
6154 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, |
6155 | | scaled_bilinear_scanline_sse2_8888_n_8888_OVER, |
6156 | | uint32_t, uint32_t, uint32_t, |
6157 | | PAD, FLAG_HAVE_SOLID_MASK) |
6158 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, |
6159 | | scaled_bilinear_scanline_sse2_8888_n_8888_OVER, |
6160 | | uint32_t, uint32_t, uint32_t, |
6161 | | NONE, FLAG_HAVE_SOLID_MASK) |
6162 | | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, |
6163 | | scaled_bilinear_scanline_sse2_8888_n_8888_OVER, |
6164 | | uint32_t, uint32_t, uint32_t, |
6165 | | NORMAL, FLAG_HAVE_SOLID_MASK) |
6166 | | |
6167 | | static const pixman_fast_path_t sse2_fast_paths[] = |
6168 | | { |
6169 | | /* PIXMAN_OP_OVER */ |
6170 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), |
6171 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), |
6172 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), |
6173 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), |
6174 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), |
6175 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565), |
6176 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), |
6177 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), |
6178 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), |
6179 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), |
6180 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), |
6181 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), |
6182 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), |
6183 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888), |
6184 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), |
6185 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), |
6186 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), |
6187 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), |
6188 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), |
6189 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), |
6190 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), |
6191 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), |
6192 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), |
6193 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), |
6194 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), |
6195 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), |
6196 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), |
6197 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), |
6198 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), |
6199 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), |
6200 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), |
6201 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), |
6202 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), |
6203 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), |
6204 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca), |
6205 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), |
6206 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), |
6207 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), |
6208 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), |
6209 | | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), |
6210 | | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), |
6211 | | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), |
6212 | | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), |
6213 | | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), |
6214 | | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), |
6215 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), |
6216 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), |
6217 | | |
6218 | | /* PIXMAN_OP_OVER_REVERSE */ |
6219 | | PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), |
6220 | | PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), |
6221 | | |
6222 | | /* PIXMAN_OP_ADD */ |
6223 | | PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), |
6224 | | PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), |
6225 | | PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), |
6226 | | PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), |
6227 | | PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), |
6228 | | PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), |
6229 | | PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888), |
6230 | | PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888), |
6231 | | PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888), |
6232 | | PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888), |
6233 | | PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888), |
6234 | | PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888), |
6235 | | PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888), |
6236 | | PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888), |
6237 | | |
6238 | | /* PIXMAN_OP_SRC */ |
6239 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), |
6240 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888), |
6241 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888), |
6242 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), |
6243 | | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), |
6244 | | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), |
6245 | | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), |
6246 | | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), |
6247 | | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), |
6248 | | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888), |
6249 | | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), |
6250 | | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), |
6251 | | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), |
6252 | | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), |
6253 | | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), |
6254 | | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), |
6255 | | PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), |
6256 | | PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), |
6257 | | |
6258 | | /* PIXMAN_OP_IN */ |
6259 | | PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), |
6260 | | PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), |
6261 | | PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), |
6262 | | |
6263 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |
6264 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), |
6265 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
6266 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
6267 | | |
6268 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), |
6269 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), |
6270 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), |
6271 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), |
6272 | | |
6273 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
6274 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |
6275 | | SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), |
6276 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
6277 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888), |
6278 | | SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888), |
6279 | | |
6280 | | SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888), |
6281 | | SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888), |
6282 | | SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888), |
6283 | | SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888), |
6284 | | SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888), |
6285 | | SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888), |
6286 | | |
6287 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |
6288 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), |
6289 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
6290 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
6291 | | |
6292 | | SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), |
6293 | | SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), |
6294 | | SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), |
6295 | | SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), |
6296 | | |
6297 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), |
6298 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), |
6299 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), |
6300 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), |
6301 | | |
6302 | | { PIXMAN_OP_NONE }, |
6303 | | }; |
6304 | | |
6305 | | static uint32_t * |
6306 | | sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) |
6307 | 0 | { |
6308 | 0 | int w = iter->width; |
6309 | 0 | __m128i ff000000 = mask_ff000000; |
6310 | 0 | uint32_t *dst = iter->buffer; |
6311 | 0 | uint32_t *src = (uint32_t *)iter->bits; |
6312 | |
|
6313 | 0 | iter->bits += iter->stride; |
6314 | |
|
6315 | 0 | while (w && ((uintptr_t)dst) & 0x0f) |
6316 | 0 | { |
6317 | 0 | *dst++ = (*src++) | 0xff000000; |
6318 | 0 | w--; |
6319 | 0 | } |
6320 | |
|
6321 | 0 | while (w >= 4) |
6322 | 0 | { |
6323 | 0 | save_128_aligned ( |
6324 | 0 | (__m128i *)dst, _mm_or_si128 ( |
6325 | 0 | load_128_unaligned ((__m128i *)src), ff000000)); |
6326 | |
|
6327 | 0 | dst += 4; |
6328 | 0 | src += 4; |
6329 | 0 | w -= 4; |
6330 | 0 | } |
6331 | |
|
6332 | 0 | while (w) |
6333 | 0 | { |
6334 | 0 | *dst++ = (*src++) | 0xff000000; |
6335 | 0 | w--; |
6336 | 0 | } |
6337 | |
|
6338 | 0 | return iter->buffer; |
6339 | 0 | } |
6340 | | |
6341 | | static uint32_t * |
6342 | | sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) |
6343 | 0 | { |
6344 | 0 | int w = iter->width; |
6345 | 0 | uint32_t *dst = iter->buffer; |
6346 | 0 | uint16_t *src = (uint16_t *)iter->bits; |
6347 | 0 | __m128i ff000000 = mask_ff000000; |
6348 | |
|
6349 | 0 | iter->bits += iter->stride; |
6350 | |
|
6351 | 0 | while (w && ((uintptr_t)dst) & 0x0f) |
6352 | 0 | { |
6353 | 0 | uint16_t s = *src++; |
6354 | |
|
6355 | 0 | *dst++ = convert_0565_to_8888 (s); |
6356 | 0 | w--; |
6357 | 0 | } |
6358 | |
|
6359 | 0 | while (w >= 8) |
6360 | 0 | { |
6361 | 0 | __m128i lo, hi, s; |
6362 | |
|
6363 | 0 | s = _mm_loadu_si128 ((__m128i *)src); |
6364 | |
|
6365 | 0 | lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ())); |
6366 | 0 | hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ())); |
6367 | |
|
6368 | 0 | save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000)); |
6369 | 0 | save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000)); |
6370 | |
|
6371 | 0 | dst += 8; |
6372 | 0 | src += 8; |
6373 | 0 | w -= 8; |
6374 | 0 | } |
6375 | |
|
6376 | 0 | while (w) |
6377 | 0 | { |
6378 | 0 | uint16_t s = *src++; |
6379 | |
|
6380 | 0 | *dst++ = convert_0565_to_8888 (s); |
6381 | 0 | w--; |
6382 | 0 | } |
6383 | |
|
6384 | 0 | return iter->buffer; |
6385 | 0 | } |
6386 | | |
6387 | | static uint32_t * |
6388 | | sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) |
6389 | 207 | { |
6390 | 207 | int w = iter->width; |
6391 | 207 | uint32_t *dst = iter->buffer; |
6392 | 207 | uint8_t *src = iter->bits; |
6393 | 207 | __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; |
6394 | | |
6395 | 207 | iter->bits += iter->stride; |
6396 | | |
6397 | 207 | while (w && (((uintptr_t)dst) & 15)) |
6398 | 0 | { |
6399 | 0 | *dst++ = (uint32_t)(*(src++)) << 24; |
6400 | 0 | w--; |
6401 | 0 | } |
6402 | | |
6403 | 461 | while (w >= 16) |
6404 | 254 | { |
6405 | 254 | xmm0 = _mm_loadu_si128((__m128i *)src); |
6406 | | |
6407 | 254 | xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0); |
6408 | 254 | xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0); |
6409 | 254 | xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1); |
6410 | 254 | xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1); |
6411 | 254 | xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2); |
6412 | 254 | xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2); |
6413 | | |
6414 | 254 | _mm_store_si128(((__m128i *)(dst + 0)), xmm3); |
6415 | 254 | _mm_store_si128(((__m128i *)(dst + 4)), xmm4); |
6416 | 254 | _mm_store_si128(((__m128i *)(dst + 8)), xmm5); |
6417 | 254 | _mm_store_si128(((__m128i *)(dst + 12)), xmm6); |
6418 | | |
6419 | 254 | dst += 16; |
6420 | 254 | src += 16; |
6421 | 254 | w -= 16; |
6422 | 254 | } |
6423 | | |
6424 | 1.67k | while (w) |
6425 | 1.46k | { |
6426 | 1.46k | *dst++ = (uint32_t)(*(src++)) << 24; |
6427 | 1.46k | w--; |
6428 | 1.46k | } |
6429 | | |
6430 | 207 | return iter->buffer; |
6431 | 207 | } |
6432 | | |
6433 | | #define IMAGE_FLAGS \ |
6434 | | (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ |
6435 | | FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) |
6436 | | |
6437 | | static const pixman_iter_info_t sse2_iters[] = |
6438 | | { |
6439 | | { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, |
6440 | | _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL |
6441 | | }, |
6442 | | { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW, |
6443 | | _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL |
6444 | | }, |
6445 | | { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, |
6446 | | _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL |
6447 | | }, |
6448 | | { PIXMAN_null }, |
6449 | | }; |
6450 | | |
6451 | | #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) |
6452 | | __attribute__((__force_align_arg_pointer__)) |
6453 | | #endif |
6454 | | pixman_implementation_t * |
6455 | | _pixman_implementation_create_sse2 (pixman_implementation_t *fallback) |
6456 | 12 | { |
6457 | 12 | pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths); |
6458 | | |
6459 | | /* SSE2 constants */ |
6460 | 12 | mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); |
6461 | 12 | mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); |
6462 | 12 | mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); |
6463 | 12 | mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); |
6464 | 12 | mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); |
6465 | 12 | mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); |
6466 | 12 | mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); |
6467 | 12 | mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); |
6468 | 12 | mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); |
6469 | 12 | mask_0080 = create_mask_16_128 (0x0080); |
6470 | 12 | mask_00ff = create_mask_16_128 (0x00ff); |
6471 | 12 | mask_0101 = create_mask_16_128 (0x0101); |
6472 | 12 | mask_ffff = create_mask_16_128 (0xffff); |
6473 | 12 | mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); |
6474 | 12 | mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); |
6475 | 12 | mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8); |
6476 | 12 | mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004); |
6477 | | |
6478 | | /* Set up function pointers */ |
6479 | 12 | imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; |
6480 | 12 | imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; |
6481 | 12 | imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; |
6482 | 12 | imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; |
6483 | 12 | imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; |
6484 | 12 | imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; |
6485 | 12 | imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; |
6486 | 12 | imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; |
6487 | 12 | imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; |
6488 | 12 | imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; |
6489 | | |
6490 | 12 | imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; |
6491 | | |
6492 | 12 | imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; |
6493 | 12 | imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; |
6494 | 12 | imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; |
6495 | 12 | imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; |
6496 | 12 | imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; |
6497 | 12 | imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; |
6498 | 12 | imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; |
6499 | 12 | imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; |
6500 | 12 | imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; |
6501 | 12 | imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; |
6502 | 12 | imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; |
6503 | | |
6504 | 12 | imp->blt = sse2_blt; |
6505 | 12 | imp->fill = sse2_fill; |
6506 | | |
6507 | 12 | imp->iter_info = sse2_iters; |
6508 | | |
6509 | 12 | return imp; |
6510 | 12 | } |