/src/cairo/subprojects/pixman-0.44.2/pixman/pixman-mmx.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright © 2004, 2005 Red Hat, Inc. |
3 | | * Copyright © 2004 Nicholas Miell |
4 | | * Copyright © 2005 Trolltech AS |
5 | | * |
6 | | * Permission to use, copy, modify, distribute, and sell this software and its |
7 | | * documentation for any purpose is hereby granted without fee, provided that |
8 | | * the above copyright notice appear in all copies and that both that |
9 | | * copyright notice and this permission notice appear in supporting |
10 | | * documentation, and that the name of Red Hat not be used in advertising or |
11 | | * publicity pertaining to distribution of the software without specific, |
12 | | * written prior permission. Red Hat makes no representations about the |
13 | | * suitability of this software for any purpose. It is provided "as is" |
14 | | * without express or implied warranty. |
15 | | * |
16 | | * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS |
17 | | * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
18 | | * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY |
19 | | * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
20 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN |
21 | | * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
22 | | * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS |
23 | | * SOFTWARE. |
24 | | * |
25 | | * Author: Søren Sandmann (sandmann@redhat.com) |
26 | | * Minor Improvements: Nicholas Miell (nmiell@gmail.com) |
27 | | * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) |
28 | | * |
29 | | * Based on work by Owen Taylor |
30 | | */ |
31 | | |
32 | | #ifdef HAVE_CONFIG_H |
33 | | #include <pixman-config.h> |
34 | | #endif |
35 | | |
36 | | #if defined USE_X86_MMX || defined USE_LOONGSON_MMI |
37 | | |
38 | | #ifdef USE_LOONGSON_MMI |
39 | | #include <loongson-mmintrin.h> |
40 | | #else |
41 | | #include <mmintrin.h> |
42 | | #endif |
43 | | #include "pixman-private.h" |
44 | | #include "pixman-combine32.h" |
45 | | #include "pixman-inlines.h" |
46 | | |
47 | | #ifdef VERBOSE |
48 | | #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__) |
49 | | #else |
50 | | #define CHECKPOINT() |
51 | | #endif |
52 | | |
53 | | #ifdef USE_X86_MMX |
54 | | # if (defined(__SSE2__) || defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64)) |
55 | | # include <xmmintrin.h> |
56 | | # else |
57 | | /* We have to compile with -msse to use xmmintrin.h, but that causes SSE |
58 | | * instructions to be generated that we don't want. Just duplicate the |
59 | | * functions we want to use. */ |
60 | | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
61 | | _mm_movemask_pi8 (__m64 __A) |
62 | | { |
63 | | int ret; |
64 | | |
65 | | asm ("pmovmskb %1, %0\n\t" |
66 | | : "=r" (ret) |
67 | | : "y" (__A) |
68 | | ); |
69 | | |
70 | | return ret; |
71 | | } |
72 | | |
73 | | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
74 | | _mm_mulhi_pu16 (__m64 __A, __m64 __B) |
75 | | { |
76 | | asm ("pmulhuw %1, %0\n\t" |
77 | | : "+y" (__A) |
78 | | : "y" (__B) |
79 | | ); |
80 | | return __A; |
81 | | } |
82 | | |
83 | | # define _mm_shuffle_pi16(A, N) \ |
84 | | ({ \ |
85 | | __m64 ret; \ |
86 | | \ |
87 | | asm ("pshufw %2, %1, %0\n\t" \ |
88 | | : "=y" (ret) \ |
89 | | : "y" (A), "K" ((const int8_t)N) \ |
90 | | ); \ |
91 | | \ |
92 | | ret; \ |
93 | | }) |
94 | | # endif |
95 | | #endif |
96 | | |
97 | | #ifndef _MM_SHUFFLE |
98 | | #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ |
99 | | (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) |
100 | | #endif |
101 | | |
102 | | /* Notes about writing mmx code |
103 | | * |
104 | | * Give memory operands as the second operand. If you give it as the |
105 | | * first, gcc will first load it into a register, then use that |
106 | | * register. |
107 | | * |
108 | | * i.e. use |
109 | | * |
110 | | * _mm_mullo_pi16 (x, mmx_constant); |
111 | | * |
112 | | * not |
113 | | * |
114 | | * _mm_mullo_pi16 (mmx_constant, x); |
115 | | * |
116 | | * Also try to minimize dependencies, i.e. when you need a value, try |
117 | | * to calculate it from a value that was calculated as early as |
118 | | * possible. |
119 | | */ |
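
As a small illustration of the operand-ordering guideline above (scale_u16x4 is a hypothetical helper, not part of pixman):

    #include <mmintrin.h>

    /* Keep the constant/memory operand second, as the note above recommends,
     * so gcc can use it directly as the instruction's memory operand instead
     * of loading it into a register first. */
    static __m64
    scale_u16x4 (__m64 x, const __m64 *mmx_constant)
    {
        return _mm_mullo_pi16 (x, *mmx_constant);   /* preferred */
        /* not: _mm_mullo_pi16 (*mmx_constant, x) */
    }
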
120 | | |
121 | | /* --------------- MMX primitives ------------------------------------- */ |
122 | | |
123 | | /* If __m64 is defined as a struct or union, then define M64_MEMBER to be |
124 | | * the name of the member used to access the data. |
125 | | * If __m64 requires using mm_cvt* intrinsics functions to convert between |
126 | | * uint64_t and __m64 values, then define USE_CVT_INTRINSICS. |
127 | | * If __m64 and uint64_t values can just be cast to each other directly, |
128 | | * then define USE_M64_CASTS. |
129 | | * If __m64 is a double datatype, then define USE_M64_DOUBLE. |
130 | | */ |
131 | | #ifdef _MSC_VER |
132 | | # ifdef __clang__ |
133 | | # define USE_CVT_INTRINSICS |
134 | | # else |
135 | | # define M64_MEMBER m64_u64 |
136 | | # endif |
137 | | #elif defined(__ICC) |
138 | | # define USE_CVT_INTRINSICS |
139 | | #elif defined(USE_LOONGSON_MMI) |
140 | | # define USE_M64_DOUBLE |
141 | | #elif defined(__GNUC__) |
142 | | # define USE_M64_CASTS |
143 | | #elif defined(__SUNPRO_C) |
144 | | # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__) |
145 | | /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__) |
146 | | * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__ |
147 | | * is defined. If it is used, then the mm_cvt* intrinsics must be used. |
148 | | */ |
149 | | # define USE_CVT_INTRINSICS |
150 | | # else |
151 | | /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is |
152 | | * disabled, __m64 is defined as a struct containing "unsigned long long l_". |
153 | | */ |
154 | | # define M64_MEMBER l_ |
155 | | # endif |
156 | | #endif |
157 | | |
158 | | #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE) |
159 | | typedef uint64_t mmxdatafield; |
160 | | #else |
161 | | typedef __m64 mmxdatafield; |
162 | | #endif |
163 | | |
164 | | typedef struct |
165 | | { |
166 | | mmxdatafield mmx_4x00ff; |
167 | | mmxdatafield mmx_4x0080; |
168 | | mmxdatafield mmx_565_rgb; |
169 | | mmxdatafield mmx_565_unpack_multiplier; |
170 | | mmxdatafield mmx_565_pack_multiplier; |
171 | | mmxdatafield mmx_565_r; |
172 | | mmxdatafield mmx_565_g; |
173 | | mmxdatafield mmx_565_b; |
174 | | mmxdatafield mmx_packed_565_rb; |
175 | | mmxdatafield mmx_packed_565_g; |
176 | | mmxdatafield mmx_expand_565_g; |
177 | | mmxdatafield mmx_expand_565_b; |
178 | | mmxdatafield mmx_expand_565_r; |
179 | | #ifndef USE_LOONGSON_MMI |
180 | | mmxdatafield mmx_mask_0; |
181 | | mmxdatafield mmx_mask_1; |
182 | | mmxdatafield mmx_mask_2; |
183 | | mmxdatafield mmx_mask_3; |
184 | | #endif |
185 | | mmxdatafield mmx_full_alpha; |
186 | | mmxdatafield mmx_4x0101; |
187 | | mmxdatafield mmx_ff000000; |
188 | | } mmx_data_t; |
189 | | |
190 | | #if defined(_MSC_VER) |
191 | | # define MMXDATA_INIT(field, val) { val ## UI64 } |
192 | | #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */ |
193 | | # define MMXDATA_INIT(field, val) field = { val ## ULL } |
194 | | #else /* mmxdatafield is an integral type */ |
195 | | # define MMXDATA_INIT(field, val) field = val ## ULL |
196 | | #endif |
197 | | |
198 | | static const mmx_data_t c = |
199 | | { |
200 | | MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff), |
201 | | MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080), |
202 | | MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f), |
203 | | MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840), |
204 | | MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004), |
205 | | MMXDATA_INIT (.mmx_565_r, 0x000000f800000000), |
206 | | MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000), |
207 | | MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), |
208 | | MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8), |
209 | | MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00), |
210 | | MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0), |
211 | | MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f), |
212 | | MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800), |
213 | | #ifndef USE_LOONGSON_MMI |
214 | | MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), |
215 | | MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), |
216 | | MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff), |
217 | | MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff), |
218 | | #endif |
219 | | MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000), |
220 | | MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101), |
221 | | MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000), |
222 | | }; |
223 | | |
224 | | #ifdef USE_CVT_INTRINSICS |
225 | | # define MC(x) to_m64 (c.mmx_ ## x) |
226 | | #elif defined(USE_M64_CASTS) |
227 | 0 | # define MC(x) ((__m64)c.mmx_ ## x) |
228 | | #elif defined(USE_M64_DOUBLE) |
229 | | # define MC(x) (*(__m64 *)&c.mmx_ ## x) |
230 | | #else |
231 | | # define MC(x) c.mmx_ ## x |
232 | | #endif |
233 | | |
234 | | static force_inline __m64 |
235 | | to_m64 (uint64_t x) |
236 | 0 | { |
237 | | #ifdef USE_CVT_INTRINSICS |
238 | | return _mm_cvtsi64_m64 (x); |
239 | | #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
240 | | __m64 res; |
241 | | |
242 | | res.M64_MEMBER = x; |
243 | | return res; |
244 | | #elif defined USE_M64_DOUBLE |
245 | | return *(__m64 *)&x; |
246 | | #else /* USE_M64_CASTS */ |
247 | 0 | return (__m64)x; |
248 | 0 | #endif |
249 | 0 | } |
250 | | |
251 | | static force_inline uint64_t |
252 | | to_uint64 (__m64 x) |
253 | 0 | { |
254 | | #ifdef USE_CVT_INTRINSICS |
255 | | return _mm_cvtm64_si64 (x); |
256 | | #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
257 | | uint64_t res = x.M64_MEMBER; |
258 | | return res; |
259 | | #elif defined USE_M64_DOUBLE |
260 | | return *(uint64_t *)&x; |
261 | | #else /* USE_M64_CASTS */ |
262 | 0 | return (uint64_t)x; |
263 | 0 | #endif |
264 | 0 | } |
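
Under the USE_CVT_INTRINSICS configuration described above, these two helpers reduce to the standard MMX conversion intrinsics; a minimal standalone round-trip sketch (assumes an x86-64 toolchain with MMX enabled):

    #include <assert.h>
    #include <stdint.h>
    #include <mmintrin.h>

    int
    main (void)
    {
        uint64_t x = 0x00ff00ff00ff00ffULL;   /* same value as c.mmx_4x00ff */
        __m64    m = _mm_cvtsi64_m64 (x);     /* what to_m64() calls in that case */
        uint64_t y = _mm_cvtm64_si64 (m);     /* what to_uint64() calls in that case */

        assert (x == y);
        _mm_empty ();                         /* leave the MMX/x87 state clean */
        return 0;
    }
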
265 | | |
266 | | static force_inline __m64 |
267 | | shift (__m64 v, |
268 | | int s) |
269 | 0 | { |
270 | 0 | if (s > 0) |
271 | 0 | return _mm_slli_si64 (v, s); |
272 | 0 | else if (s < 0) |
273 | 0 | return _mm_srli_si64 (v, -s); |
274 | 0 | else |
275 | 0 | return v; |
276 | 0 | } |
277 | | |
278 | | static force_inline __m64 |
279 | | negate (__m64 mask) |
280 | 0 | { |
281 | 0 | return _mm_xor_si64 (mask, MC (4x00ff)); |
282 | 0 | } |
283 | | |
284 | | /* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1 |
285 | | * and maps its result to the same range. |
286 | | * |
287 | | * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner: |
288 | | * Notation, Notation, Notation", the first of which is |
289 | | * |
290 | | * prod(a, b) = (a * b + 128) / 255. |
291 | | * |
292 | | * By approximating the division by 255 as 257/65536 it can be replaced by a |
293 | | * multiply and a right shift. This is the implementation that we use in |
294 | | * pix_multiply(), but we use _mm_mulhi_pu16() to multiply by 257 (part of SSE1 |
295 | | * or Extended 3DNow!, and unavailable at the time of the book's publication), |
296 | | * which performs the multiplication and right shift in a single operation. |
297 | | * |
298 | | * prod(a, b) = ((a * b + 128) * 257) >> 16. |
299 | | * |
300 | | * A third way (how pix_multiply() was implemented prior to 14208344) also |
301 | | * exists; it performs the multiplication by 257 with adds and shifts. |
302 | | * |
303 | | * Where temp = a * b + 128 |
304 | | * |
305 | | * prod(a, b) = (temp + (temp >> 8)) >> 8. |
306 | | */ |
307 | | static force_inline __m64 |
308 | | pix_multiply (__m64 a, __m64 b) |
309 | 0 | { |
310 | 0 | __m64 res; |
311 | |
|
312 | 0 | res = _mm_mullo_pi16 (a, b); |
313 | 0 | res = _mm_adds_pu16 (res, MC (4x0080)); |
314 | 0 | res = _mm_mulhi_pu16 (res, MC (4x0101)); |
315 | |
|
316 | 0 | return res; |
317 | 0 | } |
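
The two formulations in the comment above can be checked exhaustively in plain integer C; a minimal standalone sketch (not part of pixman) verifying that the multiply-by-257 form and the adds-and-shifts form agree for all 8-bit inputs:

    #include <assert.h>
    #include <stdint.h>

    static uint8_t
    mul_mulhi257 (uint8_t a, uint8_t b)
    {
        uint32_t t = (uint32_t)a * b + 128;
        return (uint8_t)((t * 257) >> 16);      /* per-channel form used by pix_multiply() */
    }

    static uint8_t
    mul_add_shift (uint8_t a, uint8_t b)
    {
        uint32_t t = (uint32_t)a * b + 128;
        return (uint8_t)((t + (t >> 8)) >> 8);  /* the pre-14208344 formulation */
    }

    int
    main (void)
    {
        for (unsigned a = 0; a <= 255; a++)
            for (unsigned b = 0; b <= 255; b++)
                assert (mul_mulhi257 (a, b) == mul_add_shift (a, b));
        return 0;
    }
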
318 | | |
319 | | static force_inline __m64 |
320 | | pix_add (__m64 a, __m64 b) |
321 | 0 | { |
322 | 0 | return _mm_adds_pu8 (a, b); |
323 | 0 | } |
324 | | |
325 | | static force_inline __m64 |
326 | | expand_alpha (__m64 pixel) |
327 | 0 | { |
328 | 0 | return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3)); |
329 | 0 | } |
330 | | |
331 | | static force_inline __m64 |
332 | | expand_alpha_rev (__m64 pixel) |
333 | 0 | { |
334 | 0 | return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0)); |
335 | 0 | } |
336 | | |
337 | | static force_inline __m64 |
338 | | invert_colors (__m64 pixel) |
339 | 0 | { |
340 | 0 | return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2)); |
341 | 0 | } |
342 | | |
343 | | static force_inline __m64 |
344 | | over (__m64 src, |
345 | | __m64 srca, |
346 | | __m64 dest) |
347 | 0 | { |
348 | 0 | return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca))); |
349 | 0 | } |
350 | | |
351 | | static force_inline __m64 |
352 | | over_rev_non_pre (__m64 src, __m64 dest) |
353 | 0 | { |
354 | 0 | __m64 srca = expand_alpha (src); |
355 | 0 | __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha)); |
356 | |
|
357 | 0 | return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest); |
358 | 0 | } |
359 | | |
360 | | static force_inline __m64 |
361 | | in (__m64 src, __m64 mask) |
362 | 0 | { |
363 | 0 | return pix_multiply (src, mask); |
364 | 0 | } |
365 | | |
366 | | #ifndef _MSC_VER |
367 | | static force_inline __m64 |
368 | | in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) |
369 | 0 | { |
370 | 0 | return over (in (src, mask), pix_multiply (srca, mask), dest); |
371 | 0 | } |
372 | | |
373 | | #else |
374 | | |
375 | | #define in_over(src, srca, mask, dest) \ |
376 | | over (in (src, mask), pix_multiply (srca, mask), dest) |
377 | | |
378 | | #endif |
379 | | |
380 | | /* Elemental unaligned loads */ |
381 | | |
382 | | static force_inline __m64 ldq_u(__m64 *p) |
383 | 0 | { |
384 | 0 | #ifdef USE_X86_MMX |
385 | | /* x86's alignment restrictions are very relaxed, but that's no excuse */ |
386 | 0 | __m64 r; |
387 | 0 | memcpy(&r, p, sizeof(__m64)); |
388 | 0 | return r; |
389 | | #else |
390 | | struct __una_u64 { __m64 x __attribute__((packed)); }; |
391 | | const struct __una_u64 *ptr = (const struct __una_u64 *) p; |
392 | | return (__m64) ptr->x; |
393 | | #endif |
394 | 0 | } |
395 | | |
396 | | static force_inline uint32_t ldl_u(const uint32_t *p) |
397 | 0 | { |
398 | 0 | #ifdef USE_X86_MMX |
399 | | /* x86's alignment restrictions are very relaxed. */ |
400 | 0 | uint32_t r; |
401 | 0 | memcpy(&r, p, sizeof(uint32_t)); |
402 | 0 | return r; |
403 | | #else |
404 | | struct __una_u32 { uint32_t x __attribute__((packed)); }; |
405 | | const struct __una_u32 *ptr = (const struct __una_u32 *) p; |
406 | | return ptr->x; |
407 | | #endif |
408 | 0 | } |
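
Both loaders above use the memcpy idiom to express an unaligned load without invoking undefined behaviour; a standalone sketch of the same idiom (load32_unaligned is a hypothetical helper), which compilers typically lower to a single load on x86:

    #include <stdint.h>
    #include <string.h>

    /* Read a 32-bit value from a possibly unaligned address. */
    static uint32_t
    load32_unaligned (const void *p)
    {
        uint32_t v;
        memcpy (&v, p, sizeof v);
        return v;
    }
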
409 | | |
410 | | static force_inline __m64 |
411 | | load (const uint32_t *v) |
412 | 0 | { |
413 | | #ifdef USE_LOONGSON_MMI |
414 | | __m64 ret; |
415 | | asm ("lwc1 %0, %1\n\t" |
416 | | : "=f" (ret) |
417 | | : "m" (*v) |
418 | | ); |
419 | | return ret; |
420 | | #else |
421 | 0 | return _mm_cvtsi32_si64 (*v); |
422 | 0 | #endif |
423 | 0 | } |
424 | | |
425 | | static force_inline __m64 |
426 | | load8888 (const uint32_t *v) |
427 | 0 | { |
428 | | #ifdef USE_LOONGSON_MMI |
429 | | return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ()); |
430 | | #else |
431 | 0 | return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ()); |
432 | 0 | #endif |
433 | 0 | } |
434 | | |
435 | | static force_inline __m64 |
436 | | load8888u (const uint32_t *v) |
437 | 0 | { |
438 | 0 | uint32_t l = ldl_u (v); |
439 | 0 | return load8888 (&l); |
440 | 0 | } |
441 | | |
442 | | static force_inline __m64 |
443 | | pack8888 (__m64 lo, __m64 hi) |
444 | 0 | { |
445 | 0 | return _mm_packs_pu16 (lo, hi); |
446 | 0 | } |
447 | | |
448 | | static force_inline void |
449 | | store (uint32_t *dest, __m64 v) |
450 | 0 | { |
451 | | #ifdef USE_LOONGSON_MMI |
452 | | asm ("swc1 %1, %0\n\t" |
453 | | : "=m" (*dest) |
454 | | : "f" (v) |
455 | | : "memory" |
456 | | ); |
457 | | #else |
458 | 0 | *dest = _mm_cvtsi64_si32 (v); |
459 | 0 | #endif |
460 | 0 | } |
461 | | |
462 | | static force_inline void |
463 | | store8888 (uint32_t *dest, __m64 v) |
464 | 0 | { |
465 | 0 | v = pack8888 (v, _mm_setzero_si64 ()); |
466 | 0 | store (dest, v); |
467 | 0 | } |
468 | | |
469 | | static force_inline pixman_bool_t |
470 | | is_equal (__m64 a, __m64 b) |
471 | 0 | { |
472 | | #ifdef USE_LOONGSON_MMI |
473 | | /* __m64 is a double here, so we can compare directly. */ |
474 | | return a == b; |
475 | | #else |
476 | 0 | return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff; |
477 | 0 | #endif |
478 | 0 | } |
479 | | |
480 | | static force_inline pixman_bool_t |
481 | | is_opaque (__m64 v) |
482 | 0 | { |
483 | | #ifdef USE_LOONGSON_MMI |
484 | | return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha)); |
485 | | #else |
486 | 0 | __m64 ffs = _mm_cmpeq_pi8 (v, v); |
487 | 0 | return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40); |
488 | 0 | #endif |
489 | 0 | } |
490 | | |
491 | | static force_inline pixman_bool_t |
492 | | is_zero (__m64 v) |
493 | 0 | { |
494 | 0 | return is_equal (v, _mm_setzero_si64 ()); |
495 | 0 | } |
496 | | |
497 | | /* Expand 16 bits positioned at @pos (0-3) of an MMX register into |
498 | | * |
499 | | * 00RR00GG00BB |
500 | | * |
501 | | * --- Expanding 565 in the low word --- |
502 | | * |
503 | | * m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
504 | | * m = m & (01f0003f001f); |
505 | | * m = m * (008404100840); |
506 | | * m = m >> 8; |
507 | | * |
508 | | * Note the trick here - the top word is shifted by another nibble to |
509 | | * avoid it bumping into the middle word |
510 | | */ |
511 | | static force_inline __m64 |
512 | | expand565 (__m64 pixel, int pos) |
513 | 0 | { |
514 | 0 | __m64 p = pixel; |
515 | 0 | __m64 t1, t2; |
516 | | |
517 | | /* move pixel to low 16 bit and zero the rest */ |
518 | | #ifdef USE_LOONGSON_MMI |
519 | | p = loongson_extract_pi16 (p, pos); |
520 | | #else |
521 | 0 | p = shift (shift (p, (3 - pos) * 16), -48); |
522 | 0 | #endif |
523 | |
|
524 | 0 | t1 = shift (p, 36 - 11); |
525 | 0 | t2 = shift (p, 16 - 5); |
526 | |
|
527 | 0 | p = _mm_or_si64 (t1, p); |
528 | 0 | p = _mm_or_si64 (t2, p); |
529 | 0 | p = _mm_and_si64 (p, MC (565_rgb)); |
530 | |
|
531 | 0 | pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier)); |
532 | 0 | return _mm_srli_pi16 (pixel, 8); |
533 | 0 | } |
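
A scalar reference for the trick described above (expand565_scalar is a hypothetical helper, not part of pixman): the per-lane multiply by mmx_565_unpack_multiplier followed by the shift right by 8 amounts to replicating each field's high bits into the vacated low bits.

    #include <stdint.h>

    /* Expand one r5g6b5 pixel to 8-bit channels by replicating high bits
     * into the low bits, matching the 00RR00GG00BB lanes produced above. */
    static void
    expand565_scalar (uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
    {
        uint8_t r5 = (p >> 11) & 0x1f;
        uint8_t g6 = (p >> 5)  & 0x3f;
        uint8_t b5 = p & 0x1f;

        *r = (uint8_t)((r5 << 3) | (r5 >> 2));   /* == ((r5 << 4) * 0x0084) >> 8 */
        *g = (uint8_t)((g6 << 2) | (g6 >> 4));   /* ==  (g6 * 0x0410) >> 8 */
        *b = (uint8_t)((b5 << 3) | (b5 >> 2));   /* ==  (b5 * 0x0840) >> 8 */
    }
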
534 | | |
535 | | /* Expand four 16-bit pixels in an MMX register into two MMX registers of |
536 | | * |
537 | | * AARRGGBBAARRGGBB |
538 | | */ |
539 | | static force_inline void |
540 | | expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha) |
541 | 0 | { |
542 | 0 | __m64 t0, t1, alpha = _mm_setzero_si64 (); |
543 | 0 | __m64 r = _mm_and_si64 (vin, MC (expand_565_r)); |
544 | 0 | __m64 g = _mm_and_si64 (vin, MC (expand_565_g)); |
545 | 0 | __m64 b = _mm_and_si64 (vin, MC (expand_565_b)); |
546 | 0 | if (full_alpha) |
547 | 0 | alpha = _mm_cmpeq_pi32 (alpha, alpha); |
548 | | |
549 | | /* Replicate high bits into empty low bits. */ |
550 | 0 | r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13)); |
551 | 0 | g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9)); |
552 | 0 | b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2)); |
553 | |
|
554 | 0 | r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */ |
555 | 0 | g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */ |
556 | 0 | b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */ |
557 | |
|
558 | 0 | t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */ |
559 | 0 | t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */ |
560 | |
|
561 | 0 | *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */ |
562 | 0 | *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */ |
563 | 0 | } |
564 | | |
565 | | static force_inline __m64 |
566 | | expand8888 (__m64 in, int pos) |
567 | 0 | { |
568 | 0 | if (pos == 0) |
569 | 0 | return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ()); |
570 | 0 | else |
571 | 0 | return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ()); |
572 | 0 | } |
573 | | |
574 | | static force_inline __m64 |
575 | | expandx888 (__m64 in, int pos) |
576 | 0 | { |
577 | 0 | return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha)); |
578 | 0 | } |
579 | | |
580 | | static force_inline void |
581 | | expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha) |
582 | 0 | { |
583 | 0 | __m64 v0, v1; |
584 | 0 | expand_4xpacked565 (vin, &v0, &v1, full_alpha); |
585 | 0 | *vout0 = expand8888 (v0, 0); |
586 | 0 | *vout1 = expand8888 (v0, 1); |
587 | 0 | *vout2 = expand8888 (v1, 0); |
588 | 0 | *vout3 = expand8888 (v1, 1); |
589 | 0 | } |
590 | | |
591 | | static force_inline __m64 |
592 | | pack_565 (__m64 pixel, __m64 target, int pos) |
593 | 0 | { |
594 | 0 | __m64 p = pixel; |
595 | 0 | __m64 t = target; |
596 | 0 | __m64 r, g, b; |
597 | |
|
598 | 0 | r = _mm_and_si64 (p, MC (565_r)); |
599 | 0 | g = _mm_and_si64 (p, MC (565_g)); |
600 | 0 | b = _mm_and_si64 (p, MC (565_b)); |
601 | |
|
602 | | #ifdef USE_LOONGSON_MMI |
603 | | r = shift (r, -(32 - 8)); |
604 | | g = shift (g, -(16 - 3)); |
605 | | b = shift (b, -(0 + 3)); |
606 | | |
607 | | p = _mm_or_si64 (r, g); |
608 | | p = _mm_or_si64 (p, b); |
609 | | return loongson_insert_pi16 (t, p, pos); |
610 | | #else |
611 | 0 | r = shift (r, -(32 - 8) + pos * 16); |
612 | 0 | g = shift (g, -(16 - 3) + pos * 16); |
613 | 0 | b = shift (b, -(0 + 3) + pos * 16); |
614 | |
|
615 | 0 | if (pos == 0) |
616 | 0 | t = _mm_and_si64 (t, MC (mask_0)); |
617 | 0 | else if (pos == 1) |
618 | 0 | t = _mm_and_si64 (t, MC (mask_1)); |
619 | 0 | else if (pos == 2) |
620 | 0 | t = _mm_and_si64 (t, MC (mask_2)); |
621 | 0 | else if (pos == 3) |
622 | 0 | t = _mm_and_si64 (t, MC (mask_3)); |
623 | |
|
624 | 0 | p = _mm_or_si64 (r, t); |
625 | 0 | p = _mm_or_si64 (g, p); |
626 | |
|
627 | 0 | return _mm_or_si64 (b, p); |
628 | 0 | #endif |
629 | 0 | } |
630 | | |
631 | | static force_inline __m64 |
632 | | pack_4xpacked565 (__m64 a, __m64 b) |
633 | 0 | { |
634 | 0 | __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb)); |
635 | 0 | __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb)); |
636 | |
|
637 | 0 | __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier)); |
638 | 0 | __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier)); |
639 | |
|
640 | 0 | __m64 g0 = _mm_and_si64 (a, MC (packed_565_g)); |
641 | 0 | __m64 g1 = _mm_and_si64 (b, MC (packed_565_g)); |
642 | |
|
643 | 0 | t0 = _mm_or_si64 (t0, g0); |
644 | 0 | t1 = _mm_or_si64 (t1, g1); |
645 | |
|
646 | 0 | t0 = shift(t0, -5); |
647 | 0 | t1 = shift(t1, -5 + 16); |
648 | 0 | return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0)); |
649 | 0 | } |
650 | | |
651 | | #ifndef _MSC_VER |
652 | | |
653 | | static force_inline __m64 |
654 | | pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3) |
655 | 0 | { |
656 | 0 | return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)); |
657 | 0 | } |
658 | | |
659 | | static force_inline __m64 |
660 | | pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) |
661 | 0 | { |
662 | 0 | x = pix_multiply (x, a); |
663 | 0 | y = pix_multiply (y, b); |
664 | |
|
665 | 0 | return pix_add (x, y); |
666 | 0 | } |
667 | | |
668 | | #else |
669 | | |
670 | | /* MSVC only handles a "pass by register" of up to three SSE intrinsics */ |
671 | | |
672 | | #define pack_4x565(v0, v1, v2, v3) \ |
673 | | pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)) |
674 | | |
675 | | #define pix_add_mul(x, a, y, b) \ |
676 | | ( x = pix_multiply (x, a), \ |
677 | | y = pix_multiply (y, b), \ |
678 | | pix_add (x, y) ) |
679 | | |
680 | | #endif |
681 | | |
682 | | /* --------------- MMX code patch for fbcompose.c --------------------- */ |
683 | | |
684 | | static force_inline __m64 |
685 | | combine (const uint32_t *src, const uint32_t *mask) |
686 | 0 | { |
687 | 0 | __m64 vsrc = load8888 (src); |
688 | |
|
689 | 0 | if (mask) |
690 | 0 | { |
691 | 0 | __m64 m = load8888 (mask); |
692 | |
|
693 | 0 | m = expand_alpha (m); |
694 | 0 | vsrc = pix_multiply (vsrc, m); |
695 | 0 | } |
696 | |
|
697 | 0 | return vsrc; |
698 | 0 | } |
699 | | |
700 | | static force_inline __m64 |
701 | | core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) |
702 | 0 | { |
703 | 0 | vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ()); |
704 | |
|
705 | 0 | if (is_opaque (vsrc)) |
706 | 0 | { |
707 | 0 | return vsrc; |
708 | 0 | } |
709 | 0 | else if (!is_zero (vsrc)) |
710 | 0 | { |
711 | 0 | return over (vsrc, expand_alpha (vsrc), |
712 | 0 | _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ())); |
713 | 0 | } |
714 | | |
715 | 0 | return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); |
716 | 0 | } |
717 | | |
718 | | static void |
719 | | mmx_combine_over_u (pixman_implementation_t *imp, |
720 | | pixman_op_t op, |
721 | | uint32_t * dest, |
722 | | const uint32_t * src, |
723 | | const uint32_t * mask, |
724 | | int width) |
725 | 0 | { |
726 | 0 | const uint32_t *end = dest + width; |
727 | |
|
728 | 0 | while (dest < end) |
729 | 0 | { |
730 | 0 | __m64 vsrc = combine (src, mask); |
731 | |
|
732 | 0 | if (is_opaque (vsrc)) |
733 | 0 | { |
734 | 0 | store8888 (dest, vsrc); |
735 | 0 | } |
736 | 0 | else if (!is_zero (vsrc)) |
737 | 0 | { |
738 | 0 | __m64 sa = expand_alpha (vsrc); |
739 | 0 | store8888 (dest, over (vsrc, sa, load8888 (dest))); |
740 | 0 | } |
741 | |
|
742 | 0 | ++dest; |
743 | 0 | ++src; |
744 | 0 | if (mask) |
745 | 0 | ++mask; |
746 | 0 | } |
747 | 0 | _mm_empty (); |
748 | 0 | } |
749 | | |
750 | | static void |
751 | | mmx_combine_over_reverse_u (pixman_implementation_t *imp, |
752 | | pixman_op_t op, |
753 | | uint32_t * dest, |
754 | | const uint32_t * src, |
755 | | const uint32_t * mask, |
756 | | int width) |
757 | 0 | { |
758 | 0 | const uint32_t *end = dest + width; |
759 | |
|
760 | 0 | while (dest < end) |
761 | 0 | { |
762 | 0 | __m64 d, da; |
763 | 0 | __m64 s = combine (src, mask); |
764 | |
|
765 | 0 | d = load8888 (dest); |
766 | 0 | da = expand_alpha (d); |
767 | 0 | store8888 (dest, over (d, da, s)); |
768 | |
|
769 | 0 | ++dest; |
770 | 0 | ++src; |
771 | 0 | if (mask) |
772 | 0 | mask++; |
773 | 0 | } |
774 | 0 | _mm_empty (); |
775 | 0 | } |
776 | | |
777 | | static void |
778 | | mmx_combine_in_u (pixman_implementation_t *imp, |
779 | | pixman_op_t op, |
780 | | uint32_t * dest, |
781 | | const uint32_t * src, |
782 | | const uint32_t * mask, |
783 | | int width) |
784 | 0 | { |
785 | 0 | const uint32_t *end = dest + width; |
786 | |
|
787 | 0 | while (dest < end) |
788 | 0 | { |
789 | 0 | __m64 a; |
790 | 0 | __m64 x = combine (src, mask); |
791 | |
|
792 | 0 | a = load8888 (dest); |
793 | 0 | a = expand_alpha (a); |
794 | 0 | x = pix_multiply (x, a); |
795 | |
|
796 | 0 | store8888 (dest, x); |
797 | |
|
798 | 0 | ++dest; |
799 | 0 | ++src; |
800 | 0 | if (mask) |
801 | 0 | mask++; |
802 | 0 | } |
803 | 0 | _mm_empty (); |
804 | 0 | } |
805 | | |
806 | | static void |
807 | | mmx_combine_in_reverse_u (pixman_implementation_t *imp, |
808 | | pixman_op_t op, |
809 | | uint32_t * dest, |
810 | | const uint32_t * src, |
811 | | const uint32_t * mask, |
812 | | int width) |
813 | 0 | { |
814 | 0 | const uint32_t *end = dest + width; |
815 | |
|
816 | 0 | while (dest < end) |
817 | 0 | { |
818 | 0 | __m64 a = combine (src, mask); |
819 | 0 | __m64 x; |
820 | |
|
821 | 0 | x = load8888 (dest); |
822 | 0 | a = expand_alpha (a); |
823 | 0 | x = pix_multiply (x, a); |
824 | 0 | store8888 (dest, x); |
825 | |
|
826 | 0 | ++dest; |
827 | 0 | ++src; |
828 | 0 | if (mask) |
829 | 0 | mask++; |
830 | 0 | } |
831 | 0 | _mm_empty (); |
832 | 0 | } |
833 | | |
834 | | static void |
835 | | mmx_combine_out_u (pixman_implementation_t *imp, |
836 | | pixman_op_t op, |
837 | | uint32_t * dest, |
838 | | const uint32_t * src, |
839 | | const uint32_t * mask, |
840 | | int width) |
841 | 0 | { |
842 | 0 | const uint32_t *end = dest + width; |
843 | |
|
844 | 0 | while (dest < end) |
845 | 0 | { |
846 | 0 | __m64 a; |
847 | 0 | __m64 x = combine (src, mask); |
848 | |
|
849 | 0 | a = load8888 (dest); |
850 | 0 | a = expand_alpha (a); |
851 | 0 | a = negate (a); |
852 | 0 | x = pix_multiply (x, a); |
853 | 0 | store8888 (dest, x); |
854 | |
|
855 | 0 | ++dest; |
856 | 0 | ++src; |
857 | 0 | if (mask) |
858 | 0 | mask++; |
859 | 0 | } |
860 | 0 | _mm_empty (); |
861 | 0 | } |
862 | | |
863 | | static void |
864 | | mmx_combine_out_reverse_u (pixman_implementation_t *imp, |
865 | | pixman_op_t op, |
866 | | uint32_t * dest, |
867 | | const uint32_t * src, |
868 | | const uint32_t * mask, |
869 | | int width) |
870 | 0 | { |
871 | 0 | const uint32_t *end = dest + width; |
872 | |
|
873 | 0 | while (dest < end) |
874 | 0 | { |
875 | 0 | __m64 a = combine (src, mask); |
876 | 0 | __m64 x; |
877 | |
|
878 | 0 | x = load8888 (dest); |
879 | 0 | a = expand_alpha (a); |
880 | 0 | a = negate (a); |
881 | 0 | x = pix_multiply (x, a); |
882 | |
|
883 | 0 | store8888 (dest, x); |
884 | |
|
885 | 0 | ++dest; |
886 | 0 | ++src; |
887 | 0 | if (mask) |
888 | 0 | mask++; |
889 | 0 | } |
890 | 0 | _mm_empty (); |
891 | 0 | } |
892 | | |
893 | | static void |
894 | | mmx_combine_atop_u (pixman_implementation_t *imp, |
895 | | pixman_op_t op, |
896 | | uint32_t * dest, |
897 | | const uint32_t * src, |
898 | | const uint32_t * mask, |
899 | | int width) |
900 | 0 | { |
901 | 0 | const uint32_t *end = dest + width; |
902 | |
|
903 | 0 | while (dest < end) |
904 | 0 | { |
905 | 0 | __m64 da, d, sia; |
906 | 0 | __m64 s = combine (src, mask); |
907 | |
|
908 | 0 | d = load8888 (dest); |
909 | 0 | sia = expand_alpha (s); |
910 | 0 | sia = negate (sia); |
911 | 0 | da = expand_alpha (d); |
912 | 0 | s = pix_add_mul (s, da, d, sia); |
913 | 0 | store8888 (dest, s); |
914 | |
|
915 | 0 | ++dest; |
916 | 0 | ++src; |
917 | 0 | if (mask) |
918 | 0 | mask++; |
919 | 0 | } |
920 | 0 | _mm_empty (); |
921 | 0 | } |
922 | | |
923 | | static void |
924 | | mmx_combine_atop_reverse_u (pixman_implementation_t *imp, |
925 | | pixman_op_t op, |
926 | | uint32_t * dest, |
927 | | const uint32_t * src, |
928 | | const uint32_t * mask, |
929 | | int width) |
930 | 0 | { |
931 | 0 | const uint32_t *end; |
932 | |
|
933 | 0 | end = dest + width; |
934 | |
|
935 | 0 | while (dest < end) |
936 | 0 | { |
937 | 0 | __m64 dia, d, sa; |
938 | 0 | __m64 s = combine (src, mask); |
939 | |
|
940 | 0 | d = load8888 (dest); |
941 | 0 | sa = expand_alpha (s); |
942 | 0 | dia = expand_alpha (d); |
943 | 0 | dia = negate (dia); |
944 | 0 | s = pix_add_mul (s, dia, d, sa); |
945 | 0 | store8888 (dest, s); |
946 | |
|
947 | 0 | ++dest; |
948 | 0 | ++src; |
949 | 0 | if (mask) |
950 | 0 | mask++; |
951 | 0 | } |
952 | 0 | _mm_empty (); |
953 | 0 | } |
954 | | |
955 | | static void |
956 | | mmx_combine_xor_u (pixman_implementation_t *imp, |
957 | | pixman_op_t op, |
958 | | uint32_t * dest, |
959 | | const uint32_t * src, |
960 | | const uint32_t * mask, |
961 | | int width) |
962 | 0 | { |
963 | 0 | const uint32_t *end = dest + width; |
964 | |
|
965 | 0 | while (dest < end) |
966 | 0 | { |
967 | 0 | __m64 dia, d, sia; |
968 | 0 | __m64 s = combine (src, mask); |
969 | |
|
970 | 0 | d = load8888 (dest); |
971 | 0 | sia = expand_alpha (s); |
972 | 0 | dia = expand_alpha (d); |
973 | 0 | sia = negate (sia); |
974 | 0 | dia = negate (dia); |
975 | 0 | s = pix_add_mul (s, dia, d, sia); |
976 | 0 | store8888 (dest, s); |
977 | |
|
978 | 0 | ++dest; |
979 | 0 | ++src; |
980 | 0 | if (mask) |
981 | 0 | mask++; |
982 | 0 | } |
983 | 0 | _mm_empty (); |
984 | 0 | } |
985 | | |
986 | | static void |
987 | | mmx_combine_add_u (pixman_implementation_t *imp, |
988 | | pixman_op_t op, |
989 | | uint32_t * dest, |
990 | | const uint32_t * src, |
991 | | const uint32_t * mask, |
992 | | int width) |
993 | 0 | { |
994 | 0 | const uint32_t *end = dest + width; |
995 | |
|
996 | 0 | while (dest < end) |
997 | 0 | { |
998 | 0 | __m64 d; |
999 | 0 | __m64 s = combine (src, mask); |
1000 | |
|
1001 | 0 | d = load8888 (dest); |
1002 | 0 | s = pix_add (s, d); |
1003 | 0 | store8888 (dest, s); |
1004 | |
|
1005 | 0 | ++dest; |
1006 | 0 | ++src; |
1007 | 0 | if (mask) |
1008 | 0 | mask++; |
1009 | 0 | } |
1010 | 0 | _mm_empty (); |
1011 | 0 | } |
1012 | | |
1013 | | static void |
1014 | | mmx_combine_saturate_u (pixman_implementation_t *imp, |
1015 | | pixman_op_t op, |
1016 | | uint32_t * dest, |
1017 | | const uint32_t * src, |
1018 | | const uint32_t * mask, |
1019 | | int width) |
1020 | 0 | { |
1021 | 0 | const uint32_t *end = dest + width; |
1022 | |
|
1023 | 0 | while (dest < end) |
1024 | 0 | { |
1025 | 0 | uint32_t s, sa, da; |
1026 | 0 | uint32_t d = *dest; |
1027 | 0 | __m64 ms = combine (src, mask); |
1028 | 0 | __m64 md = load8888 (dest); |
1029 | |
|
1030 | 0 | store8888(&s, ms); |
1031 | 0 | da = ~d >> 24; |
1032 | 0 | sa = s >> 24; |
1033 | |
|
1034 | 0 | if (sa > da) |
1035 | 0 | { |
1036 | 0 | uint32_t quot = DIV_UN8 (da, sa) << 24; |
1037 | 0 | __m64 msa = load8888 (&quot); |
1038 | 0 | msa = expand_alpha (msa); |
1039 | 0 | ms = pix_multiply (ms, msa); |
1040 | 0 | } |
1041 | |
|
1042 | 0 | md = pix_add (md, ms); |
1043 | 0 | store8888 (dest, md); |
1044 | |
|
1045 | 0 | ++src; |
1046 | 0 | ++dest; |
1047 | 0 | if (mask) |
1048 | 0 | mask++; |
1049 | 0 | } |
1050 | 0 | _mm_empty (); |
1051 | 0 | } |
1052 | | |
1053 | | static void |
1054 | | mmx_combine_src_ca (pixman_implementation_t *imp, |
1055 | | pixman_op_t op, |
1056 | | uint32_t * dest, |
1057 | | const uint32_t * src, |
1058 | | const uint32_t * mask, |
1059 | | int width) |
1060 | 0 | { |
1061 | 0 | const uint32_t *end = src + width; |
1062 | |
|
1063 | 0 | while (src < end) |
1064 | 0 | { |
1065 | 0 | __m64 a = load8888 (mask); |
1066 | 0 | __m64 s = load8888 (src); |
1067 | |
|
1068 | 0 | s = pix_multiply (s, a); |
1069 | 0 | store8888 (dest, s); |
1070 | |
|
1071 | 0 | ++src; |
1072 | 0 | ++mask; |
1073 | 0 | ++dest; |
1074 | 0 | } |
1075 | 0 | _mm_empty (); |
1076 | 0 | } |
1077 | | |
1078 | | static void |
1079 | | mmx_combine_over_ca (pixman_implementation_t *imp, |
1080 | | pixman_op_t op, |
1081 | | uint32_t * dest, |
1082 | | const uint32_t * src, |
1083 | | const uint32_t * mask, |
1084 | | int width) |
1085 | 0 | { |
1086 | 0 | const uint32_t *end = src + width; |
1087 | |
|
1088 | 0 | while (src < end) |
1089 | 0 | { |
1090 | 0 | __m64 a = load8888 (mask); |
1091 | 0 | __m64 s = load8888 (src); |
1092 | 0 | __m64 d = load8888 (dest); |
1093 | 0 | __m64 sa = expand_alpha (s); |
1094 | |
|
1095 | 0 | store8888 (dest, in_over (s, sa, a, d)); |
1096 | |
|
1097 | 0 | ++src; |
1098 | 0 | ++dest; |
1099 | 0 | ++mask; |
1100 | 0 | } |
1101 | 0 | _mm_empty (); |
1102 | 0 | } |
1103 | | |
1104 | | static void |
1105 | | mmx_combine_over_reverse_ca (pixman_implementation_t *imp, |
1106 | | pixman_op_t op, |
1107 | | uint32_t * dest, |
1108 | | const uint32_t * src, |
1109 | | const uint32_t * mask, |
1110 | | int width) |
1111 | 0 | { |
1112 | 0 | const uint32_t *end = src + width; |
1113 | |
|
1114 | 0 | while (src < end) |
1115 | 0 | { |
1116 | 0 | __m64 a = load8888 (mask); |
1117 | 0 | __m64 s = load8888 (src); |
1118 | 0 | __m64 d = load8888 (dest); |
1119 | 0 | __m64 da = expand_alpha (d); |
1120 | |
|
1121 | 0 | store8888 (dest, over (d, da, in (s, a))); |
1122 | |
|
1123 | 0 | ++src; |
1124 | 0 | ++dest; |
1125 | 0 | ++mask; |
1126 | 0 | } |
1127 | 0 | _mm_empty (); |
1128 | 0 | } |
1129 | | |
1130 | | static void |
1131 | | mmx_combine_in_ca (pixman_implementation_t *imp, |
1132 | | pixman_op_t op, |
1133 | | uint32_t * dest, |
1134 | | const uint32_t * src, |
1135 | | const uint32_t * mask, |
1136 | | int width) |
1137 | 0 | { |
1138 | 0 | const uint32_t *end = src + width; |
1139 | |
|
1140 | 0 | while (src < end) |
1141 | 0 | { |
1142 | 0 | __m64 a = load8888 (mask); |
1143 | 0 | __m64 s = load8888 (src); |
1144 | 0 | __m64 d = load8888 (dest); |
1145 | 0 | __m64 da = expand_alpha (d); |
1146 | |
|
1147 | 0 | s = pix_multiply (s, a); |
1148 | 0 | s = pix_multiply (s, da); |
1149 | 0 | store8888 (dest, s); |
1150 | |
|
1151 | 0 | ++src; |
1152 | 0 | ++dest; |
1153 | 0 | ++mask; |
1154 | 0 | } |
1155 | 0 | _mm_empty (); |
1156 | 0 | } |
1157 | | |
1158 | | static void |
1159 | | mmx_combine_in_reverse_ca (pixman_implementation_t *imp, |
1160 | | pixman_op_t op, |
1161 | | uint32_t * dest, |
1162 | | const uint32_t * src, |
1163 | | const uint32_t * mask, |
1164 | | int width) |
1165 | 0 | { |
1166 | 0 | const uint32_t *end = src + width; |
1167 | |
|
1168 | 0 | while (src < end) |
1169 | 0 | { |
1170 | 0 | __m64 a = load8888 (mask); |
1171 | 0 | __m64 s = load8888 (src); |
1172 | 0 | __m64 d = load8888 (dest); |
1173 | 0 | __m64 sa = expand_alpha (s); |
1174 | |
|
1175 | 0 | a = pix_multiply (a, sa); |
1176 | 0 | d = pix_multiply (d, a); |
1177 | 0 | store8888 (dest, d); |
1178 | |
|
1179 | 0 | ++src; |
1180 | 0 | ++dest; |
1181 | 0 | ++mask; |
1182 | 0 | } |
1183 | 0 | _mm_empty (); |
1184 | 0 | } |
1185 | | |
1186 | | static void |
1187 | | mmx_combine_out_ca (pixman_implementation_t *imp, |
1188 | | pixman_op_t op, |
1189 | | uint32_t * dest, |
1190 | | const uint32_t * src, |
1191 | | const uint32_t * mask, |
1192 | | int width) |
1193 | 0 | { |
1194 | 0 | const uint32_t *end = src + width; |
1195 | |
|
1196 | 0 | while (src < end) |
1197 | 0 | { |
1198 | 0 | __m64 a = load8888 (mask); |
1199 | 0 | __m64 s = load8888 (src); |
1200 | 0 | __m64 d = load8888 (dest); |
1201 | 0 | __m64 da = expand_alpha (d); |
1202 | |
|
1203 | 0 | da = negate (da); |
1204 | 0 | s = pix_multiply (s, a); |
1205 | 0 | s = pix_multiply (s, da); |
1206 | 0 | store8888 (dest, s); |
1207 | |
|
1208 | 0 | ++src; |
1209 | 0 | ++dest; |
1210 | 0 | ++mask; |
1211 | 0 | } |
1212 | 0 | _mm_empty (); |
1213 | 0 | } |
1214 | | |
1215 | | static void |
1216 | | mmx_combine_out_reverse_ca (pixman_implementation_t *imp, |
1217 | | pixman_op_t op, |
1218 | | uint32_t * dest, |
1219 | | const uint32_t * src, |
1220 | | const uint32_t * mask, |
1221 | | int width) |
1222 | 0 | { |
1223 | 0 | const uint32_t *end = src + width; |
1224 | |
|
1225 | 0 | while (src < end) |
1226 | 0 | { |
1227 | 0 | __m64 a = load8888 (mask); |
1228 | 0 | __m64 s = load8888 (src); |
1229 | 0 | __m64 d = load8888 (dest); |
1230 | 0 | __m64 sa = expand_alpha (s); |
1231 | |
|
1232 | 0 | a = pix_multiply (a, sa); |
1233 | 0 | a = negate (a); |
1234 | 0 | d = pix_multiply (d, a); |
1235 | 0 | store8888 (dest, d); |
1236 | |
|
1237 | 0 | ++src; |
1238 | 0 | ++dest; |
1239 | 0 | ++mask; |
1240 | 0 | } |
1241 | 0 | _mm_empty (); |
1242 | 0 | } |
1243 | | |
1244 | | static void |
1245 | | mmx_combine_atop_ca (pixman_implementation_t *imp, |
1246 | | pixman_op_t op, |
1247 | | uint32_t * dest, |
1248 | | const uint32_t * src, |
1249 | | const uint32_t * mask, |
1250 | | int width) |
1251 | 0 | { |
1252 | 0 | const uint32_t *end = src + width; |
1253 | |
|
1254 | 0 | while (src < end) |
1255 | 0 | { |
1256 | 0 | __m64 a = load8888 (mask); |
1257 | 0 | __m64 s = load8888 (src); |
1258 | 0 | __m64 d = load8888 (dest); |
1259 | 0 | __m64 da = expand_alpha (d); |
1260 | 0 | __m64 sa = expand_alpha (s); |
1261 | |
|
1262 | 0 | s = pix_multiply (s, a); |
1263 | 0 | a = pix_multiply (a, sa); |
1264 | 0 | a = negate (a); |
1265 | 0 | d = pix_add_mul (d, a, s, da); |
1266 | 0 | store8888 (dest, d); |
1267 | |
|
1268 | 0 | ++src; |
1269 | 0 | ++dest; |
1270 | 0 | ++mask; |
1271 | 0 | } |
1272 | 0 | _mm_empty (); |
1273 | 0 | } |
1274 | | |
1275 | | static void |
1276 | | mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, |
1277 | | pixman_op_t op, |
1278 | | uint32_t * dest, |
1279 | | const uint32_t * src, |
1280 | | const uint32_t * mask, |
1281 | | int width) |
1282 | 0 | { |
1283 | 0 | const uint32_t *end = src + width; |
1284 | |
|
1285 | 0 | while (src < end) |
1286 | 0 | { |
1287 | 0 | __m64 a = load8888 (mask); |
1288 | 0 | __m64 s = load8888 (src); |
1289 | 0 | __m64 d = load8888 (dest); |
1290 | 0 | __m64 da = expand_alpha (d); |
1291 | 0 | __m64 sa = expand_alpha (s); |
1292 | |
|
1293 | 0 | s = pix_multiply (s, a); |
1294 | 0 | a = pix_multiply (a, sa); |
1295 | 0 | da = negate (da); |
1296 | 0 | d = pix_add_mul (d, a, s, da); |
1297 | 0 | store8888 (dest, d); |
1298 | |
|
1299 | 0 | ++src; |
1300 | 0 | ++dest; |
1301 | 0 | ++mask; |
1302 | 0 | } |
1303 | 0 | _mm_empty (); |
1304 | 0 | } |
1305 | | |
1306 | | static void |
1307 | | mmx_combine_xor_ca (pixman_implementation_t *imp, |
1308 | | pixman_op_t op, |
1309 | | uint32_t * dest, |
1310 | | const uint32_t * src, |
1311 | | const uint32_t * mask, |
1312 | | int width) |
1313 | 0 | { |
1314 | 0 | const uint32_t *end = src + width; |
1315 | |
|
1316 | 0 | while (src < end) |
1317 | 0 | { |
1318 | 0 | __m64 a = load8888 (mask); |
1319 | 0 | __m64 s = load8888 (src); |
1320 | 0 | __m64 d = load8888 (dest); |
1321 | 0 | __m64 da = expand_alpha (d); |
1322 | 0 | __m64 sa = expand_alpha (s); |
1323 | |
|
1324 | 0 | s = pix_multiply (s, a); |
1325 | 0 | a = pix_multiply (a, sa); |
1326 | 0 | da = negate (da); |
1327 | 0 | a = negate (a); |
1328 | 0 | d = pix_add_mul (d, a, s, da); |
1329 | 0 | store8888 (dest, d); |
1330 | |
|
1331 | 0 | ++src; |
1332 | 0 | ++dest; |
1333 | 0 | ++mask; |
1334 | 0 | } |
1335 | 0 | _mm_empty (); |
1336 | 0 | } |
1337 | | |
1338 | | static void |
1339 | | mmx_combine_add_ca (pixman_implementation_t *imp, |
1340 | | pixman_op_t op, |
1341 | | uint32_t * dest, |
1342 | | const uint32_t * src, |
1343 | | const uint32_t * mask, |
1344 | | int width) |
1345 | 0 | { |
1346 | 0 | const uint32_t *end = src + width; |
1347 | |
|
1348 | 0 | while (src < end) |
1349 | 0 | { |
1350 | 0 | __m64 a = load8888 (mask); |
1351 | 0 | __m64 s = load8888 (src); |
1352 | 0 | __m64 d = load8888 (dest); |
1353 | |
|
1354 | 0 | s = pix_multiply (s, a); |
1355 | 0 | d = pix_add (s, d); |
1356 | 0 | store8888 (dest, d); |
1357 | |
|
1358 | 0 | ++src; |
1359 | 0 | ++dest; |
1360 | 0 | ++mask; |
1361 | 0 | } |
1362 | 0 | _mm_empty (); |
1363 | 0 | } |
1364 | | |
1365 | | /* ------------- MMX code paths called from fbpict.c -------------------- */ |
1366 | | |
1367 | | static void |
1368 | | mmx_composite_over_n_8888 (pixman_implementation_t *imp, |
1369 | | pixman_composite_info_t *info) |
1370 | 0 | { |
1371 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1372 | 0 | uint32_t src; |
1373 | 0 | uint32_t *dst_line, *dst; |
1374 | 0 | int32_t w; |
1375 | 0 | int dst_stride; |
1376 | 0 | __m64 vsrc, vsrca; |
1377 | |
|
1378 | 0 | CHECKPOINT (); |
1379 | |
|
1380 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1381 | |
|
1382 | 0 | if (src == 0) |
1383 | 0 | return; |
1384 | | |
1385 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1386 | |
|
1387 | 0 | vsrc = load8888 (&src); |
1388 | 0 | vsrca = expand_alpha (vsrc); |
1389 | |
|
1390 | 0 | while (height--) |
1391 | 0 | { |
1392 | 0 | dst = dst_line; |
1393 | 0 | dst_line += dst_stride; |
1394 | 0 | w = width; |
1395 | |
|
1396 | 0 | CHECKPOINT (); |
1397 | |
|
1398 | 0 | while (w && (uintptr_t)dst & 7) |
1399 | 0 | { |
1400 | 0 | store8888 (dst, over (vsrc, vsrca, load8888 (dst))); |
1401 | |
|
1402 | 0 | w--; |
1403 | 0 | dst++; |
1404 | 0 | } |
1405 | |
|
1406 | 0 | while (w >= 2) |
1407 | 0 | { |
1408 | 0 | __m64 vdest; |
1409 | 0 | __m64 dest0, dest1; |
1410 | |
|
1411 | 0 | vdest = *(__m64 *)dst; |
1412 | |
|
1413 | 0 | dest0 = over (vsrc, vsrca, expand8888 (vdest, 0)); |
1414 | 0 | dest1 = over (vsrc, vsrca, expand8888 (vdest, 1)); |
1415 | |
|
1416 | 0 | *(__m64 *)dst = pack8888 (dest0, dest1); |
1417 | |
|
1418 | 0 | dst += 2; |
1419 | 0 | w -= 2; |
1420 | 0 | } |
1421 | |
|
1422 | 0 | CHECKPOINT (); |
1423 | |
|
1424 | 0 | if (w) |
1425 | 0 | { |
1426 | 0 | store8888 (dst, over (vsrc, vsrca, load8888 (dst))); |
1427 | 0 | } |
1428 | 0 | } |
1429 | |
|
1430 | 0 | _mm_empty (); |
1431 | 0 | } |
1432 | | |
1433 | | static void |
1434 | | mmx_composite_over_n_0565 (pixman_implementation_t *imp, |
1435 | | pixman_composite_info_t *info) |
1436 | 0 | { |
1437 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1438 | 0 | uint32_t src; |
1439 | 0 | uint16_t *dst_line, *dst; |
1440 | 0 | int32_t w; |
1441 | 0 | int dst_stride; |
1442 | 0 | __m64 vsrc, vsrca; |
1443 | |
|
1444 | 0 | CHECKPOINT (); |
1445 | |
|
1446 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1447 | |
|
1448 | 0 | if (src == 0) |
1449 | 0 | return; |
1450 | | |
1451 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
1452 | |
|
1453 | 0 | vsrc = load8888 (&src); |
1454 | 0 | vsrca = expand_alpha (vsrc); |
1455 | |
|
1456 | 0 | while (height--) |
1457 | 0 | { |
1458 | 0 | dst = dst_line; |
1459 | 0 | dst_line += dst_stride; |
1460 | 0 | w = width; |
1461 | |
|
1462 | 0 | CHECKPOINT (); |
1463 | |
|
1464 | 0 | while (w && (uintptr_t)dst & 7) |
1465 | 0 | { |
1466 | 0 | uint64_t d = *dst; |
1467 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
1468 | |
|
1469 | 0 | vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); |
1470 | 0 | *dst = to_uint64 (vdest); |
1471 | |
|
1472 | 0 | w--; |
1473 | 0 | dst++; |
1474 | 0 | } |
1475 | |
|
1476 | 0 | while (w >= 4) |
1477 | 0 | { |
1478 | 0 | __m64 vdest = *(__m64 *)dst; |
1479 | 0 | __m64 v0, v1, v2, v3; |
1480 | |
|
1481 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
1482 | |
|
1483 | 0 | v0 = over (vsrc, vsrca, v0); |
1484 | 0 | v1 = over (vsrc, vsrca, v1); |
1485 | 0 | v2 = over (vsrc, vsrca, v2); |
1486 | 0 | v3 = over (vsrc, vsrca, v3); |
1487 | |
|
1488 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
1489 | |
|
1490 | 0 | dst += 4; |
1491 | 0 | w -= 4; |
1492 | 0 | } |
1493 | |
|
1494 | 0 | CHECKPOINT (); |
1495 | |
|
1496 | 0 | while (w) |
1497 | 0 | { |
1498 | 0 | uint64_t d = *dst; |
1499 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
1500 | |
|
1501 | 0 | vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); |
1502 | 0 | *dst = to_uint64 (vdest); |
1503 | |
|
1504 | 0 | w--; |
1505 | 0 | dst++; |
1506 | 0 | } |
1507 | 0 | } |
1508 | |
|
1509 | 0 | _mm_empty (); |
1510 | 0 | } |
1511 | | |
1512 | | static void |
1513 | | mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, |
1514 | | pixman_composite_info_t *info) |
1515 | 0 | { |
1516 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1517 | 0 | uint32_t src; |
1518 | 0 | uint32_t *dst_line; |
1519 | 0 | uint32_t *mask_line; |
1520 | 0 | int dst_stride, mask_stride; |
1521 | 0 | __m64 vsrc, vsrca; |
1522 | |
|
1523 | 0 | CHECKPOINT (); |
1524 | |
|
1525 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1526 | |
|
1527 | 0 | if (src == 0) |
1528 | 0 | return; |
1529 | | |
1530 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1531 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
1532 | |
|
1533 | 0 | vsrc = load8888 (&src); |
1534 | 0 | vsrca = expand_alpha (vsrc); |
1535 | |
|
1536 | 0 | while (height--) |
1537 | 0 | { |
1538 | 0 | int twidth = width; |
1539 | 0 | uint32_t *p = (uint32_t *)mask_line; |
1540 | 0 | uint32_t *q = (uint32_t *)dst_line; |
1541 | |
|
1542 | 0 | while (twidth && (uintptr_t)q & 7) |
1543 | 0 | { |
1544 | 0 | uint32_t m = *(uint32_t *)p; |
1545 | |
|
1546 | 0 | if (m) |
1547 | 0 | { |
1548 | 0 | __m64 vdest = load8888 (q); |
1549 | 0 | vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); |
1550 | 0 | store8888 (q, vdest); |
1551 | 0 | } |
1552 | |
|
1553 | 0 | twidth--; |
1554 | 0 | p++; |
1555 | 0 | q++; |
1556 | 0 | } |
1557 | |
|
1558 | 0 | while (twidth >= 2) |
1559 | 0 | { |
1560 | 0 | uint32_t m0, m1; |
1561 | 0 | m0 = *p; |
1562 | 0 | m1 = *(p + 1); |
1563 | |
|
1564 | 0 | if (m0 | m1) |
1565 | 0 | { |
1566 | 0 | __m64 dest0, dest1; |
1567 | 0 | __m64 vdest = *(__m64 *)q; |
1568 | |
|
1569 | 0 | dest0 = in_over (vsrc, vsrca, load8888 (&m0), |
1570 | 0 | expand8888 (vdest, 0)); |
1571 | 0 | dest1 = in_over (vsrc, vsrca, load8888 (&m1), |
1572 | 0 | expand8888 (vdest, 1)); |
1573 | |
|
1574 | 0 | *(__m64 *)q = pack8888 (dest0, dest1); |
1575 | 0 | } |
1576 | |
|
1577 | 0 | p += 2; |
1578 | 0 | q += 2; |
1579 | 0 | twidth -= 2; |
1580 | 0 | } |
1581 | |
|
1582 | 0 | if (twidth) |
1583 | 0 | { |
1584 | 0 | uint32_t m = *(uint32_t *)p; |
1585 | |
|
1586 | 0 | if (m) |
1587 | 0 | { |
1588 | 0 | __m64 vdest = load8888 (q); |
1589 | 0 | vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); |
1590 | 0 | store8888 (q, vdest); |
1591 | 0 | } |
1592 | |
|
1593 | 0 | twidth--; |
1594 | 0 | p++; |
1595 | 0 | q++; |
1596 | 0 | } |
1597 | |
|
1598 | 0 | dst_line += dst_stride; |
1599 | 0 | mask_line += mask_stride; |
1600 | 0 | } |
1601 | |
|
1602 | 0 | _mm_empty (); |
1603 | 0 | } |
1604 | | |
1605 | | static void |
1606 | | mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, |
1607 | | pixman_composite_info_t *info) |
1608 | 0 | { |
1609 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1610 | 0 | uint32_t *dst_line, *dst; |
1611 | 0 | uint32_t *src_line, *src; |
1612 | 0 | uint32_t mask; |
1613 | 0 | __m64 vmask; |
1614 | 0 | int dst_stride, src_stride; |
1615 | 0 | int32_t w; |
1616 | |
|
1617 | 0 | CHECKPOINT (); |
1618 | |
|
1619 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1620 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
1621 | |
|
1622 | 0 | mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); |
1623 | 0 | vmask = expand_alpha (load8888 (&mask)); |
1624 | |
|
1625 | 0 | while (height--) |
1626 | 0 | { |
1627 | 0 | dst = dst_line; |
1628 | 0 | dst_line += dst_stride; |
1629 | 0 | src = src_line; |
1630 | 0 | src_line += src_stride; |
1631 | 0 | w = width; |
1632 | |
|
1633 | 0 | while (w && (uintptr_t)dst & 7) |
1634 | 0 | { |
1635 | 0 | __m64 s = load8888 (src); |
1636 | 0 | __m64 d = load8888 (dst); |
1637 | |
|
1638 | 0 | store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); |
1639 | |
|
1640 | 0 | w--; |
1641 | 0 | dst++; |
1642 | 0 | src++; |
1643 | 0 | } |
1644 | |
|
1645 | 0 | while (w >= 2) |
1646 | 0 | { |
1647 | 0 | __m64 vs = ldq_u ((__m64 *)src); |
1648 | 0 | __m64 vd = *(__m64 *)dst; |
1649 | 0 | __m64 vsrc0 = expand8888 (vs, 0); |
1650 | 0 | __m64 vsrc1 = expand8888 (vs, 1); |
1651 | |
|
1652 | 0 | *(__m64 *)dst = pack8888 ( |
1653 | 0 | in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), |
1654 | 0 | in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); |
1655 | |
|
1656 | 0 | w -= 2; |
1657 | 0 | dst += 2; |
1658 | 0 | src += 2; |
1659 | 0 | } |
1660 | |
|
1661 | 0 | if (w) |
1662 | 0 | { |
1663 | 0 | __m64 s = load8888 (src); |
1664 | 0 | __m64 d = load8888 (dst); |
1665 | |
|
1666 | 0 | store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); |
1667 | 0 | } |
1668 | 0 | } |
1669 | |
|
1670 | 0 | _mm_empty (); |
1671 | 0 | } |
1672 | | |
1673 | | static void |
1674 | | mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, |
1675 | | pixman_composite_info_t *info) |
1676 | 0 | { |
1677 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1678 | 0 | uint32_t *dst_line, *dst; |
1679 | 0 | uint32_t *src_line, *src; |
1680 | 0 | uint32_t mask; |
1681 | 0 | __m64 vmask; |
1682 | 0 | int dst_stride, src_stride; |
1683 | 0 | int32_t w; |
1684 | 0 | __m64 srca; |
1685 | |
|
1686 | 0 | CHECKPOINT (); |
1687 | |
|
1688 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1689 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
1690 | 0 | mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); |
1691 | |
|
1692 | 0 | vmask = expand_alpha (load8888 (&mask)); |
1693 | 0 | srca = MC (4x00ff); |
1694 | |
|
1695 | 0 | while (height--) |
1696 | 0 | { |
1697 | 0 | dst = dst_line; |
1698 | 0 | dst_line += dst_stride; |
1699 | 0 | src = src_line; |
1700 | 0 | src_line += src_stride; |
1701 | 0 | w = width; |
1702 | |
|
1703 | 0 | while (w && (uintptr_t)dst & 7) |
1704 | 0 | { |
1705 | 0 | uint32_t ssrc = *src | 0xff000000; |
1706 | 0 | __m64 s = load8888 (&ssrc); |
1707 | 0 | __m64 d = load8888 (dst); |
1708 | |
|
1709 | 0 | store8888 (dst, in_over (s, srca, vmask, d)); |
1710 | |
|
1711 | 0 | w--; |
1712 | 0 | dst++; |
1713 | 0 | src++; |
1714 | 0 | } |
1715 | |
|
1716 | 0 | while (w >= 16) |
1717 | 0 | { |
1718 | 0 | __m64 vd0 = *(__m64 *)(dst + 0); |
1719 | 0 | __m64 vd1 = *(__m64 *)(dst + 2); |
1720 | 0 | __m64 vd2 = *(__m64 *)(dst + 4); |
1721 | 0 | __m64 vd3 = *(__m64 *)(dst + 6); |
1722 | 0 | __m64 vd4 = *(__m64 *)(dst + 8); |
1723 | 0 | __m64 vd5 = *(__m64 *)(dst + 10); |
1724 | 0 | __m64 vd6 = *(__m64 *)(dst + 12); |
1725 | 0 | __m64 vd7 = *(__m64 *)(dst + 14); |
1726 | |
|
1727 | 0 | __m64 vs0 = ldq_u ((__m64 *)(src + 0)); |
1728 | 0 | __m64 vs1 = ldq_u ((__m64 *)(src + 2)); |
1729 | 0 | __m64 vs2 = ldq_u ((__m64 *)(src + 4)); |
1730 | 0 | __m64 vs3 = ldq_u ((__m64 *)(src + 6)); |
1731 | 0 | __m64 vs4 = ldq_u ((__m64 *)(src + 8)); |
1732 | 0 | __m64 vs5 = ldq_u ((__m64 *)(src + 10)); |
1733 | 0 | __m64 vs6 = ldq_u ((__m64 *)(src + 12)); |
1734 | 0 | __m64 vs7 = ldq_u ((__m64 *)(src + 14)); |
1735 | |
|
1736 | 0 | vd0 = pack8888 ( |
1737 | 0 | in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), |
1738 | 0 | in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); |
1739 | |
|
1740 | 0 | vd1 = pack8888 ( |
1741 | 0 | in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), |
1742 | 0 | in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); |
1743 | |
|
1744 | 0 | vd2 = pack8888 ( |
1745 | 0 | in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), |
1746 | 0 | in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); |
1747 | |
|
1748 | 0 | vd3 = pack8888 ( |
1749 | 0 | in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), |
1750 | 0 | in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); |
1751 | |
|
1752 | 0 | vd4 = pack8888 ( |
1753 | 0 | in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), |
1754 | 0 | in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); |
1755 | |
|
1756 | 0 | vd5 = pack8888 ( |
1757 | 0 | in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), |
1758 | 0 | in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); |
1759 | |
|
1760 | 0 | vd6 = pack8888 ( |
1761 | 0 | in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), |
1762 | 0 | in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); |
1763 | |
|
1764 | 0 | vd7 = pack8888 ( |
1765 | 0 | in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), |
1766 | 0 | in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); |
1767 | |
|
1768 | 0 | *(__m64 *)(dst + 0) = vd0; |
1769 | 0 | *(__m64 *)(dst + 2) = vd1; |
1770 | 0 | *(__m64 *)(dst + 4) = vd2; |
1771 | 0 | *(__m64 *)(dst + 6) = vd3; |
1772 | 0 | *(__m64 *)(dst + 8) = vd4; |
1773 | 0 | *(__m64 *)(dst + 10) = vd5; |
1774 | 0 | *(__m64 *)(dst + 12) = vd6; |
1775 | 0 | *(__m64 *)(dst + 14) = vd7; |
1776 | |
|
1777 | 0 | w -= 16; |
1778 | 0 | dst += 16; |
1779 | 0 | src += 16; |
1780 | 0 | } |
1781 | |
|
1782 | 0 | while (w) |
1783 | 0 | { |
1784 | 0 | uint32_t ssrc = *src | 0xff000000; |
1785 | 0 | __m64 s = load8888 (&ssrc); |
1786 | 0 | __m64 d = load8888 (dst); |
1787 | |
|
1788 | 0 | store8888 (dst, in_over (s, srca, vmask, d)); |
1789 | |
|
1790 | 0 | w--; |
1791 | 0 | dst++; |
1792 | 0 | src++; |
1793 | 0 | } |
1794 | 0 | } |
1795 | |
|
1796 | 0 | _mm_empty (); |
1797 | 0 | } |
1798 | | |
1799 | | static void |
1800 | | mmx_composite_over_8888_8888 (pixman_implementation_t *imp, |
1801 | | pixman_composite_info_t *info) |
1802 | 0 | { |
1803 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1804 | 0 | uint32_t *dst_line, *dst; |
1805 | 0 | uint32_t *src_line, *src; |
1806 | 0 | uint32_t s; |
1807 | 0 | int dst_stride, src_stride; |
1808 | 0 | uint8_t a; |
1809 | 0 | int32_t w; |
1810 | |
|
1811 | 0 | CHECKPOINT (); |
1812 | |
|
1813 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1814 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
1815 | |
|
1816 | 0 | while (height--) |
1817 | 0 | { |
1818 | 0 | dst = dst_line; |
1819 | 0 | dst_line += dst_stride; |
1820 | 0 | src = src_line; |
1821 | 0 | src_line += src_stride; |
1822 | 0 | w = width; |
1823 | |
|
1824 | 0 | while (w--) |
1825 | 0 | { |
1826 | 0 | s = *src++; |
1827 | 0 | a = s >> 24; |
1828 | |
|
1829 | 0 | if (a == 0xff) |
1830 | 0 | { |
1831 | 0 | *dst = s; |
1832 | 0 | } |
1833 | 0 | else if (s) |
1834 | 0 | { |
1835 | 0 | __m64 ms, sa; |
1836 | 0 | ms = load8888 (&s); |
1837 | 0 | sa = expand_alpha (ms); |
1838 | 0 | store8888 (dst, over (ms, sa, load8888 (dst))); |
1839 | 0 | } |
1840 | |
|
1841 | 0 | dst++; |
1842 | 0 | } |
1843 | 0 | } |
1844 | 0 | _mm_empty (); |
1845 | 0 | } |
1846 | | |
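The loop above keeps two scalar fast paths, copying fully opaque source pixels (a == 0xff) directly and skipping fully transparent ones (s == 0), and only performs the full blend otherwise. As a point of reference, a minimal scalar sketch of that blend follows; over_pixel() and div255() are hypothetical names, not pixman API, and premultiplied ARGB input is assumed.

    /* Hypothetical scalar equivalent of over (ms, sa, d) above:
     * dst = src + dst * (1 - src_alpha), per 8-bit channel. */
    #include <stdint.h>

    static uint8_t div255 (uint16_t x)    /* rounding approximation of x / 255 */
    {
        x += 128;
        return (uint8_t) ((x + (x >> 8)) >> 8);
    }

    static uint32_t over_pixel (uint32_t src, uint32_t dst)
    {
        uint8_t  ia = 255 - (src >> 24);  /* inverse source alpha */
        uint32_t res = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t  s = src >> shift;
            uint8_t  d = dst >> shift;
            uint16_t v = s + div255 ((uint16_t) d * ia);

            if (v > 255)                  /* mirrors the saturating MMX add */
                v = 255;
            res |= (uint32_t) v << shift;
        }
        return res;
    }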
1847 | | static void |
1848 | | mmx_composite_over_8888_0565 (pixman_implementation_t *imp, |
1849 | | pixman_composite_info_t *info) |
1850 | 0 | { |
1851 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1852 | 0 | uint16_t *dst_line, *dst; |
1853 | 0 | uint32_t *src_line, *src; |
1854 | 0 | int dst_stride, src_stride; |
1855 | 0 | int32_t w; |
1856 | |
|
1857 | 0 | CHECKPOINT (); |
1858 | |
|
1859 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
1860 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
1861 | |
|
1862 | | #if 0 |
1863 | | /* FIXME */ |
1864 | | assert (src_image->drawable == mask_image->drawable); |
1865 | | #endif |
1866 | |
|
1867 | 0 | while (height--) |
1868 | 0 | { |
1869 | 0 | dst = dst_line; |
1870 | 0 | dst_line += dst_stride; |
1871 | 0 | src = src_line; |
1872 | 0 | src_line += src_stride; |
1873 | 0 | w = width; |
1874 | |
|
1875 | 0 | CHECKPOINT (); |
1876 | |
|
1877 | 0 | while (w && (uintptr_t)dst & 7) |
1878 | 0 | { |
1879 | 0 | __m64 vsrc = load8888 (src); |
1880 | 0 | uint64_t d = *dst; |
1881 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
1882 | |
|
1883 | 0 | vdest = pack_565 ( |
1884 | 0 | over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); |
1885 | |
|
1886 | 0 | *dst = to_uint64 (vdest); |
1887 | |
|
1888 | 0 | w--; |
1889 | 0 | dst++; |
1890 | 0 | src++; |
1891 | 0 | } |
1892 | |
|
1893 | 0 | CHECKPOINT (); |
1894 | |
|
1895 | 0 | while (w >= 4) |
1896 | 0 | { |
1897 | 0 | __m64 vdest = *(__m64 *)dst; |
1898 | 0 | __m64 v0, v1, v2, v3; |
1899 | 0 | __m64 vsrc0, vsrc1, vsrc2, vsrc3; |
1900 | |
|
1901 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
1902 | |
|
1903 | 0 | vsrc0 = load8888 ((src + 0)); |
1904 | 0 | vsrc1 = load8888 ((src + 1)); |
1905 | 0 | vsrc2 = load8888 ((src + 2)); |
1906 | 0 | vsrc3 = load8888 ((src + 3)); |
1907 | |
|
1908 | 0 | v0 = over (vsrc0, expand_alpha (vsrc0), v0); |
1909 | 0 | v1 = over (vsrc1, expand_alpha (vsrc1), v1); |
1910 | 0 | v2 = over (vsrc2, expand_alpha (vsrc2), v2); |
1911 | 0 | v3 = over (vsrc3, expand_alpha (vsrc3), v3); |
1912 | |
|
1913 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
1914 | |
|
1915 | 0 | w -= 4; |
1916 | 0 | dst += 4; |
1917 | 0 | src += 4; |
1918 | 0 | } |
1919 | |
|
1920 | 0 | CHECKPOINT (); |
1921 | |
|
1922 | 0 | while (w) |
1923 | 0 | { |
1924 | 0 | __m64 vsrc = load8888 (src); |
1925 | 0 | uint64_t d = *dst; |
1926 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
1927 | |
|
1928 | 0 | vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); |
1929 | |
|
1930 | 0 | *dst = to_uint64 (vdest); |
1931 | |
|
1932 | 0 | w--; |
1933 | 0 | dst++; |
1934 | 0 | src++; |
1935 | 0 | } |
1936 | 0 | } |
1937 | |
|
1938 | 0 | _mm_empty (); |
1939 | 0 | } |
1940 | | |
1941 | | static void |
1942 | | mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, |
1943 | | pixman_composite_info_t *info) |
1944 | 0 | { |
1945 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1946 | 0 | uint32_t src, srca; |
1947 | 0 | uint32_t *dst_line, *dst; |
1948 | 0 | uint8_t *mask_line, *mask; |
1949 | 0 | int dst_stride, mask_stride; |
1950 | 0 | int32_t w; |
1951 | 0 | __m64 vsrc, vsrca; |
1952 | 0 | uint64_t srcsrc; |
1953 | |
|
1954 | 0 | CHECKPOINT (); |
1955 | |
|
1956 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1957 | |
|
1958 | 0 | srca = src >> 24; |
1959 | 0 | if (src == 0) |
1960 | 0 | return; |
1961 | | |
1962 | 0 | srcsrc = (uint64_t)src << 32 | src; |
1963 | |
|
1964 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1965 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
1966 | |
|
1967 | 0 | vsrc = load8888 (&src); |
1968 | 0 | vsrca = expand_alpha (vsrc); |
1969 | |
|
1970 | 0 | while (height--) |
1971 | 0 | { |
1972 | 0 | dst = dst_line; |
1973 | 0 | dst_line += dst_stride; |
1974 | 0 | mask = mask_line; |
1975 | 0 | mask_line += mask_stride; |
1976 | 0 | w = width; |
1977 | |
|
1978 | 0 | CHECKPOINT (); |
1979 | |
|
1980 | 0 | while (w && (uintptr_t)dst & 7) |
1981 | 0 | { |
1982 | 0 | uint64_t m = *mask; |
1983 | |
|
1984 | 0 | if (m) |
1985 | 0 | { |
1986 | 0 | __m64 vdest = in_over (vsrc, vsrca, |
1987 | 0 | expand_alpha_rev (to_m64 (m)), |
1988 | 0 | load8888 (dst)); |
1989 | |
|
1990 | 0 | store8888 (dst, vdest); |
1991 | 0 | } |
1992 | |
|
1993 | 0 | w--; |
1994 | 0 | mask++; |
1995 | 0 | dst++; |
1996 | 0 | } |
1997 | |
|
1998 | 0 | CHECKPOINT (); |
1999 | |
|
2000 | 0 | while (w >= 2) |
2001 | 0 | { |
2002 | 0 | uint64_t m0, m1; |
2003 | |
|
2004 | 0 | m0 = *mask; |
2005 | 0 | m1 = *(mask + 1); |
2006 | |
|
2007 | 0 | if (srca == 0xff && (m0 & m1) == 0xff) |
2008 | 0 | { |
2009 | 0 | *(uint64_t *)dst = srcsrc; |
2010 | 0 | } |
2011 | 0 | else if (m0 | m1) |
2012 | 0 | { |
2013 | 0 | __m64 vdest; |
2014 | 0 | __m64 dest0, dest1; |
2015 | |
|
2016 | 0 | vdest = *(__m64 *)dst; |
2017 | |
|
2018 | 0 | dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)), |
2019 | 0 | expand8888 (vdest, 0)); |
2020 | 0 | dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)), |
2021 | 0 | expand8888 (vdest, 1)); |
2022 | |
|
2023 | 0 | *(__m64 *)dst = pack8888 (dest0, dest1); |
2024 | 0 | } |
2025 | |
|
2026 | 0 | mask += 2; |
2027 | 0 | dst += 2; |
2028 | 0 | w -= 2; |
2029 | 0 | } |
2030 | |
|
2031 | 0 | CHECKPOINT (); |
2032 | |
|
2033 | 0 | if (w) |
2034 | 0 | { |
2035 | 0 | uint64_t m = *mask; |
2036 | |
|
2037 | 0 | if (m) |
2038 | 0 | { |
2039 | 0 | __m64 vdest = load8888 (dst); |
2040 | |
|
2041 | 0 | vdest = in_over ( |
2042 | 0 | vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest); |
2043 | 0 | store8888 (dst, vdest); |
2044 | 0 | } |
2045 | 0 | } |
2046 | 0 | } |
2047 | |
|
2048 | 0 | _mm_empty (); |
2049 | 0 | } |
2050 | | |
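For comparison, here is a hypothetical scalar reading of the in_over (vsrc, vsrca, mask, dest) step used throughout this routine, assuming it composites the solid source, scaled by the 8-bit mask coverage, OVER the destination. in_over_pixel() and DIV255() are illustration-only names and the ARGB values are assumed premultiplied.

    /* Sketch only: masked source OVER destination, per 8-bit channel. */
    #include <stdint.h>

    #define DIV255(x) ((uint16_t) (((x) + 128 + (((x) + 128) >> 8)) >> 8))

    static uint32_t in_over_pixel (uint32_t src, uint8_t mask, uint32_t dst)
    {
        uint8_t  sa = src >> 24;
        uint8_t  ia = 255 - DIV255 ((uint16_t) sa * mask); /* 1 - srca * mask */
        uint32_t res = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t  s = src >> shift;
            uint8_t  d = dst >> shift;
            uint16_t v = DIV255 ((uint16_t) s * mask) + DIV255 ((uint16_t) d * ia);

            if (v > 255)
                v = 255;
            res |= (uint32_t) v << shift;
        }
        return res;
    }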
2051 | | static pixman_bool_t |
2052 | | mmx_fill (pixman_implementation_t *imp, |
2053 | | uint32_t * bits, |
2054 | | int stride, |
2055 | | int bpp, |
2056 | | int x, |
2057 | | int y, |
2058 | | int width, |
2059 | | int height, |
2060 | | uint32_t filler) |
2061 | 0 | { |
2062 | 0 | uint64_t fill; |
2063 | 0 | __m64 vfill; |
2064 | 0 | uint32_t byte_width; |
2065 | 0 | uint8_t *byte_line; |
2066 | |
|
2067 | 0 | #if defined __GNUC__ && defined USE_X86_MMX |
2068 | 0 | __m64 v1, v2, v3, v4, v5, v6, v7; |
2069 | 0 | #endif |
2070 | |
|
2071 | 0 | if (bpp != 16 && bpp != 32 && bpp != 8) |
2072 | 0 | return FALSE; |
2073 | | |
2074 | 0 | if (bpp == 8) |
2075 | 0 | { |
2076 | 0 | stride = stride * (int) sizeof (uint32_t) / 1; |
2077 | 0 | byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); |
2078 | 0 | byte_width = width; |
2079 | 0 | stride *= 1; |
2080 | 0 | filler = (filler & 0xff) * 0x01010101; |
2081 | 0 | } |
2082 | 0 | else if (bpp == 16) |
2083 | 0 | { |
2084 | 0 | stride = stride * (int) sizeof (uint32_t) / 2; |
2085 | 0 | byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); |
2086 | 0 | byte_width = 2 * width; |
2087 | 0 | stride *= 2; |
2088 | 0 | filler = (filler & 0xffff) * 0x00010001; |
2089 | 0 | } |
2090 | 0 | else |
2091 | 0 | { |
2092 | 0 | stride = stride * (int) sizeof (uint32_t) / 4; |
2093 | 0 | byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); |
2094 | 0 | byte_width = 4 * width; |
2095 | 0 | stride *= 4; |
2096 | 0 | } |
2097 | |
|
2098 | 0 | fill = ((uint64_t)filler << 32) | filler; |
2099 | 0 | vfill = to_m64 (fill); |
2100 | |
|
2101 | 0 | #if defined __GNUC__ && defined USE_X86_MMX |
2102 | 0 | __asm__ ( |
2103 | 0 | "movq %7, %0\n" |
2104 | 0 | "movq %7, %1\n" |
2105 | 0 | "movq %7, %2\n" |
2106 | 0 | "movq %7, %3\n" |
2107 | 0 | "movq %7, %4\n" |
2108 | 0 | "movq %7, %5\n" |
2109 | 0 | "movq %7, %6\n" |
2110 | 0 | : "=&y" (v1), "=&y" (v2), "=&y" (v3), |
2111 | 0 | "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7) |
2112 | 0 | : "y" (vfill)); |
2113 | 0 | #endif |
2114 | |
|
2115 | 0 | while (height--) |
2116 | 0 | { |
2117 | 0 | int w; |
2118 | 0 | uint8_t *d = byte_line; |
2119 | |
|
2120 | 0 | byte_line += stride; |
2121 | 0 | w = byte_width; |
2122 | |
|
2123 | 0 | if (w >= 1 && ((uintptr_t)d & 1)) |
2124 | 0 | { |
2125 | 0 | *(uint8_t *)d = (filler & 0xff); |
2126 | 0 | w--; |
2127 | 0 | d++; |
2128 | 0 | } |
2129 | |
|
2130 | 0 | if (w >= 2 && ((uintptr_t)d & 3)) |
2131 | 0 | { |
2132 | 0 | *(uint16_t *)d = filler; |
2133 | 0 | w -= 2; |
2134 | 0 | d += 2; |
2135 | 0 | } |
2136 | |
|
2137 | 0 | while (w >= 4 && ((uintptr_t)d & 7)) |
2138 | 0 | { |
2139 | 0 | *(uint32_t *)d = filler; |
2140 | |
|
2141 | 0 | w -= 4; |
2142 | 0 | d += 4; |
2143 | 0 | } |
2144 | |
|
2145 | 0 | while (w >= 64) |
2146 | 0 | { |
2147 | 0 | #if defined __GNUC__ && defined USE_X86_MMX |
2148 | 0 | __asm__ ( |
2149 | 0 | "movq %1, (%0)\n" |
2150 | 0 | "movq %2, 8(%0)\n" |
2151 | 0 | "movq %3, 16(%0)\n" |
2152 | 0 | "movq %4, 24(%0)\n" |
2153 | 0 | "movq %5, 32(%0)\n" |
2154 | 0 | "movq %6, 40(%0)\n" |
2155 | 0 | "movq %7, 48(%0)\n" |
2156 | 0 | "movq %8, 56(%0)\n" |
2157 | 0 | : |
2158 | 0 | : "r" (d), |
2159 | 0 | "y" (vfill), "y" (v1), "y" (v2), "y" (v3), |
2160 | 0 | "y" (v4), "y" (v5), "y" (v6), "y" (v7) |
2161 | 0 | : "memory"); |
2162 | | #else |
2163 | | *(__m64*) (d + 0) = vfill; |
2164 | | *(__m64*) (d + 8) = vfill; |
2165 | | *(__m64*) (d + 16) = vfill; |
2166 | | *(__m64*) (d + 24) = vfill; |
2167 | | *(__m64*) (d + 32) = vfill; |
2168 | | *(__m64*) (d + 40) = vfill; |
2169 | | *(__m64*) (d + 48) = vfill; |
2170 | | *(__m64*) (d + 56) = vfill; |
2171 | | #endif |
2172 | 0 | w -= 64; |
2173 | 0 | d += 64; |
2174 | 0 | } |
2175 | |
|
2176 | 0 | while (w >= 4) |
2177 | 0 | { |
2178 | 0 | *(uint32_t *)d = filler; |
2179 | |
|
2180 | 0 | w -= 4; |
2181 | 0 | d += 4; |
2182 | 0 | } |
2183 | 0 | if (w >= 2) |
2184 | 0 | { |
2185 | 0 | *(uint16_t *)d = filler; |
2186 | 0 | w -= 2; |
2187 | 0 | d += 2; |
2188 | 0 | } |
2189 | 0 | if (w >= 1) |
2190 | 0 | { |
2191 | 0 | *(uint8_t *)d = (filler & 0xff); |
2192 | 0 | w--; |
2193 | 0 | d++; |
2194 | 0 | } |
2195 | |
|
2196 | 0 | } |
2197 | |
|
2198 | 0 | _mm_empty (); |
2199 | 0 | return TRUE; |
2200 | 0 | } |
2201 | | |
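The bpp-specific branches above reduce every fill to the same 64-bit store pattern. A condensed, hypothetical restatement of that replication step (replicate_filler() is not a pixman function):

    #include <stdint.h>

    static uint64_t replicate_filler (uint32_t filler, int bpp)
    {
        if (bpp == 8)
            filler = (filler & 0xff) * 0x01010101u;    /* one byte, four times */
        else if (bpp == 16)
            filler = (filler & 0xffff) * 0x00010001u;  /* one 565 pixel, twice */

        return ((uint64_t) filler << 32) | filler;     /* two 32-bit copies    */
    }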
2202 | | static void |
2203 | | mmx_composite_src_x888_0565 (pixman_implementation_t *imp, |
2204 | | pixman_composite_info_t *info) |
2205 | 0 | { |
2206 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2207 | 0 | uint16_t *dst_line, *dst; |
2208 | 0 | uint32_t *src_line, *src, s; |
2209 | 0 | int dst_stride, src_stride; |
2210 | 0 | int32_t w; |
2211 | |
|
2212 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2213 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2214 | |
|
2215 | 0 | while (height--) |
2216 | 0 | { |
2217 | 0 | dst = dst_line; |
2218 | 0 | dst_line += dst_stride; |
2219 | 0 | src = src_line; |
2220 | 0 | src_line += src_stride; |
2221 | 0 | w = width; |
2222 | |
|
2223 | 0 | while (w && (uintptr_t)dst & 7) |
2224 | 0 | { |
2225 | 0 | s = *src++; |
2226 | 0 | *dst = convert_8888_to_0565 (s); |
2227 | 0 | dst++; |
2228 | 0 | w--; |
2229 | 0 | } |
2230 | |
|
2231 | 0 | while (w >= 4) |
2232 | 0 | { |
2233 | 0 | __m64 vdest; |
2234 | 0 | __m64 vsrc0 = ldq_u ((__m64 *)(src + 0)); |
2235 | 0 | __m64 vsrc1 = ldq_u ((__m64 *)(src + 2)); |
2236 | |
|
2237 | 0 | vdest = pack_4xpacked565 (vsrc0, vsrc1); |
2238 | |
|
2239 | 0 | *(__m64 *)dst = vdest; |
2240 | |
|
2241 | 0 | w -= 4; |
2242 | 0 | src += 4; |
2243 | 0 | dst += 4; |
2244 | 0 | } |
2245 | |
|
2246 | 0 | while (w) |
2247 | 0 | { |
2248 | 0 | s = *src++; |
2249 | 0 | *dst = convert_8888_to_0565 (s); |
2250 | 0 | dst++; |
2251 | 0 | w--; |
2252 | 0 | } |
2253 | 0 | } |
2254 | |
|
2255 | 0 | _mm_empty (); |
2256 | 0 | } |
2257 | | |
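convert_8888_to_0565() in the scalar loops above and pack_4xpacked565() in the vector loop perform the same channel reduction: keep the top 5/6/5 bits of red, green and blue. A scalar sketch of that packing (pack_0565() is an illustration-only name):

    #include <stdint.h>

    static uint16_t pack_0565 (uint32_t s)             /* s is x8r8g8b8 */
    {
        return (uint16_t) (((s >> 8) & 0xf800) |       /* r: bits 23-19 -> 15-11 */
                           ((s >> 5) & 0x07e0) |       /* g: bits 15-10 -> 10-5  */
                           ((s >> 3) & 0x001f));       /* b: bits  7-3  ->  4-0  */
    }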
2258 | | static void |
2259 | | mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, |
2260 | | pixman_composite_info_t *info) |
2261 | 0 | { |
2262 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2263 | 0 | uint32_t src, srca; |
2264 | 0 | uint32_t *dst_line, *dst; |
2265 | 0 | uint8_t *mask_line, *mask; |
2266 | 0 | int dst_stride, mask_stride; |
2267 | 0 | int32_t w; |
2268 | 0 | __m64 vsrc; |
2269 | 0 | uint64_t srcsrc; |
2270 | |
|
2271 | 0 | CHECKPOINT (); |
2272 | |
|
2273 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2274 | |
|
2275 | 0 | srca = src >> 24; |
2276 | 0 | if (src == 0) |
2277 | 0 | { |
2278 | 0 | mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, |
2279 | 0 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
2280 | 0 | dest_x, dest_y, width, height, 0); |
2281 | 0 | return; |
2282 | 0 | } |
2283 | | |
2284 | 0 | srcsrc = (uint64_t)src << 32 | src; |
2285 | |
|
2286 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2287 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2288 | |
|
2289 | 0 | vsrc = load8888 (&src); |
2290 | |
|
2291 | 0 | while (height--) |
2292 | 0 | { |
2293 | 0 | dst = dst_line; |
2294 | 0 | dst_line += dst_stride; |
2295 | 0 | mask = mask_line; |
2296 | 0 | mask_line += mask_stride; |
2297 | 0 | w = width; |
2298 | |
|
2299 | 0 | CHECKPOINT (); |
2300 | |
|
2301 | 0 | while (w && (uintptr_t)dst & 7) |
2302 | 0 | { |
2303 | 0 | uint64_t m = *mask; |
2304 | |
|
2305 | 0 | if (m) |
2306 | 0 | { |
2307 | 0 | __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
2308 | |
|
2309 | 0 | store8888 (dst, vdest); |
2310 | 0 | } |
2311 | 0 | else |
2312 | 0 | { |
2313 | 0 | *dst = 0; |
2314 | 0 | } |
2315 | |
|
2316 | 0 | w--; |
2317 | 0 | mask++; |
2318 | 0 | dst++; |
2319 | 0 | } |
2320 | |
|
2321 | 0 | CHECKPOINT (); |
2322 | |
|
2323 | 0 | while (w >= 2) |
2324 | 0 | { |
2325 | 0 | uint64_t m0, m1; |
2326 | 0 | m0 = *mask; |
2327 | 0 | m1 = *(mask + 1); |
2328 | |
|
2329 | 0 | if (srca == 0xff && (m0 & m1) == 0xff) |
2330 | 0 | { |
2331 | 0 | *(uint64_t *)dst = srcsrc; |
2332 | 0 | } |
2333 | 0 | else if (m0 | m1) |
2334 | 0 | { |
2335 | 0 | __m64 dest0, dest1; |
2336 | |
|
2337 | 0 | dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0))); |
2338 | 0 | dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1))); |
2339 | |
|
2340 | 0 | *(__m64 *)dst = pack8888 (dest0, dest1); |
2341 | 0 | } |
2342 | 0 | else |
2343 | 0 | { |
2344 | 0 | *(uint64_t *)dst = 0; |
2345 | 0 | } |
2346 | |
|
2347 | 0 | mask += 2; |
2348 | 0 | dst += 2; |
2349 | 0 | w -= 2; |
2350 | 0 | } |
2351 | |
|
2352 | 0 | CHECKPOINT (); |
2353 | |
|
2354 | 0 | if (w) |
2355 | 0 | { |
2356 | 0 | uint64_t m = *mask; |
2357 | |
|
2358 | 0 | if (m) |
2359 | 0 | { |
2360 | 0 | __m64 vdest = load8888 (dst); |
2361 | |
|
2362 | 0 | vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
2363 | 0 | store8888 (dst, vdest); |
2364 | 0 | } |
2365 | 0 | else |
2366 | 0 | { |
2367 | 0 | *dst = 0; |
2368 | 0 | } |
2369 | 0 | } |
2370 | 0 | } |
2371 | |
|
2372 | 0 | _mm_empty (); |
2373 | 0 | } |
2374 | | |
2375 | | static void |
2376 | | mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, |
2377 | | pixman_composite_info_t *info) |
2378 | 0 | { |
2379 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2380 | 0 | uint32_t src, srca; |
2381 | 0 | uint16_t *dst_line, *dst; |
2382 | 0 | uint8_t *mask_line, *mask; |
2383 | 0 | int dst_stride, mask_stride; |
2384 | 0 | int32_t w; |
2385 | 0 | __m64 vsrc, vsrca, tmp; |
2386 | 0 | __m64 srcsrcsrcsrc; |
2387 | |
|
2388 | 0 | CHECKPOINT (); |
2389 | |
|
2390 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2391 | |
|
2392 | 0 | srca = src >> 24; |
2393 | 0 | if (src == 0) |
2394 | 0 | return; |
2395 | | |
2396 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2397 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2398 | |
|
2399 | 0 | vsrc = load8888 (&src); |
2400 | 0 | vsrca = expand_alpha (vsrc); |
2401 | |
|
2402 | 0 | tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); |
2403 | 0 | srcsrcsrcsrc = expand_alpha_rev (tmp); |
2404 | |
|
2405 | 0 | while (height--) |
2406 | 0 | { |
2407 | 0 | dst = dst_line; |
2408 | 0 | dst_line += dst_stride; |
2409 | 0 | mask = mask_line; |
2410 | 0 | mask_line += mask_stride; |
2411 | 0 | w = width; |
2412 | |
|
2413 | 0 | CHECKPOINT (); |
2414 | |
|
2415 | 0 | while (w && (uintptr_t)dst & 7) |
2416 | 0 | { |
2417 | 0 | uint64_t m = *mask; |
2418 | |
|
2419 | 0 | if (m) |
2420 | 0 | { |
2421 | 0 | uint64_t d = *dst; |
2422 | 0 | __m64 vd = to_m64 (d); |
2423 | 0 | __m64 vdest = in_over ( |
2424 | 0 | vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0)); |
2425 | |
|
2426 | 0 | vd = pack_565 (vdest, _mm_setzero_si64 (), 0); |
2427 | 0 | *dst = to_uint64 (vd); |
2428 | 0 | } |
2429 | |
|
2430 | 0 | w--; |
2431 | 0 | mask++; |
2432 | 0 | dst++; |
2433 | 0 | } |
2434 | |
|
2435 | 0 | CHECKPOINT (); |
2436 | |
|
2437 | 0 | while (w >= 4) |
2438 | 0 | { |
2439 | 0 | uint64_t m0, m1, m2, m3; |
2440 | 0 | m0 = *mask; |
2441 | 0 | m1 = *(mask + 1); |
2442 | 0 | m2 = *(mask + 2); |
2443 | 0 | m3 = *(mask + 3); |
2444 | |
|
2445 | 0 | if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) |
2446 | 0 | { |
2447 | 0 | *(__m64 *)dst = srcsrcsrcsrc; |
2448 | 0 | } |
2449 | 0 | else if (m0 | m1 | m2 | m3) |
2450 | 0 | { |
2451 | 0 | __m64 vdest = *(__m64 *)dst; |
2452 | 0 | __m64 v0, v1, v2, v3; |
2453 | 0 | __m64 vm0, vm1, vm2, vm3; |
2454 | |
|
2455 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
2456 | |
|
2457 | 0 | vm0 = to_m64 (m0); |
2458 | 0 | v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0); |
2459 | |
|
2460 | 0 | vm1 = to_m64 (m1); |
2461 | 0 | v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1); |
2462 | |
|
2463 | 0 | vm2 = to_m64 (m2); |
2464 | 0 | v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2); |
2465 | |
|
2466 | 0 | vm3 = to_m64 (m3); |
2467 | 0 | v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3); |
2468 | |
|
2469 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
2470 | 0 | } |
2471 | |
|
2472 | 0 | w -= 4; |
2473 | 0 | mask += 4; |
2474 | 0 | dst += 4; |
2475 | 0 | } |
2476 | |
|
2477 | 0 | CHECKPOINT (); |
2478 | |
|
2479 | 0 | while (w) |
2480 | 0 | { |
2481 | 0 | uint64_t m = *mask; |
2482 | |
|
2483 | 0 | if (m) |
2484 | 0 | { |
2485 | 0 | uint64_t d = *dst; |
2486 | 0 | __m64 vd = to_m64 (d); |
2487 | 0 | __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), |
2488 | 0 | expand565 (vd, 0)); |
2489 | 0 | vd = pack_565 (vdest, _mm_setzero_si64 (), 0); |
2490 | 0 | *dst = to_uint64 (vd); |
2491 | 0 | } |
2492 | |
|
2493 | 0 | w--; |
2494 | 0 | mask++; |
2495 | 0 | dst++; |
2496 | 0 | } |
2497 | 0 | } |
2498 | |
|
2499 | 0 | _mm_empty (); |
2500 | 0 | } |
2501 | | |
2502 | | static void |
2503 | | mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, |
2504 | | pixman_composite_info_t *info) |
2505 | 0 | { |
2506 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2507 | 0 | uint16_t *dst_line, *dst; |
2508 | 0 | uint32_t *src_line, *src; |
2509 | 0 | int dst_stride, src_stride; |
2510 | 0 | int32_t w; |
2511 | |
|
2512 | 0 | CHECKPOINT (); |
2513 | |
|
2514 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2515 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2516 | |
|
2517 | | #if 0 |
2518 | | /* FIXME */ |
2519 | | assert (src_image->drawable == mask_image->drawable); |
2520 | | #endif |
2521 | |
|
2522 | 0 | while (height--) |
2523 | 0 | { |
2524 | 0 | dst = dst_line; |
2525 | 0 | dst_line += dst_stride; |
2526 | 0 | src = src_line; |
2527 | 0 | src_line += src_stride; |
2528 | 0 | w = width; |
2529 | |
|
2530 | 0 | CHECKPOINT (); |
2531 | |
|
2532 | 0 | while (w && (uintptr_t)dst & 7) |
2533 | 0 | { |
2534 | 0 | __m64 vsrc = load8888 (src); |
2535 | 0 | uint64_t d = *dst; |
2536 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
2537 | |
|
2538 | 0 | vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); |
2539 | |
|
2540 | 0 | *dst = to_uint64 (vdest); |
2541 | |
|
2542 | 0 | w--; |
2543 | 0 | dst++; |
2544 | 0 | src++; |
2545 | 0 | } |
2546 | |
|
2547 | 0 | CHECKPOINT (); |
2548 | |
|
2549 | 0 | while (w >= 4) |
2550 | 0 | { |
2551 | 0 | uint32_t s0, s1, s2, s3; |
2552 | 0 | unsigned char a0, a1, a2, a3; |
2553 | |
|
2554 | 0 | s0 = *src; |
2555 | 0 | s1 = *(src + 1); |
2556 | 0 | s2 = *(src + 2); |
2557 | 0 | s3 = *(src + 3); |
2558 | |
|
2559 | 0 | a0 = (s0 >> 24); |
2560 | 0 | a1 = (s1 >> 24); |
2561 | 0 | a2 = (s2 >> 24); |
2562 | 0 | a3 = (s3 >> 24); |
2563 | |
|
2564 | 0 | if ((a0 & a1 & a2 & a3) == 0xFF) |
2565 | 0 | { |
2566 | 0 | __m64 v0 = invert_colors (load8888 (&s0)); |
2567 | 0 | __m64 v1 = invert_colors (load8888 (&s1)); |
2568 | 0 | __m64 v2 = invert_colors (load8888 (&s2)); |
2569 | 0 | __m64 v3 = invert_colors (load8888 (&s3)); |
2570 | |
|
2571 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
2572 | 0 | } |
2573 | 0 | else if (s0 | s1 | s2 | s3) |
2574 | 0 | { |
2575 | 0 | __m64 vdest = *(__m64 *)dst; |
2576 | 0 | __m64 v0, v1, v2, v3; |
2577 | |
|
2578 | 0 | __m64 vsrc0 = load8888 (&s0); |
2579 | 0 | __m64 vsrc1 = load8888 (&s1); |
2580 | 0 | __m64 vsrc2 = load8888 (&s2); |
2581 | 0 | __m64 vsrc3 = load8888 (&s3); |
2582 | |
|
2583 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
2584 | |
|
2585 | 0 | v0 = over_rev_non_pre (vsrc0, v0); |
2586 | 0 | v1 = over_rev_non_pre (vsrc1, v1); |
2587 | 0 | v2 = over_rev_non_pre (vsrc2, v2); |
2588 | 0 | v3 = over_rev_non_pre (vsrc3, v3); |
2589 | |
|
2590 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
2591 | 0 | } |
2592 | |
|
2593 | 0 | w -= 4; |
2594 | 0 | dst += 4; |
2595 | 0 | src += 4; |
2596 | 0 | } |
2597 | |
|
2598 | 0 | CHECKPOINT (); |
2599 | |
|
2600 | 0 | while (w) |
2601 | 0 | { |
2602 | 0 | __m64 vsrc = load8888 (src); |
2603 | 0 | uint64_t d = *dst; |
2604 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
2605 | |
|
2606 | 0 | vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); |
2607 | |
|
2608 | 0 | *dst = to_uint64 (vdest); |
2609 | |
|
2610 | 0 | w--; |
2611 | 0 | dst++; |
2612 | 0 | src++; |
2613 | 0 | } |
2614 | 0 | } |
2615 | |
|
2616 | 0 | _mm_empty (); |
2617 | 0 | } |
2618 | | |
2619 | | static void |
2620 | | mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, |
2621 | | pixman_composite_info_t *info) |
2622 | 0 | { |
2623 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2624 | 0 | uint32_t *dst_line, *dst; |
2625 | 0 | uint32_t *src_line, *src; |
2626 | 0 | int dst_stride, src_stride; |
2627 | 0 | int32_t w; |
2628 | |
|
2629 | 0 | CHECKPOINT (); |
2630 | |
|
2631 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2632 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2633 | |
|
2634 | | #if 0 |
2635 | | /* FIXME */ |
2636 | | assert (src_image->drawable == mask_image->drawable); |
2637 | | #endif |
2638 | |
|
2639 | 0 | while (height--) |
2640 | 0 | { |
2641 | 0 | dst = dst_line; |
2642 | 0 | dst_line += dst_stride; |
2643 | 0 | src = src_line; |
2644 | 0 | src_line += src_stride; |
2645 | 0 | w = width; |
2646 | |
|
2647 | 0 | while (w && (uintptr_t)dst & 7) |
2648 | 0 | { |
2649 | 0 | __m64 s = load8888 (src); |
2650 | 0 | __m64 d = load8888 (dst); |
2651 | |
|
2652 | 0 | store8888 (dst, over_rev_non_pre (s, d)); |
2653 | |
|
2654 | 0 | w--; |
2655 | 0 | dst++; |
2656 | 0 | src++; |
2657 | 0 | } |
2658 | |
|
2659 | 0 | while (w >= 2) |
2660 | 0 | { |
2661 | 0 | uint32_t s0, s1; |
2662 | 0 | unsigned char a0, a1; |
2663 | 0 | __m64 d0, d1; |
2664 | |
|
2665 | 0 | s0 = *src; |
2666 | 0 | s1 = *(src + 1); |
2667 | |
|
2668 | 0 | a0 = (s0 >> 24); |
2669 | 0 | a1 = (s1 >> 24); |
2670 | |
|
2671 | 0 | if ((a0 & a1) == 0xFF) |
2672 | 0 | { |
2673 | 0 | d0 = invert_colors (load8888 (&s0)); |
2674 | 0 | d1 = invert_colors (load8888 (&s1)); |
2675 | |
|
2676 | 0 | *(__m64 *)dst = pack8888 (d0, d1); |
2677 | 0 | } |
2678 | 0 | else if (s0 | s1) |
2679 | 0 | { |
2680 | 0 | __m64 vdest = *(__m64 *)dst; |
2681 | |
|
2682 | 0 | d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); |
2683 | 0 | d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); |
2684 | |
|
2685 | 0 | *(__m64 *)dst = pack8888 (d0, d1); |
2686 | 0 | } |
2687 | |
|
2688 | 0 | w -= 2; |
2689 | 0 | dst += 2; |
2690 | 0 | src += 2; |
2691 | 0 | } |
2692 | |
|
2693 | 0 | if (w) |
2694 | 0 | { |
2695 | 0 | __m64 s = load8888 (src); |
2696 | 0 | __m64 d = load8888 (dst); |
2697 | |
|
2698 | 0 | store8888 (dst, over_rev_non_pre (s, d)); |
2699 | 0 | } |
2700 | 0 | } |
2701 | |
|
2702 | 0 | _mm_empty (); |
2703 | 0 | } |
2704 | | |
2705 | | static void |
2706 | | mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, |
2707 | | pixman_composite_info_t *info) |
2708 | 0 | { |
2709 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2710 | 0 | uint32_t src; |
2711 | 0 | uint16_t *dst_line; |
2712 | 0 | uint32_t *mask_line; |
2713 | 0 | int dst_stride, mask_stride; |
2714 | 0 | __m64 vsrc, vsrca; |
2715 | |
|
2716 | 0 | CHECKPOINT (); |
2717 | |
|
2718 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2719 | |
|
2720 | 0 | if (src == 0) |
2721 | 0 | return; |
2722 | | |
2723 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2724 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
2725 | |
|
2726 | 0 | vsrc = load8888 (&src); |
2727 | 0 | vsrca = expand_alpha (vsrc); |
2728 | |
|
2729 | 0 | while (height--) |
2730 | 0 | { |
2731 | 0 | int twidth = width; |
2732 | 0 | uint32_t *p = (uint32_t *)mask_line; |
2733 | 0 | uint16_t *q = (uint16_t *)dst_line; |
2734 | |
|
2735 | 0 | while (twidth && ((uintptr_t)q & 7)) |
2736 | 0 | { |
2737 | 0 | uint32_t m = *(uint32_t *)p; |
2738 | |
|
2739 | 0 | if (m) |
2740 | 0 | { |
2741 | 0 | uint64_t d = *q; |
2742 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
2743 | 0 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); |
2744 | 0 | *q = to_uint64 (vdest); |
2745 | 0 | } |
2746 | |
|
2747 | 0 | twidth--; |
2748 | 0 | p++; |
2749 | 0 | q++; |
2750 | 0 | } |
2751 | |
|
2752 | 0 | while (twidth >= 4) |
2753 | 0 | { |
2754 | 0 | uint32_t m0, m1, m2, m3; |
2755 | |
|
2756 | 0 | m0 = *p; |
2757 | 0 | m1 = *(p + 1); |
2758 | 0 | m2 = *(p + 2); |
2759 | 0 | m3 = *(p + 3); |
2760 | |
|
2761 | 0 | if ((m0 | m1 | m2 | m3)) |
2762 | 0 | { |
2763 | 0 | __m64 vdest = *(__m64 *)q; |
2764 | 0 | __m64 v0, v1, v2, v3; |
2765 | |
|
2766 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
2767 | |
|
2768 | 0 | v0 = in_over (vsrc, vsrca, load8888 (&m0), v0); |
2769 | 0 | v1 = in_over (vsrc, vsrca, load8888 (&m1), v1); |
2770 | 0 | v2 = in_over (vsrc, vsrca, load8888 (&m2), v2); |
2771 | 0 | v3 = in_over (vsrc, vsrca, load8888 (&m3), v3); |
2772 | |
|
2773 | 0 | *(__m64 *)q = pack_4x565 (v0, v1, v2, v3); |
2774 | 0 | } |
2775 | 0 | twidth -= 4; |
2776 | 0 | p += 4; |
2777 | 0 | q += 4; |
2778 | 0 | } |
2779 | |
|
2780 | 0 | while (twidth) |
2781 | 0 | { |
2782 | 0 | uint32_t m; |
2783 | |
|
2784 | 0 | m = *(uint32_t *)p; |
2785 | 0 | if (m) |
2786 | 0 | { |
2787 | 0 | uint64_t d = *q; |
2788 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
2789 | 0 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); |
2790 | 0 | *q = to_uint64 (vdest); |
2791 | 0 | } |
2792 | |
|
2793 | 0 | twidth--; |
2794 | 0 | p++; |
2795 | 0 | q++; |
2796 | 0 | } |
2797 | |
|
2798 | 0 | mask_line += mask_stride; |
2799 | 0 | dst_line += dst_stride; |
2800 | 0 | } |
2801 | |
|
2802 | 0 | _mm_empty (); |
2803 | 0 | } |
2804 | | |
2805 | | static void |
2806 | | mmx_composite_in_n_8_8 (pixman_implementation_t *imp, |
2807 | | pixman_composite_info_t *info) |
2808 | 0 | { |
2809 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2810 | 0 | uint8_t *dst_line, *dst; |
2811 | 0 | uint8_t *mask_line, *mask; |
2812 | 0 | int dst_stride, mask_stride; |
2813 | 0 | int32_t w; |
2814 | 0 | uint32_t src; |
2815 | 0 | uint8_t sa; |
2816 | 0 | __m64 vsrc, vsrca; |
2817 | |
|
2818 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
2819 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2820 | |
|
2821 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2822 | |
|
2823 | 0 | sa = src >> 24; |
2824 | |
|
2825 | 0 | vsrc = load8888 (&src); |
2826 | 0 | vsrca = expand_alpha (vsrc); |
2827 | |
|
2828 | 0 | while (height--) |
2829 | 0 | { |
2830 | 0 | dst = dst_line; |
2831 | 0 | dst_line += dst_stride; |
2832 | 0 | mask = mask_line; |
2833 | 0 | mask_line += mask_stride; |
2834 | 0 | w = width; |
2835 | |
|
2836 | 0 | while (w && (uintptr_t)dst & 7) |
2837 | 0 | { |
2838 | 0 | uint16_t tmp; |
2839 | 0 | uint8_t a; |
2840 | 0 | uint32_t m, d; |
2841 | |
|
2842 | 0 | a = *mask++; |
2843 | 0 | d = *dst; |
2844 | |
|
2845 | 0 | m = MUL_UN8 (sa, a, tmp); |
2846 | 0 | d = MUL_UN8 (m, d, tmp); |
2847 | |
|
2848 | 0 | *dst++ = d; |
2849 | 0 | w--; |
2850 | 0 | } |
2851 | |
|
2852 | 0 | while (w >= 4) |
2853 | 0 | { |
2854 | 0 | __m64 vmask; |
2855 | 0 | __m64 vdest; |
2856 | |
|
2857 | 0 | vmask = load8888u ((uint32_t *)mask); |
2858 | 0 | vdest = load8888 ((uint32_t *)dst); |
2859 | |
|
2860 | 0 | store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); |
2861 | |
|
2862 | 0 | dst += 4; |
2863 | 0 | mask += 4; |
2864 | 0 | w -= 4; |
2865 | 0 | } |
2866 | |
|
2867 | 0 | while (w--) |
2868 | 0 | { |
2869 | 0 | uint16_t tmp; |
2870 | 0 | uint8_t a; |
2871 | 0 | uint32_t m, d; |
2872 | |
|
2873 | 0 | a = *mask++; |
2874 | 0 | d = *dst; |
2875 | |
|
2876 | 0 | m = MUL_UN8 (sa, a, tmp); |
2877 | 0 | d = MUL_UN8 (m, d, tmp); |
2878 | |
|
2879 | 0 | *dst++ = d; |
2880 | 0 | } |
2881 | 0 | } |
2882 | |
|
2883 | 0 | _mm_empty (); |
2884 | 0 | } |
2885 | | |
2886 | | static void |
2887 | | mmx_composite_in_8_8 (pixman_implementation_t *imp, |
2888 | | pixman_composite_info_t *info) |
2889 | 0 | { |
2890 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2891 | 0 | uint8_t *dst_line, *dst; |
2892 | 0 | uint8_t *src_line, *src; |
2893 | 0 | int src_stride, dst_stride; |
2894 | 0 | int32_t w; |
2895 | |
|
2896 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
2897 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
2898 | |
|
2899 | 0 | while (height--) |
2900 | 0 | { |
2901 | 0 | dst = dst_line; |
2902 | 0 | dst_line += dst_stride; |
2903 | 0 | src = src_line; |
2904 | 0 | src_line += src_stride; |
2905 | 0 | w = width; |
2906 | |
|
2907 | 0 | while (w && (uintptr_t)dst & 3) |
2908 | 0 | { |
2909 | 0 | uint8_t s, d; |
2910 | 0 | uint16_t tmp; |
2911 | |
|
2912 | 0 | s = *src; |
2913 | 0 | d = *dst; |
2914 | |
|
2915 | 0 | *dst = MUL_UN8 (s, d, tmp); |
2916 | |
|
2917 | 0 | src++; |
2918 | 0 | dst++; |
2919 | 0 | w--; |
2920 | 0 | } |
2921 | |
|
2922 | 0 | while (w >= 4) |
2923 | 0 | { |
2924 | 0 | uint32_t *s = (uint32_t *)src; |
2925 | 0 | uint32_t *d = (uint32_t *)dst; |
2926 | |
|
2927 | 0 | store8888 (d, in (load8888u (s), load8888 (d))); |
2928 | |
|
2929 | 0 | w -= 4; |
2930 | 0 | dst += 4; |
2931 | 0 | src += 4; |
2932 | 0 | } |
2933 | |
|
2934 | 0 | while (w--) |
2935 | 0 | { |
2936 | 0 | uint8_t s, d; |
2937 | 0 | uint16_t tmp; |
2938 | |
|
2939 | 0 | s = *src; |
2940 | 0 | d = *dst; |
2941 | |
|
2942 | 0 | *dst = MUL_UN8 (s, d, tmp); |
2943 | |
|
2944 | 0 | src++; |
2945 | 0 | dst++; |
2946 | 0 | } |
2947 | 0 | } |
2948 | |
|
2949 | 0 | _mm_empty (); |
2950 | 0 | } |
2951 | | |
2952 | | static void |
2953 | | mmx_composite_add_n_8_8 (pixman_implementation_t *imp, |
2954 | | pixman_composite_info_t *info) |
2955 | 0 | { |
2956 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2957 | 0 | uint8_t *dst_line, *dst; |
2958 | 0 | uint8_t *mask_line, *mask; |
2959 | 0 | int dst_stride, mask_stride; |
2960 | 0 | int32_t w; |
2961 | 0 | uint32_t src; |
2962 | 0 | uint8_t sa; |
2963 | 0 | __m64 vsrc, vsrca; |
2964 | |
|
2965 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
2966 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2967 | |
|
2968 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2969 | |
|
2970 | 0 | sa = src >> 24; |
2971 | |
|
2972 | 0 | if (src == 0) |
2973 | 0 | return; |
2974 | | |
2975 | 0 | vsrc = load8888 (&src); |
2976 | 0 | vsrca = expand_alpha (vsrc); |
2977 | |
|
2978 | 0 | while (height--) |
2979 | 0 | { |
2980 | 0 | dst = dst_line; |
2981 | 0 | dst_line += dst_stride; |
2982 | 0 | mask = mask_line; |
2983 | 0 | mask_line += mask_stride; |
2984 | 0 | w = width; |
2985 | |
|
2986 | 0 | while (w && (uintptr_t)dst & 3) |
2987 | 0 | { |
2988 | 0 | uint16_t tmp; |
2989 | 0 | uint16_t a; |
2990 | 0 | uint32_t m, d; |
2991 | 0 | uint32_t r; |
2992 | |
|
2993 | 0 | a = *mask++; |
2994 | 0 | d = *dst; |
2995 | |
|
2996 | 0 | m = MUL_UN8 (sa, a, tmp); |
2997 | 0 | r = ADD_UN8 (m, d, tmp); |
2998 | |
|
2999 | 0 | *dst++ = r; |
3000 | 0 | w--; |
3001 | 0 | } |
3002 | |
|
3003 | 0 | while (w >= 4) |
3004 | 0 | { |
3005 | 0 | __m64 vmask; |
3006 | 0 | __m64 vdest; |
3007 | |
|
3008 | 0 | vmask = load8888u ((uint32_t *)mask); |
3009 | 0 | vdest = load8888 ((uint32_t *)dst); |
3010 | |
|
3011 | 0 | store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); |
3012 | |
|
3013 | 0 | dst += 4; |
3014 | 0 | mask += 4; |
3015 | 0 | w -= 4; |
3016 | 0 | } |
3017 | |
|
3018 | 0 | while (w--) |
3019 | 0 | { |
3020 | 0 | uint16_t tmp; |
3021 | 0 | uint16_t a; |
3022 | 0 | uint32_t m, d; |
3023 | 0 | uint32_t r; |
3024 | |
|
3025 | 0 | a = *mask++; |
3026 | 0 | d = *dst; |
3027 | |
|
3028 | 0 | m = MUL_UN8 (sa, a, tmp); |
3029 | 0 | r = ADD_UN8 (m, d, tmp); |
3030 | |
|
3031 | 0 | *dst++ = r; |
3032 | 0 | } |
3033 | 0 | } |
3034 | |
|
3035 | 0 | _mm_empty (); |
3036 | 0 | } |
3037 | | |
3038 | | static void |
3039 | | mmx_composite_add_8_8 (pixman_implementation_t *imp, |
3040 | | pixman_composite_info_t *info) |
3041 | 0 | { |
3042 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3043 | 0 | uint8_t *dst_line, *dst; |
3044 | 0 | uint8_t *src_line, *src; |
3045 | 0 | int dst_stride, src_stride; |
3046 | 0 | int32_t w; |
3047 | 0 | uint8_t s, d; |
3048 | 0 | uint16_t t; |
3049 | |
|
3050 | 0 | CHECKPOINT (); |
3051 | |
|
3052 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
3053 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
3054 | |
|
3055 | 0 | while (height--) |
3056 | 0 | { |
3057 | 0 | dst = dst_line; |
3058 | 0 | dst_line += dst_stride; |
3059 | 0 | src = src_line; |
3060 | 0 | src_line += src_stride; |
3061 | 0 | w = width; |
3062 | |
|
3063 | 0 | while (w && (uintptr_t)dst & 7) |
3064 | 0 | { |
3065 | 0 | s = *src; |
3066 | 0 | d = *dst; |
3067 | 0 | t = d + s; |
3068 | 0 | s = t | (0 - (t >> 8)); |
3069 | 0 | *dst = s; |
3070 | |
|
3071 | 0 | dst++; |
3072 | 0 | src++; |
3073 | 0 | w--; |
3074 | 0 | } |
3075 | |
|
3076 | 0 | while (w >= 8) |
3077 | 0 | { |
3078 | 0 | *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); |
3079 | 0 | dst += 8; |
3080 | 0 | src += 8; |
3081 | 0 | w -= 8; |
3082 | 0 | } |
3083 | |
|
3084 | 0 | while (w) |
3085 | 0 | { |
3086 | 0 | s = *src; |
3087 | 0 | d = *dst; |
3088 | 0 | t = d + s; |
3089 | 0 | s = t | (0 - (t >> 8)); |
3090 | 0 | *dst = s; |
3091 | |
|
3092 | 0 | dst++; |
3093 | 0 | src++; |
3094 | 0 | w--; |
3095 | 0 | } |
3096 | 0 | } |
3097 | |
|
3098 | 0 | _mm_empty (); |
3099 | 0 | } |
3100 | | |
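The scalar head and tail of the loop above use a branch-free saturating add, while the 8-pixel body gets the same effect from _mm_adds_pu8. Shown standalone (add_sat_u8() is a hypothetical helper):

    #include <stdint.h>

    static uint8_t add_sat_u8 (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) a + b;

        /* If the sum overflowed 255, (t >> 8) is 1 and 0 - 1 sets every
         * result bit, so the value clamps to 0xff; otherwise t is returned. */
        return (uint8_t) (t | (0 - (t >> 8)));
    }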
3101 | | static void |
3102 | | mmx_composite_add_0565_0565 (pixman_implementation_t *imp, |
3103 | | pixman_composite_info_t *info) |
3104 | 0 | { |
3105 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3106 | 0 | uint16_t *dst_line, *dst; |
3107 | 0 | uint32_t d; |
3108 | 0 | uint16_t *src_line, *src; |
3109 | 0 | uint32_t s; |
3110 | 0 | int dst_stride, src_stride; |
3111 | 0 | int32_t w; |
3112 | |
|
3113 | 0 | CHECKPOINT (); |
3114 | |
|
3115 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); |
3116 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
3117 | |
|
3118 | 0 | while (height--) |
3119 | 0 | { |
3120 | 0 | dst = dst_line; |
3121 | 0 | dst_line += dst_stride; |
3122 | 0 | src = src_line; |
3123 | 0 | src_line += src_stride; |
3124 | 0 | w = width; |
3125 | |
|
3126 | 0 | while (w && (uintptr_t)dst & 7) |
3127 | 0 | { |
3128 | 0 | s = *src++; |
3129 | 0 | if (s) |
3130 | 0 | { |
3131 | 0 | d = *dst; |
3132 | 0 | s = convert_0565_to_8888 (s); |
3133 | 0 | if (d) |
3134 | 0 | { |
3135 | 0 | d = convert_0565_to_8888 (d); |
3136 | 0 | UN8x4_ADD_UN8x4 (s, d); |
3137 | 0 | } |
3138 | 0 | *dst = convert_8888_to_0565 (s); |
3139 | 0 | } |
3140 | 0 | dst++; |
3141 | 0 | w--; |
3142 | 0 | } |
3143 | |
|
3144 | 0 | while (w >= 4) |
3145 | 0 | { |
3146 | 0 | __m64 vdest = *(__m64 *)dst; |
3147 | 0 | __m64 vsrc = ldq_u ((__m64 *)src); |
3148 | 0 | __m64 vd0, vd1; |
3149 | 0 | __m64 vs0, vs1; |
3150 | |
|
3151 | 0 | expand_4xpacked565 (vdest, &vd0, &vd1, 0); |
3152 | 0 | expand_4xpacked565 (vsrc, &vs0, &vs1, 0); |
3153 | |
|
3154 | 0 | vd0 = _mm_adds_pu8 (vd0, vs0); |
3155 | 0 | vd1 = _mm_adds_pu8 (vd1, vs1); |
3156 | |
|
3157 | 0 | *(__m64 *)dst = pack_4xpacked565 (vd0, vd1); |
3158 | |
|
3159 | 0 | dst += 4; |
3160 | 0 | src += 4; |
3161 | 0 | w -= 4; |
3162 | 0 | } |
3163 | |
|
3164 | 0 | while (w--) |
3165 | 0 | { |
3166 | 0 | s = *src++; |
3167 | 0 | if (s) |
3168 | 0 | { |
3169 | 0 | d = *dst; |
3170 | 0 | s = convert_0565_to_8888 (s); |
3171 | 0 | if (d) |
3172 | 0 | { |
3173 | 0 | d = convert_0565_to_8888 (d); |
3174 | 0 | UN8x4_ADD_UN8x4 (s, d); |
3175 | 0 | } |
3176 | 0 | *dst = convert_8888_to_0565 (s); |
3177 | 0 | } |
3178 | 0 | dst++; |
3179 | 0 | } |
3180 | 0 | } |
3181 | |
|
3182 | 0 | _mm_empty (); |
3183 | 0 | } |
3184 | | |
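The conversion in the other direction, convert_0565_to_8888() used in the scalar paths above, has to widen 5- and 6-bit channels back to 8 bits. The usual way to do that without biasing toward zero is to replicate the high bits into the vacated low bits; a small sketch of that idea for one channel (expand_channel() is illustration-only and the exact pixman helper may differ in detail):

    #include <stdint.h>

    static uint8_t expand_channel (uint8_t v, int bits)   /* bits = 5 or 6 */
    {
        /* Shift into the top of the byte, then refill the low bits with a
         * copy of the high bits, so 0 maps to 0x00 and the maximum to 0xff. */
        return (uint8_t) ((v << (8 - bits)) | (v >> (2 * bits - 8)));
    }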
3185 | | static void |
3186 | | mmx_composite_add_8888_8888 (pixman_implementation_t *imp, |
3187 | | pixman_composite_info_t *info) |
3188 | 0 | { |
3189 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3190 | 0 | uint32_t *dst_line, *dst; |
3191 | 0 | uint32_t *src_line, *src; |
3192 | 0 | int dst_stride, src_stride; |
3193 | 0 | int32_t w; |
3194 | |
|
3195 | 0 | CHECKPOINT (); |
3196 | |
|
3197 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
3198 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3199 | |
|
3200 | 0 | while (height--) |
3201 | 0 | { |
3202 | 0 | dst = dst_line; |
3203 | 0 | dst_line += dst_stride; |
3204 | 0 | src = src_line; |
3205 | 0 | src_line += src_stride; |
3206 | 0 | w = width; |
3207 | |
|
3208 | 0 | while (w && (uintptr_t)dst & 7) |
3209 | 0 | { |
3210 | 0 | store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), |
3211 | 0 | load ((const uint32_t *)dst))); |
3212 | 0 | dst++; |
3213 | 0 | src++; |
3214 | 0 | w--; |
3215 | 0 | } |
3216 | |
|
3217 | 0 | while (w >= 2) |
3218 | 0 | { |
3219 | 0 | *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); |
3220 | 0 | dst += 2; |
3221 | 0 | src += 2; |
3222 | 0 | w -= 2; |
3223 | 0 | } |
3224 | |
|
3225 | 0 | if (w) |
3226 | 0 | { |
3227 | 0 | store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), |
3228 | 0 | load ((const uint32_t *)dst))); |
3229 | |
|
3230 | 0 | } |
3231 | 0 | } |
3232 | |
|
3233 | 0 | _mm_empty (); |
3234 | 0 | } |
3235 | | |
3236 | | static pixman_bool_t |
3237 | | mmx_blt (pixman_implementation_t *imp, |
3238 | | uint32_t * src_bits, |
3239 | | uint32_t * dst_bits, |
3240 | | int src_stride, |
3241 | | int dst_stride, |
3242 | | int src_bpp, |
3243 | | int dst_bpp, |
3244 | | int src_x, |
3245 | | int src_y, |
3246 | | int dest_x, |
3247 | | int dest_y, |
3248 | | int width, |
3249 | | int height) |
3250 | 0 | { |
3251 | 0 | uint8_t * src_bytes; |
3252 | 0 | uint8_t * dst_bytes; |
3253 | 0 | int byte_width; |
3254 | |
|
3255 | 0 | if (src_bpp != dst_bpp) |
3256 | 0 | return FALSE; |
3257 | | |
3258 | 0 | if (src_bpp == 16) |
3259 | 0 | { |
3260 | 0 | src_stride = src_stride * (int) sizeof (uint32_t) / 2; |
3261 | 0 | dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; |
3262 | 0 | src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); |
3263 | 0 | dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
3264 | 0 | byte_width = 2 * width; |
3265 | 0 | src_stride *= 2; |
3266 | 0 | dst_stride *= 2; |
3267 | 0 | } |
3268 | 0 | else if (src_bpp == 32) |
3269 | 0 | { |
3270 | 0 | src_stride = src_stride * (int) sizeof (uint32_t) / 4; |
3271 | 0 | dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; |
3272 | 0 | src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); |
3273 | 0 | dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
3274 | 0 | byte_width = 4 * width; |
3275 | 0 | src_stride *= 4; |
3276 | 0 | dst_stride *= 4; |
3277 | 0 | } |
3278 | 0 | else |
3279 | 0 | { |
3280 | 0 | return FALSE; |
3281 | 0 | } |
3282 | | |
3283 | 0 | while (height--) |
3284 | 0 | { |
3285 | 0 | int w; |
3286 | 0 | uint8_t *s = src_bytes; |
3287 | 0 | uint8_t *d = dst_bytes; |
3288 | 0 | src_bytes += src_stride; |
3289 | 0 | dst_bytes += dst_stride; |
3290 | 0 | w = byte_width; |
3291 | |
|
3292 | 0 | if (w >= 1 && ((uintptr_t)d & 1)) |
3293 | 0 | { |
3294 | 0 | *(uint8_t *)d = *(uint8_t *)s; |
3295 | 0 | w -= 1; |
3296 | 0 | s += 1; |
3297 | 0 | d += 1; |
3298 | 0 | } |
3299 | |
|
3300 | 0 | if (w >= 2 && ((uintptr_t)d & 3)) |
3301 | 0 | { |
3302 | 0 | *(uint16_t *)d = *(uint16_t *)s; |
3303 | 0 | w -= 2; |
3304 | 0 | s += 2; |
3305 | 0 | d += 2; |
3306 | 0 | } |
3307 | |
|
3308 | 0 | while (w >= 4 && ((uintptr_t)d & 7)) |
3309 | 0 | { |
3310 | 0 | *(uint32_t *)d = ldl_u ((uint32_t *)s); |
3311 | |
|
3312 | 0 | w -= 4; |
3313 | 0 | s += 4; |
3314 | 0 | d += 4; |
3315 | 0 | } |
3316 | |
|
3317 | 0 | while (w >= 64) |
3318 | 0 | { |
3319 | 0 | #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX |
3320 | 0 | __asm__ ( |
3321 | 0 | "movq (%1), %%mm0\n" |
3322 | 0 | "movq 8(%1), %%mm1\n" |
3323 | 0 | "movq 16(%1), %%mm2\n" |
3324 | 0 | "movq 24(%1), %%mm3\n" |
3325 | 0 | "movq 32(%1), %%mm4\n" |
3326 | 0 | "movq 40(%1), %%mm5\n" |
3327 | 0 | "movq 48(%1), %%mm6\n" |
3328 | 0 | "movq 56(%1), %%mm7\n" |
3329 | |
|
3330 | 0 | "movq %%mm0, (%0)\n" |
3331 | 0 | "movq %%mm1, 8(%0)\n" |
3332 | 0 | "movq %%mm2, 16(%0)\n" |
3333 | 0 | "movq %%mm3, 24(%0)\n" |
3334 | 0 | "movq %%mm4, 32(%0)\n" |
3335 | 0 | "movq %%mm5, 40(%0)\n" |
3336 | 0 | "movq %%mm6, 48(%0)\n" |
3337 | 0 | "movq %%mm7, 56(%0)\n" |
3338 | 0 | : |
3339 | 0 | : "r" (d), "r" (s) |
3340 | 0 | : "memory", |
3341 | 0 | "%mm0", "%mm1", "%mm2", "%mm3", |
3342 | 0 | "%mm4", "%mm5", "%mm6", "%mm7"); |
3343 | | #else |
3344 | | __m64 v0 = ldq_u ((__m64 *)(s + 0)); |
3345 | | __m64 v1 = ldq_u ((__m64 *)(s + 8)); |
3346 | | __m64 v2 = ldq_u ((__m64 *)(s + 16)); |
3347 | | __m64 v3 = ldq_u ((__m64 *)(s + 24)); |
3348 | | __m64 v4 = ldq_u ((__m64 *)(s + 32)); |
3349 | | __m64 v5 = ldq_u ((__m64 *)(s + 40)); |
3350 | | __m64 v6 = ldq_u ((__m64 *)(s + 48)); |
3351 | | __m64 v7 = ldq_u ((__m64 *)(s + 56)); |
3352 | | *(__m64 *)(d + 0) = v0; |
3353 | | *(__m64 *)(d + 8) = v1; |
3354 | | *(__m64 *)(d + 16) = v2; |
3355 | | *(__m64 *)(d + 24) = v3; |
3356 | | *(__m64 *)(d + 32) = v4; |
3357 | | *(__m64 *)(d + 40) = v5; |
3358 | | *(__m64 *)(d + 48) = v6; |
3359 | | *(__m64 *)(d + 56) = v7; |
3360 | | #endif |
3361 | |
|
3362 | 0 | w -= 64; |
3363 | 0 | s += 64; |
3364 | 0 | d += 64; |
3365 | 0 | } |
3366 | 0 | while (w >= 4) |
3367 | 0 | { |
3368 | 0 | *(uint32_t *)d = ldl_u ((uint32_t *)s); |
3369 | |
|
3370 | 0 | w -= 4; |
3371 | 0 | s += 4; |
3372 | 0 | d += 4; |
3373 | 0 | } |
3374 | 0 | if (w >= 2) |
3375 | 0 | { |
3376 | 0 | *(uint16_t *)d = *(uint16_t *)s; |
3377 | 0 | w -= 2; |
3378 | 0 | s += 2; |
3379 | 0 | d += 2; |
3380 | 0 | } |
3381 | 0 | } |
3382 | |
|
3383 | 0 | _mm_empty (); |
3384 | |
|
3385 | 0 | return TRUE; |
3386 | 0 | } |
3387 | | |
3388 | | static void |
3389 | | mmx_composite_copy_area (pixman_implementation_t *imp, |
3390 | | pixman_composite_info_t *info) |
3391 | 0 | { |
3392 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3393 | |
|
3394 | 0 | mmx_blt (imp, src_image->bits.bits, |
3395 | 0 | dest_image->bits.bits, |
3396 | 0 | src_image->bits.rowstride, |
3397 | 0 | dest_image->bits.rowstride, |
3398 | 0 | PIXMAN_FORMAT_BPP (src_image->bits.format), |
3399 | 0 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
3400 | 0 | src_x, src_y, dest_x, dest_y, width, height); |
3401 | 0 | } |
3402 | | |
3403 | | static void |
3404 | | mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, |
3405 | | pixman_composite_info_t *info) |
3406 | 0 | { |
3407 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3408 | 0 | uint32_t *src, *src_line; |
3409 | 0 | uint32_t *dst, *dst_line; |
3410 | 0 | uint8_t *mask, *mask_line; |
3411 | 0 | int src_stride, mask_stride, dst_stride; |
3412 | 0 | int32_t w; |
3413 | |
|
3414 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3415 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
3416 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
3417 | |
|
3418 | 0 | while (height--) |
3419 | 0 | { |
3420 | 0 | src = src_line; |
3421 | 0 | src_line += src_stride; |
3422 | 0 | dst = dst_line; |
3423 | 0 | dst_line += dst_stride; |
3424 | 0 | mask = mask_line; |
3425 | 0 | mask_line += mask_stride; |
3426 | |
|
3427 | 0 | w = width; |
3428 | |
|
3429 | 0 | while (w--) |
3430 | 0 | { |
3431 | 0 | uint64_t m = *mask; |
3432 | |
|
3433 | 0 | if (m) |
3434 | 0 | { |
3435 | 0 | uint32_t ssrc = *src | 0xff000000; |
3436 | 0 | __m64 s = load8888 (&ssrc); |
3437 | |
|
3438 | 0 | if (m == 0xff) |
3439 | 0 | { |
3440 | 0 | store8888 (dst, s); |
3441 | 0 | } |
3442 | 0 | else |
3443 | 0 | { |
3444 | 0 | __m64 sa = expand_alpha (s); |
3445 | 0 | __m64 vm = expand_alpha_rev (to_m64 (m)); |
3446 | 0 | __m64 vdest = in_over (s, sa, vm, load8888 (dst)); |
3447 | |
|
3448 | 0 | store8888 (dst, vdest); |
3449 | 0 | } |
3450 | 0 | } |
3451 | |
|
3452 | 0 | mask++; |
3453 | 0 | dst++; |
3454 | 0 | src++; |
3455 | 0 | } |
3456 | 0 | } |
3457 | |
|
3458 | 0 | _mm_empty (); |
3459 | 0 | } |
3460 | | |
3461 | | static void |
3462 | | mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, |
3463 | | pixman_composite_info_t *info) |
3464 | 0 | { |
3465 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3466 | 0 | uint32_t src; |
3467 | 0 | uint32_t *dst_line, *dst; |
3468 | 0 | int32_t w; |
3469 | 0 | int dst_stride; |
3470 | 0 | __m64 vsrc; |
3471 | |
|
3472 | 0 | CHECKPOINT (); |
3473 | |
|
3474 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
3475 | |
|
3476 | 0 | if (src == 0) |
3477 | 0 | return; |
3478 | | |
3479 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3480 | |
|
3481 | 0 | vsrc = load8888 (&src); |
3482 | |
|
3483 | 0 | while (height--) |
3484 | 0 | { |
3485 | 0 | dst = dst_line; |
3486 | 0 | dst_line += dst_stride; |
3487 | 0 | w = width; |
3488 | |
|
3489 | 0 | CHECKPOINT (); |
3490 | |
|
3491 | 0 | while (w && (uintptr_t)dst & 7) |
3492 | 0 | { |
3493 | 0 | __m64 vdest = load8888 (dst); |
3494 | |
|
3495 | 0 | store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); |
3496 | |
|
3497 | 0 | w--; |
3498 | 0 | dst++; |
3499 | 0 | } |
3500 | |
|
3501 | 0 | while (w >= 2) |
3502 | 0 | { |
3503 | 0 | __m64 vdest = *(__m64 *)dst; |
3504 | 0 | __m64 dest0 = expand8888 (vdest, 0); |
3505 | 0 | __m64 dest1 = expand8888 (vdest, 1); |
3506 | | |
3507 | |
|
3508 | 0 | dest0 = over (dest0, expand_alpha (dest0), vsrc); |
3509 | 0 | dest1 = over (dest1, expand_alpha (dest1), vsrc); |
3510 | |
|
3511 | 0 | *(__m64 *)dst = pack8888 (dest0, dest1); |
3512 | |
|
3513 | 0 | dst += 2; |
3514 | 0 | w -= 2; |
3515 | 0 | } |
3516 | |
|
3517 | 0 | CHECKPOINT (); |
3518 | |
|
3519 | 0 | if (w) |
3520 | 0 | { |
3521 | 0 | __m64 vdest = load8888 (dst); |
3522 | |
|
3523 | 0 | store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); |
3524 | 0 | } |
3525 | 0 | } |
3526 | |
|
3527 | 0 | _mm_empty (); |
3528 | 0 | } |
3529 | | |
3530 | | static force_inline void |
3531 | | scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t* pd, |
3532 | | const uint32_t* ps, |
3533 | | int32_t w, |
3534 | | pixman_fixed_t vx, |
3535 | | pixman_fixed_t unit_x, |
3536 | | pixman_fixed_t src_width_fixed, |
3537 | | pixman_bool_t fully_transparent_src) |
3538 | 0 | { |
3539 | 0 | if (fully_transparent_src) |
3540 | 0 | return; |
3541 | | |
3542 | 0 | while (w) |
3543 | 0 | { |
3544 | 0 | __m64 d = load (pd); |
3545 | 0 | __m64 s = load (ps + pixman_fixed_to_int (vx)); |
3546 | 0 | vx += unit_x; |
3547 | 0 | while (vx >= 0) |
3548 | 0 | vx -= src_width_fixed; |
3549 | |
|
3550 | 0 | store8888 (pd, core_combine_over_u_pixel_mmx (s, d)); |
3551 | 0 | pd++; |
3552 | |
|
3553 | 0 | w--; |
3554 | 0 | } |
3555 | |
|
3556 | 0 | _mm_empty (); |
3557 | 0 | } |
3558 | | |
3559 | | FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER, |
3560 | | scaled_nearest_scanline_mmx_8888_8888_OVER, |
3561 | | uint32_t, uint32_t, COVER) |
3562 | | FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER, |
3563 | | scaled_nearest_scanline_mmx_8888_8888_OVER, |
3564 | | uint32_t, uint32_t, NONE) |
3565 | | FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER, |
3566 | | scaled_nearest_scanline_mmx_8888_8888_OVER, |
3567 | | uint32_t, uint32_t, PAD) |
3568 | | FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER, |
3569 | | scaled_nearest_scanline_mmx_8888_8888_OVER, |
3570 | | uint32_t, uint32_t, NORMAL) |
3571 | | |
3572 | | static force_inline void |
3573 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask, |
3574 | | uint32_t * dst, |
3575 | | const uint32_t * src, |
3576 | | int32_t w, |
3577 | | pixman_fixed_t vx, |
3578 | | pixman_fixed_t unit_x, |
3579 | | pixman_fixed_t src_width_fixed, |
3580 | | pixman_bool_t zero_src) |
3581 | 0 | { |
3582 | 0 | __m64 mm_mask; |
3583 | |
|
3584 | 0 | if (zero_src || (*mask >> 24) == 0) |
3585 | 0 | { |
3586 | | /* A workaround for https://gcc.gnu.org/PR47759 */ |
3587 | 0 | _mm_empty (); |
3588 | 0 | return; |
3589 | 0 | } |
3590 | | |
3591 | 0 | mm_mask = expand_alpha (load8888 (mask)); |
3592 | |
|
3593 | 0 | while (w) |
3594 | 0 | { |
3595 | 0 | uint32_t s = *(src + pixman_fixed_to_int (vx)); |
3596 | 0 | vx += unit_x; |
3597 | 0 | while (vx >= 0) |
3598 | 0 | vx -= src_width_fixed; |
3599 | |
|
3600 | 0 | if (s) |
3601 | 0 | { |
3602 | 0 | __m64 ms = load8888 (&s); |
3603 | 0 | __m64 alpha = expand_alpha (ms); |
3604 | 0 | __m64 dest = load8888 (dst); |
3605 | |
|
3606 | 0 | store8888 (dst, (in_over (ms, alpha, mm_mask, dest))); |
3607 | 0 | } |
3608 | |
|
3609 | 0 | dst++; |
3610 | 0 | w--; |
3611 | 0 | } |
3612 | |
|
3613 | 0 | _mm_empty (); |
3614 | 0 | } |
3615 | | |
3616 | | FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER, |
3617 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER, |
3618 | | uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) |
3619 | | FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER, |
3620 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER, |
3621 | | uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) |
3622 | | FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER, |
3623 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER, |
3624 | | uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) |
3625 | | FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER, |
3626 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER, |
3627 | | uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) |
3628 | | |
3629 | 0 | #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) |
3630 | 0 | #define BMSK (BSHIFT - 1) |
3631 | | |
3632 | | #define BILINEAR_DECLARE_VARIABLES \ |
3633 | 0 | const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \ |
3634 | 0 | const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \ |
3635 | 0 | const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \ |
3636 | 0 | const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \ |
3637 | 0 | const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \ |
3638 | 0 | const __m64 mm_zero = _mm_setzero_si64 (); \ |
3639 | 0 | __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) |
3640 | | |
3641 | 0 | #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ |
3642 | 0 | do { \ |
3643 | 0 | /* fetch 2x2 pixel block into 2 mmx registers */ \ |
3644 | 0 | __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \ |
3645 | 0 | __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \ |
3646 | 0 | /* vertical interpolation */ \ |
3647 | 0 | __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \ |
3648 | 0 | __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \ |
3649 | 0 | __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \ |
3650 | 0 | __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ |
3651 | 0 | __m64 hi = _mm_add_pi16 (t_hi, b_hi); \ |
3652 | 0 | __m64 lo = _mm_add_pi16 (t_lo, b_lo); \ |
3653 | 0 | /* calculate horizontal weights */ \ |
3654 | 0 | __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \ |
3655 | 0 | _mm_srli_pi16 (mm_x, \ |
3656 | 0 | 16 - BILINEAR_INTERPOLATION_BITS))); \ |
3657 | 0 | /* horizontal interpolation */ \ |
3658 | 0 | __m64 p = _mm_unpacklo_pi16 (lo, hi); \ |
3659 | 0 | __m64 q = _mm_unpackhi_pi16 (lo, hi); \ |
3660 | 0 | vx += unit_x; \ |
3661 | 0 | lo = _mm_madd_pi16 (p, mm_wh); \ |
3662 | 0 | hi = _mm_madd_pi16 (q, mm_wh); \ |
3663 | 0 | mm_x = _mm_add_pi16 (mm_x, mm_ux); \ |
3664 | 0 | /* shift and pack the result */ \ |
3665 | 0 | hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \ |
3666 | 0 | lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \ |
3667 | 0 | lo = _mm_packs_pi32 (lo, hi); \ |
3668 | 0 | lo = _mm_packs_pu16 (lo, lo); \ |
3669 | 0 | pix = lo; \ |
3670 | 0 | } while (0) |
3671 | | |
3672 | 0 | #define BILINEAR_SKIP_ONE_PIXEL() \ |
3673 | 0 | do { \ |
3674 | 0 | vx += unit_x; \ |
3675 | 0 | mm_x = _mm_add_pi16 (mm_x, mm_ux); \ |
3676 | 0 | } while(0) |
3677 | | |
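Numerically, BILINEAR_INTERPOLATE_ONE_PIXEL above is a fixed-point 2x2 blend: a vertical pass weighted by wt/wb followed by a horizontal pass weighted by the fractional part of vx, each with BILINEAR_INTERPOLATION_BITS bits of precision. A hypothetical per-channel scalar version, assuming the same weight precision (names are illustration-only):

    #include <stdint.h>

    #define WEIGHT_BITS 7   /* assumed value of BILINEAR_INTERPOLATION_BITS */

    static uint8_t bilinear_channel (uint8_t tl, uint8_t tr,
                                     uint8_t bl, uint8_t br,
                                     int wx, int wy)   /* 0 .. 1 << WEIGHT_BITS */
    {
        int one   = 1 << WEIGHT_BITS;
        int left  = tl * (one - wy) + bl * wy;         /* vertical, left column   */
        int right = tr * (one - wy) + br * wy;         /* vertical, right column  */
        int v     = left * (one - wx) + right * wx;    /* horizontal between them */

        return (uint8_t) (v >> (2 * WEIGHT_BITS));     /* drop both weight scales */
    }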
3678 | | static force_inline void |
3679 | | scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst, |
3680 | | const uint32_t * mask, |
3681 | | const uint32_t * src_top, |
3682 | | const uint32_t * src_bottom, |
3683 | | int32_t w, |
3684 | | int wt, |
3685 | | int wb, |
3686 | | pixman_fixed_t vx, |
3687 | | pixman_fixed_t unit_x, |
3688 | | pixman_fixed_t max_vx, |
3689 | | pixman_bool_t zero_src) |
3690 | 0 | { |
3691 | 0 | BILINEAR_DECLARE_VARIABLES; |
3692 | 0 | __m64 pix; |
3693 | |
|
3694 | 0 | while (w--) |
3695 | 0 | { |
3696 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix); |
3697 | 0 | store (dst, pix); |
3698 | 0 | dst++; |
3699 | 0 | } |
3700 | |
|
3701 | 0 | _mm_empty (); |
3702 | 0 | } |
3703 | | |
3704 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, |
3705 | | scaled_bilinear_scanline_mmx_8888_8888_SRC, |
3706 | | uint32_t, uint32_t, uint32_t, |
3707 | | COVER, FLAG_NONE) |
3708 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, |
3709 | | scaled_bilinear_scanline_mmx_8888_8888_SRC, |
3710 | | uint32_t, uint32_t, uint32_t, |
3711 | | PAD, FLAG_NONE) |
3712 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, |
3713 | | scaled_bilinear_scanline_mmx_8888_8888_SRC, |
3714 | | uint32_t, uint32_t, uint32_t, |
3715 | | NONE, FLAG_NONE) |
3716 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, |
3717 | | scaled_bilinear_scanline_mmx_8888_8888_SRC, |
3718 | | uint32_t, uint32_t, uint32_t, |
3719 | | NORMAL, FLAG_NONE) |
3720 | | |
3721 | | static force_inline void |
3722 | | scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst, |
3723 | | const uint32_t * mask, |
3724 | | const uint32_t * src_top, |
3725 | | const uint32_t * src_bottom, |
3726 | | int32_t w, |
3727 | | int wt, |
3728 | | int wb, |
3729 | | pixman_fixed_t vx, |
3730 | | pixman_fixed_t unit_x, |
3731 | | pixman_fixed_t max_vx, |
3732 | | pixman_bool_t zero_src) |
3733 | 0 | { |
3734 | 0 | BILINEAR_DECLARE_VARIABLES; |
3735 | 0 | __m64 pix1, pix2; |
3736 | |
3737 | 0 | while (w) |
3738 | 0 | { |
3739 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
3740 | |
3741 | 0 | if (!is_zero (pix1)) |
3742 | 0 | { |
3743 | 0 | pix2 = load (dst); |
3744 | 0 | store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2)); |
3745 | 0 | } |
3746 | |
3747 | 0 | w--; |
3748 | 0 | dst++; |
3749 | 0 | } |
3750 | |
3751 | 0 | _mm_empty (); |
3752 | 0 | } |
3753 | | |
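(Reader's note, not part of the listing: the OVER scanline above skips fully transparent interpolated pixels and otherwise applies core_combine_over_u_pixel_mmx to premultiplied a8r8g8b8 data. A hedged scalar equivalent of that combine is sketched below; div_255 and over_pixel are names invented here for illustration, not pixman identifiers.

    #include <stdint.h>

    /* exact x / 255 with rounding; a common trick, assumed here */
    static uint32_t
    div_255 (uint32_t x)
    {
        x += 0x80;
        return (x + (x >> 8)) >> 8;
    }

    /* result = src OVER dst for premultiplied a8r8g8b8 pixels:
     * every channel becomes src + dst * (255 - alpha (src)) / 255 */
    static uint32_t
    over_pixel (uint32_t src, uint32_t dst)
    {
        uint32_t ia = 255 - (src >> 24);   /* inverse source alpha */
        uint32_t result = 0;
        int shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint32_t s = (src >> shift) & 0xff;
            uint32_t d = (dst >> shift) & 0xff;

            result |= (s + div_255 (d * ia)) << shift;
        }
        return result;
    }
)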
3754 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, |
3755 | | scaled_bilinear_scanline_mmx_8888_8888_OVER, |
3756 | | uint32_t, uint32_t, uint32_t, |
3757 | | COVER, FLAG_NONE) |
3758 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER, |
3759 | | scaled_bilinear_scanline_mmx_8888_8888_OVER, |
3760 | | uint32_t, uint32_t, uint32_t, |
3761 | | PAD, FLAG_NONE) |
3762 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER, |
3763 | | scaled_bilinear_scanline_mmx_8888_8888_OVER, |
3764 | | uint32_t, uint32_t, uint32_t, |
3765 | | NONE, FLAG_NONE) |
3766 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER, |
3767 | | scaled_bilinear_scanline_mmx_8888_8888_OVER, |
3768 | | uint32_t, uint32_t, uint32_t, |
3769 | | NORMAL, FLAG_NONE) |
3770 | | |
3771 | | static force_inline void |
3772 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst, |
3773 | | const uint8_t * mask, |
3774 | | const uint32_t * src_top, |
3775 | | const uint32_t * src_bottom, |
3776 | | int32_t w, |
3777 | | int wt, |
3778 | | int wb, |
3779 | | pixman_fixed_t vx, |
3780 | | pixman_fixed_t unit_x, |
3781 | | pixman_fixed_t max_vx, |
3782 | | pixman_bool_t zero_src) |
3783 | 0 | { |
3784 | 0 | BILINEAR_DECLARE_VARIABLES; |
3785 | 0 | __m64 pix1, pix2; |
3786 | 0 | uint32_t m; |
3787 | |
3788 | 0 | while (w) |
3789 | 0 | { |
3790 | 0 | m = (uint32_t) *mask++; |
3791 | |
3792 | 0 | if (m) |
3793 | 0 | { |
3794 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
3795 | |
3796 | 0 | if (m == 0xff && is_opaque (pix1)) |
3797 | 0 | { |
3798 | 0 | store (dst, pix1); |
3799 | 0 | } |
3800 | 0 | else |
3801 | 0 | { |
3802 | 0 | __m64 ms, md, ma, msa; |
3803 | |
3804 | 0 | pix2 = load (dst); |
3805 | 0 | ma = expand_alpha_rev (to_m64 (m)); |
3806 | 0 | ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ()); |
3807 | 0 | md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ()); |
3808 | |
3809 | 0 | msa = expand_alpha (ms); |
3810 | |
3811 | 0 | store8888 (dst, (in_over (ms, msa, ma, md))); |
3812 | 0 | } |
3813 | 0 | } |
3814 | 0 | else |
3815 | 0 | { |
3816 | 0 | BILINEAR_SKIP_ONE_PIXEL (); |
3817 | 0 | } |
3818 | |
3819 | 0 | w--; |
3820 | 0 | dst++; |
3821 | 0 | } |
3822 | |
3823 | 0 | _mm_empty (); |
3824 | 0 | } |
3825 | | |
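(Reader's note, not part of the listing: in the masked scanline above, a mask byte other than 0xff falls through to in_over(): the interpolated source and its alpha are first scaled by the mask, and the result is composited OVER the destination. A self-contained scalar sketch of that combination follows; in_over_scalar and div_255 are illustrative names, not pixman identifiers.

    #include <stdint.h>

    static uint32_t
    div_255 (uint32_t x) { x += 0x80; return (x + (x >> 8)) >> 8; }

    static uint32_t
    in_over_scalar (uint32_t src, uint8_t m, uint32_t dst)
    {
        uint32_t masked = 0, result = 0;
        uint32_t ia;
        int shift;

        /* IN: scale every source channel, including alpha, by the mask byte */
        for (shift = 0; shift < 32; shift += 8)
            masked |= div_255 (((src >> shift) & 0xff) * m) << shift;

        /* OVER: result = masked + dst * (255 - alpha (masked)) / 255 */
        ia = 255 - (masked >> 24);
        for (shift = 0; shift < 32; shift += 8)
        {
            uint32_t s = (masked >> shift) & 0xff;
            uint32_t d = (dst >> shift) & 0xff;

            result |= (s + div_255 (d * ia)) << shift;
        }
        return result;
    }
)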
3826 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, |
3827 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
3828 | | uint32_t, uint8_t, uint32_t, |
3829 | | COVER, FLAG_HAVE_NON_SOLID_MASK) |
3830 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER, |
3831 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
3832 | | uint32_t, uint8_t, uint32_t, |
3833 | | PAD, FLAG_HAVE_NON_SOLID_MASK) |
3834 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER, |
3835 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
3836 | | uint32_t, uint8_t, uint32_t, |
3837 | | NONE, FLAG_HAVE_NON_SOLID_MASK) |
3838 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER, |
3839 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
3840 | | uint32_t, uint8_t, uint32_t, |
3841 | | NORMAL, FLAG_HAVE_NON_SOLID_MASK) |
3842 | | |
3843 | | static uint32_t * |
3844 | | mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) |
3845 | 0 | { |
3846 | 0 | int w = iter->width; |
3847 | 0 | uint32_t *dst = iter->buffer; |
3848 | 0 | uint32_t *src = (uint32_t *)iter->bits; |
3849 | |
3850 | 0 | iter->bits += iter->stride; |
3851 | |
3852 | 0 | while (w && ((uintptr_t)dst) & 7) |
3853 | 0 | { |
3854 | 0 | *dst++ = (*src++) | 0xff000000; |
3855 | 0 | w--; |
3856 | 0 | } |
3857 | |
3858 | 0 | while (w >= 8) |
3859 | 0 | { |
3860 | 0 | __m64 vsrc1 = ldq_u ((__m64 *)(src + 0)); |
3861 | 0 | __m64 vsrc2 = ldq_u ((__m64 *)(src + 2)); |
3862 | 0 | __m64 vsrc3 = ldq_u ((__m64 *)(src + 4)); |
3863 | 0 | __m64 vsrc4 = ldq_u ((__m64 *)(src + 6)); |
3864 | |
3865 | 0 | *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000)); |
3866 | 0 | *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000)); |
3867 | 0 | *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000)); |
3868 | 0 | *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000)); |
3869 | |
3870 | 0 | dst += 8; |
3871 | 0 | src += 8; |
3872 | 0 | w -= 8; |
3873 | 0 | } |
3874 | |
3875 | 0 | while (w) |
3876 | 0 | { |
3877 | 0 | *dst++ = (*src++) | 0xff000000; |
3878 | 0 | w--; |
3879 | 0 | } |
3880 | |
3881 | 0 | _mm_empty (); |
3882 | 0 | return iter->buffer; |
3883 | 0 | } |
3884 | | |
3885 | | static uint32_t * |
3886 | | mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) |
3887 | 0 | { |
3888 | 0 | int w = iter->width; |
3889 | 0 | uint32_t *dst = iter->buffer; |
3890 | 0 | uint16_t *src = (uint16_t *)iter->bits; |
3891 | |
3892 | 0 | iter->bits += iter->stride; |
3893 | |
3894 | 0 | while (w && ((uintptr_t)dst) & 0x0f) |
3895 | 0 | { |
3896 | 0 | uint16_t s = *src++; |
3897 | |
3898 | 0 | *dst++ = convert_0565_to_8888 (s); |
3899 | 0 | w--; |
3900 | 0 | } |
3901 | |
3902 | 0 | while (w >= 4) |
3903 | 0 | { |
3904 | 0 | __m64 vsrc = ldq_u ((__m64 *)src); |
3905 | 0 | __m64 mm0, mm1; |
3906 | |
3907 | 0 | expand_4xpacked565 (vsrc, &mm0, &mm1, 1); |
3908 | |
3909 | 0 | *(__m64 *)(dst + 0) = mm0; |
3910 | 0 | *(__m64 *)(dst + 2) = mm1; |
3911 | |
3912 | 0 | dst += 4; |
3913 | 0 | src += 4; |
3914 | 0 | w -= 4; |
3915 | 0 | } |
3916 | |
3917 | 0 | while (w) |
3918 | 0 | { |
3919 | 0 | uint16_t s = *src++; |
3920 | |
3921 | 0 | *dst++ = convert_0565_to_8888 (s); |
3922 | 0 | w--; |
3923 | 0 | } |
3924 | |
3925 | 0 | _mm_empty (); |
3926 | 0 | return iter->buffer; |
3927 | 0 | } |
3928 | | |
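(Reader's note, not part of the listing: the scalar head and tail loops of mmx_fetch_r5g6b5 above use convert_0565_to_8888, and expand_4xpacked565 performs the equivalent expansion four pixels at a time. The conversion replicates each channel's high bits into its low bits and forces alpha to 0xff; a sketch under that assumption, with an illustrative helper name, is:

    #include <stdint.h>

    /* Expand one packed r5g6b5 pixel to a8r8g8b8 by bit replication. */
    static uint32_t
    expand_0565 (uint16_t s)
    {
        uint32_t r = (s >> 11) & 0x1f;
        uint32_t g = (s >> 5)  & 0x3f;
        uint32_t b =  s        & 0x1f;

        r = (r << 3) | (r >> 2);   /* 5 -> 8 bits */
        g = (g << 2) | (g >> 4);   /* 6 -> 8 bits */
        b = (b << 3) | (b >> 2);   /* 5 -> 8 bits */

        return 0xff000000 | (r << 16) | (g << 8) | b;
    }
)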
3929 | | static uint32_t * |
3930 | | mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) |
3931 | 0 | { |
3932 | 0 | int w = iter->width; |
3933 | 0 | uint32_t *dst = iter->buffer; |
3934 | 0 | uint8_t *src = iter->bits; |
3935 | |
3936 | 0 | iter->bits += iter->stride; |
3937 | |
3938 | 0 | while (w && (((uintptr_t)dst) & 15)) |
3939 | 0 | { |
3940 | 0 | *dst++ = (uint32_t)*(src++) << 24; |
3941 | 0 | w--; |
3942 | 0 | } |
3943 | |
3944 | 0 | while (w >= 8) |
3945 | 0 | { |
3946 | 0 | __m64 mm0 = ldq_u ((__m64 *)src); |
3947 | |
3948 | 0 | __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0); |
3949 | 0 | __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0); |
3950 | 0 | __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1); |
3951 | 0 | __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1); |
3952 | 0 | __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2); |
3953 | 0 | __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2); |
3954 | |
3955 | 0 | *(__m64 *)(dst + 0) = mm3; |
3956 | 0 | *(__m64 *)(dst + 2) = mm4; |
3957 | 0 | *(__m64 *)(dst + 4) = mm5; |
3958 | 0 | *(__m64 *)(dst + 6) = mm6; |
3959 | |
3960 | 0 | dst += 8; |
3961 | 0 | src += 8; |
3962 | 0 | w -= 8; |
3963 | 0 | } |
3964 | |
3965 | 0 | while (w) |
3966 | 0 | { |
3967 | 0 | *dst++ = (uint32_t)*(src++) << 24; |
3968 | 0 | w--; |
3969 | 0 | } |
3970 | |
3971 | 0 | _mm_empty (); |
3972 | 0 | return iter->buffer; |
3973 | 0 | } |
3974 | | |
3975 | | #define IMAGE_FLAGS \ |
3976 | | (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ |
3977 | | FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) |
3978 | | |
3979 | | static const pixman_iter_info_t mmx_iters[] = |
3980 | | { |
3981 | | { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, |
3982 | | _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL |
3983 | | }, |
3984 | | { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW, |
3985 | | _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL |
3986 | | }, |
3987 | | { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, |
3988 | | _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL |
3989 | | }, |
3990 | | { PIXMAN_null }, |
3991 | | }; |
3992 | | |
3993 | | static const pixman_fast_path_t mmx_fast_paths[] = |
3994 | | { |
3995 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ), |
3996 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ), |
3997 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ), |
3998 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ), |
3999 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ), |
4000 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ), |
4001 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ), |
4002 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ), |
4003 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ), |
4004 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ), |
4005 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ), |
4006 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ), |
4007 | | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ), |
4008 | | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ), |
4009 | | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ), |
4010 | | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ), |
4011 | | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ), |
4012 | | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ), |
4013 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ), |
4014 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ), |
4015 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ), |
4016 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ), |
4017 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ), |
4018 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ), |
4019 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ), |
4020 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ), |
4021 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ), |
4022 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), |
4023 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ), |
4024 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ), |
4025 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ), |
4026 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ), |
4027 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ), |
4028 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ), |
4029 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
4030 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
4031 | | |
4032 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ), |
4033 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ), |
4034 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ), |
4035 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ), |
4036 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ), |
4037 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ), |
4038 | | |
4039 | | PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888), |
4040 | | PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888), |
4041 | | |
4042 | | PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ), |
4043 | | PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ), |
4044 | | PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ), |
4045 | | PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ), |
4046 | | PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), |
4047 | | PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), |
4048 | | |
4049 | | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), |
4050 | | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), |
4051 | | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), |
4052 | | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), |
4053 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), |
4054 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ), |
4055 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ), |
4056 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ), |
4057 | | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ), |
4058 | | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ), |
4059 | | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
4060 | | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
4061 | | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
4062 | | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
4063 | | PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ), |
4064 | | PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ), |
4065 | | |
4066 | | PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ), |
4067 | | PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ), |
4068 | | |
4069 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
4070 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
4071 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), |
4072 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), |
4073 | | |
4074 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888 ), |
4075 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888 ), |
4076 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888 ), |
4077 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888 ), |
4078 | | |
4079 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), |
4080 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
4081 | | SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
4082 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), |
4083 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
4084 | | SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
4085 | | |
4086 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
4087 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
4088 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), |
4089 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), |
4090 | | |
4091 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ), |
4092 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ), |
4093 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ), |
4094 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ), |
4095 | | |
4096 | | { PIXMAN_OP_NONE }, |
4097 | | }; |
4098 | | |
4099 | | pixman_implementation_t * |
4100 | | _pixman_implementation_create_mmx (pixman_implementation_t *fallback) |
4101 | 12 | { |
4102 | 12 | pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); |
4103 | | |
4104 | 12 | imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; |
4105 | 12 | imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; |
4106 | 12 | imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; |
4107 | 12 | imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; |
4108 | 12 | imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; |
4109 | 12 | imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; |
4110 | 12 | imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; |
4111 | 12 | imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; |
4112 | 12 | imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; |
4113 | 12 | imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; |
4114 | 12 | imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; |
4115 | | |
4116 | 12 | imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; |
4117 | 12 | imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; |
4118 | 12 | imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; |
4119 | 12 | imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; |
4120 | 12 | imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; |
4121 | 12 | imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; |
4122 | 12 | imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; |
4123 | 12 | imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; |
4124 | 12 | imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; |
4125 | 12 | imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; |
4126 | 12 | imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; |
4127 | | |
4128 | 12 | imp->blt = mmx_blt; |
4129 | 12 | imp->fill = mmx_fill; |
4130 | | |
4131 | 12 | imp->iter_info = mmx_iters; |
4132 | | |
4133 | 12 | return imp; |
4134 | 12 | } |
4135 | | |
4136 | | #endif /* USE_X86_MMX || USE_LOONGSON_MMI */ |