/work/workdir/UnpackedTarball/pixman/pixman/pixman-mmx.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright © 2004, 2005 Red Hat, Inc. |
3 | | * Copyright © 2004 Nicholas Miell |
4 | | * Copyright © 2005 Trolltech AS |
5 | | * |
6 | | * Permission to use, copy, modify, distribute, and sell this software and its |
7 | | * documentation for any purpose is hereby granted without fee, provided that |
8 | | * the above copyright notice appear in all copies and that both that |
9 | | * copyright notice and this permission notice appear in supporting |
10 | | * documentation, and that the name of Red Hat not be used in advertising or |
11 | | * publicity pertaining to distribution of the software without specific, |
12 | | * written prior permission. Red Hat makes no representations about the |
13 | | * suitability of this software for any purpose. It is provided "as is" |
14 | | * without express or implied warranty. |
15 | | * |
16 | | * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS |
17 | | * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
18 | | * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY |
19 | | * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
20 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN |
21 | | * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
22 | | * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS |
23 | | * SOFTWARE. |
24 | | * |
25 | | * Author: Søren Sandmann (sandmann@redhat.com) |
26 | | * Minor Improvements: Nicholas Miell (nmiell@gmail.com) |
27 | | * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) |
28 | | * |
29 | | * Based on work by Owen Taylor |
30 | | */ |
31 | | |
32 | | #ifdef HAVE_CONFIG_H |
33 | | #include <config.h> |
34 | | #endif |
35 | | |
36 | | #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI |
37 | | |
38 | | #ifdef USE_LOONGSON_MMI |
39 | | #include <loongson-mmintrin.h> |
40 | | #else |
41 | | #include <mmintrin.h> |
42 | | #endif |
43 | | #include "pixman-private.h" |
44 | | #include "pixman-combine32.h" |
45 | | #include "pixman-inlines.h" |
46 | | |
47 | | #ifdef VERBOSE |
48 | | #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__) |
49 | | #else |
50 | | #define CHECKPOINT() |
51 | | #endif |
52 | | |
53 | | #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8 |
54 | | /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */ |
55 | | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
56 | | _mm_empty (void) |
57 | | { |
58 | | |
59 | | } |
60 | | #endif |
61 | | |
62 | | #ifdef USE_X86_MMX |
63 | | # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64)) |
64 | | # include <xmmintrin.h> |
65 | | # else |
66 | | /* We have to compile with -msse to use xmmintrin.h, but that causes SSE |
67 | | * instructions to be generated that we don't want. Just duplicate the |
68 | | * functions we want to use. */ |
69 | | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
70 | | _mm_movemask_pi8 (__m64 __A) |
71 | 0 | { |
72 | 0 | int ret; |
73 | |
74 | 0 | asm ("pmovmskb %1, %0\n\t" |
75 | 0 | : "=r" (ret) |
76 | 0 | : "y" (__A) |
77 | 0 | ); |
78 | |
79 | 0 | return ret; |
80 | 0 | } |
81 | | |
82 | | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
83 | | _mm_mulhi_pu16 (__m64 __A, __m64 __B) |
84 | 0 | { |
85 | 0 | asm ("pmulhuw %1, %0\n\t" |
86 | 0 | : "+y" (__A) |
87 | 0 | : "y" (__B) |
88 | 0 | ); |
89 | 0 | return __A; |
90 | 0 | } |
91 | | |
92 | | # define _mm_shuffle_pi16(A, N) \ |
93 | 0 | ({ \ |
94 | 0 | __m64 ret; \ |
95 | 0 | \ |
96 | 0 | asm ("pshufw %2, %1, %0\n\t" \ |
97 | 0 | : "=y" (ret) \ |
98 | 0 | : "y" (A), "K" ((const int8_t)N) \ |
99 | 0 | ); \ |
100 | 0 | \ |
101 | 0 | ret; \ |
102 | 0 | }) |
103 | | # endif |
104 | | #endif |
105 | | |
106 | | #ifndef _MSC_VER |
107 | | #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ |
108 | | (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) |
109 | | #endif |
110 | | |
111 | | /* Notes about writing mmx code |
112 | | * |
113 | | * give memory operands as the second operand. If you give it as the |
114 | | * first, gcc will first load it into a register, then use that |
115 | | * register |
116 | | * |
117 | | * ie. use |
118 | | * |
119 | | * _mm_mullo_pi16 (x, mmx_constant); |
120 | | * |
121 | | * not |
122 | | * |
123 | | * _mm_mullo_pi16 (mmx_constant, x); |
124 | | * |
125 | | * Also try to minimize dependencies. i.e. when you need a value, try |
126 | | * to calculate it from a value that was calculated as early as |
127 | | * possible. |
128 | | */ |
129 | | |
130 | | /* --------------- MMX primitives ------------------------------------- */ |
131 | | |
132 | | /* If __m64 is defined as a struct or union, then define M64_MEMBER to be |
133 | | * the name of the member used to access the data. |
134 | | * If __m64 requires using mm_cvt* intrinsics functions to convert between |
135 | | * uint64_t and __m64 values, then define USE_CVT_INTRINSICS. |
136 | | * If __m64 and uint64_t values can just be cast to each other directly, |
137 | | * then define USE_M64_CASTS. |
138 | | * If __m64 is a double datatype, then define USE_M64_DOUBLE. |
139 | | */ |
140 | | #ifdef _MSC_VER |
141 | | # define M64_MEMBER m64_u64 |
142 | | #elif defined(__ICC) |
143 | | # define USE_CVT_INTRINSICS |
144 | | #elif defined(USE_LOONGSON_MMI) |
145 | | # define USE_M64_DOUBLE |
146 | | #elif defined(__GNUC__) |
147 | | # define USE_M64_CASTS |
148 | | #elif defined(__SUNPRO_C) |
149 | | # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__) |
150 | | /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__) |
151 | | * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__ |
152 | | * is defined. If it is used, then the mm_cvt* intrinsics must be used. |
153 | | */ |
154 | | # define USE_CVT_INTRINSICS |
155 | | # else |
156 | | /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is |
157 | | * disabled, __m64 is defined as a struct containing "unsigned long long l_". |
158 | | */ |
159 | | # define M64_MEMBER l_ |
160 | | # endif |
161 | | #endif |
162 | | |
163 | | #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE) |
164 | | typedef uint64_t mmxdatafield; |
165 | | #else |
166 | | typedef __m64 mmxdatafield; |
167 | | #endif |
168 | | |
169 | | typedef struct |
170 | | { |
171 | | mmxdatafield mmx_4x00ff; |
172 | | mmxdatafield mmx_4x0080; |
173 | | mmxdatafield mmx_565_rgb; |
174 | | mmxdatafield mmx_565_unpack_multiplier; |
175 | | mmxdatafield mmx_565_pack_multiplier; |
176 | | mmxdatafield mmx_565_r; |
177 | | mmxdatafield mmx_565_g; |
178 | | mmxdatafield mmx_565_b; |
179 | | mmxdatafield mmx_packed_565_rb; |
180 | | mmxdatafield mmx_packed_565_g; |
181 | | mmxdatafield mmx_expand_565_g; |
182 | | mmxdatafield mmx_expand_565_b; |
183 | | mmxdatafield mmx_expand_565_r; |
184 | | #ifndef USE_LOONGSON_MMI |
185 | | mmxdatafield mmx_mask_0; |
186 | | mmxdatafield mmx_mask_1; |
187 | | mmxdatafield mmx_mask_2; |
188 | | mmxdatafield mmx_mask_3; |
189 | | #endif |
190 | | mmxdatafield mmx_full_alpha; |
191 | | mmxdatafield mmx_4x0101; |
192 | | mmxdatafield mmx_ff000000; |
193 | | } mmx_data_t; |
194 | | |
195 | | #if defined(_MSC_VER) |
196 | | # define MMXDATA_INIT(field, val) { val ## UI64 } |
197 | | #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */ |
198 | | # define MMXDATA_INIT(field, val) field = { val ## ULL } |
199 | | #else /* mmxdatafield is an integral type */ |
200 | | # define MMXDATA_INIT(field, val) field = val ## ULL |
201 | | #endif |
202 | | |
203 | | static const mmx_data_t c = |
204 | | { |
205 | | MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff), |
206 | | MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080), |
207 | | MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f), |
208 | | MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840), |
209 | | MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004), |
210 | | MMXDATA_INIT (.mmx_565_r, 0x000000f800000000), |
211 | | MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000), |
212 | | MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), |
213 | | MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8), |
214 | | MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00), |
215 | | MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0), |
216 | | MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f), |
217 | | MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800), |
218 | | #ifndef USE_LOONGSON_MMI |
219 | | MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), |
220 | | MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), |
221 | | MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff), |
222 | | MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff), |
223 | | #endif |
224 | | MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000), |
225 | | MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101), |
226 | | MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000), |
227 | | }; |
228 | | |
229 | | #ifdef USE_CVT_INTRINSICS |
230 | | # define MC(x) to_m64 (c.mmx_ ## x) |
231 | | #elif defined(USE_M64_CASTS) |
232 | 0 | # define MC(x) ((__m64)c.mmx_ ## x) |
233 | | #elif defined(USE_M64_DOUBLE) |
234 | | # define MC(x) (*(__m64 *)&c.mmx_ ## x) |
235 | | #else |
236 | | # define MC(x) c.mmx_ ## x |
237 | | #endif |
238 | | |
239 | | static force_inline __m64 |
240 | | to_m64 (uint64_t x) |
241 | 0 | { |
242 | | #ifdef USE_CVT_INTRINSICS |
243 | | return _mm_cvtsi64_m64 (x); |
244 | | #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
245 | | __m64 res; |
246 | | |
247 | | res.M64_MEMBER = x; |
248 | | return res; |
249 | | #elif defined USE_M64_DOUBLE |
250 | | return *(__m64 *)&x; |
251 | | #else /* USE_M64_CASTS */ |
252 | 0 | return (__m64)x; |
253 | 0 | #endif |
254 | 0 | } |
255 | | |
256 | | static force_inline uint64_t |
257 | | to_uint64 (__m64 x) |
258 | 0 | { |
259 | | #ifdef USE_CVT_INTRINSICS |
260 | | return _mm_cvtm64_si64 (x); |
261 | | #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
262 | | uint64_t res = x.M64_MEMBER; |
263 | | return res; |
264 | | #elif defined USE_M64_DOUBLE |
265 | | return *(uint64_t *)&x; |
266 | | #else /* USE_M64_CASTS */ |
267 | 0 | return (uint64_t)x; |
268 | 0 | #endif |
269 | 0 | } |
270 | | |
271 | | static force_inline __m64 |
272 | | shift (__m64 v, |
273 | | int s) |
274 | 0 | { |
275 | 0 | if (s > 0) |
276 | 0 | return _mm_slli_si64 (v, s); |
277 | 0 | else if (s < 0) |
278 | 0 | return _mm_srli_si64 (v, -s); |
279 | 0 | else |
280 | 0 | return v; |
281 | 0 | } |
282 | | |
283 | | static force_inline __m64 |
284 | | negate (__m64 mask) |
285 | 0 | { |
286 | 0 | return _mm_xor_si64 (mask, MC (4x00ff)); |
287 | 0 | } |
288 | | |
289 | | /* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1 |
290 | | * and maps its result to the same range. |
291 | | * |
292 | | * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner: |
293 | | * Notation, Notation, Notation", the first of which is |
294 | | * |
295 | | * prod(a, b) = (a * b + 128) / 255. |
296 | | * |
297 | | * By approximating the division by 255 as 257/65536 it can be replaced by a |
298 | | * multiply and a right shift. This is the implementation that we use in |
299 | | * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended |
300 | | * 3DNow!, and unavailable at the time of the book's publication) to perform |
301 | | * the multiplication and right shift in a single operation. |
302 | | * |
303 | | * prod(a, b) = ((a * b + 128) * 257) >> 16. |
304 | | * |
305 | | * A third way (how pix_multiply() was implemented prior to 14208344) exists |
306 | | * also that performs the multiplication by 257 with adds and shifts. |
307 | | * |
308 | | * Where temp = a * b + 128 |
309 | | * |
310 | | * prod(a, b) = (temp + (temp >> 8)) >> 8. |
311 | | */ |
312 | | static force_inline __m64 |
313 | | pix_multiply (__m64 a, __m64 b) |
314 | 0 | { |
315 | 0 | __m64 res; |
316 | |
317 | 0 | res = _mm_mullo_pi16 (a, b); |
318 | 0 | res = _mm_adds_pu16 (res, MC (4x0080)); |
319 | 0 | res = _mm_mulhi_pu16 (res, MC (4x0101)); |
320 | |
321 | 0 | return res; |
322 | 0 | } |
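As a rough scalar cross-check of the three formulations above (a standalone sketch, not part of pixman-mmx.c; all names are illustrative), the multiply-by-257 form and the add-and-shift form can be compared against the exactly rounded product for every pair of 8-bit values:

    #include <assert.h>
    #include <stdint.h>

    /* Illustrative sketch only: scalar equivalents of the prod(a, b)
     * formulations described in the comment above. */
    static uint8_t prod_exact (uint8_t a, uint8_t b)
    {
        return (uint8_t) ((2 * a * b + 255) / 510);      /* a*b/255, rounded */
    }

    static uint8_t prod_mul257 (uint8_t a, uint8_t b)
    {
        return (uint8_t) (((a * b + 128) * 257) >> 16);  /* pix_multiply () form */
    }

    static uint8_t prod_shift (uint8_t a, uint8_t b)
    {
        uint32_t t = a * b + 128;
        return (uint8_t) ((t + (t >> 8)) >> 8);          /* add-and-shift form */
    }

    int main (void)
    {
        int a, b;

        for (a = 0; a < 256; a++)
            for (b = 0; b < 256; b++)
                assert (prod_exact (a, b) == prod_mul257 (a, b) &&
                        prod_exact (a, b) == prod_shift (a, b));
        return 0;
    }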
323 | | |
324 | | static force_inline __m64 |
325 | | pix_add (__m64 a, __m64 b) |
326 | 0 | { |
327 | 0 | return _mm_adds_pu8 (a, b); |
328 | 0 | } |
329 | | |
330 | | static force_inline __m64 |
331 | | expand_alpha (__m64 pixel) |
332 | 0 | { |
333 | 0 | return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3)); |
334 | 0 | } |
335 | | |
336 | | static force_inline __m64 |
337 | | expand_alpha_rev (__m64 pixel) |
338 | 0 | { |
339 | 0 | return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0)); |
340 | 0 | } |
341 | | |
342 | | static force_inline __m64 |
343 | | invert_colors (__m64 pixel) |
344 | 0 | { |
345 | 0 | return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2)); |
346 | 0 | } |
347 | | |
348 | | static force_inline __m64 |
349 | | over (__m64 src, |
350 | | __m64 srca, |
351 | | __m64 dest) |
352 | 0 | { |
353 | 0 | return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca))); |
354 | 0 | } |
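A scalar sketch of the same per-channel OVER computation (illustrative only, assuming the rounded multiply shown earlier; these helpers are not part of pixman):

    #include <stdint.h>

    /* Illustrative sketch only: per-channel OVER with a premultiplied source,
     * mirroring over (src, srca, dest) = src + dest * (255 - srca). */
    static uint8_t mul_un8 (uint8_t a, uint8_t b)
    {
        uint32_t t = a * b + 128;
        return (uint8_t) ((t + (t >> 8)) >> 8);   /* same rounding as pix_multiply () */
    }

    static uint8_t over_un8 (uint8_t src, uint8_t srca, uint8_t dst)
    {
        uint32_t v = src + mul_un8 (dst, 255 - srca);
        return v > 255 ? 255 : (uint8_t) v;       /* _mm_adds_pu8 saturates */
    }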
355 | | |
356 | | static force_inline __m64 |
357 | | over_rev_non_pre (__m64 src, __m64 dest) |
358 | 0 | { |
359 | 0 | __m64 srca = expand_alpha (src); |
360 | 0 | __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha)); |
361 | |
362 | 0 | return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest); |
363 | 0 | } |
364 | | |
365 | | static force_inline __m64 |
366 | | in (__m64 src, __m64 mask) |
367 | 0 | { |
368 | 0 | return pix_multiply (src, mask); |
369 | 0 | } |
370 | | |
371 | | #ifndef _MSC_VER |
372 | | static force_inline __m64 |
373 | | in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) |
374 | 0 | { |
375 | 0 | return over (in (src, mask), pix_multiply (srca, mask), dest); |
376 | 0 | } |
377 | | |
378 | | #else |
379 | | |
380 | | #define in_over(src, srca, mask, dest) \ |
381 | | over (in (src, mask), pix_multiply (srca, mask), dest) |
382 | | |
383 | | #endif |
384 | | |
385 | | /* Elemental unaligned loads */ |
386 | | |
387 | | static force_inline __m64 ldq_u(__m64 *p) |
388 | 0 | { |
389 | 0 | #ifdef USE_X86_MMX |
390 | | /* x86's alignment restrictions are very relaxed, but that's no excuse */ |
391 | 0 | __m64 r; |
392 | 0 | memcpy(&r, p, sizeof(__m64)); |
393 | 0 | return r; |
394 | | #elif defined USE_ARM_IWMMXT |
395 | | int align = (uintptr_t)p & 7; |
396 | | __m64 *aligned_p; |
397 | | if (align == 0) |
398 | | return *p; |
399 | | aligned_p = (__m64 *)((uintptr_t)p & ~7); |
400 | | return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align); |
401 | | #else |
402 | | struct __una_u64 { __m64 x __attribute__((packed)); }; |
403 | | const struct __una_u64 *ptr = (const struct __una_u64 *) p; |
404 | | return (__m64) ptr->x; |
405 | | #endif |
406 | 0 | } |
407 | | |
408 | | static force_inline uint32_t ldl_u(const uint32_t *p) |
409 | 0 | { |
410 | 0 | #ifdef USE_X86_MMX |
411 | | /* x86's alignment restrictions are very relaxed. */ |
412 | 0 | uint32_t r; |
413 | 0 | memcpy(&r, p, sizeof(uint32_t)); |
414 | 0 | return r; |
415 | | #else |
416 | | struct __una_u32 { uint32_t x __attribute__((packed)); }; |
417 | | const struct __una_u32 *ptr = (const struct __una_u32 *) p; |
418 | | return ptr->x; |
419 | | #endif |
420 | 0 | } |
421 | | |
422 | | static force_inline __m64 |
423 | | load (const uint32_t *v) |
424 | 0 | { |
425 | | #ifdef USE_LOONGSON_MMI |
426 | | __m64 ret; |
427 | | asm ("lwc1 %0, %1\n\t" |
428 | | : "=f" (ret) |
429 | | : "m" (*v) |
430 | | ); |
431 | | return ret; |
432 | | #else |
433 | 0 | return _mm_cvtsi32_si64 (*v); |
434 | 0 | #endif |
435 | 0 | } |
436 | | |
437 | | static force_inline __m64 |
438 | | load8888 (const uint32_t *v) |
439 | 0 | { |
440 | | #ifdef USE_LOONGSON_MMI |
441 | | return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ()); |
442 | | #else |
443 | 0 | return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ()); |
444 | 0 | #endif |
445 | 0 | } |
446 | | |
447 | | static force_inline __m64 |
448 | | load8888u (const uint32_t *v) |
449 | 0 | { |
450 | 0 | uint32_t l = ldl_u (v); |
451 | 0 | return load8888 (&l); |
452 | 0 | } |
453 | | |
454 | | static force_inline __m64 |
455 | | pack8888 (__m64 lo, __m64 hi) |
456 | 0 | { |
457 | 0 | return _mm_packs_pu16 (lo, hi); |
458 | 0 | } |
459 | | |
460 | | static force_inline void |
461 | | store (uint32_t *dest, __m64 v) |
462 | 0 | { |
463 | | #ifdef USE_LOONGSON_MMI |
464 | | asm ("swc1 %1, %0\n\t" |
465 | | : "=m" (*dest) |
466 | | : "f" (v) |
467 | | : "memory" |
468 | | ); |
469 | | #else |
470 | 0 | *dest = _mm_cvtsi64_si32 (v); |
471 | 0 | #endif |
472 | 0 | } |
473 | | |
474 | | static force_inline void |
475 | | store8888 (uint32_t *dest, __m64 v) |
476 | 0 | { |
477 | 0 | v = pack8888 (v, _mm_setzero_si64 ()); |
478 | 0 | store (dest, v); |
479 | 0 | } |
480 | | |
481 | | static force_inline pixman_bool_t |
482 | | is_equal (__m64 a, __m64 b) |
483 | 0 | { |
484 | | #ifdef USE_LOONGSON_MMI |
485 | | /* __m64 is double, we can compare directly. */ |
486 | | return a == b; |
487 | | #else |
488 | 0 | return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff; |
489 | 0 | #endif |
490 | 0 | } |
491 | | |
492 | | static force_inline pixman_bool_t |
493 | | is_opaque (__m64 v) |
494 | 0 | { |
495 | | #ifdef USE_LOONGSON_MMI |
496 | | return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha)); |
497 | | #else |
498 | 0 | __m64 ffs = _mm_cmpeq_pi8 (v, v); |
499 | 0 | return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40); |
500 | 0 | #endif |
501 | 0 | } |
502 | | |
503 | | static force_inline pixman_bool_t |
504 | | is_zero (__m64 v) |
505 | 0 | { |
506 | 0 | return is_equal (v, _mm_setzero_si64 ()); |
507 | 0 | } |
508 | | |
509 | | /* Expand 16 bits positioned at @pos (0-3) of a mmx register into |
510 | | * |
511 | | * 00RR00GG00BB |
512 | | * |
513 | | * --- Expanding 565 in the low word --- |
514 | | * |
515 | | * m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
516 | | * m = m & (01f0003f001f); |
517 | | * m = m * (008404100840); |
518 | | * m = m >> 8; |
519 | | * |
520 | | * Note the trick here - the top word is shifted by another nibble to |
521 | | * avoid it bumping into the middle word |
522 | | */ |
523 | | static force_inline __m64 |
524 | | expand565 (__m64 pixel, int pos) |
525 | 0 | { |
526 | 0 | __m64 p = pixel; |
527 | 0 | __m64 t1, t2; |
528 | | |
529 | | /* move pixel to low 16 bit and zero the rest */ |
530 | | #ifdef USE_LOONGSON_MMI |
531 | | p = loongson_extract_pi16 (p, pos); |
532 | | #else |
533 | 0 | p = shift (shift (p, (3 - pos) * 16), -48); |
534 | 0 | #endif |
535 | |
536 | 0 | t1 = shift (p, 36 - 11); |
537 | 0 | t2 = shift (p, 16 - 5); |
538 | |
539 | 0 | p = _mm_or_si64 (t1, p); |
540 | 0 | p = _mm_or_si64 (t2, p); |
541 | 0 | p = _mm_and_si64 (p, MC (565_rgb)); |
542 | |
543 | 0 | pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier)); |
544 | 0 | return _mm_srli_pi16 (pixel, 8); |
545 | 0 | } |
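Both expand565() above and expand_4xpacked565() below are equivalent to widening each 5- or 6-bit channel to 8 bits by replicating its high bits into the vacated low bits; a scalar sketch of that conversion (illustrative only, not part of pixman):

    #include <stdint.h>

    /* Illustrative sketch only: expand one r5g6b5 pixel to x8r8g8b8. */
    static uint32_t expand565_scalar (uint16_t p)
    {
        uint32_t r = (p >> 11) & 0x1f;
        uint32_t g = (p >> 5)  & 0x3f;
        uint32_t b = p & 0x1f;

        r = (r << 3) | (r >> 2);   /* 5 -> 8 bits, so 0x1f maps to 0xff */
        g = (g << 2) | (g >> 4);   /* 6 -> 8 bits */
        b = (b << 3) | (b >> 2);

        return (r << 16) | (g << 8) | b;
    }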
546 | | |
547 | | /* Expand 4 16 bit pixels in an mmx register into two mmx registers of |
548 | | * |
549 | | * AARRGGBBRRGGBB |
550 | | */ |
551 | | static force_inline void |
552 | | expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha) |
553 | 0 | { |
554 | 0 | __m64 t0, t1, alpha = _mm_setzero_si64 (); |
555 | 0 | __m64 r = _mm_and_si64 (vin, MC (expand_565_r)); |
556 | 0 | __m64 g = _mm_and_si64 (vin, MC (expand_565_g)); |
557 | 0 | __m64 b = _mm_and_si64 (vin, MC (expand_565_b)); |
558 | 0 | if (full_alpha) |
559 | 0 | alpha = _mm_cmpeq_pi32 (alpha, alpha); |
560 | | |
561 | | /* Replicate high bits into empty low bits. */ |
562 | 0 | r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13)); |
563 | 0 | g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9)); |
564 | 0 | b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2)); |
565 | |
566 | 0 | r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */ |
567 | 0 | g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */ |
568 | 0 | b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */ |
569 | |
570 | 0 | t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */ |
571 | 0 | t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */ |
572 | |
573 | 0 | *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */ |
574 | 0 | *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */ |
575 | 0 | } |
576 | | |
577 | | static force_inline __m64 |
578 | | expand8888 (__m64 in, int pos) |
579 | 0 | { |
580 | 0 | if (pos == 0) |
581 | 0 | return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ()); |
582 | 0 | else |
583 | 0 | return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ()); |
584 | 0 | } |
585 | | |
586 | | static force_inline __m64 |
587 | | expandx888 (__m64 in, int pos) |
588 | 0 | { |
589 | 0 | return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha)); |
590 | 0 | } |
591 | | |
592 | | static force_inline void |
593 | | expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha) |
594 | 0 | { |
595 | 0 | __m64 v0, v1; |
596 | 0 | expand_4xpacked565 (vin, &v0, &v1, full_alpha); |
597 | 0 | *vout0 = expand8888 (v0, 0); |
598 | 0 | *vout1 = expand8888 (v0, 1); |
599 | 0 | *vout2 = expand8888 (v1, 0); |
600 | 0 | *vout3 = expand8888 (v1, 1); |
601 | 0 | } |
602 | | |
603 | | static force_inline __m64 |
604 | | pack_565 (__m64 pixel, __m64 target, int pos) |
605 | 0 | { |
606 | 0 | __m64 p = pixel; |
607 | 0 | __m64 t = target; |
608 | 0 | __m64 r, g, b; |
609 | |
610 | 0 | r = _mm_and_si64 (p, MC (565_r)); |
611 | 0 | g = _mm_and_si64 (p, MC (565_g)); |
612 | 0 | b = _mm_and_si64 (p, MC (565_b)); |
613 | |
614 | | #ifdef USE_LOONGSON_MMI |
615 | | r = shift (r, -(32 - 8)); |
616 | | g = shift (g, -(16 - 3)); |
617 | | b = shift (b, -(0 + 3)); |
618 | | |
619 | | p = _mm_or_si64 (r, g); |
620 | | p = _mm_or_si64 (p, b); |
621 | | return loongson_insert_pi16 (t, p, pos); |
622 | | #else |
623 | 0 | r = shift (r, -(32 - 8) + pos * 16); |
624 | 0 | g = shift (g, -(16 - 3) + pos * 16); |
625 | 0 | b = shift (b, -(0 + 3) + pos * 16); |
626 | |
627 | 0 | if (pos == 0) |
628 | 0 | t = _mm_and_si64 (t, MC (mask_0)); |
629 | 0 | else if (pos == 1) |
630 | 0 | t = _mm_and_si64 (t, MC (mask_1)); |
631 | 0 | else if (pos == 2) |
632 | 0 | t = _mm_and_si64 (t, MC (mask_2)); |
633 | 0 | else if (pos == 3) |
634 | 0 | t = _mm_and_si64 (t, MC (mask_3)); |
635 | |
636 | 0 | p = _mm_or_si64 (r, t); |
637 | 0 | p = _mm_or_si64 (g, p); |
638 | |
639 | 0 | return _mm_or_si64 (b, p); |
640 | 0 | #endif |
641 | 0 | } |
642 | | |
643 | | static force_inline __m64 |
644 | | pack_4xpacked565 (__m64 a, __m64 b) |
645 | 0 | { |
646 | 0 | __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb)); |
647 | 0 | __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb)); |
648 | |
649 | 0 | __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier)); |
650 | 0 | __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier)); |
651 | |
652 | 0 | __m64 g0 = _mm_and_si64 (a, MC (packed_565_g)); |
653 | 0 | __m64 g1 = _mm_and_si64 (b, MC (packed_565_g)); |
654 | |
655 | 0 | t0 = _mm_or_si64 (t0, g0); |
656 | 0 | t1 = _mm_or_si64 (t1, g1); |
657 | |
658 | 0 | t0 = shift(t0, -5); |
659 | | #ifdef USE_ARM_IWMMXT |
660 | | t1 = shift(t1, -5); |
661 | | return _mm_packs_pu32 (t0, t1); |
662 | | #else |
663 | 0 | t1 = shift(t1, -5 + 16); |
664 | 0 | return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0)); |
665 | 0 | #endif |
666 | 0 | } |
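The packing paths above keep the top 5/6/5 bits of each channel; the multiply-add constants simply do this for several pixels at once. A per-pixel scalar sketch (illustrative only, not part of pixman):

    #include <stdint.h>

    /* Illustrative sketch only: pack one x8r8g8b8 pixel down to r5g6b5. */
    static uint16_t pack565_scalar (uint32_t p)
    {
        uint32_t r = (p >> 16) & 0xff;
        uint32_t g = (p >> 8) & 0xff;
        uint32_t b = p & 0xff;

        return (uint16_t) (((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }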
667 | | |
668 | | #ifndef _MSC_VER |
669 | | |
670 | | static force_inline __m64 |
671 | | pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3) |
672 | 0 | { |
673 | 0 | return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)); |
674 | 0 | } |
675 | | |
676 | | static force_inline __m64 |
677 | | pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) |
678 | 0 | { |
679 | 0 | x = pix_multiply (x, a); |
680 | 0 | y = pix_multiply (y, b); |
681 | |
682 | 0 | return pix_add (x, y); |
683 | 0 | } |
684 | | |
685 | | #else |
686 | | |
687 | | /* MSVC only handles a "pass by register" of up to three SSE intrinsics */ |
688 | | |
689 | | #define pack_4x565(v0, v1, v2, v3) \ |
690 | | pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)) |
691 | | |
692 | | #define pix_add_mul(x, a, y, b) \ |
693 | | ( x = pix_multiply (x, a), \ |
694 | | y = pix_multiply (y, b), \ |
695 | | pix_add (x, y) ) |
696 | | |
697 | | #endif |
698 | | |
699 | | /* --------------- MMX code patch for fbcompose.c --------------------- */ |
700 | | |
701 | | static force_inline __m64 |
702 | | combine (const uint32_t *src, const uint32_t *mask) |
703 | 0 | { |
704 | 0 | __m64 vsrc = load8888 (src); |
705 | |
706 | 0 | if (mask) |
707 | 0 | { |
708 | 0 | __m64 m = load8888 (mask); |
709 | |
710 | 0 | m = expand_alpha (m); |
711 | 0 | vsrc = pix_multiply (vsrc, m); |
712 | 0 | } |
713 | |
714 | 0 | return vsrc; |
715 | 0 | } |
716 | | |
717 | | static force_inline __m64 |
718 | | core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) |
719 | 0 | { |
720 | 0 | vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ()); |
721 | |
722 | 0 | if (is_opaque (vsrc)) |
723 | 0 | { |
724 | 0 | return vsrc; |
725 | 0 | } |
726 | 0 | else if (!is_zero (vsrc)) |
727 | 0 | { |
728 | 0 | return over (vsrc, expand_alpha (vsrc), |
729 | 0 | _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ())); |
730 | 0 | } |
731 | | |
732 | 0 | return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); |
733 | 0 | } |
734 | | |
735 | | static void |
736 | | mmx_combine_over_u (pixman_implementation_t *imp, |
737 | | pixman_op_t op, |
738 | | uint32_t * dest, |
739 | | const uint32_t * src, |
740 | | const uint32_t * mask, |
741 | | int width) |
742 | 0 | { |
743 | 0 | const uint32_t *end = dest + width; |
744 | |
745 | 0 | while (dest < end) |
746 | 0 | { |
747 | 0 | __m64 vsrc = combine (src, mask); |
748 | |
749 | 0 | if (is_opaque (vsrc)) |
750 | 0 | { |
751 | 0 | store8888 (dest, vsrc); |
752 | 0 | } |
753 | 0 | else if (!is_zero (vsrc)) |
754 | 0 | { |
755 | 0 | __m64 sa = expand_alpha (vsrc); |
756 | 0 | store8888 (dest, over (vsrc, sa, load8888 (dest))); |
757 | 0 | } |
758 | |
759 | 0 | ++dest; |
760 | 0 | ++src; |
761 | 0 | if (mask) |
762 | 0 | ++mask; |
763 | 0 | } |
764 | 0 | _mm_empty (); |
765 | 0 | } |
766 | | |
767 | | static void |
768 | | mmx_combine_over_reverse_u (pixman_implementation_t *imp, |
769 | | pixman_op_t op, |
770 | | uint32_t * dest, |
771 | | const uint32_t * src, |
772 | | const uint32_t * mask, |
773 | | int width) |
774 | 0 | { |
775 | 0 | const uint32_t *end = dest + width; |
776 | |
777 | 0 | while (dest < end) |
778 | 0 | { |
779 | 0 | __m64 d, da; |
780 | 0 | __m64 s = combine (src, mask); |
781 | |
782 | 0 | d = load8888 (dest); |
783 | 0 | da = expand_alpha (d); |
784 | 0 | store8888 (dest, over (d, da, s)); |
785 | |
786 | 0 | ++dest; |
787 | 0 | ++src; |
788 | 0 | if (mask) |
789 | 0 | mask++; |
790 | 0 | } |
791 | 0 | _mm_empty (); |
792 | 0 | } |
793 | | |
794 | | static void |
795 | | mmx_combine_in_u (pixman_implementation_t *imp, |
796 | | pixman_op_t op, |
797 | | uint32_t * dest, |
798 | | const uint32_t * src, |
799 | | const uint32_t * mask, |
800 | | int width) |
801 | 0 | { |
802 | 0 | const uint32_t *end = dest + width; |
803 | |
804 | 0 | while (dest < end) |
805 | 0 | { |
806 | 0 | __m64 a; |
807 | 0 | __m64 x = combine (src, mask); |
808 | |
809 | 0 | a = load8888 (dest); |
810 | 0 | a = expand_alpha (a); |
811 | 0 | x = pix_multiply (x, a); |
812 | |
813 | 0 | store8888 (dest, x); |
814 | |
815 | 0 | ++dest; |
816 | 0 | ++src; |
817 | 0 | if (mask) |
818 | 0 | mask++; |
819 | 0 | } |
820 | 0 | _mm_empty (); |
821 | 0 | } |
822 | | |
823 | | static void |
824 | | mmx_combine_in_reverse_u (pixman_implementation_t *imp, |
825 | | pixman_op_t op, |
826 | | uint32_t * dest, |
827 | | const uint32_t * src, |
828 | | const uint32_t * mask, |
829 | | int width) |
830 | 0 | { |
831 | 0 | const uint32_t *end = dest + width; |
832 | |
833 | 0 | while (dest < end) |
834 | 0 | { |
835 | 0 | __m64 a = combine (src, mask); |
836 | 0 | __m64 x; |
837 | |
838 | 0 | x = load8888 (dest); |
839 | 0 | a = expand_alpha (a); |
840 | 0 | x = pix_multiply (x, a); |
841 | 0 | store8888 (dest, x); |
842 | |
843 | 0 | ++dest; |
844 | 0 | ++src; |
845 | 0 | if (mask) |
846 | 0 | mask++; |
847 | 0 | } |
848 | 0 | _mm_empty (); |
849 | 0 | } |
850 | | |
851 | | static void |
852 | | mmx_combine_out_u (pixman_implementation_t *imp, |
853 | | pixman_op_t op, |
854 | | uint32_t * dest, |
855 | | const uint32_t * src, |
856 | | const uint32_t * mask, |
857 | | int width) |
858 | 0 | { |
859 | 0 | const uint32_t *end = dest + width; |
860 | |
861 | 0 | while (dest < end) |
862 | 0 | { |
863 | 0 | __m64 a; |
864 | 0 | __m64 x = combine (src, mask); |
865 | |
866 | 0 | a = load8888 (dest); |
867 | 0 | a = expand_alpha (a); |
868 | 0 | a = negate (a); |
869 | 0 | x = pix_multiply (x, a); |
870 | 0 | store8888 (dest, x); |
871 | |
872 | 0 | ++dest; |
873 | 0 | ++src; |
874 | 0 | if (mask) |
875 | 0 | mask++; |
876 | 0 | } |
877 | 0 | _mm_empty (); |
878 | 0 | } |
879 | | |
880 | | static void |
881 | | mmx_combine_out_reverse_u (pixman_implementation_t *imp, |
882 | | pixman_op_t op, |
883 | | uint32_t * dest, |
884 | | const uint32_t * src, |
885 | | const uint32_t * mask, |
886 | | int width) |
887 | 0 | { |
888 | 0 | const uint32_t *end = dest + width; |
889 | |
890 | 0 | while (dest < end) |
891 | 0 | { |
892 | 0 | __m64 a = combine (src, mask); |
893 | 0 | __m64 x; |
894 | |
895 | 0 | x = load8888 (dest); |
896 | 0 | a = expand_alpha (a); |
897 | 0 | a = negate (a); |
898 | 0 | x = pix_multiply (x, a); |
899 | |
900 | 0 | store8888 (dest, x); |
901 | |
902 | 0 | ++dest; |
903 | 0 | ++src; |
904 | 0 | if (mask) |
905 | 0 | mask++; |
906 | 0 | } |
907 | 0 | _mm_empty (); |
908 | 0 | } |
909 | | |
910 | | static void |
911 | | mmx_combine_atop_u (pixman_implementation_t *imp, |
912 | | pixman_op_t op, |
913 | | uint32_t * dest, |
914 | | const uint32_t * src, |
915 | | const uint32_t * mask, |
916 | | int width) |
917 | 0 | { |
918 | 0 | const uint32_t *end = dest + width; |
919 | |
920 | 0 | while (dest < end) |
921 | 0 | { |
922 | 0 | __m64 da, d, sia; |
923 | 0 | __m64 s = combine (src, mask); |
924 | |
925 | 0 | d = load8888 (dest); |
926 | 0 | sia = expand_alpha (s); |
927 | 0 | sia = negate (sia); |
928 | 0 | da = expand_alpha (d); |
929 | 0 | s = pix_add_mul (s, da, d, sia); |
930 | 0 | store8888 (dest, s); |
931 | |
932 | 0 | ++dest; |
933 | 0 | ++src; |
934 | 0 | if (mask) |
935 | 0 | mask++; |
936 | 0 | } |
937 | 0 | _mm_empty (); |
938 | 0 | } |
939 | | |
940 | | static void |
941 | | mmx_combine_atop_reverse_u (pixman_implementation_t *imp, |
942 | | pixman_op_t op, |
943 | | uint32_t * dest, |
944 | | const uint32_t * src, |
945 | | const uint32_t * mask, |
946 | | int width) |
947 | 0 | { |
948 | 0 | const uint32_t *end; |
949 | |
950 | 0 | end = dest + width; |
951 | |
952 | 0 | while (dest < end) |
953 | 0 | { |
954 | 0 | __m64 dia, d, sa; |
955 | 0 | __m64 s = combine (src, mask); |
956 | |
957 | 0 | d = load8888 (dest); |
958 | 0 | sa = expand_alpha (s); |
959 | 0 | dia = expand_alpha (d); |
960 | 0 | dia = negate (dia); |
961 | 0 | s = pix_add_mul (s, dia, d, sa); |
962 | 0 | store8888 (dest, s); |
963 | |
964 | 0 | ++dest; |
965 | 0 | ++src; |
966 | 0 | if (mask) |
967 | 0 | mask++; |
968 | 0 | } |
969 | 0 | _mm_empty (); |
970 | 0 | } |
971 | | |
972 | | static void |
973 | | mmx_combine_xor_u (pixman_implementation_t *imp, |
974 | | pixman_op_t op, |
975 | | uint32_t * dest, |
976 | | const uint32_t * src, |
977 | | const uint32_t * mask, |
978 | | int width) |
979 | 0 | { |
980 | 0 | const uint32_t *end = dest + width; |
981 | |
982 | 0 | while (dest < end) |
983 | 0 | { |
984 | 0 | __m64 dia, d, sia; |
985 | 0 | __m64 s = combine (src, mask); |
986 | |
987 | 0 | d = load8888 (dest); |
988 | 0 | sia = expand_alpha (s); |
989 | 0 | dia = expand_alpha (d); |
990 | 0 | sia = negate (sia); |
991 | 0 | dia = negate (dia); |
992 | 0 | s = pix_add_mul (s, dia, d, sia); |
993 | 0 | store8888 (dest, s); |
994 | |
995 | 0 | ++dest; |
996 | 0 | ++src; |
997 | 0 | if (mask) |
998 | 0 | mask++; |
999 | 0 | } |
1000 | 0 | _mm_empty (); |
1001 | 0 | } |
1002 | | |
1003 | | static void |
1004 | | mmx_combine_add_u (pixman_implementation_t *imp, |
1005 | | pixman_op_t op, |
1006 | | uint32_t * dest, |
1007 | | const uint32_t * src, |
1008 | | const uint32_t * mask, |
1009 | | int width) |
1010 | 0 | { |
1011 | 0 | const uint32_t *end = dest + width; |
1012 | |
1013 | 0 | while (dest < end) |
1014 | 0 | { |
1015 | 0 | __m64 d; |
1016 | 0 | __m64 s = combine (src, mask); |
1017 | |
1018 | 0 | d = load8888 (dest); |
1019 | 0 | s = pix_add (s, d); |
1020 | 0 | store8888 (dest, s); |
1021 | |
1022 | 0 | ++dest; |
1023 | 0 | ++src; |
1024 | 0 | if (mask) |
1025 | 0 | mask++; |
1026 | 0 | } |
1027 | 0 | _mm_empty (); |
1028 | 0 | } |
1029 | | |
1030 | | static void |
1031 | | mmx_combine_saturate_u (pixman_implementation_t *imp, |
1032 | | pixman_op_t op, |
1033 | | uint32_t * dest, |
1034 | | const uint32_t * src, |
1035 | | const uint32_t * mask, |
1036 | | int width) |
1037 | 0 | { |
1038 | 0 | const uint32_t *end = dest + width; |
1039 | |
1040 | 0 | while (dest < end) |
1041 | 0 | { |
1042 | 0 | uint32_t s, sa, da; |
1043 | 0 | uint32_t d = *dest; |
1044 | 0 | __m64 ms = combine (src, mask); |
1045 | 0 | __m64 md = load8888 (dest); |
1046 | |
1047 | 0 | store8888(&s, ms); |
1048 | 0 | da = ~d >> 24; |
1049 | 0 | sa = s >> 24; |
1050 | |
1051 | 0 | if (sa > da) |
1052 | 0 | { |
1053 | 0 | uint32_t quot = DIV_UN8 (da, sa) << 24; |
1054 | 0 | __m64 msa = load8888 (&quot); |
1055 | 0 | msa = expand_alpha (msa); |
1056 | 0 | ms = pix_multiply (ms, msa); |
1057 | 0 | } |
1058 | |
1059 | 0 | md = pix_add (md, ms); |
1060 | 0 | store8888 (dest, md); |
1061 | |
1062 | 0 | ++src; |
1063 | 0 | ++dest; |
1064 | 0 | if (mask) |
1065 | 0 | mask++; |
1066 | 0 | } |
1067 | 0 | _mm_empty (); |
1068 | 0 | } |
1069 | | |
1070 | | static void |
1071 | | mmx_combine_src_ca (pixman_implementation_t *imp, |
1072 | | pixman_op_t op, |
1073 | | uint32_t * dest, |
1074 | | const uint32_t * src, |
1075 | | const uint32_t * mask, |
1076 | | int width) |
1077 | 0 | { |
1078 | 0 | const uint32_t *end = src + width; |
1079 | |
1080 | 0 | while (src < end) |
1081 | 0 | { |
1082 | 0 | __m64 a = load8888 (mask); |
1083 | 0 | __m64 s = load8888 (src); |
1084 | |
1085 | 0 | s = pix_multiply (s, a); |
1086 | 0 | store8888 (dest, s); |
1087 | |
1088 | 0 | ++src; |
1089 | 0 | ++mask; |
1090 | 0 | ++dest; |
1091 | 0 | } |
1092 | 0 | _mm_empty (); |
1093 | 0 | } |
1094 | | |
1095 | | static void |
1096 | | mmx_combine_over_ca (pixman_implementation_t *imp, |
1097 | | pixman_op_t op, |
1098 | | uint32_t * dest, |
1099 | | const uint32_t * src, |
1100 | | const uint32_t * mask, |
1101 | | int width) |
1102 | 0 | { |
1103 | 0 | const uint32_t *end = src + width; |
1104 | |
1105 | 0 | while (src < end) |
1106 | 0 | { |
1107 | 0 | __m64 a = load8888 (mask); |
1108 | 0 | __m64 s = load8888 (src); |
1109 | 0 | __m64 d = load8888 (dest); |
1110 | 0 | __m64 sa = expand_alpha (s); |
1111 | |
1112 | 0 | store8888 (dest, in_over (s, sa, a, d)); |
1113 | |
1114 | 0 | ++src; |
1115 | 0 | ++dest; |
1116 | 0 | ++mask; |
1117 | 0 | } |
1118 | 0 | _mm_empty (); |
1119 | 0 | } |
1120 | | |
1121 | | static void |
1122 | | mmx_combine_over_reverse_ca (pixman_implementation_t *imp, |
1123 | | pixman_op_t op, |
1124 | | uint32_t * dest, |
1125 | | const uint32_t * src, |
1126 | | const uint32_t * mask, |
1127 | | int width) |
1128 | 0 | { |
1129 | 0 | const uint32_t *end = src + width; |
1130 | |
1131 | 0 | while (src < end) |
1132 | 0 | { |
1133 | 0 | __m64 a = load8888 (mask); |
1134 | 0 | __m64 s = load8888 (src); |
1135 | 0 | __m64 d = load8888 (dest); |
1136 | 0 | __m64 da = expand_alpha (d); |
1137 | |
1138 | 0 | store8888 (dest, over (d, da, in (s, a))); |
1139 | |
1140 | 0 | ++src; |
1141 | 0 | ++dest; |
1142 | 0 | ++mask; |
1143 | 0 | } |
1144 | 0 | _mm_empty (); |
1145 | 0 | } |
1146 | | |
1147 | | static void |
1148 | | mmx_combine_in_ca (pixman_implementation_t *imp, |
1149 | | pixman_op_t op, |
1150 | | uint32_t * dest, |
1151 | | const uint32_t * src, |
1152 | | const uint32_t * mask, |
1153 | | int width) |
1154 | 0 | { |
1155 | 0 | const uint32_t *end = src + width; |
1156 | |
1157 | 0 | while (src < end) |
1158 | 0 | { |
1159 | 0 | __m64 a = load8888 (mask); |
1160 | 0 | __m64 s = load8888 (src); |
1161 | 0 | __m64 d = load8888 (dest); |
1162 | 0 | __m64 da = expand_alpha (d); |
1163 | |
1164 | 0 | s = pix_multiply (s, a); |
1165 | 0 | s = pix_multiply (s, da); |
1166 | 0 | store8888 (dest, s); |
1167 | |
1168 | 0 | ++src; |
1169 | 0 | ++dest; |
1170 | 0 | ++mask; |
1171 | 0 | } |
1172 | 0 | _mm_empty (); |
1173 | 0 | } |
1174 | | |
1175 | | static void |
1176 | | mmx_combine_in_reverse_ca (pixman_implementation_t *imp, |
1177 | | pixman_op_t op, |
1178 | | uint32_t * dest, |
1179 | | const uint32_t * src, |
1180 | | const uint32_t * mask, |
1181 | | int width) |
1182 | 0 | { |
1183 | 0 | const uint32_t *end = src + width; |
1184 | |
1185 | 0 | while (src < end) |
1186 | 0 | { |
1187 | 0 | __m64 a = load8888 (mask); |
1188 | 0 | __m64 s = load8888 (src); |
1189 | 0 | __m64 d = load8888 (dest); |
1190 | 0 | __m64 sa = expand_alpha (s); |
1191 | |
1192 | 0 | a = pix_multiply (a, sa); |
1193 | 0 | d = pix_multiply (d, a); |
1194 | 0 | store8888 (dest, d); |
1195 | |
1196 | 0 | ++src; |
1197 | 0 | ++dest; |
1198 | 0 | ++mask; |
1199 | 0 | } |
1200 | 0 | _mm_empty (); |
1201 | 0 | } |
1202 | | |
1203 | | static void |
1204 | | mmx_combine_out_ca (pixman_implementation_t *imp, |
1205 | | pixman_op_t op, |
1206 | | uint32_t * dest, |
1207 | | const uint32_t * src, |
1208 | | const uint32_t * mask, |
1209 | | int width) |
1210 | 0 | { |
1211 | 0 | const uint32_t *end = src + width; |
1212 | |
1213 | 0 | while (src < end) |
1214 | 0 | { |
1215 | 0 | __m64 a = load8888 (mask); |
1216 | 0 | __m64 s = load8888 (src); |
1217 | 0 | __m64 d = load8888 (dest); |
1218 | 0 | __m64 da = expand_alpha (d); |
1219 | |
1220 | 0 | da = negate (da); |
1221 | 0 | s = pix_multiply (s, a); |
1222 | 0 | s = pix_multiply (s, da); |
1223 | 0 | store8888 (dest, s); |
1224 | |
1225 | 0 | ++src; |
1226 | 0 | ++dest; |
1227 | 0 | ++mask; |
1228 | 0 | } |
1229 | 0 | _mm_empty (); |
1230 | 0 | } |
1231 | | |
1232 | | static void |
1233 | | mmx_combine_out_reverse_ca (pixman_implementation_t *imp, |
1234 | | pixman_op_t op, |
1235 | | uint32_t * dest, |
1236 | | const uint32_t * src, |
1237 | | const uint32_t * mask, |
1238 | | int width) |
1239 | 0 | { |
1240 | 0 | const uint32_t *end = src + width; |
1241 | |
1242 | 0 | while (src < end) |
1243 | 0 | { |
1244 | 0 | __m64 a = load8888 (mask); |
1245 | 0 | __m64 s = load8888 (src); |
1246 | 0 | __m64 d = load8888 (dest); |
1247 | 0 | __m64 sa = expand_alpha (s); |
1248 | |
1249 | 0 | a = pix_multiply (a, sa); |
1250 | 0 | a = negate (a); |
1251 | 0 | d = pix_multiply (d, a); |
1252 | 0 | store8888 (dest, d); |
1253 | |
1254 | 0 | ++src; |
1255 | 0 | ++dest; |
1256 | 0 | ++mask; |
1257 | 0 | } |
1258 | 0 | _mm_empty (); |
1259 | 0 | } |
1260 | | |
1261 | | static void |
1262 | | mmx_combine_atop_ca (pixman_implementation_t *imp, |
1263 | | pixman_op_t op, |
1264 | | uint32_t * dest, |
1265 | | const uint32_t * src, |
1266 | | const uint32_t * mask, |
1267 | | int width) |
1268 | 0 | { |
1269 | 0 | const uint32_t *end = src + width; |
1270 | |
1271 | 0 | while (src < end) |
1272 | 0 | { |
1273 | 0 | __m64 a = load8888 (mask); |
1274 | 0 | __m64 s = load8888 (src); |
1275 | 0 | __m64 d = load8888 (dest); |
1276 | 0 | __m64 da = expand_alpha (d); |
1277 | 0 | __m64 sa = expand_alpha (s); |
1278 | |
1279 | 0 | s = pix_multiply (s, a); |
1280 | 0 | a = pix_multiply (a, sa); |
1281 | 0 | a = negate (a); |
1282 | 0 | d = pix_add_mul (d, a, s, da); |
1283 | 0 | store8888 (dest, d); |
1284 | |
1285 | 0 | ++src; |
1286 | 0 | ++dest; |
1287 | 0 | ++mask; |
1288 | 0 | } |
1289 | 0 | _mm_empty (); |
1290 | 0 | } |
1291 | | |
1292 | | static void |
1293 | | mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, |
1294 | | pixman_op_t op, |
1295 | | uint32_t * dest, |
1296 | | const uint32_t * src, |
1297 | | const uint32_t * mask, |
1298 | | int width) |
1299 | 0 | { |
1300 | 0 | const uint32_t *end = src + width; |
1301 | |
1302 | 0 | while (src < end) |
1303 | 0 | { |
1304 | 0 | __m64 a = load8888 (mask); |
1305 | 0 | __m64 s = load8888 (src); |
1306 | 0 | __m64 d = load8888 (dest); |
1307 | 0 | __m64 da = expand_alpha (d); |
1308 | 0 | __m64 sa = expand_alpha (s); |
1309 | |
1310 | 0 | s = pix_multiply (s, a); |
1311 | 0 | a = pix_multiply (a, sa); |
1312 | 0 | da = negate (da); |
1313 | 0 | d = pix_add_mul (d, a, s, da); |
1314 | 0 | store8888 (dest, d); |
1315 | |
1316 | 0 | ++src; |
1317 | 0 | ++dest; |
1318 | 0 | ++mask; |
1319 | 0 | } |
1320 | 0 | _mm_empty (); |
1321 | 0 | } |
1322 | | |
1323 | | static void |
1324 | | mmx_combine_xor_ca (pixman_implementation_t *imp, |
1325 | | pixman_op_t op, |
1326 | | uint32_t * dest, |
1327 | | const uint32_t * src, |
1328 | | const uint32_t * mask, |
1329 | | int width) |
1330 | 0 | { |
1331 | 0 | const uint32_t *end = src + width; |
1332 | |
1333 | 0 | while (src < end) |
1334 | 0 | { |
1335 | 0 | __m64 a = load8888 (mask); |
1336 | 0 | __m64 s = load8888 (src); |
1337 | 0 | __m64 d = load8888 (dest); |
1338 | 0 | __m64 da = expand_alpha (d); |
1339 | 0 | __m64 sa = expand_alpha (s); |
1340 | |
1341 | 0 | s = pix_multiply (s, a); |
1342 | 0 | a = pix_multiply (a, sa); |
1343 | 0 | da = negate (da); |
1344 | 0 | a = negate (a); |
1345 | 0 | d = pix_add_mul (d, a, s, da); |
1346 | 0 | store8888 (dest, d); |
1347 | |
1348 | 0 | ++src; |
1349 | 0 | ++dest; |
1350 | 0 | ++mask; |
1351 | 0 | } |
1352 | 0 | _mm_empty (); |
1353 | 0 | } |
1354 | | |
1355 | | static void |
1356 | | mmx_combine_add_ca (pixman_implementation_t *imp, |
1357 | | pixman_op_t op, |
1358 | | uint32_t * dest, |
1359 | | const uint32_t * src, |
1360 | | const uint32_t * mask, |
1361 | | int width) |
1362 | 0 | { |
1363 | 0 | const uint32_t *end = src + width; |
1364 | |
1365 | 0 | while (src < end) |
1366 | 0 | { |
1367 | 0 | __m64 a = load8888 (mask); |
1368 | 0 | __m64 s = load8888 (src); |
1369 | 0 | __m64 d = load8888 (dest); |
1370 | |
1371 | 0 | s = pix_multiply (s, a); |
1372 | 0 | d = pix_add (s, d); |
1373 | 0 | store8888 (dest, d); |
1374 | |
1375 | 0 | ++src; |
1376 | 0 | ++dest; |
1377 | 0 | ++mask; |
1378 | 0 | } |
1379 | 0 | _mm_empty (); |
1380 | 0 | } |
1381 | | |
1382 | | /* ------------- MMX code paths called from fbpict.c -------------------- */ |
1383 | | |
1384 | | static void |
1385 | | mmx_composite_over_n_8888 (pixman_implementation_t *imp, |
1386 | | pixman_composite_info_t *info) |
1387 | 0 | { |
1388 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1389 | 0 | uint32_t src; |
1390 | 0 | uint32_t *dst_line, *dst; |
1391 | 0 | int32_t w; |
1392 | 0 | int dst_stride; |
1393 | 0 | __m64 vsrc, vsrca; |
1394 | |
1395 | 0 | CHECKPOINT (); |
1396 | |
1397 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1398 | |
1399 | 0 | if (src == 0) |
1400 | 0 | return; |
1401 | | |
1402 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1403 | |
1404 | 0 | vsrc = load8888 (&src); |
1405 | 0 | vsrca = expand_alpha (vsrc); |
1406 | |
1407 | 0 | while (height--) |
1408 | 0 | { |
1409 | 0 | dst = dst_line; |
1410 | 0 | dst_line += dst_stride; |
1411 | 0 | w = width; |
1412 | |
1413 | 0 | CHECKPOINT (); |
1414 | |
1415 | 0 | while (w && (uintptr_t)dst & 7) |
1416 | 0 | { |
1417 | 0 | store8888 (dst, over (vsrc, vsrca, load8888 (dst))); |
1418 | |
1419 | 0 | w--; |
1420 | 0 | dst++; |
1421 | 0 | } |
1422 | |
1423 | 0 | while (w >= 2) |
1424 | 0 | { |
1425 | 0 | __m64 vdest; |
1426 | 0 | __m64 dest0, dest1; |
1427 | |
1428 | 0 | vdest = *(__m64 *)dst; |
1429 | |
1430 | 0 | dest0 = over (vsrc, vsrca, expand8888 (vdest, 0)); |
1431 | 0 | dest1 = over (vsrc, vsrca, expand8888 (vdest, 1)); |
1432 | |
1433 | 0 | *(__m64 *)dst = pack8888 (dest0, dest1); |
1434 | |
1435 | 0 | dst += 2; |
1436 | 0 | w -= 2; |
1437 | 0 | } |
1438 | |
1439 | 0 | CHECKPOINT (); |
1440 | |
1441 | 0 | if (w) |
1442 | 0 | { |
1443 | 0 | store8888 (dst, over (vsrc, vsrca, load8888 (dst))); |
1444 | 0 | } |
1445 | 0 | } |
1446 | |
1447 | 0 | _mm_empty (); |
1448 | 0 | } |
1449 | | |
1450 | | static void |
1451 | | mmx_composite_over_n_0565 (pixman_implementation_t *imp, |
1452 | | pixman_composite_info_t *info) |
1453 | 0 | { |
1454 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1455 | 0 | uint32_t src; |
1456 | 0 | uint16_t *dst_line, *dst; |
1457 | 0 | int32_t w; |
1458 | 0 | int dst_stride; |
1459 | 0 | __m64 vsrc, vsrca; |
1460 | |
1461 | 0 | CHECKPOINT (); |
1462 | |
1463 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1464 | |
1465 | 0 | if (src == 0) |
1466 | 0 | return; |
1467 | | |
1468 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
1469 | |
1470 | 0 | vsrc = load8888 (&src); |
1471 | 0 | vsrca = expand_alpha (vsrc); |
1472 | |
1473 | 0 | while (height--) |
1474 | 0 | { |
1475 | 0 | dst = dst_line; |
1476 | 0 | dst_line += dst_stride; |
1477 | 0 | w = width; |
1478 | |
1479 | 0 | CHECKPOINT (); |
1480 | |
1481 | 0 | while (w && (uintptr_t)dst & 7) |
1482 | 0 | { |
1483 | 0 | uint64_t d = *dst; |
1484 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
1485 | |
1486 | 0 | vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); |
1487 | 0 | *dst = to_uint64 (vdest); |
1488 | |
1489 | 0 | w--; |
1490 | 0 | dst++; |
1491 | 0 | } |
1492 | |
1493 | 0 | while (w >= 4) |
1494 | 0 | { |
1495 | 0 | __m64 vdest = *(__m64 *)dst; |
1496 | 0 | __m64 v0, v1, v2, v3; |
1497 | |
1498 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
1499 | |
1500 | 0 | v0 = over (vsrc, vsrca, v0); |
1501 | 0 | v1 = over (vsrc, vsrca, v1); |
1502 | 0 | v2 = over (vsrc, vsrca, v2); |
1503 | 0 | v3 = over (vsrc, vsrca, v3); |
1504 | |
1505 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
1506 | |
1507 | 0 | dst += 4; |
1508 | 0 | w -= 4; |
1509 | 0 | } |
1510 | |
1511 | 0 | CHECKPOINT (); |
1512 | |
1513 | 0 | while (w) |
1514 | 0 | { |
1515 | 0 | uint64_t d = *dst; |
1516 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
1517 | |
1518 | 0 | vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); |
1519 | 0 | *dst = to_uint64 (vdest); |
1520 | |
1521 | 0 | w--; |
1522 | 0 | dst++; |
1523 | 0 | } |
1524 | 0 | } |
1525 | |
1526 | 0 | _mm_empty (); |
1527 | 0 | } |
1528 | | |
1529 | | static void |
1530 | | mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, |
1531 | | pixman_composite_info_t *info) |
1532 | 0 | { |
1533 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1534 | 0 | uint32_t src; |
1535 | 0 | uint32_t *dst_line; |
1536 | 0 | uint32_t *mask_line; |
1537 | 0 | int dst_stride, mask_stride; |
1538 | 0 | __m64 vsrc, vsrca; |
1539 | |
1540 | 0 | CHECKPOINT (); |
1541 | |
1542 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1543 | |
1544 | 0 | if (src == 0) |
1545 | 0 | return; |
1546 | | |
1547 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1548 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
1549 | |
1550 | 0 | vsrc = load8888 (&src); |
1551 | 0 | vsrca = expand_alpha (vsrc); |
1552 | |
1553 | 0 | while (height--) |
1554 | 0 | { |
1555 | 0 | int twidth = width; |
1556 | 0 | uint32_t *p = (uint32_t *)mask_line; |
1557 | 0 | uint32_t *q = (uint32_t *)dst_line; |
1558 | |
1559 | 0 | while (twidth && (uintptr_t)q & 7) |
1560 | 0 | { |
1561 | 0 | uint32_t m = *(uint32_t *)p; |
1562 | |
1563 | 0 | if (m) |
1564 | 0 | { |
1565 | 0 | __m64 vdest = load8888 (q); |
1566 | 0 | vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); |
1567 | 0 | store8888 (q, vdest); |
1568 | 0 | } |
1569 | |
1570 | 0 | twidth--; |
1571 | 0 | p++; |
1572 | 0 | q++; |
1573 | 0 | } |
1574 | |
1575 | 0 | while (twidth >= 2) |
1576 | 0 | { |
1577 | 0 | uint32_t m0, m1; |
1578 | 0 | m0 = *p; |
1579 | 0 | m1 = *(p + 1); |
1580 | |
1581 | 0 | if (m0 | m1) |
1582 | 0 | { |
1583 | 0 | __m64 dest0, dest1; |
1584 | 0 | __m64 vdest = *(__m64 *)q; |
1585 | |
1586 | 0 | dest0 = in_over (vsrc, vsrca, load8888 (&m0), |
1587 | 0 | expand8888 (vdest, 0)); |
1588 | 0 | dest1 = in_over (vsrc, vsrca, load8888 (&m1), |
1589 | 0 | expand8888 (vdest, 1)); |
1590 | |
1591 | 0 | *(__m64 *)q = pack8888 (dest0, dest1); |
1592 | 0 | } |
1593 | |
1594 | 0 | p += 2; |
1595 | 0 | q += 2; |
1596 | 0 | twidth -= 2; |
1597 | 0 | } |
1598 | |
1599 | 0 | if (twidth) |
1600 | 0 | { |
1601 | 0 | uint32_t m = *(uint32_t *)p; |
1602 | |
1603 | 0 | if (m) |
1604 | 0 | { |
1605 | 0 | __m64 vdest = load8888 (q); |
1606 | 0 | vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); |
1607 | 0 | store8888 (q, vdest); |
1608 | 0 | } |
1609 | |
1610 | 0 | twidth--; |
1611 | 0 | p++; |
1612 | 0 | q++; |
1613 | 0 | } |
1614 | |
1615 | 0 | dst_line += dst_stride; |
1616 | 0 | mask_line += mask_stride; |
1617 | 0 | } |
1618 | |
1619 | 0 | _mm_empty (); |
1620 | 0 | } |
1621 | | |
1622 | | static void |
1623 | | mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, |
1624 | | pixman_composite_info_t *info) |
1625 | 0 | { |
1626 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1627 | 0 | uint32_t *dst_line, *dst; |
1628 | 0 | uint32_t *src_line, *src; |
1629 | 0 | uint32_t mask; |
1630 | 0 | __m64 vmask; |
1631 | 0 | int dst_stride, src_stride; |
1632 | 0 | int32_t w; |
1633 | |
1634 | 0 | CHECKPOINT (); |
1635 | |
1636 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1637 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
1638 | |
1639 | 0 | mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); |
1640 | 0 | vmask = expand_alpha (load8888 (&mask)); |
1641 | |
1642 | 0 | while (height--) |
1643 | 0 | { |
1644 | 0 | dst = dst_line; |
1645 | 0 | dst_line += dst_stride; |
1646 | 0 | src = src_line; |
1647 | 0 | src_line += src_stride; |
1648 | 0 | w = width; |
1649 | |
1650 | 0 | while (w && (uintptr_t)dst & 7) |
1651 | 0 | { |
1652 | 0 | __m64 s = load8888 (src); |
1653 | 0 | __m64 d = load8888 (dst); |
1654 | |
1655 | 0 | store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); |
1656 | |
1657 | 0 | w--; |
1658 | 0 | dst++; |
1659 | 0 | src++; |
1660 | 0 | } |
1661 | |
1662 | 0 | while (w >= 2) |
1663 | 0 | { |
1664 | 0 | __m64 vs = ldq_u ((__m64 *)src); |
1665 | 0 | __m64 vd = *(__m64 *)dst; |
1666 | 0 | __m64 vsrc0 = expand8888 (vs, 0); |
1667 | 0 | __m64 vsrc1 = expand8888 (vs, 1); |
1668 | |
1669 | 0 | *(__m64 *)dst = pack8888 ( |
1670 | 0 | in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), |
1671 | 0 | in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); |
1672 | |
1673 | 0 | w -= 2; |
1674 | 0 | dst += 2; |
1675 | 0 | src += 2; |
1676 | 0 | } |
1677 | |
1678 | 0 | if (w) |
1679 | 0 | { |
1680 | 0 | __m64 s = load8888 (src); |
1681 | 0 | __m64 d = load8888 (dst); |
1682 | |
1683 | 0 | store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); |
1684 | 0 | } |
1685 | 0 | } |
1686 | |
1687 | 0 | _mm_empty (); |
1688 | 0 | } |
1689 | | |
1690 | | static void |
1691 | | mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, |
1692 | | pixman_composite_info_t *info) |
1693 | 0 | { |
1694 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1695 | 0 | uint32_t *dst_line, *dst; |
1696 | 0 | uint32_t *src_line, *src; |
1697 | 0 | uint32_t mask; |
1698 | 0 | __m64 vmask; |
1699 | 0 | int dst_stride, src_stride; |
1700 | 0 | int32_t w; |
1701 | 0 | __m64 srca; |
1702 | |
1703 | 0 | CHECKPOINT (); |
1704 | |
1705 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1706 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
1707 | 0 | mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); |
1708 | |
1709 | 0 | vmask = expand_alpha (load8888 (&mask)); |
1710 | 0 | srca = MC (4x00ff); |
1711 | |
1712 | 0 | while (height--) |
1713 | 0 | { |
1714 | 0 | dst = dst_line; |
1715 | 0 | dst_line += dst_stride; |
1716 | 0 | src = src_line; |
1717 | 0 | src_line += src_stride; |
1718 | 0 | w = width; |
1719 | |
1720 | 0 | while (w && (uintptr_t)dst & 7) |
1721 | 0 | { |
1722 | 0 | uint32_t ssrc = *src | 0xff000000; |
1723 | 0 | __m64 s = load8888 (&ssrc); |
1724 | 0 | __m64 d = load8888 (dst); |
1725 | |
1726 | 0 | store8888 (dst, in_over (s, srca, vmask, d)); |
1727 | |
1728 | 0 | w--; |
1729 | 0 | dst++; |
1730 | 0 | src++; |
1731 | 0 | } |
1732 | |
1733 | 0 | while (w >= 16) |
1734 | 0 | { |
1735 | 0 | __m64 vd0 = *(__m64 *)(dst + 0); |
1736 | 0 | __m64 vd1 = *(__m64 *)(dst + 2); |
1737 | 0 | __m64 vd2 = *(__m64 *)(dst + 4); |
1738 | 0 | __m64 vd3 = *(__m64 *)(dst + 6); |
1739 | 0 | __m64 vd4 = *(__m64 *)(dst + 8); |
1740 | 0 | __m64 vd5 = *(__m64 *)(dst + 10); |
1741 | 0 | __m64 vd6 = *(__m64 *)(dst + 12); |
1742 | 0 | __m64 vd7 = *(__m64 *)(dst + 14); |
1743 | |
1744 | 0 | __m64 vs0 = ldq_u ((__m64 *)(src + 0)); |
1745 | 0 | __m64 vs1 = ldq_u ((__m64 *)(src + 2)); |
1746 | 0 | __m64 vs2 = ldq_u ((__m64 *)(src + 4)); |
1747 | 0 | __m64 vs3 = ldq_u ((__m64 *)(src + 6)); |
1748 | 0 | __m64 vs4 = ldq_u ((__m64 *)(src + 8)); |
1749 | 0 | __m64 vs5 = ldq_u ((__m64 *)(src + 10)); |
1750 | 0 | __m64 vs6 = ldq_u ((__m64 *)(src + 12)); |
1751 | 0 | __m64 vs7 = ldq_u ((__m64 *)(src + 14)); |
1752 | |
1753 | 0 | vd0 = pack8888 ( |
1754 | 0 | in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), |
1755 | 0 | in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); |
1756 | |
1757 | 0 | vd1 = pack8888 ( |
1758 | 0 | in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), |
1759 | 0 | in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); |
1760 | |
1761 | 0 | vd2 = pack8888 ( |
1762 | 0 | in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), |
1763 | 0 | in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); |
1764 | |
1765 | 0 | vd3 = pack8888 ( |
1766 | 0 | in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), |
1767 | 0 | in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); |
1768 | |
1769 | 0 | vd4 = pack8888 ( |
1770 | 0 | in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), |
1771 | 0 | in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); |
1772 | |
1773 | 0 | vd5 = pack8888 ( |
1774 | 0 | in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), |
1775 | 0 | in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); |
1776 | |
1777 | 0 | vd6 = pack8888 ( |
1778 | 0 | in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), |
1779 | 0 | in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); |
1780 | |
1781 | 0 | vd7 = pack8888 ( |
1782 | 0 | in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), |
1783 | 0 | in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); |
1784 | |
1785 | 0 | *(__m64 *)(dst + 0) = vd0; |
1786 | 0 | *(__m64 *)(dst + 2) = vd1; |
1787 | 0 | *(__m64 *)(dst + 4) = vd2; |
1788 | 0 | *(__m64 *)(dst + 6) = vd3; |
1789 | 0 | *(__m64 *)(dst + 8) = vd4; |
1790 | 0 | *(__m64 *)(dst + 10) = vd5; |
1791 | 0 | *(__m64 *)(dst + 12) = vd6; |
1792 | 0 | *(__m64 *)(dst + 14) = vd7; |
1793 | |
1794 | 0 | w -= 16; |
1795 | 0 | dst += 16; |
1796 | 0 | src += 16; |
1797 | 0 | } |
1798 | |
1799 | 0 | while (w) |
1800 | 0 | { |
1801 | 0 | uint32_t ssrc = *src | 0xff000000; |
1802 | 0 | __m64 s = load8888 (&ssrc); |
1803 | 0 | __m64 d = load8888 (dst); |
1804 | |
1805 | 0 | store8888 (dst, in_over (s, srca, vmask, d)); |
1806 | |
1807 | 0 | w--; |
1808 | 0 | dst++; |
1809 | 0 | src++; |
1810 | 0 | } |
1811 | 0 | } |
1812 | |
1813 | 0 | _mm_empty (); |
1814 | 0 | } |
1815 | | |
1816 | | static void |
1817 | | mmx_composite_over_8888_8888 (pixman_implementation_t *imp, |
1818 | | pixman_composite_info_t *info) |
1819 | 0 | { |
1820 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1821 | 0 | uint32_t *dst_line, *dst; |
1822 | 0 | uint32_t *src_line, *src; |
1823 | 0 | uint32_t s; |
1824 | 0 | int dst_stride, src_stride; |
1825 | 0 | uint8_t a; |
1826 | 0 | int32_t w; |
1827 | |
|
1828 | 0 | CHECKPOINT (); |
1829 | |
|
1830 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1831 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
1832 | |
|
1833 | 0 | while (height--) |
1834 | 0 | { |
1835 | 0 | dst = dst_line; |
1836 | 0 | dst_line += dst_stride; |
1837 | 0 | src = src_line; |
1838 | 0 | src_line += src_stride; |
1839 | 0 | w = width; |
1840 | |
|
1841 | 0 | while (w--) |
1842 | 0 | { |
1843 | 0 | s = *src++; |
1844 | 0 | a = s >> 24; |
1845 | |
|
1846 | 0 | if (a == 0xff) |
1847 | 0 | { |
1848 | 0 | *dst = s; |
1849 | 0 | } |
1850 | 0 | else if (s) |
1851 | 0 | { |
1852 | 0 | __m64 ms, sa; |
1853 | 0 | ms = load8888 (&s); |
1854 | 0 | sa = expand_alpha (ms); |
1855 | 0 | store8888 (dst, over (ms, sa, load8888 (dst))); |
1856 | 0 | } |
1857 | |
|
1858 | 0 | dst++; |
1859 | 0 | } |
1860 | 0 | } |
1861 | 0 | _mm_empty (); |
1862 | 0 | } |
1863 | | |
1864 | | static void |
1865 | | mmx_composite_over_8888_0565 (pixman_implementation_t *imp, |
1866 | | pixman_composite_info_t *info) |
1867 | 0 | { |
1868 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1869 | 0 | uint16_t *dst_line, *dst; |
1870 | 0 | uint32_t *src_line, *src; |
1871 | 0 | int dst_stride, src_stride; |
1872 | 0 | int32_t w; |
1873 | |
|
1874 | 0 | CHECKPOINT (); |
1875 | |
|
1876 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
1877 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
1878 | |
|
1879 | | #if 0 |
1880 | | /* FIXME */ |
1881 | | assert (src_image->drawable == mask_image->drawable); |
1882 | | #endif |
1883 | |
|
1884 | 0 | while (height--) |
1885 | 0 | { |
1886 | 0 | dst = dst_line; |
1887 | 0 | dst_line += dst_stride; |
1888 | 0 | src = src_line; |
1889 | 0 | src_line += src_stride; |
1890 | 0 | w = width; |
1891 | |
|
1892 | 0 | CHECKPOINT (); |
1893 | |
|
1894 | 0 | while (w && (uintptr_t)dst & 7) |
1895 | 0 | { |
1896 | 0 | __m64 vsrc = load8888 (src); |
1897 | 0 | uint64_t d = *dst; |
1898 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
1899 | |
|
1900 | 0 | vdest = pack_565 ( |
1901 | 0 | over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); |
1902 | |
|
1903 | 0 | *dst = to_uint64 (vdest); |
1904 | |
|
1905 | 0 | w--; |
1906 | 0 | dst++; |
1907 | 0 | src++; |
1908 | 0 | } |
1909 | |
|
1910 | 0 | CHECKPOINT (); |
1911 | |
|
1912 | 0 | while (w >= 4) |
1913 | 0 | { |
1914 | 0 | __m64 vdest = *(__m64 *)dst; |
1915 | 0 | __m64 v0, v1, v2, v3; |
1916 | 0 | __m64 vsrc0, vsrc1, vsrc2, vsrc3; |
1917 | |
|
1918 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
1919 | |
|
1920 | 0 | vsrc0 = load8888 ((src + 0)); |
1921 | 0 | vsrc1 = load8888 ((src + 1)); |
1922 | 0 | vsrc2 = load8888 ((src + 2)); |
1923 | 0 | vsrc3 = load8888 ((src + 3)); |
1924 | |
|
1925 | 0 | v0 = over (vsrc0, expand_alpha (vsrc0), v0); |
1926 | 0 | v1 = over (vsrc1, expand_alpha (vsrc1), v1); |
1927 | 0 | v2 = over (vsrc2, expand_alpha (vsrc2), v2); |
1928 | 0 | v3 = over (vsrc3, expand_alpha (vsrc3), v3); |
1929 | |
|
1930 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
1931 | |
|
1932 | 0 | w -= 4; |
1933 | 0 | dst += 4; |
1934 | 0 | src += 4; |
1935 | 0 | } |
1936 | |
|
1937 | 0 | CHECKPOINT (); |
1938 | |
|
1939 | 0 | while (w) |
1940 | 0 | { |
1941 | 0 | __m64 vsrc = load8888 (src); |
1942 | 0 | uint64_t d = *dst; |
1943 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
1944 | |
|
1945 | 0 | vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); |
1946 | |
|
1947 | 0 | *dst = to_uint64 (vdest); |
1948 | |
|
1949 | 0 | w--; |
1950 | 0 | dst++; |
1951 | 0 | src++; |
1952 | 0 | } |
1953 | 0 | } |
1954 | |
|
1955 | 0 | _mm_empty (); |
1956 | 0 | } |
1957 | | |
1958 | | static void |
1959 | | mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, |
1960 | | pixman_composite_info_t *info) |
1961 | 0 | { |
1962 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
1963 | 0 | uint32_t src, srca; |
1964 | 0 | uint32_t *dst_line, *dst; |
1965 | 0 | uint8_t *mask_line, *mask; |
1966 | 0 | int dst_stride, mask_stride; |
1967 | 0 | int32_t w; |
1968 | 0 | __m64 vsrc, vsrca; |
1969 | 0 | uint64_t srcsrc; |
1970 | |
|
1971 | 0 | CHECKPOINT (); |
1972 | |
|
1973 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1974 | |
|
1975 | 0 | srca = src >> 24; |
1976 | 0 | if (src == 0) |
1977 | 0 | return; |
1978 | | |
1979 | 0 | srcsrc = (uint64_t)src << 32 | src; |
1980 | |
|
1981 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1982 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
1983 | |
|
1984 | 0 | vsrc = load8888 (&src); |
1985 | 0 | vsrca = expand_alpha (vsrc); |
1986 | |
|
1987 | 0 | while (height--) |
1988 | 0 | { |
1989 | 0 | dst = dst_line; |
1990 | 0 | dst_line += dst_stride; |
1991 | 0 | mask = mask_line; |
1992 | 0 | mask_line += mask_stride; |
1993 | 0 | w = width; |
1994 | |
|
1995 | 0 | CHECKPOINT (); |
1996 | |
|
1997 | 0 | while (w && (uintptr_t)dst & 7) |
1998 | 0 | { |
1999 | 0 | uint64_t m = *mask; |
2000 | |
|
2001 | 0 | if (m) |
2002 | 0 | { |
2003 | 0 | __m64 vdest = in_over (vsrc, vsrca, |
2004 | 0 | expand_alpha_rev (to_m64 (m)), |
2005 | 0 | load8888 (dst)); |
2006 | |
|
2007 | 0 | store8888 (dst, vdest); |
2008 | 0 | } |
2009 | |
|
2010 | 0 | w--; |
2011 | 0 | mask++; |
2012 | 0 | dst++; |
2013 | 0 | } |
2014 | |
|
2015 | 0 | CHECKPOINT (); |
2016 | |
|
2017 | 0 | while (w >= 2) |
2018 | 0 | { |
2019 | 0 | uint64_t m0, m1; |
2020 | |
|
2021 | 0 | m0 = *mask; |
2022 | 0 | m1 = *(mask + 1); |
2023 | |
|
2024 | 0 | if (srca == 0xff && (m0 & m1) == 0xff) |
2025 | 0 | { |
2026 | 0 | *(uint64_t *)dst = srcsrc; |
2027 | 0 | } |
2028 | 0 | else if (m0 | m1) |
2029 | 0 | { |
2030 | 0 | __m64 vdest; |
2031 | 0 | __m64 dest0, dest1; |
2032 | |
|
2033 | 0 | vdest = *(__m64 *)dst; |
2034 | |
|
2035 | 0 | dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)), |
2036 | 0 | expand8888 (vdest, 0)); |
2037 | 0 | dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)), |
2038 | 0 | expand8888 (vdest, 1)); |
2039 | |
|
2040 | 0 | *(__m64 *)dst = pack8888 (dest0, dest1); |
2041 | 0 | } |
2042 | |
|
2043 | 0 | mask += 2; |
2044 | 0 | dst += 2; |
2045 | 0 | w -= 2; |
2046 | 0 | } |
2047 | |
|
2048 | 0 | CHECKPOINT (); |
2049 | |
|
2050 | 0 | if (w) |
2051 | 0 | { |
2052 | 0 | uint64_t m = *mask; |
2053 | |
|
2054 | 0 | if (m) |
2055 | 0 | { |
2056 | 0 | __m64 vdest = load8888 (dst); |
2057 | |
|
2058 | 0 | vdest = in_over ( |
2059 | 0 | vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest); |
2060 | 0 | store8888 (dst, vdest); |
2061 | 0 | } |
2062 | 0 | } |
2063 | 0 | } |
2064 | |
|
2065 | 0 | _mm_empty (); |
2066 | 0 | } |
2067 | | |
2068 | | static pixman_bool_t |
2069 | | mmx_fill (pixman_implementation_t *imp, |
2070 | | uint32_t * bits, |
2071 | | int stride, |
2072 | | int bpp, |
2073 | | int x, |
2074 | | int y, |
2075 | | int width, |
2076 | | int height, |
2077 | | uint32_t filler) |
2078 | 0 | { |
2079 | 0 | uint64_t fill; |
2080 | 0 | __m64 vfill; |
2081 | 0 | uint32_t byte_width; |
2082 | 0 | uint8_t *byte_line; |
2083 | |
|
2084 | 0 | #if defined __GNUC__ && defined USE_X86_MMX |
2085 | 0 | __m64 v1, v2, v3, v4, v5, v6, v7; |
2086 | 0 | #endif |
2087 | |
|
2088 | 0 | if (bpp != 16 && bpp != 32 && bpp != 8) |
2089 | 0 | return FALSE; |
2090 | | |
2091 | 0 | if (bpp == 8) |
2092 | 0 | { |
2093 | 0 | stride = stride * (int) sizeof (uint32_t) / 1; |
2094 | 0 | byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); |
2095 | 0 | byte_width = width; |
2096 | 0 | stride *= 1; |
2097 | 0 | filler = (filler & 0xff) * 0x01010101; |
2098 | 0 | } |
2099 | 0 | else if (bpp == 16) |
2100 | 0 | { |
2101 | 0 | stride = stride * (int) sizeof (uint32_t) / 2; |
2102 | 0 | byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); |
2103 | 0 | byte_width = 2 * width; |
2104 | 0 | stride *= 2; |
2105 | 0 | filler = (filler & 0xffff) * 0x00010001; |
2106 | 0 | } |
2107 | 0 | else |
2108 | 0 | { |
2109 | 0 | stride = stride * (int) sizeof (uint32_t) / 4; |
2110 | 0 | byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); |
2111 | 0 | byte_width = 4 * width; |
2112 | 0 | stride *= 4; |
2113 | 0 | } |
2114 | |
|
2115 | 0 | fill = ((uint64_t)filler << 32) | filler; |
2116 | 0 | vfill = to_m64 (fill); |
2117 | |
|
2118 | 0 | #if defined __GNUC__ && defined USE_X86_MMX |
2119 | 0 | __asm__ ( |
2120 | 0 | "movq %7, %0\n" |
2121 | 0 | "movq %7, %1\n" |
2122 | 0 | "movq %7, %2\n" |
2123 | 0 | "movq %7, %3\n" |
2124 | 0 | "movq %7, %4\n" |
2125 | 0 | "movq %7, %5\n" |
2126 | 0 | "movq %7, %6\n" |
2127 | 0 | : "=&y" (v1), "=&y" (v2), "=&y" (v3), |
2128 | 0 | "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7) |
2129 | 0 | : "y" (vfill)); |
2130 | 0 | #endif |
2131 | |
|
2132 | 0 | while (height--) |
2133 | 0 | { |
2134 | 0 | int w; |
2135 | 0 | uint8_t *d = byte_line; |
2136 | |
|
2137 | 0 | byte_line += stride; |
2138 | 0 | w = byte_width; |
2139 | |
|
2140 | 0 | if (w >= 1 && ((uintptr_t)d & 1)) |
2141 | 0 | { |
2142 | 0 | *(uint8_t *)d = (filler & 0xff); |
2143 | 0 | w--; |
2144 | 0 | d++; |
2145 | 0 | } |
2146 | |
|
2147 | 0 | if (w >= 2 && ((uintptr_t)d & 3)) |
2148 | 0 | { |
2149 | 0 | *(uint16_t *)d = filler; |
2150 | 0 | w -= 2; |
2151 | 0 | d += 2; |
2152 | 0 | } |
2153 | |
|
2154 | 0 | while (w >= 4 && ((uintptr_t)d & 7)) |
2155 | 0 | { |
2156 | 0 | *(uint32_t *)d = filler; |
2157 | |
|
2158 | 0 | w -= 4; |
2159 | 0 | d += 4; |
2160 | 0 | } |
2161 | |
|
2162 | 0 | while (w >= 64) |
2163 | 0 | { |
2164 | 0 | #if defined __GNUC__ && defined USE_X86_MMX |
2165 | 0 | __asm__ ( |
2166 | 0 | "movq %1, (%0)\n" |
2167 | 0 | "movq %2, 8(%0)\n" |
2168 | 0 | "movq %3, 16(%0)\n" |
2169 | 0 | "movq %4, 24(%0)\n" |
2170 | 0 | "movq %5, 32(%0)\n" |
2171 | 0 | "movq %6, 40(%0)\n" |
2172 | 0 | "movq %7, 48(%0)\n" |
2173 | 0 | "movq %8, 56(%0)\n" |
2174 | 0 | : |
2175 | 0 | : "r" (d), |
2176 | 0 | "y" (vfill), "y" (v1), "y" (v2), "y" (v3), |
2177 | 0 | "y" (v4), "y" (v5), "y" (v6), "y" (v7) |
2178 | 0 | : "memory"); |
2179 | | #else |
2180 | | *(__m64*) (d + 0) = vfill; |
2181 | | *(__m64*) (d + 8) = vfill; |
2182 | | *(__m64*) (d + 16) = vfill; |
2183 | | *(__m64*) (d + 24) = vfill; |
2184 | | *(__m64*) (d + 32) = vfill; |
2185 | | *(__m64*) (d + 40) = vfill; |
2186 | | *(__m64*) (d + 48) = vfill; |
2187 | | *(__m64*) (d + 56) = vfill; |
2188 | | #endif |
2189 | 0 | w -= 64; |
2190 | 0 | d += 64; |
2191 | 0 | } |
2192 | |
|
2193 | 0 | while (w >= 4) |
2194 | 0 | { |
2195 | 0 | *(uint32_t *)d = filler; |
2196 | |
|
2197 | 0 | w -= 4; |
2198 | 0 | d += 4; |
2199 | 0 | } |
2200 | 0 | if (w >= 2) |
2201 | 0 | { |
2202 | 0 | *(uint16_t *)d = filler; |
2203 | 0 | w -= 2; |
2204 | 0 | d += 2; |
2205 | 0 | } |
2206 | 0 | if (w >= 1) |
2207 | 0 | { |
2208 | 0 | *(uint8_t *)d = (filler & 0xff); |
2209 | 0 | w--; |
2210 | 0 | d++; |
2211 | 0 | } |
2212 | |
|
2213 | 0 | } |
2214 | |
|
2215 | 0 | _mm_empty (); |
2216 | 0 | return TRUE; |
2217 | 0 | } |
2218 | | |
2219 | | static void |
2220 | | mmx_composite_src_x888_0565 (pixman_implementation_t *imp, |
2221 | | pixman_composite_info_t *info) |
2222 | 0 | { |
2223 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2224 | 0 | uint16_t *dst_line, *dst; |
2225 | 0 | uint32_t *src_line, *src, s; |
2226 | 0 | int dst_stride, src_stride; |
2227 | 0 | int32_t w; |
2228 | |
|
2229 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2230 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2231 | |
|
2232 | 0 | while (height--) |
2233 | 0 | { |
2234 | 0 | dst = dst_line; |
2235 | 0 | dst_line += dst_stride; |
2236 | 0 | src = src_line; |
2237 | 0 | src_line += src_stride; |
2238 | 0 | w = width; |
2239 | |
|
2240 | 0 | while (w && (uintptr_t)dst & 7) |
2241 | 0 | { |
2242 | 0 | s = *src++; |
2243 | 0 | *dst = convert_8888_to_0565 (s); |
2244 | 0 | dst++; |
2245 | 0 | w--; |
2246 | 0 | } |
2247 | |
|
2248 | 0 | while (w >= 4) |
2249 | 0 | { |
2250 | 0 | __m64 vdest; |
2251 | 0 | __m64 vsrc0 = ldq_u ((__m64 *)(src + 0)); |
2252 | 0 | __m64 vsrc1 = ldq_u ((__m64 *)(src + 2)); |
2253 | |
|
2254 | 0 | vdest = pack_4xpacked565 (vsrc0, vsrc1); |
2255 | |
|
2256 | 0 | *(__m64 *)dst = vdest; |
2257 | |
|
2258 | 0 | w -= 4; |
2259 | 0 | src += 4; |
2260 | 0 | dst += 4; |
2261 | 0 | } |
2262 | |
|
2263 | 0 | while (w) |
2264 | 0 | { |
2265 | 0 | s = *src++; |
2266 | 0 | *dst = convert_8888_to_0565 (s); |
2267 | 0 | dst++; |
2268 | 0 | w--; |
2269 | 0 | } |
2270 | 0 | } |
2271 | |
|
2272 | 0 | _mm_empty (); |
2273 | 0 | } |
2274 | | |
2275 | | static void |
2276 | | mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, |
2277 | | pixman_composite_info_t *info) |
2278 | 0 | { |
2279 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2280 | 0 | uint32_t src, srca; |
2281 | 0 | uint32_t *dst_line, *dst; |
2282 | 0 | uint8_t *mask_line, *mask; |
2283 | 0 | int dst_stride, mask_stride; |
2284 | 0 | int32_t w; |
2285 | 0 | __m64 vsrc; |
2286 | 0 | uint64_t srcsrc; |
2287 | |
|
2288 | 0 | CHECKPOINT (); |
2289 | |
|
2290 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2291 | |
|
2292 | 0 | srca = src >> 24; |
2293 | 0 | if (src == 0) |
2294 | 0 | { |
2295 | 0 | mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, |
2296 | 0 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
2297 | 0 | dest_x, dest_y, width, height, 0); |
2298 | 0 | return; |
2299 | 0 | } |
2300 | | |
2301 | 0 | srcsrc = (uint64_t)src << 32 | src; |
2302 | |
|
2303 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2304 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2305 | |
|
2306 | 0 | vsrc = load8888 (&src); |
2307 | |
|
2308 | 0 | while (height--) |
2309 | 0 | { |
2310 | 0 | dst = dst_line; |
2311 | 0 | dst_line += dst_stride; |
2312 | 0 | mask = mask_line; |
2313 | 0 | mask_line += mask_stride; |
2314 | 0 | w = width; |
2315 | |
|
2316 | 0 | CHECKPOINT (); |
2317 | |
|
2318 | 0 | while (w && (uintptr_t)dst & 7) |
2319 | 0 | { |
2320 | 0 | uint64_t m = *mask; |
2321 | |
|
2322 | 0 | if (m) |
2323 | 0 | { |
2324 | 0 | __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
2325 | |
|
2326 | 0 | store8888 (dst, vdest); |
2327 | 0 | } |
2328 | 0 | else |
2329 | 0 | { |
2330 | 0 | *dst = 0; |
2331 | 0 | } |
2332 | |
|
2333 | 0 | w--; |
2334 | 0 | mask++; |
2335 | 0 | dst++; |
2336 | 0 | } |
2337 | |
|
2338 | 0 | CHECKPOINT (); |
2339 | |
|
2340 | 0 | while (w >= 2) |
2341 | 0 | { |
2342 | 0 | uint64_t m0, m1; |
2343 | 0 | m0 = *mask; |
2344 | 0 | m1 = *(mask + 1); |
2345 | |
|
2346 | 0 | if (srca == 0xff && (m0 & m1) == 0xff) |
2347 | 0 | { |
2348 | 0 | *(uint64_t *)dst = srcsrc; |
2349 | 0 | } |
2350 | 0 | else if (m0 | m1) |
2351 | 0 | { |
2352 | 0 | __m64 dest0, dest1; |
2353 | |
|
2354 | 0 | dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0))); |
2355 | 0 | dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1))); |
2356 | |
|
2357 | 0 | *(__m64 *)dst = pack8888 (dest0, dest1); |
2358 | 0 | } |
2359 | 0 | else |
2360 | 0 | { |
2361 | 0 | *(uint64_t *)dst = 0; |
2362 | 0 | } |
2363 | |
|
2364 | 0 | mask += 2; |
2365 | 0 | dst += 2; |
2366 | 0 | w -= 2; |
2367 | 0 | } |
2368 | |
|
2369 | 0 | CHECKPOINT (); |
2370 | |
|
2371 | 0 | if (w) |
2372 | 0 | { |
2373 | 0 | uint64_t m = *mask; |
2374 | |
|
2375 | 0 | if (m) |
2376 | 0 | { |
2377 | 0 | __m64 vdest = load8888 (dst); |
2378 | |
|
2379 | 0 | vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
2380 | 0 | store8888 (dst, vdest); |
2381 | 0 | } |
2382 | 0 | else |
2383 | 0 | { |
2384 | 0 | *dst = 0; |
2385 | 0 | } |
2386 | 0 | } |
2387 | 0 | } |
2388 | |
|
2389 | 0 | _mm_empty (); |
2390 | 0 | } |
2391 | | |
2392 | | static void |
2393 | | mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, |
2394 | | pixman_composite_info_t *info) |
2395 | 0 | { |
2396 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2397 | 0 | uint32_t src, srca; |
2398 | 0 | uint16_t *dst_line, *dst; |
2399 | 0 | uint8_t *mask_line, *mask; |
2400 | 0 | int dst_stride, mask_stride; |
2401 | 0 | int32_t w; |
2402 | 0 | __m64 vsrc, vsrca, tmp; |
2403 | 0 | __m64 srcsrcsrcsrc; |
2404 | |
|
2405 | 0 | CHECKPOINT (); |
2406 | |
|
2407 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2408 | |
|
2409 | 0 | srca = src >> 24; |
2410 | 0 | if (src == 0) |
2411 | 0 | return; |
2412 | | |
2413 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2414 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2415 | |
|
2416 | 0 | vsrc = load8888 (&src); |
2417 | 0 | vsrca = expand_alpha (vsrc); |
2418 | |
|
2419 | 0 | tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); |
2420 | 0 | srcsrcsrcsrc = expand_alpha_rev (tmp); |
2421 | |
|
2422 | 0 | while (height--) |
2423 | 0 | { |
2424 | 0 | dst = dst_line; |
2425 | 0 | dst_line += dst_stride; |
2426 | 0 | mask = mask_line; |
2427 | 0 | mask_line += mask_stride; |
2428 | 0 | w = width; |
2429 | |
|
2430 | 0 | CHECKPOINT (); |
2431 | |
|
2432 | 0 | while (w && (uintptr_t)dst & 7) |
2433 | 0 | { |
2434 | 0 | uint64_t m = *mask; |
2435 | |
|
2436 | 0 | if (m) |
2437 | 0 | { |
2438 | 0 | uint64_t d = *dst; |
2439 | 0 | __m64 vd = to_m64 (d); |
2440 | 0 | __m64 vdest = in_over ( |
2441 | 0 | vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0)); |
2442 | |
|
2443 | 0 | vd = pack_565 (vdest, _mm_setzero_si64 (), 0); |
2444 | 0 | *dst = to_uint64 (vd); |
2445 | 0 | } |
2446 | |
|
2447 | 0 | w--; |
2448 | 0 | mask++; |
2449 | 0 | dst++; |
2450 | 0 | } |
2451 | |
|
2452 | 0 | CHECKPOINT (); |
2453 | |
|
2454 | 0 | while (w >= 4) |
2455 | 0 | { |
2456 | 0 | uint64_t m0, m1, m2, m3; |
2457 | 0 | m0 = *mask; |
2458 | 0 | m1 = *(mask + 1); |
2459 | 0 | m2 = *(mask + 2); |
2460 | 0 | m3 = *(mask + 3); |
2461 | |
|
2462 | 0 | if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) |
2463 | 0 | { |
2464 | 0 | *(__m64 *)dst = srcsrcsrcsrc; |
2465 | 0 | } |
2466 | 0 | else if (m0 | m1 | m2 | m3) |
2467 | 0 | { |
2468 | 0 | __m64 vdest = *(__m64 *)dst; |
2469 | 0 | __m64 v0, v1, v2, v3; |
2470 | 0 | __m64 vm0, vm1, vm2, vm3; |
2471 | |
|
2472 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
2473 | |
|
2474 | 0 | vm0 = to_m64 (m0); |
2475 | 0 | v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0); |
2476 | |
|
2477 | 0 | vm1 = to_m64 (m1); |
2478 | 0 | v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1); |
2479 | |
|
2480 | 0 | vm2 = to_m64 (m2); |
2481 | 0 | v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2); |
2482 | |
|
2483 | 0 | vm3 = to_m64 (m3); |
2484 | 0 | v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3); |
2485 | |
|
2486 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2487 | 0 | } |
2488 | |
|
2489 | 0 | w -= 4; |
2490 | 0 | mask += 4; |
2491 | 0 | dst += 4; |
2492 | 0 | } |
2493 | |
|
2494 | 0 | CHECKPOINT (); |
2495 | |
|
2496 | 0 | while (w) |
2497 | 0 | { |
2498 | 0 | uint64_t m = *mask; |
2499 | |
|
2500 | 0 | if (m) |
2501 | 0 | { |
2502 | 0 | uint64_t d = *dst; |
2503 | 0 | __m64 vd = to_m64 (d); |
2504 | 0 | __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), |
2505 | 0 | expand565 (vd, 0)); |
2506 | 0 | vd = pack_565 (vdest, _mm_setzero_si64 (), 0); |
2507 | 0 | *dst = to_uint64 (vd); |
2508 | 0 | } |
2509 | |
|
2510 | 0 | w--; |
2511 | 0 | mask++; |
2512 | 0 | dst++; |
2513 | 0 | } |
2514 | 0 | } |
2515 | |
|
2516 | 0 | _mm_empty (); |
2517 | 0 | } |
2518 | | |
2519 | | static void |
2520 | | mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, |
2521 | | pixman_composite_info_t *info) |
2522 | 0 | { |
2523 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2524 | 0 | uint16_t *dst_line, *dst; |
2525 | 0 | uint32_t *src_line, *src; |
2526 | 0 | int dst_stride, src_stride; |
2527 | 0 | int32_t w; |
2528 | |
|
2529 | 0 | CHECKPOINT (); |
2530 | |
|
2531 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2532 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2533 | |
|
2534 | | #if 0 |
2535 | | /* FIXME */ |
2536 | | assert (src_image->drawable == mask_image->drawable); |
2537 | | #endif |
2538 | |
|
2539 | 0 | while (height--) |
2540 | 0 | { |
2541 | 0 | dst = dst_line; |
2542 | 0 | dst_line += dst_stride; |
2543 | 0 | src = src_line; |
2544 | 0 | src_line += src_stride; |
2545 | 0 | w = width; |
2546 | |
|
2547 | 0 | CHECKPOINT (); |
2548 | |
|
2549 | 0 | while (w && (uintptr_t)dst & 7) |
2550 | 0 | { |
2551 | 0 | __m64 vsrc = load8888 (src); |
2552 | 0 | uint64_t d = *dst; |
2553 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
2554 | |
|
2555 | 0 | vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); |
2556 | |
|
2557 | 0 | *dst = to_uint64 (vdest); |
2558 | |
|
2559 | 0 | w--; |
2560 | 0 | dst++; |
2561 | 0 | src++; |
2562 | 0 | } |
2563 | |
|
2564 | 0 | CHECKPOINT (); |
2565 | |
|
2566 | 0 | while (w >= 4) |
2567 | 0 | { |
2568 | 0 | uint32_t s0, s1, s2, s3; |
2569 | 0 | unsigned char a0, a1, a2, a3; |
2570 | |
|
2571 | 0 | s0 = *src; |
2572 | 0 | s1 = *(src + 1); |
2573 | 0 | s2 = *(src + 2); |
2574 | 0 | s3 = *(src + 3); |
2575 | |
|
2576 | 0 | a0 = (s0 >> 24); |
2577 | 0 | a1 = (s1 >> 24); |
2578 | 0 | a2 = (s2 >> 24); |
2579 | 0 | a3 = (s3 >> 24); |
2580 | |
|
2581 | 0 | if ((a0 & a1 & a2 & a3) == 0xFF) |
2582 | 0 | { |
2583 | 0 | __m64 v0 = invert_colors (load8888 (&s0)); |
2584 | 0 | __m64 v1 = invert_colors (load8888 (&s1)); |
2585 | 0 | __m64 v2 = invert_colors (load8888 (&s2)); |
2586 | 0 | __m64 v3 = invert_colors (load8888 (&s3)); |
2587 | |
|
2588 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
2589 | 0 | } |
2590 | 0 | else if (s0 | s1 | s2 | s3) |
2591 | 0 | { |
2592 | 0 | __m64 vdest = *(__m64 *)dst; |
2593 | 0 | __m64 v0, v1, v2, v3; |
2594 | |
|
2595 | 0 | __m64 vsrc0 = load8888 (&s0); |
2596 | 0 | __m64 vsrc1 = load8888 (&s1); |
2597 | 0 | __m64 vsrc2 = load8888 (&s2); |
2598 | 0 | __m64 vsrc3 = load8888 (&s3); |
2599 | |
|
2600 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
2601 | |
|
2602 | 0 | v0 = over_rev_non_pre (vsrc0, v0); |
2603 | 0 | v1 = over_rev_non_pre (vsrc1, v1); |
2604 | 0 | v2 = over_rev_non_pre (vsrc2, v2); |
2605 | 0 | v3 = over_rev_non_pre (vsrc3, v3); |
2606 | |
|
2607 | 0 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
2608 | 0 | } |
2609 | |
|
2610 | 0 | w -= 4; |
2611 | 0 | dst += 4; |
2612 | 0 | src += 4; |
2613 | 0 | } |
2614 | |
|
2615 | 0 | CHECKPOINT (); |
2616 | |
|
2617 | 0 | while (w) |
2618 | 0 | { |
2619 | 0 | __m64 vsrc = load8888 (src); |
2620 | 0 | uint64_t d = *dst; |
2621 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
2622 | |
|
2623 | 0 | vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); |
2624 | |
|
2625 | 0 | *dst = to_uint64 (vdest); |
2626 | |
|
2627 | 0 | w--; |
2628 | 0 | dst++; |
2629 | 0 | src++; |
2630 | 0 | } |
2631 | 0 | } |
2632 | |
|
2633 | 0 | _mm_empty (); |
2634 | 0 | } |
2635 | | |
2636 | | static void |
2637 | | mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, |
2638 | | pixman_composite_info_t *info) |
2639 | 0 | { |
2640 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2641 | 0 | uint32_t *dst_line, *dst; |
2642 | 0 | uint32_t *src_line, *src; |
2643 | 0 | int dst_stride, src_stride; |
2644 | 0 | int32_t w; |
2645 | |
|
2646 | 0 | CHECKPOINT (); |
2647 | |
|
2648 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
2649 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
2650 | |
|
2651 | | #if 0 |
2652 | | /* FIXME */ |
2653 | | assert (src_image->drawable == mask_image->drawable); |
2654 | | #endif |
2655 | |
|
2656 | 0 | while (height--) |
2657 | 0 | { |
2658 | 0 | dst = dst_line; |
2659 | 0 | dst_line += dst_stride; |
2660 | 0 | src = src_line; |
2661 | 0 | src_line += src_stride; |
2662 | 0 | w = width; |
2663 | |
|
2664 | 0 | while (w && (uintptr_t)dst & 7) |
2665 | 0 | { |
2666 | 0 | __m64 s = load8888 (src); |
2667 | 0 | __m64 d = load8888 (dst); |
2668 | |
|
2669 | 0 | store8888 (dst, over_rev_non_pre (s, d)); |
2670 | |
|
2671 | 0 | w--; |
2672 | 0 | dst++; |
2673 | 0 | src++; |
2674 | 0 | } |
2675 | |
|
2676 | 0 | while (w >= 2) |
2677 | 0 | { |
2678 | 0 | uint32_t s0, s1; |
2679 | 0 | unsigned char a0, a1; |
2680 | 0 | __m64 d0, d1; |
2681 | |
|
2682 | 0 | s0 = *src; |
2683 | 0 | s1 = *(src + 1); |
2684 | |
|
2685 | 0 | a0 = (s0 >> 24); |
2686 | 0 | a1 = (s1 >> 24); |
2687 | |
|
2688 | 0 | if ((a0 & a1) == 0xFF) |
2689 | 0 | { |
2690 | 0 | d0 = invert_colors (load8888 (&s0)); |
2691 | 0 | d1 = invert_colors (load8888 (&s1)); |
2692 | |
|
2693 | 0 | *(__m64 *)dst = pack8888 (d0, d1); |
2694 | 0 | } |
2695 | 0 | else if (s0 | s1) |
2696 | 0 | { |
2697 | 0 | __m64 vdest = *(__m64 *)dst; |
2698 | |
|
2699 | 0 | d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); |
2700 | 0 | d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); |
2701 | |
|
2702 | 0 | *(__m64 *)dst = pack8888 (d0, d1); |
2703 | 0 | } |
2704 | |
|
2705 | 0 | w -= 2; |
2706 | 0 | dst += 2; |
2707 | 0 | src += 2; |
2708 | 0 | } |
2709 | |
|
2710 | 0 | if (w) |
2711 | 0 | { |
2712 | 0 | __m64 s = load8888 (src); |
2713 | 0 | __m64 d = load8888 (dst); |
2714 | |
|
2715 | 0 | store8888 (dst, over_rev_non_pre (s, d)); |
2716 | 0 | } |
2717 | 0 | } |
2718 | |
|
2719 | 0 | _mm_empty (); |
2720 | 0 | } |
2721 | | |
2722 | | static void |
2723 | | mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, |
2724 | | pixman_composite_info_t *info) |
2725 | 0 | { |
2726 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2727 | 0 | uint32_t src; |
2728 | 0 | uint16_t *dst_line; |
2729 | 0 | uint32_t *mask_line; |
2730 | 0 | int dst_stride, mask_stride; |
2731 | 0 | __m64 vsrc, vsrca; |
2732 | |
|
2733 | 0 | CHECKPOINT (); |
2734 | |
|
2735 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2736 | |
|
2737 | 0 | if (src == 0) |
2738 | 0 | return; |
2739 | | |
2740 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
2741 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
2742 | |
|
2743 | 0 | vsrc = load8888 (&src); |
2744 | 0 | vsrca = expand_alpha (vsrc); |
2745 | |
|
2746 | 0 | while (height--) |
2747 | 0 | { |
2748 | 0 | int twidth = width; |
2749 | 0 | uint32_t *p = (uint32_t *)mask_line; |
2750 | 0 | uint16_t *q = (uint16_t *)dst_line; |
2751 | |
|
2752 | 0 | while (twidth && ((uintptr_t)q & 7)) |
2753 | 0 | { |
2754 | 0 | uint32_t m = *(uint32_t *)p; |
2755 | |
|
2756 | 0 | if (m) |
2757 | 0 | { |
2758 | 0 | uint64_t d = *q; |
2759 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
2760 | 0 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); |
2761 | 0 | *q = to_uint64 (vdest); |
2762 | 0 | } |
2763 | |
|
2764 | 0 | twidth--; |
2765 | 0 | p++; |
2766 | 0 | q++; |
2767 | 0 | } |
2768 | |
|
2769 | 0 | while (twidth >= 4) |
2770 | 0 | { |
2771 | 0 | uint32_t m0, m1, m2, m3; |
2772 | |
|
2773 | 0 | m0 = *p; |
2774 | 0 | m1 = *(p + 1); |
2775 | 0 | m2 = *(p + 2); |
2776 | 0 | m3 = *(p + 3); |
2777 | |
|
2778 | 0 | if ((m0 | m1 | m2 | m3)) |
2779 | 0 | { |
2780 | 0 | __m64 vdest = *(__m64 *)q; |
2781 | 0 | __m64 v0, v1, v2, v3; |
2782 | |
|
2783 | 0 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
2784 | |
|
2785 | 0 | v0 = in_over (vsrc, vsrca, load8888 (&m0), v0); |
2786 | 0 | v1 = in_over (vsrc, vsrca, load8888 (&m1), v1); |
2787 | 0 | v2 = in_over (vsrc, vsrca, load8888 (&m2), v2); |
2788 | 0 | v3 = in_over (vsrc, vsrca, load8888 (&m3), v3); |
2789 | |
|
2790 | 0 | *(__m64 *)q = pack_4x565 (v0, v1, v2, v3); |
2791 | 0 | } |
2792 | 0 | twidth -= 4; |
2793 | 0 | p += 4; |
2794 | 0 | q += 4; |
2795 | 0 | } |
2796 | |
|
2797 | 0 | while (twidth) |
2798 | 0 | { |
2799 | 0 | uint32_t m; |
2800 | |
|
2801 | 0 | m = *(uint32_t *)p; |
2802 | 0 | if (m) |
2803 | 0 | { |
2804 | 0 | uint64_t d = *q; |
2805 | 0 | __m64 vdest = expand565 (to_m64 (d), 0); |
2806 | 0 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); |
2807 | 0 | *q = to_uint64 (vdest); |
2808 | 0 | } |
2809 | |
|
2810 | 0 | twidth--; |
2811 | 0 | p++; |
2812 | 0 | q++; |
2813 | 0 | } |
2814 | |
|
2815 | 0 | mask_line += mask_stride; |
2816 | 0 | dst_line += dst_stride; |
2817 | 0 | } |
2818 | |
|
2819 | 0 | _mm_empty (); |
2820 | 0 | } |
2821 | | |
2822 | | static void |
2823 | | mmx_composite_in_n_8_8 (pixman_implementation_t *imp, |
2824 | | pixman_composite_info_t *info) |
2825 | 0 | { |
2826 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2827 | 0 | uint8_t *dst_line, *dst; |
2828 | 0 | uint8_t *mask_line, *mask; |
2829 | 0 | int dst_stride, mask_stride; |
2830 | 0 | int32_t w; |
2831 | 0 | uint32_t src; |
2832 | 0 | uint8_t sa; |
2833 | 0 | __m64 vsrc, vsrca; |
2834 | |
|
2835 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
2836 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2837 | |
|
2838 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2839 | |
|
2840 | 0 | sa = src >> 24; |
2841 | |
|
2842 | 0 | vsrc = load8888 (&src); |
2843 | 0 | vsrca = expand_alpha (vsrc); |
2844 | |
|
2845 | 0 | while (height--) |
2846 | 0 | { |
2847 | 0 | dst = dst_line; |
2848 | 0 | dst_line += dst_stride; |
2849 | 0 | mask = mask_line; |
2850 | 0 | mask_line += mask_stride; |
2851 | 0 | w = width; |
2852 | |
|
2853 | 0 | while (w && (uintptr_t)dst & 7) |
2854 | 0 | { |
2855 | 0 | uint16_t tmp; |
2856 | 0 | uint8_t a; |
2857 | 0 | uint32_t m, d; |
2858 | |
|
2859 | 0 | a = *mask++; |
2860 | 0 | d = *dst; |
2861 | |
|
2862 | 0 | m = MUL_UN8 (sa, a, tmp); |
2863 | 0 | d = MUL_UN8 (m, d, tmp); |
2864 | |
|
2865 | 0 | *dst++ = d; |
2866 | 0 | w--; |
2867 | 0 | } |
2868 | |
|
2869 | 0 | while (w >= 4) |
2870 | 0 | { |
2871 | 0 | __m64 vmask; |
2872 | 0 | __m64 vdest; |
2873 | |
|
2874 | 0 | vmask = load8888u ((uint32_t *)mask); |
2875 | 0 | vdest = load8888 ((uint32_t *)dst); |
2876 | |
|
2877 | 0 | store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); |
2878 | |
|
2879 | 0 | dst += 4; |
2880 | 0 | mask += 4; |
2881 | 0 | w -= 4; |
2882 | 0 | } |
2883 | |
|
2884 | 0 | while (w--) |
2885 | 0 | { |
2886 | 0 | uint16_t tmp; |
2887 | 0 | uint8_t a; |
2888 | 0 | uint32_t m, d; |
2889 | |
|
2890 | 0 | a = *mask++; |
2891 | 0 | d = *dst; |
2892 | |
|
2893 | 0 | m = MUL_UN8 (sa, a, tmp); |
2894 | 0 | d = MUL_UN8 (m, d, tmp); |
2895 | |
|
2896 | 0 | *dst++ = d; |
2897 | 0 | } |
2898 | 0 | } |
2899 | |
|
2900 | 0 | _mm_empty (); |
2901 | 0 | } |
2902 | | |
2903 | | static void |
2904 | | mmx_composite_in_8_8 (pixman_implementation_t *imp, |
2905 | | pixman_composite_info_t *info) |
2906 | 0 | { |
2907 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2908 | 0 | uint8_t *dst_line, *dst; |
2909 | 0 | uint8_t *src_line, *src; |
2910 | 0 | int src_stride, dst_stride; |
2911 | 0 | int32_t w; |
2912 | |
|
2913 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
2914 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
2915 | |
|
2916 | 0 | while (height--) |
2917 | 0 | { |
2918 | 0 | dst = dst_line; |
2919 | 0 | dst_line += dst_stride; |
2920 | 0 | src = src_line; |
2921 | 0 | src_line += src_stride; |
2922 | 0 | w = width; |
2923 | |
|
2924 | 0 | while (w && (uintptr_t)dst & 3) |
2925 | 0 | { |
2926 | 0 | uint8_t s, d; |
2927 | 0 | uint16_t tmp; |
2928 | |
|
2929 | 0 | s = *src; |
2930 | 0 | d = *dst; |
2931 | |
|
2932 | 0 | *dst = MUL_UN8 (s, d, tmp); |
2933 | |
|
2934 | 0 | src++; |
2935 | 0 | dst++; |
2936 | 0 | w--; |
2937 | 0 | } |
2938 | |
|
2939 | 0 | while (w >= 4) |
2940 | 0 | { |
2941 | 0 | uint32_t *s = (uint32_t *)src; |
2942 | 0 | uint32_t *d = (uint32_t *)dst; |
2943 | |
|
2944 | 0 | store8888 (d, in (load8888u (s), load8888 (d))); |
2945 | |
|
2946 | 0 | w -= 4; |
2947 | 0 | dst += 4; |
2948 | 0 | src += 4; |
2949 | 0 | } |
2950 | |
|
2951 | 0 | while (w--) |
2952 | 0 | { |
2953 | 0 | uint8_t s, d; |
2954 | 0 | uint16_t tmp; |
2955 | |
|
2956 | 0 | s = *src; |
2957 | 0 | d = *dst; |
2958 | |
|
2959 | 0 | *dst = MUL_UN8 (s, d, tmp); |
2960 | |
|
2961 | 0 | src++; |
2962 | 0 | dst++; |
2963 | 0 | } |
2964 | 0 | } |
2965 | |
|
2966 | 0 | _mm_empty (); |
2967 | 0 | } |
2968 | | |
2969 | | static void |
2970 | | mmx_composite_add_n_8_8 (pixman_implementation_t *imp, |
2971 | | pixman_composite_info_t *info) |
2972 | 0 | { |
2973 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
2974 | 0 | uint8_t *dst_line, *dst; |
2975 | 0 | uint8_t *mask_line, *mask; |
2976 | 0 | int dst_stride, mask_stride; |
2977 | 0 | int32_t w; |
2978 | 0 | uint32_t src; |
2979 | 0 | uint8_t sa; |
2980 | 0 | __m64 vsrc, vsrca; |
2981 | |
|
2982 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
2983 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2984 | |
|
2985 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
2986 | |
|
2987 | 0 | sa = src >> 24; |
2988 | |
|
2989 | 0 | if (src == 0) |
2990 | 0 | return; |
2991 | | |
2992 | 0 | vsrc = load8888 (&src); |
2993 | 0 | vsrca = expand_alpha (vsrc); |
2994 | |
|
2995 | 0 | while (height--) |
2996 | 0 | { |
2997 | 0 | dst = dst_line; |
2998 | 0 | dst_line += dst_stride; |
2999 | 0 | mask = mask_line; |
3000 | 0 | mask_line += mask_stride; |
3001 | 0 | w = width; |
3002 | |
|
3003 | 0 | while (w && (uintptr_t)dst & 3) |
3004 | 0 | { |
3005 | 0 | uint16_t tmp; |
3006 | 0 | uint16_t a; |
3007 | 0 | uint32_t m, d; |
3008 | 0 | uint32_t r; |
3009 | |
|
3010 | 0 | a = *mask++; |
3011 | 0 | d = *dst; |
3012 | |
|
3013 | 0 | m = MUL_UN8 (sa, a, tmp); |
3014 | 0 | r = ADD_UN8 (m, d, tmp); |
3015 | |
|
3016 | 0 | *dst++ = r; |
3017 | 0 | w--; |
3018 | 0 | } |
3019 | |
|
3020 | 0 | while (w >= 4) |
3021 | 0 | { |
3022 | 0 | __m64 vmask; |
3023 | 0 | __m64 vdest; |
3024 | |
|
3025 | 0 | vmask = load8888u ((uint32_t *)mask); |
3026 | 0 | vdest = load8888 ((uint32_t *)dst); |
3027 | |
|
3028 | 0 | store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); |
3029 | |
|
3030 | 0 | dst += 4; |
3031 | 0 | mask += 4; |
3032 | 0 | w -= 4; |
3033 | 0 | } |
3034 | |
|
3035 | 0 | while (w--) |
3036 | 0 | { |
3037 | 0 | uint16_t tmp; |
3038 | 0 | uint16_t a; |
3039 | 0 | uint32_t m, d; |
3040 | 0 | uint32_t r; |
3041 | |
|
3042 | 0 | a = *mask++; |
3043 | 0 | d = *dst; |
3044 | |
|
3045 | 0 | m = MUL_UN8 (sa, a, tmp); |
3046 | 0 | r = ADD_UN8 (m, d, tmp); |
3047 | |
|
3048 | 0 | *dst++ = r; |
3049 | 0 | } |
3050 | 0 | } |
3051 | |
|
3052 | 0 | _mm_empty (); |
3053 | 0 | } |
3054 | | |
3055 | | static void |
3056 | | mmx_composite_add_8_8 (pixman_implementation_t *imp, |
3057 | | pixman_composite_info_t *info) |
3058 | 0 | { |
3059 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3060 | 0 | uint8_t *dst_line, *dst; |
3061 | 0 | uint8_t *src_line, *src; |
3062 | 0 | int dst_stride, src_stride; |
3063 | 0 | int32_t w; |
3064 | 0 | uint8_t s, d; |
3065 | 0 | uint16_t t; |
3066 | |
|
3067 | 0 | CHECKPOINT (); |
3068 | |
|
3069 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
3070 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
3071 | |
|
3072 | 0 | while (height--) |
3073 | 0 | { |
3074 | 0 | dst = dst_line; |
3075 | 0 | dst_line += dst_stride; |
3076 | 0 | src = src_line; |
3077 | 0 | src_line += src_stride; |
3078 | 0 | w = width; |
3079 | |
|
3080 | 0 | while (w && (uintptr_t)dst & 7) |
3081 | 0 | { |
3082 | 0 | s = *src; |
3083 | 0 | d = *dst; |
3084 | 0 | t = d + s; |
3085 | 0 | s = t | (0 - (t >> 8)); |
3086 | 0 | *dst = s; |
3087 | |
|
3088 | 0 | dst++; |
3089 | 0 | src++; |
3090 | 0 | w--; |
3091 | 0 | } |
3092 | |
|
3093 | 0 | while (w >= 8) |
3094 | 0 | { |
3095 | 0 | *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); |
3096 | 0 | dst += 8; |
3097 | 0 | src += 8; |
3098 | 0 | w -= 8; |
3099 | 0 | } |
3100 | |
|
3101 | 0 | while (w) |
3102 | 0 | { |
3103 | 0 | s = *src; |
3104 | 0 | d = *dst; |
3105 | 0 | t = d + s; |
3106 | 0 | s = t | (0 - (t >> 8)); |
3107 | 0 | *dst = s; |
3108 | |
|
3109 | 0 | dst++; |
3110 | 0 | src++; |
3111 | 0 | w--; |
3112 | 0 | } |
3113 | 0 | } |
3114 | |
|
3115 | 0 | _mm_empty (); |
3116 | 0 | } |
3117 | | |
3118 | | static void |
3119 | | mmx_composite_add_0565_0565 (pixman_implementation_t *imp, |
3120 | | pixman_composite_info_t *info) |
3121 | 0 | { |
3122 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3123 | 0 | uint16_t *dst_line, *dst; |
3124 | 0 | uint32_t d; |
3125 | 0 | uint16_t *src_line, *src; |
3126 | 0 | uint32_t s; |
3127 | 0 | int dst_stride, src_stride; |
3128 | 0 | int32_t w; |
3129 | |
|
3130 | 0 | CHECKPOINT (); |
3131 | |
|
3132 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); |
3133 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
3134 | |
|
3135 | 0 | while (height--) |
3136 | 0 | { |
3137 | 0 | dst = dst_line; |
3138 | 0 | dst_line += dst_stride; |
3139 | 0 | src = src_line; |
3140 | 0 | src_line += src_stride; |
3141 | 0 | w = width; |
3142 | |
|
3143 | 0 | while (w && (uintptr_t)dst & 7) |
3144 | 0 | { |
3145 | 0 | s = *src++; |
3146 | 0 | if (s) |
3147 | 0 | { |
3148 | 0 | d = *dst; |
3149 | 0 | s = convert_0565_to_8888 (s); |
3150 | 0 | if (d) |
3151 | 0 | { |
3152 | 0 | d = convert_0565_to_8888 (d); |
3153 | 0 | UN8x4_ADD_UN8x4 (s, d); |
3154 | 0 | } |
3155 | 0 | *dst = convert_8888_to_0565 (s); |
3156 | 0 | } |
3157 | 0 | dst++; |
3158 | 0 | w--; |
3159 | 0 | } |
3160 | |
|
3161 | 0 | while (w >= 4) |
3162 | 0 | { |
3163 | 0 | __m64 vdest = *(__m64 *)dst; |
3164 | 0 | __m64 vsrc = ldq_u ((__m64 *)src); |
3165 | 0 | __m64 vd0, vd1; |
3166 | 0 | __m64 vs0, vs1; |
3167 | |
|
3168 | 0 | expand_4xpacked565 (vdest, &vd0, &vd1, 0); |
3169 | 0 | expand_4xpacked565 (vsrc, &vs0, &vs1, 0); |
3170 | |
|
3171 | 0 | vd0 = _mm_adds_pu8 (vd0, vs0); |
3172 | 0 | vd1 = _mm_adds_pu8 (vd1, vs1); |
3173 | |
|
3174 | 0 | *(__m64 *)dst = pack_4xpacked565 (vd0, vd1); |
3175 | |
|
3176 | 0 | dst += 4; |
3177 | 0 | src += 4; |
3178 | 0 | w -= 4; |
3179 | 0 | } |
3180 | |
|
3181 | 0 | while (w--) |
3182 | 0 | { |
3183 | 0 | s = *src++; |
3184 | 0 | if (s) |
3185 | 0 | { |
3186 | 0 | d = *dst; |
3187 | 0 | s = convert_0565_to_8888 (s); |
3188 | 0 | if (d) |
3189 | 0 | { |
3190 | 0 | d = convert_0565_to_8888 (d); |
3191 | 0 | UN8x4_ADD_UN8x4 (s, d); |
3192 | 0 | } |
3193 | 0 | *dst = convert_8888_to_0565 (s); |
3194 | 0 | } |
3195 | 0 | dst++; |
3196 | 0 | } |
3197 | 0 | } |
3198 | |
|
3199 | 0 | _mm_empty (); |
3200 | 0 | } |
3201 | | |
3202 | | static void |
3203 | | mmx_composite_add_8888_8888 (pixman_implementation_t *imp, |
3204 | | pixman_composite_info_t *info) |
3205 | 0 | { |
3206 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3207 | 0 | uint32_t *dst_line, *dst; |
3208 | 0 | uint32_t *src_line, *src; |
3209 | 0 | int dst_stride, src_stride; |
3210 | 0 | int32_t w; |
3211 | |
|
3212 | 0 | CHECKPOINT (); |
3213 | |
|
3214 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
3215 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3216 | |
|
3217 | 0 | while (height--) |
3218 | 0 | { |
3219 | 0 | dst = dst_line; |
3220 | 0 | dst_line += dst_stride; |
3221 | 0 | src = src_line; |
3222 | 0 | src_line += src_stride; |
3223 | 0 | w = width; |
3224 | |
|
3225 | 0 | while (w && (uintptr_t)dst & 7) |
3226 | 0 | { |
3227 | 0 | store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), |
3228 | 0 | load ((const uint32_t *)dst))); |
3229 | 0 | dst++; |
3230 | 0 | src++; |
3231 | 0 | w--; |
3232 | 0 | } |
3233 | |
|
3234 | 0 | while (w >= 2) |
3235 | 0 | { |
3236 | 0 | *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); |
3237 | 0 | dst += 2; |
3238 | 0 | src += 2; |
3239 | 0 | w -= 2; |
3240 | 0 | } |
3241 | |
|
3242 | 0 | if (w) |
3243 | 0 | { |
3244 | 0 | store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), |
3245 | 0 | load ((const uint32_t *)dst))); |
3246 | |
|
3247 | 0 | } |
3248 | 0 | } |
3249 | |
|
3250 | 0 | _mm_empty (); |
3251 | 0 | } |
3252 | | |
3253 | | static pixman_bool_t |
3254 | | mmx_blt (pixman_implementation_t *imp, |
3255 | | uint32_t * src_bits, |
3256 | | uint32_t * dst_bits, |
3257 | | int src_stride, |
3258 | | int dst_stride, |
3259 | | int src_bpp, |
3260 | | int dst_bpp, |
3261 | | int src_x, |
3262 | | int src_y, |
3263 | | int dest_x, |
3264 | | int dest_y, |
3265 | | int width, |
3266 | | int height) |
3267 | 0 | { |
3268 | 0 | uint8_t * src_bytes; |
3269 | 0 | uint8_t * dst_bytes; |
3270 | 0 | int byte_width; |
3271 | |
|
3272 | 0 | if (src_bpp != dst_bpp) |
3273 | 0 | return FALSE; |
3274 | | |
3275 | 0 | if (src_bpp == 16) |
3276 | 0 | { |
3277 | 0 | src_stride = src_stride * (int) sizeof (uint32_t) / 2; |
3278 | 0 | dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; |
3279 | 0 | src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); |
3280 | 0 | dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
3281 | 0 | byte_width = 2 * width; |
3282 | 0 | src_stride *= 2; |
3283 | 0 | dst_stride *= 2; |
3284 | 0 | } |
3285 | 0 | else if (src_bpp == 32) |
3286 | 0 | { |
3287 | 0 | src_stride = src_stride * (int) sizeof (uint32_t) / 4; |
3288 | 0 | dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; |
3289 | 0 | src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); |
3290 | 0 | dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
3291 | 0 | byte_width = 4 * width; |
3292 | 0 | src_stride *= 4; |
3293 | 0 | dst_stride *= 4; |
3294 | 0 | } |
3295 | 0 | else |
3296 | 0 | { |
3297 | 0 | return FALSE; |
3298 | 0 | } |
3299 | | |
3300 | 0 | while (height--) |
3301 | 0 | { |
3302 | 0 | int w; |
3303 | 0 | uint8_t *s = src_bytes; |
3304 | 0 | uint8_t *d = dst_bytes; |
3305 | 0 | src_bytes += src_stride; |
3306 | 0 | dst_bytes += dst_stride; |
3307 | 0 | w = byte_width; |
3308 | |
|
3309 | 0 | if (w >= 1 && ((uintptr_t)d & 1)) |
3310 | 0 | { |
3311 | 0 | *(uint8_t *)d = *(uint8_t *)s; |
3312 | 0 | w -= 1; |
3313 | 0 | s += 1; |
3314 | 0 | d += 1; |
3315 | 0 | } |
3316 | |
|
3317 | 0 | if (w >= 2 && ((uintptr_t)d & 3)) |
3318 | 0 | { |
3319 | 0 | *(uint16_t *)d = *(uint16_t *)s; |
3320 | 0 | w -= 2; |
3321 | 0 | s += 2; |
3322 | 0 | d += 2; |
3323 | 0 | } |
3324 | |
|
3325 | 0 | while (w >= 4 && ((uintptr_t)d & 7)) |
3326 | 0 | { |
3327 | 0 | *(uint32_t *)d = ldl_u ((uint32_t *)s); |
3328 | |
|
3329 | 0 | w -= 4; |
3330 | 0 | s += 4; |
3331 | 0 | d += 4; |
3332 | 0 | } |
3333 | |
|
3334 | 0 | while (w >= 64) |
3335 | 0 | { |
3336 | 0 | #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX |
3337 | 0 | __asm__ ( |
3338 | 0 | "movq (%1), %%mm0\n" |
3339 | 0 | "movq 8(%1), %%mm1\n" |
3340 | 0 | "movq 16(%1), %%mm2\n" |
3341 | 0 | "movq 24(%1), %%mm3\n" |
3342 | 0 | "movq 32(%1), %%mm4\n" |
3343 | 0 | "movq 40(%1), %%mm5\n" |
3344 | 0 | "movq 48(%1), %%mm6\n" |
3345 | 0 | "movq 56(%1), %%mm7\n" |
3346 | |
|
3347 | 0 | "movq %%mm0, (%0)\n" |
3348 | 0 | "movq %%mm1, 8(%0)\n" |
3349 | 0 | "movq %%mm2, 16(%0)\n" |
3350 | 0 | "movq %%mm3, 24(%0)\n" |
3351 | 0 | "movq %%mm4, 32(%0)\n" |
3352 | 0 | "movq %%mm5, 40(%0)\n" |
3353 | 0 | "movq %%mm6, 48(%0)\n" |
3354 | 0 | "movq %%mm7, 56(%0)\n" |
3355 | 0 | : |
3356 | 0 | : "r" (d), "r" (s) |
3357 | 0 | : "memory", |
3358 | 0 | "%mm0", "%mm1", "%mm2", "%mm3", |
3359 | 0 | "%mm4", "%mm5", "%mm6", "%mm7"); |
3360 | | #else |
3361 | | __m64 v0 = ldq_u ((__m64 *)(s + 0)); |
3362 | | __m64 v1 = ldq_u ((__m64 *)(s + 8)); |
3363 | | __m64 v2 = ldq_u ((__m64 *)(s + 16)); |
3364 | | __m64 v3 = ldq_u ((__m64 *)(s + 24)); |
3365 | | __m64 v4 = ldq_u ((__m64 *)(s + 32)); |
3366 | | __m64 v5 = ldq_u ((__m64 *)(s + 40)); |
3367 | | __m64 v6 = ldq_u ((__m64 *)(s + 48)); |
3368 | | __m64 v7 = ldq_u ((__m64 *)(s + 56)); |
3369 | | *(__m64 *)(d + 0) = v0; |
3370 | | *(__m64 *)(d + 8) = v1; |
3371 | | *(__m64 *)(d + 16) = v2; |
3372 | | *(__m64 *)(d + 24) = v3; |
3373 | | *(__m64 *)(d + 32) = v4; |
3374 | | *(__m64 *)(d + 40) = v5; |
3375 | | *(__m64 *)(d + 48) = v6; |
3376 | | *(__m64 *)(d + 56) = v7; |
3377 | | #endif |
3378 | |
|
3379 | 0 | w -= 64; |
3380 | 0 | s += 64; |
3381 | 0 | d += 64; |
3382 | 0 | } |
3383 | 0 | while (w >= 4) |
3384 | 0 | { |
3385 | 0 | *(uint32_t *)d = ldl_u ((uint32_t *)s); |
3386 | |
|
3387 | 0 | w -= 4; |
3388 | 0 | s += 4; |
3389 | 0 | d += 4; |
3390 | 0 | } |
3391 | 0 | if (w >= 2) |
3392 | 0 | { |
3393 | 0 | *(uint16_t *)d = *(uint16_t *)s; |
3394 | 0 | w -= 2; |
3395 | 0 | s += 2; |
3396 | 0 | d += 2; |
3397 | 0 | } |
3398 | 0 | } |
3399 | |
|
3400 | 0 | _mm_empty (); |
3401 | |
|
3402 | 0 | return TRUE; |
3403 | 0 | } |
3404 | | |
3405 | | static void |
3406 | | mmx_composite_copy_area (pixman_implementation_t *imp, |
3407 | | pixman_composite_info_t *info) |
3408 | 0 | { |
3409 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3410 | |
|
3411 | 0 | mmx_blt (imp, src_image->bits.bits, |
3412 | 0 | dest_image->bits.bits, |
3413 | 0 | src_image->bits.rowstride, |
3414 | 0 | dest_image->bits.rowstride, |
3415 | 0 | PIXMAN_FORMAT_BPP (src_image->bits.format), |
3416 | 0 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
3417 | 0 | src_x, src_y, dest_x, dest_y, width, height); |
3418 | 0 | } |
3419 | | |
3420 | | static void |
3421 | | mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, |
3422 | | pixman_composite_info_t *info) |
3423 | 0 | { |
3424 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3425 | 0 | uint32_t *src, *src_line; |
3426 | 0 | uint32_t *dst, *dst_line; |
3427 | 0 | uint8_t *mask, *mask_line; |
3428 | 0 | int src_stride, mask_stride, dst_stride; |
3429 | 0 | int32_t w; |
3430 | |
|
3431 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3432 | 0 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
3433 | 0 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
3434 | |
|
3435 | 0 | while (height--) |
3436 | 0 | { |
3437 | 0 | src = src_line; |
3438 | 0 | src_line += src_stride; |
3439 | 0 | dst = dst_line; |
3440 | 0 | dst_line += dst_stride; |
3441 | 0 | mask = mask_line; |
3442 | 0 | mask_line += mask_stride; |
3443 | |
|
3444 | 0 | w = width; |
3445 | |
|
3446 | 0 | while (w--) |
3447 | 0 | { |
3448 | 0 | uint64_t m = *mask; |
3449 | |
|
3450 | 0 | if (m) |
3451 | 0 | { |
3452 | 0 | uint32_t ssrc = *src | 0xff000000; |
3453 | 0 | __m64 s = load8888 (&ssrc); |
3454 | |
|
3455 | 0 | if (m == 0xff) |
3456 | 0 | { |
3457 | 0 | store8888 (dst, s); |
3458 | 0 | } |
3459 | 0 | else |
3460 | 0 | { |
3461 | 0 | __m64 sa = expand_alpha (s); |
3462 | 0 | __m64 vm = expand_alpha_rev (to_m64 (m)); |
3463 | 0 | __m64 vdest = in_over (s, sa, vm, load8888 (dst)); |
3464 | |
|
3465 | 0 | store8888 (dst, vdest); |
3466 | 0 | } |
3467 | 0 | } |
3468 | |
|
3469 | 0 | mask++; |
3470 | 0 | dst++; |
3471 | 0 | src++; |
3472 | 0 | } |
3473 | 0 | } |
3474 | |
|
3475 | 0 | _mm_empty (); |
3476 | 0 | } |
3477 | | |
3478 | | static void |
3479 | | mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, |
3480 | | pixman_composite_info_t *info) |
3481 | 0 | { |
3482 | 0 | PIXMAN_COMPOSITE_ARGS (info); |
3483 | 0 | uint32_t src; |
3484 | 0 | uint32_t *dst_line, *dst; |
3485 | 0 | int32_t w; |
3486 | 0 | int dst_stride; |
3487 | 0 | __m64 vsrc; |
3488 | |
|
3489 | 0 | CHECKPOINT (); |
3490 | |
|
3491 | 0 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
3492 | |
|
3493 | 0 | if (src == 0) |
3494 | 0 | return; |
3495 | | |
3496 | 0 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
3497 | |
|
3498 | 0 | vsrc = load8888 (&src); |
3499 | |
|
3500 | 0 | while (height--) |
3501 | 0 | { |
3502 | 0 | dst = dst_line; |
3503 | 0 | dst_line += dst_stride; |
3504 | 0 | w = width; |
3505 | |
|
3506 | 0 | CHECKPOINT (); |
3507 | |
|
3508 | 0 | while (w && (uintptr_t)dst & 7) |
3509 | 0 | { |
3510 | 0 | __m64 vdest = load8888 (dst); |
3511 | |
|
3512 | 0 | store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); |
3513 | |
|
3514 | 0 | w--; |
3515 | 0 | dst++; |
3516 | 0 | } |
3517 | |
|
3518 | 0 | while (w >= 2) |
3519 | 0 | { |
3520 | 0 | __m64 vdest = *(__m64 *)dst; |
3521 | 0 | __m64 dest0 = expand8888 (vdest, 0); |
3522 | 0 | __m64 dest1 = expand8888 (vdest, 1); |
3523 | | |
3524 | |
|
3525 | 0 | dest0 = over (dest0, expand_alpha (dest0), vsrc); |
3526 | 0 | dest1 = over (dest1, expand_alpha (dest1), vsrc); |
3527 | |
|
3528 | 0 | *(__m64 *)dst = pack8888 (dest0, dest1); |
3529 | |
|
3530 | 0 | dst += 2; |
3531 | 0 | w -= 2; |
3532 | 0 | } |
3533 | |
|
3534 | 0 | CHECKPOINT (); |
3535 | |
|
3536 | 0 | if (w) |
3537 | 0 | { |
3538 | 0 | __m64 vdest = load8888 (dst); |
3539 | |
|
3540 | 0 | store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); |
3541 | 0 | } |
3542 | 0 | } |
3543 | |
|
3544 | 0 | _mm_empty (); |
3545 | 0 | } |
3546 | | |
3547 | | static force_inline void |
3548 | | scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t* pd, |
3549 | | const uint32_t* ps, |
3550 | | int32_t w, |
3551 | | pixman_fixed_t vx, |
3552 | | pixman_fixed_t unit_x, |
3553 | | pixman_fixed_t src_width_fixed, |
3554 | | pixman_bool_t fully_transparent_src) |
3555 | 0 | { |
3556 | 0 | if (fully_transparent_src) |
3557 | 0 | return; |
3558 | | |
3559 | 0 | while (w) |
3560 | 0 | { |
3561 | 0 | __m64 d = load (pd); |
3562 | 0 | __m64 s = load (ps + pixman_fixed_to_int (vx)); |
3563 | 0 | vx += unit_x; |
3564 | 0 | while (vx >= 0) |
3565 | 0 | vx -= src_width_fixed; |
3566 | |
|
3567 | 0 | store8888 (pd, core_combine_over_u_pixel_mmx (s, d)); |
3568 | 0 | pd++; |
3569 | |
|
3570 | 0 | w--; |
3571 | 0 | } |
3572 | |
|
3573 | 0 | _mm_empty (); |
3574 | 0 | } |
3575 | | |
3576 | | FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER, |
3577 | | scaled_nearest_scanline_mmx_8888_8888_OVER, |
3578 | | uint32_t, uint32_t, COVER) |
3579 | | FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER, |
3580 | | scaled_nearest_scanline_mmx_8888_8888_OVER, |
3581 | | uint32_t, uint32_t, NONE) |
3582 | | FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER, |
3583 | | scaled_nearest_scanline_mmx_8888_8888_OVER, |
3584 | | uint32_t, uint32_t, PAD) |
3585 | | FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER, |
3586 | | scaled_nearest_scanline_mmx_8888_8888_OVER, |
3587 | | uint32_t, uint32_t, NORMAL) |
3588 | | |
3589 | | static force_inline void |
3590 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask, |
3591 | | uint32_t * dst, |
3592 | | const uint32_t * src, |
3593 | | int32_t w, |
3594 | | pixman_fixed_t vx, |
3595 | | pixman_fixed_t unit_x, |
3596 | | pixman_fixed_t src_width_fixed, |
3597 | | pixman_bool_t zero_src) |
3598 | 0 | { |
3599 | 0 | __m64 mm_mask; |
3600 | |
|
3601 | 0 | if (zero_src || (*mask >> 24) == 0) |
3602 | 0 | { |
3603 | | /* A workaround for https://gcc.gnu.org/PR47759 */ |
3604 | 0 | _mm_empty (); |
3605 | 0 | return; |
3606 | 0 | } |
3607 | | |
3608 | 0 | mm_mask = expand_alpha (load8888 (mask)); |
3609 | |
|
3610 | 0 | while (w) |
3611 | 0 | { |
3612 | 0 | uint32_t s = *(src + pixman_fixed_to_int (vx)); |
3613 | 0 | vx += unit_x; |
3614 | 0 | while (vx >= 0) |
3615 | 0 | vx -= src_width_fixed; |
3616 | |
|
3617 | 0 | if (s) |
3618 | 0 | { |
3619 | 0 | __m64 ms = load8888 (&s); |
3620 | 0 | __m64 alpha = expand_alpha (ms); |
3621 | 0 | __m64 dest = load8888 (dst); |
3622 | |
|
3623 | 0 | store8888 (dst, (in_over (ms, alpha, mm_mask, dest))); |
3624 | 0 | } |
3625 | |
|
3626 | 0 | dst++; |
3627 | 0 | w--; |
3628 | 0 | } |
3629 | |
|
3630 | 0 | _mm_empty (); |
3631 | 0 | } |
3632 | | |
3633 | | FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER, |
3634 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER, |
3635 | | uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) |
3636 | | FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER, |
3637 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER, |
3638 | | uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) |
3639 | | FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER, |
3640 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER, |
3641 | | uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) |
3642 | | FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER, |
3643 | | scaled_nearest_scanline_mmx_8888_n_8888_OVER, |
3644 | | uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) |
3645 | | |
3646 | 0 | #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) |
3647 | 0 | #define BMSK (BSHIFT - 1) |
3648 | | |
3649 | | #define BILINEAR_DECLARE_VARIABLES \ |
3650 | 0 | const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \ |
3651 | 0 | const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \ |
3652 | 0 | const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \ |
3653 | 0 | const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \ |
3654 | 0 | const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \ |
3655 | 0 | const __m64 mm_zero = _mm_setzero_si64 (); \ |
3656 | 0 | __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) |
3657 | | |
3658 | 0 | #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ |
3659 | 0 | do { \ |
3660 | 0 | /* fetch 2x2 pixel block into 2 mmx registers */ \ |
3661 | 0 | __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \ |
3662 | 0 | __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \ |
3663 | 0 | /* vertical interpolation */ \ |
3664 | 0 | __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \ |
3665 | 0 | __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \ |
3666 | 0 | __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \ |
3667 | 0 | __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ |
3668 | 0 | __m64 hi = _mm_add_pi16 (t_hi, b_hi); \ |
3669 | 0 | __m64 lo = _mm_add_pi16 (t_lo, b_lo); \ |
3670 | 0 | /* calculate horizontal weights */ \ |
3671 | 0 | __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \ |
3672 | 0 | _mm_srli_pi16 (mm_x, \ |
3673 | 0 | 16 - BILINEAR_INTERPOLATION_BITS))); \ |
3674 | 0 | /* horizontal interpolation */ \ |
3675 | 0 | __m64 p = _mm_unpacklo_pi16 (lo, hi); \ |
3676 | 0 | __m64 q = _mm_unpackhi_pi16 (lo, hi); \ |
3677 | 0 | vx += unit_x; \ |
3678 | 0 | lo = _mm_madd_pi16 (p, mm_wh); \ |
3679 | 0 | hi = _mm_madd_pi16 (q, mm_wh); \ |
3680 | 0 | mm_x = _mm_add_pi16 (mm_x, mm_ux); \ |
3681 | 0 | /* shift and pack the result */ \ |
3682 | 0 | hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \ |
3683 | 0 | lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \ |
3684 | 0 | lo = _mm_packs_pi32 (lo, hi); \ |
3685 | 0 | lo = _mm_packs_pu16 (lo, lo); \ |
3686 | 0 | pix = lo; \ |
3687 | 0 | } while (0) |
3688 | | |
3689 | 0 | #define BILINEAR_SKIP_ONE_PIXEL() \ |
3690 | 0 | do { \ |
3691 | 0 | vx += unit_x; \ |
3692 | 0 | mm_x = _mm_add_pi16 (mm_x, mm_ux); \ |
3693 | 0 | } while(0) |
3694 | | |
3695 | | static force_inline void |
3696 | | scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst, |
3697 | | const uint32_t * mask, |
3698 | | const uint32_t * src_top, |
3699 | | const uint32_t * src_bottom, |
3700 | | int32_t w, |
3701 | | int wt, |
3702 | | int wb, |
3703 | | pixman_fixed_t vx, |
3704 | | pixman_fixed_t unit_x, |
3705 | | pixman_fixed_t max_vx, |
3706 | | pixman_bool_t zero_src) |
3707 | 0 | { |
3708 | 0 | BILINEAR_DECLARE_VARIABLES; |
3709 | 0 | __m64 pix; |
3710 | |
|
3711 | 0 | while (w--) |
3712 | 0 | { |
3713 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix); |
3714 | 0 | store (dst, pix); |
3715 | 0 | dst++; |
3716 | 0 | } |
3717 | |
|
3718 | 0 | _mm_empty (); |
3719 | 0 | } |
3720 | | |
3721 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, |
3722 | | scaled_bilinear_scanline_mmx_8888_8888_SRC, |
3723 | | uint32_t, uint32_t, uint32_t, |
3724 | | COVER, FLAG_NONE) |
3725 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, |
3726 | | scaled_bilinear_scanline_mmx_8888_8888_SRC, |
3727 | | uint32_t, uint32_t, uint32_t, |
3728 | | PAD, FLAG_NONE) |
3729 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, |
3730 | | scaled_bilinear_scanline_mmx_8888_8888_SRC, |
3731 | | uint32_t, uint32_t, uint32_t, |
3732 | | NONE, FLAG_NONE) |
3733 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, |
3734 | | scaled_bilinear_scanline_mmx_8888_8888_SRC, |
3735 | | uint32_t, uint32_t, uint32_t, |
3736 | | NORMAL, FLAG_NONE) |
3737 | | |
3738 | | static force_inline void |
3739 | | scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst, |
3740 | | const uint32_t * mask, |
3741 | | const uint32_t * src_top, |
3742 | | const uint32_t * src_bottom, |
3743 | | int32_t w, |
3744 | | int wt, |
3745 | | int wb, |
3746 | | pixman_fixed_t vx, |
3747 | | pixman_fixed_t unit_x, |
3748 | | pixman_fixed_t max_vx, |
3749 | | pixman_bool_t zero_src) |
3750 | 0 | { |
3751 | 0 | BILINEAR_DECLARE_VARIABLES; |
3752 | 0 | __m64 pix1, pix2; |
3753 | |
3754 | 0 | while (w) |
3755 | 0 | { |
3756 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
3757 | |
3758 | 0 | if (!is_zero (pix1)) |
3759 | 0 | { |
3760 | 0 | pix2 = load (dst); |
3761 | 0 | store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2)); |
3762 | 0 | } |
3763 | |
3764 | 0 | w--; |
3765 | 0 | dst++; |
3766 | 0 | } |
3767 | |
3768 | 0 | _mm_empty (); |
3769 | 0 | } |
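
The per-pixel combine used above, core_combine_over_u_pixel_mmx, applies the premultiplied Porter-Duff OVER operator: dest = src + dest * (1 - src_alpha), evaluated per 8-bit channel with a rounded divide by 255 and a saturating add. A minimal scalar sketch, assuming premultiplied a8r8g8b8 pixels (the helper names are illustrative, not pixman's):

    #include <stdint.h>

    /* Rounded a * b / 255 for 8-bit values (the standard exact trick). */
    static inline uint8_t
    mul_un8_sketch (uint8_t a, uint8_t b)
    {
        unsigned t = (unsigned) a * b + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    /* Premultiplied OVER of one a8r8g8b8 pixel over another. */
    static inline uint32_t
    over_sketch (uint32_t src, uint32_t dst)
    {
        uint8_t  inv_alpha = 255 - (uint8_t) (src >> 24);
        uint32_t result = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            unsigned s = (src >> shift) & 0xff;
            unsigned d = (dst >> shift) & 0xff;
            unsigned c = s + mul_un8_sketch ((uint8_t) d, inv_alpha);

            if (c > 255)    /* mirrors the saturating MMX add */
                c = 255;
            result |= c << shift;
        }
        return result;
    }

The is_zero () test in the loop simply skips this work when the interpolated source is fully transparent, in which case OVER leaves the destination unchanged.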
3770 | | |
3771 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, |
3772 | | scaled_bilinear_scanline_mmx_8888_8888_OVER, |
3773 | | uint32_t, uint32_t, uint32_t, |
3774 | | COVER, FLAG_NONE) |
3775 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER, |
3776 | | scaled_bilinear_scanline_mmx_8888_8888_OVER, |
3777 | | uint32_t, uint32_t, uint32_t, |
3778 | | PAD, FLAG_NONE) |
3779 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER, |
3780 | | scaled_bilinear_scanline_mmx_8888_8888_OVER, |
3781 | | uint32_t, uint32_t, uint32_t, |
3782 | | NONE, FLAG_NONE) |
3783 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER, |
3784 | | scaled_bilinear_scanline_mmx_8888_8888_OVER, |
3785 | | uint32_t, uint32_t, uint32_t, |
3786 | | NORMAL, FLAG_NONE) |
3787 | | |
3788 | | static force_inline void |
3789 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst, |
3790 | | const uint8_t * mask, |
3791 | | const uint32_t * src_top, |
3792 | | const uint32_t * src_bottom, |
3793 | | int32_t w, |
3794 | | int wt, |
3795 | | int wb, |
3796 | | pixman_fixed_t vx, |
3797 | | pixman_fixed_t unit_x, |
3798 | | pixman_fixed_t max_vx, |
3799 | | pixman_bool_t zero_src) |
3800 | 0 | { |
3801 | 0 | BILINEAR_DECLARE_VARIABLES; |
3802 | 0 | __m64 pix1, pix2; |
3803 | 0 | uint32_t m; |
3804 | |
3805 | 0 | while (w) |
3806 | 0 | { |
3807 | 0 | m = (uint32_t) *mask++; |
3808 | |
3809 | 0 | if (m) |
3810 | 0 | { |
3811 | 0 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
3812 | |
3813 | 0 | if (m == 0xff && is_opaque (pix1)) |
3814 | 0 | { |
3815 | 0 | store (dst, pix1); |
3816 | 0 | } |
3817 | 0 | else |
3818 | 0 | { |
3819 | 0 | __m64 ms, md, ma, msa; |
3820 | |
3821 | 0 | pix2 = load (dst); |
3822 | 0 | ma = expand_alpha_rev (to_m64 (m)); |
3823 | 0 | ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ()); |
3824 | 0 | md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ()); |
3825 | |
3826 | 0 | msa = expand_alpha (ms); |
3827 | |
3828 | 0 | store8888 (dst, (in_over (ms, msa, ma, md))); |
3829 | 0 | } |
3830 | 0 | } |
3831 | 0 | else |
3832 | 0 | { |
3833 | 0 | BILINEAR_SKIP_ONE_PIXEL (); |
3834 | 0 | } |
3835 | |
3836 | 0 | w--; |
3837 | 0 | dst++; |
3838 | 0 | } |
3839 | |
3840 | 0 | _mm_empty (); |
3841 | 0 | } |
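
When the mask is neither zero nor the fully-opaque fast case, the scanline falls back to in_over: the interpolated source (including its alpha) is first scaled by the a8 mask value, and the result is then composited with OVER, i.e. dest = s*m + dest * (1 - sa*m) per channel. A minimal scalar sketch, reusing mul_un8_sketch () and over_sketch () from the previous illustration (all three names are assumptions made for these sketches):

    /* Scale every channel, including alpha, by the mask ("IN"),
     * then apply the ordinary premultiplied OVER. */
    static inline uint32_t
    in_over_sketch (uint32_t src, uint8_t mask, uint32_t dst)
    {
        uint32_t masked_src = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
            masked_src |= (uint32_t) mul_un8_sketch ((src >> shift) & 0xff, mask) << shift;

        return over_sketch (masked_src, dst);
    }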
3842 | | |
3843 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, |
3844 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
3845 | | uint32_t, uint8_t, uint32_t, |
3846 | | COVER, FLAG_HAVE_NON_SOLID_MASK) |
3847 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER, |
3848 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
3849 | | uint32_t, uint8_t, uint32_t, |
3850 | | PAD, FLAG_HAVE_NON_SOLID_MASK) |
3851 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER, |
3852 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
3853 | | uint32_t, uint8_t, uint32_t, |
3854 | | NONE, FLAG_HAVE_NON_SOLID_MASK) |
3855 | | FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER, |
3856 | | scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
3857 | | uint32_t, uint8_t, uint32_t, |
3858 | | NORMAL, FLAG_HAVE_NON_SOLID_MASK) |
3859 | | |
3860 | | static uint32_t * |
3861 | | mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) |
3862 | 0 | { |
3863 | 0 | int w = iter->width; |
3864 | 0 | uint32_t *dst = iter->buffer; |
3865 | 0 | uint32_t *src = (uint32_t *)iter->bits; |
3866 | |
3867 | 0 | iter->bits += iter->stride; |
3868 | |
3869 | 0 | while (w && ((uintptr_t)dst) & 7) |
3870 | 0 | { |
3871 | 0 | *dst++ = (*src++) | 0xff000000; |
3872 | 0 | w--; |
3873 | 0 | } |
3874 | |
3875 | 0 | while (w >= 8) |
3876 | 0 | { |
3877 | 0 | __m64 vsrc1 = ldq_u ((__m64 *)(src + 0)); |
3878 | 0 | __m64 vsrc2 = ldq_u ((__m64 *)(src + 2)); |
3879 | 0 | __m64 vsrc3 = ldq_u ((__m64 *)(src + 4)); |
3880 | 0 | __m64 vsrc4 = ldq_u ((__m64 *)(src + 6)); |
3881 | |
3882 | 0 | *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000)); |
3883 | 0 | *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000)); |
3884 | 0 | *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000)); |
3885 | 0 | *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000)); |
3886 | |
3887 | 0 | dst += 8; |
3888 | 0 | src += 8; |
3889 | 0 | w -= 8; |
3890 | 0 | } |
3891 | |
3892 | 0 | while (w) |
3893 | 0 | { |
3894 | 0 | *dst++ = (*src++) | 0xff000000; |
3895 | 0 | w--; |
3896 | 0 | } |
3897 | |
3898 | 0 | _mm_empty (); |
3899 | 0 | return iter->buffer; |
3900 | 0 | } |
3901 | | |
3902 | | static uint32_t * |
3903 | | mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) |
3904 | 0 | { |
3905 | 0 | int w = iter->width; |
3906 | 0 | uint32_t *dst = iter->buffer; |
3907 | 0 | uint16_t *src = (uint16_t *)iter->bits; |
3908 | |
3909 | 0 | iter->bits += iter->stride; |
3910 | |
3911 | 0 | while (w && ((uintptr_t)dst) & 0x0f) |
3912 | 0 | { |
3913 | 0 | uint16_t s = *src++; |
3914 | |
3915 | 0 | *dst++ = convert_0565_to_8888 (s); |
3916 | 0 | w--; |
3917 | 0 | } |
3918 | |
3919 | 0 | while (w >= 4) |
3920 | 0 | { |
3921 | 0 | __m64 vsrc = ldq_u ((__m64 *)src); |
3922 | 0 | __m64 mm0, mm1; |
3923 | |
3924 | 0 | expand_4xpacked565 (vsrc, &mm0, &mm1, 1); |
3925 | |
3926 | 0 | *(__m64 *)(dst + 0) = mm0; |
3927 | 0 | *(__m64 *)(dst + 2) = mm1; |
3928 | |
3929 | 0 | dst += 4; |
3930 | 0 | src += 4; |
3931 | 0 | w -= 4; |
3932 | 0 | } |
3933 | |
3934 | 0 | while (w) |
3935 | 0 | { |
3936 | 0 | uint16_t s = *src++; |
3937 | |
3938 | 0 | *dst++ = convert_0565_to_8888 (s); |
3939 | 0 | w--; |
3940 | 0 | } |
3941 | |
3942 | 0 | _mm_empty (); |
3943 | 0 | return iter->buffer; |
3944 | 0 | } |
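
The wide loop above expands four r5g6b5 pixels at a time with expand_4xpacked565, while the unaligned head and tail pixels go through convert_0565_to_8888. A minimal scalar sketch of that per-pixel widening, assuming the usual bit-replication scheme so that full-scale 5- and 6-bit components map to 0xff (the function name below is illustrative, not pixman's):

    #include <stdint.h>

    /* Widen one r5g6b5 pixel to a8r8g8b8: extract the 5/6/5 components,
     * replicate their top bits into the vacated low bits, and force
     * alpha to 0xff. */
    static inline uint32_t
    convert_0565_to_8888_sketch (uint16_t s)
    {
        uint32_t r5 = (s >> 11) & 0x1f;
        uint32_t g6 = (s >>  5) & 0x3f;
        uint32_t b5 =  s        & 0x1f;

        uint32_t r8 = (r5 << 3) | (r5 >> 2);
        uint32_t g8 = (g6 << 2) | (g6 >> 4);
        uint32_t b8 = (b5 << 3) | (b5 >> 2);

        return 0xff000000 | (r8 << 16) | (g8 << 8) | b8;
    }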
3945 | | |
3946 | | static uint32_t * |
3947 | | mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) |
3948 | 0 | { |
3949 | 0 | int w = iter->width; |
3950 | 0 | uint32_t *dst = iter->buffer; |
3951 | 0 | uint8_t *src = iter->bits; |
3952 | |
3953 | 0 | iter->bits += iter->stride; |
3954 | |
3955 | 0 | while (w && (((uintptr_t)dst) & 15)) |
3956 | 0 | { |
3957 | 0 | *dst++ = (uint32_t)*(src++) << 24; |
3958 | 0 | w--; |
3959 | 0 | } |
3960 | |
3961 | 0 | while (w >= 8) |
3962 | 0 | { |
3963 | 0 | __m64 mm0 = ldq_u ((__m64 *)src); |
3964 | |
3965 | 0 | __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0); |
3966 | 0 | __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0); |
3967 | 0 | __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1); |
3968 | 0 | __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1); |
3969 | 0 | __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2); |
3970 | 0 | __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2); |
3971 | |
3972 | 0 | *(__m64 *)(dst + 0) = mm3; |
3973 | 0 | *(__m64 *)(dst + 2) = mm4; |
3974 | 0 | *(__m64 *)(dst + 4) = mm5; |
3975 | 0 | *(__m64 *)(dst + 6) = mm6; |
3976 | |
3977 | 0 | dst += 8; |
3978 | 0 | src += 8; |
3979 | 0 | w -= 8; |
3980 | 0 | } |
3981 | |
3982 | 0 | while (w) |
3983 | 0 | { |
3984 | 0 | *dst++ = (uint32_t)*(src++) << 24; |
3985 | 0 | w--; |
3986 | 0 | } |
3987 | |
3988 | 0 | _mm_empty (); |
3989 | 0 | return iter->buffer; |
3990 | 0 | } |
3991 | | |
3992 | | #define IMAGE_FLAGS \ |
3993 | | (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ |
3994 | | FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) |
3995 | | |
3996 | | static const pixman_iter_info_t mmx_iters[] = |
3997 | | { |
3998 | | { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, |
3999 | | _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL |
4000 | | }, |
4001 | | { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW, |
4002 | | _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL |
4003 | | }, |
4004 | | { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, |
4005 | | _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL |
4006 | | }, |
4007 | | { PIXMAN_null }, |
4008 | | }; |
4009 | | |
4010 | | static const pixman_fast_path_t mmx_fast_paths[] = |
4011 | | { |
4012 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ), |
4013 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ), |
4014 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ), |
4015 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ), |
4016 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ), |
4017 | | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ), |
4018 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ), |
4019 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ), |
4020 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ), |
4021 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ), |
4022 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ), |
4023 | | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ), |
4024 | | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ), |
4025 | | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ), |
4026 | | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ), |
4027 | | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ), |
4028 | | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ), |
4029 | | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ), |
4030 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ), |
4031 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ), |
4032 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ), |
4033 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ), |
4034 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ), |
4035 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ), |
4036 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ), |
4037 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ), |
4038 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ), |
4039 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), |
4040 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ), |
4041 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ), |
4042 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ), |
4043 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ), |
4044 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ), |
4045 | | PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ), |
4046 | | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
4047 | | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
4048 | | |
4049 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ), |
4050 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ), |
4051 | | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ), |
4052 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ), |
4053 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ), |
4054 | | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ), |
4055 | | |
4056 | | PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888), |
4057 | | PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888), |
4058 | | |
4059 | | PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ), |
4060 | | PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ), |
4061 | | PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ), |
4062 | | PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ), |
4063 | | PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), |
4064 | | PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), |
4065 | | |
4066 | | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), |
4067 | | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), |
4068 | | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), |
4069 | | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), |
4070 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), |
4071 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ), |
4072 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ), |
4073 | | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ), |
4074 | | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ), |
4075 | | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ), |
4076 | | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
4077 | | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
4078 | | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
4079 | | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
4080 | | PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ), |
4081 | | PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ), |
4082 | | |
4083 | | PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ), |
4084 | | PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ), |
4085 | | |
4086 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
4087 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
4088 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), |
4089 | | SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), |
4090 | | |
4091 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888 ), |
4092 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888 ), |
4093 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888 ), |
4094 | | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888 ), |
4095 | | |
4096 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), |
4097 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
4098 | | SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
4099 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), |
4100 | | SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
4101 | | SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
4102 | | |
4103 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
4104 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
4105 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), |
4106 | | SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), |
4107 | | |
4108 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ), |
4109 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ), |
4110 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ), |
4111 | | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ), |
4112 | | |
4113 | | { PIXMAN_OP_NONE }, |
4114 | | }; |
4115 | | |
4116 | | pixman_implementation_t * |
4117 | | _pixman_implementation_create_mmx (pixman_implementation_t *fallback) |
4118 | 108 | { |
4119 | 108 | pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); |
4120 | | |
4121 | 108 | imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; |
4122 | 108 | imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; |
4123 | 108 | imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; |
4124 | 108 | imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; |
4125 | 108 | imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; |
4126 | 108 | imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; |
4127 | 108 | imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; |
4128 | 108 | imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; |
4129 | 108 | imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; |
4130 | 108 | imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; |
4131 | 108 | imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; |
4132 | | |
4133 | 108 | imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; |
4134 | 108 | imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; |
4135 | 108 | imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; |
4136 | 108 | imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; |
4137 | 108 | imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; |
4138 | 108 | imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; |
4139 | 108 | imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; |
4140 | 108 | imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; |
4141 | 108 | imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; |
4142 | 108 | imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; |
4143 | 108 | imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; |
4144 | | |
4145 | 108 | imp->blt = mmx_blt; |
4146 | 108 | imp->fill = mmx_fill; |
4147 | | |
4148 | 108 | imp->iter_info = mmx_iters; |
4149 | | |
4150 | 108 | return imp; |
4151 | 108 | } |
4152 | | |
4153 | | #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */ |