Coverage Report

Created: 2025-07-07 10:01

/work/workdir/UnpackedTarball/pixman/pixman/pixman-mmx.c

All instrumented lines in this file have an execution count of 0; no line of pixman-mmx.c was executed in this run.
/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#else
#include <mmintrin.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}
#endif

#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
        : "=r" (ret)
        : "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
        : "+y" (__A)
        : "y" (__B)
    );
    return __A;
}

# define _mm_shuffle_pi16(A, N)                                 \
    ({                                                          \
        __m64 ret;                                              \
                                                                \
        asm ("pshufw %2, %1, %0\n\t"                            \
             : "=y" (ret)                                       \
             : "y" (A), "K" ((const int8_t)N)                   \
        );                                                      \
                                                                \
        ret;                                                    \
    })
# endif
#endif

#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif

/* Notes about writing mmx code
 *
 * Give memory operands as the second operand. If you give them as the
 * first, gcc will first load them into a register and then use that
 * register.
 *
 *   i.e. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies: when you need a value, try to
 * calculate it from a value that was computed as early as possible.
 */

/* --------------- MMX primitives ------------------------------------- */

/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# define M64_MEMBER m64_u64
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.  If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif

typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field =   { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field =   val ## ULL
#endif

static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
};

#ifdef USE_CVT_INTRINSICS
#    define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
#    define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
#    define MC(x) c.mmx_ ## x
#endif

static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#elif defined USE_M64_DOUBLE
    return *(__m64 *)&x;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}

static force_inline __m64
shift (__m64 v,
       int   s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}

/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
 * and maps its result to the same range.
 *
 * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
 * Notation, Notation, Notation", the first of which is
 *
 *   prod(a, b) = (a * b + 128) / 255.
 *
 * By approximating the division by 255 as 257/65536 it can be replaced by a
 * multiply and a right shift. This is the implementation that we use in
 * pix_multiply(), but we use _mm_mulhi_pu16() to multiply by 257 (an
 * instruction that is part of SSE1 or Extended 3DNow!, and was unavailable
 * at the time of the book's publication), performing the multiplication and
 * the right shift in a single operation.
 *
 *   prod(a, b) = ((a * b + 128) * 257) >> 16.
 *
 * A third way (how pix_multiply() was implemented prior to 14208344) also
 * exists; it performs the multiplication by 257 with adds and shifts.
 *
 * Where temp = a * b + 128
 *
 *   prod(a, b) = (temp + (temp >> 8)) >> 8.
 */
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}
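
/* A quick sanity check of the approximation used by pix_multiply():
 * for a = b = 0x80, the exact value is (0x80 * 0x80 + 128) / 255 = 64.75,
 * while the approximation gives ((0x80 * 0x80 + 128) * 257) >> 16
 * = 4243584 >> 16 = 64.  The endpoints are exact: prod(0, x) = 0 and
 * prod(0xff, 0xff) = (65153 * 257) >> 16 = 0xff.
 */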

static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}

static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over (__m64 src,
      __m64 srca,
      __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

#define in_over(src, srca, mask, dest)          \
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif
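
/* in_over() computes (src IN mask) OVER dest.  Expanding the helpers:
 *
 *   in_over (s, sa, m, d) = s * m + d * (1 - sa * m)
 *
 * i.e. both the color channels and the alpha of the source are
 * multiplied by the mask before the OVER step.
 */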

/* Elemental unaligned loads */

static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed, but that's no excuse */
    __m64 r;
    memcpy(&r, p, sizeof(__m64));
    return r;
#elif defined USE_ARM_IWMMXT
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;
    if (align == 0)
        return *p;
    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
#else
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}

static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    uint32_t r;
    memcpy(&r, p, sizeof(uint32_t));
    return r;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}
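
/* The memcpy() in the unaligned loaders above is not a pessimization:
 * dereferencing a misaligned __m64 * would be undefined behaviour even
 * though x86 hardware tolerates misaligned accesses, and compilers
 * typically lower a fixed-size memcpy like this to a single unaligned
 * load, so no function call should remain in the generated code.
 */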

static force_inline __m64
load (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    __m64 ret;
    asm ("lwc1 %0, %1\n\t"
        : "=f" (ret)
        : "m" (*v)
    );
    return ret;
#else
    return _mm_cvtsi32_si64 (*v);
#endif
}

static force_inline __m64
load8888 (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
#else
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
#endif
}

static force_inline __m64
load8888u (const uint32_t *v)
{
    uint32_t l = ldl_u (v);
    return load8888 (&l);
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline void
store (uint32_t *dest, __m64 v)
{
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"
        : "=m" (*dest)
        : "f" (v)
        : "memory"
    );
#else
    *dest = _mm_cvtsi64_si32 (v);
#endif
}

static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
    v = pack8888 (v, _mm_setzero_si64 ());
    store (dest, v);
}

static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
{
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return a == b;
#else
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
#endif
}

static force_inline pixman_bool_t
is_opaque (__m64 v)
{
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
#else
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
#endif
}
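
/* The 0x40 test in is_opaque() works because the function is given
 * pixels already expanded to 16 bits per channel (00AA00RR00GG00BB):
 * the alpha value lives in byte 6 of the __m64, so after the bytewise
 * compare against all-ones, bit 6 of the movemask is set exactly when
 * alpha == 0xff.
 */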

static force_inline pixman_bool_t
is_zero (__m64 v)
{
    return is_equal (v, _mm_setzero_si64 ());
}

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
#else
    p = shift (shift (p, (3 - pos) * 16), -48);
#endif

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
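
/* Worked example for expand565(): with the pixel in the low word,
 * t1 = p << 25 moves the red bits (11-15) up to bits 36-40 and
 * t2 = p << 11 moves the green bits (5-10) up to bits 16-21; blue stays
 * at bits 0-4.  Masking with 565_rgb (0x000001f0003f001f) keeps one
 * copy of each field, and each lane of 565_unpack_multiplier
 * (0x0000008404100840) is chosen so that the multiply leaves the field
 * replicated to 8 bits in the top byte of its 16-bit lane, which the
 * final >> 8 brings down to a full 8-bit channel.
 */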

/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBB AARRGGBB
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));

    if (full_alpha)
        alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());  /* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());  /* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());  /* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);             /* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);                 /* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);          /* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);          /* A3 R3 G3 B3 A2 R2 G2 B2 */
}
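
/* The shift pairs above are the standard bit-replication widening:
 * a 5-bit field v becomes (v << 3) | (v >> 2) and a 6-bit field
 * (v << 2) | (v >> 4), so 0x1f and 0x3f map to 0xff while 0x00 maps to
 * 0x00, spreading the output codes evenly over the whole 0-255 range.
 */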

static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}

static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
{
    __m64 v0, v1;
    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    *vout0 = expand8888 (v0, 0);
    *vout1 = expand8888 (v0, 1);
    *vout2 = expand8888 (v1, 0);
    *vout3 = expand8888 (v1, 1);
}

static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0  + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);
#else
    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0  + 3) + pos * 16);

    if (pos == 0)
        t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
        t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
        t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
        t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
#endif
}

static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift(t0, -5);
#ifdef USE_ARM_IWMMXT
    t1 = shift(t1, -5);
    return _mm_packs_pu32 (t0, t1);
#else
    t1 = shift(t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
#endif
}
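
/* How the madd trick in pack_4xpacked565() works, per 32-bit pixel:
 * after masking with packed_565_rb, the 5 significant blue bits sit at
 * bits 3-7 of the low 16-bit half and the 5 red bits at bits 3-7 of the
 * high half.  _mm_madd_pi16 with the lane pair (0x0004, 0x2000)
 * computes B * 4 + R * 0x2000, placing blue at bits 5-9 and red at
 * bits 16-20 with no overlap.  Or-ing in green (bits 10-15) and
 * shifting right by 5 then leaves a complete 565 pixel in the low half
 * of each 32-bit lane.
 */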

#ifndef _MSC_VER

static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
{
    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
}

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

#define pack_4x565(v0, v1, v2, v3) \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

#define pix_add_mul(x, a, y, b)  \
    ( x = pix_multiply (x, a),   \
      y = pix_multiply (y, b),   \
      pix_add (x, y) )

#endif

/* --------------- MMX code patch for fbcompose.c --------------------- */

static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
{
    __m64 vsrc = load8888 (src);

    if (mask)
    {
        __m64 m = load8888 (mask);

        m = expand_alpha (m);
        vsrc = pix_multiply (vsrc, m);
    }

    return vsrc;
}
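
/* combine() yields the effective source pixel for the *_u combiners
 * below: the source itself when there is no mask, otherwise the source
 * multiplied by the mask's alpha channel (a unified-alpha IN).
 */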

static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());

    if (is_opaque (vsrc))
    {
        return vsrc;
    }
    else if (!is_zero (vsrc))
    {
        return over (vsrc, expand_alpha (vsrc),
                     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    }

    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
}

static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 vsrc = combine (src, mask);

        if (is_opaque (vsrc))
        {
            store8888 (dest, vsrc);
        }
        else if (!is_zero (vsrc))
        {
            __m64 sa = expand_alpha (vsrc);
            store8888 (dest, over (vsrc, sa, load8888 (dest)));
        }

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d, da;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        da = expand_alpha (d);
        store8888 (dest, over (d, da, s));

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 da, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        sia = negate (sia);
        da = expand_alpha (d);
        s = pix_add_mul (s, da, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end;

    end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sa;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sa = expand_alpha (s);
        dia = expand_alpha (d);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sa);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        dia = expand_alpha (d);
        sia = negate (sia);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        s = pix_add (s, d);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t *               dest,
                        const uint32_t *         src,
                        const uint32_t *         mask,
                        int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        uint32_t s, sa, da;
        uint32_t d = *dest;
        __m64 ms = combine (src, mask);
        __m64 md = load8888 (dest);

        store8888(&s, ms);
        da = ~d >> 24;
        sa = s >> 24;

        if (sa > da)
        {
            uint32_t quot = DIV_UN8 (da, sa) << 24;
            __m64 msa = load8888 (&quot);
            msa = expand_alpha (msa);
            ms = pix_multiply (ms, msa);
        }

        md = pix_add (md, ms);
        store8888 (dest, md);

        ++src;
        ++dest;
        if (mask)
            mask++;
    }
    _mm_empty ();
}
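
/* SATURATE adds only as much of the source as the destination can still
 * absorb: da = ~d >> 24 is the remaining alpha headroom, and when the
 * source alpha sa exceeds it, every source channel is first scaled by
 * da/sa, so the add tops out at full alpha rather than clamping the
 * channels independently.
 */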

static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);

        s = pix_multiply (s, a);
        store8888 (dest, s);

        ++src;
        ++mask;
        ++dest;
    }
    _mm_empty ();
}

static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        store8888 (dest, in_over (s, sa, a, d));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        store8888 (dest, over (d, da, in (s, a)));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        da = negate (da);
        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);

        s = pix_multiply (s, a);
        d = pix_add (s, d);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

/* ------------- MMX code paths called from fbpict.c -------------------- */

static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

            w--;
            dst++;
        }

        while (w >= 2)
        {
            __m64 vdest;
            __m64 dest0, dest1;

            vdest = *(__m64 *)dst;

            dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
            dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
        }
    }

    _mm_empty ();
}
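
/* The composite paths below all follow the pattern used above: handle
 * single pixels until dst reaches 8-byte alignment, run an unrolled
 * loop on whole __m64 words, then finish the remainder pixel by pixel.
 * The trailing _mm_empty() is required on x86 because the MMX registers
 * alias the x87 floating-point register stack.
 */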
1449
1450
static void
1451
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1452
                           pixman_composite_info_t *info)
1453
0
{
1454
0
    PIXMAN_COMPOSITE_ARGS (info);
1455
0
    uint32_t src;
1456
0
    uint16_t    *dst_line, *dst;
1457
0
    int32_t w;
1458
0
    int dst_stride;
1459
0
    __m64 vsrc, vsrca;
1460
1461
0
    CHECKPOINT ();
1462
1463
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1464
1465
0
    if (src == 0)
1466
0
  return;
1467
1468
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1469
1470
0
    vsrc = load8888 (&src);
1471
0
    vsrca = expand_alpha (vsrc);
1472
1473
0
    while (height--)
1474
0
    {
1475
0
  dst = dst_line;
1476
0
  dst_line += dst_stride;
1477
0
  w = width;
1478
1479
0
  CHECKPOINT ();
1480
1481
0
  while (w && (uintptr_t)dst & 7)
1482
0
  {
1483
0
      uint64_t d = *dst;
1484
0
      __m64 vdest = expand565 (to_m64 (d), 0);
1485
1486
0
      vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1487
0
      *dst = to_uint64 (vdest);
1488
1489
0
      w--;
1490
0
      dst++;
1491
0
  }
1492
1493
0
  while (w >= 4)
1494
0
  {
1495
0
      __m64 vdest = *(__m64 *)dst;
1496
0
      __m64 v0, v1, v2, v3;
1497
1498
0
      expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1499
1500
0
      v0 = over (vsrc, vsrca, v0);
1501
0
      v1 = over (vsrc, vsrca, v1);
1502
0
      v2 = over (vsrc, vsrca, v2);
1503
0
      v3 = over (vsrc, vsrca, v3);
1504
1505
0
      *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1506
1507
0
      dst += 4;
1508
0
      w -= 4;
1509
0
  }
1510
1511
0
  CHECKPOINT ();
1512
1513
0
  while (w)
1514
0
  {
1515
0
      uint64_t d = *dst;
1516
0
      __m64 vdest = expand565 (to_m64 (d), 0);
1517
1518
0
      vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1519
0
      *dst = to_uint64 (vdest);
1520
1521
0
      w--;
1522
0
      dst++;
1523
0
  }
1524
0
    }
1525
1526
0
    _mm_empty ();
1527
0
}
1528
1529
static void
1530
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1531
                                   pixman_composite_info_t *info)
1532
0
{
1533
0
    PIXMAN_COMPOSITE_ARGS (info);
1534
0
    uint32_t src;
1535
0
    uint32_t    *dst_line;
1536
0
    uint32_t    *mask_line;
1537
0
    int dst_stride, mask_stride;
1538
0
    __m64 vsrc, vsrca;
1539
1540
0
    CHECKPOINT ();
1541
1542
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1543
1544
0
    if (src == 0)
1545
0
  return;
1546
1547
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1548
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1549
1550
0
    vsrc = load8888 (&src);
1551
0
    vsrca = expand_alpha (vsrc);
1552
1553
0
    while (height--)
1554
0
    {
1555
0
  int twidth = width;
1556
0
  uint32_t *p = (uint32_t *)mask_line;
1557
0
  uint32_t *q = (uint32_t *)dst_line;
1558
1559
0
  while (twidth && (uintptr_t)q & 7)
1560
0
  {
1561
0
      uint32_t m = *(uint32_t *)p;
1562
1563
0
      if (m)
1564
0
      {
1565
0
    __m64 vdest = load8888 (q);
1566
0
    vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1567
0
    store8888 (q, vdest);
1568
0
      }
1569
1570
0
      twidth--;
1571
0
      p++;
1572
0
      q++;
1573
0
  }
1574
1575
0
  while (twidth >= 2)
1576
0
  {
1577
0
      uint32_t m0, m1;
1578
0
      m0 = *p;
1579
0
      m1 = *(p + 1);
1580
1581
0
      if (m0 | m1)
1582
0
      {
1583
0
    __m64 dest0, dest1;
1584
0
    __m64 vdest = *(__m64 *)q;
1585
1586
0
    dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1587
0
                     expand8888 (vdest, 0));
1588
0
    dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1589
0
                     expand8888 (vdest, 1));
1590
1591
0
    *(__m64 *)q = pack8888 (dest0, dest1);
1592
0
      }
1593
1594
0
      p += 2;
1595
0
      q += 2;
1596
0
      twidth -= 2;
1597
0
  }
1598
1599
0
  if (twidth)
1600
0
  {
1601
0
      uint32_t m = *(uint32_t *)p;
1602
1603
0
      if (m)
1604
0
      {
1605
0
    __m64 vdest = load8888 (q);
1606
0
    vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1607
0
    store8888 (q, vdest);
1608
0
      }
1609
1610
0
      twidth--;
1611
0
      p++;
1612
0
      q++;
1613
0
  }
1614
1615
0
  dst_line += dst_stride;
1616
0
  mask_line += mask_stride;
1617
0
    }
1618
1619
0
    _mm_empty ();
1620
0
}
1621
1622
static void
1623
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1624
                                pixman_composite_info_t *info)
1625
0
{
1626
0
    PIXMAN_COMPOSITE_ARGS (info);
1627
0
    uint32_t    *dst_line, *dst;
1628
0
    uint32_t    *src_line, *src;
1629
0
    uint32_t mask;
1630
0
    __m64 vmask;
1631
0
    int dst_stride, src_stride;
1632
0
    int32_t w;
1633
1634
0
    CHECKPOINT ();
1635
1636
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1637
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1638
1639
0
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1640
0
    vmask = expand_alpha (load8888 (&mask));
1641
1642
0
    while (height--)
1643
0
    {
1644
0
  dst = dst_line;
1645
0
  dst_line += dst_stride;
1646
0
  src = src_line;
1647
0
  src_line += src_stride;
1648
0
  w = width;
1649
1650
0
  while (w && (uintptr_t)dst & 7)
1651
0
  {
1652
0
      __m64 s = load8888 (src);
1653
0
      __m64 d = load8888 (dst);
1654
1655
0
      store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1656
1657
0
      w--;
1658
0
      dst++;
1659
0
      src++;
1660
0
  }
1661
1662
0
  while (w >= 2)
1663
0
  {
1664
0
      __m64 vs = ldq_u ((__m64 *)src);
1665
0
      __m64 vd = *(__m64 *)dst;
1666
0
      __m64 vsrc0 = expand8888 (vs, 0);
1667
0
      __m64 vsrc1 = expand8888 (vs, 1);
1668
1669
0
      *(__m64 *)dst = pack8888 (
1670
0
          in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1671
0
          in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1672
1673
0
      w -= 2;
1674
0
      dst += 2;
1675
0
      src += 2;
1676
0
  }
1677
1678
0
  if (w)
1679
0
  {
1680
0
      __m64 s = load8888 (src);
1681
0
      __m64 d = load8888 (dst);
1682
1683
0
      store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1684
0
  }
1685
0
    }
1686
1687
0
    _mm_empty ();
1688
0
}
1689
1690
static void
1691
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1692
                                pixman_composite_info_t *info)
1693
0
{
1694
0
    PIXMAN_COMPOSITE_ARGS (info);
1695
0
    uint32_t *dst_line, *dst;
1696
0
    uint32_t *src_line, *src;
1697
0
    uint32_t mask;
1698
0
    __m64 vmask;
1699
0
    int dst_stride, src_stride;
1700
0
    int32_t w;
1701
0
    __m64 srca;
1702
1703
0
    CHECKPOINT ();
1704
1705
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1706
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1707
0
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1708
1709
0
    vmask = expand_alpha (load8888 (&mask));
1710
0
    srca = MC (4x00ff);
1711
1712
0
    while (height--)
1713
0
    {
1714
0
  dst = dst_line;
1715
0
  dst_line += dst_stride;
1716
0
  src = src_line;
1717
0
  src_line += src_stride;
1718
0
  w = width;
1719
1720
0
  while (w && (uintptr_t)dst & 7)
1721
0
  {
1722
0
      uint32_t ssrc = *src | 0xff000000;
1723
0
      __m64 s = load8888 (&ssrc);
1724
0
      __m64 d = load8888 (dst);
1725
1726
0
      store8888 (dst, in_over (s, srca, vmask, d));
1727
1728
0
      w--;
1729
0
      dst++;
1730
0
      src++;
1731
0
  }
1732
1733
0
  while (w >= 16)
1734
0
  {
1735
0
      __m64 vd0 = *(__m64 *)(dst + 0);
1736
0
      __m64 vd1 = *(__m64 *)(dst + 2);
1737
0
      __m64 vd2 = *(__m64 *)(dst + 4);
1738
0
      __m64 vd3 = *(__m64 *)(dst + 6);
1739
0
      __m64 vd4 = *(__m64 *)(dst + 8);
1740
0
      __m64 vd5 = *(__m64 *)(dst + 10);
1741
0
      __m64 vd6 = *(__m64 *)(dst + 12);
1742
0
      __m64 vd7 = *(__m64 *)(dst + 14);
1743
1744
0
      __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1745
0
      __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1746
0
      __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1747
0
      __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1748
0
      __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1749
0
      __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1750
0
      __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1751
0
      __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1752
1753
0
      vd0 = pack8888 (
1754
0
          in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1755
0
          in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1756
1757
0
      vd1 = pack8888 (
1758
0
          in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1759
0
          in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1760
1761
0
      vd2 = pack8888 (
1762
0
          in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1763
0
          in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1764
1765
0
      vd3 = pack8888 (
1766
0
          in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1767
0
          in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1768
1769
0
      vd4 = pack8888 (
1770
0
          in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1771
0
          in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1772
1773
0
      vd5 = pack8888 (
1774
0
          in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1775
0
          in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1776
1777
0
      vd6 = pack8888 (
1778
0
          in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1779
0
          in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1780
1781
0
      vd7 = pack8888 (
1782
0
          in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1783
0
          in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1784
1785
0
      *(__m64 *)(dst + 0) = vd0;
1786
0
      *(__m64 *)(dst + 2) = vd1;
1787
0
      *(__m64 *)(dst + 4) = vd2;
1788
0
      *(__m64 *)(dst + 6) = vd3;
1789
0
      *(__m64 *)(dst + 8) = vd4;
1790
0
      *(__m64 *)(dst + 10) = vd5;
1791
0
      *(__m64 *)(dst + 12) = vd6;
1792
0
      *(__m64 *)(dst + 14) = vd7;
1793
1794
0
      w -= 16;
1795
0
      dst += 16;
1796
0
      src += 16;
1797
0
  }
1798
1799
0
  while (w)
1800
0
  {
1801
0
      uint32_t ssrc = *src | 0xff000000;
1802
0
      __m64 s = load8888 (&ssrc);
1803
0
      __m64 d = load8888 (dst);
1804
1805
0
      store8888 (dst, in_over (s, srca, vmask, d));
1806
1807
0
      w--;
1808
0
      dst++;
1809
0
      src++;
1810
0
  }
1811
0
    }
1812
1813
0
    _mm_empty ();
1814
0
}
1815
1816
static void
1817
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1818
                              pixman_composite_info_t *info)
1819
0
{
1820
0
    PIXMAN_COMPOSITE_ARGS (info);
1821
0
    uint32_t *dst_line, *dst;
1822
0
    uint32_t *src_line, *src;
1823
0
    uint32_t s;
1824
0
    int dst_stride, src_stride;
1825
0
    uint8_t a;
1826
0
    int32_t w;
1827
1828
0
    CHECKPOINT ();
1829
1830
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1831
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1832
1833
0
    while (height--)
1834
0
    {
1835
0
  dst = dst_line;
1836
0
  dst_line += dst_stride;
1837
0
  src = src_line;
1838
0
  src_line += src_stride;
1839
0
  w = width;
1840
1841
0
  while (w--)
1842
0
  {
1843
0
      s = *src++;
1844
0
      a = s >> 24;
1845
1846
0
      if (a == 0xff)
1847
0
      {
1848
0
    *dst = s;
1849
0
      }
1850
0
      else if (s)
1851
0
      {
1852
0
    __m64 ms, sa;
1853
0
    ms = load8888 (&s);
1854
0
    sa = expand_alpha (ms);
1855
0
    store8888 (dst, over (ms, sa, load8888 (dst)));
1856
0
      }
1857
1858
0
      dst++;
1859
0
  }
1860
0
    }
1861
0
    _mm_empty ();
1862
0
}
1863
1864
static void
1865
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1866
                              pixman_composite_info_t *info)
1867
0
{
1868
0
    PIXMAN_COMPOSITE_ARGS (info);
1869
0
    uint16_t    *dst_line, *dst;
1870
0
    uint32_t    *src_line, *src;
1871
0
    int dst_stride, src_stride;
1872
0
    int32_t w;
1873
1874
0
    CHECKPOINT ();
1875
1876
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1877
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1878
1879
#if 0
1880
    /* FIXME */
1881
    assert (src_image->drawable == mask_image->drawable);
1882
#endif
1883
1884
0
    while (height--)
1885
0
    {
1886
0
  dst = dst_line;
1887
0
  dst_line += dst_stride;
1888
0
  src = src_line;
1889
0
  src_line += src_stride;
1890
0
  w = width;
1891
1892
0
  CHECKPOINT ();
1893
1894
0
  while (w && (uintptr_t)dst & 7)
1895
0
  {
1896
0
      __m64 vsrc = load8888 (src);
1897
0
      uint64_t d = *dst;
1898
0
      __m64 vdest = expand565 (to_m64 (d), 0);
1899
1900
0
      vdest = pack_565 (
1901
0
    over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1902
1903
0
      *dst = to_uint64 (vdest);
1904
1905
0
      w--;
1906
0
      dst++;
1907
0
      src++;
1908
0
  }
1909
1910
0
  CHECKPOINT ();
1911
1912
0
  while (w >= 4)
1913
0
  {
1914
0
      __m64 vdest = *(__m64 *)dst;
1915
0
      __m64 v0, v1, v2, v3;
1916
0
      __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1917
1918
0
      expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1919
1920
0
      vsrc0 = load8888 ((src + 0));
1921
0
      vsrc1 = load8888 ((src + 1));
1922
0
      vsrc2 = load8888 ((src + 2));
1923
0
      vsrc3 = load8888 ((src + 3));
1924
1925
0
      v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1926
0
      v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1927
0
      v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1928
0
      v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1929
1930
0
      *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1931
1932
0
      w -= 4;
1933
0
      dst += 4;
1934
0
      src += 4;
1935
0
  }
1936
1937
0
  CHECKPOINT ();
1938
1939
0
  while (w)
1940
0
  {
1941
0
      __m64 vsrc = load8888 (src);
1942
0
      uint64_t d = *dst;
1943
0
      __m64 vdest = expand565 (to_m64 (d), 0);
1944
1945
0
      vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1946
1947
0
      *dst = to_uint64 (vdest);
1948
1949
0
      w--;
1950
0
      dst++;
1951
0
      src++;
1952
0
  }
1953
0
    }
1954
1955
0
    _mm_empty ();
1956
0
}
1957
1958
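/* Solid source with an a8 mask.  in_over (s, sa, m, d), defined earlier in
 * this file, composites (src IN mask) OVER dest; since the source is
 * constant, vsrc/vsrca are hoisted out of the loops, and srcsrc holds two
 * copies of the source so the fully-opaque case (srca == 0xff and both mask
 * bytes 0xff) collapses to a single 64-bit store.
 */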
static void
1959
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1960
                             pixman_composite_info_t *info)
1961
0
{
1962
0
    PIXMAN_COMPOSITE_ARGS (info);
1963
0
    uint32_t src, srca;
1964
0
    uint32_t *dst_line, *dst;
1965
0
    uint8_t *mask_line, *mask;
1966
0
    int dst_stride, mask_stride;
1967
0
    int32_t w;
1968
0
    __m64 vsrc, vsrca;
1969
0
    uint64_t srcsrc;
1970
1971
0
    CHECKPOINT ();
1972
1973
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1974
1975
0
    srca = src >> 24;
1976
0
    if (src == 0)
1977
0
  return;
1978
1979
0
    srcsrc = (uint64_t)src << 32 | src;
1980
1981
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1982
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1983
1984
0
    vsrc = load8888 (&src);
1985
0
    vsrca = expand_alpha (vsrc);
1986
1987
0
    while (height--)
1988
0
    {
1989
0
  dst = dst_line;
1990
0
  dst_line += dst_stride;
1991
0
  mask = mask_line;
1992
0
  mask_line += mask_stride;
1993
0
  w = width;
1994
1995
0
  CHECKPOINT ();
1996
1997
0
  while (w && (uintptr_t)dst & 7)
1998
0
  {
1999
0
      uint64_t m = *mask;
2000
2001
0
      if (m)
2002
0
      {
2003
0
    __m64 vdest = in_over (vsrc, vsrca,
2004
0
               expand_alpha_rev (to_m64 (m)),
2005
0
               load8888 (dst));
2006
2007
0
    store8888 (dst, vdest);
2008
0
      }
2009
2010
0
      w--;
2011
0
      mask++;
2012
0
      dst++;
2013
0
  }
2014
2015
0
  CHECKPOINT ();
2016
2017
0
  while (w >= 2)
2018
0
  {
2019
0
      uint64_t m0, m1;
2020
2021
0
      m0 = *mask;
2022
0
      m1 = *(mask + 1);
2023
2024
0
      if (srca == 0xff && (m0 & m1) == 0xff)
2025
0
      {
2026
0
    *(uint64_t *)dst = srcsrc;
2027
0
      }
2028
0
      else if (m0 | m1)
2029
0
      {
2030
0
    __m64 vdest;
2031
0
    __m64 dest0, dest1;
2032
2033
0
    vdest = *(__m64 *)dst;
2034
2035
0
    dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2036
0
         expand8888 (vdest, 0));
2037
0
    dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2038
0
         expand8888 (vdest, 1));
2039
2040
0
    *(__m64 *)dst = pack8888 (dest0, dest1);
2041
0
      }
2042
2043
0
      mask += 2;
2044
0
      dst += 2;
2045
0
      w -= 2;
2046
0
  }
2047
2048
0
  CHECKPOINT ();
2049
2050
0
  if (w)
2051
0
  {
2052
0
      uint64_t m = *mask;
2053
2054
0
      if (m)
2055
0
      {
2056
0
    __m64 vdest = load8888 (dst);
2057
2058
0
    vdest = in_over (
2059
0
        vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2060
0
    store8888 (dst, vdest);
2061
0
      }
2062
0
  }
2063
0
    }
2064
2065
0
    _mm_empty ();
2066
0
}
2067
2068
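/* mmx_fill replicates the filler value up to 64 bits (a byte pattern for
 * 8 bpp, a 16-bit pattern for 16 bpp), aligns the destination with 1/2/4
 * byte stores, then writes 64 bytes per iteration.  On GCC/x86 the inline
 * asm below pre-loads seven extra MMX registers with the pattern so the
 * 64-byte store loop can issue eight movq stores without reloading; other
 * builds use plain __m64 stores instead.
 */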
static pixman_bool_t
2069
mmx_fill (pixman_implementation_t *imp,
2070
          uint32_t *               bits,
2071
          int                      stride,
2072
          int                      bpp,
2073
          int                      x,
2074
          int                      y,
2075
          int                      width,
2076
          int                      height,
2077
          uint32_t                 filler)
2078
0
{
2079
0
    uint64_t fill;
2080
0
    __m64 vfill;
2081
0
    uint32_t byte_width;
2082
0
    uint8_t     *byte_line;
2083
2084
0
#if defined __GNUC__ && defined USE_X86_MMX
2085
0
    __m64 v1, v2, v3, v4, v5, v6, v7;
2086
0
#endif
2087
2088
0
    if (bpp != 16 && bpp != 32 && bpp != 8)
2089
0
  return FALSE;
2090
2091
0
    if (bpp == 8)
2092
0
    {
2093
0
  stride = stride * (int) sizeof (uint32_t) / 1;
2094
0
  byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2095
0
  byte_width = width;
2096
0
  stride *= 1;
2097
0
        filler = (filler & 0xff) * 0x01010101;
2098
0
    }
2099
0
    else if (bpp == 16)
2100
0
    {
2101
0
  stride = stride * (int) sizeof (uint32_t) / 2;
2102
0
  byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2103
0
  byte_width = 2 * width;
2104
0
  stride *= 2;
2105
0
        filler = (filler & 0xffff) * 0x00010001;
2106
0
    }
2107
0
    else
2108
0
    {
2109
0
  stride = stride * (int) sizeof (uint32_t) / 4;
2110
0
  byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2111
0
  byte_width = 4 * width;
2112
0
  stride *= 4;
2113
0
    }
2114
2115
0
    fill = ((uint64_t)filler << 32) | filler;
2116
0
    vfill = to_m64 (fill);
2117
2118
0
#if defined __GNUC__ && defined USE_X86_MMX
2119
0
    __asm__ (
2120
0
        "movq   %7, %0\n"
2121
0
        "movq   %7, %1\n"
2122
0
        "movq   %7, %2\n"
2123
0
        "movq   %7, %3\n"
2124
0
        "movq   %7, %4\n"
2125
0
        "movq   %7, %5\n"
2126
0
        "movq   %7, %6\n"
2127
0
  : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2128
0
    "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2129
0
  : "y" (vfill));
2130
0
#endif
2131
2132
0
    while (height--)
2133
0
    {
2134
0
  int w;
2135
0
  uint8_t *d = byte_line;
2136
2137
0
  byte_line += stride;
2138
0
  w = byte_width;
2139
2140
0
  if (w >= 1 && ((uintptr_t)d & 1))
2141
0
  {
2142
0
      *(uint8_t *)d = (filler & 0xff);
2143
0
      w--;
2144
0
      d++;
2145
0
  }
2146
2147
0
  if (w >= 2 && ((uintptr_t)d & 3))
2148
0
  {
2149
0
      *(uint16_t *)d = filler;
2150
0
      w -= 2;
2151
0
      d += 2;
2152
0
  }
2153
2154
0
  while (w >= 4 && ((uintptr_t)d & 7))
2155
0
  {
2156
0
      *(uint32_t *)d = filler;
2157
2158
0
      w -= 4;
2159
0
      d += 4;
2160
0
  }
2161
2162
0
  while (w >= 64)
2163
0
  {
2164
0
#if defined __GNUC__ && defined USE_X86_MMX
2165
0
      __asm__ (
2166
0
          "movq %1,   (%0)\n"
2167
0
          "movq %2,  8(%0)\n"
2168
0
          "movq %3, 16(%0)\n"
2169
0
          "movq %4, 24(%0)\n"
2170
0
          "movq %5, 32(%0)\n"
2171
0
          "movq %6, 40(%0)\n"
2172
0
          "movq %7, 48(%0)\n"
2173
0
          "movq %8, 56(%0)\n"
2174
0
    :
2175
0
    : "r" (d),
2176
0
      "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2177
0
      "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2178
0
    : "memory");
2179
#else
2180
      *(__m64*) (d +  0) = vfill;
2181
      *(__m64*) (d +  8) = vfill;
2182
      *(__m64*) (d + 16) = vfill;
2183
      *(__m64*) (d + 24) = vfill;
2184
      *(__m64*) (d + 32) = vfill;
2185
      *(__m64*) (d + 40) = vfill;
2186
      *(__m64*) (d + 48) = vfill;
2187
      *(__m64*) (d + 56) = vfill;
2188
#endif
2189
0
      w -= 64;
2190
0
      d += 64;
2191
0
  }
2192
2193
0
  while (w >= 4)
2194
0
  {
2195
0
      *(uint32_t *)d = filler;
2196
2197
0
      w -= 4;
2198
0
      d += 4;
2199
0
  }
2200
0
  if (w >= 2)
2201
0
  {
2202
0
      *(uint16_t *)d = filler;
2203
0
      w -= 2;
2204
0
      d += 2;
2205
0
  }
2206
0
  if (w >= 1)
2207
0
  {
2208
0
      *(uint8_t *)d = (filler & 0xff);
2209
0
      w--;
2210
0
      d++;
2211
0
  }
2212
2213
0
    }
2214
2215
0
    _mm_empty ();
2216
0
    return TRUE;
2217
0
}
2218
2219
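/* SRC conversion from x8r8g8b8 to r5g6b5: the vector body packs four pixels
 * at a time with pack_4xpacked565, while the scalar edges use
 * convert_8888_to_0565.  A standalone sketch of that conversion (hypothetical
 * helper name, equivalent bit layout):
 */
static inline uint16_t
example_8888_to_0565 (uint32_t s)
{
    return ((s >> 8) & 0xf800) |   /* top 5 bits of red   -> bits 11..15 */
           ((s >> 5) & 0x07e0) |   /* top 6 bits of green -> bits  5..10 */
           ((s >> 3) & 0x001f);    /* top 5 bits of blue  -> bits  0..4  */
}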
static void
2220
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2221
                             pixman_composite_info_t *info)
2222
0
{
2223
0
    PIXMAN_COMPOSITE_ARGS (info);
2224
0
    uint16_t    *dst_line, *dst;
2225
0
    uint32_t    *src_line, *src, s;
2226
0
    int dst_stride, src_stride;
2227
0
    int32_t w;
2228
2229
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2230
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2231
2232
0
    while (height--)
2233
0
    {
2234
0
  dst = dst_line;
2235
0
  dst_line += dst_stride;
2236
0
  src = src_line;
2237
0
  src_line += src_stride;
2238
0
  w = width;
2239
2240
0
  while (w && (uintptr_t)dst & 7)
2241
0
  {
2242
0
      s = *src++;
2243
0
      *dst = convert_8888_to_0565 (s);
2244
0
      dst++;
2245
0
      w--;
2246
0
  }
2247
2248
0
  while (w >= 4)
2249
0
  {
2250
0
      __m64 vdest;
2251
0
      __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2252
0
      __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2253
2254
0
      vdest = pack_4xpacked565 (vsrc0, vsrc1);
2255
2256
0
      *(__m64 *)dst = vdest;
2257
2258
0
      w -= 4;
2259
0
      src += 4;
2260
0
      dst += 4;
2261
0
  }
2262
2263
0
  while (w)
2264
0
  {
2265
0
      s = *src++;
2266
0
      *dst = convert_8888_to_0565 (s);
2267
0
      dst++;
2268
0
      w--;
2269
0
  }
2270
0
    }
2271
2272
0
    _mm_empty ();
2273
0
}
2274
2275
static void
2276
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2277
                            pixman_composite_info_t *info)
2278
0
{
2279
0
    PIXMAN_COMPOSITE_ARGS (info);
2280
0
    uint32_t src, srca;
2281
0
    uint32_t    *dst_line, *dst;
2282
0
    uint8_t     *mask_line, *mask;
2283
0
    int dst_stride, mask_stride;
2284
0
    int32_t w;
2285
0
    __m64 vsrc;
2286
0
    uint64_t srcsrc;
2287
2288
0
    CHECKPOINT ();
2289
2290
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2291
2292
0
    srca = src >> 24;
2293
0
    if (src == 0)
2294
0
    {
2295
0
  mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2296
0
      PIXMAN_FORMAT_BPP (dest_image->bits.format),
2297
0
      dest_x, dest_y, width, height, 0);
2298
0
  return;
2299
0
    }
2300
2301
0
    srcsrc = (uint64_t)src << 32 | src;
2302
2303
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2304
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2305
2306
0
    vsrc = load8888 (&src);
2307
2308
0
    while (height--)
2309
0
    {
2310
0
  dst = dst_line;
2311
0
  dst_line += dst_stride;
2312
0
  mask = mask_line;
2313
0
  mask_line += mask_stride;
2314
0
  w = width;
2315
2316
0
  CHECKPOINT ();
2317
2318
0
  while (w && (uintptr_t)dst & 7)
2319
0
  {
2320
0
      uint64_t m = *mask;
2321
2322
0
      if (m)
2323
0
      {
2324
0
    __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2325
2326
0
    store8888 (dst, vdest);
2327
0
      }
2328
0
      else
2329
0
      {
2330
0
    *dst = 0;
2331
0
      }
2332
2333
0
      w--;
2334
0
      mask++;
2335
0
      dst++;
2336
0
  }
2337
2338
0
  CHECKPOINT ();
2339
2340
0
  while (w >= 2)
2341
0
  {
2342
0
      uint64_t m0, m1;
2343
0
      m0 = *mask;
2344
0
      m1 = *(mask + 1);
2345
2346
0
      if (srca == 0xff && (m0 & m1) == 0xff)
2347
0
      {
2348
0
    *(uint64_t *)dst = srcsrc;
2349
0
      }
2350
0
      else if (m0 | m1)
2351
0
      {
2352
0
    __m64 dest0, dest1;
2353
2354
0
    dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2355
0
    dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2356
2357
0
    *(__m64 *)dst = pack8888 (dest0, dest1);
2358
0
      }
2359
0
      else
2360
0
      {
2361
0
    *(uint64_t *)dst = 0;
2362
0
      }
2363
2364
0
      mask += 2;
2365
0
      dst += 2;
2366
0
      w -= 2;
2367
0
  }
2368
2369
0
  CHECKPOINT ();
2370
2371
0
  if (w)
2372
0
  {
2373
0
      uint64_t m = *mask;
2374
2375
0
      if (m)
2376
0
      {
2377
0
    __m64 vdest;
2378
2379
0
    vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2380
0
    store8888 (dst, vdest);
2381
0
      }
2382
0
      else
2383
0
      {
2384
0
    *dst = 0;
2385
0
      }
2386
0
  }
2387
0
    }
2388
2389
0
    _mm_empty ();
2390
0
}
2391
2392
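/* As in the 8888 variant above, but targeting r5g6b5: the solid colour is
 * packed to 565 once and replicated into all four 16-bit lanes
 * (srcsrcsrcsrc), so a fully opaque run of mask bytes costs a single movq
 * per four pixels.
 */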
static void
2393
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2394
                             pixman_composite_info_t *info)
2395
0
{
2396
0
    PIXMAN_COMPOSITE_ARGS (info);
2397
0
    uint32_t src, srca;
2398
0
    uint16_t *dst_line, *dst;
2399
0
    uint8_t *mask_line, *mask;
2400
0
    int dst_stride, mask_stride;
2401
0
    int32_t w;
2402
0
    __m64 vsrc, vsrca, tmp;
2403
0
    __m64 srcsrcsrcsrc;
2404
2405
0
    CHECKPOINT ();
2406
2407
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2408
2409
0
    srca = src >> 24;
2410
0
    if (src == 0)
2411
0
  return;
2412
2413
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2414
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2415
2416
0
    vsrc = load8888 (&src);
2417
0
    vsrca = expand_alpha (vsrc);
2418
2419
0
    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2420
0
    srcsrcsrcsrc = expand_alpha_rev (tmp);
2421
2422
0
    while (height--)
2423
0
    {
2424
0
  dst = dst_line;
2425
0
  dst_line += dst_stride;
2426
0
  mask = mask_line;
2427
0
  mask_line += mask_stride;
2428
0
  w = width;
2429
2430
0
  CHECKPOINT ();
2431
2432
0
  while (w && (uintptr_t)dst & 7)
2433
0
  {
2434
0
      uint64_t m = *mask;
2435
2436
0
      if (m)
2437
0
      {
2438
0
    uint64_t d = *dst;
2439
0
    __m64 vd = to_m64 (d);
2440
0
    __m64 vdest = in_over (
2441
0
        vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2442
2443
0
    vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2444
0
    *dst = to_uint64 (vd);
2445
0
      }
2446
2447
0
      w--;
2448
0
      mask++;
2449
0
      dst++;
2450
0
  }
2451
2452
0
  CHECKPOINT ();
2453
2454
0
  while (w >= 4)
2455
0
  {
2456
0
      uint64_t m0, m1, m2, m3;
2457
0
      m0 = *mask;
2458
0
      m1 = *(mask + 1);
2459
0
      m2 = *(mask + 2);
2460
0
      m3 = *(mask + 3);
2461
2462
0
      if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2463
0
      {
2464
0
    *(__m64 *)dst = srcsrcsrcsrc;
2465
0
      }
2466
0
      else if (m0 | m1 | m2 | m3)
2467
0
      {
2468
0
    __m64 vdest = *(__m64 *)dst;
2469
0
    __m64 v0, v1, v2, v3;
2470
0
    __m64 vm0, vm1, vm2, vm3;
2471
2472
0
    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2473
2474
0
    vm0 = to_m64 (m0);
2475
0
    v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2476
2477
0
    vm1 = to_m64 (m1);
2478
0
    v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2479
2480
0
    vm2 = to_m64 (m2);
2481
0
    v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2482
2483
0
    vm3 = to_m64 (m3);
2484
0
    v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2485
2486
0
    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2487
0
      }
2488
2489
0
      w -= 4;
2490
0
      mask += 4;
2491
0
      dst += 4;
2492
0
  }
2493
2494
0
  CHECKPOINT ();
2495
2496
0
  while (w)
2497
0
  {
2498
0
      uint64_t m = *mask;
2499
2500
0
      if (m)
2501
0
      {
2502
0
    uint64_t d = *dst;
2503
0
    __m64 vd = to_m64 (d);
2504
0
    __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2505
0
               expand565 (vd, 0));
2506
0
    vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2507
0
    *dst = to_uint64 (vd);
2508
0
      }
2509
2510
0
      w--;
2511
0
      mask++;
2512
0
      dst++;
2513
0
  }
2514
0
    }
2515
2516
0
    _mm_empty ();
2517
0
}
2518
2519
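/* "pixbuf" sources hold non-premultiplied pixels with red and blue swapped
 * relative to the destination.  over_rev_non_pre therefore swaps the colour
 * channels (invert_colors), premultiplies by the source alpha, and applies
 * the normal OVER; when all four alphas are 0xff the premultiply and blend
 * are skipped and the pixels are only swapped and packed.
 */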
static void
2520
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2521
                                pixman_composite_info_t *info)
2522
0
{
2523
0
    PIXMAN_COMPOSITE_ARGS (info);
2524
0
    uint16_t    *dst_line, *dst;
2525
0
    uint32_t    *src_line, *src;
2526
0
    int dst_stride, src_stride;
2527
0
    int32_t w;
2528
2529
0
    CHECKPOINT ();
2530
2531
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2532
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2533
2534
#if 0
2535
    /* FIXME */
2536
    assert (src_image->drawable == mask_image->drawable);
2537
#endif
2538
2539
0
    while (height--)
2540
0
    {
2541
0
  dst = dst_line;
2542
0
  dst_line += dst_stride;
2543
0
  src = src_line;
2544
0
  src_line += src_stride;
2545
0
  w = width;
2546
2547
0
  CHECKPOINT ();
2548
2549
0
  while (w && (uintptr_t)dst & 7)
2550
0
  {
2551
0
      __m64 vsrc = load8888 (src);
2552
0
      uint64_t d = *dst;
2553
0
      __m64 vdest = expand565 (to_m64 (d), 0);
2554
2555
0
      vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2556
2557
0
      *dst = to_uint64 (vdest);
2558
2559
0
      w--;
2560
0
      dst++;
2561
0
      src++;
2562
0
  }
2563
2564
0
  CHECKPOINT ();
2565
2566
0
  while (w >= 4)
2567
0
  {
2568
0
      uint32_t s0, s1, s2, s3;
2569
0
      unsigned char a0, a1, a2, a3;
2570
2571
0
      s0 = *src;
2572
0
      s1 = *(src + 1);
2573
0
      s2 = *(src + 2);
2574
0
      s3 = *(src + 3);
2575
2576
0
      a0 = (s0 >> 24);
2577
0
      a1 = (s1 >> 24);
2578
0
      a2 = (s2 >> 24);
2579
0
      a3 = (s3 >> 24);
2580
2581
0
      if ((a0 & a1 & a2 & a3) == 0xFF)
2582
0
      {
2583
0
    __m64 v0 = invert_colors (load8888 (&s0));
2584
0
    __m64 v1 = invert_colors (load8888 (&s1));
2585
0
    __m64 v2 = invert_colors (load8888 (&s2));
2586
0
    __m64 v3 = invert_colors (load8888 (&s3));
2587
2588
0
    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2589
0
      }
2590
0
      else if (s0 | s1 | s2 | s3)
2591
0
      {
2592
0
    __m64 vdest = *(__m64 *)dst;
2593
0
    __m64 v0, v1, v2, v3;
2594
2595
0
    __m64 vsrc0 = load8888 (&s0);
2596
0
    __m64 vsrc1 = load8888 (&s1);
2597
0
    __m64 vsrc2 = load8888 (&s2);
2598
0
    __m64 vsrc3 = load8888 (&s3);
2599
2600
0
    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2601
2602
0
    v0 = over_rev_non_pre (vsrc0, v0);
2603
0
    v1 = over_rev_non_pre (vsrc1, v1);
2604
0
    v2 = over_rev_non_pre (vsrc2, v2);
2605
0
    v3 = over_rev_non_pre (vsrc3, v3);
2606
2607
0
    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2608
0
      }
2609
2610
0
      w -= 4;
2611
0
      dst += 4;
2612
0
      src += 4;
2613
0
  }
2614
2615
0
  CHECKPOINT ();
2616
2617
0
  while (w)
2618
0
  {
2619
0
      __m64 vsrc = load8888 (src);
2620
0
      uint64_t d = *dst;
2621
0
      __m64 vdest = expand565 (to_m64 (d), 0);
2622
2623
0
      vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2624
2625
0
      *dst = to_uint64 (vdest);
2626
2627
0
      w--;
2628
0
      dst++;
2629
0
      src++;
2630
0
  }
2631
0
    }
2632
2633
0
    _mm_empty ();
2634
0
}
2635
2636
static void
2637
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2638
                                pixman_composite_info_t *info)
2639
0
{
2640
0
    PIXMAN_COMPOSITE_ARGS (info);
2641
0
    uint32_t    *dst_line, *dst;
2642
0
    uint32_t    *src_line, *src;
2643
0
    int dst_stride, src_stride;
2644
0
    int32_t w;
2645
2646
0
    CHECKPOINT ();
2647
2648
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2649
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2650
2651
#if 0
2652
    /* FIXME */
2653
    assert (src_image->drawable == mask_image->drawable);
2654
#endif
2655
2656
0
    while (height--)
2657
0
    {
2658
0
  dst = dst_line;
2659
0
  dst_line += dst_stride;
2660
0
  src = src_line;
2661
0
  src_line += src_stride;
2662
0
  w = width;
2663
2664
0
  while (w && (uintptr_t)dst & 7)
2665
0
  {
2666
0
      __m64 s = load8888 (src);
2667
0
      __m64 d = load8888 (dst);
2668
2669
0
      store8888 (dst, over_rev_non_pre (s, d));
2670
2671
0
      w--;
2672
0
      dst++;
2673
0
      src++;
2674
0
  }
2675
2676
0
  while (w >= 2)
2677
0
  {
2678
0
      uint32_t s0, s1;
2679
0
      unsigned char a0, a1;
2680
0
      __m64 d0, d1;
2681
2682
0
      s0 = *src;
2683
0
      s1 = *(src + 1);
2684
2685
0
      a0 = (s0 >> 24);
2686
0
      a1 = (s1 >> 24);
2687
2688
0
      if ((a0 & a1) == 0xFF)
2689
0
      {
2690
0
    d0 = invert_colors (load8888 (&s0));
2691
0
    d1 = invert_colors (load8888 (&s1));
2692
2693
0
    *(__m64 *)dst = pack8888 (d0, d1);
2694
0
      }
2695
0
      else if (s0 | s1)
2696
0
      {
2697
0
    __m64 vdest = *(__m64 *)dst;
2698
2699
0
    d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2700
0
    d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2701
2702
0
    *(__m64 *)dst = pack8888 (d0, d1);
2703
0
      }
2704
2705
0
      w -= 2;
2706
0
      dst += 2;
2707
0
      src += 2;
2708
0
  }
2709
2710
0
  if (w)
2711
0
  {
2712
0
      __m64 s = load8888 (src);
2713
0
      __m64 d = load8888 (dst);
2714
2715
0
      store8888 (dst, over_rev_non_pre (s, d));
2716
0
  }
2717
0
    }
2718
2719
0
    _mm_empty ();
2720
0
}
2721
2722
static void
2723
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2724
                                   pixman_composite_info_t *info)
2725
0
{
2726
0
    PIXMAN_COMPOSITE_ARGS (info);
2727
0
    uint32_t src;
2728
0
    uint16_t    *dst_line;
2729
0
    uint32_t    *mask_line;
2730
0
    int dst_stride, mask_stride;
2731
0
    __m64 vsrc, vsrca;
2732
2733
0
    CHECKPOINT ();
2734
2735
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2736
2737
0
    if (src == 0)
2738
0
  return;
2739
2740
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2741
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2742
2743
0
    vsrc = load8888 (&src);
2744
0
    vsrca = expand_alpha (vsrc);
2745
2746
0
    while (height--)
2747
0
    {
2748
0
  int twidth = width;
2749
0
  uint32_t *p = (uint32_t *)mask_line;
2750
0
  uint16_t *q = (uint16_t *)dst_line;
2751
2752
0
  while (twidth && ((uintptr_t)q & 7))
2753
0
  {
2754
0
      uint32_t m = *(uint32_t *)p;
2755
2756
0
      if (m)
2757
0
      {
2758
0
    uint64_t d = *q;
2759
0
    __m64 vdest = expand565 (to_m64 (d), 0);
2760
0
    vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2761
0
    *q = to_uint64 (vdest);
2762
0
      }
2763
2764
0
      twidth--;
2765
0
      p++;
2766
0
      q++;
2767
0
  }
2768
2769
0
  while (twidth >= 4)
2770
0
  {
2771
0
      uint32_t m0, m1, m2, m3;
2772
2773
0
      m0 = *p;
2774
0
      m1 = *(p + 1);
2775
0
      m2 = *(p + 2);
2776
0
      m3 = *(p + 3);
2777
2778
0
      if ((m0 | m1 | m2 | m3))
2779
0
      {
2780
0
    __m64 vdest = *(__m64 *)q;
2781
0
    __m64 v0, v1, v2, v3;
2782
2783
0
    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2784
2785
0
    v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2786
0
    v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2787
0
    v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2788
0
    v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2789
2790
0
    *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2791
0
      }
2792
0
      twidth -= 4;
2793
0
      p += 4;
2794
0
      q += 4;
2795
0
  }
2796
2797
0
  while (twidth)
2798
0
  {
2799
0
      uint32_t m;
2800
2801
0
      m = *(uint32_t *)p;
2802
0
      if (m)
2803
0
      {
2804
0
    uint64_t d = *q;
2805
0
    __m64 vdest = expand565 (to_m64 (d), 0);
2806
0
    vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2807
0
    *q = to_uint64 (vdest);
2808
0
      }
2809
2810
0
      twidth--;
2811
0
      p++;
2812
0
      q++;
2813
0
  }
2814
2815
0
  mask_line += mask_stride;
2816
0
  dst_line += dst_stride;
2817
0
    }
2818
2819
0
    _mm_empty ();
2820
0
}
2821
2822
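/* The IN paths scale each destination byte by (source alpha * mask).
 * MUL_UN8 (from pixman-combine32.h) is the usual rounded byte multiply;
 * roughly, as a standalone sketch (hypothetical helper name):
 */
static inline uint8_t
example_mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = a * b + 0x80;       /* bias so the result rounds to nearest */
    return (t + (t >> 8)) >> 8;      /* fast division by 255 */
}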
static void
2823
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2824
                        pixman_composite_info_t *info)
2825
0
{
2826
0
    PIXMAN_COMPOSITE_ARGS (info);
2827
0
    uint8_t *dst_line, *dst;
2828
0
    uint8_t *mask_line, *mask;
2829
0
    int dst_stride, mask_stride;
2830
0
    int32_t w;
2831
0
    uint32_t src;
2832
0
    uint8_t sa;
2833
0
    __m64 vsrc, vsrca;
2834
2835
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2836
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2837
2838
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2839
2840
0
    sa = src >> 24;
2841
2842
0
    vsrc = load8888 (&src);
2843
0
    vsrca = expand_alpha (vsrc);
2844
2845
0
    while (height--)
2846
0
    {
2847
0
  dst = dst_line;
2848
0
  dst_line += dst_stride;
2849
0
  mask = mask_line;
2850
0
  mask_line += mask_stride;
2851
0
  w = width;
2852
2853
0
  while (w && (uintptr_t)dst & 7)
2854
0
  {
2855
0
      uint16_t tmp;
2856
0
      uint8_t a;
2857
0
      uint32_t m, d;
2858
2859
0
      a = *mask++;
2860
0
      d = *dst;
2861
2862
0
      m = MUL_UN8 (sa, a, tmp);
2863
0
      d = MUL_UN8 (m, d, tmp);
2864
2865
0
      *dst++ = d;
2866
0
      w--;
2867
0
  }
2868
2869
0
  while (w >= 4)
2870
0
  {
2871
0
      __m64 vmask;
2872
0
      __m64 vdest;
2873
2874
0
      vmask = load8888u ((uint32_t *)mask);
2875
0
      vdest = load8888 ((uint32_t *)dst);
2876
2877
0
      store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2878
2879
0
      dst += 4;
2880
0
      mask += 4;
2881
0
      w -= 4;
2882
0
  }
2883
2884
0
  while (w--)
2885
0
  {
2886
0
      uint16_t tmp;
2887
0
      uint8_t a;
2888
0
      uint32_t m, d;
2889
2890
0
      a = *mask++;
2891
0
      d = *dst;
2892
2893
0
      m = MUL_UN8 (sa, a, tmp);
2894
0
      d = MUL_UN8 (m, d, tmp);
2895
2896
0
      *dst++ = d;
2897
0
  }
2898
0
    }
2899
2900
0
    _mm_empty ();
2901
0
}
2902
2903
static void
2904
mmx_composite_in_8_8 (pixman_implementation_t *imp,
2905
                      pixman_composite_info_t *info)
2906
0
{
2907
0
    PIXMAN_COMPOSITE_ARGS (info);
2908
0
    uint8_t     *dst_line, *dst;
2909
0
    uint8_t     *src_line, *src;
2910
0
    int src_stride, dst_stride;
2911
0
    int32_t w;
2912
2913
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2914
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2915
2916
0
    while (height--)
2917
0
    {
2918
0
  dst = dst_line;
2919
0
  dst_line += dst_stride;
2920
0
  src = src_line;
2921
0
  src_line += src_stride;
2922
0
  w = width;
2923
2924
0
  while (w && (uintptr_t)dst & 3)
2925
0
  {
2926
0
      uint8_t s, d;
2927
0
      uint16_t tmp;
2928
2929
0
      s = *src;
2930
0
      d = *dst;
2931
2932
0
      *dst = MUL_UN8 (s, d, tmp);
2933
2934
0
      src++;
2935
0
      dst++;
2936
0
      w--;
2937
0
  }
2938
2939
0
  while (w >= 4)
2940
0
  {
2941
0
      uint32_t *s = (uint32_t *)src;
2942
0
      uint32_t *d = (uint32_t *)dst;
2943
2944
0
      store8888 (d, in (load8888u (s), load8888 (d)));
2945
2946
0
      w -= 4;
2947
0
      dst += 4;
2948
0
      src += 4;
2949
0
  }
2950
2951
0
  while (w--)
2952
0
  {
2953
0
      uint8_t s, d;
2954
0
      uint16_t tmp;
2955
2956
0
      s = *src;
2957
0
      d = *dst;
2958
2959
0
      *dst = MUL_UN8 (s, d, tmp);
2960
2961
0
      src++;
2962
0
      dst++;
2963
0
  }
2964
0
    }
2965
2966
0
    _mm_empty ();
2967
0
}
2968
2969
static void
2970
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2971
       pixman_composite_info_t *info)
2972
0
{
2973
0
    PIXMAN_COMPOSITE_ARGS (info);
2974
0
    uint8_t     *dst_line, *dst;
2975
0
    uint8_t     *mask_line, *mask;
2976
0
    int dst_stride, mask_stride;
2977
0
    int32_t w;
2978
0
    uint32_t src;
2979
0
    uint8_t sa;
2980
0
    __m64 vsrc, vsrca;
2981
2982
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2983
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2984
2985
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2986
2987
0
    sa = src >> 24;
2988
2989
0
    if (src == 0)
2990
0
  return;
2991
2992
0
    vsrc = load8888 (&src);
2993
0
    vsrca = expand_alpha (vsrc);
2994
2995
0
    while (height--)
2996
0
    {
2997
0
  dst = dst_line;
2998
0
  dst_line += dst_stride;
2999
0
  mask = mask_line;
3000
0
  mask_line += mask_stride;
3001
0
  w = width;
3002
3003
0
  while (w && (uintptr_t)dst & 3)
3004
0
  {
3005
0
      uint16_t tmp;
3006
0
      uint16_t a;
3007
0
      uint32_t m, d;
3008
0
      uint32_t r;
3009
3010
0
      a = *mask++;
3011
0
      d = *dst;
3012
3013
0
      m = MUL_UN8 (sa, a, tmp);
3014
0
      r = ADD_UN8 (m, d, tmp);
3015
3016
0
      *dst++ = r;
3017
0
      w--;
3018
0
  }
3019
3020
0
  while (w >= 4)
3021
0
  {
3022
0
      __m64 vmask;
3023
0
      __m64 vdest;
3024
3025
0
      vmask = load8888u ((uint32_t *)mask);
3026
0
      vdest = load8888 ((uint32_t *)dst);
3027
3028
0
      store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3029
3030
0
      dst += 4;
3031
0
      mask += 4;
3032
0
      w -= 4;
3033
0
  }
3034
3035
0
  while (w--)
3036
0
  {
3037
0
      uint16_t tmp;
3038
0
      uint16_t a;
3039
0
      uint32_t m, d;
3040
0
      uint32_t r;
3041
3042
0
      a = *mask++;
3043
0
      d = *dst;
3044
3045
0
      m = MUL_UN8 (sa, a, tmp);
3046
0
      r = ADD_UN8 (m, d, tmp);
3047
3048
0
      *dst++ = r;
3049
0
  }
3050
0
    }
3051
3052
0
    _mm_empty ();
3053
0
}
3054
3055
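/* The scalar head and tail below saturate without a branch: t = d + s is at
 * most 0x1fe, so (t >> 8) is exactly 1 on overflow, 0 - (t >> 8) is then
 * 0xffff, and OR-ing it in clamps the stored byte to 0xff.  The vector body
 * gets the same effect from _mm_adds_pu8, the saturating unsigned byte add.
 */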
static void
3056
mmx_composite_add_8_8 (pixman_implementation_t *imp,
3057
           pixman_composite_info_t *info)
3058
0
{
3059
0
    PIXMAN_COMPOSITE_ARGS (info);
3060
0
    uint8_t *dst_line, *dst;
3061
0
    uint8_t *src_line, *src;
3062
0
    int dst_stride, src_stride;
3063
0
    int32_t w;
3064
0
    uint8_t s, d;
3065
0
    uint16_t t;
3066
3067
0
    CHECKPOINT ();
3068
3069
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3070
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3071
3072
0
    while (height--)
3073
0
    {
3074
0
  dst = dst_line;
3075
0
  dst_line += dst_stride;
3076
0
  src = src_line;
3077
0
  src_line += src_stride;
3078
0
  w = width;
3079
3080
0
  while (w && (uintptr_t)dst & 7)
3081
0
  {
3082
0
      s = *src;
3083
0
      d = *dst;
3084
0
      t = d + s;
3085
0
      s = t | (0 - (t >> 8));
3086
0
      *dst = s;
3087
3088
0
      dst++;
3089
0
      src++;
3090
0
      w--;
3091
0
  }
3092
3093
0
  while (w >= 8)
3094
0
  {
3095
0
      *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3096
0
      dst += 8;
3097
0
      src += 8;
3098
0
      w -= 8;
3099
0
  }
3100
3101
0
  while (w)
3102
0
  {
3103
0
      s = *src;
3104
0
      d = *dst;
3105
0
      t = d + s;
3106
0
      s = t | (0 - (t >> 8));
3107
0
      *dst = s;
3108
3109
0
      dst++;
3110
0
      src++;
3111
0
      w--;
3112
0
  }
3113
0
    }
3114
3115
0
    _mm_empty ();
3116
0
}
3117
3118
static void
3119
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3120
                             pixman_composite_info_t *info)
3121
0
{
3122
0
    PIXMAN_COMPOSITE_ARGS (info);
3123
0
    uint16_t    *dst_line, *dst;
3124
0
    uint32_t  d;
3125
0
    uint16_t    *src_line, *src;
3126
0
    uint32_t  s;
3127
0
    int dst_stride, src_stride;
3128
0
    int32_t w;
3129
3130
0
    CHECKPOINT ();
3131
3132
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3133
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3134
3135
0
    while (height--)
3136
0
    {
3137
0
  dst = dst_line;
3138
0
  dst_line += dst_stride;
3139
0
  src = src_line;
3140
0
  src_line += src_stride;
3141
0
  w = width;
3142
3143
0
  while (w && (uintptr_t)dst & 7)
3144
0
  {
3145
0
      s = *src++;
3146
0
      if (s)
3147
0
      {
3148
0
    d = *dst;
3149
0
    s = convert_0565_to_8888 (s);
3150
0
    if (d)
3151
0
    {
3152
0
        d = convert_0565_to_8888 (d);
3153
0
        UN8x4_ADD_UN8x4 (s, d);
3154
0
    }
3155
0
    *dst = convert_8888_to_0565 (s);
3156
0
      }
3157
0
      dst++;
3158
0
      w--;
3159
0
  }
3160
3161
0
  while (w >= 4)
3162
0
  {
3163
0
      __m64 vdest = *(__m64 *)dst;
3164
0
      __m64 vsrc = ldq_u ((__m64 *)src);
3165
0
      __m64 vd0, vd1;
3166
0
      __m64 vs0, vs1;
3167
3168
0
      expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3169
0
      expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3170
3171
0
      vd0 = _mm_adds_pu8 (vd0, vs0);
3172
0
      vd1 = _mm_adds_pu8 (vd1, vs1);
3173
3174
0
      *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3175
3176
0
      dst += 4;
3177
0
      src += 4;
3178
0
      w -= 4;
3179
0
  }
3180
3181
0
  while (w--)
3182
0
  {
3183
0
      s = *src++;
3184
0
      if (s)
3185
0
      {
3186
0
    d = *dst;
3187
0
    s = convert_0565_to_8888 (s);
3188
0
    if (d)
3189
0
    {
3190
0
        d = convert_0565_to_8888 (d);
3191
0
        UN8x4_ADD_UN8x4 (s, d);
3192
0
    }
3193
0
    *dst = convert_8888_to_0565 (s);
3194
0
      }
3195
0
      dst++;
3196
0
  }
3197
0
    }
3198
3199
0
    _mm_empty ();
3200
0
}
3201
3202
static void
3203
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3204
                             pixman_composite_info_t *info)
3205
0
{
3206
0
    PIXMAN_COMPOSITE_ARGS (info);
3207
0
    uint32_t    *dst_line, *dst;
3208
0
    uint32_t    *src_line, *src;
3209
0
    int dst_stride, src_stride;
3210
0
    int32_t w;
3211
3212
0
    CHECKPOINT ();
3213
3214
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3215
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3216
3217
0
    while (height--)
3218
0
    {
3219
0
  dst = dst_line;
3220
0
  dst_line += dst_stride;
3221
0
  src = src_line;
3222
0
  src_line += src_stride;
3223
0
  w = width;
3224
3225
0
  while (w && (uintptr_t)dst & 7)
3226
0
  {
3227
0
      store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3228
0
                                load ((const uint32_t *)dst)));
3229
0
      dst++;
3230
0
      src++;
3231
0
      w--;
3232
0
  }
3233
3234
0
  while (w >= 2)
3235
0
  {
3236
0
      *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3237
0
      dst += 2;
3238
0
      src += 2;
3239
0
      w -= 2;
3240
0
  }
3241
3242
0
  if (w)
3243
0
  {
3244
0
      store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3245
0
                                load ((const uint32_t *)dst)));
3246
3247
0
  }
3248
0
    }
3249
3250
0
    _mm_empty ();
3251
0
}
3252
3253
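/* mmx_blt is a straight copy (the SRC operator with no format conversion).
 * It only accepts matching 16- or 32-bpp images and returns FALSE otherwise
 * so the caller can fall back to a generic path; the copy itself uses the
 * same align-head / 64-byte-body / tail structure as mmx_fill.
 */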
static pixman_bool_t
3254
mmx_blt (pixman_implementation_t *imp,
3255
         uint32_t *               src_bits,
3256
         uint32_t *               dst_bits,
3257
         int                      src_stride,
3258
         int                      dst_stride,
3259
         int                      src_bpp,
3260
         int                      dst_bpp,
3261
         int                      src_x,
3262
         int                      src_y,
3263
         int                      dest_x,
3264
         int                      dest_y,
3265
         int                      width,
3266
         int                      height)
3267
0
{
3268
0
    uint8_t *   src_bytes;
3269
0
    uint8_t *   dst_bytes;
3270
0
    int byte_width;
3271
3272
0
    if (src_bpp != dst_bpp)
3273
0
  return FALSE;
3274
3275
0
    if (src_bpp == 16)
3276
0
    {
3277
0
  src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3278
0
  dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3279
0
  src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3280
0
  dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3281
0
  byte_width = 2 * width;
3282
0
  src_stride *= 2;
3283
0
  dst_stride *= 2;
3284
0
    }
3285
0
    else if (src_bpp == 32)
3286
0
    {
3287
0
  src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3288
0
  dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3289
0
  src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3290
0
  dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3291
0
  byte_width = 4 * width;
3292
0
  src_stride *= 4;
3293
0
  dst_stride *= 4;
3294
0
    }
3295
0
    else
3296
0
    {
3297
0
  return FALSE;
3298
0
    }
3299
3300
0
    while (height--)
3301
0
    {
3302
0
  int w;
3303
0
  uint8_t *s = src_bytes;
3304
0
  uint8_t *d = dst_bytes;
3305
0
  src_bytes += src_stride;
3306
0
  dst_bytes += dst_stride;
3307
0
  w = byte_width;
3308
3309
0
  if (w >= 1 && ((uintptr_t)d & 1))
3310
0
  {
3311
0
      *(uint8_t *)d = *(uint8_t *)s;
3312
0
      w -= 1;
3313
0
      s += 1;
3314
0
      d += 1;
3315
0
  }
3316
3317
0
  if (w >= 2 && ((uintptr_t)d & 3))
3318
0
  {
3319
0
      *(uint16_t *)d = *(uint16_t *)s;
3320
0
      w -= 2;
3321
0
      s += 2;
3322
0
      d += 2;
3323
0
  }
3324
3325
0
  while (w >= 4 && ((uintptr_t)d & 7))
3326
0
  {
3327
0
      *(uint32_t *)d = ldl_u ((uint32_t *)s);
3328
3329
0
      w -= 4;
3330
0
      s += 4;
3331
0
      d += 4;
3332
0
  }
3333
3334
0
  while (w >= 64)
3335
0
  {
3336
0
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3337
0
      __asm__ (
3338
0
          "movq   (%1),   %%mm0\n"
3339
0
          "movq  8(%1),   %%mm1\n"
3340
0
          "movq 16(%1),   %%mm2\n"
3341
0
          "movq 24(%1),   %%mm3\n"
3342
0
          "movq 32(%1),   %%mm4\n"
3343
0
          "movq 40(%1),   %%mm5\n"
3344
0
          "movq 48(%1),   %%mm6\n"
3345
0
          "movq 56(%1),   %%mm7\n"
3346
3347
0
          "movq %%mm0,    (%0)\n"
3348
0
          "movq %%mm1,   8(%0)\n"
3349
0
          "movq %%mm2,  16(%0)\n"
3350
0
          "movq %%mm3,  24(%0)\n"
3351
0
          "movq %%mm4,  32(%0)\n"
3352
0
          "movq %%mm5,  40(%0)\n"
3353
0
          "movq %%mm6,  48(%0)\n"
3354
0
          "movq %%mm7,  56(%0)\n"
3355
0
    :
3356
0
    : "r" (d), "r" (s)
3357
0
    : "memory",
3358
0
      "%mm0", "%mm1", "%mm2", "%mm3",
3359
0
      "%mm4", "%mm5", "%mm6", "%mm7");
3360
#else
3361
      __m64 v0 = ldq_u ((__m64 *)(s + 0));
3362
      __m64 v1 = ldq_u ((__m64 *)(s + 8));
3363
      __m64 v2 = ldq_u ((__m64 *)(s + 16));
3364
      __m64 v3 = ldq_u ((__m64 *)(s + 24));
3365
      __m64 v4 = ldq_u ((__m64 *)(s + 32));
3366
      __m64 v5 = ldq_u ((__m64 *)(s + 40));
3367
      __m64 v6 = ldq_u ((__m64 *)(s + 48));
3368
      __m64 v7 = ldq_u ((__m64 *)(s + 56));
3369
      *(__m64 *)(d + 0)  = v0;
3370
      *(__m64 *)(d + 8)  = v1;
3371
      *(__m64 *)(d + 16) = v2;
3372
      *(__m64 *)(d + 24) = v3;
3373
      *(__m64 *)(d + 32) = v4;
3374
      *(__m64 *)(d + 40) = v5;
3375
      *(__m64 *)(d + 48) = v6;
3376
      *(__m64 *)(d + 56) = v7;
3377
#endif
3378
3379
0
      w -= 64;
3380
0
      s += 64;
3381
0
      d += 64;
3382
0
  }
3383
0
  while (w >= 4)
3384
0
  {
3385
0
      *(uint32_t *)d = ldl_u ((uint32_t *)s);
3386
3387
0
      w -= 4;
3388
0
      s += 4;
3389
0
      d += 4;
3390
0
  }
3391
0
  if (w >= 2)
3392
0
  {
3393
0
      *(uint16_t *)d = *(uint16_t *)s;
3394
0
      w -= 2;
3395
0
      s += 2;
3396
0
      d += 2;
3397
0
  }
3398
0
    }
3399
3400
0
    _mm_empty ();
3401
3402
0
    return TRUE;
3403
0
}
3404
3405
static void
3406
mmx_composite_copy_area (pixman_implementation_t *imp,
3407
                         pixman_composite_info_t *info)
3408
0
{
3409
0
    PIXMAN_COMPOSITE_ARGS (info);
3410
3411
0
    mmx_blt (imp, src_image->bits.bits,
3412
0
       dest_image->bits.bits,
3413
0
       src_image->bits.rowstride,
3414
0
       dest_image->bits.rowstride,
3415
0
       PIXMAN_FORMAT_BPP (src_image->bits.format),
3416
0
       PIXMAN_FORMAT_BPP (dest_image->bits.format),
3417
0
       src_x, src_y, dest_x, dest_y, width, height);
3418
0
}
3419
3420
static void
3421
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3422
                                pixman_composite_info_t *info)
3423
0
{
3424
0
    PIXMAN_COMPOSITE_ARGS (info);
3425
0
    uint32_t  *src, *src_line;
3426
0
    uint32_t  *dst, *dst_line;
3427
0
    uint8_t  *mask, *mask_line;
3428
0
    int src_stride, mask_stride, dst_stride;
3429
0
    int32_t w;
3430
3431
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3432
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3433
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3434
3435
0
    while (height--)
3436
0
    {
3437
0
  src = src_line;
3438
0
  src_line += src_stride;
3439
0
  dst = dst_line;
3440
0
  dst_line += dst_stride;
3441
0
  mask = mask_line;
3442
0
  mask_line += mask_stride;
3443
3444
0
  w = width;
3445
3446
0
  while (w--)
3447
0
  {
3448
0
      uint64_t m = *mask;
3449
3450
0
      if (m)
3451
0
      {
3452
0
    uint32_t ssrc = *src | 0xff000000;
3453
0
    __m64 s = load8888 (&ssrc);
3454
3455
0
    if (m == 0xff)
3456
0
    {
3457
0
        store8888 (dst, s);
3458
0
    }
3459
0
    else
3460
0
    {
3461
0
        __m64 sa = expand_alpha (s);
3462
0
        __m64 vm = expand_alpha_rev (to_m64 (m));
3463
0
        __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3464
3465
0
        store8888 (dst, vdest);
3466
0
    }
3467
0
      }
3468
3469
0
      mask++;
3470
0
      dst++;
3471
0
      src++;
3472
0
  }
3473
0
    }
3474
3475
0
    _mm_empty ();
3476
0
}
3477
3478
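/* OVER_REVERSE with a solid source: the operands of over () are swapped, so
 * each destination pixel is composited over the constant source, i.e.
 *
 *     new = dst + (1 - alpha(dst)) * src
 */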
static void
3479
mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3480
                                   pixman_composite_info_t *info)
3481
0
{
3482
0
    PIXMAN_COMPOSITE_ARGS (info);
3483
0
    uint32_t src;
3484
0
    uint32_t    *dst_line, *dst;
3485
0
    int32_t w;
3486
0
    int dst_stride;
3487
0
    __m64 vsrc;
3488
3489
0
    CHECKPOINT ();
3490
3491
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3492
3493
0
    if (src == 0)
3494
0
  return;
3495
3496
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3497
3498
0
    vsrc = load8888 (&src);
3499
3500
0
    while (height--)
3501
0
    {
3502
0
  dst = dst_line;
3503
0
  dst_line += dst_stride;
3504
0
  w = width;
3505
3506
0
  CHECKPOINT ();
3507
3508
0
  while (w && (uintptr_t)dst & 7)
3509
0
  {
3510
0
      __m64 vdest = load8888 (dst);
3511
3512
0
      store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3513
3514
0
      w--;
3515
0
      dst++;
3516
0
  }
3517
3518
0
  while (w >= 2)
3519
0
  {
3520
0
      __m64 vdest = *(__m64 *)dst;
3521
0
      __m64 dest0 = expand8888 (vdest, 0);
3522
0
      __m64 dest1 = expand8888 (vdest, 1);
3523
3524
3525
0
      dest0 = over (dest0, expand_alpha (dest0), vsrc);
3526
0
      dest1 = over (dest1, expand_alpha (dest1), vsrc);
3527
3528
0
      *(__m64 *)dst = pack8888 (dest0, dest1);
3529
3530
0
      dst += 2;
3531
0
      w -= 2;
3532
0
  }
3533
3534
0
  CHECKPOINT ();
3535
3536
0
  if (w)
3537
0
  {
3538
0
      __m64 vdest = load8888 (dst);
3539
3540
0
      store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3541
0
  }
3542
0
    }
3543
3544
0
    _mm_empty ();
3545
0
}
3546
3547
static force_inline void
3548
scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t*       pd,
3549
                                            const uint32_t* ps,
3550
                                            int32_t         w,
3551
                                            pixman_fixed_t  vx,
3552
                                            pixman_fixed_t  unit_x,
3553
                                            pixman_fixed_t  src_width_fixed,
3554
                                            pixman_bool_t   fully_transparent_src)
3555
0
{
3556
0
    if (fully_transparent_src)
3557
0
  return;
3558
3559
0
    while (w)
3560
0
    {
3561
0
  __m64 d = load (pd);
3562
0
  __m64 s = load (ps + pixman_fixed_to_int (vx));
3563
0
  vx += unit_x;
3564
0
  while (vx >= 0)
3565
0
      vx -= src_width_fixed;
3566
3567
0
  store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
3568
0
  pd++;
3569
3570
0
  w--;
3571
0
    }
3572
3573
0
    _mm_empty ();
3574
0
}
3575
3576
FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
3577
           scaled_nearest_scanline_mmx_8888_8888_OVER,
3578
           uint32_t, uint32_t, COVER)
3579
FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
3580
           scaled_nearest_scanline_mmx_8888_8888_OVER,
3581
           uint32_t, uint32_t, NONE)
3582
FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
3583
           scaled_nearest_scanline_mmx_8888_8888_OVER,
3584
           uint32_t, uint32_t, PAD)
3585
FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
3586
           scaled_nearest_scanline_mmx_8888_8888_OVER,
3587
           uint32_t, uint32_t, NORMAL)
3588
3589
static force_inline void
3590
scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask,
3591
                uint32_t *       dst,
3592
                const uint32_t * src,
3593
                int32_t          w,
3594
                pixman_fixed_t   vx,
3595
                pixman_fixed_t   unit_x,
3596
                pixman_fixed_t   src_width_fixed,
3597
                pixman_bool_t    zero_src)
3598
0
{
3599
0
    __m64 mm_mask;
3600
3601
0
    if (zero_src || (*mask >> 24) == 0)
3602
0
    {
3603
  /* A workaround for https://gcc.gnu.org/PR47759 */
3604
0
  _mm_empty ();
3605
0
  return;
3606
0
    }
3607
3608
0
    mm_mask = expand_alpha (load8888 (mask));
3609
3610
0
    while (w)
3611
0
    {
3612
0
  uint32_t s = *(src + pixman_fixed_to_int (vx));
3613
0
  vx += unit_x;
3614
0
  while (vx >= 0)
3615
0
      vx -= src_width_fixed;
3616
3617
0
  if (s)
3618
0
  {
3619
0
      __m64 ms = load8888 (&s);
3620
0
      __m64 alpha = expand_alpha (ms);
3621
0
      __m64 dest  = load8888 (dst);
3622
3623
0
      store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
3624
0
  }
3625
3626
0
  dst++;
3627
0
  w--;
3628
0
    }
3629
3630
0
    _mm_empty ();
3631
0
}
3632
3633
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
3634
            scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3635
            uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
3636
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
3637
            scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3638
            uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
3639
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
3640
            scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3641
            uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
3642
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
3643
            scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3644
            uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
3645
3646
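/* Bilinear scaling: wt and wb are the vertical weights of the top and
 * bottom source rows, and the horizontal weights are derived from the
 * fractional bits of vx, BILINEAR_INTERPOLATION_BITS wide.  Per channel the
 * macro below computes, roughly,
 *
 *     ((tl*(B-wx) + tr*wx)*wt + (bl*(B-wx) + br*wx)*wb) >> (2*BITS)
 *
 * with B = BSHIFT, wx the fraction of vx, and tl/tr/bl/br the 2x2
 * neighbourhood of the sample point (vertical lerp first, then horizontal
 * via pmaddwd).
 */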
0
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3647
0
#define BMSK (BSHIFT - 1)
3648
3649
#define BILINEAR_DECLARE_VARIABLES            \
3650
0
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);        \
3651
0
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);        \
3652
0
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);       \
3653
0
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);      \
3654
0
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);    \
3655
0
    const __m64 mm_zero = _mm_setzero_si64 ();          \
3656
0
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3657
3658
0
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)         \
3659
0
do {                   \
3660
0
    /* fetch 2x2 pixel block into 2 mmx registers */        \
3661
0
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);    \
3662
0
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);    \
3663
0
    /* vertical interpolation */            \
3664
0
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);   \
3665
0
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);   \
3666
0
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);   \
3667
0
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);   \
3668
0
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);         \
3669
0
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);         \
3670
0
    /* calculate horizontal weights */            \
3671
0
    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,   \
3672
0
        _mm_srli_pi16 (mm_x,          \
3673
0
           16 - BILINEAR_INTERPOLATION_BITS)));  \
3674
0
    /* horizontal interpolation */            \
3675
0
    __m64 p = _mm_unpacklo_pi16 (lo, hi);         \
3676
0
    __m64 q = _mm_unpackhi_pi16 (lo, hi);         \
3677
0
    vx += unit_x;               \
3678
0
    lo = _mm_madd_pi16 (p, mm_wh);            \
3679
0
    hi = _mm_madd_pi16 (q, mm_wh);            \
3680
0
    mm_x = _mm_add_pi16 (mm_x, mm_ux);            \
3681
0
    /* shift and pack the result */           \
3682
0
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);      \
3683
0
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);      \
3684
0
    lo = _mm_packs_pi32 (lo, hi);           \
3685
0
    lo = _mm_packs_pu16 (lo, lo);           \
3686
0
    pix = lo;                 \
3687
0
} while (0)
3688
3689
0
#define BILINEAR_SKIP_ONE_PIXEL()           \
3690
0
do {                   \
3691
0
    vx += unit_x;               \
3692
0
    mm_x = _mm_add_pi16 (mm_x, mm_ux);            \
3693
0
} while (0)
3694
3695
static force_inline void
3696
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
3697
              const uint32_t * mask,
3698
              const uint32_t * src_top,
3699
              const uint32_t * src_bottom,
3700
              int32_t          w,
3701
              int              wt,
3702
              int              wb,
3703
              pixman_fixed_t   vx,
3704
              pixman_fixed_t   unit_x,
3705
              pixman_fixed_t   max_vx,
3706
              pixman_bool_t    zero_src)
3707
0
{
3708
0
    BILINEAR_DECLARE_VARIABLES;
3709
0
    __m64 pix;
3710
3711
0
    while (w--)
3712
0
    {
3713
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3714
0
  store (dst, pix);
3715
0
  dst++;
3716
0
    }
3717
3718
0
    _mm_empty ();
3719
0
}
3720
3721
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3722
             scaled_bilinear_scanline_mmx_8888_8888_SRC,
3723
             uint32_t, uint32_t, uint32_t,
3724
             COVER, FLAG_NONE)
3725
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3726
             scaled_bilinear_scanline_mmx_8888_8888_SRC,
3727
             uint32_t, uint32_t, uint32_t,
3728
             PAD, FLAG_NONE)
3729
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3730
             scaled_bilinear_scanline_mmx_8888_8888_SRC,
3731
             uint32_t, uint32_t, uint32_t,
3732
             NONE, FLAG_NONE)
3733
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3734
             scaled_bilinear_scanline_mmx_8888_8888_SRC,
3735
             uint32_t, uint32_t, uint32_t,
3736
             NORMAL, FLAG_NONE)
3737
3738
static force_inline void
3739
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
3740
               const uint32_t * mask,
3741
               const uint32_t * src_top,
3742
               const uint32_t * src_bottom,
3743
               int32_t          w,
3744
               int              wt,
3745
               int              wb,
3746
               pixman_fixed_t   vx,
3747
               pixman_fixed_t   unit_x,
3748
               pixman_fixed_t   max_vx,
3749
               pixman_bool_t    zero_src)
3750
0
{
3751
0
    BILINEAR_DECLARE_VARIABLES;
3752
0
    __m64 pix1, pix2;
3753
3754
0
    while (w)
3755
0
    {
3756
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3757
3758
0
  if (!is_zero (pix1))
3759
0
  {
3760
0
      pix2 = load (dst);
3761
0
      store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3762
0
  }
3763
3764
0
  w--;
3765
0
  dst++;
3766
0
    }
3767
3768
0
    _mm_empty ();
3769
0
}
3770
3771
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3772
             scaled_bilinear_scanline_mmx_8888_8888_OVER,
3773
             uint32_t, uint32_t, uint32_t,
3774
             COVER, FLAG_NONE)
3775
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3776
             scaled_bilinear_scanline_mmx_8888_8888_OVER,
3777
             uint32_t, uint32_t, uint32_t,
3778
             PAD, FLAG_NONE)
3779
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3780
             scaled_bilinear_scanline_mmx_8888_8888_OVER,
3781
             uint32_t, uint32_t, uint32_t,
3782
             NONE, FLAG_NONE)
3783
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3784
             scaled_bilinear_scanline_mmx_8888_8888_OVER,
3785
             uint32_t, uint32_t, uint32_t,
3786
             NORMAL, FLAG_NONE)
3787
3788
static force_inline void
3789
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
3790
                 const uint8_t  * mask,
3791
                 const uint32_t * src_top,
3792
                 const uint32_t * src_bottom,
3793
                 int32_t          w,
3794
                 int              wt,
3795
                 int              wb,
3796
                 pixman_fixed_t   vx,
3797
                 pixman_fixed_t   unit_x,
3798
                 pixman_fixed_t   max_vx,
3799
                 pixman_bool_t    zero_src)
3800
0
{
3801
0
    BILINEAR_DECLARE_VARIABLES;
3802
0
    __m64 pix1, pix2;
3803
0
    uint32_t m;
3804
3805
0
    while (w)
3806
0
    {
3807
0
  m = (uint32_t) *mask++;
3808
3809
0
  if (m)
3810
0
  {
3811
0
      BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3812
3813
0
      if (m == 0xff && is_opaque (pix1))
3814
0
      {
3815
0
    store (dst, pix1);
3816
0
      }
3817
0
      else
3818
0
      {
3819
0
    __m64 ms, md, ma, msa;
3820
3821
0
    pix2 = load (dst);
3822
0
    ma = expand_alpha_rev (to_m64 (m));
3823
0
    ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3824
0
    md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3825
3826
0
    msa = expand_alpha (ms);
3827
3828
0
    store8888 (dst, (in_over (ms, msa, ma, md)));
3829
0
      }
3830
0
  }
3831
0
  else
3832
0
  {
3833
0
      BILINEAR_SKIP_ONE_PIXEL ();
3834
0
  }
3835
3836
0
  w--;
3837
0
  dst++;
3838
0
    }
3839
3840
0
    _mm_empty ();
3841
0
}
3842
3843
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3844
             scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3845
             uint32_t, uint8_t, uint32_t,
3846
             COVER, FLAG_HAVE_NON_SOLID_MASK)
3847
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3848
             scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3849
             uint32_t, uint8_t, uint32_t,
3850
             PAD, FLAG_HAVE_NON_SOLID_MASK)
3851
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3852
             scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3853
             uint32_t, uint8_t, uint32_t,
3854
             NONE, FLAG_HAVE_NON_SOLID_MASK)
3855
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3856
             scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3857
             uint32_t, uint8_t, uint32_t,
3858
             NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3859
3860
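/* Iterator fetchers: each call converts one scanline of the underlying
 * image to a8r8g8b8 in iter->buffer and advances iter->bits by the stride.
 * x8r8g8b8 only needs the alpha byte forced to 0xff, r5g6b5 goes through
 * expand_4xpacked565, and a8 is widened so each byte becomes the alpha of
 * an otherwise black pixel.
 */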
static uint32_t *
3861
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3862
0
{
3863
0
    int w = iter->width;
3864
0
    uint32_t *dst = iter->buffer;
3865
0
    uint32_t *src = (uint32_t *)iter->bits;
3866
3867
0
    iter->bits += iter->stride;
3868
3869
0
    while (w && ((uintptr_t)dst) & 7)
3870
0
    {
3871
0
  *dst++ = (*src++) | 0xff000000;
3872
0
  w--;
3873
0
    }
3874
3875
0
    while (w >= 8)
3876
0
    {
3877
0
  __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3878
0
  __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3879
0
  __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3880
0
  __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3881
3882
0
  *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3883
0
  *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3884
0
  *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3885
0
  *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3886
3887
0
  dst += 8;
3888
0
  src += 8;
3889
0
  w -= 8;
3890
0
    }
3891
3892
0
    while (w)
3893
0
    {
3894
0
  *dst++ = (*src++) | 0xff000000;
3895
0
  w--;
3896
0
    }
3897
3898
0
    _mm_empty ();
3899
0
    return iter->buffer;
3900
0
}

/* Fetch r5g6b5 pixels into the iterator buffer, widening them to
 * a8r8g8b8 (four at a time via expand_4xpacked565). */
static uint32_t *
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    /* Head: single pixels until dst is 16-byte aligned */
    while (w && ((uintptr_t)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    while (w >= 4)
    {
        __m64 vsrc = ldq_u ((__m64 *)src);
        __m64 mm0, mm1;

        expand_4xpacked565 (vsrc, &mm0, &mm1, 1);

        *(__m64 *)(dst + 0) = mm0;
        *(__m64 *)(dst + 2) = mm1;

        dst += 4;
        src += 4;
        w -= 4;
    }

    /* Tail: remaining pixels, one at a time */
    while (w)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}
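
/* Illustrative scalar equivalent of the widening above (a hypothetical
 * helper, not pixman's convert_0565_to_8888, though the result is the
 * same): each field is widened by replicating its top bits into the
 * vacated low bits, so 0x1f and 0x3f both expand to exactly 0xff. */
static inline uint32_t
expand_0565_scalar (uint16_t s)
{
    uint32_t r = (s >> 11) & 0x1f;
    uint32_t g = (s >> 5)  & 0x3f;
    uint32_t b = s         & 0x1f;

    r = (r << 3) | (r >> 2);    /* 5 bits -> 8 bits */
    g = (g << 2) | (g >> 4);    /* 6 bits -> 8 bits */
    b = (b << 3) | (b >> 2);

    return 0xff000000 | (r << 16) | (g << 8) | b;
}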

/* Fetch a8 samples into the iterator buffer as alpha-only a8r8g8b8.
 * The unpack cascade interleaves zeros below each byte and then below
 * each 16-bit unit, leaving every sample in the top byte of its own
 * 32-bit lane (i.e. the equivalent of "alpha << 24"). */
static uint32_t *
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    /* Head: single pixels until dst is 16-byte aligned */
    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = (uint32_t)*(src++) << 24;
        w--;
    }

    while (w >= 8)
    {
        __m64 mm0 = ldq_u ((__m64 *)src);

        __m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
        __m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
        __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
        __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
        __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
        __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);

        *(__m64 *)(dst + 0) = mm3;
        *(__m64 *)(dst + 2) = mm4;
        *(__m64 *)(dst + 4) = mm5;
        *(__m64 *)(dst + 6) = mm6;

        dst += 8;
        src += 8;
        w -= 8;
    }

    /* Tail: remaining pixels, one at a time */
    while (w)
    {
        *dst++ = (uint32_t)*(src++) << 24;
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t mmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};

static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, x8r8g8b8, mmx_8888_8888                            ),
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, x8b8g8r8, mmx_8888_8888                            ),
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, a8r8g8b8, mmx_8888_8888                            ),
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, a8b8g8r8, mmx_8888_8888                            ),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888                 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888                 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888                 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888                 ),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),

    { PIXMAN_OP_NONE },
};
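
/* Illustrative sketch (simplified, with assumed types -- not pixman's
 * actual internals): how a fast-path table like mmx_fast_paths is
 * typically consulted at composite time.  The real lookup in
 * pixman-implementation.c also matches image flags; this just shows
 * the first-match-wins search that ends at the sentinel entry. */
typedef struct
{
    int   op;
    int   src_format;
    int   mask_format;
    int   dest_format;
    void *func;
} fast_path_sketch_t;

static void *
lookup_fast_path_sketch (const fast_path_sketch_t *table, int op_none,
                         int op, int src, int mask, int dest)
{
    for (; table->op != op_none; table++)    /* { PIXMAN_OP_NONE } ends the table */
    {
        if (table->op == op &&
            table->src_format == src &&
            table->mask_format == mask &&
            table->dest_format == dest)
        {
            return table->func;              /* first match wins */
        }
    }

    return NULL;    /* no match: delegate to the fallback implementation */
}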

pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->iter_info = mmx_iters;

    return imp;
}
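
/* Illustrative sketch, not code from this file: the CPU-detection layer
 * (pixman-x86.c and friends) is expected to chain implementations so that
 * the MMX one handles what it can and delegates the rest to its fallback.
 * "cpu_has_mmx" stands in for the real feature test, roughly:
 *
 *     pixman_implementation_t *imp = _pixman_implementation_create_general ();
 *
 *     if (cpu_has_mmx)
 *         imp = _pixman_implementation_create_mmx (imp);
 */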
#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */