Coverage Report

Created: 2025-07-23 08:13

/src/cairo/subprojects/pixman-0.44.2/pixman/pixman-mmx.c
Coverage summary: every instrumented line in the listing below recorded an execution count of 0, i.e. none of this file's MMX code was exercised by the run. Source listing:
/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#ifdef HAVE_CONFIG_H
#include <pixman-config.h>
#endif

#if defined USE_X86_MMX || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#else
#include <mmintrin.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

#ifdef USE_X86_MMX
# if (defined(__SSE2__) || defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
         : "=r" (ret)
         : "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
         : "+y" (__A)
         : "y" (__B)
    );
    return __A;
}

# define _mm_shuffle_pi16(A, N)                                 \
    ({                                                          \
        __m64 ret;                                              \
                                                                \
        asm ("pshufw %2, %1, %0\n\t"                            \
             : "=y" (ret)                                       \
             : "y" (A), "K" ((const int8_t)N)                   \
        );                                                      \
                                                                \
        ret;                                                    \
    })
# endif
#endif

#ifndef _MM_SHUFFLE
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif

/* Notes about writing mmx code
 *
 * Give memory operands as the second operand. If you give them as the
 * first, gcc will first load the operand into a register and then use
 * that register, i.e. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies, i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */

/* --------------- MMX primitives ------------------------------------- */

/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# ifdef __clang__
#  define USE_CVT_INTRINSICS
# else
#  define M64_MEMBER m64_u64
# endif
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif

typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field =   { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field =   val ## ULL
#endif

static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
};

#ifdef USE_CVT_INTRINSICS
#    define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
#    define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
#    define MC(x) c.mmx_ ## x
#endif

static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#elif defined USE_M64_DOUBLE
    return *(__m64 *)&x;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}

static force_inline __m64
shift (__m64 v,
       int   s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}

/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
 * and maps its result to the same range.
 *
 * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
 * Notation, Notation, Notation", the first of which is
 *
 *   prod(a, b) = (a * b + 128) / 255.
 *
 * By approximating the division by 255 as 257/65536 it can be replaced by a
 * multiply and a right shift. This is the implementation that we use in
 * pix_multiply(), but we use _mm_mulhi_pu16() with 257 (part of SSE1 or
 * Extended 3DNow!, and unavailable at the time of the book's publication)
 * to perform the multiplication and right shift in a single operation.
 *
 *   prod(a, b) = ((a * b + 128) * 257) >> 16.
 *
 * A third way also exists (it is how pix_multiply() was implemented prior
 * to 14208344); it performs the multiplication by 257 with adds and shifts.
 *
 * Where temp = a * b + 128
 *
 *   prod(a, b) = (temp + (temp >> 8)) >> 8.
 */
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}
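
A quick scalar check of the arithmetic above (illustrative only; mul_un8_ref is a made-up name, not a pixman function):

#include <assert.h>
#include <stdint.h>

/* Scalar reference for prod(a, b) = ((a * b + 128) * 257) >> 16. */
static uint8_t
mul_un8_ref (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t)a * b + 128;

    return (uint8_t)((t * 257) >> 16);
}

int
main (void)
{
    assert (mul_un8_ref (0xff, 0xff) == 0xff); /* (65153 * 257) >> 16 = 255 */
    assert (mul_un8_ref (0x80, 0x80) == 0x40); /* (16512 * 257) >> 16 = 64  */
    return 0;
}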

static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}

static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over (__m64 src,
      __m64 srca,
      __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

#define in_over(src, srca, mask, dest)          \
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif

/* Elemental unaligned loads */

static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed, but that's no excuse */
    __m64 r;
    memcpy(&r, p, sizeof(__m64));
    return r;
#else
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}

static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    uint32_t r;
    memcpy(&r, p, sizeof(uint32_t));
    return r;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}
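
The memcpy in ldq_u/ldl_u is the portable idiom for an unaligned load: for a known fixed size, compilers fold it into a single move instruction rather than a function call, while directly dereferencing a misaligned pointer would be undefined behaviour. A standalone sketch of the same idiom (hypothetical helper, not part of pixman):

#include <stdint.h>
#include <string.h>

/* Load 32 bits from a possibly misaligned address. */
static inline uint32_t
load32_unaligned (const void *p)
{
    uint32_t v;

    memcpy (&v, p, sizeof v);
    return v;
}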

static force_inline __m64
load (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    __m64 ret;
    asm ("lwc1 %0, %1\n\t"
         : "=f" (ret)
         : "m" (*v)
    );
    return ret;
#else
    return _mm_cvtsi32_si64 (*v);
#endif
}

static force_inline __m64
load8888 (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
#else
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
#endif
}

static force_inline __m64
load8888u (const uint32_t *v)
{
    uint32_t l = ldl_u (v);
    return load8888 (&l);
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline void
store (uint32_t *dest, __m64 v)
{
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"
         : "=m" (*dest)
         : "f" (v)
         : "memory"
    );
#else
    *dest = _mm_cvtsi64_si32 (v);
#endif
}

static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
    v = pack8888 (v, _mm_setzero_si64 ());
    store (dest, v);
}

static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
{
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return a == b;
#else
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
#endif
}

static force_inline pixman_bool_t
is_opaque (__m64 v)
{
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
#else
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
#endif
}

static force_inline pixman_bool_t
is_zero (__m64 v)
{
    return is_equal (v, _mm_setzero_si64 ());
}

/* Expand 16 bits positioned at @pos (0-3) of an mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
#else
    p = shift (shift (p, (3 - pos) * 16), -48);
#endif

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
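
The multiply-and-shift above is bit replication in disguise: for the 5-bit blue field, multiplying by 0x0840 computes (b << 11) | (b << 6), and the final shift right by 8 leaves (b << 3) | (b >> 2). A scalar sketch that produces the same expansion (hypothetical helper, not part of pixman):

#include <stdint.h>

/* Expand one r5g6b5 pixel to 00RRGGBB by replicating each field's high
 * bits into its low bits. */
static uint32_t
expand565_ref (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2); /* 5 -> 8 bits */
    g = (g << 2) | (g >> 4); /* 6 -> 8 bits */
    b = (b << 3) | (b >> 2); /* 5 -> 8 bits */

    return (r << 16) | (g << 8) | b;
}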

/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBBRRGGBB
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    if (full_alpha)
        alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);            /* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);                /* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);         /* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);         /* A3 R3 G3 B3 A2 R2 G2 B2 */
}

static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}

static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
{
    __m64 v0, v1;
    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    *vout0 = expand8888 (v0, 0);
    *vout1 = expand8888 (v0, 1);
    *vout2 = expand8888 (v1, 0);
    *vout3 = expand8888 (v1, 1);
}

static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0  + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);
#else
    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0  + 3) + pos * 16);

    if (pos == 0)
        t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
        t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
        t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
        t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
#endif
}

static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift (t0, -5);
    t1 = shift (t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
}
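
Packing to 565 is plain truncation: each channel keeps only its top bits (the packed_565_rb and packed_565_g masks select them) and the madd/shift sequence just repositions the fields. A scalar equivalent for one pixel (hypothetical helper, not part of pixman):

#include <stdint.h>

/* Truncate one x8r8g8b8 pixel to r5g6b5. */
static uint16_t
pack565_ref (uint32_t p)
{
    uint32_t r = (p >> 16) & 0xff;
    uint32_t g = (p >> 8) & 0xff;
    uint32_t b = p & 0xff;

    return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}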

#ifndef _MSC_VER

static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
{
    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
}

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

#define pack_4x565(v0, v1, v2, v3) \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

#define pix_add_mul(x, a, y, b)  \
    ( x = pix_multiply (x, a),   \
      y = pix_multiply (y, b),   \
      pix_add (x, y) )

#endif

/* --------------- MMX code patch for fbcompose.c --------------------- */

static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
{
    __m64 vsrc = load8888 (src);

    if (mask)
    {
        __m64 m = load8888 (mask);

        m = expand_alpha (m);
        vsrc = pix_multiply (vsrc, m);
    }

    return vsrc;
}

static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());

    if (is_opaque (vsrc))
    {
        return vsrc;
    }
    else if (!is_zero (vsrc))
    {
        return over (vsrc, expand_alpha (vsrc),
                     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    }

    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
}

static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 vsrc = combine (src, mask);

        if (is_opaque (vsrc))
        {
            store8888 (dest, vsrc);
        }
        else if (!is_zero (vsrc))
        {
            __m64 sa = expand_alpha (vsrc);
            store8888 (dest, over (vsrc, sa, load8888 (dest)));
        }

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
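
Per channel, the premultiplied OVER implemented here is dst = src + dst * (255 - alpha(src)) / 255, with is_opaque/is_zero short-circuiting the blend for fully opaque or fully transparent sources. A scalar sketch of the same operator (illustrative only; over_ref is not a pixman function):

#include <stdint.h>

/* a8r8g8b8 OVER for premultiplied pixels, one channel at a time. */
static uint32_t
over_ref (uint32_t src, uint32_t dst)
{
    uint32_t ia = 255 - (src >> 24); /* inverse source alpha */
    uint32_t out = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t s = (src >> shift) & 0xff;
        uint32_t d = (dst >> shift) & 0xff;
        uint32_t t = d * ia + 128;
        uint32_t c = s + ((t * 257) >> 16);

        if (c > 255) /* mirror the saturating _mm_adds_pu8 */
            c = 255;

        out |= c << shift;
    }

    return out;
}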

static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d, da;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        da = expand_alpha (d);
        store8888 (dest, over (d, da, s));

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 da, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        sia = negate (sia);
        da = expand_alpha (d);
        s = pix_add_mul (s, da, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end;

    end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sa;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sa = expand_alpha (s);
        dia = expand_alpha (d);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sa);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        dia = expand_alpha (d);
        sia = negate (sia);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        s = pix_add (s, d);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t *               dest,
                        const uint32_t *         src,
                        const uint32_t *         mask,
                        int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        uint32_t s, sa, da;
        uint32_t d = *dest;
        __m64 ms = combine (src, mask);
        __m64 md = load8888 (dest);

        store8888 (&s, ms);
        da = ~d >> 24;
        sa = s >> 24;

        if (sa > da)
        {
            uint32_t quot = DIV_UN8 (da, sa) << 24;
            __m64 msa = load8888 (&quot);
            msa = expand_alpha (msa);
            ms = pix_multiply (ms, msa);
        }

        md = pix_add (md, ms);
        store8888 (dest, md);

        ++src;
        ++dest;
        if (mask)
            mask++;
    }
    _mm_empty ();
}
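
SATURATE adds only as much source as the destination has alpha headroom for: da = ~d >> 24 is 255 minus the destination alpha, and when the source alpha sa exceeds it, the source is pre-scaled by da/sa via DIV_UN8 (from pixman-combine32.h) so the summed alpha stays in range. A scalar sketch of that division step (div_un8_ref is a stand-in for the macro; it assumes b != 0):

#include <stdint.h>

/* Rounded a * 255 / b, as used to compute the da/sa scale factor. */
static uint8_t
div_un8_ref (uint8_t a, uint8_t b)
{
    return (uint8_t)(((uint32_t)a * 255 + b / 2) / b);
}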

static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);

        s = pix_multiply (s, a);
        store8888 (dest, s);

        ++src;
        ++mask;
        ++dest;
    }
    _mm_empty ();
}
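
The _ca combiners implement component alpha: the mask carries a separate 8-bit weight per color channel, so it is multiplied in whole (as above) instead of being replicated from its alpha byte the way the _u combiners do via expand_alpha. A scalar sketch of the per-channel multiply they are built on (hypothetical helper, not part of pixman):

#include <stdint.h>

/* Channel-wise product of two a8r8g8b8 values, rounded like pix_multiply. */
static uint32_t
pix_multiply_ref (uint32_t p, uint32_t q)
{
    uint32_t out = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t t = ((p >> shift) & 0xff) * ((q >> shift) & 0xff) + 128;

        out |= (((t * 257) >> 16) & 0xff) << shift;
    }

    return out;
}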

static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        store8888 (dest, in_over (s, sa, a, d));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        store8888 (dest, over (d, da, in (s, a)));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        da = negate (da);
        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);

        s = pix_multiply (s, a);
        d = pix_add (s, d);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

/* ------------- MMX code paths called from fbpict.c -------------------- */

static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

            w--;
            dst++;
        }

        while (w >= 2)
        {
            __m64 vdest;
            __m64 dest0, dest1;

            vdest = *(__m64 *)dst;

            dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
            dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
        }
    }

    _mm_empty ();
}
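
Every composite path below follows the same loop shape, sketched here in a comment (process_one and process_two are hypothetical stand-ins for the per-operator work):

/*
 *     while (w && (uintptr_t)dst & 7) { process_one (dst++); w--;            }
 *     while (w >= 2)                  { process_two (dst); dst += 2; w -= 2; }
 *     if (w)                          { process_one (dst);                   }
 *
 * The head loop retires pixels one at a time until dst is 8-byte aligned,
 * the body then uses aligned *(__m64 *) loads and stores (unrolled 2, 4 or
 * 16 pixels at a time depending on the path), and the tail handles any
 * leftover pixels.
 */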

static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 v0, v1, v2, v3;

            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

            v0 = over (vsrc, vsrca, v0);
            v1 = over (vsrc, vsrca, v1);
            v2 = over (vsrc, vsrca, v2);
            v3 = over (vsrc, vsrca, v3);

            *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

            dst += 4;
            w -= 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line;
    uint32_t    *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint32_t *q = (uint32_t *)dst_line;

        while (twidth && (uintptr_t)q & 7)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 2)
        {
            uint32_t m0, m1;
            m0 = *p;
            m1 = *(p + 1);

            if (m0 | m1)
            {
                __m64 dest0, dest1;
                __m64 vdest = *(__m64 *)q;

                dest0 = in_over (vsrc, vsrca, load8888 (&m0),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, load8888 (&m1),
                                 expand8888 (vdest, 1));

                *(__m64 *)q = pack8888 (dest0, dest1);
            }

            p += 2;
            q += 2;
            twidth -= 2;
        }

        if (twidth)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        dst_line += dst_stride;
        mask_line += mask_stride;
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    vmask = expand_alpha (load8888 (&mask));

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            __m64 vs = ldq_u ((__m64 *)src);
            __m64 vd = *(__m64 *)dst;
            __m64 vsrc0 = expand8888 (vs, 0);
            __m64 vsrc1 = expand8888 (vs, 1);

            *(__m64 *)dst = pack8888 (
                in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
                in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

            w -= 2;
            dst += 2;
            src += 2;
        }

        if (w)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    vmask = expand_alpha (load8888 (&mask));
    srca = MC (4x00ff);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 16)
        {
            __m64 vd0 = *(__m64 *)(dst + 0);
            __m64 vd1 = *(__m64 *)(dst + 2);
            __m64 vd2 = *(__m64 *)(dst + 4);
            __m64 vd3 = *(__m64 *)(dst + 6);
            __m64 vd4 = *(__m64 *)(dst + 8);
            __m64 vd5 = *(__m64 *)(dst + 10);
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);

            __m64 vs0 = ldq_u ((__m64 *)(src + 0));
            __m64 vs1 = ldq_u ((__m64 *)(src + 2));
            __m64 vs2 = ldq_u ((__m64 *)(src + 4));
            __m64 vs3 = ldq_u ((__m64 *)(src + 6));
            __m64 vs4 = ldq_u ((__m64 *)(src + 8));
            __m64 vs5 = ldq_u ((__m64 *)(src + 10));
            __m64 vs6 = ldq_u ((__m64 *)(src + 12));
            __m64 vs7 = ldq_u ((__m64 *)(src + 14));

            vd0 = pack8888 (
                in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
                in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

            vd1 = pack8888 (
                in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
                in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

            vd2 = pack8888 (
                in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
                in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

            vd3 = pack8888 (
                in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
                in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

            vd4 = pack8888 (
                in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
                in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

            vd5 = pack8888 (
                in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
                in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

            vd6 = pack8888 (
                in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
                in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

            vd7 = pack8888 (
                in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
                in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

            *(__m64 *)(dst + 0) = vd0;
            *(__m64 *)(dst + 2) = vd1;
            *(__m64 *)(dst + 4) = vd2;
            *(__m64 *)(dst + 6) = vd3;
            *(__m64 *)(dst + 8) = vd4;
            *(__m64 *)(dst + 10) = vd5;
            *(__m64 *)(dst + 12) = vd6;
            *(__m64 *)(dst + 14) = vd7;

            w -= 16;
            dst += 16;
            src += 16;
        }

        while (w)
        {
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    uint8_t a;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w--)
        {
            s = *src++;
            a = s >> 24;

            if (a == 0xff)
            {
                *dst = s;
            }
            else if (s)
            {
                __m64 ms, sa;
                ms = load8888 (&s);
                sa = expand_alpha (ms);
                store8888 (dst, over (ms, sa, load8888 (dst)));
            }

            dst++;
        }
    }
    _mm_empty ();
}
1847
static void
1848
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1849
                              pixman_composite_info_t *info)
1850
0
{
1851
0
    PIXMAN_COMPOSITE_ARGS (info);
1852
0
    uint16_t    *dst_line, *dst;
1853
0
    uint32_t    *src_line, *src;
1854
0
    int dst_stride, src_stride;
1855
0
    int32_t w;
1856
1857
0
    CHECKPOINT ();
1858
1859
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1860
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1861
1862
#if 0
1863
    /* FIXME */
1864
    assert (src_image->drawable == mask_image->drawable);
1865
#endif
1866
1867
0
    while (height--)
1868
0
    {
1869
0
  dst = dst_line;
1870
0
  dst_line += dst_stride;
1871
0
  src = src_line;
1872
0
  src_line += src_stride;
1873
0
  w = width;
1874
1875
0
  CHECKPOINT ();
1876
1877
0
  while (w && (uintptr_t)dst & 7)
1878
0
  {
1879
0
      __m64 vsrc = load8888 (src);
1880
0
      uint64_t d = *dst;
1881
0
      __m64 vdest = expand565 (to_m64 (d), 0);
1882
1883
0
      vdest = pack_565 (
1884
0
    over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1885
1886
0
      *dst = to_uint64 (vdest);
1887
1888
0
      w--;
1889
0
      dst++;
1890
0
      src++;
1891
0
  }
1892
1893
0
  CHECKPOINT ();
1894
1895
0
  while (w >= 4)
1896
0
  {
1897
0
      __m64 vdest = *(__m64 *)dst;
1898
0
      __m64 v0, v1, v2, v3;
1899
0
      __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1900
1901
0
      expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1902
1903
0
      vsrc0 = load8888 ((src + 0));
1904
0
      vsrc1 = load8888 ((src + 1));
1905
0
      vsrc2 = load8888 ((src + 2));
1906
0
      vsrc3 = load8888 ((src + 3));
1907
1908
0
      v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1909
0
      v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1910
0
      v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1911
0
      v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1912
1913
0
      *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1914
1915
0
      w -= 4;
1916
0
      dst += 4;
1917
0
      src += 4;
1918
0
  }
1919
1920
0
  CHECKPOINT ();
1921
1922
0
  while (w)
1923
0
  {
1924
0
      __m64 vsrc = load8888 (src);
1925
0
      uint64_t d = *dst;
1926
0
      __m64 vdest = expand565 (to_m64 (d), 0);
1927
1928
0
      vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1929
1930
0
      *dst = to_uint64 (vdest);
1931
1932
0
      w--;
1933
0
      dst++;
1934
0
      src++;
1935
0
  }
1936
0
    }
1937
1938
0
    _mm_empty ();
1939
0
}
1940
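
The 0565 paths above repeatedly expand r5g6b5 pixels to 8888 for the arithmetic and pack them back for the store. The bit manipulation involved, written as plain shifts; this is a sketch, the real converters are the convert_8888_to_0565 / convert_0565_to_8888 inlines pulled in from pixman-inlines.h:

#include <stdint.h>

static uint16_t pack_8888_to_0565 (uint32_t p)
{
    return ((p >> 8) & 0xf800)   /* top 5 bits of red   */
         | ((p >> 5) & 0x07e0)   /* top 6 bits of green */
         | ((p >> 3) & 0x001f);  /* top 5 bits of blue  */
}

static uint32_t expand_0565_to_8888 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >>  5) & 0x3f;
    uint32_t b =  p        & 0x1f;

    /* replicate the high bits into the low bits, so that
     * 0x1f widens to 0xff rather than 0xf8 */
    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}
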
1941
static void
1942
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1943
                             pixman_composite_info_t *info)
1944
0
{
1945
0
    PIXMAN_COMPOSITE_ARGS (info);
1946
0
    uint32_t src, srca;
1947
0
    uint32_t *dst_line, *dst;
1948
0
    uint8_t *mask_line, *mask;
1949
0
    int dst_stride, mask_stride;
1950
0
    int32_t w;
1951
0
    __m64 vsrc, vsrca;
1952
0
    uint64_t srcsrc;
1953
1954
0
    CHECKPOINT ();
1955
1956
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1957
1958
0
    srca = src >> 24;
1959
0
    if (src == 0)
1960
0
  return;
1961
1962
0
    srcsrc = (uint64_t)src << 32 | src;
1963
1964
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1965
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1966
1967
0
    vsrc = load8888 (&src);
1968
0
    vsrca = expand_alpha (vsrc);
1969
1970
0
    while (height--)
1971
0
    {
1972
0
  dst = dst_line;
1973
0
  dst_line += dst_stride;
1974
0
  mask = mask_line;
1975
0
  mask_line += mask_stride;
1976
0
  w = width;
1977
1978
0
  CHECKPOINT ();
1979
1980
0
  while (w && (uintptr_t)dst & 7)
1981
0
  {
1982
0
      uint64_t m = *mask;
1983
1984
0
      if (m)
1985
0
      {
1986
0
    __m64 vdest = in_over (vsrc, vsrca,
1987
0
               expand_alpha_rev (to_m64 (m)),
1988
0
               load8888 (dst));
1989
1990
0
    store8888 (dst, vdest);
1991
0
      }
1992
1993
0
      w--;
1994
0
      mask++;
1995
0
      dst++;
1996
0
  }
1997
1998
0
  CHECKPOINT ();
1999
2000
0
  while (w >= 2)
2001
0
  {
2002
0
      uint64_t m0, m1;
2003
2004
0
      m0 = *mask;
2005
0
      m1 = *(mask + 1);
2006
2007
0
      if (srca == 0xff && (m0 & m1) == 0xff)
2008
0
      {
2009
0
    *(uint64_t *)dst = srcsrc;
2010
0
      }
2011
0
      else if (m0 | m1)
2012
0
      {
2013
0
    __m64 vdest;
2014
0
    __m64 dest0, dest1;
2015
2016
0
    vdest = *(__m64 *)dst;
2017
2018
0
    dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2019
0
         expand8888 (vdest, 0));
2020
0
    dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2021
0
         expand8888 (vdest, 1));
2022
2023
0
    *(__m64 *)dst = pack8888 (dest0, dest1);
2024
0
      }
2025
2026
0
      mask += 2;
2027
0
      dst += 2;
2028
0
      w -= 2;
2029
0
  }
2030
2031
0
  CHECKPOINT ();
2032
2033
0
  if (w)
2034
0
  {
2035
0
      uint64_t m = *mask;
2036
2037
0
      if (m)
2038
0
      {
2039
0
    __m64 vdest = load8888 (dst);
2040
2041
0
    vdest = in_over (
2042
0
        vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2043
0
    store8888 (dst, vdest);
2044
0
      }
2045
0
  }
2046
0
    }
2047
2048
0
    _mm_empty ();
2049
0
}
2050
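
Nearly every scanline routine in this file shares the three-phase shape that mmx_composite_over_n_8_8888 shows: peel pixels one at a time until dst reaches 8-byte alignment, run the wide loop with aligned 64-bit stores, then finish the sub-block tail. A schematic of that skeleton; process_one / process_two are placeholders for the real per-pixel work, not pixman functions:

#include <stdint.h>

void process_one (uint32_t *dst);   /* placeholder: single pixel      */
void process_two (uint32_t *dst);   /* placeholder: one __m64's worth */

void scanline_skeleton (uint32_t *dst, int32_t w)
{
    while (w && ((uintptr_t)dst & 7))   /* head: reach alignment */
    {
        process_one (dst);
        dst++;
        w--;
    }
    while (w >= 2)                      /* body: aligned 64-bit stores */
    {
        process_two (dst);
        dst += 2;
        w -= 2;
    }
    if (w)                              /* tail: the odd last pixel */
        process_one (dst);
}
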
2051
static pixman_bool_t
2052
mmx_fill (pixman_implementation_t *imp,
2053
          uint32_t *               bits,
2054
          int                      stride,
2055
          int                      bpp,
2056
          int                      x,
2057
          int                      y,
2058
          int                      width,
2059
          int                      height,
2060
          uint32_t       filler)
2061
0
{
2062
0
    uint64_t fill;
2063
0
    __m64 vfill;
2064
0
    uint32_t byte_width;
2065
0
    uint8_t     *byte_line;
2066
2067
0
#if defined __GNUC__ && defined USE_X86_MMX
2068
0
    __m64 v1, v2, v3, v4, v5, v6, v7;
2069
0
#endif
2070
2071
0
    if (bpp != 16 && bpp != 32 && bpp != 8)
2072
0
  return FALSE;
2073
2074
0
    if (bpp == 8)
2075
0
    {
2076
0
  stride = stride * (int) sizeof (uint32_t) / 1;
2077
0
  byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2078
0
  byte_width = width;
2079
0
  stride *= 1;
2080
0
        filler = (filler & 0xff) * 0x01010101;
2081
0
    }
2082
0
    else if (bpp == 16)
2083
0
    {
2084
0
  stride = stride * (int) sizeof (uint32_t) / 2;
2085
0
  byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2086
0
  byte_width = 2 * width;
2087
0
  stride *= 2;
2088
0
        filler = (filler & 0xffff) * 0x00010001;
2089
0
    }
2090
0
    else
2091
0
    {
2092
0
  stride = stride * (int) sizeof (uint32_t) / 4;
2093
0
  byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2094
0
  byte_width = 4 * width;
2095
0
  stride *= 4;
2096
0
    }
2097
2098
0
    fill = ((uint64_t)filler << 32) | filler;
2099
0
    vfill = to_m64 (fill);
2100
2101
0
#if defined __GNUC__ && defined USE_X86_MMX
2102
0
    __asm__ (
2103
0
        "movq   %7, %0\n"
2104
0
        "movq   %7, %1\n"
2105
0
        "movq   %7, %2\n"
2106
0
        "movq   %7, %3\n"
2107
0
        "movq   %7, %4\n"
2108
0
        "movq   %7, %5\n"
2109
0
        "movq   %7, %6\n"
2110
0
  : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2111
0
    "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2112
0
  : "y" (vfill));
2113
0
#endif
2114
2115
0
    while (height--)
2116
0
    {
2117
0
  int w;
2118
0
  uint8_t *d = byte_line;
2119
2120
0
  byte_line += stride;
2121
0
  w = byte_width;
2122
2123
0
  if (w >= 1 && ((uintptr_t)d & 1))
2124
0
  {
2125
0
      *(uint8_t *)d = (filler & 0xff);
2126
0
      w--;
2127
0
      d++;
2128
0
  }
2129
2130
0
  if (w >= 2 && ((uintptr_t)d & 3))
2131
0
  {
2132
0
      *(uint16_t *)d = filler;
2133
0
      w -= 2;
2134
0
      d += 2;
2135
0
  }
2136
2137
0
  while (w >= 4 && ((uintptr_t)d & 7))
2138
0
  {
2139
0
      *(uint32_t *)d = filler;
2140
2141
0
      w -= 4;
2142
0
      d += 4;
2143
0
  }
2144
2145
0
  while (w >= 64)
2146
0
  {
2147
0
#if defined __GNUC__ && defined USE_X86_MMX
2148
0
      __asm__ (
2149
0
          "movq %1,   (%0)\n"
2150
0
          "movq %2,  8(%0)\n"
2151
0
          "movq %3, 16(%0)\n"
2152
0
          "movq %4, 24(%0)\n"
2153
0
          "movq %5, 32(%0)\n"
2154
0
          "movq %6, 40(%0)\n"
2155
0
          "movq %7, 48(%0)\n"
2156
0
          "movq %8, 56(%0)\n"
2157
0
    :
2158
0
    : "r" (d),
2159
0
      "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2160
0
      "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2161
0
    : "memory");
2162
#else
2163
      *(__m64*) (d +  0) = vfill;
2164
      *(__m64*) (d +  8) = vfill;
2165
      *(__m64*) (d + 16) = vfill;
2166
      *(__m64*) (d + 24) = vfill;
2167
      *(__m64*) (d + 32) = vfill;
2168
      *(__m64*) (d + 40) = vfill;
2169
      *(__m64*) (d + 48) = vfill;
2170
      *(__m64*) (d + 56) = vfill;
2171
#endif
2172
0
      w -= 64;
2173
0
      d += 64;
2174
0
  }
2175
2176
0
  while (w >= 4)
2177
0
  {
2178
0
      *(uint32_t *)d = filler;
2179
2180
0
      w -= 4;
2181
0
      d += 4;
2182
0
  }
2183
0
  if (w >= 2)
2184
0
  {
2185
0
      *(uint16_t *)d = filler;
2186
0
      w -= 2;
2187
0
      d += 2;
2188
0
  }
2189
0
  if (w >= 1)
2190
0
  {
2191
0
      *(uint8_t *)d = (filler & 0xff);
2192
0
      w--;
2193
0
      d++;
2194
0
  }
2195
2196
0
    }
2197
2198
0
    _mm_empty ();
2199
0
    return TRUE;
2200
0
}
2201
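
mmx_fill () above normalizes every depth to the same 64-bit store loop by replicating the filler first: an 8 bpp value is broadcast into all four bytes, a 16 bpp value into both halves, and the result is doubled into 64 bits. A sketch of just that widening step, under the same 8/16/32 bpp assumptions as the code:

#include <stdint.h>

static uint64_t replicate_filler (uint32_t filler, int bpp)
{
    if (bpp == 8)
        filler = (filler & 0xff) * 0x01010101;    /* 0xaa -> 0xaaaaaaaa   */
    else if (bpp == 16)
        filler = (filler & 0xffff) * 0x00010001;  /* 0xabcd -> 0xabcdabcd */

    return ((uint64_t)filler << 32) | filler;     /* widen to 64 bits */
}
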
2202
static void
2203
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2204
                             pixman_composite_info_t *info)
2205
0
{
2206
0
    PIXMAN_COMPOSITE_ARGS (info);
2207
0
    uint16_t    *dst_line, *dst;
2208
0
    uint32_t    *src_line, *src, s;
2209
0
    int dst_stride, src_stride;
2210
0
    int32_t w;
2211
2212
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2213
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2214
2215
0
    while (height--)
2216
0
    {
2217
0
  dst = dst_line;
2218
0
  dst_line += dst_stride;
2219
0
  src = src_line;
2220
0
  src_line += src_stride;
2221
0
  w = width;
2222
2223
0
  while (w && (uintptr_t)dst & 7)
2224
0
  {
2225
0
      s = *src++;
2226
0
      *dst = convert_8888_to_0565 (s);
2227
0
      dst++;
2228
0
      w--;
2229
0
  }
2230
2231
0
  while (w >= 4)
2232
0
  {
2233
0
      __m64 vdest;
2234
0
      __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2235
0
      __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2236
2237
0
      vdest = pack_4xpacked565 (vsrc0, vsrc1);
2238
2239
0
      *(__m64 *)dst = vdest;
2240
2241
0
      w -= 4;
2242
0
      src += 4;
2243
0
      dst += 4;
2244
0
  }
2245
2246
0
  while (w)
2247
0
  {
2248
0
      s = *src++;
2249
0
      *dst = convert_8888_to_0565 (s);
2250
0
      dst++;
2251
0
      w--;
2252
0
  }
2253
0
    }
2254
2255
0
    _mm_empty ();
2256
0
}
2257
2258
static void
2259
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2260
                            pixman_composite_info_t *info)
2261
0
{
2262
0
    PIXMAN_COMPOSITE_ARGS (info);
2263
0
    uint32_t src, srca;
2264
0
    uint32_t    *dst_line, *dst;
2265
0
    uint8_t     *mask_line, *mask;
2266
0
    int dst_stride, mask_stride;
2267
0
    int32_t w;
2268
0
    __m64 vsrc;
2269
0
    uint64_t srcsrc;
2270
2271
0
    CHECKPOINT ();
2272
2273
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2274
2275
0
    srca = src >> 24;
2276
0
    if (src == 0)
2277
0
    {
2278
0
  mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2279
0
      PIXMAN_FORMAT_BPP (dest_image->bits.format),
2280
0
      dest_x, dest_y, width, height, 0);
2281
0
  return;
2282
0
    }
2283
2284
0
    srcsrc = (uint64_t)src << 32 | src;
2285
2286
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2287
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2288
2289
0
    vsrc = load8888 (&src);
2290
2291
0
    while (height--)
2292
0
    {
2293
0
  dst = dst_line;
2294
0
  dst_line += dst_stride;
2295
0
  mask = mask_line;
2296
0
  mask_line += mask_stride;
2297
0
  w = width;
2298
2299
0
  CHECKPOINT ();
2300
2301
0
  while (w && (uintptr_t)dst & 7)
2302
0
  {
2303
0
      uint64_t m = *mask;
2304
2305
0
      if (m)
2306
0
      {
2307
0
    __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2308
2309
0
    store8888 (dst, vdest);
2310
0
      }
2311
0
      else
2312
0
      {
2313
0
    *dst = 0;
2314
0
      }
2315
2316
0
      w--;
2317
0
      mask++;
2318
0
      dst++;
2319
0
  }
2320
2321
0
  CHECKPOINT ();
2322
2323
0
  while (w >= 2)
2324
0
  {
2325
0
      uint64_t m0, m1;
2326
0
      m0 = *mask;
2327
0
      m1 = *(mask + 1);
2328
2329
0
      if (srca == 0xff && (m0 & m1) == 0xff)
2330
0
      {
2331
0
    *(uint64_t *)dst = srcsrc;
2332
0
      }
2333
0
      else if (m0 | m1)
2334
0
      {
2335
0
    __m64 dest0, dest1;
2336
2337
0
    dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2338
0
    dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2339
2340
0
    *(__m64 *)dst = pack8888 (dest0, dest1);
2341
0
      }
2342
0
      else
2343
0
      {
2344
0
    *(uint64_t *)dst = 0;
2345
0
      }
2346
2347
0
      mask += 2;
2348
0
      dst += 2;
2349
0
      w -= 2;
2350
0
  }
2351
2352
0
  CHECKPOINT ();
2353
2354
0
  if (w)
2355
0
  {
2356
0
      uint64_t m = *mask;
2357
2358
0
      if (m)
2359
0
      {
2360
0
    __m64 vdest;
2361
2362
0
    vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2363
0
    store8888 (dst, vdest);
2364
0
      }
2365
0
      else
2366
0
      {
2367
0
    *dst = 0;
2368
0
      }
2369
0
  }
2370
0
    }
2371
2372
0
    _mm_empty ();
2373
0
}
2374
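
Note the operator difference visible above: because this path implements SRC rather than OVER, a zero mask byte must store zero instead of leaving the destination alone, which is why these loops carry explicit else branches writing 0. One channel of src IN mask as a scalar sketch (names local to the sketch):

#include <stdint.h>

/* Rounded byte multiply, approximately (s * m) / 255. */
static uint8_t mul_un8 (uint8_t s, uint8_t m)
{
    uint16_t t = s * (uint16_t)m + 0x80;
    return (t + (t >> 8)) >> 8;
}

/* SRC with an a8 mask: the result is the source scaled by coverage.
 * The old destination never contributes, so m == 0 yields 0. */
static uint8_t src_in_mask_un8 (uint8_t s, uint8_t m)
{
    return mul_un8 (s, m);
}
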
2375
static void
2376
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2377
                             pixman_composite_info_t *info)
2378
0
{
2379
0
    PIXMAN_COMPOSITE_ARGS (info);
2380
0
    uint32_t src, srca;
2381
0
    uint16_t *dst_line, *dst;
2382
0
    uint8_t *mask_line, *mask;
2383
0
    int dst_stride, mask_stride;
2384
0
    int32_t w;
2385
0
    __m64 vsrc, vsrca, tmp;
2386
0
    __m64 srcsrcsrcsrc;
2387
2388
0
    CHECKPOINT ();
2389
2390
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2391
2392
0
    srca = src >> 24;
2393
0
    if (src == 0)
2394
0
  return;
2395
2396
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2397
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2398
2399
0
    vsrc = load8888 (&src);
2400
0
    vsrca = expand_alpha (vsrc);
2401
2402
0
    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2403
0
    srcsrcsrcsrc = expand_alpha_rev (tmp);
2404
2405
0
    while (height--)
2406
0
    {
2407
0
  dst = dst_line;
2408
0
  dst_line += dst_stride;
2409
0
  mask = mask_line;
2410
0
  mask_line += mask_stride;
2411
0
  w = width;
2412
2413
0
  CHECKPOINT ();
2414
2415
0
  while (w && (uintptr_t)dst & 7)
2416
0
  {
2417
0
      uint64_t m = *mask;
2418
2419
0
      if (m)
2420
0
      {
2421
0
    uint64_t d = *dst;
2422
0
    __m64 vd = to_m64 (d);
2423
0
    __m64 vdest = in_over (
2424
0
        vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2425
2426
0
    vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2427
0
    *dst = to_uint64 (vd);
2428
0
      }
2429
2430
0
      w--;
2431
0
      mask++;
2432
0
      dst++;
2433
0
  }
2434
2435
0
  CHECKPOINT ();
2436
2437
0
  while (w >= 4)
2438
0
  {
2439
0
      uint64_t m0, m1, m2, m3;
2440
0
      m0 = *mask;
2441
0
      m1 = *(mask + 1);
2442
0
      m2 = *(mask + 2);
2443
0
      m3 = *(mask + 3);
2444
2445
0
      if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2446
0
      {
2447
0
    *(__m64 *)dst = srcsrcsrcsrc;
2448
0
      }
2449
0
      else if (m0 | m1 | m2 | m3)
2450
0
      {
2451
0
    __m64 vdest = *(__m64 *)dst;
2452
0
    __m64 v0, v1, v2, v3;
2453
0
    __m64 vm0, vm1, vm2, vm3;
2454
2455
0
    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2456
2457
0
    vm0 = to_m64 (m0);
2458
0
    v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2459
2460
0
    vm1 = to_m64 (m1);
2461
0
    v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2462
2463
0
    vm2 = to_m64 (m2);
2464
0
    v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2465
2466
0
    vm3 = to_m64 (m3);
2467
0
    v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2468
2469
0
    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2470
0
      }
2471
2472
0
      w -= 4;
2473
0
      mask += 4;
2474
0
      dst += 4;
2475
0
  }
2476
2477
0
  CHECKPOINT ();
2478
2479
0
  while (w)
2480
0
  {
2481
0
      uint64_t m = *mask;
2482
2483
0
      if (m)
2484
0
      {
2485
0
    uint64_t d = *dst;
2486
0
    __m64 vd = to_m64 (d);
2487
0
    __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2488
0
               expand565 (vd, 0));
2489
0
    vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2490
0
    *dst = to_uint64 (vd);
2491
0
      }
2492
2493
0
      w--;
2494
0
      mask++;
2495
0
      dst++;
2496
0
  }
2497
0
    }
2498
2499
0
    _mm_empty ();
2500
0
}
2501
2502
static void
2503
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2504
                                pixman_composite_info_t *info)
2505
0
{
2506
0
    PIXMAN_COMPOSITE_ARGS (info);
2507
0
    uint16_t    *dst_line, *dst;
2508
0
    uint32_t    *src_line, *src;
2509
0
    int dst_stride, src_stride;
2510
0
    int32_t w;
2511
2512
0
    CHECKPOINT ();
2513
2514
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2515
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2516
2517
#if 0
2518
    /* FIXME */
2519
    assert (src_image->drawable == mask_image->drawable);
2520
#endif
2521
2522
0
    while (height--)
2523
0
    {
2524
0
  dst = dst_line;
2525
0
  dst_line += dst_stride;
2526
0
  src = src_line;
2527
0
  src_line += src_stride;
2528
0
  w = width;
2529
2530
0
  CHECKPOINT ();
2531
2532
0
  while (w && (uintptr_t)dst & 7)
2533
0
  {
2534
0
      __m64 vsrc = load8888 (src);
2535
0
      uint64_t d = *dst;
2536
0
      __m64 vdest = expand565 (to_m64 (d), 0);
2537
2538
0
      vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2539
2540
0
      *dst = to_uint64 (vdest);
2541
2542
0
      w--;
2543
0
      dst++;
2544
0
      src++;
2545
0
  }
2546
2547
0
  CHECKPOINT ();
2548
2549
0
  while (w >= 4)
2550
0
  {
2551
0
      uint32_t s0, s1, s2, s3;
2552
0
      unsigned char a0, a1, a2, a3;
2553
2554
0
      s0 = *src;
2555
0
      s1 = *(src + 1);
2556
0
      s2 = *(src + 2);
2557
0
      s3 = *(src + 3);
2558
2559
0
      a0 = (s0 >> 24);
2560
0
      a1 = (s1 >> 24);
2561
0
      a2 = (s2 >> 24);
2562
0
      a3 = (s3 >> 24);
2563
2564
0
      if ((a0 & a1 & a2 & a3) == 0xFF)
2565
0
      {
2566
0
    __m64 v0 = invert_colors (load8888 (&s0));
2567
0
    __m64 v1 = invert_colors (load8888 (&s1));
2568
0
    __m64 v2 = invert_colors (load8888 (&s2));
2569
0
    __m64 v3 = invert_colors (load8888 (&s3));
2570
2571
0
    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2572
0
      }
2573
0
      else if (s0 | s1 | s2 | s3)
2574
0
      {
2575
0
    __m64 vdest = *(__m64 *)dst;
2576
0
    __m64 v0, v1, v2, v3;
2577
2578
0
    __m64 vsrc0 = load8888 (&s0);
2579
0
    __m64 vsrc1 = load8888 (&s1);
2580
0
    __m64 vsrc2 = load8888 (&s2);
2581
0
    __m64 vsrc3 = load8888 (&s3);
2582
2583
0
    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2584
2585
0
    v0 = over_rev_non_pre (vsrc0, v0);
2586
0
    v1 = over_rev_non_pre (vsrc1, v1);
2587
0
    v2 = over_rev_non_pre (vsrc2, v2);
2588
0
    v3 = over_rev_non_pre (vsrc3, v3);
2589
2590
0
    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2591
0
      }
2592
2593
0
      w -= 4;
2594
0
      dst += 4;
2595
0
      src += 4;
2596
0
  }
2597
2598
0
  CHECKPOINT ();
2599
2600
0
  while (w)
2601
0
  {
2602
0
      __m64 vsrc = load8888 (src);
2603
0
      uint64_t d = *dst;
2604
0
      __m64 vdest = expand565 (to_m64 (d), 0);
2605
2606
0
      vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2607
2608
0
      *dst = to_uint64 (vdest);
2609
2610
0
      w--;
2611
0
      dst++;
2612
0
      src++;
2613
0
  }
2614
0
    }
2615
2616
0
    _mm_empty ();
2617
0
}
2618
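
The pixbuf paths take non-premultiplied sources whose color channels are also in the opposite order from the destination (a8b8g8r8 over a8r8g8b8, per the fast-path table at the end of the file). over_rev_non_pre () therefore swaps red and blue, premultiplies by the source alpha, and only then runs the ordinary OVER. A scalar sketch of one pixel, assuming that layout:

#include <stdint.h>

static uint8_t mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = a * (uint16_t)b + 0x80;
    return (t + (t >> 8)) >> 8;
}

static uint32_t over_rev_non_pre_pixel (uint32_t s, uint32_t d)
{
    uint8_t a = s >> 24;
    /* a8b8g8r8 source: red lives in the low byte, blue in byte 2;
     * premultiplying while extracting also swaps them into place */
    uint8_t r = mul_un8 (s & 0xff, a);
    uint8_t g = mul_un8 ((s >> 8) & 0xff, a);
    uint8_t b = mul_un8 ((s >> 16) & 0xff, a);
    uint32_t pre = ((uint32_t)a << 24) | ((uint32_t)r << 16)
                 | ((uint32_t)g << 8) | b;       /* now premultiplied a8r8g8b8 */
    uint32_t out = 0;
    int shift;

    /* ordinary premultiplied OVER against the destination */
    for (shift = 0; shift < 32; shift += 8)
    {
        uint8_t sc = pre >> shift, dc = d >> shift;
        out |= (uint32_t)(sc + mul_un8 (dc, 255 - a)) << shift;
    }
    return out;
}
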
2619
static void
2620
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2621
                                pixman_composite_info_t *info)
2622
0
{
2623
0
    PIXMAN_COMPOSITE_ARGS (info);
2624
0
    uint32_t    *dst_line, *dst;
2625
0
    uint32_t    *src_line, *src;
2626
0
    int dst_stride, src_stride;
2627
0
    int32_t w;
2628
2629
0
    CHECKPOINT ();
2630
2631
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2632
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2633
2634
#if 0
2635
    /* FIXME */
2636
    assert (src_image->drawable == mask_image->drawable);
2637
#endif
2638
2639
0
    while (height--)
2640
0
    {
2641
0
  dst = dst_line;
2642
0
  dst_line += dst_stride;
2643
0
  src = src_line;
2644
0
  src_line += src_stride;
2645
0
  w = width;
2646
2647
0
  while (w && (uintptr_t)dst & 7)
2648
0
  {
2649
0
      __m64 s = load8888 (src);
2650
0
      __m64 d = load8888 (dst);
2651
2652
0
      store8888 (dst, over_rev_non_pre (s, d));
2653
2654
0
      w--;
2655
0
      dst++;
2656
0
      src++;
2657
0
  }
2658
2659
0
  while (w >= 2)
2660
0
  {
2661
0
      uint32_t s0, s1;
2662
0
      unsigned char a0, a1;
2663
0
      __m64 d0, d1;
2664
2665
0
      s0 = *src;
2666
0
      s1 = *(src + 1);
2667
2668
0
      a0 = (s0 >> 24);
2669
0
      a1 = (s1 >> 24);
2670
2671
0
      if ((a0 & a1) == 0xFF)
2672
0
      {
2673
0
    d0 = invert_colors (load8888 (&s0));
2674
0
    d1 = invert_colors (load8888 (&s1));
2675
2676
0
    *(__m64 *)dst = pack8888 (d0, d1);
2677
0
      }
2678
0
      else if (s0 | s1)
2679
0
      {
2680
0
    __m64 vdest = *(__m64 *)dst;
2681
2682
0
    d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2683
0
    d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2684
2685
0
    *(__m64 *)dst = pack8888 (d0, d1);
2686
0
      }
2687
2688
0
      w -= 2;
2689
0
      dst += 2;
2690
0
      src += 2;
2691
0
  }
2692
2693
0
  if (w)
2694
0
  {
2695
0
      __m64 s = load8888 (src);
2696
0
      __m64 d = load8888 (dst);
2697
2698
0
      store8888 (dst, over_rev_non_pre (s, d));
2699
0
  }
2700
0
    }
2701
2702
0
    _mm_empty ();
2703
0
}
2704
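
The opacity screens used above generalize to any block width: ANDing the alpha bytes yields 0xff only when every pixel is fully opaque (take the copy path), and ORing the whole pixels yields zero only when every pixel is fully transparent (skip the block entirely). As two-pixel predicates:

#include <stdint.h>

static int block_all_opaque (uint32_t s0, uint32_t s1)
{
    /* AND of the alphas is 0xff iff every alpha is 0xff */
    return (uint8_t)((s0 >> 24) & (s1 >> 24)) == 0xff;
}

static int block_all_transparent (uint32_t s0, uint32_t s1)
{
    /* OR of the pixels is 0 iff every pixel is 0 */
    return (s0 | s1) == 0;
}
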
2705
static void
2706
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2707
                                   pixman_composite_info_t *info)
2708
0
{
2709
0
    PIXMAN_COMPOSITE_ARGS (info);
2710
0
    uint32_t src;
2711
0
    uint16_t    *dst_line;
2712
0
    uint32_t    *mask_line;
2713
0
    int dst_stride, mask_stride;
2714
0
    __m64 vsrc, vsrca;
2715
2716
0
    CHECKPOINT ();
2717
2718
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2719
2720
0
    if (src == 0)
2721
0
  return;
2722
2723
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2724
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2725
2726
0
    vsrc = load8888 (&src);
2727
0
    vsrca = expand_alpha (vsrc);
2728
2729
0
    while (height--)
2730
0
    {
2731
0
  int twidth = width;
2732
0
  uint32_t *p = (uint32_t *)mask_line;
2733
0
  uint16_t *q = (uint16_t *)dst_line;
2734
2735
0
  while (twidth && ((uintptr_t)q & 7))
2736
0
  {
2737
0
      uint32_t m = *(uint32_t *)p;
2738
2739
0
      if (m)
2740
0
      {
2741
0
    uint64_t d = *q;
2742
0
    __m64 vdest = expand565 (to_m64 (d), 0);
2743
0
    vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2744
0
    *q = to_uint64 (vdest);
2745
0
      }
2746
2747
0
      twidth--;
2748
0
      p++;
2749
0
      q++;
2750
0
  }
2751
2752
0
  while (twidth >= 4)
2753
0
  {
2754
0
      uint32_t m0, m1, m2, m3;
2755
2756
0
      m0 = *p;
2757
0
      m1 = *(p + 1);
2758
0
      m2 = *(p + 2);
2759
0
      m3 = *(p + 3);
2760
2761
0
      if ((m0 | m1 | m2 | m3))
2762
0
      {
2763
0
    __m64 vdest = *(__m64 *)q;
2764
0
    __m64 v0, v1, v2, v3;
2765
2766
0
    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2767
2768
0
    v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2769
0
    v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2770
0
    v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2771
0
    v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2772
2773
0
    *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2774
0
      }
2775
0
      twidth -= 4;
2776
0
      p += 4;
2777
0
      q += 4;
2778
0
  }
2779
2780
0
  while (twidth)
2781
0
  {
2782
0
      uint32_t m;
2783
2784
0
      m = *(uint32_t *)p;
2785
0
      if (m)
2786
0
      {
2787
0
    uint64_t d = *q;
2788
0
    __m64 vdest = expand565 (to_m64 (d), 0);
2789
0
    vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2790
0
    *q = to_uint64 (vdest);
2791
0
      }
2792
2793
0
      twidth--;
2794
0
      p++;
2795
0
      q++;
2796
0
  }
2797
2798
0
  mask_line += mask_stride;
2799
0
  dst_line += dst_stride;
2800
0
    }
2801
2802
0
    _mm_empty ();
2803
0
}
2804
2805
static void
2806
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2807
                        pixman_composite_info_t *info)
2808
0
{
2809
0
    PIXMAN_COMPOSITE_ARGS (info);
2810
0
    uint8_t *dst_line, *dst;
2811
0
    uint8_t *mask_line, *mask;
2812
0
    int dst_stride, mask_stride;
2813
0
    int32_t w;
2814
0
    uint32_t src;
2815
0
    uint8_t sa;
2816
0
    __m64 vsrc, vsrca;
2817
2818
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2819
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2820
2821
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2822
2823
0
    sa = src >> 24;
2824
2825
0
    vsrc = load8888 (&src);
2826
0
    vsrca = expand_alpha (vsrc);
2827
2828
0
    while (height--)
2829
0
    {
2830
0
  dst = dst_line;
2831
0
  dst_line += dst_stride;
2832
0
  mask = mask_line;
2833
0
  mask_line += mask_stride;
2834
0
  w = width;
2835
2836
0
  while (w && (uintptr_t)dst & 7)
2837
0
  {
2838
0
      uint16_t tmp;
2839
0
      uint8_t a;
2840
0
      uint32_t m, d;
2841
2842
0
      a = *mask++;
2843
0
      d = *dst;
2844
2845
0
      m = MUL_UN8 (sa, a, tmp);
2846
0
      d = MUL_UN8 (m, d, tmp);
2847
2848
0
      *dst++ = d;
2849
0
      w--;
2850
0
  }
2851
2852
0
  while (w >= 4)
2853
0
  {
2854
0
      __m64 vmask;
2855
0
      __m64 vdest;
2856
2857
0
      vmask = load8888u ((uint32_t *)mask);
2858
0
      vdest = load8888 ((uint32_t *)dst);
2859
2860
0
      store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2861
2862
0
      dst += 4;
2863
0
      mask += 4;
2864
0
      w -= 4;
2865
0
  }
2866
2867
0
  while (w--)
2868
0
  {
2869
0
      uint16_t tmp;
2870
0
      uint8_t a;
2871
0
      uint32_t m, d;
2872
2873
0
      a = *mask++;
2874
0
      d = *dst;
2875
2876
0
      m = MUL_UN8 (sa, a, tmp);
2877
0
      d = MUL_UN8 (m, d, tmp);
2878
2879
0
      *dst++ = d;
2880
0
  }
2881
0
    }
2882
2883
0
    _mm_empty ();
2884
0
}
2885
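
MUL_UN8, used throughout these scalar head and tail loops, is pixman's division-free byte multiply: it computes a * b / 255 rounded to nearest via t = a * b + 0x80 followed by (t + (t >> 8)) >> 8. A sketch that checks this by brute force; the stated equality with (a * b + 127) / 255 is my claim here, not something this file asserts:

#include <assert.h>
#include <stdint.h>

static uint8_t mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = a * (uint16_t)b + 0x80;    /* +0x80 rounds to nearest */
    return (t + (t >> 8)) >> 8;             /* approximates t / 255    */
}

int main (void)
{
    int a, b;

    /* worked example: 255 * 128 / 255 == 128 exactly */
    assert (mul_un8 (0xff, 0x80) == 0x80);

    /* exhaustive check of the rounding identity over the full domain */
    for (a = 0; a < 256; a++)
        for (b = 0; b < 256; b++)
            assert (mul_un8 (a, b) == (a * b + 127) / 255);
    return 0;
}
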
2886
static void
2887
mmx_composite_in_8_8 (pixman_implementation_t *imp,
2888
                      pixman_composite_info_t *info)
2889
0
{
2890
0
    PIXMAN_COMPOSITE_ARGS (info);
2891
0
    uint8_t     *dst_line, *dst;
2892
0
    uint8_t     *src_line, *src;
2893
0
    int src_stride, dst_stride;
2894
0
    int32_t w;
2895
2896
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2897
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2898
2899
0
    while (height--)
2900
0
    {
2901
0
  dst = dst_line;
2902
0
  dst_line += dst_stride;
2903
0
  src = src_line;
2904
0
  src_line += src_stride;
2905
0
  w = width;
2906
2907
0
  while (w && (uintptr_t)dst & 3)
2908
0
  {
2909
0
      uint8_t s, d;
2910
0
      uint16_t tmp;
2911
2912
0
      s = *src;
2913
0
      d = *dst;
2914
2915
0
      *dst = MUL_UN8 (s, d, tmp);
2916
2917
0
      src++;
2918
0
      dst++;
2919
0
      w--;
2920
0
  }
2921
2922
0
  while (w >= 4)
2923
0
  {
2924
0
      uint32_t *s = (uint32_t *)src;
2925
0
      uint32_t *d = (uint32_t *)dst;
2926
2927
0
      store8888 (d, in (load8888u (s), load8888 (d)));
2928
2929
0
      w -= 4;
2930
0
      dst += 4;
2931
0
      src += 4;
2932
0
  }
2933
2934
0
  while (w--)
2935
0
  {
2936
0
      uint8_t s, d;
2937
0
      uint16_t tmp;
2938
2939
0
      s = *src;
2940
0
      d = *dst;
2941
2942
0
      *dst = MUL_UN8 (s, d, tmp);
2943
2944
0
      src++;
2945
0
      dst++;
2946
0
  }
2947
0
    }
2948
2949
0
    _mm_empty ();
2950
0
}
2951
2952
static void
2953
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2954
       pixman_composite_info_t *info)
2955
0
{
2956
0
    PIXMAN_COMPOSITE_ARGS (info);
2957
0
    uint8_t     *dst_line, *dst;
2958
0
    uint8_t     *mask_line, *mask;
2959
0
    int dst_stride, mask_stride;
2960
0
    int32_t w;
2961
0
    uint32_t src;
2962
0
    uint8_t sa;
2963
0
    __m64 vsrc, vsrca;
2964
2965
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2966
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2967
2968
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2969
2970
0
    sa = src >> 24;
2971
2972
0
    if (src == 0)
2973
0
  return;
2974
2975
0
    vsrc = load8888 (&src);
2976
0
    vsrca = expand_alpha (vsrc);
2977
2978
0
    while (height--)
2979
0
    {
2980
0
  dst = dst_line;
2981
0
  dst_line += dst_stride;
2982
0
  mask = mask_line;
2983
0
  mask_line += mask_stride;
2984
0
  w = width;
2985
2986
0
  while (w && (uintptr_t)dst & 3)
2987
0
  {
2988
0
      uint16_t tmp;
2989
0
      uint16_t a;
2990
0
      uint32_t m, d;
2991
0
      uint32_t r;
2992
2993
0
      a = *mask++;
2994
0
      d = *dst;
2995
2996
0
      m = MUL_UN8 (sa, a, tmp);
2997
0
      r = ADD_UN8 (m, d, tmp);
2998
2999
0
      *dst++ = r;
3000
0
      w--;
3001
0
  }
3002
3003
0
  while (w >= 4)
3004
0
  {
3005
0
      __m64 vmask;
3006
0
      __m64 vdest;
3007
3008
0
      vmask = load8888u ((uint32_t *)mask);
3009
0
      vdest = load8888 ((uint32_t *)dst);
3010
3011
0
      store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3012
3013
0
      dst += 4;
3014
0
      mask += 4;
3015
0
      w -= 4;
3016
0
  }
3017
3018
0
  while (w--)
3019
0
  {
3020
0
      uint16_t tmp;
3021
0
      uint16_t a;
3022
0
      uint32_t m, d;
3023
0
      uint32_t r;
3024
3025
0
      a = *mask++;
3026
0
      d = *dst;
3027
3028
0
      m = MUL_UN8 (sa, a, tmp);
3029
0
      r = ADD_UN8 (m, d, tmp);
3030
3031
0
      *dst++ = r;
3032
0
  }
3033
0
    }
3034
3035
0
    _mm_empty ();
3036
0
}
3037
3038
static void
3039
mmx_composite_add_8_8 (pixman_implementation_t *imp,
3040
           pixman_composite_info_t *info)
3041
0
{
3042
0
    PIXMAN_COMPOSITE_ARGS (info);
3043
0
    uint8_t *dst_line, *dst;
3044
0
    uint8_t *src_line, *src;
3045
0
    int dst_stride, src_stride;
3046
0
    int32_t w;
3047
0
    uint8_t s, d;
3048
0
    uint16_t t;
3049
3050
0
    CHECKPOINT ();
3051
3052
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3053
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3054
3055
0
    while (height--)
3056
0
    {
3057
0
  dst = dst_line;
3058
0
  dst_line += dst_stride;
3059
0
  src = src_line;
3060
0
  src_line += src_stride;
3061
0
  w = width;
3062
3063
0
  while (w && (uintptr_t)dst & 7)
3064
0
  {
3065
0
      s = *src;
3066
0
      d = *dst;
3067
0
      t = d + s;
3068
0
      s = t | (0 - (t >> 8));
3069
0
      *dst = s;
3070
3071
0
      dst++;
3072
0
      src++;
3073
0
      w--;
3074
0
  }
3075
3076
0
  while (w >= 8)
3077
0
  {
3078
0
      *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3079
0
      dst += 8;
3080
0
      src += 8;
3081
0
      w -= 8;
3082
0
  }
3083
3084
0
  while (w)
3085
0
  {
3086
0
      s = *src;
3087
0
      d = *dst;
3088
0
      t = d + s;
3089
0
      s = t | (0 - (t >> 8));
3090
0
      *dst = s;
3091
3092
0
      dst++;
3093
0
      src++;
3094
0
      w--;
3095
0
  }
3096
0
    }
3097
3098
0
    _mm_empty ();
3099
0
}
3100
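
The unvectorized edges above clamp with a branchless trick rather than a compare: t holds the 9-bit true sum, so t >> 8 is 1 exactly when the byte overflows, 0 - (t >> 8) is then all ones, and the OR pins the result at 0xff. It is the scalar twin of the _mm_adds_pu8 in the 8-wide loop. A checked sketch:

#include <assert.h>
#include <stdint.h>

static uint8_t adds_un8 (uint8_t d, uint8_t s)
{
    uint16_t t = (uint16_t)d + s;   /* 9-bit true sum                   */
    return t | (0 - (t >> 8));      /* all-ones mask saturates overflow */
}

int main (void)
{
    assert (adds_un8 (200, 100) == 0xff);   /* 300 clamps to 255 */
    assert (adds_un8 (100, 100) == 200);    /* in range: exact   */
    return 0;
}
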
3101
static void
3102
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3103
                             pixman_composite_info_t *info)
3104
0
{
3105
0
    PIXMAN_COMPOSITE_ARGS (info);
3106
0
    uint16_t    *dst_line, *dst;
3107
0
    uint32_t  d;
3108
0
    uint16_t    *src_line, *src;
3109
0
    uint32_t  s;
3110
0
    int dst_stride, src_stride;
3111
0
    int32_t w;
3112
3113
0
    CHECKPOINT ();
3114
3115
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3116
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3117
3118
0
    while (height--)
3119
0
    {
3120
0
  dst = dst_line;
3121
0
  dst_line += dst_stride;
3122
0
  src = src_line;
3123
0
  src_line += src_stride;
3124
0
  w = width;
3125
3126
0
  while (w && (uintptr_t)dst & 7)
3127
0
  {
3128
0
      s = *src++;
3129
0
      if (s)
3130
0
      {
3131
0
    d = *dst;
3132
0
    s = convert_0565_to_8888 (s);
3133
0
    if (d)
3134
0
    {
3135
0
        d = convert_0565_to_8888 (d);
3136
0
        UN8x4_ADD_UN8x4 (s, d);
3137
0
    }
3138
0
    *dst = convert_8888_to_0565 (s);
3139
0
      }
3140
0
      dst++;
3141
0
      w--;
3142
0
  }
3143
3144
0
  while (w >= 4)
3145
0
  {
3146
0
      __m64 vdest = *(__m64 *)dst;
3147
0
      __m64 vsrc = ldq_u ((__m64 *)src);
3148
0
      __m64 vd0, vd1;
3149
0
      __m64 vs0, vs1;
3150
3151
0
      expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3152
0
      expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3153
3154
0
      vd0 = _mm_adds_pu8 (vd0, vs0);
3155
0
      vd1 = _mm_adds_pu8 (vd1, vs1);
3156
3157
0
      *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3158
3159
0
      dst += 4;
3160
0
      src += 4;
3161
0
      w -= 4;
3162
0
  }
3163
3164
0
  while (w--)
3165
0
  {
3166
0
      s = *src++;
3167
0
      if (s)
3168
0
      {
3169
0
    d = *dst;
3170
0
    s = convert_0565_to_8888 (s);
3171
0
    if (d)
3172
0
    {
3173
0
        d = convert_0565_to_8888 (d);
3174
0
        UN8x4_ADD_UN8x4 (s, d);
3175
0
    }
3176
0
    *dst = convert_8888_to_0565 (s);
3177
0
      }
3178
0
      dst++;
3179
0
  }
3180
0
    }
3181
3182
0
    _mm_empty ();
3183
0
}
3184
3185
static void
3186
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3187
                             pixman_composite_info_t *info)
3188
0
{
3189
0
    PIXMAN_COMPOSITE_ARGS (info);
3190
0
    uint32_t    *dst_line, *dst;
3191
0
    uint32_t    *src_line, *src;
3192
0
    int dst_stride, src_stride;
3193
0
    int32_t w;
3194
3195
0
    CHECKPOINT ();
3196
3197
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3198
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3199
3200
0
    while (height--)
3201
0
    {
3202
0
  dst = dst_line;
3203
0
  dst_line += dst_stride;
3204
0
  src = src_line;
3205
0
  src_line += src_stride;
3206
0
  w = width;
3207
3208
0
  while (w && (uintptr_t)dst & 7)
3209
0
  {
3210
0
      store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3211
0
                                load ((const uint32_t *)dst)));
3212
0
      dst++;
3213
0
      src++;
3214
0
      w--;
3215
0
  }
3216
3217
0
  while (w >= 2)
3218
0
  {
3219
0
      *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3220
0
      dst += 2;
3221
0
      src += 2;
3222
0
      w -= 2;
3223
0
  }
3224
3225
0
  if (w)
3226
0
  {
3227
0
      store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3228
0
                                load ((const uint32_t *)dst)));
3229
3230
0
  }
3231
0
    }
3232
3233
0
    _mm_empty ();
3234
0
}
3235
3236
static pixman_bool_t
3237
mmx_blt (pixman_implementation_t *imp,
3238
         uint32_t *               src_bits,
3239
         uint32_t *               dst_bits,
3240
         int                      src_stride,
3241
         int                      dst_stride,
3242
         int                      src_bpp,
3243
         int                      dst_bpp,
3244
         int                      src_x,
3245
         int                      src_y,
3246
         int                      dest_x,
3247
         int                      dest_y,
3248
         int                      width,
3249
         int                      height)
3250
0
{
3251
0
    uint8_t *   src_bytes;
3252
0
    uint8_t *   dst_bytes;
3253
0
    int byte_width;
3254
3255
0
    if (src_bpp != dst_bpp)
3256
0
  return FALSE;
3257
3258
0
    if (src_bpp == 16)
3259
0
    {
3260
0
  src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3261
0
  dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3262
0
  src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3263
0
  dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3264
0
  byte_width = 2 * width;
3265
0
  src_stride *= 2;
3266
0
  dst_stride *= 2;
3267
0
    }
3268
0
    else if (src_bpp == 32)
3269
0
    {
3270
0
  src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3271
0
  dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3272
0
  src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3273
0
  dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3274
0
  byte_width = 4 * width;
3275
0
  src_stride *= 4;
3276
0
  dst_stride *= 4;
3277
0
    }
3278
0
    else
3279
0
    {
3280
0
  return FALSE;
3281
0
    }
3282
3283
0
    while (height--)
3284
0
    {
3285
0
  int w;
3286
0
  uint8_t *s = src_bytes;
3287
0
  uint8_t *d = dst_bytes;
3288
0
  src_bytes += src_stride;
3289
0
  dst_bytes += dst_stride;
3290
0
  w = byte_width;
3291
3292
0
  if (w >= 1 && ((uintptr_t)d & 1))
3293
0
  {
3294
0
      *(uint8_t *)d = *(uint8_t *)s;
3295
0
      w -= 1;
3296
0
      s += 1;
3297
0
      d += 1;
3298
0
  }
3299
3300
0
  if (w >= 2 && ((uintptr_t)d & 3))
3301
0
  {
3302
0
      *(uint16_t *)d = *(uint16_t *)s;
3303
0
      w -= 2;
3304
0
      s += 2;
3305
0
      d += 2;
3306
0
  }
3307
3308
0
  while (w >= 4 && ((uintptr_t)d & 7))
3309
0
  {
3310
0
      *(uint32_t *)d = ldl_u ((uint32_t *)s);
3311
3312
0
      w -= 4;
3313
0
      s += 4;
3314
0
      d += 4;
3315
0
  }
3316
3317
0
  while (w >= 64)
3318
0
  {
3319
0
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3320
0
      __asm__ (
3321
0
          "movq   (%1),   %%mm0\n"
3322
0
          "movq  8(%1),   %%mm1\n"
3323
0
          "movq 16(%1),   %%mm2\n"
3324
0
          "movq 24(%1),   %%mm3\n"
3325
0
          "movq 32(%1),   %%mm4\n"
3326
0
          "movq 40(%1),   %%mm5\n"
3327
0
          "movq 48(%1),   %%mm6\n"
3328
0
          "movq 56(%1),   %%mm7\n"
3329
3330
0
          "movq %%mm0,    (%0)\n"
3331
0
          "movq %%mm1,   8(%0)\n"
3332
0
          "movq %%mm2,  16(%0)\n"
3333
0
          "movq %%mm3,  24(%0)\n"
3334
0
          "movq %%mm4,  32(%0)\n"
3335
0
          "movq %%mm5,  40(%0)\n"
3336
0
          "movq %%mm6,  48(%0)\n"
3337
0
          "movq %%mm7,  56(%0)\n"
3338
0
    :
3339
0
    : "r" (d), "r" (s)
3340
0
    : "memory",
3341
0
      "%mm0", "%mm1", "%mm2", "%mm3",
3342
0
      "%mm4", "%mm5", "%mm6", "%mm7");
3343
#else
3344
      __m64 v0 = ldq_u ((__m64 *)(s + 0));
3345
      __m64 v1 = ldq_u ((__m64 *)(s + 8));
3346
      __m64 v2 = ldq_u ((__m64 *)(s + 16));
3347
      __m64 v3 = ldq_u ((__m64 *)(s + 24));
3348
      __m64 v4 = ldq_u ((__m64 *)(s + 32));
3349
      __m64 v5 = ldq_u ((__m64 *)(s + 40));
3350
      __m64 v6 = ldq_u ((__m64 *)(s + 48));
3351
      __m64 v7 = ldq_u ((__m64 *)(s + 56));
3352
      *(__m64 *)(d + 0)  = v0;
3353
      *(__m64 *)(d + 8)  = v1;
3354
      *(__m64 *)(d + 16) = v2;
3355
      *(__m64 *)(d + 24) = v3;
3356
      *(__m64 *)(d + 32) = v4;
3357
      *(__m64 *)(d + 40) = v5;
3358
      *(__m64 *)(d + 48) = v6;
3359
      *(__m64 *)(d + 56) = v7;
3360
#endif
3361
3362
0
      w -= 64;
3363
0
      s += 64;
3364
0
      d += 64;
3365
0
  }
3366
0
  while (w >= 4)
3367
0
  {
3368
0
      *(uint32_t *)d = ldl_u ((uint32_t *)s);
3369
3370
0
      w -= 4;
3371
0
      s += 4;
3372
0
      d += 4;
3373
0
  }
3374
0
  if (w >= 2)
3375
0
  {
3376
0
      *(uint16_t *)d = *(uint16_t *)s;
3377
0
      w -= 2;
3378
0
      s += 2;
3379
0
      d += 2;
3380
0
  }
3381
0
    }
3382
3383
0
    _mm_empty ();
3384
3385
0
    return TRUE;
3386
0
}
3387
3388
static void
3389
mmx_composite_copy_area (pixman_implementation_t *imp,
3390
                         pixman_composite_info_t *info)
3391
0
{
3392
0
    PIXMAN_COMPOSITE_ARGS (info);
3393
3394
0
    mmx_blt (imp, src_image->bits.bits,
3395
0
       dest_image->bits.bits,
3396
0
       src_image->bits.rowstride,
3397
0
       dest_image->bits.rowstride,
3398
0
       PIXMAN_FORMAT_BPP (src_image->bits.format),
3399
0
       PIXMAN_FORMAT_BPP (dest_image->bits.format),
3400
0
       src_x, src_y, dest_x, dest_y, width, height);
3401
0
}
3402
3403
static void
3404
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3405
                                pixman_composite_info_t *info)
3406
0
{
3407
0
    PIXMAN_COMPOSITE_ARGS (info);
3408
0
    uint32_t  *src, *src_line;
3409
0
    uint32_t  *dst, *dst_line;
3410
0
    uint8_t  *mask, *mask_line;
3411
0
    int src_stride, mask_stride, dst_stride;
3412
0
    int32_t w;
3413
3414
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3415
0
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3416
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3417
3418
0
    while (height--)
3419
0
    {
3420
0
  src = src_line;
3421
0
  src_line += src_stride;
3422
0
  dst = dst_line;
3423
0
  dst_line += dst_stride;
3424
0
  mask = mask_line;
3425
0
  mask_line += mask_stride;
3426
3427
0
  w = width;
3428
3429
0
  while (w--)
3430
0
  {
3431
0
      uint64_t m = *mask;
3432
3433
0
      if (m)
3434
0
      {
3435
0
    uint32_t ssrc = *src | 0xff000000;
3436
0
    __m64 s = load8888 (&ssrc);
3437
3438
0
    if (m == 0xff)
3439
0
    {
3440
0
        store8888 (dst, s);
3441
0
    }
3442
0
    else
3443
0
    {
3444
0
        __m64 sa = expand_alpha (s);
3445
0
        __m64 vm = expand_alpha_rev (to_m64 (m));
3446
0
        __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3447
3448
0
        store8888 (dst, vdest);
3449
0
    }
3450
0
      }
3451
3452
0
      mask++;
3453
0
      dst++;
3454
0
      src++;
3455
0
  }
3456
0
    }
3457
3458
0
    _mm_empty ();
3459
0
}
3460
3461
static void
3462
mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3463
                                   pixman_composite_info_t *info)
3464
0
{
3465
0
    PIXMAN_COMPOSITE_ARGS (info);
3466
0
    uint32_t src;
3467
0
    uint32_t    *dst_line, *dst;
3468
0
    int32_t w;
3469
0
    int dst_stride;
3470
0
    __m64 vsrc;
3471
3472
0
    CHECKPOINT ();
3473
3474
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3475
3476
0
    if (src == 0)
3477
0
  return;
3478
3479
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3480
3481
0
    vsrc = load8888 (&src);
3482
3483
0
    while (height--)
3484
0
    {
3485
0
  dst = dst_line;
3486
0
  dst_line += dst_stride;
3487
0
  w = width;
3488
3489
0
  CHECKPOINT ();
3490
3491
0
  while (w && (uintptr_t)dst & 7)
3492
0
  {
3493
0
      __m64 vdest = load8888 (dst);
3494
3495
0
      store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3496
3497
0
      w--;
3498
0
      dst++;
3499
0
  }
3500
3501
0
  while (w >= 2)
3502
0
  {
3503
0
      __m64 vdest = *(__m64 *)dst;
3504
0
      __m64 dest0 = expand8888 (vdest, 0);
3505
0
      __m64 dest1 = expand8888 (vdest, 1);
3506
3507
3508
0
      dest0 = over (dest0, expand_alpha (dest0), vsrc);
3509
0
      dest1 = over (dest1, expand_alpha (dest1), vsrc);
3510
3511
0
      *(__m64 *)dst = pack8888 (dest0, dest1);
3512
3513
0
      dst += 2;
3514
0
      w -= 2;
3515
0
  }
3516
3517
0
  CHECKPOINT ();
3518
3519
0
  if (w)
3520
0
  {
3521
0
      __m64 vdest = load8888 (dst);
3522
3523
0
      store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3524
0
  }
3525
0
    }
3526
3527
0
    _mm_empty ();
3528
0
}
3529
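
OVER_REVERSE, computed above by feeding over () with the operands swapped (destination first, solid source last), keeps the destination on top: the source only shows through where the destination is transparent, i.e. result = d + (255 - da) * s / 255 per premultiplied channel. One channel as a sketch:

#include <stdint.h>

static uint8_t mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = a * (uint16_t)b + 0x80;
    return (t + (t >> 8)) >> 8;
}

/* d, s premultiplied; da is the destination's alpha byte */
static uint8_t over_reverse_un8 (uint8_t d, uint8_t da, uint8_t s)
{
    return d + mul_un8 (s, 255 - da);
}
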
3530
static force_inline void
3531
scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t*       pd,
3532
                                            const uint32_t* ps,
3533
                                            int32_t         w,
3534
                                            pixman_fixed_t  vx,
3535
                                            pixman_fixed_t  unit_x,
3536
                                            pixman_fixed_t  src_width_fixed,
3537
                                            pixman_bool_t   fully_transparent_src)
3538
0
{
3539
0
    if (fully_transparent_src)
3540
0
  return;
3541
3542
0
    while (w)
3543
0
    {
3544
0
  __m64 d = load (pd);
3545
0
  __m64 s = load (ps + pixman_fixed_to_int (vx));
3546
0
  vx += unit_x;
3547
0
  while (vx >= 0)
3548
0
      vx -= src_width_fixed;
3549
3550
0
  store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
3551
0
  pd++;
3552
3553
0
  w--;
3554
0
    }
3555
3556
0
    _mm_empty ();
3557
0
}
3558
3559
FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
3560
           scaled_nearest_scanline_mmx_8888_8888_OVER,
3561
           uint32_t, uint32_t, COVER)
3562
FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
3563
           scaled_nearest_scanline_mmx_8888_8888_OVER,
3564
           uint32_t, uint32_t, NONE)
3565
FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
3566
           scaled_nearest_scanline_mmx_8888_8888_OVER,
3567
           uint32_t, uint32_t, PAD)
3568
FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
3569
           scaled_nearest_scanline_mmx_8888_8888_OVER,
3570
           uint32_t, uint32_t, NORMAL)
3571
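
The nearest-neighbour scanlines above step a 16.16 fixed-point source coordinate: pixman_fixed_to_int (vx) selects the sample, unit_x advances it, and the wrap keeps the coordinate inside one source width for the repeating (NORMAL) variants. A simplified scalar walk; note the real main loops pre-bias ps so their wrap test compares vx against zero, whereas this sketch uses the plainer form:

#include <stdint.h>

#define FIXED_BITS 16                      /* pixman_fixed_t is 16.16 */
#define fixed_to_int(f) ((int)((f) >> FIXED_BITS))

void nearest_walk (uint32_t *dst, const uint32_t *src, int32_t w,
                   int32_t vx, int32_t unit_x, int32_t src_width_fixed)
{
    while (w--)
    {
        *dst++ = src[fixed_to_int (vx)];   /* truncation == nearest   */
        vx += unit_x;
        while (vx >= src_width_fixed)      /* repeat: wrap into range */
            vx -= src_width_fixed;
    }
}
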
3572
static force_inline void
3573
scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask,
3574
                uint32_t *       dst,
3575
                const uint32_t * src,
3576
                int32_t          w,
3577
                pixman_fixed_t   vx,
3578
                pixman_fixed_t   unit_x,
3579
                pixman_fixed_t   src_width_fixed,
3580
                pixman_bool_t    zero_src)
3581
0
{
3582
0
    __m64 mm_mask;
3583
3584
0
    if (zero_src || (*mask >> 24) == 0)
3585
0
    {
3586
  /* A workaround for https://gcc.gnu.org/PR47759 */
3587
0
  _mm_empty ();
3588
0
  return;
3589
0
    }
3590
3591
0
    mm_mask = expand_alpha (load8888 (mask));
3592
3593
0
    while (w)
3594
0
    {
3595
0
  uint32_t s = *(src + pixman_fixed_to_int (vx));
3596
0
  vx += unit_x;
3597
0
  while (vx >= 0)
3598
0
      vx -= src_width_fixed;
3599
3600
0
  if (s)
3601
0
  {
3602
0
      __m64 ms = load8888 (&s);
3603
0
      __m64 alpha = expand_alpha (ms);
3604
0
      __m64 dest  = load8888 (dst);
3605
3606
0
      store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
3607
0
  }
3608
3609
0
  dst++;
3610
0
  w--;
3611
0
    }
3612
3613
0
    _mm_empty ();
3614
0
}
3615
3616
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
3617
            scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3618
            uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
3619
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
3620
            scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3621
            uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
3622
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
3623
            scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3624
            uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
3625
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
3626
            scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3627
            uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
3628
3629
0
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3630
0
#define BMSK (BSHIFT - 1)
3631
3632
#define BILINEAR_DECLARE_VARIABLES            \
3633
0
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);        \
3634
0
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);        \
3635
0
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);       \
3636
0
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);      \
3637
0
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);    \
3638
0
    const __m64 mm_zero = _mm_setzero_si64 ();          \
3639
0
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3640
3641
0
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)         \
3642
0
do {                   \
3643
0
    /* fetch 2x2 pixel block into 2 mmx registers */        \
3644
0
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);    \
3645
0
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);    \
3646
0
    /* vertical interpolation */            \
3647
0
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);   \
3648
0
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);   \
3649
0
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);   \
3650
0
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);   \
3651
0
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);         \
3652
0
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);         \
3653
0
    /* calculate horizontal weights */            \
3654
0
    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,   \
3655
0
        _mm_srli_pi16 (mm_x,          \
3656
0
           16 - BILINEAR_INTERPOLATION_BITS)));  \
3657
0
    /* horizontal interpolation */            \
3658
0
    __m64 p = _mm_unpacklo_pi16 (lo, hi);         \
3659
0
    __m64 q = _mm_unpackhi_pi16 (lo, hi);         \
3660
0
    vx += unit_x;               \
3661
0
    lo = _mm_madd_pi16 (p, mm_wh);            \
3662
0
    hi = _mm_madd_pi16 (q, mm_wh);            \
3663
0
    mm_x = _mm_add_pi16 (mm_x, mm_ux);            \
3664
0
    /* shift and pack the result */           \
3665
0
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);      \
3666
0
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);      \
3667
0
    lo = _mm_packs_pi32 (lo, hi);           \
3668
0
    lo = _mm_packs_pu16 (lo, lo);           \
3669
0
    pix = lo;                 \
3670
0
} while (0)
3671
3672
0
#define BILINEAR_SKIP_ONE_PIXEL()           \
3673
0
do {                   \
3674
0
    vx += unit_x;               \
3675
0
    mm_x = _mm_add_pi16 (mm_x, mm_ux);            \
3676
0
} while (0)
3677
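
BILINEAR_INTERPOLATE_ONE_PIXEL above computes, per channel, a weighted average of a 2x2 source block: vertical weights wt/wb, and horizontal weights derived from the fractional bits of vx, all scaled to B = 1 << BILINEAR_INTERPOLATION_BITS so that each pair sums to B and the result needs a single shift by 2 * BITS. The macro interpolates vertically first; by linearity the order does not matter. A scalar per-channel form, assuming the default BILINEAR_INTERPOLATION_BITS of 7 (an assumption on my part):

#include <stdint.h>

#define BITS 7          /* assumed BILINEAR_INTERPOLATION_BITS */
#define B (1 << BITS)

/* tl/tr/bl/br: the 2x2 block; wt + wb == B; fx in [0, B) is the
 * fractional horizontal position.  Max intermediate is 255 * B * B,
 * which fits an int, and the shift brings it back to [0, 255]. */
static uint8_t bilinear_channel (uint8_t tl, uint8_t tr,
                                 uint8_t bl, uint8_t br,
                                 int wt, int wb, int fx)
{
    int top    = tl * (B - fx) + tr * fx;   /* horizontal, top row    */
    int bottom = bl * (B - fx) + br * fx;   /* horizontal, bottom row */
    return (top * wt + bottom * wb) >> (2 * BITS);
}
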
3678
static force_inline void
3679
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
3680
              const uint32_t * mask,
3681
              const uint32_t * src_top,
3682
              const uint32_t * src_bottom,
3683
              int32_t          w,
3684
              int              wt,
3685
              int              wb,
3686
              pixman_fixed_t   vx,
3687
              pixman_fixed_t   unit_x,
3688
              pixman_fixed_t   max_vx,
3689
              pixman_bool_t    zero_src)
3690
0
{
3691
0
    BILINEAR_DECLARE_VARIABLES;
3692
0
    __m64 pix;
3693
3694
0
    while (w--)
3695
0
    {
3696
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3697
0
  store (dst, pix);
3698
0
  dst++;
3699
0
    }
3700
3701
0
    _mm_empty ();
3702
0
}
3703
3704
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3705
             scaled_bilinear_scanline_mmx_8888_8888_SRC,
3706
             uint32_t, uint32_t, uint32_t,
3707
             COVER, FLAG_NONE)
3708
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3709
             scaled_bilinear_scanline_mmx_8888_8888_SRC,
3710
             uint32_t, uint32_t, uint32_t,
3711
             PAD, FLAG_NONE)
3712
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3713
             scaled_bilinear_scanline_mmx_8888_8888_SRC,
3714
             uint32_t, uint32_t, uint32_t,
3715
             NONE, FLAG_NONE)
3716
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3717
             scaled_bilinear_scanline_mmx_8888_8888_SRC,
3718
             uint32_t, uint32_t, uint32_t,
3719
             NORMAL, FLAG_NONE)
3720
3721
static force_inline void
3722
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
3723
               const uint32_t * mask,
3724
               const uint32_t * src_top,
3725
               const uint32_t * src_bottom,
3726
               int32_t          w,
3727
               int              wt,
3728
               int              wb,
3729
               pixman_fixed_t   vx,
3730
               pixman_fixed_t   unit_x,
3731
               pixman_fixed_t   max_vx,
3732
               pixman_bool_t    zero_src)
3733
0
{
3734
0
    BILINEAR_DECLARE_VARIABLES;
3735
0
    __m64 pix1, pix2;
3736
3737
0
    while (w)
3738
0
    {
3739
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3740
3741
0
  if (!is_zero (pix1))
3742
0
  {
3743
0
      pix2 = load (dst);
3744
0
      store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3745
0
  }
3746
3747
0
  w--;
3748
0
  dst++;
3749
0
    }
3750
3751
0
    _mm_empty ();
3752
0
}
3753
3754
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3755
             scaled_bilinear_scanline_mmx_8888_8888_OVER,
3756
             uint32_t, uint32_t, uint32_t,
3757
             COVER, FLAG_NONE)
3758
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3759
             scaled_bilinear_scanline_mmx_8888_8888_OVER,
3760
             uint32_t, uint32_t, uint32_t,
3761
             PAD, FLAG_NONE)
3762
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3763
             scaled_bilinear_scanline_mmx_8888_8888_OVER,
3764
             uint32_t, uint32_t, uint32_t,
3765
             NONE, FLAG_NONE)
3766
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3767
             scaled_bilinear_scanline_mmx_8888_8888_OVER,
3768
             uint32_t, uint32_t, uint32_t,
3769
             NORMAL, FLAG_NONE)
3770
3771
static force_inline void
3772
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
3773
                 const uint8_t  * mask,
3774
                 const uint32_t * src_top,
3775
                 const uint32_t * src_bottom,
3776
                 int32_t          w,
3777
                 int              wt,
3778
                 int              wb,
3779
                 pixman_fixed_t   vx,
3780
                 pixman_fixed_t   unit_x,
3781
                 pixman_fixed_t   max_vx,
3782
                 pixman_bool_t    zero_src)
3783
0
{
3784
0
    BILINEAR_DECLARE_VARIABLES;
3785
0
    __m64 pix1, pix2;
3786
0
    uint32_t m;
3787
3788
0
    while (w)
3789
0
    {
3790
0
  m = (uint32_t) *mask++;
3791
3792
0
  if (m)
3793
0
  {
3794
0
      BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3795
3796
0
      if (m == 0xff && is_opaque (pix1))
3797
0
      {
3798
0
    store (dst, pix1);
3799
0
      }
3800
0
      else
3801
0
      {
3802
0
    __m64 ms, md, ma, msa;
3803
3804
0
    pix2 = load (dst);
3805
0
    ma = expand_alpha_rev (to_m64 (m));
3806
0
    ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3807
0
    md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3808
3809
0
    msa = expand_alpha (ms);
3810
3811
0
    store8888 (dst, (in_over (ms, msa, ma, md)));
3812
0
      }
3813
0
  }
3814
0
  else
3815
0
  {
3816
0
      BILINEAR_SKIP_ONE_PIXEL ();
3817
0
  }
3818
3819
0
  w--;
3820
0
  dst++;
3821
0
    }
3822
3823
0
    _mm_empty ();
3824
0
}
3825
3826
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3827
             scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3828
             uint32_t, uint8_t, uint32_t,
3829
             COVER, FLAG_HAVE_NON_SOLID_MASK)
3830
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3831
             scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3832
             uint32_t, uint8_t, uint32_t,
3833
             PAD, FLAG_HAVE_NON_SOLID_MASK)
3834
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3835
             scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3836
             uint32_t, uint8_t, uint32_t,
3837
             NONE, FLAG_HAVE_NON_SOLID_MASK)
3838
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3839
             scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3840
             uint32_t, uint8_t, uint32_t,
3841
             NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3842
3843
static uint32_t *
3844
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3845
0
{
3846
0
    int w = iter->width;
3847
0
    uint32_t *dst = iter->buffer;
3848
0
    uint32_t *src = (uint32_t *)iter->bits;
3849
3850
0
    iter->bits += iter->stride;
3851
3852
0
    while (w && ((uintptr_t)dst) & 7)
3853
0
    {
3854
0
  *dst++ = (*src++) | 0xff000000;
3855
0
  w--;
3856
0
    }
3857
3858
0
    while (w >= 8)
3859
0
    {
3860
0
  __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3861
0
  __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3862
0
  __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3863
0
  __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3864
3865
0
  *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3866
0
  *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3867
0
  *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3868
0
  *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3869
3870
0
  dst += 8;
3871
0
  src += 8;
3872
0
  w -= 8;
3873
0
    }
3874
3875
0
    while (w)
3876
0
    {
3877
0
  *dst++ = (*src++) | 0xff000000;
3878
0
  w--;
3879
0
    }
3880
3881
0
    _mm_empty ();
3882
0
    return iter->buffer;
3883
0
}
3884
3885
static uint32_t *
3886
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3887
0
{
3888
0
    int w = iter->width;
3889
0
    uint32_t *dst = iter->buffer;
3890
0
    uint16_t *src = (uint16_t *)iter->bits;
3891
3892
0
    iter->bits += iter->stride;
3893
3894
0
    while (w && ((uintptr_t)dst) & 0x0f)
3895
0
    {
3896
0
  uint16_t s = *src++;
3897
3898
0
  *dst++ = convert_0565_to_8888 (s);
3899
0
  w--;
3900
0
    }
3901
3902
0
    while (w >= 4)
3903
0
    {
3904
0
  __m64 vsrc = ldq_u ((__m64 *)src);
3905
0
  __m64 mm0, mm1;
3906
3907
0
  expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3908
3909
0
  *(__m64 *)(dst + 0) = mm0;
3910
0
  *(__m64 *)(dst + 2) = mm1;
3911
3912
0
  dst += 4;
3913
0
  src += 4;
3914
0
  w -= 4;
3915
0
    }
3916
3917
0
    while (w)
3918
0
    {
3919
0
  uint16_t s = *src++;
3920
3921
0
  *dst++ = convert_0565_to_8888 (s);
3922
0
  w--;
3923
0
    }
3924
3925
0
    _mm_empty ();
3926
0
    return iter->buffer;
3927
0
}
3928
3929
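Editor's note: expand_4xpacked565 widens four r5g6b5 pixels per iteration; per pixel the result matches the scalar convert_0565_to_8888 used in the head and tail loops. The key detail is bit replication: each 5- or 6-bit field is shifted up and its top bits copied into the vacated low bits, so full-scale input (0x1f or 0x3f) maps to exactly 0xff. A scalar sketch:

    #include <stdint.h>

    /* Widen 5/6/5 fields to 8 bits by shift-and-replicate, then set
     * the alpha byte; equivalent to pixman's convert_0565_to_8888. */
    static uint32_t convert_0565_scalar (uint16_t s)
    {
        uint32_t r = (s >> 11) & 0x1f;
        uint32_t g = (s >> 5)  & 0x3f;
        uint32_t b =  s        & 0x1f;

        r = (r << 3) | (r >> 2);   /* 5 -> 8 bits */
        g = (g << 2) | (g >> 4);   /* 6 -> 8 bits */
        b = (b << 3) | (b >> 2);   /* 5 -> 8 bits */

        return 0xff000000 | (r << 16) | (g << 8) | b;
    }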
static uint32_t *
3930
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3931
0
{
3932
0
    int w = iter->width;
3933
0
    uint32_t *dst = iter->buffer;
3934
0
    uint8_t *src = iter->bits;
3935
3936
0
    iter->bits += iter->stride;
3937
3938
0
    while (w && (((uintptr_t)dst) & 15))
3939
0
    {
3940
0
  *dst++ = (uint32_t)*(src++) << 24;
3941
0
  w--;
3942
0
    }
3943
3944
0
    while (w >= 8)
3945
0
    {
3946
0
  __m64 mm0 = ldq_u ((__m64 *)src);
3947
3948
0
  __m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
3949
0
  __m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
3950
0
  __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3951
0
  __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3952
0
  __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3953
0
  __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3954
3955
0
  *(__m64 *)(dst + 0) = mm3;
3956
0
  *(__m64 *)(dst + 2) = mm4;
3957
0
  *(__m64 *)(dst + 4) = mm5;
3958
0
  *(__m64 *)(dst + 6) = mm6;
3959
3960
0
  dst += 8;
3961
0
  src += 8;
3962
0
  w -= 8;
3963
0
    }
3964
3965
0
    while (w)
3966
0
    {
3967
0
  *dst++ = (uint32_t)*(src++) << 24;
3968
0
  w--;
3969
0
    }
3970
3971
0
    _mm_empty ();
3972
0
    return iter->buffer;
3973
0
}
3974
3975
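Editor's note: the double unpack in mmx_fetch_a8 is the whole trick: interleaving zero bytes below each a8 value at 8-bit and then 16-bit granularity leaves every source byte in bits 31:24 of its 32-bit lane, i.e. each output pixel is alpha << 24 with r = g = b = 0. A self-contained scalar emulation of the two stages (little-endian lane order assumed, as on x86):

    #include <assert.h>
    #include <stdint.h>

    /* Emulate _mm_unpacklo_pi8 (0, v): each of the four low bytes of v
     * moves to the high byte of a 16-bit lane. */
    static uint64_t unpacklo_pi8_zero (uint64_t v)
    {
        uint64_t r = 0;
        int      i;

        for (i = 0; i < 4; i++)
            r |= ((v >> (8 * i)) & 0xff) << (16 * i + 8);
        return r;
    }

    /* Emulate _mm_unpacklo_pi16 (0, v): each of the two low 16-bit
     * lanes of v moves to the high half of a 32-bit lane. */
    static uint64_t unpacklo_pi16_zero (uint64_t v)
    {
        return ((v & 0xffff) << 16) | (((v >> 16) & 0xffff) << 48);
    }

    int main (void)
    {
        /* a8 bytes 0x12, 0x34 become pixels 0x12000000, 0x34000000. */
        uint64_t q = unpacklo_pi16_zero (unpacklo_pi8_zero (0x3412));

        assert ((uint32_t)q == 0x12000000u);
        assert ((uint32_t)(q >> 32) == 0x34000000u);
        return 0;
    }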
#define IMAGE_FLAGS             \
3976
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |    \
3977
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
3978
3979
static const pixman_iter_info_t mmx_iters[] =
3980
{
3981
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
3982
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
3983
    },
3984
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
3985
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
3986
    },
3987
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
3988
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
3989
    },
3990
    { PIXMAN_null },
3991
};
3992
3993
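Editor's note: this table is consulted when pixman reads an image through its iterator interface. Selection walks the chain of implementations and, within each, scans entries like these for one whose format matches and whose required flags are all present; if nothing matches, the fallback implementation's iterators are used. A simplified, hypothetical sketch of that scan, assuming the types from pixman-private.h (the real walk lives in pixman-implementation.c):

    /* Hypothetical sketch: return the first table row whose format
     * matches and whose required image/iter flags are all satisfied,
     * or NULL so the caller can delegate to the fallback. */
    static const pixman_iter_info_t *
    lookup_iter (const pixman_iter_info_t *table,
                 pixman_format_code_t      format,
                 uint32_t                  image_flags,
                 iter_flags_t              iter_flags)
    {
        const pixman_iter_info_t *info;

        for (info = table; info->format != PIXMAN_null; info++)
        {
            if ((info->format == PIXMAN_any || info->format == format) &&
                (info->image_flags & image_flags) == info->image_flags &&
                (info->iter_flags & iter_flags) == info->iter_flags)
            {
                return info;
            }
        }
        return NULL;
    }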
static const pixman_fast_path_t mmx_fast_paths[] =
3994
{
3995
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
3996
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
3997
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
3998
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
3999
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
4000
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
4001
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
4002
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
4003
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
4004
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
4005
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
4006
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
4007
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
4008
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
4009
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
4010
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
4011
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
4012
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
4013
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
4014
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
4015
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
4016
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
4017
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
4018
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
4019
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
4020
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
4021
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
4022
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
4023
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
4024
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
4025
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
4026
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
4027
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
4028
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
4029
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
4030
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
4031
4032
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
4033
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
4034
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
4035
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
4036
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
4037
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
4038
4039
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
4040
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
4041
4042
    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
4043
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
4044
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
4045
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
4046
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
4047
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
4048
4049
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
4050
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
4051
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
4052
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
4053
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
4054
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
4055
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
4056
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
4057
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
4058
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
4059
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
4060
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
4061
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
4062
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
4063
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
4064
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
4065
4066
    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
4067
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
4068
4069
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, x8r8g8b8, mmx_8888_8888                            ),
4070
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, x8b8g8r8, mmx_8888_8888                            ),
4071
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, a8r8g8b8, mmx_8888_8888                            ),
4072
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, a8b8g8r8, mmx_8888_8888                            ),
4073
4074
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888                 ),
4075
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888                 ),
4076
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888                 ),
4077
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888                 ),
4078
4079
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
4080
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
4081
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
4082
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
4083
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
4084
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
4085
4086
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
4087
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
4088
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
4089
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
4090
4091
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
4092
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
4093
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
4094
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),
4095
4096
    { PIXMAN_OP_NONE },
4097
};
4098
4099
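Editor's note: each row above keys a composite function on (op, source format, mask format, destination format) plus per-image flag requirements; at composite time the implementation chain is searched top to bottom and the first matching row wins, so more specific entries must precede more general ones. A simplified, hypothetical sketch of the per-row test, assuming pixman-private.h types (the real lookup also caches its results):

    /* Hypothetical sketch of matching one fast-path row: formats must
     * be equal and every flag the row requires must be present. */
    static pixman_bool_t
    fast_path_matches (const pixman_fast_path_t *p,
                       pixman_op_t               op,
                       pixman_format_code_t      src_format,  uint32_t src_flags,
                       pixman_format_code_t      mask_format, uint32_t mask_flags,
                       pixman_format_code_t      dest_format, uint32_t dest_flags)
    {
        return p->op          == op          &&
               p->src_format  == src_format  &&
               p->mask_format == mask_format &&
               p->dest_format == dest_format &&
               (p->src_flags  & src_flags)  == p->src_flags  &&
               (p->mask_flags & mask_flags) == p->mask_flags &&
               (p->dest_flags & dest_flags) == p->dest_flags;
    }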
pixman_implementation_t *
4100
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
4101
12
{
4102
12
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
4103
4104
12
    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
4105
12
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
4106
12
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
4107
12
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
4108
12
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
4109
12
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
4110
12
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
4111
12
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
4112
12
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
4113
12
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
4114
12
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
4115
4116
12
    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
4117
12
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
4118
12
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
4119
12
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
4120
12
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
4121
12
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
4122
12
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
4123
12
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
4124
12
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
4125
12
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
4126
12
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
4127
4128
12
    imp->blt = mmx_blt;
4129
12
    imp->fill = mmx_fill;
4130
4131
12
    imp->iter_info = mmx_iters;
4132
4133
12
    return imp;
4134
12
}
4135
4136
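Editor's note: this constructor takes a slower implementation as its fallback, overrides the combiners, iterators, blt and fill it can accelerate, and leaves everything else delegating downward; operations it does not set keep the fallback's behavior. A hedged sketch of how such a chain is typically assembled at startup, mirroring pixman's x86 setup after its CPUID checks (the have_mmx flag here is illustrative):

    /* Hypothetical sketch: layer implementations from slowest to
     * fastest, each keeping the previous one as its fallback. */
    static pixman_implementation_t *
    choose_implementation (pixman_bool_t have_mmx)
    {
        pixman_implementation_t *imp;

        imp = _pixman_implementation_create_general ();
        imp = _pixman_implementation_create_fast_path (imp);

        if (have_mmx)
            imp = _pixman_implementation_create_mmx (imp);

        return imp;
    }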
#endif /* USE_X86_MMX || USE_LOONGSON_MMI */