Coverage Report

Created: 2025-07-23 08:13

/src/cairo/subprojects/pixman-0.44.2/pixman/pixman-sse2.c
Line
Count
Source
1
/*
2
 * Copyright © 2008 Rodrigo Kumpera
3
 * Copyright © 2008 André Tupinambá
4
 *
5
 * Permission to use, copy, modify, distribute, and sell this software and its
6
 * documentation for any purpose is hereby granted without fee, provided that
7
 * the above copyright notice appear in all copies and that both that
8
 * copyright notice and this permission notice appear in supporting
9
 * documentation, and that the name of Red Hat not be used in advertising or
10
 * publicity pertaining to distribution of the software without specific,
11
 * written prior permission.  Red Hat makes no representations about the
12
 * suitability of this software for any purpose.  It is provided "as is"
13
 * without express or implied warranty.
14
 *
15
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22
 * SOFTWARE.
23
 *
24
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25
 *          André Tupinambá (andrelrt@gmail.com)
26
 *
27
 * Based on work by Owen Taylor and Søren Sandmann
28
 */
29
#ifdef HAVE_CONFIG_H
30
#include <pixman-config.h>
31
#endif
32
33
/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
34
#define PSHUFD_IS_FAST 0
35
36
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
37
#include <emmintrin.h> /* for SSE2 intrinsics */
38
#include "pixman-private.h"
39
#include "pixman-combine32.h"
40
#include "pixman-inlines.h"
41
42
static __m128i mask_0080;
43
static __m128i mask_00ff;
44
static __m128i mask_0101;
45
static __m128i mask_ffff;
46
static __m128i mask_ff000000;
47
static __m128i mask_alpha;
48
49
static __m128i mask_565_r;
50
static __m128i mask_565_g1, mask_565_g2;
51
static __m128i mask_565_b;
52
static __m128i mask_red;
53
static __m128i mask_green;
54
static __m128i mask_blue;
55
56
static __m128i mask_565_fix_rb;
57
static __m128i mask_565_fix_g;
58
59
static __m128i mask_565_rb;
60
static __m128i mask_565_pack_multiplier;
61
62
static force_inline __m128i
63
unpack_32_1x128 (uint32_t data)
64
739
{
65
739
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
66
739
}
67
68
static force_inline void
69
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
70
1.38k
{
71
1.38k
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
72
1.38k
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
73
1.38k
}
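
Aside: unpacking interleaves each byte with zero, widening every 8-bit channel to 16 bits — the working format for all of the arithmetic below. Viewed as scalars, a single little-endian a8r8g8b8 pixel unpacks like this (a sketch; the helper name is illustrative, not pixman API):

#include <stdint.h>

/* Scalar view of unpack_32_1x128: one 16-bit lane per 8-bit channel,
 * blue in lane 0 and alpha in lane 3 (little-endian), which is why
 * expand_alpha_1x128 below broadcasts lane 3. */
static void unpack_argb (uint32_t pixel, uint16_t lane[4])
{
    lane[0] = (uint16_t)( pixel        & 0xff);  /* B */
    lane[1] = (uint16_t)((pixel >> 8)  & 0xff);  /* G */
    lane[2] = (uint16_t)((pixel >> 16) & 0xff);  /* R */
    lane[3] = (uint16_t)( pixel >> 24);          /* A */
}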
74
75
static force_inline __m128i
76
unpack_565_to_8888 (__m128i lo)
77
0
{
78
0
    __m128i r, g, b, rb, t;
79
80
0
    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
81
0
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
82
0
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
83
84
0
    rb = _mm_or_si128 (r, b);
85
0
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
86
0
    t  = _mm_srli_epi32 (t, 5);
87
0
    rb = _mm_or_si128 (rb, t);
88
89
0
    t  = _mm_and_si128 (g, mask_565_fix_g);
90
0
    t  = _mm_srli_epi32 (t, 6);
91
0
    g  = _mm_or_si128 (g, t);
92
93
0
    return _mm_or_si128 (rb, g);
94
0
}
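
Aside: the or-in of a shifted copy replicates each channel's high bits into its new low bits, so 0x1f expands to 0xff and 0 stays 0, with no bias. A scalar sketch of the same expansion for one r5g6b5 pixel (helper name illustrative, not pixman API):

#include <stdint.h>

static uint32_t expand_565_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5)  & 0x3f;
    uint32_t b =  p        & 0x1f;

    /* Shift up, then replicate the top bits into the low bits — the
     * scalar analogue of the mask_565_fix_rb / mask_565_fix_g or-in. */
    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;    /* alpha left at 0, as above */
}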
95
96
static force_inline void
97
unpack_565_128_4x128 (__m128i  data,
98
                      __m128i* data0,
99
                      __m128i* data1,
100
                      __m128i* data2,
101
                      __m128i* data3)
102
0
{
103
0
    __m128i lo, hi;
104
105
0
    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
106
0
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
107
108
0
    lo = unpack_565_to_8888 (lo);
109
0
    hi = unpack_565_to_8888 (hi);
110
111
0
    unpack_128_2x128 (lo, data0, data1);
112
0
    unpack_128_2x128 (hi, data2, data3);
113
0
}
114
115
static force_inline uint16_t
116
pack_565_32_16 (uint32_t pixel)
117
0
{
118
0
    return (uint16_t) (((pixel >> 8) & 0xf800) |
119
0
           ((pixel >> 5) & 0x07e0) |
120
0
           ((pixel >> 3) & 0x001f));
121
0
}
122
123
static force_inline __m128i
124
pack_2x128_128 (__m128i lo, __m128i hi)
125
685
{
126
685
    return _mm_packus_epi16 (lo, hi);
127
685
}
128
129
static force_inline __m128i
130
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
131
0
{
132
0
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
133
0
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
134
135
0
    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
136
0
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
137
138
0
    __m128i g0 = _mm_and_si128 (lo, mask_green);
139
0
    __m128i g1 = _mm_and_si128 (hi, mask_green);
140
141
0
    t0 = _mm_or_si128 (t0, g0);
142
0
    t1 = _mm_or_si128 (t1, g1);
143
144
    /* Simulates _mm_packus_epi32 */
145
0
    t0 = _mm_slli_epi32 (t0, 16 - 5);
146
0
    t1 = _mm_slli_epi32 (t1, 16 - 5);
147
0
    t0 = _mm_srai_epi32 (t0, 16);
148
0
    t1 = _mm_srai_epi32 (t1, 16);
149
0
    return _mm_packs_epi32 (t0, t1);
150
0
}
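
Aside: SSE2 lacks an unsigned 32→16 pack (_mm_packus_epi32 arrived with SSE4.1), hence the shift dance. Assuming the finished 565 value sits in bits 5..20 of each lane (which the 16 − 5 shift suggests), shifting left by 11 and arithmetically right by 16 leaves it sign-extended within int16 range, so the signed saturating pack passes it through unchanged. Per-lane scalar sketch:

#include <stdint.h>

static int16_t pack_565_lane (uint32_t lane)
{
    /* bits 5..20 -> bits 16..31 -> sign-extended bits 0..15 */
    return (int16_t)(uint16_t)((lane << 11) >> 16);
}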
151
152
static force_inline __m128i
153
pack_565_2x128_128 (__m128i lo, __m128i hi)
154
0
{
155
0
    __m128i data;
156
0
    __m128i r, g1, g2, b;
157
158
0
    data = pack_2x128_128 (lo, hi);
159
160
0
    r  = _mm_and_si128 (data, mask_565_r);
161
0
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
162
0
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
163
0
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
164
165
0
    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
166
0
}
167
168
static force_inline __m128i
169
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
170
0
{
171
0
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
172
0
           pack_565_2x128_128 (*xmm2, *xmm3));
173
0
}
174
175
static force_inline int
176
is_opaque (__m128i x)
177
1.65k
{
178
1.65k
    __m128i ffs = _mm_cmpeq_epi8 (x, x);
179
180
1.65k
    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
181
1.65k
}
182
183
static force_inline int
184
is_zero (__m128i x)
185
4.43k
{
186
4.43k
    return _mm_movemask_epi8 (
187
4.43k
  _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
188
4.43k
}
189
190
static force_inline int
191
is_transparent (__m128i x)
192
0
{
193
0
    return (_mm_movemask_epi8 (
194
0
    _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
195
0
}
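
Aside: _mm_movemask_epi8 gathers the top bit of each of the 16 bytes; masking with 0x8888 keeps bits 3, 7, 11 and 15 — the alpha bytes of four packed little-endian a8r8g8b8 pixels. Scalar equivalents of the three predicates over four pixels (a sketch, not pixman API):

#include <stdint.h>

static int is_opaque_4 (const uint32_t p[4])       /* all alphas == 0xff */
{
    return ((p[0] & p[1] & p[2] & p[3]) >> 24) == 0xff;
}

static int is_zero_4 (const uint32_t p[4])         /* all pixels == 0 */
{
    return (p[0] | p[1] | p[2] | p[3]) == 0;
}

static int is_transparent_4 (const uint32_t p[4])  /* all alphas == 0 */
{
    return ((p[0] | p[1] | p[2] | p[3]) >> 24) == 0;
}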
196
197
static force_inline __m128i
198
expand_pixel_32_1x128 (uint32_t data)
199
2
{
200
2
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
201
2
}
202
203
static force_inline __m128i
204
expand_alpha_1x128 (__m128i data)
205
230
{
206
230
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
207
230
                 _MM_SHUFFLE (3, 3, 3, 3)),
208
230
        _MM_SHUFFLE (3, 3, 3, 3));
209
230
}
210
211
static force_inline void
212
expand_alpha_2x128 (__m128i  data_lo,
213
                    __m128i  data_hi,
214
                    __m128i* alpha_lo,
215
                    __m128i* alpha_hi)
216
558
{
217
558
    __m128i lo, hi;
218
219
558
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
220
558
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
221
222
558
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
223
558
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
224
558
}
225
226
static force_inline void
227
expand_alpha_rev_2x128 (__m128i  data_lo,
228
                        __m128i  data_hi,
229
                        __m128i* alpha_lo,
230
                        __m128i* alpha_hi)
231
137
{
232
137
    __m128i lo, hi;
233
234
137
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
235
137
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
236
137
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
237
137
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
238
137
}
239
240
static force_inline void
241
pix_multiply_2x128 (__m128i* data_lo,
242
                    __m128i* data_hi,
243
                    __m128i* alpha_lo,
244
                    __m128i* alpha_hi,
245
                    __m128i* ret_lo,
246
                    __m128i* ret_hi)
247
959
{
248
959
    __m128i lo, hi;
249
250
959
    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
251
959
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
252
959
    lo = _mm_adds_epu16 (lo, mask_0080);
253
959
    hi = _mm_adds_epu16 (hi, mask_0080);
254
959
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
255
959
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
256
959
}
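
Aside: the mullo / adds 0x0080 / mulhi 0x0101 sequence is the classic exact rounded division by 255: for t = a·b with a, b in [0, 255], ((t + 0x80) · 0x101) >> 16 equals t / 255 rounded to nearest — the same arithmetic the scalar MUL_UN8 in pixman-combine32.h performs for this range. A scalar sketch:

#include <stdint.h>

/* (a * b) / 255, rounded to nearest — scalar twin of pix_multiply_*. */
static uint8_t mul_un8_scalar (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t)a * b + 0x80;
    return (uint8_t)((t * 0x101) >> 16);
}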
257
258
static force_inline void
259
pix_add_multiply_2x128 (__m128i* src_lo,
260
                        __m128i* src_hi,
261
                        __m128i* alpha_dst_lo,
262
                        __m128i* alpha_dst_hi,
263
                        __m128i* dst_lo,
264
                        __m128i* dst_hi,
265
                        __m128i* alpha_src_lo,
266
                        __m128i* alpha_src_hi,
267
                        __m128i* ret_lo,
268
                        __m128i* ret_hi)
269
0
{
270
0
    __m128i t1_lo, t1_hi;
271
0
    __m128i t2_lo, t2_hi;
272
273
0
    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
274
0
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
275
276
0
    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
277
0
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
278
0
}
279
280
static force_inline void
281
negate_2x128 (__m128i  data_lo,
282
              __m128i  data_hi,
283
              __m128i* neg_lo,
284
              __m128i* neg_hi)
285
685
{
286
685
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
287
685
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
288
685
}
289
290
static force_inline void
291
invert_colors_2x128 (__m128i  data_lo,
292
                     __m128i  data_hi,
293
                     __m128i* inv_lo,
294
                     __m128i* inv_hi)
295
0
{
296
0
    __m128i lo, hi;
297
298
0
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
299
0
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
300
0
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
301
0
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
302
0
}
303
304
static force_inline void
305
over_2x128 (__m128i* src_lo,
306
            __m128i* src_hi,
307
            __m128i* alpha_lo,
308
            __m128i* alpha_hi,
309
            __m128i* dst_lo,
310
            __m128i* dst_hi)
311
685
{
312
685
    __m128i t1, t2;
313
314
685
    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
315
316
685
    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
317
318
685
    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
319
685
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
320
685
}
321
322
static force_inline void
323
over_rev_non_pre_2x128 (__m128i  src_lo,
324
                        __m128i  src_hi,
325
                        __m128i* dst_lo,
326
                        __m128i* dst_hi)
327
0
{
328
0
    __m128i lo, hi;
329
0
    __m128i alpha_lo, alpha_hi;
330
331
0
    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
332
333
0
    lo = _mm_or_si128 (alpha_lo, mask_alpha);
334
0
    hi = _mm_or_si128 (alpha_hi, mask_alpha);
335
336
0
    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
337
338
0
    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
339
340
0
    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
341
0
}
342
343
static force_inline void
344
in_over_2x128 (__m128i* src_lo,
345
               __m128i* src_hi,
346
               __m128i* alpha_lo,
347
               __m128i* alpha_hi,
348
               __m128i* mask_lo,
349
               __m128i* mask_hi,
350
               __m128i* dst_lo,
351
               __m128i* dst_hi)
352
137
{
353
137
    __m128i s_lo, s_hi;
354
137
    __m128i a_lo, a_hi;
355
356
137
    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
357
137
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
358
359
137
    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
360
137
}
361
362
/* load 4 pixels from a 16-byte-aligned address */
363
static force_inline __m128i
364
load_128_aligned (__m128i* src)
365
685
{
366
685
    return _mm_load_si128 (src);
367
685
}
368
369
/* load 4 pixels from an unaligned address */
370
static force_inline __m128i
371
load_128_unaligned (const __m128i* src)
372
4.44k
{
373
4.44k
    return _mm_loadu_si128 (src);
374
4.44k
}
375
376
/* save 4 pixels to a 16-byte-aligned address */
377
static force_inline void
378
save_128_aligned (__m128i* dst,
379
                  __m128i  data)
380
95.0k
{
381
95.0k
    _mm_store_si128 (dst, data);
382
95.0k
}
383
384
static force_inline __m128i
385
load_32_1x128 (uint32_t data)
386
78
{
387
78
    return _mm_cvtsi32_si128 (data);
388
78
}
389
390
static force_inline __m128i
391
expand_alpha_rev_1x128 (__m128i data)
392
78
{
393
78
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
394
78
}
395
396
static force_inline __m128i
397
expand_pixel_8_1x128 (uint8_t data)
398
33
{
399
33
    return _mm_shufflelo_epi16 (
400
33
  unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
401
33
}
402
403
static force_inline __m128i
404
pix_multiply_1x128 (__m128i data,
405
        __m128i alpha)
406
444
{
407
444
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
408
444
              mask_0080),
409
444
          mask_0101);
410
444
}
411
412
static force_inline __m128i
413
pix_add_multiply_1x128 (__m128i* src,
414
      __m128i* alpha_dst,
415
      __m128i* dst,
416
      __m128i* alpha_src)
417
0
{
418
0
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
419
0
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
420
421
0
    return _mm_adds_epu8 (t1, t2);
422
0
}
423
424
static force_inline __m128i
425
negate_1x128 (__m128i data)
426
300
{
427
300
    return _mm_xor_si128 (data, mask_00ff);
428
300
}
429
430
static force_inline __m128i
431
invert_colors_1x128 (__m128i data)
432
0
{
433
0
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
434
0
}
435
436
static force_inline __m128i
437
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
438
300
{
439
300
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
440
300
}
441
442
static force_inline __m128i
443
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
444
72
{
445
72
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
446
72
           pix_multiply_1x128 (*alpha, *mask),
447
72
           *dst);
448
72
}
449
450
static force_inline __m128i
451
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
452
0
{
453
0
    __m128i alpha = expand_alpha_1x128 (src);
454
455
0
    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
456
0
             _mm_or_si128 (alpha, mask_alpha)),
457
0
           alpha,
458
0
           dst);
459
0
}
460
461
static force_inline uint32_t
462
pack_1x128_32 (__m128i data)
463
300
{
464
300
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
465
300
}
466
467
static force_inline __m128i
468
expand565_16_1x128 (uint16_t pixel)
469
0
{
470
0
    __m128i m = _mm_cvtsi32_si128 (pixel);
471
472
0
    m = unpack_565_to_8888 (m);
473
474
0
    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
475
0
}
476
477
static force_inline uint32_t
478
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
479
341
{
480
341
    uint8_t a;
481
341
    __m128i xmms;
482
483
341
    a = src >> 24;
484
485
341
    if (a == 0xff)
486
113
    {
487
113
  return src;
488
113
    }
489
228
    else if (src)
490
228
    {
491
228
  xmms = unpack_32_1x128 (src);
492
228
  return pack_1x128_32 (
493
228
      over_1x128 (xmms, expand_alpha_1x128 (xmms),
494
228
      unpack_32_1x128 (dst)));
495
228
    }
496
497
0
    return dst;
498
341
}
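
Aside: this is the premultiplied Porter-Duff OVER operator, dst' = src + (1 − src.alpha) · dst, with two shortcuts — an opaque source replaces the destination outright, and a zero source leaves it untouched. A scalar reference version (a sketch; names illustrative):

#include <stdint.h>

static uint8_t mul_div_255 (uint32_t a, uint32_t b)   /* rounded a*b/255 */
{
    uint32_t t = a * b + 0x80;
    return (uint8_t)((t + (t >> 8)) >> 8);
}

static uint32_t over_scalar (uint32_t src, uint32_t dst)
{
    uint32_t ia = 255 - (src >> 24);   /* 255 minus source alpha */
    uint32_t out = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t s = (src >> shift) & 0xff;
        uint32_t c = s + mul_div_255 (ia, (dst >> shift) & 0xff);
        if (c > 0xff)
            c = 0xff;                  /* mirrors _mm_adds_epu8 saturation */
        out |= c << shift;
    }
    return out;
}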
499
500
static force_inline uint32_t
501
combine1 (const uint32_t *ps, const uint32_t *pm)
502
0
{
503
0
    uint32_t s;
504
0
    memcpy(&s, ps, sizeof(uint32_t));
505
506
0
    if (pm)
507
0
    {
508
0
  __m128i ms, mm;
509
510
0
  mm = unpack_32_1x128 (*pm);
511
0
  mm = expand_alpha_1x128 (mm);
512
513
0
  ms = unpack_32_1x128 (s);
514
0
  ms = pix_multiply_1x128 (ms, mm);
515
516
0
  s = pack_1x128_32 (ms);
517
0
    }
518
519
0
    return s;
520
0
}
521
522
static force_inline __m128i
523
combine4 (const __m128i *ps, const __m128i *pm)
524
0
{
525
0
    __m128i xmm_src_lo, xmm_src_hi;
526
0
    __m128i xmm_msk_lo, xmm_msk_hi;
527
0
    __m128i s;
528
529
0
    if (pm)
530
0
    {
531
0
  xmm_msk_lo = load_128_unaligned (pm);
532
533
0
  if (is_transparent (xmm_msk_lo))
534
0
      return _mm_setzero_si128 ();
535
0
    }
536
537
0
    s = load_128_unaligned (ps);
538
539
0
    if (pm)
540
0
    {
541
0
  unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
542
0
  unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
543
544
0
  expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
545
546
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
547
0
          &xmm_msk_lo, &xmm_msk_hi,
548
0
          &xmm_src_lo, &xmm_src_hi);
549
550
0
  s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
551
0
    }
552
553
0
    return s;
554
0
}
555
556
static force_inline void
557
core_combine_over_u_sse2_mask (uint32_t *   pd,
558
             const uint32_t*    ps,
559
             const uint32_t*    pm,
560
             int                w)
561
0
{
562
0
    uint32_t s, d;
563
564
    /* Align dst on a 16-byte boundary */
565
0
    while (w && ((uintptr_t)pd & 15))
566
0
    {
567
0
  d = *pd;
568
0
  s = combine1 (ps, pm);
569
570
0
  if (s)
571
0
      *pd = core_combine_over_u_pixel_sse2 (s, d);
572
0
  pd++;
573
0
  ps++;
574
0
  pm++;
575
0
  w--;
576
0
    }
577
578
0
    while (w >= 4)
579
0
    {
580
0
  __m128i mask = load_128_unaligned ((__m128i *)pm);
581
582
0
  if (!is_zero (mask))
583
0
  {
584
0
      __m128i src;
585
0
      __m128i src_hi, src_lo;
586
0
      __m128i mask_hi, mask_lo;
587
0
      __m128i alpha_hi, alpha_lo;
588
589
0
      src = load_128_unaligned ((__m128i *)ps);
590
591
0
      if (is_opaque (_mm_and_si128 (src, mask)))
592
0
      {
593
0
    save_128_aligned ((__m128i *)pd, src);
594
0
      }
595
0
      else
596
0
      {
597
0
    __m128i dst = load_128_aligned ((__m128i *)pd);
598
0
    __m128i dst_hi, dst_lo;
599
600
0
    unpack_128_2x128 (mask, &mask_lo, &mask_hi);
601
0
    unpack_128_2x128 (src, &src_lo, &src_hi);
602
603
0
    expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
604
0
    pix_multiply_2x128 (&src_lo, &src_hi,
605
0
            &mask_lo, &mask_hi,
606
0
            &src_lo, &src_hi);
607
608
0
    unpack_128_2x128 (dst, &dst_lo, &dst_hi);
609
610
0
    expand_alpha_2x128 (src_lo, src_hi,
611
0
            &alpha_lo, &alpha_hi);
612
613
0
    over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
614
0
          &dst_lo, &dst_hi);
615
616
0
    save_128_aligned (
617
0
        (__m128i *)pd,
618
0
        pack_2x128_128 (dst_lo, dst_hi));
619
0
      }
620
0
  }
621
622
0
  pm += 4;
623
0
  ps += 4;
624
0
  pd += 4;
625
0
  w -= 4;
626
0
    }
627
0
    while (w)
628
0
    {
629
0
  d = *pd;
630
0
  s = combine1 (ps, pm);
631
632
0
  if (s)
633
0
      *pd = core_combine_over_u_pixel_sse2 (s, d);
634
0
  pd++;
635
0
  ps++;
636
0
  pm++;
637
638
0
  w--;
639
0
    }
640
0
}
641
642
static force_inline void
643
core_combine_over_u_sse2_no_mask (uint32_t *    pd,
644
          const uint32_t*    ps,
645
          int                w)
646
249
{
647
249
    uint32_t s, d;
648
649
    /* Align dst on a 16-byte boundary */
650
621
    while (w && ((uintptr_t)pd & 15))
651
372
    {
652
372
  d = *pd;
653
372
  s = *ps;
654
655
372
  if (s)
656
152
      *pd = core_combine_over_u_pixel_sse2 (s, d);
657
372
  pd++;
658
372
  ps++;
659
372
  w--;
660
372
    }
661
662
4.68k
    while (w >= 4)
663
4.43k
    {
664
4.43k
  __m128i src;
665
4.43k
  __m128i src_hi, src_lo, dst_hi, dst_lo;
666
4.43k
  __m128i alpha_hi, alpha_lo;
667
668
4.43k
  src = load_128_unaligned ((__m128i *)ps);
669
670
4.43k
  if (!is_zero (src))
671
1.65k
  {
672
1.65k
      if (is_opaque (src))
673
1.11k
      {
674
1.11k
    save_128_aligned ((__m128i *)pd, src);
675
1.11k
      }
676
548
      else
677
548
      {
678
548
    __m128i dst = load_128_aligned ((__m128i *)pd);
679
680
548
    unpack_128_2x128 (src, &src_lo, &src_hi);
681
548
    unpack_128_2x128 (dst, &dst_lo, &dst_hi);
682
683
548
    expand_alpha_2x128 (src_lo, src_hi,
684
548
            &alpha_lo, &alpha_hi);
685
548
    over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
686
548
          &dst_lo, &dst_hi);
687
688
548
    save_128_aligned (
689
548
        (__m128i *)pd,
690
548
        pack_2x128_128 (dst_lo, dst_hi));
691
548
      }
692
1.65k
  }
693
694
4.43k
  ps += 4;
695
4.43k
  pd += 4;
696
4.43k
  w -= 4;
697
4.43k
    }
698
691
    while (w)
699
442
    {
700
442
  d = *pd;
701
442
  s = *ps;
702
703
442
  if (s)
704
189
      *pd = core_combine_over_u_pixel_sse2 (s, d);
705
442
  pd++;
706
442
  ps++;
707
708
442
  w--;
709
442
    }
710
249
}
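
Aside: every combiner in this file follows the same head/body/tail shape — scalar pixels until dst reaches 16-byte alignment, then four pixels per iteration with aligned stores, then a scalar tail. A skeleton of the pattern, shown here as a plain copy (illustrative, not pixman API):

#include <emmintrin.h>
#include <stdint.h>

static void copy_head_body_tail (uint32_t *pd, const uint32_t *ps, int w)
{
    while (w && ((uintptr_t)pd & 15))   /* head: align dst */
    {
        *pd++ = *ps++;
        w--;
    }
    while (w >= 4)                      /* body: 4 pixels per aligned store */
    {
        _mm_store_si128 ((__m128i *)pd,
                         _mm_loadu_si128 ((const __m128i *)ps));
        pd += 4;
        ps += 4;
        w -= 4;
    }
    while (w--)                         /* tail: leftovers, one at a time */
        *pd++ = *ps++;
}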
711
712
static force_inline void
713
sse2_combine_over_u (pixman_implementation_t *imp,
714
                     pixman_op_t              op,
715
                     uint32_t *               pd,
716
                     const uint32_t *         ps,
717
                     const uint32_t *         pm,
718
                     int                      w)
719
249
{
720
249
    if (pm)
721
0
  core_combine_over_u_sse2_mask (pd, ps, pm, w);
722
249
    else
723
249
  core_combine_over_u_sse2_no_mask (pd, ps, w);
724
249
}
725
726
static void
727
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
728
                             pixman_op_t              op,
729
                             uint32_t *               pd,
730
                             const uint32_t *         ps,
731
                             const uint32_t *         pm,
732
                             int                      w)
733
0
{
734
0
    uint32_t s, d;
735
736
0
    __m128i xmm_dst_lo, xmm_dst_hi;
737
0
    __m128i xmm_src_lo, xmm_src_hi;
738
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
739
740
    /* Align dst on a 16-byte boundary */
741
0
    while (w &&
742
0
           ((uintptr_t)pd & 15))
743
0
    {
744
0
  d = *pd;
745
0
  s = combine1 (ps, pm);
746
747
0
  *pd++ = core_combine_over_u_pixel_sse2 (d, s);
748
0
  w--;
749
0
  ps++;
750
0
  if (pm)
751
0
      pm++;
752
0
    }
753
754
0
    while (w >= 4)
755
0
    {
756
  /* I'm loading unaligned because I'm not sure
757
   * about the address alignment.
758
   */
759
0
  xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
760
0
  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
761
762
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
763
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
764
765
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
766
0
          &xmm_alpha_lo, &xmm_alpha_hi);
767
768
0
  over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
769
0
        &xmm_alpha_lo, &xmm_alpha_hi,
770
0
        &xmm_src_lo, &xmm_src_hi);
771
772
  /* rebuild the 4 pixel data and save */
773
0
  save_128_aligned ((__m128i*)pd,
774
0
        pack_2x128_128 (xmm_src_lo, xmm_src_hi));
775
776
0
  w -= 4;
777
0
  ps += 4;
778
0
  pd += 4;
779
780
0
  if (pm)
781
0
      pm += 4;
782
0
    }
783
784
0
    while (w)
785
0
    {
786
0
  d = *pd;
787
0
  s = combine1 (ps, pm);
788
789
0
  *pd++ = core_combine_over_u_pixel_sse2 (d, s);
790
0
  ps++;
791
0
  w--;
792
0
  if (pm)
793
0
      pm++;
794
0
    }
795
0
}
796
797
static force_inline uint32_t
798
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
799
0
{
800
0
    uint32_t maska = src >> 24;
801
802
0
    if (maska == 0)
803
0
    {
804
0
  return 0;
805
0
    }
806
0
    else if (maska != 0xff)
807
0
    {
808
0
  return pack_1x128_32 (
809
0
      pix_multiply_1x128 (unpack_32_1x128 (dst),
810
0
        expand_alpha_1x128 (unpack_32_1x128 (src))));
811
0
    }
812
813
0
    return dst;
814
0
}
815
816
static void
817
sse2_combine_in_u (pixman_implementation_t *imp,
818
                   pixman_op_t              op,
819
                   uint32_t *               pd,
820
                   const uint32_t *         ps,
821
                   const uint32_t *         pm,
822
                   int                      w)
823
0
{
824
0
    uint32_t s, d;
825
826
0
    __m128i xmm_src_lo, xmm_src_hi;
827
0
    __m128i xmm_dst_lo, xmm_dst_hi;
828
829
0
    while (w && ((uintptr_t)pd & 15))
830
0
    {
831
0
  s = combine1 (ps, pm);
832
0
  d = *pd;
833
834
0
  *pd++ = core_combine_in_u_pixel_sse2 (d, s);
835
0
  w--;
836
0
  ps++;
837
0
  if (pm)
838
0
      pm++;
839
0
    }
840
841
0
    while (w >= 4)
842
0
    {
843
0
  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
844
0
  xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
845
846
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
847
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
848
849
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
850
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
851
0
          &xmm_dst_lo, &xmm_dst_hi,
852
0
          &xmm_dst_lo, &xmm_dst_hi);
853
854
0
  save_128_aligned ((__m128i*)pd,
855
0
        pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
856
857
0
  ps += 4;
858
0
  pd += 4;
859
0
  w -= 4;
860
0
  if (pm)
861
0
      pm += 4;
862
0
    }
863
864
0
    while (w)
865
0
    {
866
0
  s = combine1 (ps, pm);
867
0
  d = *pd;
868
869
0
  *pd++ = core_combine_in_u_pixel_sse2 (d, s);
870
0
  w--;
871
0
  ps++;
872
0
  if (pm)
873
0
      pm++;
874
0
    }
875
0
}
876
877
static void
878
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
879
                           pixman_op_t              op,
880
                           uint32_t *               pd,
881
                           const uint32_t *         ps,
882
                           const uint32_t *         pm,
883
                           int                      w)
884
0
{
885
0
    uint32_t s, d;
886
887
0
    __m128i xmm_src_lo, xmm_src_hi;
888
0
    __m128i xmm_dst_lo, xmm_dst_hi;
889
890
0
    while (w && ((uintptr_t)pd & 15))
891
0
    {
892
0
  s = combine1 (ps, pm);
893
0
  d = *pd;
894
895
0
  *pd++ = core_combine_in_u_pixel_sse2 (s, d);
896
0
  ps++;
897
0
  w--;
898
0
  if (pm)
899
0
      pm++;
900
0
    }
901
902
0
    while (w >= 4)
903
0
    {
904
0
  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
905
0
  xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
906
907
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
908
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
909
910
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
911
0
  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
912
0
          &xmm_src_lo, &xmm_src_hi,
913
0
          &xmm_dst_lo, &xmm_dst_hi);
914
915
0
  save_128_aligned (
916
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
917
918
0
  ps += 4;
919
0
  pd += 4;
920
0
  w -= 4;
921
0
  if (pm)
922
0
      pm += 4;
923
0
    }
924
925
0
    while (w)
926
0
    {
927
0
  s = combine1 (ps, pm);
928
0
  d = *pd;
929
930
0
  *pd++ = core_combine_in_u_pixel_sse2 (s, d);
931
0
  w--;
932
0
  ps++;
933
0
  if (pm)
934
0
      pm++;
935
0
    }
936
0
}
937
938
static void
939
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
940
                            pixman_op_t              op,
941
                            uint32_t *               pd,
942
                            const uint32_t *         ps,
943
                            const uint32_t *         pm,
944
                            int                      w)
945
0
{
946
0
    while (w && ((uintptr_t)pd & 15))
947
0
    {
948
0
  uint32_t s = combine1 (ps, pm);
949
0
  uint32_t d = *pd;
950
951
0
  *pd++ = pack_1x128_32 (
952
0
      pix_multiply_1x128 (
953
0
    unpack_32_1x128 (d), negate_1x128 (
954
0
        expand_alpha_1x128 (unpack_32_1x128 (s)))));
955
956
0
  if (pm)
957
0
      pm++;
958
0
  ps++;
959
0
  w--;
960
0
    }
961
962
0
    while (w >= 4)
963
0
    {
964
0
  __m128i xmm_src_lo, xmm_src_hi;
965
0
  __m128i xmm_dst_lo, xmm_dst_hi;
966
967
0
  xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
968
0
  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
969
970
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
971
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
972
973
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
974
0
  negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
975
976
0
  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
977
0
          &xmm_src_lo, &xmm_src_hi,
978
0
          &xmm_dst_lo, &xmm_dst_hi);
979
980
0
  save_128_aligned (
981
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
982
983
0
  ps += 4;
984
0
  pd += 4;
985
0
  if (pm)
986
0
      pm += 4;
987
988
0
  w -= 4;
989
0
    }
990
991
0
    while (w)
992
0
    {
993
0
  uint32_t s = combine1 (ps, pm);
994
0
  uint32_t d = *pd;
995
996
0
  *pd++ = pack_1x128_32 (
997
0
      pix_multiply_1x128 (
998
0
    unpack_32_1x128 (d), negate_1x128 (
999
0
        expand_alpha_1x128 (unpack_32_1x128 (s)))));
1000
0
  ps++;
1001
0
  if (pm)
1002
0
      pm++;
1003
0
  w--;
1004
0
    }
1005
0
}
1006
1007
static void
1008
sse2_combine_out_u (pixman_implementation_t *imp,
1009
                    pixman_op_t              op,
1010
                    uint32_t *               pd,
1011
                    const uint32_t *         ps,
1012
                    const uint32_t *         pm,
1013
                    int                      w)
1014
0
{
1015
0
    while (w && ((uintptr_t)pd & 15))
1016
0
    {
1017
0
  uint32_t s = combine1 (ps, pm);
1018
0
  uint32_t d = *pd;
1019
1020
0
  *pd++ = pack_1x128_32 (
1021
0
      pix_multiply_1x128 (
1022
0
    unpack_32_1x128 (s), negate_1x128 (
1023
0
        expand_alpha_1x128 (unpack_32_1x128 (d)))));
1024
0
  w--;
1025
0
  ps++;
1026
0
  if (pm)
1027
0
      pm++;
1028
0
    }
1029
1030
0
    while (w >= 4)
1031
0
    {
1032
0
  __m128i xmm_src_lo, xmm_src_hi;
1033
0
  __m128i xmm_dst_lo, xmm_dst_hi;
1034
1035
0
  xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1036
0
  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1037
1038
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1039
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1040
1041
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1042
0
  negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1043
1044
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1045
0
          &xmm_dst_lo, &xmm_dst_hi,
1046
0
          &xmm_dst_lo, &xmm_dst_hi);
1047
1048
0
  save_128_aligned (
1049
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1050
1051
0
  ps += 4;
1052
0
  pd += 4;
1053
0
  w -= 4;
1054
0
  if (pm)
1055
0
      pm += 4;
1056
0
    }
1057
1058
0
    while (w)
1059
0
    {
1060
0
  uint32_t s = combine1 (ps, pm);
1061
0
  uint32_t d = *pd;
1062
1063
0
  *pd++ = pack_1x128_32 (
1064
0
      pix_multiply_1x128 (
1065
0
    unpack_32_1x128 (s), negate_1x128 (
1066
0
        expand_alpha_1x128 (unpack_32_1x128 (d)))));
1067
0
  w--;
1068
0
  ps++;
1069
0
  if (pm)
1070
0
      pm++;
1071
0
    }
1072
0
}
1073
1074
static force_inline uint32_t
1075
core_combine_atop_u_pixel_sse2 (uint32_t src,
1076
                                uint32_t dst)
1077
0
{
1078
0
    __m128i s = unpack_32_1x128 (src);
1079
0
    __m128i d = unpack_32_1x128 (dst);
1080
1081
0
    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1082
0
    __m128i da = expand_alpha_1x128 (d);
1083
1084
0
    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1085
0
}
1086
1087
static void
1088
sse2_combine_atop_u (pixman_implementation_t *imp,
1089
                     pixman_op_t              op,
1090
                     uint32_t *               pd,
1091
                     const uint32_t *         ps,
1092
                     const uint32_t *         pm,
1093
                     int                      w)
1094
0
{
1095
0
    uint32_t s, d;
1096
1097
0
    __m128i xmm_src_lo, xmm_src_hi;
1098
0
    __m128i xmm_dst_lo, xmm_dst_hi;
1099
0
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1100
0
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1101
1102
0
    while (w && ((uintptr_t)pd & 15))
1103
0
    {
1104
0
  s = combine1 (ps, pm);
1105
0
  d = *pd;
1106
1107
0
  *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1108
0
  w--;
1109
0
  ps++;
1110
0
  if (pm)
1111
0
      pm++;
1112
0
    }
1113
1114
0
    while (w >= 4)
1115
0
    {
1116
0
  xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1117
0
  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1118
1119
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1120
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1121
1122
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1123
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1124
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1125
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1126
1127
0
  negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1128
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1129
1130
0
  pix_add_multiply_2x128 (
1131
0
      &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1132
0
      &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1133
0
      &xmm_dst_lo, &xmm_dst_hi);
1134
1135
0
  save_128_aligned (
1136
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1137
1138
0
  ps += 4;
1139
0
  pd += 4;
1140
0
  w -= 4;
1141
0
  if (pm)
1142
0
      pm += 4;
1143
0
    }
1144
1145
0
    while (w)
1146
0
    {
1147
0
  s = combine1 (ps, pm);
1148
0
  d = *pd;
1149
1150
0
  *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1151
0
  w--;
1152
0
  ps++;
1153
0
  if (pm)
1154
0
      pm++;
1155
0
    }
1156
0
}
1157
1158
static force_inline uint32_t
1159
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1160
                                        uint32_t dst)
1161
0
{
1162
0
    __m128i s = unpack_32_1x128 (src);
1163
0
    __m128i d = unpack_32_1x128 (dst);
1164
1165
0
    __m128i sa = expand_alpha_1x128 (s);
1166
0
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1167
1168
0
    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1169
0
}
1170
1171
static void
1172
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1173
                             pixman_op_t              op,
1174
                             uint32_t *               pd,
1175
                             const uint32_t *         ps,
1176
                             const uint32_t *         pm,
1177
                             int                      w)
1178
0
{
1179
0
    uint32_t s, d;
1180
1181
0
    __m128i xmm_src_lo, xmm_src_hi;
1182
0
    __m128i xmm_dst_lo, xmm_dst_hi;
1183
0
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1184
0
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1185
1186
0
    while (w && ((uintptr_t)pd & 15))
1187
0
    {
1188
0
  s = combine1 (ps, pm);
1189
0
  d = *pd;
1190
1191
0
  *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1192
0
  ps++;
1193
0
  w--;
1194
0
  if (pm)
1195
0
      pm++;
1196
0
    }
1197
1198
0
    while (w >= 4)
1199
0
    {
1200
0
  xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1201
0
  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1202
1203
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1204
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1205
1206
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1207
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1208
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1209
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1210
1211
0
  negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1212
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1213
1214
0
  pix_add_multiply_2x128 (
1215
0
      &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1216
0
      &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1217
0
      &xmm_dst_lo, &xmm_dst_hi);
1218
1219
0
  save_128_aligned (
1220
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1221
1222
0
  ps += 4;
1223
0
  pd += 4;
1224
0
  w -= 4;
1225
0
  if (pm)
1226
0
      pm += 4;
1227
0
    }
1228
1229
0
    while (w)
1230
0
    {
1231
0
  s = combine1 (ps, pm);
1232
0
  d = *pd;
1233
1234
0
  *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1235
0
  ps++;
1236
0
  w--;
1237
0
  if (pm)
1238
0
      pm++;
1239
0
    }
1240
0
}
1241
1242
static force_inline uint32_t
1243
core_combine_xor_u_pixel_sse2 (uint32_t src,
1244
                               uint32_t dst)
1245
0
{
1246
0
    __m128i s = unpack_32_1x128 (src);
1247
0
    __m128i d = unpack_32_1x128 (dst);
1248
1249
0
    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1250
0
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1251
1252
0
    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1253
0
}
1254
1255
static void
1256
sse2_combine_xor_u (pixman_implementation_t *imp,
1257
                    pixman_op_t              op,
1258
                    uint32_t *               dst,
1259
                    const uint32_t *         src,
1260
                    const uint32_t *         mask,
1261
                    int                      width)
1262
0
{
1263
0
    int w = width;
1264
0
    uint32_t s, d;
1265
0
    uint32_t* pd = dst;
1266
0
    const uint32_t* ps = src;
1267
0
    const uint32_t* pm = mask;
1268
1269
0
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1270
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1271
0
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1272
0
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1273
1274
0
    while (w && ((uintptr_t)pd & 15))
1275
0
    {
1276
0
  s = combine1 (ps, pm);
1277
0
  d = *pd;
1278
1279
0
  *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1280
0
  w--;
1281
0
  ps++;
1282
0
  if (pm)
1283
0
      pm++;
1284
0
    }
1285
1286
0
    while (w >= 4)
1287
0
    {
1288
0
  xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1289
0
  xmm_dst = load_128_aligned ((__m128i*) pd);
1290
1291
0
  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1292
0
  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1293
1294
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1295
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1296
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1297
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1298
1299
0
  negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1300
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1301
0
  negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1302
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1303
1304
0
  pix_add_multiply_2x128 (
1305
0
      &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1306
0
      &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1307
0
      &xmm_dst_lo, &xmm_dst_hi);
1308
1309
0
  save_128_aligned (
1310
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1311
1312
0
  ps += 4;
1313
0
  pd += 4;
1314
0
  w -= 4;
1315
0
  if (pm)
1316
0
      pm += 4;
1317
0
    }
1318
1319
0
    while (w)
1320
0
    {
1321
0
  s = combine1 (ps, pm);
1322
0
  d = *pd;
1323
1324
0
  *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1325
0
  w--;
1326
0
  ps++;
1327
0
  if (pm)
1328
0
      pm++;
1329
0
    }
1330
0
}
1331
1332
static force_inline void
1333
sse2_combine_add_u (pixman_implementation_t *imp,
1334
                    pixman_op_t              op,
1335
                    uint32_t *               dst,
1336
                    const uint32_t *         src,
1337
                    const uint32_t *         mask,
1338
                    int                      width)
1339
0
{
1340
0
    int w = width;
1341
0
    uint32_t s, d;
1342
0
    uint32_t* pd = dst;
1343
0
    const uint32_t* ps = src;
1344
0
    const uint32_t* pm = mask;
1345
1346
0
    while (w && (uintptr_t)pd & 15)
1347
0
    {
1348
0
  s = combine1 (ps, pm);
1349
0
  d = *pd;
1350
1351
0
  ps++;
1352
0
  if (pm)
1353
0
      pm++;
1354
0
  *pd++ = _mm_cvtsi128_si32 (
1355
0
      _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1356
0
  w--;
1357
0
    }
1358
1359
0
    while (w >= 4)
1360
0
    {
1361
0
  __m128i s;
1362
1363
0
  s = combine4 ((__m128i*)ps, (__m128i*)pm);
1364
1365
0
  save_128_aligned (
1366
0
      (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1367
1368
0
  pd += 4;
1369
0
  ps += 4;
1370
0
  if (pm)
1371
0
      pm += 4;
1372
0
  w -= 4;
1373
0
    }
1374
1375
0
    while (w--)
1376
0
    {
1377
0
  s = combine1 (ps, pm);
1378
0
  d = *pd;
1379
1380
0
  ps++;
1381
0
  *pd++ = _mm_cvtsi128_si32 (
1382
0
      _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1383
0
  if (pm)
1384
0
      pm++;
1385
0
    }
1386
0
}
1387
1388
static force_inline uint32_t
1389
core_combine_saturate_u_pixel_sse2 (uint32_t src,
1390
                                    uint32_t dst)
1391
0
{
1392
0
    __m128i ms = unpack_32_1x128 (src);
1393
0
    __m128i md = unpack_32_1x128 (dst);
1394
0
    uint32_t sa = src >> 24;
1395
0
    uint32_t da = ~dst >> 24;
1396
1397
0
    if (sa > da)
1398
0
    {
1399
0
  ms = pix_multiply_1x128 (
1400
0
      ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1401
0
    }
1402
1403
0
    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1404
0
}
1405
1406
static void
1407
sse2_combine_saturate_u (pixman_implementation_t *imp,
1408
                         pixman_op_t              op,
1409
                         uint32_t *               pd,
1410
                         const uint32_t *         ps,
1411
                         const uint32_t *         pm,
1412
                         int                      w)
1413
0
{
1414
0
    uint32_t s, d;
1415
1416
0
    uint32_t pack_cmp;
1417
0
    __m128i xmm_src, xmm_dst;
1418
1419
0
    while (w && (uintptr_t)pd & 15)
1420
0
    {
1421
0
  s = combine1 (ps, pm);
1422
0
  d = *pd;
1423
1424
0
  *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1425
0
  w--;
1426
0
  ps++;
1427
0
  if (pm)
1428
0
      pm++;
1429
0
    }
1430
1431
0
    while (w >= 4)
1432
0
    {
1433
0
  xmm_dst = load_128_aligned  ((__m128i*)pd);
1434
0
  xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1435
1436
0
  pack_cmp = _mm_movemask_epi8 (
1437
0
      _mm_cmpgt_epi32 (
1438
0
    _mm_srli_epi32 (xmm_src, 24),
1439
0
    _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1440
1441
  /* if some source alpha is greater than the respective ~dst alpha */
1442
0
  if (pack_cmp)
1443
0
  {
1444
0
      s = combine1 (ps++, pm);
1445
0
      d = *pd;
1446
0
      *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1447
0
      if (pm)
1448
0
    pm++;
1449
1450
0
      s = combine1 (ps++, pm);
1451
0
      d = *pd;
1452
0
      *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1453
0
      if (pm)
1454
0
    pm++;
1455
1456
0
      s = combine1 (ps++, pm);
1457
0
      d = *pd;
1458
0
      *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1459
0
      if (pm)
1460
0
    pm++;
1461
1462
0
      s = combine1 (ps++, pm);
1463
0
      d = *pd;
1464
0
      *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1465
0
      if (pm)
1466
0
    pm++;
1467
0
  }
1468
0
  else
1469
0
  {
1470
0
      save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1471
1472
0
      pd += 4;
1473
0
      ps += 4;
1474
0
      if (pm)
1475
0
    pm += 4;
1476
0
  }
1477
1478
0
  w -= 4;
1479
0
    }
1480
1481
0
    while (w--)
1482
0
    {
1483
0
  s = combine1 (ps, pm);
1484
0
  d = *pd;
1485
1486
0
  *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1487
0
  ps++;
1488
0
  if (pm)
1489
0
      pm++;
1490
0
    }
1491
0
}
1492
1493
static void
1494
sse2_combine_src_ca (pixman_implementation_t *imp,
1495
                     pixman_op_t              op,
1496
                     uint32_t *               pd,
1497
                     const uint32_t *         ps,
1498
                     const uint32_t *         pm,
1499
                     int                      w)
1500
0
{
1501
0
    uint32_t s, m;
1502
1503
0
    __m128i xmm_src_lo, xmm_src_hi;
1504
0
    __m128i xmm_mask_lo, xmm_mask_hi;
1505
0
    __m128i xmm_dst_lo, xmm_dst_hi;
1506
1507
0
    while (w && (uintptr_t)pd & 15)
1508
0
    {
1509
0
  s = *ps++;
1510
0
  m = *pm++;
1511
0
  *pd++ = pack_1x128_32 (
1512
0
      pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1513
0
  w--;
1514
0
    }
1515
1516
0
    while (w >= 4)
1517
0
    {
1518
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1519
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1520
1521
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1522
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1523
1524
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1525
0
          &xmm_mask_lo, &xmm_mask_hi,
1526
0
          &xmm_dst_lo, &xmm_dst_hi);
1527
1528
0
  save_128_aligned (
1529
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1530
1531
0
  ps += 4;
1532
0
  pd += 4;
1533
0
  pm += 4;
1534
0
  w -= 4;
1535
0
    }
1536
1537
0
    while (w)
1538
0
    {
1539
0
  s = *ps++;
1540
0
  m = *pm++;
1541
0
  *pd++ = pack_1x128_32 (
1542
0
      pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1543
0
  w--;
1544
0
    }
1545
0
}
1546
1547
static force_inline uint32_t
1548
core_combine_over_ca_pixel_sse2 (uint32_t src,
1549
                                 uint32_t mask,
1550
                                 uint32_t dst)
1551
0
{
1552
0
    __m128i s = unpack_32_1x128 (src);
1553
0
    __m128i expAlpha = expand_alpha_1x128 (s);
1554
0
    __m128i unpk_mask = unpack_32_1x128 (mask);
1555
0
    __m128i unpk_dst  = unpack_32_1x128 (dst);
1556
1557
0
    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1558
0
}
1559
1560
static void
1561
sse2_combine_over_ca (pixman_implementation_t *imp,
1562
                      pixman_op_t              op,
1563
                      uint32_t *               pd,
1564
                      const uint32_t *         ps,
1565
                      const uint32_t *         pm,
1566
                      int                      w)
1567
0
{
1568
0
    uint32_t s, m, d;
1569
1570
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1571
0
    __m128i xmm_src_lo, xmm_src_hi;
1572
0
    __m128i xmm_dst_lo, xmm_dst_hi;
1573
0
    __m128i xmm_mask_lo, xmm_mask_hi;
1574
1575
0
    while (w && (uintptr_t)pd & 15)
1576
0
    {
1577
0
  s = *ps++;
1578
0
  m = *pm++;
1579
0
  d = *pd;
1580
1581
0
  *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1582
0
  w--;
1583
0
    }
1584
1585
0
    while (w >= 4)
1586
0
    {
1587
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1588
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1589
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1590
1591
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1592
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1593
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1594
1595
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1596
0
          &xmm_alpha_lo, &xmm_alpha_hi);
1597
1598
0
  in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1599
0
           &xmm_alpha_lo, &xmm_alpha_hi,
1600
0
           &xmm_mask_lo, &xmm_mask_hi,
1601
0
           &xmm_dst_lo, &xmm_dst_hi);
1602
1603
0
  save_128_aligned (
1604
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1605
1606
0
  ps += 4;
1607
0
  pd += 4;
1608
0
  pm += 4;
1609
0
  w -= 4;
1610
0
    }
1611
1612
0
    while (w)
1613
0
    {
1614
0
  s = *ps++;
1615
0
  m = *pm++;
1616
0
  d = *pd;
1617
1618
0
  *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1619
0
  w--;
1620
0
    }
1621
0
}
1622
1623
static force_inline uint32_t
1624
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1625
                                         uint32_t mask,
1626
                                         uint32_t dst)
1627
0
{
1628
0
    __m128i d = unpack_32_1x128 (dst);
1629
1630
0
    return pack_1x128_32 (
1631
0
  over_1x128 (d, expand_alpha_1x128 (d),
1632
0
        pix_multiply_1x128 (unpack_32_1x128 (src),
1633
0
          unpack_32_1x128 (mask))));
1634
0
}
1635
1636
static void
1637
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1638
                              pixman_op_t              op,
1639
                              uint32_t *               pd,
1640
                              const uint32_t *         ps,
1641
                              const uint32_t *         pm,
1642
                              int                      w)
1643
0
{
1644
0
    uint32_t s, m, d;
1645
1646
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1647
0
    __m128i xmm_src_lo, xmm_src_hi;
1648
0
    __m128i xmm_dst_lo, xmm_dst_hi;
1649
0
    __m128i xmm_mask_lo, xmm_mask_hi;
1650
1651
0
    while (w && (uintptr_t)pd & 15)
1652
0
    {
1653
0
  s = *ps++;
1654
0
  m = *pm++;
1655
0
  d = *pd;
1656
1657
0
  *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1658
0
  w--;
1659
0
    }
1660
1661
0
    while (w >= 4)
1662
0
    {
1663
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1664
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1665
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1666
1667
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1668
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1669
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1670
1671
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1672
0
          &xmm_alpha_lo, &xmm_alpha_hi);
1673
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1674
0
          &xmm_mask_lo, &xmm_mask_hi,
1675
0
          &xmm_mask_lo, &xmm_mask_hi);
1676
1677
0
  over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1678
0
        &xmm_alpha_lo, &xmm_alpha_hi,
1679
0
        &xmm_mask_lo, &xmm_mask_hi);
1680
1681
0
  save_128_aligned (
1682
0
      (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1683
1684
0
  ps += 4;
1685
0
  pd += 4;
1686
0
  pm += 4;
1687
0
  w -= 4;
1688
0
    }
1689
1690
0
    while (w)
1691
0
    {
1692
0
  s = *ps++;
1693
0
  m = *pm++;
1694
0
  d = *pd;
1695
1696
0
  *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1697
0
  w--;
1698
0
    }
1699
0
}
1700
1701
static void
1702
sse2_combine_in_ca (pixman_implementation_t *imp,
1703
                    pixman_op_t              op,
1704
                    uint32_t *               pd,
1705
                    const uint32_t *         ps,
1706
                    const uint32_t *         pm,
1707
                    int                      w)
1708
0
{
1709
0
    uint32_t s, m, d;
1710
1711
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1712
0
    __m128i xmm_src_lo, xmm_src_hi;
1713
0
    __m128i xmm_dst_lo, xmm_dst_hi;
1714
0
    __m128i xmm_mask_lo, xmm_mask_hi;
1715
1716
0
    while (w && (uintptr_t)pd & 15)
1717
0
    {
1718
0
  s = *ps++;
1719
0
  m = *pm++;
1720
0
  d = *pd;
1721
1722
0
  *pd++ = pack_1x128_32 (
1723
0
      pix_multiply_1x128 (
1724
0
    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1725
0
    expand_alpha_1x128 (unpack_32_1x128 (d))));
1726
1727
0
  w--;
1728
0
    }
1729
1730
0
    while (w >= 4)
1731
0
    {
1732
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1733
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1734
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1735
1736
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1737
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1738
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1739
1740
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1741
0
          &xmm_alpha_lo, &xmm_alpha_hi);
1742
1743
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1744
0
          &xmm_mask_lo, &xmm_mask_hi,
1745
0
          &xmm_dst_lo, &xmm_dst_hi);
1746
1747
0
  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1748
0
          &xmm_alpha_lo, &xmm_alpha_hi,
1749
0
          &xmm_dst_lo, &xmm_dst_hi);
1750
1751
0
  save_128_aligned (
1752
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1753
1754
0
  ps += 4;
1755
0
  pd += 4;
1756
0
  pm += 4;
1757
0
  w -= 4;
1758
0
    }
1759
1760
0
    while (w)
1761
0
    {
1762
0
  s = *ps++;
1763
0
  m = *pm++;
1764
0
  d = *pd;
1765
1766
0
  *pd++ = pack_1x128_32 (
1767
0
      pix_multiply_1x128 (
1768
0
    pix_multiply_1x128 (
1769
0
        unpack_32_1x128 (s), unpack_32_1x128 (m)),
1770
0
    expand_alpha_1x128 (unpack_32_1x128 (d))));
1771
1772
0
  w--;
1773
0
    }
1774
0
}
1775
1776
static void
1777
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1778
                            pixman_op_t              op,
1779
                            uint32_t *               pd,
1780
                            const uint32_t *         ps,
1781
                            const uint32_t *         pm,
1782
                            int                      w)
1783
0
{
1784
0
    uint32_t s, m, d;
1785
1786
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1787
0
    __m128i xmm_src_lo, xmm_src_hi;
1788
0
    __m128i xmm_dst_lo, xmm_dst_hi;
1789
0
    __m128i xmm_mask_lo, xmm_mask_hi;
1790
1791
0
    while (w && (uintptr_t)pd & 15)
1792
0
    {
1793
0
  s = *ps++;
1794
0
  m = *pm++;
1795
0
  d = *pd;
1796
1797
0
  *pd++ = pack_1x128_32 (
1798
0
      pix_multiply_1x128 (
1799
0
    unpack_32_1x128 (d),
1800
0
    pix_multiply_1x128 (unpack_32_1x128 (m),
1801
0
           expand_alpha_1x128 (unpack_32_1x128 (s)))));
1802
0
  w--;
1803
0
    }
1804
1805
0
    while (w >= 4)
1806
0
    {
1807
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1808
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1809
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1810
1811
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1812
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1813
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1814
1815
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1816
0
          &xmm_alpha_lo, &xmm_alpha_hi);
1817
0
  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1818
0
          &xmm_alpha_lo, &xmm_alpha_hi,
1819
0
          &xmm_alpha_lo, &xmm_alpha_hi);
1820
1821
0
  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1822
0
          &xmm_alpha_lo, &xmm_alpha_hi,
1823
0
          &xmm_dst_lo, &xmm_dst_hi);
1824
1825
0
  save_128_aligned (
1826
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1827
1828
0
  ps += 4;
1829
0
  pd += 4;
1830
0
  pm += 4;
1831
0
  w -= 4;
1832
0
    }
1833
1834
0
    while (w)
1835
0
    {
1836
0
  s = *ps++;
1837
0
  m = *pm++;
1838
0
  d = *pd;
1839
1840
0
  *pd++ = pack_1x128_32 (
1841
0
      pix_multiply_1x128 (
1842
0
    unpack_32_1x128 (d),
1843
0
    pix_multiply_1x128 (unpack_32_1x128 (m),
1844
0
           expand_alpha_1x128 (unpack_32_1x128 (s)))));
1845
0
  w--;
1846
0
    }
1847
0
}
1848
1849
static void
1850
sse2_combine_out_ca (pixman_implementation_t *imp,
1851
                     pixman_op_t              op,
1852
                     uint32_t *               pd,
1853
                     const uint32_t *         ps,
1854
                     const uint32_t *         pm,
1855
                     int                      w)
1856
0
{
1857
0
    uint32_t s, m, d;
1858
1859
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1860
0
    __m128i xmm_src_lo, xmm_src_hi;
1861
0
    __m128i xmm_dst_lo, xmm_dst_hi;
1862
0
    __m128i xmm_mask_lo, xmm_mask_hi;
1863
1864
0
    while (w && (uintptr_t)pd & 15)
1865
0
    {
1866
0
  s = *ps++;
1867
0
  m = *pm++;
1868
0
  d = *pd;
1869
1870
0
  *pd++ = pack_1x128_32 (
1871
0
      pix_multiply_1x128 (
1872
0
    pix_multiply_1x128 (
1873
0
        unpack_32_1x128 (s), unpack_32_1x128 (m)),
1874
0
    negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1875
0
  w--;
1876
0
    }
1877
1878
0
    while (w >= 4)
1879
0
    {
1880
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1881
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1882
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1883
1884
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1885
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1886
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1887
1888
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1889
0
          &xmm_alpha_lo, &xmm_alpha_hi);
1890
0
  negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1891
0
          &xmm_alpha_lo, &xmm_alpha_hi);
1892
1893
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1894
0
          &xmm_mask_lo, &xmm_mask_hi,
1895
0
          &xmm_dst_lo, &xmm_dst_hi);
1896
0
  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1897
0
          &xmm_alpha_lo, &xmm_alpha_hi,
1898
0
          &xmm_dst_lo, &xmm_dst_hi);
1899
1900
0
  save_128_aligned (
1901
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1902
1903
0
  ps += 4;
1904
0
  pd += 4;
1905
0
  pm += 4;
1906
0
  w -= 4;
1907
0
    }
1908
1909
0
    while (w)
1910
0
    {
1911
0
  s = *ps++;
1912
0
  m = *pm++;
1913
0
  d = *pd;
1914
1915
0
  *pd++ = pack_1x128_32 (
1916
0
      pix_multiply_1x128 (
1917
0
    pix_multiply_1x128 (
1918
0
        unpack_32_1x128 (s), unpack_32_1x128 (m)),
1919
0
    negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1920
1921
0
  w--;
1922
0
    }
1923
0
}
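
The same scalar view of OUT with component alpha, read off the combiner above (a sketch using mul_un8 from the earlier block, not the code the counts refer to):

/* OUT (CA), one channel: d' = (s * m) * (255 - alpha(d)) */
static uint8_t
out_ca_channel (uint8_t s, uint8_t m, uint8_t da)
{
    return mul_un8 (mul_un8 (s, m), 255 - da);
}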
1924
1925
static void
1926
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1927
                             pixman_op_t              op,
1928
                             uint32_t *               pd,
1929
                             const uint32_t *         ps,
1930
                             const uint32_t *         pm,
1931
                             int                      w)
1932
0
{
1933
0
    uint32_t s, m, d;
1934
1935
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1936
0
    __m128i xmm_src_lo, xmm_src_hi;
1937
0
    __m128i xmm_dst_lo, xmm_dst_hi;
1938
0
    __m128i xmm_mask_lo, xmm_mask_hi;
1939
1940
0
    while (w && (uintptr_t)pd & 15)
1941
0
    {
1942
0
  s = *ps++;
1943
0
  m = *pm++;
1944
0
  d = *pd;
1945
1946
0
  *pd++ = pack_1x128_32 (
1947
0
      pix_multiply_1x128 (
1948
0
    unpack_32_1x128 (d),
1949
0
    negate_1x128 (pix_multiply_1x128 (
1950
0
         unpack_32_1x128 (m),
1951
0
         expand_alpha_1x128 (unpack_32_1x128 (s))))));
1952
0
  w--;
1953
0
    }
1954
1955
0
    while (w >= 4)
1956
0
    {
1957
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1958
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1959
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1960
1961
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1962
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1963
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1964
1965
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1966
0
          &xmm_alpha_lo, &xmm_alpha_hi);
1967
1968
0
  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1969
0
          &xmm_alpha_lo, &xmm_alpha_hi,
1970
0
          &xmm_mask_lo, &xmm_mask_hi);
1971
1972
0
  negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1973
0
          &xmm_mask_lo, &xmm_mask_hi);
1974
1975
0
  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1976
0
          &xmm_mask_lo, &xmm_mask_hi,
1977
0
          &xmm_dst_lo, &xmm_dst_hi);
1978
1979
0
  save_128_aligned (
1980
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1981
1982
0
  ps += 4;
1983
0
  pd += 4;
1984
0
  pm += 4;
1985
0
  w -= 4;
1986
0
    }
1987
1988
0
    while (w)
1989
0
    {
1990
0
  s = *ps++;
1991
0
  m = *pm++;
1992
0
  d = *pd;
1993
1994
0
  *pd++ = pack_1x128_32 (
1995
0
      pix_multiply_1x128 (
1996
0
    unpack_32_1x128 (d),
1997
0
    negate_1x128 (pix_multiply_1x128 (
1998
0
         unpack_32_1x128 (m),
1999
0
         expand_alpha_1x128 (unpack_32_1x128 (s))))));
2000
0
  w--;
2001
0
    }
2002
0
}
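
OUT_REVERSE with component alpha, per channel (scalar sketch, mul_un8 as above):

/* OUT_REVERSE (CA), one channel: d' = d * (255 - m * alpha(s)) */
static uint8_t
out_reverse_ca_channel (uint8_t d, uint8_t m, uint8_t sa)
{
    return mul_un8 (d, 255 - mul_un8 (m, sa));
}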
2003
2004
static force_inline uint32_t
2005
core_combine_atop_ca_pixel_sse2 (uint32_t src,
2006
                                 uint32_t mask,
2007
                                 uint32_t dst)
2008
0
{
2009
0
    __m128i m = unpack_32_1x128 (mask);
2010
0
    __m128i s = unpack_32_1x128 (src);
2011
0
    __m128i d = unpack_32_1x128 (dst);
2012
0
    __m128i sa = expand_alpha_1x128 (s);
2013
0
    __m128i da = expand_alpha_1x128 (d);
2014
2015
0
    s = pix_multiply_1x128 (s, m);
2016
0
    m = negate_1x128 (pix_multiply_1x128 (m, sa));
2017
2018
0
    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2019
0
}
2020
2021
static void
2022
sse2_combine_atop_ca (pixman_implementation_t *imp,
2023
                      pixman_op_t              op,
2024
                      uint32_t *               pd,
2025
                      const uint32_t *         ps,
2026
                      const uint32_t *         pm,
2027
                      int                      w)
2028
0
{
2029
0
    uint32_t s, m, d;
2030
2031
0
    __m128i xmm_src_lo, xmm_src_hi;
2032
0
    __m128i xmm_dst_lo, xmm_dst_hi;
2033
0
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2034
0
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2035
0
    __m128i xmm_mask_lo, xmm_mask_hi;
2036
2037
0
    while (w && (uintptr_t)pd & 15)
2038
0
    {
2039
0
  s = *ps++;
2040
0
  m = *pm++;
2041
0
  d = *pd;
2042
2043
0
  *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2044
0
  w--;
2045
0
    }
2046
2047
0
    while (w >= 4)
2048
0
    {
2049
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2050
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2051
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2052
2053
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2054
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2055
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2056
2057
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2058
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2059
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2060
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2061
2062
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2063
0
          &xmm_mask_lo, &xmm_mask_hi,
2064
0
          &xmm_src_lo, &xmm_src_hi);
2065
0
  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2066
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2067
0
          &xmm_mask_lo, &xmm_mask_hi);
2068
2069
0
  negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2070
2071
0
  pix_add_multiply_2x128 (
2072
0
      &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2073
0
      &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2074
0
      &xmm_dst_lo, &xmm_dst_hi);
2075
2076
0
  save_128_aligned (
2077
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2078
2079
0
  ps += 4;
2080
0
  pd += 4;
2081
0
  pm += 4;
2082
0
  w -= 4;
2083
0
    }
2084
2085
0
    while (w)
2086
0
    {
2087
0
  s = *ps++;
2088
0
  m = *pm++;
2089
0
  d = *pd;
2090
2091
0
  *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2092
0
  w--;
2093
0
    }
2094
0
}
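
Per channel, pix_add_multiply computes a saturated sum of two byte products; the sketch below models the ATOP combiner that way (add_sat_un8 is my name for the clamped add, assuming the same saturation the packed SSE2 adds provide):

/* Saturating byte add, as _mm_adds_epu8/_mm_adds_epu16 do per lane. */
static uint8_t
add_sat_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = a + b;
    return t > 0xff ? 0xff : (uint8_t)t;
}

/* ATOP (CA), one channel:
 * d' = (s * m) * alpha(d) + d * (255 - m * alpha(s)) */
static uint8_t
atop_ca_channel (uint8_t s, uint8_t m, uint8_t d, uint8_t sa, uint8_t da)
{
    return add_sat_un8 (mul_un8 (mul_un8 (s, m), da),
                        mul_un8 (d, 255 - mul_un8 (m, sa)));
}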
2095
2096
static force_inline uint32_t
2097
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2098
                                         uint32_t mask,
2099
                                         uint32_t dst)
2100
0
{
2101
0
    __m128i m = unpack_32_1x128 (mask);
2102
0
    __m128i s = unpack_32_1x128 (src);
2103
0
    __m128i d = unpack_32_1x128 (dst);
2104
2105
0
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2106
0
    __m128i sa = expand_alpha_1x128 (s);
2107
2108
0
    s = pix_multiply_1x128 (s, m);
2109
0
    m = pix_multiply_1x128 (m, sa);
2110
2111
0
    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2112
0
}
2113
2114
static void
2115
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2116
                              pixman_op_t              op,
2117
                              uint32_t *               pd,
2118
                              const uint32_t *         ps,
2119
                              const uint32_t *         pm,
2120
                              int                      w)
2121
0
{
2122
0
    uint32_t s, m, d;
2123
2124
0
    __m128i xmm_src_lo, xmm_src_hi;
2125
0
    __m128i xmm_dst_lo, xmm_dst_hi;
2126
0
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2127
0
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2128
0
    __m128i xmm_mask_lo, xmm_mask_hi;
2129
2130
0
    while (w && (uintptr_t)pd & 15)
2131
0
    {
2132
0
  s = *ps++;
2133
0
  m = *pm++;
2134
0
  d = *pd;
2135
2136
0
  *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2137
0
  w--;
2138
0
    }
2139
2140
0
    while (w >= 4)
2141
0
    {
2142
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2143
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2144
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2145
2146
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2147
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2148
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2149
2150
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2151
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2152
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2153
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2154
2155
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2156
0
          &xmm_mask_lo, &xmm_mask_hi,
2157
0
          &xmm_src_lo, &xmm_src_hi);
2158
0
  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2159
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2160
0
          &xmm_mask_lo, &xmm_mask_hi);
2161
2162
0
  negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2163
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2164
2165
0
  pix_add_multiply_2x128 (
2166
0
      &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2167
0
      &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2168
0
      &xmm_dst_lo, &xmm_dst_hi);
2169
2170
0
  save_128_aligned (
2171
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2172
2173
0
  ps += 4;
2174
0
  pd += 4;
2175
0
  pm += 4;
2176
0
  w -= 4;
2177
0
    }
2178
2179
0
    while (w)
2180
0
    {
2181
0
  s = *ps++;
2182
0
  m = *pm++;
2183
0
  d = *pd;
2184
2185
0
  *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2186
0
  w--;
2187
0
    }
2188
0
}
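
ATOP_REVERSE differs only in which alpha is negated; a scalar sketch (mul_un8 and add_sat_un8 as in the earlier blocks):

/* ATOP_REVERSE (CA), one channel:
 * d' = (s * m) * (255 - alpha(d)) + d * (m * alpha(s)) */
static uint8_t
atop_reverse_ca_channel (uint8_t s, uint8_t m, uint8_t d,
                         uint8_t sa, uint8_t da)
{
    return add_sat_un8 (mul_un8 (mul_un8 (s, m), 255 - da),
                        mul_un8 (d, mul_un8 (m, sa)));
}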
2189
2190
static force_inline uint32_t
2191
core_combine_xor_ca_pixel_sse2 (uint32_t src,
2192
                                uint32_t mask,
2193
                                uint32_t dst)
2194
0
{
2195
0
    __m128i a = unpack_32_1x128 (mask);
2196
0
    __m128i s = unpack_32_1x128 (src);
2197
0
    __m128i d = unpack_32_1x128 (dst);
2198
2199
0
    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2200
0
               a, expand_alpha_1x128 (s)));
2201
0
    __m128i dest      = pix_multiply_1x128 (s, a);
2202
0
    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2203
2204
0
    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2205
0
                                                &alpha_dst,
2206
0
                                                &dest,
2207
0
                                                &alpha_src));
2208
0
}
2209
2210
static void
2211
sse2_combine_xor_ca (pixman_implementation_t *imp,
2212
                     pixman_op_t              op,
2213
                     uint32_t *               pd,
2214
                     const uint32_t *         ps,
2215
                     const uint32_t *         pm,
2216
                     int                      w)
2217
0
{
2218
0
    uint32_t s, m, d;
2219
2220
0
    __m128i xmm_src_lo, xmm_src_hi;
2221
0
    __m128i xmm_dst_lo, xmm_dst_hi;
2222
0
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2223
0
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2224
0
    __m128i xmm_mask_lo, xmm_mask_hi;
2225
2226
0
    while (w && (uintptr_t)pd & 15)
2227
0
    {
2228
0
  s = *ps++;
2229
0
  m = *pm++;
2230
0
  d = *pd;
2231
2232
0
  *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2233
0
  w--;
2234
0
    }
2235
2236
0
    while (w >= 4)
2237
0
    {
2238
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2239
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2240
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2241
2242
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2243
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2244
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2245
2246
0
  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2247
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2248
0
  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2249
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2250
2251
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2252
0
          &xmm_mask_lo, &xmm_mask_hi,
2253
0
          &xmm_src_lo, &xmm_src_hi);
2254
0
  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2255
0
          &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2256
0
          &xmm_mask_lo, &xmm_mask_hi);
2257
2258
0
  negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2259
0
          &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2260
0
  negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2261
0
          &xmm_mask_lo, &xmm_mask_hi);
2262
2263
0
  pix_add_multiply_2x128 (
2264
0
      &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2265
0
      &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2266
0
      &xmm_dst_lo, &xmm_dst_hi);
2267
2268
0
  save_128_aligned (
2269
0
      (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2270
2271
0
  ps += 4;
2272
0
  pd += 4;
2273
0
  pm += 4;
2274
0
  w -= 4;
2275
0
    }
2276
2277
0
    while (w)
2278
0
    {
2279
0
  s = *ps++;
2280
0
  m = *pm++;
2281
0
  d = *pd;
2282
2283
0
  *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2284
0
  w--;
2285
0
    }
2286
0
}
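
XOR negates both alphas before the add-multiply; in scalar form (same illustrative helpers as above):

/* XOR (CA), one channel:
 * d' = (s * m) * (255 - alpha(d)) + d * (255 - m * alpha(s)) */
static uint8_t
xor_ca_channel (uint8_t s, uint8_t m, uint8_t d, uint8_t sa, uint8_t da)
{
    return add_sat_un8 (mul_un8 (mul_un8 (s, m), 255 - da),
                        mul_un8 (d, 255 - mul_un8 (m, sa)));
}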
2287
2288
static void
2289
sse2_combine_add_ca (pixman_implementation_t *imp,
2290
                     pixman_op_t              op,
2291
                     uint32_t *               pd,
2292
                     const uint32_t *         ps,
2293
                     const uint32_t *         pm,
2294
                     int                      w)
2295
0
{
2296
0
    uint32_t s, m, d;
2297
2298
0
    __m128i xmm_src_lo, xmm_src_hi;
2299
0
    __m128i xmm_dst_lo, xmm_dst_hi;
2300
0
    __m128i xmm_mask_lo, xmm_mask_hi;
2301
2302
0
    while (w && (uintptr_t)pd & 15)
2303
0
    {
2304
0
  s = *ps++;
2305
0
  m = *pm++;
2306
0
  d = *pd;
2307
2308
0
  *pd++ = pack_1x128_32 (
2309
0
      _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2310
0
                 unpack_32_1x128 (m)),
2311
0
         unpack_32_1x128 (d)));
2312
0
  w--;
2313
0
    }
2314
2315
0
    while (w >= 4)
2316
0
    {
2317
0
  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2318
0
  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2319
0
  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2320
2321
0
  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2322
0
  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2323
0
  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2324
2325
0
  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2326
0
          &xmm_mask_lo, &xmm_mask_hi,
2327
0
          &xmm_src_lo, &xmm_src_hi);
2328
2329
0
  save_128_aligned (
2330
0
      (__m128i*)pd, pack_2x128_128 (
2331
0
    _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2332
0
    _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2333
2334
0
  ps += 4;
2335
0
  pd += 4;
2336
0
  pm += 4;
2337
0
  w -= 4;
2338
0
    }
2339
2340
0
    while (w)
2341
0
    {
2342
0
  s = *ps++;
2343
0
  m = *pm++;
2344
0
  d = *pd;
2345
2346
0
  *pd++ = pack_1x128_32 (
2347
0
      _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2348
0
                 unpack_32_1x128 (m)),
2349
0
         unpack_32_1x128 (d)));
2350
0
  w--;
2351
0
    }
2352
0
}
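
The ADD combiner is simply a masked source added to the destination with byte saturation; a scalar model of the _mm_adds_epu8 above (mul_un8 as in the first sketch):

/* ADD (CA), one channel: d' = clamp(s * m + d) */
static uint8_t
add_ca_channel (uint8_t s, uint8_t m, uint8_t d)
{
    uint16_t t = mul_un8 (s, m) + d;
    return t > 0xff ? 0xff : (uint8_t)t;
}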
2353
2354
static force_inline __m128i
2355
create_mask_16_128 (uint16_t mask)
2356
48
{
2357
48
    return _mm_set1_epi16 (mask);
2358
48
}
2359
2360
/* Work around a code generation bug in Sun Studio 12. */
2361
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2362
# define create_mask_2x32_128(mask0, mask1)       \
2363
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2364
#else
2365
static force_inline __m128i
2366
create_mask_2x32_128 (uint32_t mask0,
2367
                      uint32_t mask1)
2368
166
{
2369
166
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2370
166
}
2371
#endif
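
For illustration, this is how the composite routines below typically use these helpers to hoist loop-invariant constants out of the pixel loops; the wrapper function is a sketch, not part of pixman:

static void
broadcast_example (uint32_t src, uint32_t mask,
                   __m128i *out_src, __m128i *out_alpha)
{
    /* replicate one 32-bit pixel into all four lanes */
    *out_src = create_mask_2x32_128 (src, src);

    /* broadcast the mask's 8-bit alpha into all eight 16-bit lanes */
    *out_alpha = create_mask_16_128 (mask >> 24);
}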
2372
2373
static void
2374
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2375
                            pixman_composite_info_t *info)
2376
0
{
2377
0
    PIXMAN_COMPOSITE_ARGS (info);
2378
0
    uint32_t src;
2379
0
    uint32_t    *dst_line, *dst, d;
2380
0
    int32_t w;
2381
0
    int dst_stride;
2382
0
    __m128i xmm_src, xmm_alpha;
2383
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2384
2385
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2386
2387
0
    if (src == 0)
2388
0
  return;
2389
2390
0
    PIXMAN_IMAGE_GET_LINE (
2391
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2392
2393
0
    xmm_src = expand_pixel_32_1x128 (src);
2394
0
    xmm_alpha = expand_alpha_1x128 (xmm_src);
2395
2396
0
    while (height--)
2397
0
    {
2398
0
  dst = dst_line;
2399
2400
0
  dst_line += dst_stride;
2401
0
  w = width;
2402
2403
0
  while (w && (uintptr_t)dst & 15)
2404
0
  {
2405
0
      d = *dst;
2406
0
      *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2407
0
            xmm_alpha,
2408
0
            unpack_32_1x128 (d)));
2409
0
      w--;
2410
0
  }
2411
2412
0
  while (w >= 4)
2413
0
  {
2414
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
2415
2416
0
      unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2417
2418
0
      over_2x128 (&xmm_src, &xmm_src,
2419
0
      &xmm_alpha, &xmm_alpha,
2420
0
      &xmm_dst_lo, &xmm_dst_hi);
2421
2422
      /* rebuild the 4 pixel data and save */
2423
0
      save_128_aligned (
2424
0
    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2425
2426
0
      w -= 4;
2427
0
      dst += 4;
2428
0
  }
2429
2430
0
  while (w)
2431
0
  {
2432
0
      d = *dst;
2433
0
      *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2434
0
            xmm_alpha,
2435
0
            unpack_32_1x128 (d)));
2436
0
      w--;
2437
0
  }
2438
2439
0
    }
2440
0
}
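
What over_1x128/over_2x128 compute per channel is the premultiplied OVER operator; a scalar sketch (mul_un8 as in the first block, saturating add as per the packed adds):

/* Premultiplied OVER, one channel: d' = s + d * (255 - alpha(s)) */
static uint8_t
over_channel (uint8_t s, uint8_t sa, uint8_t d)
{
    uint16_t t = s + mul_un8 (d, 255 - sa);
    return t > 0xff ? 0xff : (uint8_t)t;
}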
2441
2442
static void
2443
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2444
                            pixman_composite_info_t *info)
2445
0
{
2446
0
    PIXMAN_COMPOSITE_ARGS (info);
2447
0
    uint32_t src;
2448
0
    uint16_t    *dst_line, *dst, d;
2449
0
    int32_t w;
2450
0
    int dst_stride;
2451
0
    __m128i xmm_src, xmm_alpha;
2452
0
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2453
2454
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2455
2456
0
    if (src == 0)
2457
0
  return;
2458
2459
0
    PIXMAN_IMAGE_GET_LINE (
2460
0
  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2461
2462
0
    xmm_src = expand_pixel_32_1x128 (src);
2463
0
    xmm_alpha = expand_alpha_1x128 (xmm_src);
2464
2465
0
    while (height--)
2466
0
    {
2467
0
  dst = dst_line;
2468
2469
0
  dst_line += dst_stride;
2470
0
  w = width;
2471
2472
0
  while (w && (uintptr_t)dst & 15)
2473
0
  {
2474
0
      d = *dst;
2475
2476
0
      *dst++ = pack_565_32_16 (
2477
0
    pack_1x128_32 (over_1x128 (xmm_src,
2478
0
             xmm_alpha,
2479
0
             expand565_16_1x128 (d))));
2480
0
      w--;
2481
0
  }
2482
2483
0
  while (w >= 8)
2484
0
  {
2485
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
2486
2487
0
      unpack_565_128_4x128 (xmm_dst,
2488
0
          &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2489
2490
0
      over_2x128 (&xmm_src, &xmm_src,
2491
0
      &xmm_alpha, &xmm_alpha,
2492
0
      &xmm_dst0, &xmm_dst1);
2493
0
      over_2x128 (&xmm_src, &xmm_src,
2494
0
      &xmm_alpha, &xmm_alpha,
2495
0
      &xmm_dst2, &xmm_dst3);
2496
2497
0
      xmm_dst = pack_565_4x128_128 (
2498
0
    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2499
2500
0
      save_128_aligned ((__m128i*)dst, xmm_dst);
2501
2502
0
      dst += 8;
2503
0
      w -= 8;
2504
0
  }
2505
2506
0
  while (w--)
2507
0
  {
2508
0
      d = *dst;
2509
0
      *dst++ = pack_565_32_16 (
2510
0
    pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2511
0
             expand565_16_1x128 (d))));
2512
0
  }
2513
0
    }
2514
2515
0
}
2516
2517
static void
2518
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2519
           pixman_composite_info_t *info)
2520
0
{
2521
0
    PIXMAN_COMPOSITE_ARGS (info);
2522
0
    uint32_t src;
2523
0
    uint32_t    *dst_line, d;
2524
0
    uint32_t    *mask_line, m;
2525
0
    uint32_t pack_cmp;
2526
0
    int dst_stride, mask_stride;
2527
2528
0
    __m128i xmm_src;
2529
0
    __m128i xmm_dst;
2530
0
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2531
2532
0
    __m128i mmx_src, mmx_mask, mmx_dest;
2533
2534
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2535
2536
0
    if (src == 0)
2537
0
  return;
2538
2539
0
    PIXMAN_IMAGE_GET_LINE (
2540
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2541
0
    PIXMAN_IMAGE_GET_LINE (
2542
0
  mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2543
2544
0
    xmm_src = _mm_unpacklo_epi8 (
2545
0
  create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2546
0
    mmx_src   = xmm_src;
2547
2548
0
    while (height--)
2549
0
    {
2550
0
  int w = width;
2551
0
  const uint32_t *pm = (uint32_t *)mask_line;
2552
0
  uint32_t *pd = (uint32_t *)dst_line;
2553
2554
0
  dst_line += dst_stride;
2555
0
  mask_line += mask_stride;
2556
2557
0
  while (w && (uintptr_t)pd & 15)
2558
0
  {
2559
0
      m = *pm++;
2560
2561
0
      if (m)
2562
0
      {
2563
0
    d = *pd;
2564
2565
0
    mmx_mask = unpack_32_1x128 (m);
2566
0
    mmx_dest = unpack_32_1x128 (d);
2567
2568
0
    *pd = pack_1x128_32 (
2569
0
        _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2570
0
           mmx_dest));
2571
0
      }
2572
2573
0
      pd++;
2574
0
      w--;
2575
0
  }
2576
2577
0
  while (w >= 4)
2578
0
  {
2579
0
      xmm_mask = load_128_unaligned ((__m128i*)pm);
2580
2581
0
      pack_cmp =
2582
0
    _mm_movemask_epi8 (
2583
0
        _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2584
2585
      /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
2586
0
      if (pack_cmp != 0xffff)
2587
0
      {
2588
0
    xmm_dst = load_128_aligned ((__m128i*)pd);
2589
2590
0
    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2591
2592
0
    pix_multiply_2x128 (&xmm_src, &xmm_src,
2593
0
            &xmm_mask_lo, &xmm_mask_hi,
2594
0
            &xmm_mask_lo, &xmm_mask_hi);
2595
0
    xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2596
2597
0
    save_128_aligned (
2598
0
        (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2599
0
      }
2600
2601
0
      pd += 4;
2602
0
      pm += 4;
2603
0
      w -= 4;
2604
0
  }
2605
2606
0
  while (w)
2607
0
  {
2608
0
      m = *pm++;
2609
2610
0
      if (m)
2611
0
      {
2612
0
    d = *pd;
2613
2614
0
    mmx_mask = unpack_32_1x128 (m);
2615
0
    mmx_dest = unpack_32_1x128 (d);
2616
2617
0
    *pd = pack_1x128_32 (
2618
0
        _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2619
0
           mmx_dest));
2620
0
      }
2621
2622
0
      pd++;
2623
0
      w--;
2624
0
  }
2625
0
    }
2626
2627
0
}
2628
2629
static void
2630
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2631
                                    pixman_composite_info_t *info)
2632
0
{
2633
0
    PIXMAN_COMPOSITE_ARGS (info);
2634
0
    uint32_t src;
2635
0
    uint32_t    *dst_line, d;
2636
0
    uint32_t    *mask_line, m;
2637
0
    uint32_t pack_cmp;
2638
0
    int dst_stride, mask_stride;
2639
2640
0
    __m128i xmm_src, xmm_alpha;
2641
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2642
0
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2643
2644
0
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2645
2646
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2647
2648
0
    if (src == 0)
2649
0
  return;
2650
2651
0
    PIXMAN_IMAGE_GET_LINE (
2652
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2653
0
    PIXMAN_IMAGE_GET_LINE (
2654
0
  mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2655
2656
0
    xmm_src = _mm_unpacklo_epi8 (
2657
0
  create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2658
0
    xmm_alpha = expand_alpha_1x128 (xmm_src);
2659
0
    mmx_src   = xmm_src;
2660
0
    mmx_alpha = xmm_alpha;
2661
2662
0
    while (height--)
2663
0
    {
2664
0
  int w = width;
2665
0
  const uint32_t *pm = (uint32_t *)mask_line;
2666
0
  uint32_t *pd = (uint32_t *)dst_line;
2667
2668
0
  dst_line += dst_stride;
2669
0
  mask_line += mask_stride;
2670
2671
0
  while (w && (uintptr_t)pd & 15)
2672
0
  {
2673
0
      m = *pm++;
2674
2675
0
      if (m)
2676
0
      {
2677
0
    d = *pd;
2678
0
    mmx_mask = unpack_32_1x128 (m);
2679
0
    mmx_dest = unpack_32_1x128 (d);
2680
2681
0
    *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2682
0
                                      &mmx_alpha,
2683
0
                                      &mmx_mask,
2684
0
                                      &mmx_dest));
2685
0
      }
2686
2687
0
      pd++;
2688
0
      w--;
2689
0
  }
2690
2691
0
  while (w >= 4)
2692
0
  {
2693
0
      xmm_mask = load_128_unaligned ((__m128i*)pm);
2694
2695
0
      pack_cmp =
2696
0
    _mm_movemask_epi8 (
2697
0
        _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2698
2699
      /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
2700
0
      if (pack_cmp != 0xffff)
2701
0
      {
2702
0
    xmm_dst = load_128_aligned ((__m128i*)pd);
2703
2704
0
    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2705
0
    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2706
2707
0
    in_over_2x128 (&xmm_src, &xmm_src,
2708
0
             &xmm_alpha, &xmm_alpha,
2709
0
             &xmm_mask_lo, &xmm_mask_hi,
2710
0
             &xmm_dst_lo, &xmm_dst_hi);
2711
2712
0
    save_128_aligned (
2713
0
        (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2714
0
      }
2715
2716
0
      pd += 4;
2717
0
      pm += 4;
2718
0
      w -= 4;
2719
0
  }
2720
2721
0
  while (w)
2722
0
  {
2723
0
      m = *pm++;
2724
2725
0
      if (m)
2726
0
      {
2727
0
    d = *pd;
2728
0
    mmx_mask = unpack_32_1x128 (m);
2729
0
    mmx_dest = unpack_32_1x128 (d);
2730
2731
0
    *pd = pack_1x128_32 (
2732
0
        in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2733
0
      }
2734
2735
0
      pd++;
2736
0
      w--;
2737
0
  }
2738
0
    }
2739
2740
0
}
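
in_over combines the IN and OVER steps; per channel with a component-alpha mask it reduces to the sketch below (helper names illustrative, saturation assumed as in the packed adds):

/* in_over (CA), one channel:
 * d' = clamp(s * m + d * (255 - alpha(s) * m)) */
static uint8_t
in_over_ca_channel (uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
{
    uint16_t t = mul_un8 (s, m) + mul_un8 (d, 255 - mul_un8 (sa, m));
    return t > 0xff ? 0xff : (uint8_t)t;
}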
2741
2742
static void
2743
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2744
                                 pixman_composite_info_t *info)
2745
0
{
2746
0
    PIXMAN_COMPOSITE_ARGS (info);
2747
0
    uint32_t    *dst_line, *dst;
2748
0
    uint32_t    *src_line, *src;
2749
0
    uint32_t mask;
2750
0
    int32_t w;
2751
0
    int dst_stride, src_stride;
2752
2753
0
    __m128i xmm_mask;
2754
0
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2755
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2756
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
2757
2758
0
    PIXMAN_IMAGE_GET_LINE (
2759
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2760
0
    PIXMAN_IMAGE_GET_LINE (
2761
0
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2762
2763
0
    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2764
2765
0
    xmm_mask = create_mask_16_128 (mask >> 24);
2766
2767
0
    while (height--)
2768
0
    {
2769
0
  dst = dst_line;
2770
0
  dst_line += dst_stride;
2771
0
  src = src_line;
2772
0
  src_line += src_stride;
2773
0
  w = width;
2774
2775
0
  while (w && (uintptr_t)dst & 15)
2776
0
  {
2777
0
      uint32_t s = *src++;
2778
2779
0
      if (s)
2780
0
      {
2781
0
    uint32_t d = *dst;
2782
    
2783
0
    __m128i ms = unpack_32_1x128 (s);
2784
0
    __m128i alpha    = expand_alpha_1x128 (ms);
2785
0
    __m128i dest     = xmm_mask;
2786
0
    __m128i alpha_dst = unpack_32_1x128 (d);
2787
    
2788
0
    *dst = pack_1x128_32 (
2789
0
        in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2790
0
      }
2791
0
      dst++;
2792
0
      w--;
2793
0
  }
2794
2795
0
  while (w >= 4)
2796
0
  {
2797
0
      xmm_src = load_128_unaligned ((__m128i*)src);
2798
2799
0
      if (!is_zero (xmm_src))
2800
0
      {
2801
0
    xmm_dst = load_128_aligned ((__m128i*)dst);
2802
    
2803
0
    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2804
0
    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2805
0
    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2806
0
            &xmm_alpha_lo, &xmm_alpha_hi);
2807
    
2808
0
    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2809
0
             &xmm_alpha_lo, &xmm_alpha_hi,
2810
0
             &xmm_mask, &xmm_mask,
2811
0
             &xmm_dst_lo, &xmm_dst_hi);
2812
    
2813
0
    save_128_aligned (
2814
0
        (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2815
0
      }
2816
    
2817
0
      dst += 4;
2818
0
      src += 4;
2819
0
      w -= 4;
2820
0
  }
2821
2822
0
  while (w)
2823
0
  {
2824
0
      uint32_t s = *src++;
2825
2826
0
      if (s)
2827
0
      {
2828
0
    uint32_t d = *dst;
2829
    
2830
0
    __m128i ms = unpack_32_1x128 (s);
2831
0
    __m128i alpha = expand_alpha_1x128 (ms);
2832
0
    __m128i mask  = xmm_mask;
2833
0
    __m128i dest  = unpack_32_1x128 (d);
2834
    
2835
0
    *dst = pack_1x128_32 (
2836
0
        in_over_1x128 (&ms, &alpha, &mask, &dest));
2837
0
      }
2838
2839
0
      dst++;
2840
0
      w--;
2841
0
  }
2842
0
    }
2843
2844
0
}
2845
2846
static void
2847
sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2848
                              pixman_composite_info_t *info)
2849
0
{
2850
0
    PIXMAN_COMPOSITE_ARGS (info);
2851
0
    uint16_t    *dst_line, *dst;
2852
0
    uint32_t    *src_line, *src, s;
2853
0
    int dst_stride, src_stride;
2854
0
    int32_t w;
2855
2856
0
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2857
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2858
2859
0
    while (height--)
2860
0
    {
2861
0
  dst = dst_line;
2862
0
  dst_line += dst_stride;
2863
0
  src = src_line;
2864
0
  src_line += src_stride;
2865
0
  w = width;
2866
2867
0
  while (w && (uintptr_t)dst & 15)
2868
0
  {
2869
0
      s = *src++;
2870
0
      *dst = convert_8888_to_0565 (s);
2871
0
      dst++;
2872
0
      w--;
2873
0
  }
2874
2875
0
  while (w >= 8)
2876
0
  {
2877
0
      __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2878
0
      __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2879
2880
0
      save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2881
2882
0
      w -= 8;
2883
0
      src += 8;
2884
0
      dst += 8;
2885
0
  }
2886
2887
0
  while (w)
2888
0
  {
2889
0
      s = *src++;
2890
0
      *dst = convert_8888_to_0565 (s);
2891
0
      dst++;
2892
0
      w--;
2893
0
  }
2894
0
    }
2895
0
}
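
The packing that convert_8888_to_0565 performs, written out as plain shifts (a scalar sketch of the conversion, keeping the top 5/6/5 bits of r/g/b):

static uint16_t
pack_8888_to_0565_sketch (uint32_t s)
{
    return (uint16_t) (((s >> 8) & 0xf800) |   /* red   */
                       ((s >> 5) & 0x07e0) |   /* green */
                       ((s >> 3) & 0x001f));   /* blue  */
}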
2896
2897
static void
2898
sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2899
            pixman_composite_info_t *info)
2900
0
{
2901
0
    PIXMAN_COMPOSITE_ARGS (info);
2902
0
    uint32_t    *dst_line, *dst;
2903
0
    uint32_t    *src_line, *src;
2904
0
    int32_t w;
2905
0
    int dst_stride, src_stride;
2906
2907
2908
0
    PIXMAN_IMAGE_GET_LINE (
2909
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2910
0
    PIXMAN_IMAGE_GET_LINE (
2911
0
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2912
2913
0
    while (height--)
2914
0
    {
2915
0
  dst = dst_line;
2916
0
  dst_line += dst_stride;
2917
0
  src = src_line;
2918
0
  src_line += src_stride;
2919
0
  w = width;
2920
2921
0
  while (w && (uintptr_t)dst & 15)
2922
0
  {
2923
0
      *dst++ = *src++ | 0xff000000;
2924
0
      w--;
2925
0
  }
2926
2927
0
  while (w >= 16)
2928
0
  {
2929
0
      __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2930
      
2931
0
      xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2932
0
      xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2933
0
      xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2934
0
      xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2935
      
2936
0
      save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2937
0
      save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2938
0
      save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2939
0
      save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2940
      
2941
0
      dst += 16;
2942
0
      src += 16;
2943
0
      w -= 16;
2944
0
  }
2945
2946
0
  while (w)
2947
0
  {
2948
0
      *dst++ = *src++ | 0xff000000;
2949
0
      w--;
2950
0
  }
2951
0
    }
2952
2953
0
}
2954
2955
static void
2956
sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2957
                                 pixman_composite_info_t *info)
2958
0
{
2959
0
    PIXMAN_COMPOSITE_ARGS (info);
2960
0
    uint32_t    *dst_line, *dst;
2961
0
    uint32_t    *src_line, *src;
2962
0
    uint32_t mask;
2963
0
    int dst_stride, src_stride;
2964
0
    int32_t w;
2965
2966
0
    __m128i xmm_mask, xmm_alpha;
2967
0
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2968
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2969
2970
0
    PIXMAN_IMAGE_GET_LINE (
2971
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2972
0
    PIXMAN_IMAGE_GET_LINE (
2973
0
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2974
2975
0
    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2976
2977
0
    xmm_mask = create_mask_16_128 (mask >> 24);
2978
0
    xmm_alpha = mask_00ff;
2979
2980
0
    while (height--)
2981
0
    {
2982
0
  dst = dst_line;
2983
0
  dst_line += dst_stride;
2984
0
  src = src_line;
2985
0
  src_line += src_stride;
2986
0
  w = width;
2987
2988
0
  while (w && (uintptr_t)dst & 15)
2989
0
  {
2990
0
      uint32_t s = (*src++) | 0xff000000;
2991
0
      uint32_t d = *dst;
2992
2993
0
      __m128i src   = unpack_32_1x128 (s);
2994
0
      __m128i alpha = xmm_alpha;
2995
0
      __m128i mask  = xmm_mask;
2996
0
      __m128i dest  = unpack_32_1x128 (d);
2997
2998
0
      *dst++ = pack_1x128_32 (
2999
0
    in_over_1x128 (&src, &alpha, &mask, &dest));
3000
3001
0
      w--;
3002
0
  }
3003
3004
0
  while (w >= 4)
3005
0
  {
3006
0
      xmm_src = _mm_or_si128 (
3007
0
    load_128_unaligned ((__m128i*)src), mask_ff000000);
3008
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
3009
3010
0
      unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3011
0
      unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3012
3013
0
      in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3014
0
         &xmm_alpha, &xmm_alpha,
3015
0
         &xmm_mask, &xmm_mask,
3016
0
         &xmm_dst_lo, &xmm_dst_hi);
3017
3018
0
      save_128_aligned (
3019
0
    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3020
3021
0
      dst += 4;
3022
0
      src += 4;
3023
0
      w -= 4;
3024
3025
0
  }
3026
3027
0
  while (w)
3028
0
  {
3029
0
      uint32_t s = (*src++) | 0xff000000;
3030
0
      uint32_t d = *dst;
3031
3032
0
      __m128i src  = unpack_32_1x128 (s);
3033
0
      __m128i alpha = xmm_alpha;
3034
0
      __m128i mask  = xmm_mask;
3035
0
      __m128i dest  = unpack_32_1x128 (d);
3036
3037
0
      *dst++ = pack_1x128_32 (
3038
0
    in_over_1x128 (&src, &alpha, &mask, &dest));
3039
3040
0
      w--;
3041
0
  }
3042
0
    }
3043
3044
0
}
3045
3046
static void
3047
sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3048
                               pixman_composite_info_t *info)
3049
8
{
3050
8
    PIXMAN_COMPOSITE_ARGS (info);
3051
8
    int dst_stride, src_stride;
3052
8
    uint32_t    *dst_line, *dst;
3053
8
    uint32_t    *src_line, *src;
3054
3055
8
    PIXMAN_IMAGE_GET_LINE (
3056
8
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3057
8
    PIXMAN_IMAGE_GET_LINE (
3058
8
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3059
3060
8
    dst = dst_line;
3061
8
    src = src_line;
3062
3063
257
    while (height--)
3064
249
    {
3065
249
  sse2_combine_over_u (imp, op, dst, src, NULL, width);
3066
3067
249
  dst += dst_stride;
3068
249
  src += src_stride;
3069
249
    }
3070
8
}
3071
3072
static force_inline uint16_t
3073
composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3074
0
{
3075
0
    __m128i ms;
3076
3077
0
    ms = unpack_32_1x128 (src);
3078
0
    return pack_565_32_16 (
3079
0
  pack_1x128_32 (
3080
0
      over_1x128 (
3081
0
    ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3082
0
}
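
A scalar model of the 565 -> 8888 expansion used here (compare unpack_565_to_8888 near the top of the file): each field is widened and its high bits replicated into the low bits; the alpha byte is left zero because it is discarded again when the result is packed back to 565. A sketch only:

static uint32_t
expand_0565_to_8888_sketch (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);    /* rrrrr -> rrrrrrrr */
    g = (g << 2) | (g >> 4);    /* gggggg -> gggggggg */
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}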
3083
3084
static void
3085
sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3086
                               pixman_composite_info_t *info)
3087
0
{
3088
0
    PIXMAN_COMPOSITE_ARGS (info);
3089
0
    uint16_t    *dst_line, *dst, d;
3090
0
    uint32_t    *src_line, *src, s;
3091
0
    int dst_stride, src_stride;
3092
0
    int32_t w;
3093
3094
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
3095
0
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3096
0
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3097
3098
0
    PIXMAN_IMAGE_GET_LINE (
3099
0
  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3100
0
    PIXMAN_IMAGE_GET_LINE (
3101
0
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3102
3103
0
    while (height--)
3104
0
    {
3105
0
  dst = dst_line;
3106
0
  src = src_line;
3107
3108
0
  dst_line += dst_stride;
3109
0
  src_line += src_stride;
3110
0
  w = width;
3111
3112
  /* Align dst on a 16-byte boundary */
3113
0
  while (w &&
3114
0
         ((uintptr_t)dst & 15))
3115
0
  {
3116
0
      s = *src++;
3117
0
      d = *dst;
3118
3119
0
      *dst++ = composite_over_8888_0565pixel (s, d);
3120
0
      w--;
3121
0
  }
3122
3123
  /* It's an 8-pixel loop */
3124
0
  while (w >= 8)
3125
0
  {
3126
      /* I'm loading unaligned because I'm not sure
3127
       * about the address alignment.
3128
       */
3129
0
      xmm_src = load_128_unaligned ((__m128i*) src);
3130
0
      xmm_dst = load_128_aligned ((__m128i*) dst);
3131
3132
      /* Unpacking */
3133
0
      unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3134
0
      unpack_565_128_4x128 (xmm_dst,
3135
0
          &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3136
0
      expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3137
0
        &xmm_alpha_lo, &xmm_alpha_hi);
3138
3139
      /* I'm loading the next 4 pixels from memory
3140
       * ahead of time to optimize the memory reads.
3141
       */
3142
0
      xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3143
3144
0
      over_2x128 (&xmm_src_lo, &xmm_src_hi,
3145
0
      &xmm_alpha_lo, &xmm_alpha_hi,
3146
0
      &xmm_dst0, &xmm_dst1);
3147
3148
      /* Unpacking */
3149
0
      unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3150
0
      expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3151
0
        &xmm_alpha_lo, &xmm_alpha_hi);
3152
3153
0
      over_2x128 (&xmm_src_lo, &xmm_src_hi,
3154
0
      &xmm_alpha_lo, &xmm_alpha_hi,
3155
0
      &xmm_dst2, &xmm_dst3);
3156
3157
0
      save_128_aligned (
3158
0
    (__m128i*)dst, pack_565_4x128_128 (
3159
0
        &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3160
3161
0
      w -= 8;
3162
0
      dst += 8;
3163
0
      src += 8;
3164
0
  }
3165
3166
0
  while (w--)
3167
0
  {
3168
0
      s = *src++;
3169
0
      d = *dst;
3170
3171
0
      *dst++ = composite_over_8888_0565pixel (s, d);
3172
0
  }
3173
0
    }
3174
3175
0
}
3176
3177
static void
3178
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3179
                              pixman_composite_info_t *info)
3180
2
{
3181
2
    PIXMAN_COMPOSITE_ARGS (info);
3182
2
    uint32_t src, srca;
3183
2
    uint32_t *dst_line, *dst;
3184
2
    uint8_t *mask_line, *mask;
3185
2
    int dst_stride, mask_stride;
3186
2
    int32_t w;
3187
2
    uint32_t d;
3188
3189
2
    __m128i xmm_src, xmm_alpha, xmm_def;
3190
2
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3191
2
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3192
3193
2
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3194
3195
2
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3196
3197
2
    srca = src >> 24;
3198
2
    if (src == 0)
3199
0
  return;
3200
3201
2
    PIXMAN_IMAGE_GET_LINE (
3202
2
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3203
2
    PIXMAN_IMAGE_GET_LINE (
3204
2
  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3205
3206
2
    xmm_def = create_mask_2x32_128 (src, src);
3207
2
    xmm_src = expand_pixel_32_1x128 (src);
3208
2
    xmm_alpha = expand_alpha_1x128 (xmm_src);
3209
2
    mmx_src   = xmm_src;
3210
2
    mmx_alpha = xmm_alpha;
3211
3212
37
    while (height--)
3213
35
    {
3214
35
  dst = dst_line;
3215
35
  dst_line += dst_stride;
3216
35
  mask = mask_line;
3217
35
  mask_line += mask_stride;
3218
35
  w = width;
3219
3220
84
  while (w && (uintptr_t)dst & 15)
3221
49
  {
3222
49
      uint8_t m = *mask++;
3223
3224
49
      if (m)
3225
16
      {
3226
16
    d = *dst;
3227
16
    mmx_mask = expand_pixel_8_1x128 (m);
3228
16
    mmx_dest = unpack_32_1x128 (d);
3229
3230
16
    *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3231
16
                                       &mmx_alpha,
3232
16
                                       &mmx_mask,
3233
16
                                       &mmx_dest));
3234
16
      }
3235
3236
49
      w--;
3237
49
      dst++;
3238
49
  }
3239
3240
210
  while (w >= 4)
3241
175
  {
3242
175
            uint32_t m;
3243
175
            memcpy(&m, mask, sizeof(uint32_t));
3244
3245
175
      if (srca == 0xff && m == 0xffffffff)
3246
0
      {
3247
0
    save_128_aligned ((__m128i*)dst, xmm_def);
3248
0
      }
3249
175
      else if (m)
3250
127
      {
3251
127
    xmm_dst = load_128_aligned ((__m128i*) dst);
3252
127
    xmm_mask = unpack_32_1x128 (m);
3253
127
    xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3254
3255
    /* Unpacking */
3256
127
    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3257
127
    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3258
3259
127
    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3260
127
          &xmm_mask_lo, &xmm_mask_hi);
3261
3262
127
    in_over_2x128 (&xmm_src, &xmm_src,
3263
127
             &xmm_alpha, &xmm_alpha,
3264
127
             &xmm_mask_lo, &xmm_mask_hi,
3265
127
             &xmm_dst_lo, &xmm_dst_hi);
3266
3267
127
    save_128_aligned (
3268
127
        (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3269
127
      }
3270
3271
175
      w -= 4;
3272
175
      dst += 4;
3273
175
      mask += 4;
3274
175
  }
3275
3276
91
  while (w)
3277
56
  {
3278
56
      uint8_t m = *mask++;
3279
3280
56
      if (m)
3281
17
      {
3282
17
    d = *dst;
3283
17
    mmx_mask = expand_pixel_8_1x128 (m);
3284
17
    mmx_dest = unpack_32_1x128 (d);
3285
3286
17
    *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3287
17
                                       &mmx_alpha,
3288
17
                                       &mmx_mask,
3289
17
                                       &mmx_dest));
3290
17
      }
3291
3292
56
      w--;
3293
56
      dst++;
3294
56
  }
3295
35
    }
3296
3297
2
}
3298
3299
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3300
__attribute__((__force_align_arg_pointer__))
3301
#endif
3302
static pixman_bool_t
3303
sse2_fill (pixman_implementation_t *imp,
3304
           uint32_t *               bits,
3305
           int                      stride,
3306
           int                      bpp,
3307
           int                      x,
3308
           int                      y,
3309
           int                      width,
3310
           int                      height,
3311
           uint32_t       filler)
3312
8
{
3313
8
    uint32_t byte_width;
3314
8
    uint8_t *byte_line;
3315
3316
8
    __m128i xmm_def;
3317
3318
8
    if (bpp == 8)
3319
0
    {
3320
0
  uint32_t b;
3321
0
  uint32_t w;
3322
3323
0
  stride = stride * (int) sizeof (uint32_t) / 1;
3324
0
  byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3325
0
  byte_width = width;
3326
0
  stride *= 1;
3327
3328
0
  b = filler & 0xff;
3329
0
  w = (b << 8) | b;
3330
0
  filler = (w << 16) | w;
3331
0
    }
3332
8
    else if (bpp == 16)
3333
0
    {
3334
0
  stride = stride * (int) sizeof (uint32_t) / 2;
3335
0
  byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3336
0
  byte_width = 2 * width;
3337
0
  stride *= 2;
3338
3339
0
        filler = (filler & 0xffff) * 0x00010001;
3340
0
    }
3341
8
    else if (bpp == 32)
3342
8
    {
3343
8
  stride = stride * (int) sizeof (uint32_t) / 4;
3344
8
  byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3345
8
  byte_width = 4 * width;
3346
8
  stride *= 4;
3347
8
    }
3348
0
    else
3349
0
    {
3350
0
  return FALSE;
3351
0
    }
3352
3353
8
    xmm_def = create_mask_2x32_128 (filler, filler);
3354
3355
969
    while (height--)
3356
961
    {
3357
961
  int w;
3358
961
  uint8_t *d = byte_line;
3359
961
  byte_line += stride;
3360
961
  w = byte_width;
3361
3362
961
  if (w >= 1 && ((uintptr_t)d & 1))
3363
0
  {
3364
0
      *(uint8_t *)d = filler;
3365
0
      w -= 1;
3366
0
      d += 1;
3367
0
  }
3368
3369
961
  while (w >= 2 && ((uintptr_t)d & 3))
3370
0
  {
3371
0
      *(uint16_t *)d = filler;
3372
0
      w -= 2;
3373
0
      d += 2;
3374
0
  }
3375
3376
1.28k
  while (w >= 4 && ((uintptr_t)d & 15))
3377
326
  {
3378
326
      *(uint32_t *)d = filler;
3379
3380
326
      w -= 4;
3381
326
      d += 4;
3382
326
  }
3383
3384
11.8k
  while (w >= 128)
3385
10.9k
  {
3386
10.9k
      save_128_aligned ((__m128i*)(d),     xmm_def);
3387
10.9k
      save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3388
10.9k
      save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3389
10.9k
      save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3390
10.9k
      save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3391
10.9k
      save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3392
10.9k
      save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3393
10.9k
      save_128_aligned ((__m128i*)(d + 112), xmm_def);
3394
3395
10.9k
      d += 128;
3396
10.9k
      w -= 128;
3397
10.9k
  }
3398
3399
961
  if (w >= 64)
3400
813
  {
3401
813
      save_128_aligned ((__m128i*)(d),     xmm_def);
3402
813
      save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3403
813
      save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3404
813
      save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3405
3406
813
      d += 64;
3407
813
      w -= 64;
3408
813
  }
3409
3410
961
  if (w >= 32)
3411
880
  {
3412
880
      save_128_aligned ((__m128i*)(d),     xmm_def);
3413
880
      save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3414
3415
880
      d += 32;
3416
880
      w -= 32;
3417
880
  }
3418
3419
961
  if (w >= 16)
3420
849
  {
3421
849
      save_128_aligned ((__m128i*)(d),     xmm_def);
3422
3423
849
      d += 16;
3424
849
      w -= 16;
3425
849
  }
3426
3427
1.29k
  while (w >= 4)
3428
337
  {
3429
337
      *(uint32_t *)d = filler;
3430
3431
337
      w -= 4;
3432
337
      d += 4;
3433
337
  }
3434
3435
961
  if (w >= 2)
3436
0
  {
3437
0
      *(uint16_t *)d = filler;
3438
0
      w -= 2;
3439
0
      d += 2;
3440
0
  }
3441
3442
961
  if (w >= 1)
3443
0
  {
3444
0
      *(uint8_t *)d = filler;
3445
0
      w -= 1;
3446
0
      d += 1;
3447
0
  }
3448
961
    }
3449
3450
8
    return TRUE;
3451
8
}
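
The key trick in sse2_fill is widening the filler once up front so that a single 32-bit pattern, and from it one XMM register, serves every bpp; this sketch isolates that replication step from the function above:

static uint32_t
replicate_filler_sketch (uint32_t filler, int bpp)
{
    if (bpp == 8)
    {
        uint32_t b = filler & 0xff;
        uint32_t w = (b << 8) | b;   /* byte -> halfword */
        return (w << 16) | w;        /* halfword -> word */
    }
    if (bpp == 16)
        return (filler & 0xffff) * 0x00010001;

    return filler;                   /* bpp == 32: already full width */
}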
3452
3453
static void
3454
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3455
                             pixman_composite_info_t *info)
3456
0
{
3457
0
    PIXMAN_COMPOSITE_ARGS (info);
3458
0
    uint32_t src, srca;
3459
0
    uint32_t    *dst_line, *dst;
3460
0
    uint8_t     *mask_line, *mask;
3461
0
    int dst_stride, mask_stride;
3462
0
    int32_t w;
3463
3464
0
    __m128i xmm_src, xmm_def;
3465
0
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3466
3467
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3468
3469
0
    srca = src >> 24;
3470
0
    if (src == 0)
3471
0
    {
3472
0
  sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3473
0
       PIXMAN_FORMAT_BPP (dest_image->bits.format),
3474
0
       dest_x, dest_y, width, height, 0);
3475
0
  return;
3476
0
    }
3477
3478
0
    PIXMAN_IMAGE_GET_LINE (
3479
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3480
0
    PIXMAN_IMAGE_GET_LINE (
3481
0
  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3482
3483
0
    xmm_def = create_mask_2x32_128 (src, src);
3484
0
    xmm_src = expand_pixel_32_1x128 (src);
3485
3486
0
    while (height--)
3487
0
    {
3488
0
  dst = dst_line;
3489
0
  dst_line += dst_stride;
3490
0
  mask = mask_line;
3491
0
  mask_line += mask_stride;
3492
0
  w = width;
3493
3494
0
  while (w && (uintptr_t)dst & 15)
3495
0
  {
3496
0
      uint8_t m = *mask++;
3497
3498
0
      if (m)
3499
0
      {
3500
0
    *dst = pack_1x128_32 (
3501
0
        pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3502
0
      }
3503
0
      else
3504
0
      {
3505
0
    *dst = 0;
3506
0
      }
3507
3508
0
      w--;
3509
0
      dst++;
3510
0
  }
3511
3512
0
  while (w >= 4)
3513
0
  {
3514
0
            uint32_t m;
3515
0
            memcpy(&m, mask, sizeof(uint32_t));
3516
3517
0
      if (srca == 0xff && m == 0xffffffff)
3518
0
      {
3519
0
    save_128_aligned ((__m128i*)dst, xmm_def);
3520
0
      }
3521
0
      else if (m)
3522
0
      {
3523
0
    xmm_mask = unpack_32_1x128 (m);
3524
0
    xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3525
3526
    /* Unpacking */
3527
0
    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3528
3529
0
    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3530
0
          &xmm_mask_lo, &xmm_mask_hi);
3531
3532
0
    pix_multiply_2x128 (&xmm_src, &xmm_src,
3533
0
            &xmm_mask_lo, &xmm_mask_hi,
3534
0
            &xmm_mask_lo, &xmm_mask_hi);
3535
3536
0
    save_128_aligned (
3537
0
        (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3538
0
      }
3539
0
      else
3540
0
      {
3541
0
    save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3542
0
      }
3543
3544
0
      w -= 4;
3545
0
      dst += 4;
3546
0
      mask += 4;
3547
0
  }
3548
3549
0
  while (w)
3550
0
  {
3551
0
      uint8_t m = *mask++;
3552
3553
0
      if (m)
3554
0
      {
3555
0
    *dst = pack_1x128_32 (
3556
0
        pix_multiply_1x128 (
3557
0
      xmm_src, expand_pixel_8_1x128 (m)));
3558
0
      }
3559
0
      else
3560
0
      {
3561
0
    *dst = 0;
3562
0
      }
3563
3564
0
      w--;
3565
0
      dst++;
3566
0
  }
3567
0
    }
3568
3569
0
}
3570
3571
static void
3572
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3573
                              pixman_composite_info_t *info)
3574
0
{
3575
0
    PIXMAN_COMPOSITE_ARGS (info);
3576
0
    uint32_t src;
3577
0
    uint16_t    *dst_line, *dst, d;
3578
0
    uint8_t     *mask_line, *mask;
3579
0
    int dst_stride, mask_stride;
3580
0
    int32_t w;
3581
0
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3582
3583
0
    __m128i xmm_src, xmm_alpha;
3584
0
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3585
0
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3586
3587
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3588
3589
0
    if (src == 0)
3590
0
  return;
3591
3592
0
    PIXMAN_IMAGE_GET_LINE (
3593
0
  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3594
0
    PIXMAN_IMAGE_GET_LINE (
3595
0
  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3596
3597
0
    xmm_src = expand_pixel_32_1x128 (src);
3598
0
    xmm_alpha = expand_alpha_1x128 (xmm_src);
3599
0
    mmx_src = xmm_src;
3600
0
    mmx_alpha = xmm_alpha;
3601
3602
0
    while (height--)
3603
0
    {
3604
0
  dst = dst_line;
3605
0
  dst_line += dst_stride;
3606
0
  mask = mask_line;
3607
0
  mask_line += mask_stride;
3608
0
  w = width;
3609
3610
0
  while (w && (uintptr_t)dst & 15)
3611
0
  {
3612
0
      uint8_t m = *mask++;
3613
3614
0
      if (m)
3615
0
      {
3616
0
    d = *dst;
3617
0
    mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3618
0
    mmx_dest = expand565_16_1x128 (d);
3619
3620
0
    *dst = pack_565_32_16 (
3621
0
        pack_1x128_32 (
3622
0
      in_over_1x128 (
3623
0
          &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3624
0
      }
3625
3626
0
      w--;
3627
0
      dst++;
3628
0
  }
3629
3630
0
  while (w >= 8)
3631
0
  {
3632
0
            uint32_t m;
3633
3634
0
      xmm_dst = load_128_aligned ((__m128i*) dst);
3635
0
      unpack_565_128_4x128 (xmm_dst,
3636
0
          &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3637
3638
0
            memcpy(&m, mask, sizeof(uint32_t));
3639
0
      mask += 4;
3640
3641
0
      if (m)
3642
0
      {
3643
0
    xmm_mask = unpack_32_1x128 (m);
3644
0
    xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3645
3646
    /* Unpacking */
3647
0
    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3648
3649
0
    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3650
0
          &xmm_mask_lo, &xmm_mask_hi);
3651
3652
0
    in_over_2x128 (&xmm_src, &xmm_src,
3653
0
             &xmm_alpha, &xmm_alpha,
3654
0
             &xmm_mask_lo, &xmm_mask_hi,
3655
0
             &xmm_dst0, &xmm_dst1);
3656
0
      }
3657
3658
0
            memcpy(&m, mask, sizeof(uint32_t));
3659
0
      mask += 4;
3660
3661
0
      if (m)
3662
0
      {
3663
0
    xmm_mask = unpack_32_1x128 (m);
3664
0
    xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3665
3666
    /* Unpacking */
3667
0
    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3668
3669
0
    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3670
0
          &xmm_mask_lo, &xmm_mask_hi);
3671
0
    in_over_2x128 (&xmm_src, &xmm_src,
3672
0
             &xmm_alpha, &xmm_alpha,
3673
0
             &xmm_mask_lo, &xmm_mask_hi,
3674
0
             &xmm_dst2, &xmm_dst3);
3675
0
      }
3676
3677
0
      save_128_aligned (
3678
0
    (__m128i*)dst, pack_565_4x128_128 (
3679
0
        &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3680
3681
0
      w -= 8;
3682
0
      dst += 8;
3683
0
  }
3684
3685
0
  while (w)
3686
0
  {
3687
0
      uint8_t m = *mask++;
3688
3689
0
      if (m)
3690
0
      {
3691
0
    d = *dst;
3692
0
    mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3693
0
    mmx_dest = expand565_16_1x128 (d);
3694
3695
0
    *dst = pack_565_32_16 (
3696
0
        pack_1x128_32 (
3697
0
      in_over_1x128 (
3698
0
          &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3699
0
      }
3700
3701
0
      w--;
3702
0
      dst++;
3703
0
  }
3704
0
    }
3705
3706
0
}
3707
3708
static void
3709
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3710
                                 pixman_composite_info_t *info)
3711
0
{
3712
0
    PIXMAN_COMPOSITE_ARGS (info);
3713
0
    uint16_t    *dst_line, *dst, d;
3714
0
    uint32_t    *src_line, *src, s;
3715
0
    int dst_stride, src_stride;
3716
0
    int32_t w;
3717
0
    uint32_t opaque, zero;
3718
3719
0
    __m128i ms;
3720
0
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3721
0
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3722
3723
0
    PIXMAN_IMAGE_GET_LINE (
3724
0
  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3725
0
    PIXMAN_IMAGE_GET_LINE (
3726
0
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3727
3728
0
    while (height--)
3729
0
    {
3730
0
  dst = dst_line;
3731
0
  dst_line += dst_stride;
3732
0
  src = src_line;
3733
0
  src_line += src_stride;
3734
0
  w = width;
3735
3736
0
  while (w && (uintptr_t)dst & 15)
3737
0
  {
3738
0
      s = *src++;
3739
0
      d = *dst;
3740
3741
0
      ms = unpack_32_1x128 (s);
3742
3743
0
      *dst++ = pack_565_32_16 (
3744
0
    pack_1x128_32 (
3745
0
        over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3746
0
      w--;
3747
0
  }
3748
3749
0
  while (w >= 8)
3750
0
  {
3751
      /* First round */
3752
0
      xmm_src = load_128_unaligned ((__m128i*)src);
3753
0
      xmm_dst = load_128_aligned  ((__m128i*)dst);
3754
3755
0
      opaque = is_opaque (xmm_src);
3756
0
      zero = is_zero (xmm_src);
3757
3758
0
      unpack_565_128_4x128 (xmm_dst,
3759
0
          &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3760
0
      unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3761
3762
      /* preload the next round */
3763
0
      xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3764
3765
0
      if (opaque)
3766
0
      {
3767
0
    invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3768
0
             &xmm_dst0, &xmm_dst1);
3769
0
      }
3770
0
      else if (!zero)
3771
0
      {
3772
0
    over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3773
0
          &xmm_dst0, &xmm_dst1);
3774
0
      }
3775
3776
      /* Second round */
3777
0
      opaque = is_opaque (xmm_src);
3778
0
      zero = is_zero (xmm_src);
3779
3780
0
      unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3781
3782
0
      if (opaque)
3783
0
      {
3784
0
    invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3785
0
             &xmm_dst2, &xmm_dst3);
3786
0
      }
3787
0
      else if (!zero)
3788
0
      {
3789
0
    over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3790
0
          &xmm_dst2, &xmm_dst3);
3791
0
      }
3792
3793
0
      save_128_aligned (
3794
0
    (__m128i*)dst, pack_565_4x128_128 (
3795
0
        &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3796
3797
0
      w -= 8;
3798
0
      src += 8;
3799
0
      dst += 8;
3800
0
  }
3801
3802
0
  while (w)
3803
0
  {
3804
0
      s = *src++;
3805
0
      d = *dst;
3806
3807
0
      ms = unpack_32_1x128 (s);
3808
3809
0
      *dst++ = pack_565_32_16 (
3810
0
    pack_1x128_32 (
3811
0
        over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3812
0
      w--;
3813
0
  }
3814
0
    }
3815
3816
0
}
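/* NOTE (editorial): as the helper names suggest, the "pixbuf" paths treat
 * the source as non-premultiplied with reversed color order.  Per pixel the
 * loop above does, roughly, in scalar terms:
 *
 *     if ((s >> 24) == 0xff)  d = invert_colors (s);                // opaque
 *     else if (s != 0)        d = OVER (premultiply (invert_colors (s)), d);
 *
 * with d converted between r5g6b5 and 8888 by the expand565/pack_565
 * helpers, and is_opaque/is_zero letting whole 4-pixel groups skip work.
 */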
3817
3818
static void
3819
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3820
                                 pixman_composite_info_t *info)
3821
0
{
3822
0
    PIXMAN_COMPOSITE_ARGS (info);
3823
0
    uint32_t    *dst_line, *dst, d;
3824
0
    uint32_t    *src_line, *src, s;
3825
0
    int dst_stride, src_stride;
3826
0
    int32_t w;
3827
0
    uint32_t opaque, zero;
3828
3829
0
    __m128i xmm_src_lo, xmm_src_hi;
3830
0
    __m128i xmm_dst_lo, xmm_dst_hi;
3831
3832
0
    PIXMAN_IMAGE_GET_LINE (
3833
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3834
0
    PIXMAN_IMAGE_GET_LINE (
3835
0
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3836
3837
0
    while (height--)
3838
0
    {
3839
0
  dst = dst_line;
3840
0
  dst_line += dst_stride;
3841
0
  src = src_line;
3842
0
  src_line += src_stride;
3843
0
  w = width;
3844
3845
0
  while (w && (uintptr_t)dst & 15)
3846
0
  {
3847
0
      s = *src++;
3848
0
      d = *dst;
3849
3850
0
      *dst++ = pack_1x128_32 (
3851
0
    over_rev_non_pre_1x128 (
3852
0
        unpack_32_1x128 (s), unpack_32_1x128 (d)));
3853
3854
0
      w--;
3855
0
  }
3856
3857
0
  while (w >= 4)
3858
0
  {
3859
0
      xmm_src_hi = load_128_unaligned ((__m128i*)src);
3860
3861
0
      opaque = is_opaque (xmm_src_hi);
3862
0
      zero = is_zero (xmm_src_hi);
3863
3864
0
      unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3865
3866
0
      if (opaque)
3867
0
      {
3868
0
    invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3869
0
             &xmm_dst_lo, &xmm_dst_hi);
3870
3871
0
    save_128_aligned (
3872
0
        (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3873
0
      }
3874
0
      else if (!zero)
3875
0
      {
3876
0
    xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3877
3878
0
    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3879
3880
0
    over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3881
0
          &xmm_dst_lo, &xmm_dst_hi);
3882
3883
0
    save_128_aligned (
3884
0
        (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3885
0
      }
3886
3887
0
      w -= 4;
3888
0
      dst += 4;
3889
0
      src += 4;
3890
0
  }
3891
3892
0
  while (w)
3893
0
  {
3894
0
      s = *src++;
3895
0
      d = *dst;
3896
3897
0
      *dst++ = pack_1x128_32 (
3898
0
    over_rev_non_pre_1x128 (
3899
0
        unpack_32_1x128 (s), unpack_32_1x128 (d)));
3900
3901
0
      w--;
3902
0
  }
3903
0
    }
3904
3905
0
}
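/* NOTE (editorial): a plausible scalar reading of the classification
 * helpers used above (sketch only; the real definitions appear earlier in
 * this file):
 *
 *     is_zero (x):   all 16 bytes of x are 0x00;
 *     is_opaque (x): the alpha byte of each of the 4 pixels is 0xff, e.g.
 *                    (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ones)) & 0x8888)
 *                    == 0x8888.
 */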
3906
3907
static void
3908
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3909
                                    pixman_composite_info_t *info)
3910
0
{
3911
0
    PIXMAN_COMPOSITE_ARGS (info);
3912
0
    uint32_t src;
3913
0
    uint16_t    *dst_line, *dst, d;
3914
0
    uint32_t    *mask_line, *mask, m;
3915
0
    int dst_stride, mask_stride;
3916
0
    int w;
3917
0
    uint32_t pack_cmp;
3918
3919
0
    __m128i xmm_src, xmm_alpha;
3920
0
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3921
0
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3922
3923
0
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3924
3925
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3926
3927
0
    if (src == 0)
3928
0
  return;
3929
3930
0
    PIXMAN_IMAGE_GET_LINE (
3931
0
  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3932
0
    PIXMAN_IMAGE_GET_LINE (
3933
0
  mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3934
3935
0
    xmm_src = expand_pixel_32_1x128 (src);
3936
0
    xmm_alpha = expand_alpha_1x128 (xmm_src);
3937
0
    mmx_src = xmm_src;
3938
0
    mmx_alpha = xmm_alpha;
3939
3940
0
    while (height--)
3941
0
    {
3942
0
  w = width;
3943
0
  mask = mask_line;
3944
0
  dst = dst_line;
3945
0
  mask_line += mask_stride;
3946
0
  dst_line += dst_stride;
3947
3948
0
  while (w && ((uintptr_t)dst & 15))
3949
0
  {
3950
0
      m = *(uint32_t *) mask;
3951
3952
0
      if (m)
3953
0
      {
3954
0
    d = *dst;
3955
0
    mmx_mask = unpack_32_1x128 (m);
3956
0
    mmx_dest = expand565_16_1x128 (d);
3957
3958
0
    *dst = pack_565_32_16 (
3959
0
        pack_1x128_32 (
3960
0
      in_over_1x128 (
3961
0
          &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3962
0
      }
3963
3964
0
      w--;
3965
0
      dst++;
3966
0
      mask++;
3967
0
  }
3968
3969
0
  while (w >= 8)
3970
0
  {
3971
      /* First round */
3972
0
      xmm_mask = load_128_unaligned ((__m128i*)mask);
3973
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
3974
3975
0
      pack_cmp = _mm_movemask_epi8 (
3976
0
    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3977
3978
0
      unpack_565_128_4x128 (xmm_dst,
3979
0
          &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3980
0
      unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3981
3982
      /* preload next round */
3983
0
      xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
3984
3985
      /* apply the mask to the first four pixels */
3986
0
      if (pack_cmp != 0xffff)
3987
0
      {
3988
0
    in_over_2x128 (&xmm_src, &xmm_src,
3989
0
             &xmm_alpha, &xmm_alpha,
3990
0
             &xmm_mask_lo, &xmm_mask_hi,
3991
0
             &xmm_dst0, &xmm_dst1);
3992
0
      }
3993
3994
      /* Second round */
3995
0
      pack_cmp = _mm_movemask_epi8 (
3996
0
    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3997
3998
0
      unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3999
4000
0
      if (pack_cmp != 0xffff)
4001
0
      {
4002
0
    in_over_2x128 (&xmm_src, &xmm_src,
4003
0
             &xmm_alpha, &xmm_alpha,
4004
0
             &xmm_mask_lo, &xmm_mask_hi,
4005
0
             &xmm_dst2, &xmm_dst3);
4006
0
      }
4007
4008
0
      save_128_aligned (
4009
0
    (__m128i*)dst, pack_565_4x128_128 (
4010
0
        &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4011
4012
0
      w -= 8;
4013
0
      dst += 8;
4014
0
      mask += 8;
4015
0
  }
4016
4017
0
  while (w)
4018
0
  {
4019
0
      m = *(uint32_t *) mask;
4020
4021
0
      if (m)
4022
0
      {
4023
0
    d = *dst;
4024
0
    mmx_mask = unpack_32_1x128 (m);
4025
0
    mmx_dest = expand565_16_1x128 (d);
4026
4027
0
    *dst = pack_565_32_16 (
4028
0
        pack_1x128_32 (
4029
0
      in_over_1x128 (
4030
0
          &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4031
0
      }
4032
4033
0
      w--;
4034
0
      dst++;
4035
0
      mask++;
4036
0
  }
4037
0
    }
4038
4039
0
}
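/* NOTE (editorial): pack_cmp is the _mm_movemask_epi8 of a per-32-bit-lane
 * compare against zero, so pack_cmp == 0xffff means all four component-alpha
 * mask values in that half are zero and the in_over step can be skipped
 * entirely.
 */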
4040
4041
static void
4042
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4043
                         pixman_composite_info_t *info)
4044
0
{
4045
0
    PIXMAN_COMPOSITE_ARGS (info);
4046
0
    uint8_t     *dst_line, *dst;
4047
0
    uint8_t     *mask_line, *mask;
4048
0
    int dst_stride, mask_stride;
4049
0
    uint32_t d;
4050
0
    uint32_t src;
4051
0
    int32_t w;
4052
4053
0
    __m128i xmm_alpha;
4054
0
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4055
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4056
4057
0
    PIXMAN_IMAGE_GET_LINE (
4058
0
  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4059
0
    PIXMAN_IMAGE_GET_LINE (
4060
0
  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4061
4062
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4063
4064
0
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4065
4066
0
    while (height--)
4067
0
    {
4068
0
  dst = dst_line;
4069
0
  dst_line += dst_stride;
4070
0
  mask = mask_line;
4071
0
  mask_line += mask_stride;
4072
0
  w = width;
4073
4074
0
  while (w && ((uintptr_t)dst & 15))
4075
0
  {
4076
0
      uint8_t m = *mask++;
4077
0
      d = (uint32_t) *dst;
4078
4079
0
      *dst++ = (uint8_t) pack_1x128_32 (
4080
0
    pix_multiply_1x128 (
4081
0
        pix_multiply_1x128 (xmm_alpha,
4082
0
               unpack_32_1x128 (m)),
4083
0
        unpack_32_1x128 (d)));
4084
0
      w--;
4085
0
  }
4086
4087
0
  while (w >= 16)
4088
0
  {
4089
0
      xmm_mask = load_128_unaligned ((__m128i*)mask);
4090
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
4091
4092
0
      unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4093
0
      unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4094
4095
0
      pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4096
0
        &xmm_mask_lo, &xmm_mask_hi,
4097
0
        &xmm_mask_lo, &xmm_mask_hi);
4098
4099
0
      pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4100
0
        &xmm_dst_lo, &xmm_dst_hi,
4101
0
        &xmm_dst_lo, &xmm_dst_hi);
4102
4103
0
      save_128_aligned (
4104
0
    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4105
4106
0
      mask += 16;
4107
0
      dst += 16;
4108
0
      w -= 16;
4109
0
  }
4110
4111
0
  while (w)
4112
0
  {
4113
0
      uint8_t m = *mask++;
4114
0
      d = (uint32_t) *dst;
4115
4116
0
      *dst++ = (uint8_t) pack_1x128_32 (
4117
0
    pix_multiply_1x128 (
4118
0
        pix_multiply_1x128 (
4119
0
      xmm_alpha, unpack_32_1x128 (m)),
4120
0
        unpack_32_1x128 (d)));
4121
0
      w--;
4122
0
  }
4123
0
    }
4124
4125
0
}
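/* NOTE (editorial): per channel, the IN loop above computes
 *
 *     dst = srca * m * dst / (255 * 255)
 *
 * where each division by 255 uses the usual rounding trick.  A scalar
 * sketch (mul_un8 is illustrative, not a name from this file):
 *
 *     static inline uint8_t
 *     mul_un8 (uint8_t a, uint8_t b)
 *     {
 *         uint16_t t = (uint16_t) a * b + 0x80;
 *         return (uint8_t) ((t + (t >> 8)) >> 8);
 *     }
 *
 *     *dst = mul_un8 (mul_un8 (srca, m), *dst);
 */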
4126
4127
static void
4128
sse2_composite_in_n_8 (pixman_implementation_t *imp,
4129
           pixman_composite_info_t *info)
4130
0
{
4131
0
    PIXMAN_COMPOSITE_ARGS (info);
4132
0
    uint8_t     *dst_line, *dst;
4133
0
    int dst_stride;
4134
0
    uint32_t d;
4135
0
    uint32_t src;
4136
0
    int32_t w;
4137
4138
0
    __m128i xmm_alpha;
4139
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4140
4141
0
    PIXMAN_IMAGE_GET_LINE (
4142
0
  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4143
4144
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4145
4146
0
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4147
4148
0
    src = src >> 24;
4149
4150
0
    if (src == 0xff)
4151
0
  return;
4152
4153
0
    if (src == 0x00)
4154
0
    {
4155
0
  pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4156
0
         8, dest_x, dest_y, width, height, src);
4157
4158
0
  return;
4159
0
    }
4160
4161
0
    while (height--)
4162
0
    {
4163
0
  dst = dst_line;
4164
0
  dst_line += dst_stride;
4165
0
  w = width;
4166
4167
0
  while (w && ((uintptr_t)dst & 15))
4168
0
  {
4169
0
      d = (uint32_t) *dst;
4170
4171
0
      *dst++ = (uint8_t) pack_1x128_32 (
4172
0
    pix_multiply_1x128 (
4173
0
        xmm_alpha,
4174
0
        unpack_32_1x128 (d)));
4175
0
      w--;
4176
0
  }
4177
4178
0
  while (w >= 16)
4179
0
  {
4180
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
4181
4182
0
      unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4183
      
4184
0
      pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4185
0
        &xmm_dst_lo, &xmm_dst_hi,
4186
0
        &xmm_dst_lo, &xmm_dst_hi);
4187
4188
0
      save_128_aligned (
4189
0
    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4190
4191
0
      dst += 16;
4192
0
      w -= 16;
4193
0
  }
4194
4195
0
  while (w)
4196
0
  {
4197
0
      d = (uint32_t) *dst;
4198
4199
0
      *dst++ = (uint8_t) pack_1x128_32 (
4200
0
    pix_multiply_1x128 (
4201
0
        xmm_alpha,
4202
0
        unpack_32_1x128 (d)));
4203
0
      w--;
4204
0
  }
4205
0
    }
4206
4207
0
}
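/* NOTE (editorial): the early-outs above are the two trivial cases of IN
 * with a solid source: alpha 0xff leaves the destination unchanged, and
 * alpha 0x00 degenerates to zero-filling the destination via pixman_fill().
 */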
4208
4209
static void
4210
sse2_composite_in_8_8 (pixman_implementation_t *imp,
4211
                       pixman_composite_info_t *info)
4212
0
{
4213
0
    PIXMAN_COMPOSITE_ARGS (info);
4214
0
    uint8_t     *dst_line, *dst;
4215
0
    uint8_t     *src_line, *src;
4216
0
    int src_stride, dst_stride;
4217
0
    int32_t w;
4218
0
    uint32_t s, d;
4219
4220
0
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4221
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4222
4223
0
    PIXMAN_IMAGE_GET_LINE (
4224
0
  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4225
0
    PIXMAN_IMAGE_GET_LINE (
4226
0
  src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4227
4228
0
    while (height--)
4229
0
    {
4230
0
  dst = dst_line;
4231
0
  dst_line += dst_stride;
4232
0
  src = src_line;
4233
0
  src_line += src_stride;
4234
0
  w = width;
4235
4236
0
  while (w && ((uintptr_t)dst & 15))
4237
0
  {
4238
0
      s = (uint32_t) *src++;
4239
0
      d = (uint32_t) *dst;
4240
4241
0
      *dst++ = (uint8_t) pack_1x128_32 (
4242
0
    pix_multiply_1x128 (
4243
0
        unpack_32_1x128 (s), unpack_32_1x128 (d)));
4244
0
      w--;
4245
0
  }
4246
4247
0
  while (w >= 16)
4248
0
  {
4249
0
      xmm_src = load_128_unaligned ((__m128i*)src);
4250
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
4251
4252
0
      unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4253
0
      unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4254
4255
0
      pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4256
0
        &xmm_dst_lo, &xmm_dst_hi,
4257
0
        &xmm_dst_lo, &xmm_dst_hi);
4258
4259
0
      save_128_aligned (
4260
0
    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4261
4262
0
      src += 16;
4263
0
      dst += 16;
4264
0
      w -= 16;
4265
0
  }
4266
4267
0
  while (w)
4268
0
  {
4269
0
      s = (uint32_t) *src++;
4270
0
      d = (uint32_t) *dst;
4271
4272
0
      *dst++ = (uint8_t) pack_1x128_32 (
4273
0
    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4274
0
      w--;
4275
0
  }
4276
0
    }
4277
4278
0
}
4279
4280
static void
4281
sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4282
        pixman_composite_info_t *info)
4283
0
{
4284
0
    PIXMAN_COMPOSITE_ARGS (info);
4285
0
    uint8_t     *dst_line, *dst;
4286
0
    uint8_t     *mask_line, *mask;
4287
0
    int dst_stride, mask_stride;
4288
0
    int32_t w;
4289
0
    uint32_t src;
4290
0
    uint32_t d;
4291
4292
0
    __m128i xmm_alpha;
4293
0
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4294
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4295
4296
0
    PIXMAN_IMAGE_GET_LINE (
4297
0
  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4298
0
    PIXMAN_IMAGE_GET_LINE (
4299
0
  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4300
4301
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4302
4303
0
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4304
4305
0
    while (height--)
4306
0
    {
4307
0
  dst = dst_line;
4308
0
  dst_line += dst_stride;
4309
0
  mask = mask_line;
4310
0
  mask_line += mask_stride;
4311
0
  w = width;
4312
4313
0
  while (w && ((uintptr_t)dst & 15))
4314
0
  {
4315
0
      uint8_t m = *mask++;
4316
0
      d = (uint32_t) *dst;
4317
4318
0
      *dst++ = (uint8_t) pack_1x128_32 (
4319
0
    _mm_adds_epu16 (
4320
0
        pix_multiply_1x128 (
4321
0
      xmm_alpha, unpack_32_1x128 (m)),
4322
0
        unpack_32_1x128 (d)));
4323
0
      w--;
4324
0
  }
4325
4326
0
  while (w >= 16)
4327
0
  {
4328
0
      xmm_mask = load_128_unaligned ((__m128i*)mask);
4329
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
4330
4331
0
      unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4332
0
      unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4333
4334
0
      pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4335
0
        &xmm_mask_lo, &xmm_mask_hi,
4336
0
        &xmm_mask_lo, &xmm_mask_hi);
4337
4338
0
      xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4339
0
      xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4340
4341
0
      save_128_aligned (
4342
0
    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4343
4344
0
      mask += 16;
4345
0
      dst += 16;
4346
0
      w -= 16;
4347
0
  }
4348
4349
0
  while (w)
4350
0
  {
4351
0
      uint8_t m = *mask++;
4352
0
      d = (uint32_t) *dst;
4353
4354
0
      *dst++ = (uint8_t) pack_1x128_32 (
4355
0
    _mm_adds_epu16 (
4356
0
        pix_multiply_1x128 (
4357
0
      xmm_alpha, unpack_32_1x128 (m)),
4358
0
        unpack_32_1x128 (d)));
4359
4360
0
      w--;
4361
0
  }
4362
0
    }
4363
4364
0
}
4365
4366
static void
4367
sse2_composite_add_n_8 (pixman_implementation_t *imp,
4368
      pixman_composite_info_t *info)
4369
0
{
4370
0
    PIXMAN_COMPOSITE_ARGS (info);
4371
0
    uint8_t     *dst_line, *dst;
4372
0
    int dst_stride;
4373
0
    int32_t w;
4374
0
    uint32_t src;
4375
4376
0
    __m128i xmm_src;
4377
4378
0
    PIXMAN_IMAGE_GET_LINE (
4379
0
  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4380
4381
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4382
4383
0
    src >>= 24;
4384
4385
0
    if (src == 0x00)
4386
0
  return;
4387
4388
0
    if (src == 0xff)
4389
0
    {
4390
0
  pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4391
0
         8, dest_x, dest_y, width, height, 0xff);
4392
4393
0
  return;
4394
0
    }
4395
4396
0
    src = (src << 24) | (src << 16) | (src << 8) | src;
4397
0
    xmm_src = _mm_set_epi32 (src, src, src, src);
4398
4399
0
    while (height--)
4400
0
    {
4401
0
  dst = dst_line;
4402
0
  dst_line += dst_stride;
4403
0
  w = width;
4404
4405
0
  while (w && ((uintptr_t)dst & 15))
4406
0
  {
4407
0
      *dst = (uint8_t)_mm_cvtsi128_si32 (
4408
0
    _mm_adds_epu8 (
4409
0
        xmm_src,
4410
0
        _mm_cvtsi32_si128 (*dst)));
4411
4412
0
      w--;
4413
0
      dst++;
4414
0
  }
4415
4416
0
  while (w >= 16)
4417
0
  {
4418
0
      save_128_aligned (
4419
0
    (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4420
4421
0
      dst += 16;
4422
0
      w -= 16;
4423
0
  }
4424
4425
0
  while (w)
4426
0
  {
4427
0
      *dst = (uint8_t)_mm_cvtsi128_si32 (
4428
0
    _mm_adds_epu8 (
4429
0
        xmm_src,
4430
0
        _mm_cvtsi32_si128 (*dst)));
4431
4432
0
      w--;
4433
0
      dst++;
4434
0
  }
4435
0
    }
4436
4437
0
}
4438
4439
static void
4440
sse2_composite_add_8_8 (pixman_implementation_t *imp,
4441
      pixman_composite_info_t *info)
4442
0
{
4443
0
    PIXMAN_COMPOSITE_ARGS (info);
4444
0
    uint8_t     *dst_line, *dst;
4445
0
    uint8_t     *src_line, *src;
4446
0
    int dst_stride, src_stride;
4447
0
    int32_t w;
4448
0
    uint16_t t;
4449
4450
0
    PIXMAN_IMAGE_GET_LINE (
4451
0
  src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4452
0
    PIXMAN_IMAGE_GET_LINE (
4453
0
  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4454
4455
0
    while (height--)
4456
0
    {
4457
0
  dst = dst_line;
4458
0
  src = src_line;
4459
4460
0
  dst_line += dst_stride;
4461
0
  src_line += src_stride;
4462
0
  w = width;
4463
4464
  /* Small head */
4465
0
  while (w && (uintptr_t)dst & 3)
4466
0
  {
4467
0
      t = (*dst) + (*src++);
4468
0
      *dst++ = t | (0 - (t >> 8));
4469
0
      w--;
4470
0
  }
4471
4472
0
  sse2_combine_add_u (imp, op,
4473
0
          (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4474
4475
  /* Small tail */
4476
0
  dst += w & ~3;
4477
0
  src += w & ~3;
4478
4479
0
  w &= 3;
4480
4481
0
  while (w)
4482
0
  {
4483
0
      t = (*dst) + (*src++);
4484
0
      *dst++ = t | (0 - (t >> 8));
4485
0
      w--;
4486
0
  }
4487
0
    }
4488
4489
0
}
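/* NOTE (editorial): the scalar head and tail use a branch-free saturating
 * add.  With t = *dst + *src (at most 510), t >> 8 is 1 exactly on
 * overflow, so 0 - (t >> 8) is either 0 or all-ones, and OR-ing it in
 * clamps the byte to 0xff.  Worked example: 200 + 100 = 300; 300 >> 8 = 1;
 * 300 | -1 truncates to 0xff.
 */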
4490
4491
static void
4492
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4493
                              pixman_composite_info_t *info)
4494
0
{
4495
0
    PIXMAN_COMPOSITE_ARGS (info);
4496
0
    uint32_t    *dst_line, *dst;
4497
0
    uint32_t    *src_line, *src;
4498
0
    int dst_stride, src_stride;
4499
4500
0
    PIXMAN_IMAGE_GET_LINE (
4501
0
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4502
0
    PIXMAN_IMAGE_GET_LINE (
4503
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4504
4505
0
    while (height--)
4506
0
    {
4507
0
  dst = dst_line;
4508
0
  dst_line += dst_stride;
4509
0
  src = src_line;
4510
0
  src_line += src_stride;
4511
4512
0
  sse2_combine_add_u (imp, op, dst, src, NULL, width);
4513
0
    }
4514
0
}
4515
4516
static void
4517
sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4518
         pixman_composite_info_t *info)
4519
0
{
4520
0
    PIXMAN_COMPOSITE_ARGS (info);
4521
0
    uint32_t *dst_line, *dst, src;
4522
0
    int dst_stride;
4523
4524
0
    __m128i xmm_src;
4525
4526
0
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4527
4528
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4529
0
    if (src == 0)
4530
0
  return;
4531
4532
0
    if (src == ~0)
4533
0
    {
4534
0
  pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4535
0
         dest_x, dest_y, width, height, ~0);
4536
4537
0
  return;
4538
0
    }
4539
4540
0
    xmm_src = _mm_set_epi32 (src, src, src, src);
4541
0
    while (height--)
4542
0
    {
4543
0
  int w = width;
4544
0
  uint32_t d;
4545
4546
0
  dst = dst_line;
4547
0
  dst_line += dst_stride;
4548
4549
0
  while (w && (uintptr_t)dst & 15)
4550
0
  {
4551
0
      d = *dst;
4552
0
      *dst++ =
4553
0
    _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4554
0
      w--;
4555
0
  }
4556
4557
0
  while (w >= 4)
4558
0
  {
4559
0
      save_128_aligned
4560
0
    ((__m128i*)dst,
4561
0
     _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4562
4563
0
      dst += 4;
4564
0
      w -= 4;
4565
0
  }
4566
4567
0
  while (w--)
4568
0
  {
4569
0
      d = *dst;
4570
0
      *dst++ =
4571
0
    _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4572
0
              _mm_cvtsi32_si128 (d)));
4573
0
  }
4574
0
    }
4575
0
}
4576
4577
static void
4578
sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4579
           pixman_composite_info_t *info)
4580
0
{
4581
0
    PIXMAN_COMPOSITE_ARGS (info);
4582
0
    uint32_t     *dst_line, *dst;
4583
0
    uint8_t     *mask_line, *mask;
4584
0
    int dst_stride, mask_stride;
4585
0
    int32_t w;
4586
0
    uint32_t src;
4587
4588
0
    __m128i xmm_src;
4589
4590
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4591
0
    if (src == 0)
4592
0
  return;
4593
0
    xmm_src = expand_pixel_32_1x128 (src);
4594
4595
0
    PIXMAN_IMAGE_GET_LINE (
4596
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4597
0
    PIXMAN_IMAGE_GET_LINE (
4598
0
  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4599
4600
0
    while (height--)
4601
0
    {
4602
0
  dst = dst_line;
4603
0
  dst_line += dst_stride;
4604
0
  mask = mask_line;
4605
0
  mask_line += mask_stride;
4606
0
  w = width;
4607
4608
0
  while (w && ((uintptr_t)dst & 15))
4609
0
  {
4610
0
      uint8_t m = *mask++;
4611
0
      if (m)
4612
0
      {
4613
0
    *dst = pack_1x128_32
4614
0
        (_mm_adds_epu16
4615
0
         (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4616
0
          unpack_32_1x128 (*dst)));
4617
0
      }
4618
0
      dst++;
4619
0
      w--;
4620
0
  }
4621
4622
0
  while (w >= 4)
4623
0
  {
4624
0
      uint32_t m;
4625
0
            memcpy(&m, mask, sizeof(uint32_t));
4626
4627
0
      if (m)
4628
0
      {
4629
0
    __m128i xmm_mask_lo, xmm_mask_hi;
4630
0
    __m128i xmm_dst_lo, xmm_dst_hi;
4631
4632
0
    __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4633
0
    __m128i xmm_mask =
4634
0
        _mm_unpacklo_epi8 (unpack_32_1x128(m),
4635
0
               _mm_setzero_si128 ());
4636
4637
0
    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4638
0
    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4639
4640
0
    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4641
0
          &xmm_mask_lo, &xmm_mask_hi);
4642
4643
0
    pix_multiply_2x128 (&xmm_src, &xmm_src,
4644
0
            &xmm_mask_lo, &xmm_mask_hi,
4645
0
            &xmm_mask_lo, &xmm_mask_hi);
4646
4647
0
    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4648
0
    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4649
4650
0
    save_128_aligned (
4651
0
        (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4652
0
      }
4653
4654
0
      w -= 4;
4655
0
      dst += 4;
4656
0
      mask += 4;
4657
0
  }
4658
4659
0
  while (w)
4660
0
  {
4661
0
      uint8_t m = *mask++;
4662
0
      if (m)
4663
0
      {
4664
0
    *dst = pack_1x128_32
4665
0
        (_mm_adds_epu16
4666
0
         (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4667
0
          unpack_32_1x128 (*dst)));
4668
0
      }
4669
0
      dst++;
4670
0
      w--;
4671
0
  }
4672
0
    }
4673
0
}
4674
4675
static pixman_bool_t
4676
sse2_blt (pixman_implementation_t *imp,
4677
          uint32_t *               src_bits,
4678
          uint32_t *               dst_bits,
4679
          int                      src_stride,
4680
          int                      dst_stride,
4681
          int                      src_bpp,
4682
          int                      dst_bpp,
4683
          int                      src_x,
4684
          int                      src_y,
4685
          int                      dest_x,
4686
          int                      dest_y,
4687
          int                      width,
4688
          int                      height)
4689
0
{
4690
0
    uint8_t *   src_bytes;
4691
0
    uint8_t *   dst_bytes;
4692
0
    int byte_width;
4693
4694
0
    if (src_bpp != dst_bpp)
4695
0
  return FALSE;
4696
4697
0
    if (src_bpp == 16)
4698
0
    {
4699
0
  src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4700
0
  dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4701
0
  src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4702
0
  dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4703
0
  byte_width = 2 * width;
4704
0
  src_stride *= 2;
4705
0
  dst_stride *= 2;
4706
0
    }
4707
0
    else if (src_bpp == 32)
4708
0
    {
4709
0
  src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4710
0
  dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4711
0
  src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4712
0
  dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4713
0
  byte_width = 4 * width;
4714
0
  src_stride *= 4;
4715
0
  dst_stride *= 4;
4716
0
    }
4717
0
    else
4718
0
    {
4719
0
  return FALSE;
4720
0
    }
4721
4722
0
    while (height--)
4723
0
    {
4724
0
  int w;
4725
0
  uint8_t *s = src_bytes;
4726
0
  uint8_t *d = dst_bytes;
4727
0
  src_bytes += src_stride;
4728
0
  dst_bytes += dst_stride;
4729
0
  w = byte_width;
4730
4731
0
  while (w >= 2 && ((uintptr_t)d & 3))
4732
0
  {
4733
0
            memmove(d, s, 2);
4734
0
      w -= 2;
4735
0
      s += 2;
4736
0
      d += 2;
4737
0
  }
4738
4739
0
  while (w >= 4 && ((uintptr_t)d & 15))
4740
0
  {
4741
0
            memmove(d, s, 4);
4742
4743
0
      w -= 4;
4744
0
      s += 4;
4745
0
      d += 4;
4746
0
  }
4747
4748
0
  while (w >= 64)
4749
0
  {
4750
0
      __m128i xmm0, xmm1, xmm2, xmm3;
4751
4752
0
      xmm0 = load_128_unaligned ((__m128i*)(s));
4753
0
      xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4754
0
      xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4755
0
      xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4756
4757
0
      save_128_aligned ((__m128i*)(d),    xmm0);
4758
0
      save_128_aligned ((__m128i*)(d + 16), xmm1);
4759
0
      save_128_aligned ((__m128i*)(d + 32), xmm2);
4760
0
      save_128_aligned ((__m128i*)(d + 48), xmm3);
4761
4762
0
      s += 64;
4763
0
      d += 64;
4764
0
      w -= 64;
4765
0
  }
4766
4767
0
  while (w >= 16)
4768
0
  {
4769
0
      save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4770
4771
0
      w -= 16;
4772
0
      d += 16;
4773
0
      s += 16;
4774
0
  }
4775
4776
0
  while (w >= 4)
4777
0
  {
4778
0
            memmove(d, s, 4);
4779
4780
0
      w -= 4;
4781
0
      s += 4;
4782
0
      d += 4;
4783
0
  }
4784
4785
0
  if (w >= 2)
4786
0
  {
4787
0
            memmove(d, s, 2);
4788
0
      w -= 2;
4789
0
      s += 2;
4790
0
      d += 2;
4791
0
  }
4792
0
    }
4793
4794
0
    return TRUE;
4795
0
}
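/* NOTE (editorial): sse2_blt converts both strides to bytes and copies each
 * row in stages: 2- and 4-byte moves until the destination is 16-byte
 * aligned, a 64-byte unrolled loop of unaligned loads and aligned stores,
 * then progressively smaller moves for the tail.  Only 16bpp and 32bpp
 * surfaces are handled; anything else returns FALSE so a generic path can
 * take over.
 */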
4796
4797
static void
4798
sse2_composite_copy_area (pixman_implementation_t *imp,
4799
                          pixman_composite_info_t *info)
4800
0
{
4801
0
    PIXMAN_COMPOSITE_ARGS (info);
4802
0
    sse2_blt (imp, src_image->bits.bits,
4803
0
        dest_image->bits.bits,
4804
0
        src_image->bits.rowstride,
4805
0
        dest_image->bits.rowstride,
4806
0
        PIXMAN_FORMAT_BPP (src_image->bits.format),
4807
0
        PIXMAN_FORMAT_BPP (dest_image->bits.format),
4808
0
        src_x, src_y, dest_x, dest_y, width, height);
4809
0
}
4810
4811
static void
4812
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4813
                                 pixman_composite_info_t *info)
4814
0
{
4815
0
    PIXMAN_COMPOSITE_ARGS (info);
4816
0
    uint32_t    *src, *src_line, s;
4817
0
    uint32_t    *dst, *dst_line, d;
4818
0
    uint8_t         *mask, *mask_line;
4819
0
    int src_stride, mask_stride, dst_stride;
4820
0
    int32_t w;
4821
0
    __m128i ms;
4822
4823
0
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4824
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4825
0
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4826
4827
0
    PIXMAN_IMAGE_GET_LINE (
4828
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4829
0
    PIXMAN_IMAGE_GET_LINE (
4830
0
  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4831
0
    PIXMAN_IMAGE_GET_LINE (
4832
0
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4833
4834
0
    while (height--)
4835
0
    {
4836
0
        src = src_line;
4837
0
        src_line += src_stride;
4838
0
        dst = dst_line;
4839
0
        dst_line += dst_stride;
4840
0
        mask = mask_line;
4841
0
        mask_line += mask_stride;
4842
4843
0
        w = width;
4844
4845
0
        while (w && (uintptr_t)dst & 15)
4846
0
        {
4847
0
            uint8_t m = *mask++;
4848
0
            s = 0xff000000 | *src++;
4849
0
            d = *dst;
4850
0
            ms = unpack_32_1x128 (s);
4851
4852
0
            if (m != 0xff)
4853
0
            {
4854
0
    __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4855
0
    __m128i md = unpack_32_1x128 (d);
4856
4857
0
                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4858
0
            }
4859
4860
0
            *dst++ = pack_1x128_32 (ms);
4861
0
            w--;
4862
0
        }
4863
4864
0
        while (w >= 4)
4865
0
        {
4866
0
            uint32_t m;
4867
0
            memcpy(&m, mask, sizeof(uint32_t));
4868
0
            xmm_src = _mm_or_si128 (
4869
0
    load_128_unaligned ((__m128i*)src), mask_ff000000);
4870
4871
0
            if (m == 0xffffffff)
4872
0
            {
4873
0
                save_128_aligned ((__m128i*)dst, xmm_src);
4874
0
            }
4875
0
            else
4876
0
            {
4877
0
                xmm_dst = load_128_aligned ((__m128i*)dst);
4878
4879
0
                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4880
4881
0
                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4882
0
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4883
0
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4884
4885
0
                expand_alpha_rev_2x128 (
4886
0
        xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4887
4888
0
                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4889
0
             &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4890
0
             &xmm_dst_lo, &xmm_dst_hi);
4891
4892
0
                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4893
0
            }
4894
4895
0
            src += 4;
4896
0
            dst += 4;
4897
0
            mask += 4;
4898
0
            w -= 4;
4899
0
        }
4900
4901
0
        while (w)
4902
0
        {
4903
0
            uint8_t m = *mask++;
4904
4905
0
            if (m)
4906
0
            {
4907
0
                s = 0xff000000 | *src;
4908
4909
0
                if (m == 0xff)
4910
0
                {
4911
0
                    *dst = s;
4912
0
                }
4913
0
                else
4914
0
                {
4915
0
        __m128i ma, md, ms;
4916
4917
0
                    d = *dst;
4918
4919
0
        ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4920
0
        md = unpack_32_1x128 (d);
4921
0
        ms = unpack_32_1x128 (s);
4922
4923
0
                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4924
0
                }
4925
4926
0
            }
4927
4928
0
            src++;
4929
0
            dst++;
4930
0
            w--;
4931
0
        }
4932
0
    }
4933
4934
0
}
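/* NOTE (editorial): because the x888 source carries no alpha, each pixel is
 * forced opaque with (0xff000000 | *src), and in_over is invoked with
 * mask_00ff (0xff in every lane) as the source-alpha term, so only the a8
 * mask modulates the blend.
 */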
4935
4936
static void
4937
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4938
                                 pixman_composite_info_t *info)
4939
4
{
4940
4
    PIXMAN_COMPOSITE_ARGS (info);
4941
4
    uint32_t    *src, *src_line, s;
4942
4
    uint32_t    *dst, *dst_line, d;
4943
4
    uint8_t         *mask, *mask_line;
4944
4
    int src_stride, mask_stride, dst_stride;
4945
4
    int32_t w;
4946
4947
4
    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4948
4
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4949
4
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4950
4951
4
    PIXMAN_IMAGE_GET_LINE (
4952
4
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4953
4
    PIXMAN_IMAGE_GET_LINE (
4954
4
  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4955
4
    PIXMAN_IMAGE_GET_LINE (
4956
4
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4957
4958
39
    while (height--)
4959
35
    {
4960
35
        src = src_line;
4961
35
        src_line += src_stride;
4962
35
        dst = dst_line;
4963
35
        dst_line += dst_stride;
4964
35
        mask = mask_line;
4965
35
        mask_line += mask_stride;
4966
4967
35
        w = width;
4968
4969
67
        while (w && (uintptr_t)dst & 15)
4970
32
        {
4971
32
      uint32_t sa;
4972
32
            uint8_t m = *mask++;
4973
4974
32
            s = *src++;
4975
32
            d = *dst;
4976
4977
32
      sa = s >> 24;
4978
4979
32
      if (m)
4980
32
      {
4981
32
    if (sa == 0xff && m == 0xff)
4982
0
    {
4983
0
        *dst = s;
4984
0
    }
4985
32
    else
4986
32
    {
4987
32
        __m128i ms, md, ma, msa;
4988
4989
32
        ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
4990
32
        ms = unpack_32_1x128 (s);
4991
32
        md = unpack_32_1x128 (d);
4992
4993
32
        msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
4994
4995
32
        *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
4996
32
    }
4997
32
      }
4998
4999
32
      dst++;
5000
32
            w--;
5001
32
        }
5002
5003
45
        while (w >= 4)
5004
10
        {
5005
10
            uint32_t m;
5006
10
            memcpy(&m, mask, sizeof(uint32_t));
5007
5008
10
      if (m)
5009
10
      {
5010
10
    xmm_src = load_128_unaligned ((__m128i*)src);
5011
5012
10
    if (m == 0xffffffff && is_opaque (xmm_src))
5013
0
    {
5014
0
        save_128_aligned ((__m128i *)dst, xmm_src);
5015
0
    }
5016
10
    else
5017
10
    {
5018
10
        xmm_dst = load_128_aligned ((__m128i *)dst);
5019
5020
10
        xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5021
5022
10
        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5023
10
        unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5024
10
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5025
5026
10
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5027
10
        expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5028
5029
10
        in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5030
10
           &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5031
5032
10
        save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5033
10
    }
5034
10
      }
5035
5036
10
            src += 4;
5037
10
            dst += 4;
5038
10
            mask += 4;
5039
10
            w -= 4;
5040
10
        }
5041
5042
42
        while (w)
5043
7
        {
5044
7
      uint32_t sa;
5045
7
            uint8_t m = *mask++;
5046
5047
7
            s = *src++;
5048
7
            d = *dst;
5049
5050
7
      sa = s >> 24;
5051
5052
7
      if (m)
5053
7
      {
5054
7
    if (sa == 0xff && m == 0xff)
5055
0
    {
5056
0
        *dst = s;
5057
0
    }
5058
7
    else
5059
7
    {
5060
7
        __m128i ms, md, ma, msa;
5061
5062
7
        ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5063
7
        ms = unpack_32_1x128 (s);
5064
7
        md = unpack_32_1x128 (d);
5065
5066
7
        msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5067
5068
7
        *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5069
7
    }
5070
7
      }
5071
5072
7
      dst++;
5073
7
            w--;
5074
7
        }
5075
35
    }
5076
5077
4
}
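/* NOTE (editorial): in_over implements dest = (src IN mask) OVER dest.  Per
 * channel that is roughly
 *
 *     tmp = s * m / 255;
 *     d   = tmp + d * (255 - sa * m / 255) / 255;
 *
 * which is why both the unpacked source and its expanded alpha (msa) are
 * modulated by the expanded mask before the OVER step.
 */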
5078
5079
static void
5080
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5081
            pixman_composite_info_t *info)
5082
0
{
5083
0
    PIXMAN_COMPOSITE_ARGS (info);
5084
0
    uint32_t src;
5085
0
    uint32_t    *dst_line, *dst;
5086
0
    __m128i xmm_src;
5087
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5088
0
    __m128i xmm_dsta_hi, xmm_dsta_lo;
5089
0
    int dst_stride;
5090
0
    int32_t w;
5091
5092
0
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5093
5094
0
    if (src == 0)
5095
0
  return;
5096
5097
0
    PIXMAN_IMAGE_GET_LINE (
5098
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5099
5100
0
    xmm_src = expand_pixel_32_1x128 (src);
5101
5102
0
    while (height--)
5103
0
    {
5104
0
  dst = dst_line;
5105
5106
0
  dst_line += dst_stride;
5107
0
  w = width;
5108
5109
0
  while (w && (uintptr_t)dst & 15)
5110
0
  {
5111
0
      __m128i vd;
5112
5113
0
      vd = unpack_32_1x128 (*dst);
5114
5115
0
      *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5116
0
                xmm_src));
5117
0
      w--;
5118
0
      dst++;
5119
0
  }
5120
5121
0
  while (w >= 4)
5122
0
  {
5123
0
      __m128i tmp_lo, tmp_hi;
5124
5125
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
5126
5127
0
      unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5128
0
      expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5129
5130
0
      tmp_lo = xmm_src;
5131
0
      tmp_hi = xmm_src;
5132
5133
0
      over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5134
0
      &xmm_dsta_lo, &xmm_dsta_hi,
5135
0
      &tmp_lo, &tmp_hi);
5136
5137
0
      save_128_aligned (
5138
0
    (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5139
5140
0
      w -= 4;
5141
0
      dst += 4;
5142
0
  }
5143
5144
0
  while (w)
5145
0
  {
5146
0
      __m128i vd;
5147
5148
0
      vd = unpack_32_1x128 (*dst);
5149
5150
0
      *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5151
0
                xmm_src));
5152
0
      w--;
5153
0
      dst++;
5154
0
  }
5155
5156
0
    }
5157
5158
0
}
5159
5160
static void
5161
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5162
            pixman_composite_info_t *info)
5163
0
{
5164
0
    PIXMAN_COMPOSITE_ARGS (info);
5165
0
    uint32_t    *src, *src_line, s;
5166
0
    uint32_t    *dst, *dst_line, d;
5167
0
    uint32_t    *mask, *mask_line;
5168
0
    uint32_t    m;
5169
0
    int src_stride, mask_stride, dst_stride;
5170
0
    int32_t w;
5171
5172
0
    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5173
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5174
0
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5175
5176
0
    PIXMAN_IMAGE_GET_LINE (
5177
0
  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5178
0
    PIXMAN_IMAGE_GET_LINE (
5179
0
  mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5180
0
    PIXMAN_IMAGE_GET_LINE (
5181
0
  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5182
5183
0
    while (height--)
5184
0
    {
5185
0
        src = src_line;
5186
0
        src_line += src_stride;
5187
0
        dst = dst_line;
5188
0
        dst_line += dst_stride;
5189
0
        mask = mask_line;
5190
0
        mask_line += mask_stride;
5191
5192
0
        w = width;
5193
5194
0
        while (w && (uintptr_t)dst & 15)
5195
0
        {
5196
0
      uint32_t sa;
5197
5198
0
            s = *src++;
5199
0
            m = (*mask++) >> 24;
5200
0
            d = *dst;
5201
5202
0
      sa = s >> 24;
5203
5204
0
      if (m)
5205
0
      {
5206
0
    if (sa == 0xff && m == 0xff)
5207
0
    {
5208
0
        *dst = s;
5209
0
    }
5210
0
    else
5211
0
    {
5212
0
        __m128i ms, md, ma, msa;
5213
5214
0
        ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5215
0
        ms = unpack_32_1x128 (s);
5216
0
        md = unpack_32_1x128 (d);
5217
5218
0
        msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5219
5220
0
        *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5221
0
    }
5222
0
      }
5223
5224
0
      dst++;
5225
0
            w--;
5226
0
        }
5227
5228
0
        while (w >= 4)
5229
0
        {
5230
0
      xmm_mask = load_128_unaligned ((__m128i*)mask);
5231
5232
0
      if (!is_transparent (xmm_mask))
5233
0
      {
5234
0
    xmm_src = load_128_unaligned ((__m128i*)src);
5235
5236
0
    if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5237
0
    {
5238
0
        save_128_aligned ((__m128i *)dst, xmm_src);
5239
0
    }
5240
0
    else
5241
0
    {
5242
0
        xmm_dst = load_128_aligned ((__m128i *)dst);
5243
5244
0
        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5245
0
        unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5246
0
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5247
5248
0
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5249
0
        expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5250
5251
0
        in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5252
0
           &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5253
5254
0
        save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5255
0
    }
5256
0
      }
5257
5258
0
            src += 4;
5259
0
            dst += 4;
5260
0
            mask += 4;
5261
0
            w -= 4;
5262
0
        }
5263
5264
0
        while (w)
5265
0
        {
5266
0
      uint32_t sa;
5267
5268
0
            s = *src++;
5269
0
            m = (*mask++) >> 24;
5270
0
            d = *dst;
5271
5272
0
      sa = s >> 24;
5273
5274
0
      if (m)
5275
0
      {
5276
0
    if (sa == 0xff && m == 0xff)
5277
0
    {
5278
0
        *dst = s;
5279
0
    }
5280
0
    else
5281
0
    {
5282
0
        __m128i ms, md, ma, msa;
5283
5284
0
        ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5285
0
        ms = unpack_32_1x128 (s);
5286
0
        md = unpack_32_1x128 (d);
5287
5288
0
        msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5289
5290
0
        *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5291
0
    }
5292
0
      }
5293
5294
0
      dst++;
5295
0
            w--;
5296
0
        }
5297
0
    }
5298
5299
0
}
5300
5301
/* A variant of 'sse2_combine_over_u' with minor tweaks */
5302
static force_inline void
5303
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5304
                                             const uint32_t* ps,
5305
                                             int32_t         w,
5306
                                             pixman_fixed_t  vx,
5307
                                             pixman_fixed_t  unit_x,
5308
                                             pixman_fixed_t  src_width_fixed,
5309
                                             pixman_bool_t   fully_transparent_src)
5310
0
{
5311
0
    uint32_t s, d;
5312
0
    const uint32_t* pm = NULL;
5313
5314
0
    __m128i xmm_dst_lo, xmm_dst_hi;
5315
0
    __m128i xmm_src_lo, xmm_src_hi;
5316
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
5317
5318
0
    if (fully_transparent_src)
5319
0
  return;
5320
5321
    /* Align dst on a 16-byte boundary */
5322
0
    while (w && ((uintptr_t)pd & 15))
5323
0
    {
5324
0
  d = *pd;
5325
0
  s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5326
0
  vx += unit_x;
5327
0
  while (vx >= 0)
5328
0
      vx -= src_width_fixed;
5329
5330
0
  *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5331
0
  if (pm)
5332
0
      pm++;
5333
0
  w--;
5334
0
    }
5335
5336
0
    while (w >= 4)
5337
0
    {
5338
0
  __m128i tmp;
5339
0
  uint32_t tmp1, tmp2, tmp3, tmp4;
5340
5341
0
  tmp1 = *(ps + pixman_fixed_to_int (vx));
5342
0
  vx += unit_x;
5343
0
  while (vx >= 0)
5344
0
      vx -= src_width_fixed;
5345
0
  tmp2 = *(ps + pixman_fixed_to_int (vx));
5346
0
  vx += unit_x;
5347
0
  while (vx >= 0)
5348
0
      vx -= src_width_fixed;
5349
0
  tmp3 = *(ps + pixman_fixed_to_int (vx));
5350
0
  vx += unit_x;
5351
0
  while (vx >= 0)
5352
0
      vx -= src_width_fixed;
5353
0
  tmp4 = *(ps + pixman_fixed_to_int (vx));
5354
0
  vx += unit_x;
5355
0
  while (vx >= 0)
5356
0
      vx -= src_width_fixed;
5357
5358
0
  tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5359
5360
0
  xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5361
5362
0
  if (is_opaque (xmm_src_hi))
5363
0
  {
5364
0
      save_128_aligned ((__m128i*)pd, xmm_src_hi);
5365
0
  }
5366
0
  else if (!is_zero (xmm_src_hi))
5367
0
  {
5368
0
      xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5369
5370
0
      unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5371
0
      unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5372
5373
0
      expand_alpha_2x128 (
5374
0
    xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5375
5376
0
      over_2x128 (&xmm_src_lo, &xmm_src_hi,
5377
0
      &xmm_alpha_lo, &xmm_alpha_hi,
5378
0
      &xmm_dst_lo, &xmm_dst_hi);
5379
5380
      /* rebuild the 4 pixel data and save */
5381
0
      save_128_aligned ((__m128i*)pd,
5382
0
            pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5383
0
  }
5384
5385
0
  w -= 4;
5386
0
  pd += 4;
5387
0
  if (pm)
5388
0
      pm += 4;
5389
0
    }
5390
5391
0
    while (w)
5392
0
    {
5393
0
  d = *pd;
5394
0
  s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5395
0
  vx += unit_x;
5396
0
  while (vx >= 0)
5397
0
      vx -= src_width_fixed;
5398
5399
0
  *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5400
0
  if (pm)
5401
0
      pm++;
5402
5403
0
  w--;
5404
0
    }
5405
0
}
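/* NOTE (editorial): vx walks the source in 16.16 fixed point; every tap
 * advances it by unit_x, and the `while (vx >= 0) vx -= src_width_fixed;`
 * loop wraps it back into one source-width period (the mainloop macros
 * below appear to bias vx so the wrap can test against zero).  The 4-pixel
 * body gathers four taps into one register and reuses the combine-over
 * fast paths.
 */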
5406
5407
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5408
           scaled_nearest_scanline_sse2_8888_8888_OVER,
5409
           uint32_t, uint32_t, COVER)
5410
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5411
           scaled_nearest_scanline_sse2_8888_8888_OVER,
5412
           uint32_t, uint32_t, NONE)
5413
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5414
           scaled_nearest_scanline_sse2_8888_8888_OVER,
5415
           uint32_t, uint32_t, PAD)
5416
FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5417
           scaled_nearest_scanline_sse2_8888_8888_OVER,
5418
           uint32_t, uint32_t, NORMAL)
5419
5420
static force_inline void
5421
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5422
                 uint32_t *       dst,
5423
                 const uint32_t * src,
5424
                 int32_t          w,
5425
                 pixman_fixed_t   vx,
5426
                 pixman_fixed_t   unit_x,
5427
                 pixman_fixed_t   src_width_fixed,
5428
                 pixman_bool_t    zero_src)
5429
0
{
5430
0
    __m128i xmm_mask;
5431
0
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5432
0
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5433
0
    __m128i xmm_alpha_lo, xmm_alpha_hi;
5434
5435
0
    if (zero_src || (*mask >> 24) == 0)
5436
0
  return;
5437
5438
0
    xmm_mask = create_mask_16_128 (*mask >> 24);
5439
5440
0
    while (w && (uintptr_t)dst & 15)
5441
0
    {
5442
0
  uint32_t s = *(src + pixman_fixed_to_int (vx));
5443
0
  vx += unit_x;
5444
0
  while (vx >= 0)
5445
0
      vx -= src_width_fixed;
5446
5447
0
  if (s)
5448
0
  {
5449
0
      uint32_t d = *dst;
5450
5451
0
      __m128i ms = unpack_32_1x128 (s);
5452
0
      __m128i alpha     = expand_alpha_1x128 (ms);
5453
0
      __m128i dest      = xmm_mask;
5454
0
      __m128i alpha_dst = unpack_32_1x128 (d);
5455
5456
0
      *dst = pack_1x128_32 (
5457
0
    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5458
0
  }
5459
0
  dst++;
5460
0
  w--;
5461
0
    }
5462
5463
0
    while (w >= 4)
5464
0
    {
5465
0
  uint32_t tmp1, tmp2, tmp3, tmp4;
5466
5467
0
  tmp1 = *(src + pixman_fixed_to_int (vx));
5468
0
  vx += unit_x;
5469
0
  while (vx >= 0)
5470
0
      vx -= src_width_fixed;
5471
0
  tmp2 = *(src + pixman_fixed_to_int (vx));
5472
0
  vx += unit_x;
5473
0
  while (vx >= 0)
5474
0
      vx -= src_width_fixed;
5475
0
  tmp3 = *(src + pixman_fixed_to_int (vx));
5476
0
  vx += unit_x;
5477
0
  while (vx >= 0)
5478
0
      vx -= src_width_fixed;
5479
0
  tmp4 = *(src + pixman_fixed_to_int (vx));
5480
0
  vx += unit_x;
5481
0
  while (vx >= 0)
5482
0
      vx -= src_width_fixed;
5483
5484
0
  xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5485
5486
0
  if (!is_zero (xmm_src))
5487
0
  {
5488
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
5489
5490
0
      unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5491
0
      unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5492
0
      expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5493
0
              &xmm_alpha_lo, &xmm_alpha_hi);
5494
5495
0
      in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5496
0
         &xmm_alpha_lo, &xmm_alpha_hi,
5497
0
         &xmm_mask, &xmm_mask,
5498
0
         &xmm_dst_lo, &xmm_dst_hi);
5499
5500
0
      save_128_aligned (
5501
0
    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5502
0
  }
5503
5504
0
  dst += 4;
5505
0
  w -= 4;
5506
0
    }
5507
5508
0
    while (w)
5509
0
    {
5510
0
  uint32_t s = *(src + pixman_fixed_to_int (vx));
5511
0
  vx += unit_x;
5512
0
  while (vx >= 0)
5513
0
      vx -= src_width_fixed;
5514
5515
0
  if (s)
5516
0
  {
5517
0
      uint32_t d = *dst;
5518
5519
0
      __m128i ms = unpack_32_1x128 (s);
5520
0
      __m128i alpha = expand_alpha_1x128 (ms);
5521
0
      __m128i mask  = xmm_mask;
5522
0
      __m128i dest  = unpack_32_1x128 (d);
5523
5524
0
      *dst = pack_1x128_32 (
5525
0
    in_over_1x128 (&ms, &alpha, &mask, &dest));
5526
0
  }
5527
5528
0
  dst++;
5529
0
  w--;
5530
0
    }
5531
5532
0
}
5533
5534
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5535
            scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5536
            uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5537
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5538
            scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5539
            uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5540
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5541
            scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5542
            uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5543
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5544
            scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5545
            uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5546
5547
#if PSHUFD_IS_FAST
5548
5549
/***********************************************************************************/
5550
5551
# define BILINEAR_DECLARE_VARIABLES           \
5552
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);  \
5553
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);  \
5554
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);    \
5555
    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,  \
5556
             unit_x, -unit_x, unit_x, -unit_x); \
5557
    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,   \
5558
             unit_x * 4, -unit_x * 4,   \
5559
             unit_x * 4, -unit_x * 4,   \
5560
             unit_x * 4, -unit_x * 4);    \
5561
    const __m128i xmm_zero = _mm_setzero_si128 ();        \
5562
    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
5563
           vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
5564
           vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
5565
           vx + unit_x * 0, -(vx + 1) - unit_x * 0);  \
5566
    __m128i xmm_wh_state;
5567
5568
#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)      \
5569
do {                    \
5570
    int phase = phase_;               \
5571
    __m128i xmm_wh, xmm_a, xmm_b;           \
5572
    /* fetch 2x2 pixel block into sse2 registers */       \
5573
    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);   \
5574
    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);    \
5575
    vx += unit_x;               \
5576
    /* vertical interpolation */            \
5577
    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5578
    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5579
    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);           \
5580
    /* calculate horizontal weights */            \
5581
    if (phase <= 0)               \
5582
    {                   \
5583
  xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,    \
5584
          16 - BILINEAR_INTERPOLATION_BITS)); \
5585
  xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4);   \
5586
  phase = 0;                \
5587
    }                   \
5588
    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,  \
5589
                 phase, phase));  \
5590
    /* horizontal interpolation */            \
5591
    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (   \
5592
    xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);    \
5593
    /* shift the result */              \
5594
    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);    \
5595
} while (0)
5596
5597
#else /************************************************************************/
5598
5599
# define BILINEAR_DECLARE_VARIABLES           \
5600
0
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);  \
5601
0
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);  \
5602
0
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);    \
5603
0
    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,  \
5604
0
            unit_x, -unit_x, unit_x, -unit_x);  \
5605
0
    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,   \
5606
0
             unit_x * 4, -unit_x * 4,   \
5607
0
             unit_x * 4, -unit_x * 4,   \
5608
0
             unit_x * 4, -unit_x * 4);    \
5609
0
    const __m128i xmm_zero = _mm_setzero_si128 ();        \
5610
0
    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),    \
5611
0
           vx, -(vx + 1), vx, -(vx + 1))
5612
5613
0
#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)     \
5614
0
do {                   \
5615
0
    __m128i xmm_wh, xmm_a, xmm_b;           \
5616
0
    /* fetch 2x2 pixel block into sse2 registers */       \
5617
0
    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);   \
5618
0
    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);    \
5619
0
    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */    \
5620
0
    vx += unit_x;               \
5621
0
    /* vertical interpolation */            \
5622
0
    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5623
0
    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5624
0
    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);         \
5625
0
    /* calculate horizontal weights */            \
5626
0
    xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,      \
5627
0
          16 - BILINEAR_INTERPOLATION_BITS)); \
5628
0
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);         \
5629
0
    /* horizontal interpolation */            \
5630
0
    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
5631
0
    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);   \
5632
0
    /* shift the result */              \
5633
0
    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);    \
5634
0
} while (0)
5635
5636
/***********************************************************************************/
5637
5638
#endif
5639
5640
0
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)          \
5641
0
do {                   \
5642
0
  __m128i xmm_pix;              \
5643
0
  BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);     \
5644
0
  xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);       \
5645
0
  xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);        \
5646
0
  pix = _mm_cvtsi128_si32 (xmm_pix);          \
5647
0
} while(0)
5648
5649
0
#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix)          \
5650
0
do {                   \
5651
0
  __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;       \
5652
0
  BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);     \
5653
0
  BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);     \
5654
0
  BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);     \
5655
0
  BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);     \
5656
0
  xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);      \
5657
0
  xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);      \
5658
0
  pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);        \
5659
0
} while(0)
5660
5661
0
#define BILINEAR_SKIP_ONE_PIXEL()           \
5662
0
do {                   \
5663
0
    vx += unit_x;               \
5664
0
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);         \
5665
0
} while(0)
5666
5667
0
#define BILINEAR_SKIP_FOUR_PIXELS()           \
5668
0
do {                   \
5669
0
    vx += unit_x * 4;               \
5670
0
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4);         \
5671
0
} while(0)
5672
5673
/***********************************************************************************/
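The macros above implement a two-pass fixed-point bilinear filter: a vertical blend of the two source rows by wt/wb, then a horizontal blend by weights derived from the fractional bits of vx. As a reference, here is a scalar model of the same arithmetic (a sketch only, not pixman code; bilinear_ref is a hypothetical name, vx is assumed non-negative 16.16 fixed point, and wt + wb == 1 << BILINEAR_INTERPOLATION_BITS, the constant from pixman-private.h):

    static uint32_t
    bilinear_ref (const uint32_t *src_top, const uint32_t *src_bottom,
                  int32_t vx, int wt, int wb)
    {
        int x  = vx >> 16;                                      /* integer column */
        int wr = (vx >> (16 - BILINEAR_INTERPOLATION_BITS)) &
                     ((1 << BILINEAR_INTERPOLATION_BITS) - 1);  /* right weight */
        int wl = (1 << BILINEAR_INTERPOLATION_BITS) - wr;       /* left weight */
        uint32_t out = 0;
        int c;

        for (c = 0; c < 32; c += 8)
        {
            /* vertical pass: blend top and bottom rows of each column */
            uint32_t l = ((src_top[x]        >> c) & 0xff) * wt +
                         ((src_bottom[x]     >> c) & 0xff) * wb;
            uint32_t r = ((src_top[x + 1]    >> c) & 0xff) * wt +
                         ((src_bottom[x + 1] >> c) & 0xff) * wb;
            /* horizontal pass; one shift removes both weight scales */
            out |= ((l * wl + r * wr) >> (BILINEAR_INTERPOLATION_BITS * 2)) << c;
        }
        return out;
    }

The negated copies of vx in xmm_x are how the SSE2 code gets both weights from one shift: -(vx + 1) truncated to 16 bits equals 0xffff - (vx & 0xffff), so shifting it right by 16 - BILINEAR_INTERPOLATION_BITS and adding the 1 from xmm_addc yields exactly wl = (1 << BILINEAR_INTERPOLATION_BITS) - wr.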
5674
5675
static force_inline void
5676
scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
5677
               const uint32_t * mask,
5678
               const uint32_t * src_top,
5679
               const uint32_t * src_bottom,
5680
               int32_t          w,
5681
               int              wt,
5682
               int              wb,
5683
               pixman_fixed_t   vx_,
5684
               pixman_fixed_t   unit_x_,
5685
               pixman_fixed_t   max_vx,
5686
               pixman_bool_t    zero_src)
5687
0
{
5688
0
    intptr_t vx = vx_;
5689
0
    intptr_t unit_x = unit_x_;
5690
0
    BILINEAR_DECLARE_VARIABLES;
5691
0
    uint32_t pix1, pix2;
5692
5693
0
    while (w && ((uintptr_t)dst & 15))
5694
0
    {
5695
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5696
0
  *dst++ = pix1;
5697
0
  w--;
5698
0
    }
5699
5700
0
    while ((w -= 4) >= 0) {
5701
0
  __m128i xmm_src;
5702
0
  BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5703
0
  _mm_store_si128 ((__m128i *)dst, xmm_src);
5704
0
  dst += 4;
5705
0
    }
5706
5707
0
    if (w & 2)
5708
0
    {
5709
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5710
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5711
0
  *dst++ = pix1;
5712
0
  *dst++ = pix2;
5713
0
    }
5714
5715
0
    if (w & 1)
5716
0
    {
5717
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5718
0
  *dst = pix1;
5719
0
    }
5720
5721
0
}
5722
5723
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5724
             scaled_bilinear_scanline_sse2_8888_8888_SRC,
5725
             uint32_t, uint32_t, uint32_t,
5726
             COVER, FLAG_NONE)
5727
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5728
             scaled_bilinear_scanline_sse2_8888_8888_SRC,
5729
             uint32_t, uint32_t, uint32_t,
5730
             PAD, FLAG_NONE)
5731
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5732
             scaled_bilinear_scanline_sse2_8888_8888_SRC,
5733
             uint32_t, uint32_t, uint32_t,
5734
             NONE, FLAG_NONE)
5735
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5736
             scaled_bilinear_scanline_sse2_8888_8888_SRC,
5737
             uint32_t, uint32_t, uint32_t,
5738
             NORMAL, FLAG_NONE)
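The scanline writer above uses the file's standard head/body/tail shape: scalar stores until dst reaches 16-byte alignment, aligned 4-pixel SIMD stores in the body, then the leftovers. A minimal sketch of that shape with a hypothetical fill_u32 helper (not pixman API; only <emmintrin.h> and <stdint.h> are assumed):

    #include <emmintrin.h>
    #include <stdint.h>

    static void
    fill_u32 (uint32_t *dst, uint32_t value, int w)
    {
        __m128i v = _mm_set1_epi32 (value);

        while (w && ((uintptr_t)dst & 15))  /* head: scalar until aligned */
        {
            *dst++ = value;
            w--;
        }
        while (w >= 4)                      /* body: one aligned store per 4 pixels */
        {
            _mm_store_si128 ((__m128i *)dst, v);
            dst += 4;
            w -= 4;
        }
        while (w)                           /* tail: 0..3 leftover pixels */
        {
            *dst++ = value;
            w--;
        }
    }

The scanline fuses the body's decrement into its condition, while ((w -= 4) >= 0), which leaves w negative on exit; in two's complement the low two bits of that negative value still equal the original w mod 4, so the w & 2 and w & 1 tails cover exactly the remaining pixels.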
5739
5740
static force_inline void
5741
scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
5742
               const uint32_t * mask,
5743
               const uint32_t * src_top,
5744
               const uint32_t * src_bottom,
5745
               int32_t          w,
5746
               int              wt,
5747
               int              wb,
5748
               pixman_fixed_t   vx_,
5749
               pixman_fixed_t   unit_x_,
5750
               pixman_fixed_t   max_vx,
5751
               pixman_bool_t    zero_src)
5752
0
{
5753
0
    intptr_t vx = vx_;
5754
0
    intptr_t unit_x = unit_x_;
5755
0
    BILINEAR_DECLARE_VARIABLES;
5756
0
    uint32_t pix1, pix2;
5757
5758
0
    while (w && ((uintptr_t)dst & 15))
5759
0
    {
5760
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5761
0
  *dst++ = pix1 | 0xFF000000;
5762
0
  w--;
5763
0
    }
5764
5765
0
    while ((w -= 4) >= 0) {
5766
0
  __m128i xmm_src;
5767
0
  BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5768
0
  _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
5769
0
  dst += 4;
5770
0
    }
5771
5772
0
    if (w & 2)
5773
0
    {
5774
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5775
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5776
0
  *dst++ = pix1 | 0xFF000000;
5777
0
  *dst++ = pix2 | 0xFF000000;
5778
0
    }
5779
5780
0
    if (w & 1)
5781
0
    {
5782
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5783
0
  *dst = pix1 | 0xFF000000;
5784
0
    }
5785
0
}
5786
5787
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
5788
             scaled_bilinear_scanline_sse2_x888_8888_SRC,
5789
             uint32_t, uint32_t, uint32_t,
5790
             COVER, FLAG_NONE)
5791
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
5792
             scaled_bilinear_scanline_sse2_x888_8888_SRC,
5793
             uint32_t, uint32_t, uint32_t,
5794
             PAD, FLAG_NONE)
5795
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
5796
             scaled_bilinear_scanline_sse2_x888_8888_SRC,
5797
             uint32_t, uint32_t, uint32_t,
5798
             NORMAL, FLAG_NONE)
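This x888 variant differs from the 8888 SRC scanline above only in forcing destination alpha opaque: the scalar stores OR each pixel with 0xFF000000 and the vector path ORs mask_ff000000 into the whole register, so an x8r8g8b8 source is emitted as valid a8r8g8b8.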
5799
5800
static force_inline void
5801
scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
5802
                const uint32_t * mask,
5803
                const uint32_t * src_top,
5804
                const uint32_t * src_bottom,
5805
                int32_t          w,
5806
                int              wt,
5807
                int              wb,
5808
                pixman_fixed_t   vx_,
5809
                pixman_fixed_t   unit_x_,
5810
                pixman_fixed_t   max_vx,
5811
                pixman_bool_t    zero_src)
5812
0
{
5813
0
    intptr_t vx = vx_;
5814
0
    intptr_t unit_x = unit_x_;
5815
0
    BILINEAR_DECLARE_VARIABLES;
5816
0
    uint32_t pix1, pix2;
5817
5818
0
    while (w && ((uintptr_t)dst & 15))
5819
0
    {
5820
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5821
5822
0
  if (pix1)
5823
0
  {
5824
0
      pix2 = *dst;
5825
0
      *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5826
0
  }
5827
5828
0
  w--;
5829
0
  dst++;
5830
0
    }
5831
5832
0
    while (w  >= 4)
5833
0
    {
5834
0
  __m128i xmm_src;
5835
0
  __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5836
0
  __m128i xmm_alpha_hi, xmm_alpha_lo;
5837
5838
0
  BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5839
5840
0
  if (!is_zero (xmm_src))
5841
0
  {
5842
0
      if (is_opaque (xmm_src))
5843
0
      {
5844
0
    save_128_aligned ((__m128i *)dst, xmm_src);
5845
0
      }
5846
0
      else
5847
0
      {
5848
0
    __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5849
5850
0
    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5851
0
    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5852
5853
0
    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5854
0
    over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5855
0
          &xmm_dst_lo, &xmm_dst_hi);
5856
5857
0
    save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5858
0
      }
5859
0
  }
5860
5861
0
  w -= 4;
5862
0
  dst += 4;
5863
0
    }
5864
5865
0
    while (w)
5866
0
    {
5867
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5868
5869
0
  if (pix1)
5870
0
  {
5871
0
      pix2 = *dst;
5872
0
      *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5873
0
  }
5874
5875
0
  w--;
5876
0
  dst++;
5877
0
    }
5878
0
}
5879
5880
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5881
             scaled_bilinear_scanline_sse2_8888_8888_OVER,
5882
             uint32_t, uint32_t, uint32_t,
5883
             COVER, FLAG_NONE)
5884
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5885
             scaled_bilinear_scanline_sse2_8888_8888_OVER,
5886
             uint32_t, uint32_t, uint32_t,
5887
             PAD, FLAG_NONE)
5888
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5889
             scaled_bilinear_scanline_sse2_8888_8888_OVER,
5890
             uint32_t, uint32_t, uint32_t,
5891
             NONE, FLAG_NONE)
5892
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5893
             scaled_bilinear_scanline_sse2_8888_8888_OVER,
5894
             uint32_t, uint32_t, uint32_t,
5895
             NORMAL, FLAG_NONE)
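The is_zero / is_opaque tests let whole 4-pixel groups be skipped or stored directly; the general case is the premultiplied OVER operator, dest = src + dest * (255 - src_alpha) / 255 per channel. A scalar model of what the over_2x128 / core_combine_over_u_pixel_sse2 calls compute (sketch only; mul_div_255 and over_ref are hypothetical names, and the rounding mirrors pixman's add-0x80-then-scale division by 255):

    static uint32_t
    mul_div_255 (uint32_t a, uint32_t b)
    {
        uint32_t t = a * b + 0x80;          /* round, then divide by 255 */
        return (t + (t >> 8)) >> 8;
    }

    static uint32_t
    over_ref (uint32_t src, uint32_t dst)
    {
        uint32_t ia  = 255 - (src >> 24);   /* inverse source alpha */
        uint32_t out = 0;
        int c;

        for (c = 0; c < 32; c += 8)
            out |= (((src >> c) & 0xff) +
                    mul_div_255 ((dst >> c) & 0xff, ia)) << c;
        return out;
    }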
5896
5897
static force_inline void
5898
scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
5899
            const uint8_t  * mask,
5900
            const uint32_t * src_top,
5901
            const uint32_t * src_bottom,
5902
            int32_t          w,
5903
            int              wt,
5904
            int              wb,
5905
            pixman_fixed_t   vx_,
5906
            pixman_fixed_t   unit_x_,
5907
            pixman_fixed_t   max_vx,
5908
            pixman_bool_t    zero_src)
5909
0
{
5910
0
    intptr_t vx = vx_;
5911
0
    intptr_t unit_x = unit_x_;
5912
0
    BILINEAR_DECLARE_VARIABLES;
5913
0
    uint32_t pix1, pix2;
5914
5915
0
    while (w && ((uintptr_t)dst & 15))
5916
0
    {
5917
0
  uint32_t sa;
5918
0
  uint8_t m = *mask++;
5919
5920
0
  if (m)
5921
0
  {
5922
0
      BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5923
0
      sa = pix1 >> 24;
5924
5925
0
      if (sa == 0xff && m == 0xff)
5926
0
      {
5927
0
    *dst = pix1;
5928
0
      }
5929
0
      else
5930
0
      {
5931
0
    __m128i ms, md, ma, msa;
5932
5933
0
    pix2 = *dst;
5934
0
    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5935
0
    ms = unpack_32_1x128 (pix1);
5936
0
    md = unpack_32_1x128 (pix2);
5937
5938
0
    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5939
5940
0
    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5941
0
      }
5942
0
  }
5943
0
  else
5944
0
  {
5945
0
      BILINEAR_SKIP_ONE_PIXEL ();
5946
0
  }
5947
5948
0
  w--;
5949
0
  dst++;
5950
0
    }
5951
5952
0
    while (w >= 4)
5953
0
    {
5954
0
        uint32_t m;
5955
5956
0
  __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5957
0
  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5958
0
  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5959
5960
0
        memcpy(&m, mask, sizeof(uint32_t));
5961
5962
0
  if (m)
5963
0
  {
5964
0
      BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5965
5966
0
      if (m == 0xffffffff && is_opaque (xmm_src))
5967
0
      {
5968
0
    save_128_aligned ((__m128i *)dst, xmm_src);
5969
0
      }
5970
0
      else
5971
0
      {
5972
0
    xmm_dst = load_128_aligned ((__m128i *)dst);
5973
5974
0
    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5975
5976
0
    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5977
0
    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5978
0
    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5979
5980
0
    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5981
0
    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5982
5983
0
    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5984
0
             &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5985
5986
0
    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5987
0
      }
5988
0
  }
5989
0
  else
5990
0
  {
5991
0
      BILINEAR_SKIP_FOUR_PIXELS ();
5992
0
  }
5993
5994
0
  w -= 4;
5995
0
  dst += 4;
5996
0
  mask += 4;
5997
0
    }
5998
5999
0
    while (w)
6000
0
    {
6001
0
  uint32_t sa;
6002
0
  uint8_t m = *mask++;
6003
6004
0
  if (m)
6005
0
  {
6006
0
      BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6007
0
      sa = pix1 >> 24;
6008
6009
0
      if (sa == 0xff && m == 0xff)
6010
0
      {
6011
0
    *dst = pix1;
6012
0
      }
6013
0
      else
6014
0
      {
6015
0
    __m128i ms, md, ma, msa;
6016
6017
0
    pix2 = *dst;
6018
0
    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
6019
0
    ms = unpack_32_1x128 (pix1);
6020
0
    md = unpack_32_1x128 (pix2);
6021
6022
0
    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
6023
6024
0
    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
6025
0
      }
6026
0
  }
6027
0
  else
6028
0
  {
6029
0
      BILINEAR_SKIP_ONE_PIXEL ();
6030
0
  }
6031
6032
0
  w--;
6033
0
  dst++;
6034
0
    }
6035
0
}
6036
6037
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
6038
             scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6039
             uint32_t, uint8_t, uint32_t,
6040
             COVER, FLAG_HAVE_NON_SOLID_MASK)
6041
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
6042
             scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6043
             uint32_t, uint8_t, uint32_t,
6044
             PAD, FLAG_HAVE_NON_SOLID_MASK)
6045
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
6046
             scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6047
             uint32_t, uint8_t, uint32_t,
6048
             NONE, FLAG_HAVE_NON_SOLID_MASK)
6049
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
6050
             scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6051
             uint32_t, uint8_t, uint32_t,
6052
             NORMAL, FLAG_HAVE_NON_SOLID_MASK)
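With an a8 mask the source pixel is first scaled by the mask value and only then composited, i.e. (src IN m) OVER dst; the sa == 0xff && m == 0xff branch is the shortcut where that degenerates to a plain store. A scalar model (sketch; in_over_ref is a hypothetical name and mul_div_255 is the helper from the OVER note above):

    static uint32_t
    in_over_ref (uint32_t src, uint8_t m, uint32_t dst)
    {
        /* inverse of the source alpha after masking */
        uint32_t ia  = 255 - mul_div_255 (src >> 24, m);
        uint32_t out = 0;
        int c;

        for (c = 0; c < 32; c += 8)
            out |= (mul_div_255 ((src >> c) & 0xff, m) +
                    mul_div_255 ((dst >> c) & 0xff, ia)) << c;
        return out;
    }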
6053
6054
static force_inline void
6055
scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
6056
            const uint32_t * mask,
6057
            const uint32_t * src_top,
6058
            const uint32_t * src_bottom,
6059
            int32_t          w,
6060
            int              wt,
6061
            int              wb,
6062
            pixman_fixed_t   vx_,
6063
            pixman_fixed_t   unit_x_,
6064
            pixman_fixed_t   max_vx,
6065
            pixman_bool_t    zero_src)
6066
0
{
6067
0
    intptr_t vx = vx_;
6068
0
    intptr_t unit_x = unit_x_;
6069
0
    BILINEAR_DECLARE_VARIABLES;
6070
0
    uint32_t pix1;
6071
0
    __m128i xmm_mask;
6072
6073
0
    if (zero_src || (*mask >> 24) == 0)
6074
0
  return;
6075
6076
0
    xmm_mask = create_mask_16_128 (*mask >> 24);
6077
6078
0
    while (w && ((uintptr_t)dst & 15))
6079
0
    {
6080
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6081
0
  if (pix1)
6082
0
  {
6083
0
    uint32_t d = *dst;
6084
6085
0
    __m128i ms = unpack_32_1x128 (pix1);
6086
0
    __m128i alpha     = expand_alpha_1x128 (ms);
6087
0
    __m128i dest      = xmm_mask;
6088
0
    __m128i alpha_dst = unpack_32_1x128 (d);
6089
6090
0
    *dst = pack_1x128_32
6091
0
      (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6092
0
  }
6093
6094
0
  dst++;
6095
0
  w--;
6096
0
    }
6097
6098
0
    while (w >= 4)
6099
0
    {
6100
0
  __m128i xmm_src;
6101
0
  BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
6102
6103
0
  if (!is_zero (xmm_src))
6104
0
  {
6105
0
      __m128i xmm_src_lo, xmm_src_hi;
6106
0
      __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6107
0
      __m128i xmm_alpha_lo, xmm_alpha_hi;
6108
6109
0
      xmm_dst = load_128_aligned ((__m128i*)dst);
6110
6111
0
      unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6112
0
      unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6113
0
      expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6114
0
        &xmm_alpha_lo, &xmm_alpha_hi);
6115
6116
0
      in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6117
0
         &xmm_alpha_lo, &xmm_alpha_hi,
6118
0
         &xmm_mask, &xmm_mask,
6119
0
         &xmm_dst_lo, &xmm_dst_hi);
6120
6121
0
      save_128_aligned
6122
0
    ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6123
0
  }
6124
6125
0
  dst += 4;
6126
0
  w -= 4;
6127
0
    }
6128
6129
0
    while (w)
6130
0
    {
6131
0
  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6132
0
  if (pix1)
6133
0
  {
6134
0
    uint32_t d = *dst;
6135
6136
0
    __m128i ms = unpack_32_1x128 (pix1);
6137
0
    __m128i alpha     = expand_alpha_1x128 (ms);
6138
0
    __m128i dest      = xmm_mask;
6139
0
    __m128i alpha_dst = unpack_32_1x128 (d);
6140
6141
0
    *dst = pack_1x128_32
6142
0
      (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6143
0
  }
6144
6145
0
  dst++;
6146
0
  w--;
6147
0
    }
6148
0
}
6149
6150
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6151
             scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6152
             uint32_t, uint32_t, uint32_t,
6153
             COVER, FLAG_HAVE_SOLID_MASK)
6154
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6155
             scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6156
             uint32_t, uint32_t, uint32_t,
6157
             PAD, FLAG_HAVE_SOLID_MASK)
6158
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6159
             scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6160
             uint32_t, uint32_t, uint32_t,
6161
             NONE, FLAG_HAVE_SOLID_MASK)
6162
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6163
             scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6164
             uint32_t, uint32_t, uint32_t,
6165
             NORMAL, FLAG_HAVE_SOLID_MASK)
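The solid-mask OVER scanline is the constant-m counterpart of the a8-mask one: zero_src || (*mask >> 24) == 0 rejects the whole scanline up front, and create_mask_16_128 (*mask >> 24) expands the single mask alpha once, outside all three loops. That constant is then passed as the mask operand of in_over_1x128 (through the local named dest) and as both halves of in_over_2x128; the per-channel arithmetic is the same as in the a8 case with m fixed.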
6166
6167
static const pixman_fast_path_t sse2_fast_paths[] =
6168
{
6169
    /* PIXMAN_OP_OVER */
6170
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6171
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6172
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6173
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6174
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6175
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6176
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6177
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6178
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6179
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6180
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6181
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6182
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6183
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6184
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6185
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6186
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6187
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6188
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6189
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6190
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6191
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6192
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6193
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6194
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6195
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6196
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6197
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6198
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6199
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6200
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6201
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6202
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6203
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6204
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6205
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6206
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6207
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6208
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6209
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6210
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6211
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6212
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6213
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6214
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6215
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6216
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6217
    
6218
    /* PIXMAN_OP_OVER_REVERSE */
6219
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6220
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6221
6222
    /* PIXMAN_OP_ADD */
6223
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6224
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6225
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6226
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6227
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6228
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6229
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6230
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6231
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6232
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6233
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6234
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6235
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6236
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6237
6238
    /* PIXMAN_OP_SRC */
6239
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6240
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6241
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6242
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6243
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6244
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6245
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6246
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6247
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6248
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6249
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6250
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6251
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6252
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6253
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6254
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6255
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6256
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6257
6258
    /* PIXMAN_OP_IN */
6259
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6260
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6261
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6262
6263
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6264
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6265
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6266
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6267
6268
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6269
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6270
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6271
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6272
6273
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6274
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6275
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
6276
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6277
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6278
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
6279
6280
    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6281
    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6282
    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6283
    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6284
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6285
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6286
6287
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6288
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6289
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6290
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6291
6292
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6293
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6294
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6295
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6296
6297
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
6298
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
6299
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
6300
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
6301
6302
    { PIXMAN_OP_NONE },
6303
};
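Each entry above binds one (operator, source, mask, destination) combination to a specialized routine; pixman's dispatcher picks the first matching entry when compositing. For instance, a call like the following should be served by the sse2_composite_over_n_8888 entry when this implementation is active (usage sketch against the public pixman API; blend_solid is a hypothetical helper and stride is in bytes):

    #include <pixman.h>

    static void
    blend_solid (uint32_t *bits, int width, int height, int stride)
    {
        pixman_color_t red = { .red = 0xffff, .green = 0, .blue = 0,
                               .alpha = 0xffff };
        pixman_image_t *src = pixman_image_create_solid_fill (&red);
        pixman_image_t *dst = pixman_image_create_bits (PIXMAN_a8r8g8b8,
                                                        width, height,
                                                        bits, stride);

        /* OVER, solid source, no mask, a8r8g8b8 dest: matches the table */
        pixman_image_composite32 (PIXMAN_OP_OVER, src, NULL, dst,
                                  0, 0, 0, 0, 0, 0, width, height);

        pixman_image_unref (src);
        pixman_image_unref (dst);
    }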
6304
6305
static uint32_t *
6306
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6307
0
{
6308
0
    int w = iter->width;
6309
0
    __m128i ff000000 = mask_ff000000;
6310
0
    uint32_t *dst = iter->buffer;
6311
0
    uint32_t *src = (uint32_t *)iter->bits;
6312
6313
0
    iter->bits += iter->stride;
6314
6315
0
    while (w && ((uintptr_t)dst) & 0x0f)
6316
0
    {
6317
0
  *dst++ = (*src++) | 0xff000000;
6318
0
  w--;
6319
0
    }
6320
6321
0
    while (w >= 4)
6322
0
    {
6323
0
  save_128_aligned (
6324
0
      (__m128i *)dst, _mm_or_si128 (
6325
0
    load_128_unaligned ((__m128i *)src), ff000000));
6326
6327
0
  dst += 4;
6328
0
  src += 4;
6329
0
  w -= 4;
6330
0
    }
6331
6332
0
    while (w)
6333
0
    {
6334
0
  *dst++ = (*src++) | 0xff000000;
6335
0
  w--;
6336
0
    }
6337
6338
0
    return iter->buffer;
6339
0
}
6340
6341
static uint32_t *
6342
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6343
0
{
6344
0
    int w = iter->width;
6345
0
    uint32_t *dst = iter->buffer;
6346
0
    uint16_t *src = (uint16_t *)iter->bits;
6347
0
    __m128i ff000000 = mask_ff000000;
6348
6349
0
    iter->bits += iter->stride;
6350
6351
0
    while (w && ((uintptr_t)dst) & 0x0f)
6352
0
    {
6353
0
  uint16_t s = *src++;
6354
6355
0
  *dst++ = convert_0565_to_8888 (s);
6356
0
  w--;
6357
0
    }
6358
6359
0
    while (w >= 8)
6360
0
    {
6361
0
  __m128i lo, hi, s;
6362
6363
0
  s = _mm_loadu_si128 ((__m128i *)src);
6364
6365
0
  lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6366
0
  hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6367
6368
0
  save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6369
0
  save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6370
6371
0
  dst += 8;
6372
0
  src += 8;
6373
0
  w -= 8;
6374
0
    }
6375
6376
0
    while (w)
6377
0
    {
6378
0
  uint16_t s = *src++;
6379
6380
0
  *dst++ = convert_0565_to_8888 (s);
6381
0
  w--;
6382
0
    }
6383
6384
0
    return iter->buffer;
6385
0
}
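The 8-wide body expands r5g6b5 to 8 bits per channel by replicating each channel's top bits into the freed low bits, the same result the scalar convert_0565_to_8888 calls produce in the head and tail loops. A per-pixel model (sketch; convert_0565_ref is a hypothetical name):

    static uint32_t
    convert_0565_ref (uint16_t s)
    {
        uint32_t r = (s >> 11) & 0x1f;
        uint32_t g = (s >> 5)  & 0x3f;
        uint32_t b =  s        & 0x1f;

        /* replicate high bits into low bits: 0x1f -> 0xff, 0x3f -> 0xff */
        r = (r << 3) | (r >> 2);
        g = (g << 2) | (g >> 4);
        b = (b << 3) | (b >> 2);

        return 0xff000000 | (r << 16) | (g << 8) | b;
    }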
6386
6387
static uint32_t *
6388
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6389
207
{
6390
207
    int w = iter->width;
6391
207
    uint32_t *dst = iter->buffer;
6392
207
    uint8_t *src = iter->bits;
6393
207
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6394
6395
207
    iter->bits += iter->stride;
6396
6397
207
    while (w && (((uintptr_t)dst) & 15))
6398
0
    {
6399
0
        *dst++ = (uint32_t)(*(src++)) << 24;
6400
0
        w--;
6401
0
    }
6402
6403
461
    while (w >= 16)
6404
254
    {
6405
254
  xmm0 = _mm_loadu_si128((__m128i *)src);
6406
6407
254
  xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
6408
254
  xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
6409
254
  xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6410
254
  xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6411
254
  xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6412
254
  xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6413
6414
254
  _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
6415
254
  _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
6416
254
  _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
6417
254
  _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6418
6419
254
  dst += 16;
6420
254
  src += 16;
6421
254
  w -= 16;
6422
254
    }
6423
6424
1.67k
    while (w)
6425
1.46k
    {
6426
1.46k
  *dst++ = (uint32_t)(*(src++)) << 24;
6427
1.46k
  w--;
6428
1.46k
    }
6429
6430
207
    return iter->buffer;
6431
207
}
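The three unpack stages in the 16-pixel body move each a8 byte to the top of its own 32-bit lane: interleaving zeros below a byte shifts it up 8 bits within a 16-bit lane, and repeating that within 32-bit lanes lands it at bits 24..31. The SIMD path therefore produces exactly the (uint32_t)a << 24 values of the scalar head and tail loops.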
6432
6433
#define IMAGE_FLAGS             \
6434
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |    \
6435
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
6436
6437
static const pixman_iter_info_t sse2_iters[] = 
6438
{
6439
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
6440
      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
6441
    },
6442
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
6443
      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
6444
    },
6445
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
6446
      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
6447
    },
6448
    { PIXMAN_null },
6449
};
6450
6451
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6452
__attribute__((__force_align_arg_pointer__))
6453
#endif
6454
pixman_implementation_t *
6455
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6456
12
{
6457
12
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6458
6459
    /* SSE2 constants */
6460
12
    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6461
12
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6462
12
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6463
12
    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6464
12
    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6465
12
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6466
12
    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6467
12
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6468
12
    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6469
12
    mask_0080 = create_mask_16_128 (0x0080);
6470
12
    mask_00ff = create_mask_16_128 (0x00ff);
6471
12
    mask_0101 = create_mask_16_128 (0x0101);
6472
12
    mask_ffff = create_mask_16_128 (0xffff);
6473
12
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6474
12
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6475
12
    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
6476
12
    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
6477
6478
    /* Set up function pointers */
6479
12
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6480
12
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6481
12
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6482
12
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6483
12
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6484
12
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6485
12
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6486
12
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6487
12
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6488
12
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6489
6490
12
    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6491
6492
12
    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6493
12
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6494
12
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6495
12
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6496
12
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6497
12
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6498
12
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6499
12
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6500
12
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6501
12
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6502
12
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6503
6504
12
    imp->blt = sse2_blt;
6505
12
    imp->fill = sse2_fill;
6506
6507
12
    imp->iter_info = sse2_iters;
6508
6509
12
    return imp;
6510
12
}
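Combine slots not assigned here (combine_32[PIXMAN_OP_SRC], for example) keep the delegating defaults installed by _pixman_implementation_create, so operations without an SSE2 routine fall through to the fallback implementation supplied by the caller.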