Coverage Report

Created: 2026-05-16 07:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cairo/subprojects/pixman-0.44.2/pixman/pixman-ssse3.c
Line
Count
Source
1
/*
2
 * Copyright © 2013 Soren Sandmann Pedersen
3
 * Copyright © 2013 Red Hat, Inc.
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a
6
 * copy of this software and associated documentation files (the "Software"),
7
 * to deal in the Software without restriction, including without limitation
8
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
 * and/or sell copies of the Software, and to permit persons to whom the
10
 * Software is furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice (including the next
13
 * paragraph) shall be included in all copies or substantial portions of the
14
 * Software.
15
 * 
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22
 * DEALINGS IN THE SOFTWARE.
23
 *
24
 * Author: Soren Sandmann (soren.sandmann@gmail.com)
25
 */
26
#ifdef HAVE_CONFIG_H
27
#include <pixman-config.h>
28
#endif
29
30
#include <stdlib.h>
31
#include <mmintrin.h>
32
#include <xmmintrin.h>
33
#include <emmintrin.h>
34
#include <tmmintrin.h>
35
#include "pixman-private.h"
36
#include "pixman-inlines.h"
37
38
/*
 * One cached, horizontally interpolated source row.
 */
typedef struct
{
    /* Source row currently held in buffer.  The iterator setup code
     * initializes this to -1, and the horizontal fetcher overwrites it
     * with the row it has just produced.
     */
    int         y;

    /* Storage for the interpolated row.  It is accessed with aligned
     * 128 bit loads and stores, so it must be 16 byte aligned.
     */
    uint64_t   *buffer;
} line_t;
43
44
typedef struct
45
{
46
    line_t    lines[2];
47
    pixman_fixed_t  y;
48
    pixman_fixed_t  x;
49
    uint64_t    data[1];
50
} bilinear_info_t;
51
52
static void
53
ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
54
      int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
55
169
{
56
169
    uint32_t *bits = image->bits + y * image->rowstride;
57
169
    __m128i vx = _mm_set_epi16 (
58
169
  - (x + 1), x, - (x + 1), x,
59
169
  - (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
60
169
    __m128i vux = _mm_set_epi16 (
61
169
  - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
62
169
  - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
63
169
    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
64
169
    __m128i *b = (__m128i *)line->buffer;
65
169
    __m128i vrl0, vrl1;
66
67
8.04k
    while ((n -= 2) >= 0)
68
7.73k
    {
69
7.73k
  __m128i vw, vr, s;
70
71
7.73k
  vrl1 = _mm_loadl_epi64 (
72
7.73k
      (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
73
  /* vrl1: R1, L1 */
74
75
7.87k
    final_pixel:
76
7.87k
  vrl0 = _mm_loadl_epi64 (
77
7.87k
      (__m128i *)(bits + pixman_fixed_to_int (x)));
78
  /* vrl0: R0, L0 */
79
80
  /* The weights are based on vx which is a vector of 
81
   *
82
   *    - (x + 1), x, - (x + 1), x,
83
   *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
84
   *
85
   * so the 16 bit weights end up like this:
86
   *
87
   *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
88
   *
89
   * and after shifting and packing, we get these bytes:
90
   *
91
   *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
92
   *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
93
   *
94
   * which means the first and the second input pixel 
95
   * have to be interleaved like this:
96
   *
97
   *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
98
   *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
99
   *
100
   * before maddubsw can be used.
101
   */
102
103
7.87k
  vw = _mm_add_epi16 (
104
7.87k
      vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
105
  /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
106
   */
107
108
7.87k
  vw = _mm_packus_epi16 (vw, vw);
109
  /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
110
   *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
111
   */
112
7.87k
  vx = _mm_add_epi16 (vx, vux);
113
114
7.87k
  x += 2 * ux;
115
116
7.87k
  vr = _mm_unpacklo_epi16 (vrl1, vrl0);
117
  /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
118
119
7.87k
  s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
120
  /* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
121
122
7.87k
  vr = _mm_unpackhi_epi8 (vr, s);
123
  /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
124
   *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
125
   */
126
127
7.87k
  vr = _mm_maddubs_epi16 (vr, vw);
128
129
  /* When the weight is 0, the inverse weight is
130
   * 128 which can't be represented in a signed byte.
131
   * As a result maddubsw computes the following:
132
   *
133
   *     r = l * -128 + r * 0
134
   *
135
   * rather than the desired
136
   *
137
   *     r = l * 128 + r * 0
138
   *
139
   * We fix this by taking the absolute value of the
140
   * result.
141
   */
142
7.87k
  vr = _mm_abs_epi16 (vr);
143
144
  /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
145
7.87k
  _mm_store_si128 (b++, vr);
146
7.87k
    }
147
148
310
    if (n == -1)
149
141
    {
150
141
  vrl1 = _mm_setzero_si128();
151
141
  goto final_pixel;
152
141
    }
153
154
169
    line->y = y;
155
169
}
156
157
static uint32_t *
158
ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
159
615
{
160
615
    pixman_fixed_t fx, ux;
161
615
    bilinear_info_t *info = iter->data;
162
615
    line_t *line0, *line1;
163
615
    int y0, y1;
164
615
    int32_t dist_y;
165
615
    __m128i vw;
166
615
    int i;
167
168
615
    fx = info->x;
169
615
    ux = iter->image->common.transform->matrix[0][0];
170
171
615
    y0 = pixman_fixed_to_int (info->y);
172
615
    y1 = y0 + 1;
173
174
615
    line0 = &info->lines[y0 & 0x01];
175
615
    line1 = &info->lines[y1 & 0x01];
176
177
615
    if (line0->y != y0)
178
13
    {
179
13
  ssse3_fetch_horizontal (
180
13
      &iter->image->bits, line0, y0, fx, ux, iter->width);
181
13
    }
182
183
615
    if (line1->y != y1)
184
156
    {
185
156
  ssse3_fetch_horizontal (
186
156
      &iter->image->bits, line1, y1, fx, ux, iter->width);
187
156
    }
188
189
615
    dist_y = pixman_fixed_to_bilinear_weight (info->y);
190
615
    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
191
192
615
    vw = _mm_set_epi16 (
193
615
  dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
194
195
15.5k
    for (i = 0; i + 3 < iter->width; i += 4)
196
14.9k
    {
197
14.9k
  __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
198
14.9k
  __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
199
14.9k
  __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
200
14.9k
  __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
201
14.9k
  __m128i r0, r1, tmp, p;
202
203
14.9k
  r0 = _mm_mulhi_epu16 (
204
14.9k
      _mm_sub_epi16 (bot0, top0), vw);
205
14.9k
  tmp = _mm_cmplt_epi16 (bot0, top0);
206
14.9k
  tmp = _mm_and_si128 (tmp, vw);
207
14.9k
  r0 = _mm_sub_epi16 (r0, tmp);
208
14.9k
  r0 = _mm_add_epi16 (r0, top0);
209
14.9k
  r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
210
  /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
211
14.9k
  r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
212
  /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
213
214
14.9k
  r1 = _mm_mulhi_epu16 (
215
14.9k
      _mm_sub_epi16 (bot1, top1), vw);
216
14.9k
  tmp = _mm_cmplt_epi16 (bot1, top1);
217
14.9k
  tmp = _mm_and_si128 (tmp, vw);
218
14.9k
  r1 = _mm_sub_epi16 (r1, tmp);
219
14.9k
  r1 = _mm_add_epi16 (r1, top1);
220
14.9k
  r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
221
14.9k
  r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
222
  /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
223
224
14.9k
  p = _mm_packus_epi16 (r0, r1);
225
226
14.9k
  _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
227
14.9k
    }
228
229
1.36k
    while (i < iter->width)
230
752
    {
231
752
  __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
232
752
  __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
233
752
  __m128i r0, tmp, p;
234
235
752
  r0 = _mm_mulhi_epu16 (
236
752
      _mm_sub_epi16 (bot0, top0), vw);
237
752
  tmp = _mm_cmplt_epi16 (bot0, top0);
238
752
  tmp = _mm_and_si128 (tmp, vw);
239
752
  r0 = _mm_sub_epi16 (r0, tmp);
240
752
  r0 = _mm_add_epi16 (r0, top0);
241
752
  r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
242
  /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
243
752
  r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
244
  /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
245
246
752
  p = _mm_packus_epi16 (r0, r0);
247
248
752
  if (iter->width - i == 1)
249
502
  {
250
502
      *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
251
502
      i++;
252
502
  }
253
250
  else
254
250
  {
255
250
      _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
256
250
      i += 2;
257
250
  }
258
752
    }
259
    
260
615
    info->y += iter->image->common.transform->matrix[1][1];
261
262
615
    return iter->buffer;
263
615
}
264
265
static void
266
ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
267
13
{
268
13
    free (iter->data);
269
13
}
270
271
static void
272
ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
273
13
{
274
13
    int width = iter->width;
275
13
    bilinear_info_t *info;
276
13
    pixman_vector_t v;
277
278
    /* Reference point is the center of the pixel */
279
13
    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
280
13
    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
281
13
    v.vector[2] = pixman_fixed_1;
282
283
13
    if (!pixman_transform_point_3d (iter->image->common.transform, &v))
284
0
  goto fail;
285
286
13
    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
287
13
    if (!info)
288
0
  goto fail;
289
290
13
    info->x = v.vector[0] - pixman_fixed_1 / 2;
291
13
    info->y = v.vector[1] - pixman_fixed_1 / 2;
292
293
13
#define ALIGN(addr)             \
294
26
    ((void *)((((uintptr_t)(addr)) + 15) & (~15)))
295
296
    /* It is safe to set the y coordinates to -1 initially
297
     * because COVER_CLIP_BILINEAR ensures that we will only
298
     * be asked to fetch lines in the [0, height) interval
299
     */
300
13
    info->lines[0].y = -1;
301
13
    info->lines[0].buffer = ALIGN (&(info->data[0]));
302
13
    info->lines[1].y = -1;
303
13
    info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);
304
305
13
    iter->get_scanline = ssse3_fetch_bilinear_cover;
306
13
    iter->fini = ssse3_bilinear_cover_iter_fini;
307
308
13
    iter->data = info;
309
13
    return;
310
311
0
fail:
312
    /* Something went wrong, either a bad matrix or OOM; in such cases,
313
     * we don't guarantee any particular rendering.
314
     */
315
0
    _pixman_log_error (
316
0
  FUNC, "Allocation failure or bad matrix, skipping rendering\n");
317
    
318
0
    iter->get_scanline = _pixman_iter_get_scanline_noop;
319
0
    iter->fini = NULL;
320
0
}
321
322
static const pixman_iter_info_t ssse3_iters[] = 
323
{
324
    { PIXMAN_a8r8g8b8,
325
      (FAST_PATH_STANDARD_FLAGS     |
326
       FAST_PATH_SCALE_TRANSFORM    |
327
       FAST_PATH_BILINEAR_FILTER    |
328
       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
329
      ITER_NARROW | ITER_SRC,
330
      ssse3_bilinear_cover_iter_init,
331
      NULL, NULL
332
    },
333
334
    { PIXMAN_null },
335
};
336
337
static const pixman_fast_path_t ssse3_fast_paths[] =
338
{
339
    { PIXMAN_OP_NONE },
340
};
341
342
/*
 * Creates the SSSE3 implementation on top of fallback and attaches
 * the SSSE3 iterator table to it.  Returns the new implementation.
 */
pixman_implementation_t *
_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *ssse3_imp;

    ssse3_imp = _pixman_implementation_create (fallback, ssse3_fast_paths);

    /* NOTE(review): the returned pointer is used without a NULL check;
     * confirm that _pixman_implementation_create cannot fail here.
     */
    ssse3_imp->iter_info = ssse3_iters;

    return ssse3_imp;
}