/src/cairo/subprojects/pixman-0.44.2/pixman/pixman-ssse3.c

Source (jump to first uncovered line)
/*
 * Copyright © 2013 Soren Sandmann Pedersen
 * Copyright © 2013 Red Hat, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Soren Sandmann (soren.sandmann@gmail.com)
 */
#ifdef HAVE_CONFIG_H
#include <pixman-config.h>
#endif

#include <stdlib.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "pixman-private.h"
#include "pixman-inlines.h"

/* One cached source row after the horizontal interpolation pass. */
typedef struct
{
    /* Source row currently held in `buffer`.  The iterator initializer
     * sets this to -1 to mark the cache as empty.
     */
    int   y;
    /* Horizontally interpolated pixels, one uint64_t (four 16-bit
     * channels) per pixel.  It is read and written with aligned 128-bit
     * accesses, so it has to be 16-byte aligned.
     */
    uint64_t *  buffer;
} line_t;

/* Per-iterator state of the SSSE3 bilinear cover fetcher. */
typedef struct
{
    /* Two-entry row cache; source row y lives in lines[y & 1], so the two
     * vertically adjacent rows needed for one output scanline never
     * collide.
     */
    line_t    lines[2];
    /* Fixed-point source y coordinate of the next scanline to produce;
     * advanced by the transform's matrix[1][1] after every scanline.
     */
    pixman_fixed_t  y;
    /* Fixed-point source x coordinate of the first pixel of a scanline. */
    pixman_fixed_t  x;
    /* Start of the storage that backs both line buffers.  The structure
     * is over-allocated by the iterator initializer; the line buffers
     * point at 16-byte aligned positions inside this area.
     */
    uint64_t    data[1];
} bilinear_info_t;

/*
 * Horizontal pass of the bilinear fetcher.
 *
 * Reads source row `y` of `image` and, for each of the `n` output pixels,
 * blends two horizontally adjacent source pixels according to the
 * fractional part of the running x coordinate.  The blended channels are
 * stored in line->buffer as 16-bit values (one uint64_t per pixel, two
 * pixels per 128-bit store), and line->y is set to `y` so the caller can
 * tell which row the buffer holds.
 *
 *   image  source image; `bits` and `rowstride` locate the row
 *   line   destination row cache; its buffer is written with aligned
 *          128-bit stores and therefore must be 16-byte aligned
 *   y      source row to fetch
 *   x      fixed-point x coordinate of the first output pixel
 *   ux     fixed-point x increment per output pixel
 *   n      number of output pixels
 *
 * Every pass through the loop body stores a full 16 bytes, including the
 * pass that handles the trailing pixel of an odd `n`, so the buffer needs
 * room for `n` rounded up to an even number of pixels.  Each sample reads
 * two adjacent 32-bit source pixels with one 64-bit load; nothing in this
 * function checks that they lie inside the row (the comment in the
 * iterator initializer attributes that guarantee to COVER_CLIP_BILINEAR).
 */
static void
ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
      int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
{
    uint32_t *bits = image->bits + y * image->rowstride;
    /* Each 16-bit lane keeps the low 16 bits of its argument, which is the
     * fractional part of the coordinate (presumably 16.16 fixed point --
     * confirm against pixman_fixed_t).  -(x + 1) is the bitwise complement
     * of x, so the odd lanes carry the complement of the fraction.  One
     * lane pair is replicated for pixel 0 (x) and one for pixel 1 (x + ux).
     */
    __m128i vx = _mm_set_epi16 (
  - (x + 1), x, - (x + 1), x,
  - (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
    /* Per-iteration step: two pixels are produced per iteration, so every
     * lane advances by 2 * ux (negated for the complement lanes).
     */
    __m128i vux = _mm_set_epi16 (
  - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
  - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
    /* Added to the shifted complement lanes so that weight and inverse
     * weight sum to 1 << BILINEAR_INTERPOLATION_BITS.
     */
    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
    __m128i *b = (__m128i *)line->buffer;
    __m128i vrl0, vrl1;

    /* Two output pixels per iteration.  When n is odd the loop exits with
     * n == -1 and the trailing pixel is handled by jumping back into the
     * loop body (see the end of the function).
     */
    while ((n -= 2) >= 0)
    {
  __m128i vw, vr, s;

  /* Left/right source pixels for the second output pixel. */
  vrl1 = _mm_loadl_epi64 (
      (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
  /* vrl1: R1, L1 */

    /* Entry point for the trailing pixel of an odd n: vrl1 has been
     * zeroed by the caller of the goto, and the load above is skipped so
     * that only the source pixels at x are read.
     */
    final_pixel:
  /* Left/right source pixels for the first output pixel. */
  vrl0 = _mm_loadl_epi64 (
      (__m128i *)(bits + pixman_fixed_to_int (x)));
  /* vrl0: R0, L0 */

  /* The weights are based on vx which is a vector of
   *
   *    - (x + 1), x, - (x + 1), x,
   *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
   *
   * so the 16 bit weights end up like this:
   *
   *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
   *
   * and after shifting and packing, we get these bytes:
   *
   *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
   *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
   *
   * which means the first and the second input pixel
   * have to be interleaved like this:
   *
   *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
   *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
   *
   * before maddubsw can be used.
   */

  /* Keep the top BILINEAR_INTERPOLATION_BITS bits of each fraction; the
   * complement lanes get +1 from vaddc to become the inverse weight.
   */
  vw = _mm_add_epi16 (
      vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
  /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
   */

  /* Narrow the weights to bytes, duplicated into both 64-bit halves. */
  vw = _mm_packus_epi16 (vw, vw);
  /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
   *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
   */
  /* Advance the vector and scalar coordinates by two pixels. */
  vx = _mm_add_epi16 (vx, vux);

  x += 2 * ux;

  vr = _mm_unpacklo_epi16 (vrl1, vrl0);
  /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */

  /* Swap the 64-bit halves so left and right pixels can be interleaved
   * bytewise below.
   */
  s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
  /* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */

  vr = _mm_unpackhi_epi8 (vr, s);
  /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
   *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
   */

  /* Per channel: left * inverse weight + right * weight, as 16 bits. */
  vr = _mm_maddubs_epi16 (vr, vw);

  /* When the weight is 0, the inverse weight is
   * 128 which can't be represented in a signed byte.
   * As a result maddubsw computes the following:
   *
   *     r = l * -128 + r * 0
   *
   * rather than the desired
   *
   *     r = l * 128 + r * 0
   *
   * We fix this by taking the absolute value of the
   * result.
   */
  vr = _mm_abs_epi16 (vr);

  /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
  _mm_store_si128 (b++, vr);
    }

    /* Odd n: one pixel is left.  Zero the "second pixel" input and run the
     * loop body once more from final_pixel.  When that pass reaches the
     * loop condition, n drops to -3, the loop ends, and this test is false
     * the second time around, so the extra pass happens exactly once.
     */
    if (n == -1)
    {
  vrl1 = _mm_setzero_si128();
  goto final_pixel;
    }

    /* Mark the cache entry as holding row y. */
    line->y = y;
}

static uint32_t *
ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
{
    pixman_fixed_t fx, ux;
    bilinear_info_t *info = iter->data;
    line_t *line0, *line1;
    int y0, y1;
    int32_t dist_y;
    __m128i vw;
    int i;

    fx = info->x;
    ux = iter->image->common.transform->matrix[0][0];

    y0 = pixman_fixed_to_int (info->y);
    y1 = y0 + 1;

    line0 = &info->lines[y0 & 0x01];
    line1 = &info->lines[y1 & 0x01];

    if (line0->y != y0)
    {
  ssse3_fetch_horizontal (
      &iter->image->bits, line0, y0, fx, ux, iter->width);
    }

    if (line1->y != y1)
    {
  ssse3_fetch_horizontal (
      &iter->image->bits, line1, y1, fx, ux, iter->width);
    }

    dist_y = pixman_fixed_to_bilinear_weight (info->y);
    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);

    vw = _mm_set_epi16 (
  dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);

    for (i = 0; i + 3 < iter->width; i += 4)
    {
  __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
  __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
  __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
  __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
  __m128i r0, r1, tmp, p;

  r0 = _mm_mulhi_epu16 (
      _mm_sub_epi16 (bot0, top0), vw);
  tmp = _mm_cmplt_epi16 (bot0, top0);
  tmp = _mm_and_si128 (tmp, vw);
  r0 = _mm_sub_epi16 (r0, tmp);
  r0 = _mm_add_epi16 (r0, top0);
  r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
  /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
  r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
  /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */

  r1 = _mm_mulhi_epu16 (
      _mm_sub_epi16 (bot1, top1), vw);
  tmp = _mm_cmplt_epi16 (bot1, top1);
  tmp = _mm_and_si128 (tmp, vw);
  r1 = _mm_sub_epi16 (r1, tmp);
  r1 = _mm_add_epi16 (r1, top1);
  r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
  r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
  /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */

  p = _mm_packus_epi16 (r0, r1);

  _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
    }

    while (i < iter->width)
    {
  __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
  __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
  __m128i r0, tmp, p;

  r0 = _mm_mulhi_epu16 (
      _mm_sub_epi16 (bot0, top0), vw);
  tmp = _mm_cmplt_epi16 (bot0, top0);
  tmp = _mm_and_si128 (tmp, vw);
  r0 = _mm_sub_epi16 (r0, tmp);
  r0 = _mm_add_epi16 (r0, top0);
  r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
  /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
  r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
  /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */

  p = _mm_packus_epi16 (r0, r0);

  if (iter->width - i == 1)
  {
      *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
      i++;
  }
  else
  {
      _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
      i += 2;
  }
    }
    
    info->y += iter->image->common.transform->matrix[1][1];

    return iter->buffer;
}

/*
 * fini callback: releases the bilinear_info_t that
 * ssse3_bilinear_cover_iter_init stored in iter->data.
 */
static void
ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
{
    bilinear_info_t *info = iter->data;

    free (info);
}

/* Round an address up to the next multiple of 16 bytes. */
#define ALIGN(addr)             \
    ((void *)((((uintptr_t)(addr)) + 15) & (~15)))

/*
 * Iterator initializer for the SSSE3 bilinear cover fetcher.
 *
 * Transforms the center of the first destination pixel into source space,
 * allocates the per-iterator state (a bilinear_info_t followed by storage
 * for two line buffers) and installs the get_scanline / fini callbacks.
 *
 * If the transform fails or the allocation fails, an error is logged and
 * the iterator is given the no-op get_scanline with no fini callback.
 *
 * `iter_info` is not used.
 */
static void
ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
{
    const pixman_fixed_t half = pixman_fixed_1 / 2;
    const int width = iter->width;
    bilinear_info_t *info = NULL;
    uint64_t *first_line;
    pixman_vector_t start;

    /* Reference point is the center of the pixel */
    start.vector[0] = pixman_int_to_fixed (iter->x) + half;
    start.vector[1] = pixman_int_to_fixed (iter->y) + half;
    start.vector[2] = pixman_fixed_1;

    /* The structure already contains one uint64_t (data[1]); together
     * with 2 * width - 1 more that is one slot per pixel for each of the
     * two lines.  The extra 64 bytes leave room for the padding that the
     * two ALIGN round-ups below introduce.
     */
    if (pixman_transform_point_3d (iter->image->common.transform, &start))
        info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);

    if (info == NULL)
    {
        /* Something went wrong, either a bad matrix or OOM; in such
         * cases, we don't guarantee any particular rendering.
         */
        _pixman_log_error (
            FUNC, "Allocation failure or bad matrix, skipping rendering\n");

        iter->get_scanline = _pixman_iter_get_scanline_noop;
        iter->fini = NULL;
        return;
    }

    /* Move the reference point back from the pixel center. */
    info->x = start.vector[0] - half;
    info->y = start.vector[1] - half;

    /* Both line buffers live in the tail of the allocation, each one on a
     * 16-byte boundary; the second starts after `width` slots of the
     * first, rounded up.
     */
    first_line = ALIGN (&(info->data[0]));
    info->lines[0].buffer = first_line;
    info->lines[1].buffer = ALIGN (first_line + width);

    /* It is safe to mark both cache slots empty with y = -1 because
     * COVER_CLIP_BILINEAR ensures that we will only be asked to fetch
     * lines in the [0, height) interval.
     */
    info->lines[0].y = -1;
    info->lines[1].y = -1;

    iter->data = info;
    iter->get_scanline = ssse3_fetch_bilinear_cover;
    iter->fini = ssse3_bilinear_cover_iter_fini;
}

/*
 * Iterators provided by the SSSE3 implementation; installed as
 * imp->iter_info by _pixman_implementation_create_ssse3.
 *
 * The single real entry selects ssse3_bilinear_cover_iter_init for
 * a8r8g8b8 images that carry the scale-transform, bilinear-filter and
 * SAMPLES_COVER_CLIP_BILINEAR flags, for narrow source iterators.  The two
 * trailing NULLs fill the remaining members of the entry (presumably
 * further callbacks -- confirm against the pixman_iter_info_t
 * declaration).  The PIXMAN_null entry ends the table.
 */
static const pixman_iter_info_t ssse3_iters[] = 
{
    { PIXMAN_a8r8g8b8,
      (FAST_PATH_STANDARD_FLAGS     |
       FAST_PATH_SCALE_TRANSFORM    |
       FAST_PATH_BILINEAR_FILTER    |
       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
      ITER_NARROW | ITER_SRC,
      ssse3_bilinear_cover_iter_init,
      NULL, NULL
    },

    { PIXMAN_null },
};

/*
 * Fast-path table handed to _pixman_implementation_create.  It contains
 * only the PIXMAN_OP_NONE entry, i.e. this implementation registers no
 * composite fast paths and contributes the iterators in ssse3_iters only.
 */
static const pixman_fast_path_t ssse3_fast_paths[] =
{
    { PIXMAN_OP_NONE },
};

/*
 * Creates the SSSE3 implementation layer on top of `fallback`.
 *
 * The new implementation is created with the (empty) SSSE3 fast-path
 * table and then given the SSSE3 iterator table.  Returns the new
 * implementation.
 *
 * NOTE(review): the result of _pixman_implementation_create is used
 * without a NULL check; confirm whether that function can fail.
 */
pixman_implementation_t *
_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *ssse3_imp;

    ssse3_imp = _pixman_implementation_create (fallback, ssse3_fast_paths);
    ssse3_imp->iter_info = ssse3_iters;

    return ssse3_imp;
}

Coverage Report

Created: 2025-07-23 06:50

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright © 2013 Soren Sandmann Pedersen
3		* Copyright © 2013 Red Hat, Inc.
4		*
5		* Permission is hereby granted, free of charge, to any person obtaining a
6		* copy of this software and associated documentation files (the "Software"),
7		* to deal in the Software without restriction, including without limitation
8		* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9		* and/or sell copies of the Software, and to permit persons to whom the
10		* Software is furnished to do so, subject to the following conditions:
11		*
12		* The above copyright notice and this permission notice (including the next
13		* paragraph) shall be included in all copies or substantial portions of the
14		* Software.
15		*
16		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19		* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21		* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22		* DEALINGS IN THE SOFTWARE.
23		*
24		* Author: Soren Sandmann (soren.sandmann@gmail.com)
25		*/
26		#ifdef HAVE_CONFIG_H
27		#include <pixman-config.h>
28		#endif
29
30		#include <stdlib.h>
31		#include <mmintrin.h>
32		#include <xmmintrin.h>
33		#include <emmintrin.h>
34		#include <tmmintrin.h>
35		#include "pixman-private.h"
36		#include "pixman-inlines.h"
37
38		typedef struct
39		{
40		int y;
41		uint64_t * buffer;
42		} line_t;
43
44		typedef struct
45		{
46		line_t lines[2];
47		pixman_fixed_t y;
48		pixman_fixed_t x;
49		uint64_t data[1];
50		} bilinear_info_t;
51
52		static void
53		ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
54		int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
55	0	{
56	0	uint32_t *bits = image->bits + y * image->rowstride;
57	0	__m128i vx = _mm_set_epi16 (
58	0	- (x + 1), x, - (x + 1), x,
59	0	- (x + ux + 1), x + ux, - (x + ux + 1), x + ux);
60	0	__m128i vux = _mm_set_epi16 (
61	0	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
62	0	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
63	0	__m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
64	0	__m128i *b = (__m128i *)line->buffer;
65	0	__m128i vrl0, vrl1;
66
67	0	while ((n -= 2) >= 0)
68	0	{
69	0	__m128i vw, vr, s;
70
71	0	vrl1 = _mm_loadl_epi64 (
72	0	(__m128i *)(bits + pixman_fixed_to_int (x + ux)));
73		/* vrl1: R1, L1 */
74
75	0	final_pixel:
76	0	vrl0 = _mm_loadl_epi64 (
77	0	(__m128i *)(bits + pixman_fixed_to_int (x)));
78		/* vrl0: R0, L0 */
79
80		/* The weights are based on vx which is a vector of
81		*
82		* - (x + 1), x, - (x + 1), x,
83		* - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
84		*
85		* so the 16 bit weights end up like this:
86		*
87		* iw0, w0, iw0, w0, iw1, w1, iw1, w1
88		*
89		* and after shifting and packing, we get these bytes:
90		*
91		* iw0, w0, iw0, w0, iw1, w1, iw1, w1,
92		* iw0, w0, iw0, w0, iw1, w1, iw1, w1,
93		*
94		* which means the first and the second input pixel
95		* have to be interleaved like this:
96		*
97		* la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
98		* lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
99		*
100		* before maddubsw can be used.
101		*/
102
103	0	vw = _mm_add_epi16 (
104	0	vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
105		/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
106		*/
107
108	0	vw = _mm_packus_epi16 (vw, vw);
109		/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
110		* iw0, w0, iw0, w0, iw1, w1, iw1, w1
111		*/
112	0	vx = _mm_add_epi16 (vx, vux);
113
114	0	x += 2 * ux;
115
116	0	vr = _mm_unpacklo_epi16 (vrl1, vrl0);
117		/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
118
119	0	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
120		/* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
121
122	0	vr = _mm_unpackhi_epi8 (vr, s);
123		/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
124		* lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
125		*/
126
127	0	vr = _mm_maddubs_epi16 (vr, vw);
128
129		/* When the weight is 0, the inverse weight is
130		* 128 which can't be represented in a signed byte.
131		* As a result maddubsw computes the following:
132		*
133		* r = l * -128 + r * 0
134		*
135		* rather than the desired
136		*
137		* r = l * 128 + r * 0
138		*
139		* We fix this by taking the absolute value of the
140		* result.
141		*/
142	0	vr = _mm_abs_epi16 (vr);
143
144		/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
145	0	_mm_store_si128 (b++, vr);
146	0	}
147
148	0	if (n == -1)
149	0	{
150	0	vrl1 = _mm_setzero_si128();
151	0	goto final_pixel;
152	0	}
153
154	0	line->y = y;
155	0	}
156
157		static uint32_t *
158		ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
159	0	{
160	0	pixman_fixed_t fx, ux;
161	0	bilinear_info_t *info = iter->data;
162	0	line_t *line0, *line1;
163	0	int y0, y1;
164	0	int32_t dist_y;
165	0	__m128i vw;
166	0	int i;
167
168	0	fx = info->x;
169	0	ux = iter->image->common.transform->matrix[0][0];
170
171	0	y0 = pixman_fixed_to_int (info->y);
172	0	y1 = y0 + 1;
173
174	0	line0 = &info->lines[y0 & 0x01];
175	0	line1 = &info->lines[y1 & 0x01];
176
177	0	if (line0->y != y0)
178	0	{
179	0	ssse3_fetch_horizontal (
180	0	&iter->image->bits, line0, y0, fx, ux, iter->width);
181	0	}
182
183	0	if (line1->y != y1)
184	0	{
185	0	ssse3_fetch_horizontal (
186	0	&iter->image->bits, line1, y1, fx, ux, iter->width);
187	0	}
188
189	0	dist_y = pixman_fixed_to_bilinear_weight (info->y);
190	0	dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
191
192	0	vw = _mm_set_epi16 (
193	0	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
194
195	0	for (i = 0; i + 3 < iter->width; i += 4)
196	0	{
197	0	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
198	0	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
199	0	__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
200	0	__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
201	0	__m128i r0, r1, tmp, p;
202
203	0	r0 = _mm_mulhi_epu16 (
204	0	_mm_sub_epi16 (bot0, top0), vw);
205	0	tmp = _mm_cmplt_epi16 (bot0, top0);
206	0	tmp = _mm_and_si128 (tmp, vw);
207	0	r0 = _mm_sub_epi16 (r0, tmp);
208	0	r0 = _mm_add_epi16 (r0, top0);
209	0	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
210		/* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
211	0	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
212		/* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
213
214	0	r1 = _mm_mulhi_epu16 (
215	0	_mm_sub_epi16 (bot1, top1), vw);
216	0	tmp = _mm_cmplt_epi16 (bot1, top1);
217	0	tmp = _mm_and_si128 (tmp, vw);
218	0	r1 = _mm_sub_epi16 (r1, tmp);
219	0	r1 = _mm_add_epi16 (r1, top1);
220	0	r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
221	0	r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
222		/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
223
224	0	p = _mm_packus_epi16 (r0, r1);
225
226	0	_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
227	0	}
228
229	0	while (i < iter->width)
230	0	{
231	0	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
232	0	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
233	0	__m128i r0, tmp, p;
234
235	0	r0 = _mm_mulhi_epu16 (
236	0	_mm_sub_epi16 (bot0, top0), vw);
237	0	tmp = _mm_cmplt_epi16 (bot0, top0);
238	0	tmp = _mm_and_si128 (tmp, vw);
239	0	r0 = _mm_sub_epi16 (r0, tmp);
240	0	r0 = _mm_add_epi16 (r0, top0);
241	0	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
242		/* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
243	0	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
244		/* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
245
246	0	p = _mm_packus_epi16 (r0, r0);
247
248	0	if (iter->width - i == 1)
249	0	{
250	0	*(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
251	0	i++;
252	0	}
253	0	else
254	0	{
255	0	_mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
256	0	i += 2;
257	0	}
258	0	}
259
260	0	info->y += iter->image->common.transform->matrix[1][1];
261
262	0	return iter->buffer;
263	0	}
264
265		static void
266		ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
267	0	{
268	0	free (iter->data);
269	0	}
270
271		static void
272		ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
273	0	{
274	0	int width = iter->width;
275	0	bilinear_info_t *info;
276	0	pixman_vector_t v;
277
278		/* Reference point is the center of the pixel */
279	0	v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
280	0	v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
281	0	v.vector[2] = pixman_fixed_1;
282
283	0	if (!pixman_transform_point_3d (iter->image->common.transform, &v))
284	0	goto fail;
285
286	0	info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
287	0	if (!info)
288	0	goto fail;
289
290	0	info->x = v.vector[0] - pixman_fixed_1 / 2;
291	0	info->y = v.vector[1] - pixman_fixed_1 / 2;
292
293	0	#define ALIGN(addr) \
294	0	((void *)((((uintptr_t)(addr)) + 15) & (~15)))
295
296		/* It is safe to set the y coordinates to -1 initially
297		* because COVER_CLIP_BILINEAR ensures that we will only
298		* be asked to fetch lines in the [0, height) interval
299		*/
300	0	info->lines[0].y = -1;
301	0	info->lines[0].buffer = ALIGN (&(info->data[0]));
302	0	info->lines[1].y = -1;
303	0	info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);
304
305	0	iter->get_scanline = ssse3_fetch_bilinear_cover;
306	0	iter->fini = ssse3_bilinear_cover_iter_fini;
307
308	0	iter->data = info;
309	0	return;
310
311	0	fail:
312		/* Something went wrong, either a bad matrix or OOM; in such cases,
313		* we don't guarantee any particular rendering.
314		*/
315	0	_pixman_log_error (
316	0	FUNC, "Allocation failure or bad matrix, skipping rendering\n");
317
318	0	iter->get_scanline = _pixman_iter_get_scanline_noop;
319	0	iter->fini = NULL;
320	0	}
321
322		static const pixman_iter_info_t ssse3_iters[] =
323		{
324		{ PIXMAN_a8r8g8b8,
325		(FAST_PATH_STANDARD_FLAGS |
326		FAST_PATH_SCALE_TRANSFORM |
327		FAST_PATH_BILINEAR_FILTER |
328		FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
329		ITER_NARROW | ITER_SRC,
330		ssse3_bilinear_cover_iter_init,
331		NULL, NULL
332		},
333
334		{ PIXMAN_null },
335		};
336
337		static const pixman_fast_path_t ssse3_fast_paths[] =
338		{
339		{ PIXMAN_OP_NONE },
340		};
341
342		pixman_implementation_t *
343		_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
344	10	{
345	10	pixman_implementation_t *imp =
346	10	_pixman_implementation_create (fallback, ssse3_fast_paths);
347
348	10	imp->iter_info = ssse3_iters;
349
350	10	return imp;
351	10	}