/src/cairo/subprojects/pixman-0.44.2/pixman/pixman-ssse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright © 2013 Soren Sandmann Pedersen |
3 | | * Copyright © 2013 Red Hat, Inc. |
4 | | * |
5 | | * Permission is hereby granted, free of charge, to any person obtaining a |
6 | | * copy of this software and associated documentation files (the "Software"), |
7 | | * to deal in the Software without restriction, including without limitation |
8 | | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
9 | | * and/or sell copies of the Software, and to permit persons to whom the |
10 | | * Software is furnished to do so, subject to the following conditions: |
11 | | * |
12 | | * The above copyright notice and this permission notice (including the next |
13 | | * paragraph) shall be included in all copies or substantial portions of the |
14 | | * Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
19 | | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
21 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
22 | | * DEALINGS IN THE SOFTWARE. |
23 | | * |
24 | | * Author: Soren Sandmann (soren.sandmann@gmail.com) |
25 | | */ |
26 | | #ifdef HAVE_CONFIG_H |
27 | | #include <pixman-config.h> |
28 | | #endif |
29 | | |
30 | | #include <stdlib.h> |
31 | | #include <mmintrin.h> |
32 | | #include <xmmintrin.h> |
33 | | #include <emmintrin.h> |
34 | | #include <tmmintrin.h> |
35 | | #include "pixman-private.h" |
36 | | #include "pixman-inlines.h" |
37 | | |
/*
 * One cached scanline of horizontally interpolated pixels.
 */
typedef struct
{
    int y;               /* source row currently held in buffer; the
                          * iterator initializer sets it to -1, i.e.
                          * "nothing cached yet" */
    uint64_t * buffer;   /* 16-byte aligned storage filled by
                          * ssse3_fetch_horizontal(): one uint64_t
                          * (four 16-bit channels) per pixel */
} line_t;
43 | | |
/*
 * Per-iterator state of the bilinear "cover" fetcher.  It is allocated
 * with malloc() by ssse3_bilinear_cover_iter_init() and released by
 * ssse3_bilinear_cover_iter_fini().
 */
typedef struct
{
    line_t lines[2];     /* two cached rows, selected by (row & 1) */
    pixman_fixed_t y;    /* source y of the next scanline; advanced by
                          * transform->matrix[1][1] after every fetch */
    pixman_fixed_t x;    /* source x of the first pixel of a scanline */
    uint64_t data[1];    /* first element of the over-allocated tail;
                          * both lines[].buffer pointers point into it */
} bilinear_info_t;
51 | | |
52 | | static void |
53 | | ssse3_fetch_horizontal (bits_image_t *image, line_t *line, |
54 | | int y, pixman_fixed_t x, pixman_fixed_t ux, int n) |
55 | 0 | { |
56 | 0 | uint32_t *bits = image->bits + y * image->rowstride; |
57 | 0 | __m128i vx = _mm_set_epi16 ( |
58 | 0 | - (x + 1), x, - (x + 1), x, |
59 | 0 | - (x + ux + 1), x + ux, - (x + ux + 1), x + ux); |
60 | 0 | __m128i vux = _mm_set_epi16 ( |
61 | 0 | - 2 * ux, 2 * ux, - 2 * ux, 2 * ux, |
62 | 0 | - 2 * ux, 2 * ux, - 2 * ux, 2 * ux); |
63 | 0 | __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0); |
64 | 0 | __m128i *b = (__m128i *)line->buffer; |
65 | 0 | __m128i vrl0, vrl1; |
66 | |
|
67 | 0 | while ((n -= 2) >= 0) |
68 | 0 | { |
69 | 0 | __m128i vw, vr, s; |
70 | |
|
71 | 0 | vrl1 = _mm_loadl_epi64 ( |
72 | 0 | (__m128i *)(bits + pixman_fixed_to_int (x + ux))); |
73 | | /* vrl1: R1, L1 */ |
74 | |
|
75 | 0 | final_pixel: |
76 | 0 | vrl0 = _mm_loadl_epi64 ( |
77 | 0 | (__m128i *)(bits + pixman_fixed_to_int (x))); |
78 | | /* vrl0: R0, L0 */ |
79 | | |
80 | | /* The weights are based on vx which is a vector of |
81 | | * |
82 | | * - (x + 1), x, - (x + 1), x, |
83 | | * - (x + ux + 1), x + ux, - (x + ux + 1), x + ux |
84 | | * |
85 | | * so the 16 bit weights end up like this: |
86 | | * |
87 | | * iw0, w0, iw0, w0, iw1, w1, iw1, w1 |
88 | | * |
89 | | * and after shifting and packing, we get these bytes: |
90 | | * |
91 | | * iw0, w0, iw0, w0, iw1, w1, iw1, w1, |
92 | | * iw0, w0, iw0, w0, iw1, w1, iw1, w1, |
93 | | * |
94 | | * which means the first and the second input pixel |
95 | | * have to be interleaved like this: |
96 | | * |
97 | | * la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, |
98 | | * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 |
99 | | * |
100 | | * before maddubsw can be used. |
101 | | */ |
102 | |
|
103 | 0 | vw = _mm_add_epi16 ( |
104 | 0 | vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS)); |
105 | | /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1 |
106 | | */ |
107 | |
|
108 | 0 | vw = _mm_packus_epi16 (vw, vw); |
109 | | /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1, |
110 | | * iw0, w0, iw0, w0, iw1, w1, iw1, w1 |
111 | | */ |
112 | 0 | vx = _mm_add_epi16 (vx, vux); |
113 | |
|
114 | 0 | x += 2 * ux; |
115 | |
|
116 | 0 | vr = _mm_unpacklo_epi16 (vrl1, vrl0); |
117 | | /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */ |
118 | |
|
119 | 0 | s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2)); |
120 | | /* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */ |
121 | |
|
122 | 0 | vr = _mm_unpackhi_epi8 (vr, s); |
123 | | /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, |
124 | | * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 |
125 | | */ |
126 | |
|
127 | 0 | vr = _mm_maddubs_epi16 (vr, vw); |
128 | | |
129 | | /* When the weight is 0, the inverse weight is |
130 | | * 128 which can't be represented in a signed byte. |
131 | | * As a result maddubsw computes the following: |
132 | | * |
133 | | * r = l * -128 + r * 0 |
134 | | * |
135 | | * rather than the desired |
136 | | * |
137 | | * r = l * 128 + r * 0 |
138 | | * |
139 | | * We fix this by taking the absolute value of the |
140 | | * result. |
141 | | */ |
142 | 0 | vr = _mm_abs_epi16 (vr); |
143 | | |
144 | | /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */ |
145 | 0 | _mm_store_si128 (b++, vr); |
146 | 0 | } |
147 | |
|
148 | 0 | if (n == -1) |
149 | 0 | { |
150 | 0 | vrl1 = _mm_setzero_si128(); |
151 | 0 | goto final_pixel; |
152 | 0 | } |
153 | | |
154 | 0 | line->y = y; |
155 | 0 | } |
156 | | |
157 | | static uint32_t * |
158 | | ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask) |
159 | 0 | { |
160 | 0 | pixman_fixed_t fx, ux; |
161 | 0 | bilinear_info_t *info = iter->data; |
162 | 0 | line_t *line0, *line1; |
163 | 0 | int y0, y1; |
164 | 0 | int32_t dist_y; |
165 | 0 | __m128i vw; |
166 | 0 | int i; |
167 | |
|
168 | 0 | fx = info->x; |
169 | 0 | ux = iter->image->common.transform->matrix[0][0]; |
170 | |
|
171 | 0 | y0 = pixman_fixed_to_int (info->y); |
172 | 0 | y1 = y0 + 1; |
173 | |
|
174 | 0 | line0 = &info->lines[y0 & 0x01]; |
175 | 0 | line1 = &info->lines[y1 & 0x01]; |
176 | |
|
177 | 0 | if (line0->y != y0) |
178 | 0 | { |
179 | 0 | ssse3_fetch_horizontal ( |
180 | 0 | &iter->image->bits, line0, y0, fx, ux, iter->width); |
181 | 0 | } |
182 | |
|
183 | 0 | if (line1->y != y1) |
184 | 0 | { |
185 | 0 | ssse3_fetch_horizontal ( |
186 | 0 | &iter->image->bits, line1, y1, fx, ux, iter->width); |
187 | 0 | } |
188 | |
|
189 | 0 | dist_y = pixman_fixed_to_bilinear_weight (info->y); |
190 | 0 | dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS); |
191 | |
|
192 | 0 | vw = _mm_set_epi16 ( |
193 | 0 | dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y); |
194 | |
|
195 | 0 | for (i = 0; i + 3 < iter->width; i += 4) |
196 | 0 | { |
197 | 0 | __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); |
198 | 0 | __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); |
199 | 0 | __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2)); |
200 | 0 | __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2)); |
201 | 0 | __m128i r0, r1, tmp, p; |
202 | |
|
203 | 0 | r0 = _mm_mulhi_epu16 ( |
204 | 0 | _mm_sub_epi16 (bot0, top0), vw); |
205 | 0 | tmp = _mm_cmplt_epi16 (bot0, top0); |
206 | 0 | tmp = _mm_and_si128 (tmp, vw); |
207 | 0 | r0 = _mm_sub_epi16 (r0, tmp); |
208 | 0 | r0 = _mm_add_epi16 (r0, top0); |
209 | 0 | r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); |
210 | | /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ |
211 | 0 | r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); |
212 | | /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ |
213 | |
|
214 | 0 | r1 = _mm_mulhi_epu16 ( |
215 | 0 | _mm_sub_epi16 (bot1, top1), vw); |
216 | 0 | tmp = _mm_cmplt_epi16 (bot1, top1); |
217 | 0 | tmp = _mm_and_si128 (tmp, vw); |
218 | 0 | r1 = _mm_sub_epi16 (r1, tmp); |
219 | 0 | r1 = _mm_add_epi16 (r1, top1); |
220 | 0 | r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS); |
221 | 0 | r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1)); |
222 | | /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */ |
223 | |
|
224 | 0 | p = _mm_packus_epi16 (r0, r1); |
225 | |
|
226 | 0 | _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p); |
227 | 0 | } |
228 | |
|
229 | 0 | while (i < iter->width) |
230 | 0 | { |
231 | 0 | __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); |
232 | 0 | __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); |
233 | 0 | __m128i r0, tmp, p; |
234 | |
|
235 | 0 | r0 = _mm_mulhi_epu16 ( |
236 | 0 | _mm_sub_epi16 (bot0, top0), vw); |
237 | 0 | tmp = _mm_cmplt_epi16 (bot0, top0); |
238 | 0 | tmp = _mm_and_si128 (tmp, vw); |
239 | 0 | r0 = _mm_sub_epi16 (r0, tmp); |
240 | 0 | r0 = _mm_add_epi16 (r0, top0); |
241 | 0 | r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); |
242 | | /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ |
243 | 0 | r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); |
244 | | /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ |
245 | |
|
246 | 0 | p = _mm_packus_epi16 (r0, r0); |
247 | |
|
248 | 0 | if (iter->width - i == 1) |
249 | 0 | { |
250 | 0 | *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p); |
251 | 0 | i++; |
252 | 0 | } |
253 | 0 | else |
254 | 0 | { |
255 | 0 | _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p); |
256 | 0 | i += 2; |
257 | 0 | } |
258 | 0 | } |
259 | | |
260 | 0 | info->y += iter->image->common.transform->matrix[1][1]; |
261 | |
|
262 | 0 | return iter->buffer; |
263 | 0 | } |
264 | | |
265 | | static void |
266 | | ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter) |
267 | 0 | { |
268 | 0 | free (iter->data); |
269 | 0 | } |
270 | | |
271 | | static void |
272 | | ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info) |
273 | 0 | { |
274 | 0 | int width = iter->width; |
275 | 0 | bilinear_info_t *info; |
276 | 0 | pixman_vector_t v; |
277 | | |
278 | | /* Reference point is the center of the pixel */ |
279 | 0 | v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2; |
280 | 0 | v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2; |
281 | 0 | v.vector[2] = pixman_fixed_1; |
282 | |
|
283 | 0 | if (!pixman_transform_point_3d (iter->image->common.transform, &v)) |
284 | 0 | goto fail; |
285 | | |
286 | 0 | info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64); |
287 | 0 | if (!info) |
288 | 0 | goto fail; |
289 | | |
290 | 0 | info->x = v.vector[0] - pixman_fixed_1 / 2; |
291 | 0 | info->y = v.vector[1] - pixman_fixed_1 / 2; |
292 | |
|
293 | 0 | #define ALIGN(addr) \ |
294 | 0 | ((void *)((((uintptr_t)(addr)) + 15) & (~15))) |
295 | | |
296 | | /* It is safe to set the y coordinates to -1 initially |
297 | | * because COVER_CLIP_BILINEAR ensures that we will only |
298 | | * be asked to fetch lines in the [0, height) interval |
299 | | */ |
300 | 0 | info->lines[0].y = -1; |
301 | 0 | info->lines[0].buffer = ALIGN (&(info->data[0])); |
302 | 0 | info->lines[1].y = -1; |
303 | 0 | info->lines[1].buffer = ALIGN (info->lines[0].buffer + width); |
304 | |
|
305 | 0 | iter->get_scanline = ssse3_fetch_bilinear_cover; |
306 | 0 | iter->fini = ssse3_bilinear_cover_iter_fini; |
307 | |
|
308 | 0 | iter->data = info; |
309 | 0 | return; |
310 | | |
311 | 0 | fail: |
312 | | /* Something went wrong, either a bad matrix or OOM; in such cases, |
313 | | * we don't guarantee any particular rendering. |
314 | | */ |
315 | 0 | _pixman_log_error ( |
316 | 0 | FUNC, "Allocation failure or bad matrix, skipping rendering\n"); |
317 | | |
318 | 0 | iter->get_scanline = _pixman_iter_get_scanline_noop; |
319 | 0 | iter->fini = NULL; |
320 | 0 | } |
321 | | |
/*
 * Iterator table installed as imp->iter_info by
 * _pixman_implementation_create_ssse3().  It holds one real entry,
 * which selects ssse3_bilinear_cover_iter_init for a8r8g8b8 images
 * whose flags include the standard, scale-transform, bilinear-filter
 * and samples-cover-clip-bilinear fast-path flags, and is closed by a
 * PIXMAN_null entry.
 */
static const pixman_iter_info_t ssse3_iters[] =
{
    { PIXMAN_a8r8g8b8,
      (FAST_PATH_STANDARD_FLAGS |
       FAST_PATH_SCALE_TRANSFORM |
       FAST_PATH_BILINEAR_FILTER |
       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
      ITER_NARROW | ITER_SRC,
      ssse3_bilinear_cover_iter_init,
      /* NOTE(review): presumably the optional get_scanline and
       * write_back slots; confirm against pixman_iter_info_t. The
       * initializer above installs get_scanline itself.
       */
      NULL, NULL
    },

    { PIXMAN_null },
};
336 | | |
/*
 * Fast-path table passed to _pixman_implementation_create().  It
 * contains only the PIXMAN_OP_NONE entry, so this implementation
 * contributes no composite fast paths of its own.
 * NOTE(review): PIXMAN_OP_NONE presumably acts as the table
 * terminator; confirm in pixman-private.h.
 */
static const pixman_fast_path_t ssse3_fast_paths[] =
{
    { PIXMAN_OP_NONE },
};
341 | | |
342 | | pixman_implementation_t * |
343 | | _pixman_implementation_create_ssse3 (pixman_implementation_t *fallback) |
344 | 10 | { |
345 | 10 | pixman_implementation_t *imp = |
346 | 10 | _pixman_implementation_create (fallback, ssse3_fast_paths); |
347 | | |
348 | 10 | imp->iter_info = ssse3_iters; |
349 | | |
350 | 10 | return imp; |
351 | 10 | } |