/src/mozilla-central/gfx/2d/ssse3-scaler.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright © 2013 Soren Sandmann Pedersen |
3 | | * Copyright © 2013 Red Hat, Inc. |
4 | | * Copyright © 2016 Mozilla Foundation |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a |
7 | | * copy of this software and associated documentation files (the "Software"), |
8 | | * to deal in the Software without restriction, including without limitation |
9 | | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
10 | | * and/or sell copies of the Software, and to permit persons to whom the |
11 | | * Software is furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice (including the next |
14 | | * paragraph) shall be included in all copies or substantial portions of the |
15 | | * Software. |
16 | | * |
17 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
18 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
19 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
20 | | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
21 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
22 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
23 | | * DEALINGS IN THE SOFTWARE. |
24 | | * |
25 | | * Author: Soren Sandmann (soren.sandmann@gmail.com) |
26 | | * Jeff Muizelaar (jmuizelaar@mozilla.com) |
27 | | */ |
28 | | |
29 | | /* This has been adapted from the ssse3 code from pixman. It's currently |
30 | | * a mess as I want to try it out in practice before finalizing the details. |
31 | | */ |
32 | | |
33 | | #include <stdlib.h> |
34 | | #include <mmintrin.h> |
35 | | #include <xmmintrin.h> |
36 | | #include <emmintrin.h> |
37 | | #include <tmmintrin.h> |
38 | | #include <stdint.h> |
39 | | #include <assert.h> |
40 | | #include "ssse3-scaler.h" |
41 | | |
/* 16.16 signed fixed-point number: 16 integer bits, 16 fractional bits. */
typedef int32_t pixman_fixed_16_16_t;
typedef pixman_fixed_16_16_t pixman_fixed_t;

/* The value 1.0 in 16.16 fixed point. */
#define pixman_fixed_1 (pixman_int_to_fixed(1))
/* Integer part of a fixed-point value (fraction discarded by the shift). */
#define pixman_fixed_to_int(f) ((int) ((f) >> 16))
/* Integer to fixed point.  The shift is done on uint32_t because
 * left-shifting a negative signed value is undefined behaviour in C. */
#define pixman_int_to_fixed(i) ((pixman_fixed_t) ((uint32_t) (i) << 16))
/* Double to fixed point; the fraction beyond 16 bits is truncated. */
#define pixman_double_to_fixed(d) ((pixman_fixed_t) ((d) * 65536.0))
/* Largest and smallest integers whose fixed-point form fits in 16.16. */
#define PIXMAN_FIXED_INT_MAX 32767
#define PIXMAN_FIXED_INT_MIN (-32768)
/* Forward declaration; struct pixman_vector is defined below. */
typedef struct pixman_vector pixman_vector_t;

/* Integer used as a boolean result. */
typedef int pixman_bool_t;
/* 64-bit signed integer, aliased below for use as 48.16 fixed point. */
typedef int64_t pixman_fixed_32_32_t;
typedef pixman_fixed_32_32_t pixman_fixed_48_16_t;
/* Three-component vector of 64-bit fixed-point values. */
typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;

/* Three-component vector of pixman_fixed_t values. */
struct pixman_vector
{
    pixman_fixed_t vector[3];
};
typedef struct pixman_transform pixman_transform_t;

/* 3x3 matrix of pixman_fixed_t values, indexed as matrix[row][column]. */
struct pixman_transform
{
    pixman_fixed_t matrix[3][3];
};
67 | | |
/* force_inline: request unconditional inlining, spelled for the compiler
 * in use (MSVC keyword vs. GCC-style attribute). */
#ifdef _MSC_VER
#define force_inline __forceinline
#else
#define force_inline __inline__ __attribute__((always_inline))
#endif

/* Number of fractional bits kept for a bilinear interpolation weight. */
#define BILINEAR_INTERPOLATION_BITS 6
75 | | |
76 | | static force_inline int |
77 | | pixman_fixed_to_bilinear_weight (pixman_fixed_t x) |
78 | 0 | { |
79 | 0 | return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) & |
80 | 0 | ((1 << BILINEAR_INTERPOLATION_BITS) - 1); |
81 | 0 | } |
82 | | |
83 | | static void |
84 | | pixman_transform_point_31_16_3d (const pixman_transform_t *t, |
85 | | const pixman_vector_48_16_t *v, |
86 | | pixman_vector_48_16_t *result) |
87 | 0 | { |
88 | 0 | int i; |
89 | 0 | int64_t tmp[3][2]; |
90 | 0 |
|
91 | 0 | /* input vector values must have no more than 31 bits (including sign) |
92 | 0 | * in the integer part */ |
93 | 0 | assert (v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16))); |
94 | 0 | assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); |
95 | 0 | assert (v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16))); |
96 | 0 | assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); |
97 | 0 | assert (v->v[2] < ((pixman_fixed_48_16_t)1 << (30 + 16))); |
98 | 0 | assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); |
99 | 0 |
|
100 | 0 | for (i = 0; i < 3; i++) |
101 | 0 | { |
102 | 0 | tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16); |
103 | 0 | tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF); |
104 | 0 | tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16); |
105 | 0 | tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF); |
106 | 0 | tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16); |
107 | 0 | tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF); |
108 | 0 | } |
109 | 0 |
|
110 | 0 | result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); |
111 | 0 | result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); |
112 | 0 | result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16); |
113 | 0 | } |
114 | | |
115 | | static pixman_bool_t |
116 | | pixman_transform_point_3d (const struct pixman_transform *transform, |
117 | | struct pixman_vector * vector) |
118 | 0 | { |
119 | 0 | pixman_vector_48_16_t tmp; |
120 | 0 | tmp.v[0] = vector->vector[0]; |
121 | 0 | tmp.v[1] = vector->vector[1]; |
122 | 0 | tmp.v[2] = vector->vector[2]; |
123 | 0 |
|
124 | 0 | pixman_transform_point_31_16_3d (transform, &tmp, &tmp); |
125 | 0 |
|
126 | 0 | vector->vector[0] = tmp.v[0]; |
127 | 0 | vector->vector[1] = tmp.v[1]; |
128 | 0 | vector->vector[2] = tmp.v[2]; |
129 | 0 |
|
130 | 0 | return vector->vector[0] == tmp.v[0] && |
131 | 0 | vector->vector[1] == tmp.v[1] && |
132 | 0 | vector->vector[2] == tmp.v[2]; |
133 | 0 | } |
134 | | |
135 | | |
/* Source image as seen by the scaler. */
struct bits_image_t
{
    uint32_t * bits;               /* first pixel of the first row */
    int rowstride;                 /* distance between rows, in uint32_t units */
    pixman_transform_t *transform; /* destination-to-source transform */
};

typedef struct bits_image_t bits_image_t;
/* Placeholder type; not referenced by the code in this file. */
typedef struct {
    int unused;
} pixman_iter_info_t;

typedef struct pixman_iter_t pixman_iter_t;
/* Cleanup callback releasing whatever the iterator's init allocated. */
typedef void (* pixman_iter_fini_t) (pixman_iter_t *iter);

/* State for producing destination rows one at a time. */
struct pixman_iter_t
{
    int x, y;                 /* integer position of the first destination pixel */
    pixman_iter_fini_t fini;  /* set by init; NULL when init failed */
    bits_image_t *image;      /* source image */
    uint32_t * buffer;        /* where the next destination row is written */
    int width;                /* pixels per destination row */
    int height;               /* row count used to clamp the lower source row */
    void * data;              /* bilinear_info_t allocated by init */
};

/* One horizontally interpolated source row. */
typedef struct
{
    int y;              /* source row held in buffer, or -1 for none */
    uint64_t * buffer;  /* one uint64_t (four 16-bit channels) per pixel */
} line_t;

/* Per-iterator scratch state for the bilinear fetcher. */
typedef struct
{
    line_t lines[2];    /* row cache, indexed by (source row & 1) */
    pixman_fixed_t y;   /* source y of the next destination row */
    pixman_fixed_t x;   /* source x of the first destination pixel */
    uint64_t data[1];   /* start of the storage the line buffers are carved
                         * from; the allocation is made larger than this */
} bilinear_info_t;
175 | | |
/* Horizontally interpolate one source row into line->buffer.
 *
 * image: source image; row y starts at image->bits + y * image->rowstride.
 * line:  receives the result and has line->y set to y on return. Results
 *        are written with aligned 128-bit stores, two pixels per store, so
 *        line->buffer must be 16-byte aligned.
 * y:     source row to read.
 * x:     16.16 source position of the first output pixel.
 * ux:    16.16 source step between consecutive output pixels.
 * n:     number of output pixels.
 *
 * For every output pixel the two adjacent source pixels at the integer
 * part of the position are loaded with a single 64-bit read and blended
 * with weights taken from the fractional part. Each output pixel is four
 * 16-bit channel values. When n is odd, the last pixel is produced by
 * jumping into the loop body once more with the second pixel's input
 * zeroed; that pass still performs a full 128-bit store.
 */
static void
ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
                        int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
{
    uint32_t *bits = image->bits + y * image->rowstride;
    /* Low 16 bits of the positions of two consecutive output pixels,
     * paired with their complements (which yield the inverse weights). */
    __m128i vx = _mm_set_epi16 (
        - (x + 1), x, - (x + 1), x,
        - (x + ux + 1), x + ux, - (x + ux + 1), x + ux);
    /* Per-iteration increment: each pass advances by two output pixels. */
    __m128i vux = _mm_set_epi16 (
        - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
        - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
    /* Adds 1 to the inverse-weight lanes so each weight pair sums to
     * 1 << BILINEAR_INTERPOLATION_BITS. */
    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
    __m128i *b = (__m128i *)line->buffer;
    __m128i vrl0, vrl1;

    while ((n -= 2) >= 0)
    {
        __m128i vw, vr, s;
#ifdef HACKY_PADDING
        /* Debug variant that clamps/zeroes out-of-range reads.
         * NOTE(review): this branch calls printf, and <stdio.h> is not among
         * this file's direct includes -- confirm before enabling. */
        if (pixman_fixed_to_int(x + ux) >= image->rowstride) {
            vrl1 = _mm_setzero_si128();
            printf("overread 2loop\n");
        } else {
            if (pixman_fixed_to_int(x + ux) < 0)
                printf("underflow\n");
            vrl1 = _mm_loadl_epi64(
                (__m128i *)(bits + (pixman_fixed_to_int(x + ux) < 0 ? 0 : pixman_fixed_to_int(x + ux))));
        }
#else
        vrl1 = _mm_loadl_epi64(
            (__m128i *)(bits + pixman_fixed_to_int(x + ux)));
#endif
        /* vrl1: R1, L1 */

    /* Entry point for the odd trailing pixel (see the goto below). */
    final_pixel:
#ifdef HACKY_PADDING
        vrl0 = _mm_loadl_epi64 (
            (__m128i *)(bits + (pixman_fixed_to_int (x) < 0 ? 0 : pixman_fixed_to_int (x))));
#else
        vrl0 = _mm_loadl_epi64 (
            (__m128i *)(bits + pixman_fixed_to_int (x)));
#endif
        /* vrl0: R0, L0 */

        /* The weights are based on vx which is a vector of
         *
         *         - (x + 1), x, - (x + 1), x,
         *         - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
         *
         * so the 16 bit weights end up like this:
         *
         *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
         *
         * and after shifting and packing, we get these bytes:
         *
         *         iw0, w0, iw0, w0, iw1, w1, iw1, w1,
         *         iw0, w0, iw0, w0, iw1, w1, iw1, w1,
         *
         * which means the first and the second input pixel
         * have to be interleaved like this:
         *
         *         la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
         *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
         *
         * before maddubsw can be used.
         */

        vw = _mm_add_epi16 (
            vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
        /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
         */

        vw = _mm_packus_epi16 (vw, vw);
        /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
         *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
         */
        vx = _mm_add_epi16 (vx, vux);

        x += 2 * ux;

        vr = _mm_unpacklo_epi16 (vrl1, vrl0);
        /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */

        s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
        /* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */

        vr = _mm_unpackhi_epi8 (vr, s);
        /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
         *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
         */

        vr = _mm_maddubs_epi16 (vr, vw);

        /* maddubsw treats the weight bytes as signed. With 7 weight bits
         * a weight of 0 gives an inverse weight of 128, which does not
         * fit in a signed byte and would have to be repaired by taking
         * the absolute value of the result.
         *
         * No such fix-up is done here: the shift above leaves
         * BILINEAR_INTERPOLATION_BITS weight bits, and with the value 6
         * used by this file the inverse weight is at most 64, which is
         * representable. The fix-up must be reinstated if the precision
         * is raised to 7 bits.
         */

        vr = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (2, 0, 3, 1));
        /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
        _mm_store_si128 (b++, vr);
    }

    /* n == -1 here means one pixel is left over. Process it by jumping
     * into the loop body with the second pixel's input zeroed; the loop
     * condition then drops n to -3, so this is taken at most once. */
    if (n == -1)
    {
        vrl1 = _mm_setzero_si128();
        goto final_pixel;
    }

    line->y = y;
}
297 | | |
298 | | // scale a line of destination pixels |
299 | | static uint32_t * |
300 | | ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask) |
301 | 0 | { |
302 | 0 | pixman_fixed_t fx, ux; |
303 | 0 | bilinear_info_t *info = iter->data; |
304 | 0 | line_t *line0, *line1; |
305 | 0 | int y0, y1; |
306 | 0 | int32_t dist_y; |
307 | 0 | __m128i vw, uvw; |
308 | 0 | int i; |
309 | 0 |
|
310 | 0 | fx = info->x; |
311 | 0 | ux = iter->image->transform->matrix[0][0]; |
312 | 0 |
|
313 | 0 | y0 = pixman_fixed_to_int (info->y); |
314 | 0 | if (y0 < 0) |
315 | 0 | *(volatile char*)0 = 9; |
316 | 0 | y1 = y0 + 1; |
317 | 0 |
|
318 | 0 | // clamping in y direction |
319 | 0 | if (y1 >= iter->height) { |
320 | 0 | y1 = iter->height - 1; |
321 | 0 | } |
322 | 0 |
|
323 | 0 | line0 = &info->lines[y0 & 0x01]; |
324 | 0 | line1 = &info->lines[y1 & 0x01]; |
325 | 0 |
|
326 | 0 | if (line0->y != y0) |
327 | 0 | { |
328 | 0 | ssse3_fetch_horizontal ( |
329 | 0 | iter->image, line0, y0, fx, ux, iter->width); |
330 | 0 | } |
331 | 0 |
|
332 | 0 | if (line1->y != y1) |
333 | 0 | { |
334 | 0 | ssse3_fetch_horizontal ( |
335 | 0 | iter->image, line1, y1, fx, ux, iter->width); |
336 | 0 | } |
337 | 0 |
|
338 | | #ifdef PIXMAN_STYLE_INTERPOLATION |
339 | | dist_y = pixman_fixed_to_bilinear_weight (info->y); |
340 | | dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS); |
341 | | |
342 | | vw = _mm_set_epi16 ( |
343 | | dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y); |
344 | | |
345 | | #else |
346 | | // setup the weights for the top (vw) and bottom (uvw) lines |
347 | 0 | dist_y = pixman_fixed_to_bilinear_weight (info->y); |
348 | 0 | // we use 15 instead of 16 because we need an extra bit to handle when the weights are 0 and 1 |
349 | 0 | dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS); |
350 | 0 |
|
351 | 0 | vw = _mm_set_epi16 ( |
352 | 0 | dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y); |
353 | 0 |
|
354 | 0 |
|
355 | 0 | dist_y = (1 << BILINEAR_INTERPOLATION_BITS) - pixman_fixed_to_bilinear_weight (info->y); |
356 | 0 | dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS); |
357 | 0 | uvw = _mm_set_epi16 ( |
358 | 0 | dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y); |
359 | 0 | #endif |
360 | 0 |
|
361 | 0 | for (i = 0; i + 3 < iter->width; i += 4) |
362 | 0 | { |
363 | 0 | __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); |
364 | 0 | __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); |
365 | 0 | __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2)); |
366 | 0 | __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2)); |
367 | | #ifdef PIXMAN_STYLE_INTERPOLATION |
368 | | __m128i r0, r1, tmp, p; |
369 | | |
370 | | r0 = _mm_mulhi_epu16 ( |
371 | | _mm_sub_epi16 (bot0, top0), vw); |
372 | | tmp = _mm_cmplt_epi16 (bot0, top0); |
373 | | tmp = _mm_and_si128 (tmp, vw); |
374 | | r0 = _mm_sub_epi16 (r0, tmp); |
375 | | r0 = _mm_add_epi16 (r0, top0); |
376 | | r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); |
377 | | /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ |
378 | | //r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); |
379 | | /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ |
380 | | |
381 | | // tmp = bot1 < top1 ? vw : 0; |
382 | | // r1 = (bot1 - top1)*vw + top1 - tmp |
383 | | // r1 = bot1*vw - vw*top1 + top1 - tmp |
384 | | // r1 = bot1*vw + top1 - vw*top1 - tmp |
385 | | // r1 = bot1*vw + top1*(1 - vw) - tmp |
386 | | r1 = _mm_mulhi_epu16 ( |
387 | | _mm_sub_epi16 (bot1, top1), vw); |
388 | | tmp = _mm_cmplt_epi16 (bot1, top1); |
389 | | tmp = _mm_and_si128 (tmp, vw); |
390 | | r1 = _mm_sub_epi16 (r1, tmp); |
391 | | r1 = _mm_add_epi16 (r1, top1); |
392 | | r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS); |
393 | | //r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1)); |
394 | | /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */ |
395 | | #else |
396 | | __m128i r0, r1, p; |
397 | 0 | top0 = _mm_mulhi_epu16 (top0, uvw); |
398 | 0 | bot0 = _mm_mulhi_epu16 (bot0, vw); |
399 | 0 | r0 = _mm_add_epi16(top0, bot0); |
400 | 0 | r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS-1); |
401 | 0 |
|
402 | 0 | top1 = _mm_mulhi_epu16 (top1, uvw); |
403 | 0 | bot1 = _mm_mulhi_epu16 (bot1, vw); |
404 | 0 | r1 = _mm_add_epi16(top1, bot1); |
405 | 0 | r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS-1); |
406 | 0 | #endif |
407 | 0 |
|
408 | 0 | p = _mm_packus_epi16 (r0, r1); |
409 | 0 | _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p); |
410 | 0 | } |
411 | 0 |
|
412 | 0 | while (i < iter->width) |
413 | 0 | { |
414 | 0 | __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); |
415 | 0 | __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); |
416 | 0 |
|
417 | | #ifdef PIXMAN_STYLE_INTERPOLATION |
418 | | __m128i r0, tmp, p; |
419 | | r0 = _mm_mulhi_epu16 ( |
420 | | _mm_sub_epi16 (bot0, top0), vw); |
421 | | tmp = _mm_cmplt_epi16 (bot0, top0); |
422 | | tmp = _mm_and_si128 (tmp, vw); |
423 | | r0 = _mm_sub_epi16 (r0, tmp); |
424 | | r0 = _mm_add_epi16 (r0, top0); |
425 | | r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); |
426 | | /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ |
427 | | r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); |
428 | | /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ |
429 | | #else |
430 | | __m128i r0, p; |
431 | 0 | top0 = _mm_mulhi_epu16 (top0, uvw); |
432 | 0 | bot0 = _mm_mulhi_epu16 (bot0, vw); |
433 | 0 | r0 = _mm_add_epi16(top0, bot0); |
434 | 0 | r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS-1); |
435 | 0 | #endif |
436 | 0 |
|
437 | 0 | p = _mm_packus_epi16 (r0, r0); |
438 | 0 |
|
439 | 0 | if (iter->width - i == 1) |
440 | 0 | { |
441 | 0 | *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p); |
442 | 0 | i++; |
443 | 0 | } |
444 | 0 | else |
445 | 0 | { |
446 | 0 | _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p); |
447 | 0 | i += 2; |
448 | 0 | } |
449 | 0 | } |
450 | 0 |
|
451 | 0 | info->y += iter->image->transform->matrix[1][1]; |
452 | 0 |
|
453 | 0 | return iter->buffer; |
454 | 0 | } |
455 | | |
456 | | static void |
457 | | ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter) |
458 | 0 | { |
459 | 0 | free (iter->data); |
460 | 0 | } |
461 | | |
462 | | static void |
463 | | ssse3_bilinear_cover_iter_init (pixman_iter_t *iter) |
464 | 0 | { |
465 | 0 | int width = iter->width; |
466 | 0 | bilinear_info_t *info; |
467 | 0 | pixman_vector_t v; |
468 | 0 |
|
469 | 0 | if (iter->x > PIXMAN_FIXED_INT_MAX || |
470 | 0 | iter->x < PIXMAN_FIXED_INT_MIN || |
471 | 0 | iter->y > PIXMAN_FIXED_INT_MAX || |
472 | 0 | iter->y < PIXMAN_FIXED_INT_MIN) |
473 | 0 | goto fail; |
474 | 0 | |
475 | 0 | /* Reference point is the center of the pixel */ |
476 | 0 | v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2; |
477 | 0 | v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2; |
478 | 0 | v.vector[2] = pixman_fixed_1; |
479 | 0 |
|
480 | 0 | if (!pixman_transform_point_3d (iter->image->transform, &v)) |
481 | 0 | goto fail; |
482 | 0 | |
483 | 0 | info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64); |
484 | 0 | if (!info) |
485 | 0 | goto fail; |
486 | 0 | |
487 | 0 | info->x = v.vector[0] - pixman_fixed_1 / 2; |
488 | 0 | info->y = v.vector[1] - pixman_fixed_1 / 2; |
489 | 0 |
|
490 | 0 | #define ALIGN(addr) \ |
491 | 0 | ((void *)((((uintptr_t)(addr)) + 15) & (~15))) |
492 | 0 |
|
493 | 0 | /* It is safe to set the y coordinates to -1 initially |
494 | 0 | * because COVER_CLIP_BILINEAR ensures that we will only |
495 | 0 | * be asked to fetch lines in the [0, height) interval |
496 | 0 | */ |
497 | 0 | info->lines[0].y = -1; |
498 | 0 | info->lines[0].buffer = ALIGN (&(info->data[0])); |
499 | 0 | info->lines[1].y = -1; |
500 | 0 | info->lines[1].buffer = ALIGN (info->lines[0].buffer + width); |
501 | 0 |
|
502 | 0 | iter->fini = ssse3_bilinear_cover_iter_fini; |
503 | 0 |
|
504 | 0 | iter->data = info; |
505 | 0 | return; |
506 | 0 |
|
507 | 0 | fail: |
508 | 0 | /* Something went wrong, either a bad matrix or OOM; in such cases, |
509 | 0 | * we don't guarantee any particular rendering. |
510 | 0 | */ |
511 | 0 | iter->fini = NULL; |
512 | 0 | } |
513 | | |
514 | | /* scale the src from src_width/height to dest_width/height drawn |
515 | | * into the rectangle x,y width,height |
516 | | * src_stride and dst_stride are 4 byte units */ |
517 | | bool ssse3_scale_data(uint32_t *src, int src_width, int src_height, int src_stride, |
518 | | uint32_t *dest, int dest_width, int dest_height, |
519 | | int dest_stride, |
520 | | int x, int y, |
521 | | int width, int height) |
522 | 0 | { |
523 | 0 | //XXX: assert(src_width > 1) |
524 | 0 | pixman_transform_t transform = { |
525 | 0 | { { pixman_fixed_1, 0, 0 }, |
526 | 0 | { 0, pixman_fixed_1, 0 }, |
527 | 0 | { 0, 0, pixman_fixed_1 } } |
528 | 0 | }; |
529 | 0 | double width_scale = ((double)src_width)/dest_width; |
530 | 0 | double height_scale = ((double)src_height)/dest_height; |
531 | 0 | #define AVOID_PADDING |
532 | 0 | #ifdef AVOID_PADDING |
533 | 0 | // scale up by enough that we don't read outside of the bounds of the source surface |
534 | 0 | // currently this is required to avoid reading out of bounds. |
535 | 0 | if (width_scale < 1) { |
536 | 0 | width_scale = (double)(src_width-1)/dest_width; |
537 | 0 | transform.matrix[0][2] = pixman_fixed_1/2; |
538 | 0 | } |
539 | 0 | if (height_scale < 1) { |
540 | 0 | height_scale = (double)(src_height-1)/dest_height; |
541 | 0 | transform.matrix[1][2] = pixman_fixed_1/2; |
542 | 0 | } |
543 | 0 | #endif |
544 | 0 | transform.matrix[0][0] = pixman_double_to_fixed(width_scale); |
545 | 0 | transform.matrix[1][1] = pixman_double_to_fixed(height_scale); |
546 | 0 | transform.matrix[2][2] = pixman_fixed_1; |
547 | 0 |
|
548 | 0 | bits_image_t image; |
549 | 0 | image.bits = src; |
550 | 0 | image.transform = &transform; |
551 | 0 | image.rowstride = src_stride; |
552 | 0 |
|
553 | 0 | pixman_iter_t iter; |
554 | 0 | iter.image = ℑ |
555 | 0 | iter.x = x; |
556 | 0 | iter.y = y; |
557 | 0 | iter.width = width; |
558 | 0 | iter.height = src_height; |
559 | 0 | iter.buffer = dest; |
560 | 0 | iter.data = NULL; |
561 | 0 |
|
562 | 0 | ssse3_bilinear_cover_iter_init(&iter); |
563 | 0 |
|
564 | 0 | if (!iter.fini) |
565 | 0 | return false; |
566 | 0 | |
567 | 0 | if (iter.data) { |
568 | 0 | for (int iy = 0; iy < height; iy++) { |
569 | 0 | ssse3_fetch_bilinear_cover(&iter, NULL); |
570 | 0 | iter.buffer += dest_stride; |
571 | 0 | } |
572 | 0 | ssse3_bilinear_cover_iter_fini(&iter); |
573 | 0 | } |
574 | 0 | return true; |
575 | 0 | } |