/src/mozilla-central/gfx/2d/ImageScalingSSE2.cpp
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 * R = S + 2C, or in the case of a and b: (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * carries of the individual numbers, since the sum of 3 bits can only ever
 * have a carry of one.
 *
 * We then observe that the average is then ((carry << 1) + sum) >> 1, or,
 * assuming overflows and underflows are avoided, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1, or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4-input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or to reverse the proof:
 * avg = ((sum >> 1) + carry + (d >> 1)) >> 1
 * avg = (((a + b + c) >> 1) + (d >> 1)) >> 1
 * avg = (a + b + c + d) >> 2
 *
 * An additional fact used in the SSE versions is the concept that we can
 * trivially convert a rounded average to a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */

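/* Illustrative only -- a minimal scalar sketch of the scheme above, kept out
 * of the build with #if 0. It works on whole integers rather than packed
 * bytes, and the helper names (IllustrativeAvg2, IllustrativeAvg4,
 * RoundedAvg8) are hypothetical additions, not part of the original code;
 * the real functions below do the equivalent work per byte with SSE2. The
 * compile-time checks assume C++14 constexpr if ever enabled.
 */
#if 0
// Exact truncated average without overflow: (x + y) >> 1.
constexpr uint32_t IllustrativeAvg2(uint32_t x, uint32_t y)
{
  return ((x ^ y) >> 1) + (x & y);
}

// Four-input average built from the sum/carry decomposition. Since
// a + b + c == sum + 2 * carry, averaging (sum, d) first and then folding in
// carry yields (a + b + c + d) >> 2 exactly.
constexpr uint32_t IllustrativeAvg4(uint32_t a, uint32_t b,
                                    uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);
  return IllustrativeAvg2(IllustrativeAvg2(sum, d), carry);
}

static_assert(IllustrativeAvg4(1, 1, 1, 1) == 1, "equal inputs");
static_assert(IllustrativeAvg4(10, 20, 30, 43) == 25, "103 >> 2 == 25");

// Rounded-to-truncated conversion from the comment above, for 8-bit values:
// ~f(~a, ~b) == g(a, b).
constexpr uint32_t RoundedAvg8(uint32_t x, uint32_t y)
{
  return (x + y + 1) >> 1;
}

static_assert((~RoundedAvg8(~200u & 0xff, ~201u & 0xff) & 0xff) ==
              ((200 + 201) >> 1), "rounded average of complements");
#endif
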
MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
{
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here; MSVC does not allow passing more than 3
 * __m128i arguments on the stack, and it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010; it does -not- inline
 * with just the inline directive.
 */
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile-time constant.
#define shuffle_si128(arga, argb, imm) \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)))

  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

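  // At this point *a holds the even-numbered pixels of the upper row and *b
  // the odd-numbered ones, with *c/*d the same for the lower row, so lane i
  // of the result averages the 2x2 source block starting at pixel 2*i.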
  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}

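// Truncated per-byte average of two registers (4 pixels from each of two
// rows), using the complement trick above to turn the rounded _mm_avg_epu8
// into a truncated average.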
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
{
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

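// Horizontal variant: deinterleaves 8 pixels from a single row into their
// even- and odd-numbered halves and returns the truncated average of each
// adjacent pair.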
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
{
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

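// Scalar fallback: truncated per-channel average of a 2x2 block of packed
// 32-bit pixels, using the same sum/carry decomposition as the SSE2 path.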
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Without a byte-based average instruction we mask off the low bit of each
  // byte before shifting, so that no bit leaks across byte boundaries.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}

// Simple 2 pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
{
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}
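
// Illustrative only -- a tiny sanity check for the scalar helpers, kept out
// of the build with #if 0. The function name SanityCheckScalarAverages is
// hypothetical; MOZ_ASSERT is the assertion macro used elsewhere in Gecko.
#if 0
static void SanityCheckScalarAverages()
{
  // Per-channel truncated averages: 0xff and 0x01 average to 0x80.
  MOZ_ASSERT(Avg2(0xff00ff00, 0x01000100) == 0x80008000);
  // 2x2 block: three identical pixels and one whose low channel is 4 higher;
  // (0x40 + 0x40 + 0x40 + 0x44) >> 2 == 0x41.
  MOZ_ASSERT(Avg2x2(0x10203040, 0x10203040, 0x10203040, 0x10203044) ==
             0x10203041);
}
#endif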

namespace mozilla {
namespace gfx {

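// Halves the source image in both dimensions: every destination pixel is the
// truncated per-channel average of the corresponding 2x2 block of 32-bit
// source pixels.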
void
ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                  const IntSize &aSourceSize, uint8_t *aDest,
                                  uint32_t aDestStride)
{
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'simd' implementation for this.
    //
    // Potentially we only have to do this in the last row, since overflowing
    // by 8 pixels in an earlier row appears to be harmless: it doesn't touch
    // invalid memory, even when reading and writing to the same surface. In
    // practice we only do this when doing an additional downscale pass, and
    // in that situation we have unused stride to write into harmlessly.
    // I do not believe the additional code complexity would be worth it, though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}

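// Halves the source image vertically only: every destination pixel is the
// truncated per-channel average of two vertically adjacent source pixels.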
void
ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                        const IntSize &aSourceSize, uint8_t *aDest,
                                        uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // The lower row doesn't align well.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in the previous function.
    for (; x < aSourceSize.width; x++) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}

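// Halves the source image horizontally only: every destination pixel is the
// truncated per-channel average of two horizontally adjacent source pixels.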
void
ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                          const IntSize &aSourceSize, uint8_t *aDest,
                                          uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in the previous function.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}
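
// Illustrative only -- a scalar reference for what HalfImage2D_SSE2 computes,
// kept out of the build with #if 0. The function name HalfImage2D_Reference
// is hypothetical; it reuses the Avg2x2 helper defined above.
#if 0
static void
HalfImage2D_Reference(uint8_t *aSource, int32_t aSourceStride,
                      const IntSize &aSourceSize, uint8_t *aDest,
                      uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    uint32_t *dest = (uint32_t*)(aDest + (y / 2) * aDestStride);
    for (int x = 0; x < aSourceSize.width; x += 2) {
      uint32_t *upperRow = (uint32_t*)(aSource + y * aSourceStride) + x;
      uint32_t *lowerRow = (uint32_t*)(aSource + (y + 1) * aSourceStride) + x;
      // Each destination pixel is the truncated per-channel average of a
      // 2x2 block of source pixels.
      *dest++ = Avg2x2(upperRow[0], upperRow[1], lowerRow[0], lowerRow[1]);
    }
  }
}
#endif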

} // namespace gfx
} // namespace mozilla