/src/mozilla-central/gfx/2d/BlurSSE2.cpp
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* vim: set ts=8 sts=2 et sw=2 tw=80: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | |
7 | | #include "Blur.h" |
8 | | |
9 | | #include "SSEHelpers.h" |
10 | | |
11 | | #include <string.h> |
12 | | |
13 | | namespace mozilla { |
14 | | namespace gfx { |
15 | | |
16 | | MOZ_ALWAYS_INLINE |
17 | | __m128i Divide(__m128i aValues, __m128i aDivisor) |
18 | 0 | { |
19 | 0 | const __m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff); |
20 | 0 | static const union { |
21 | 0 | int64_t i64[2]; |
22 | 0 | __m128i m; |
23 | 0 | } roundingAddition = { { int64_t(1) << 31, int64_t(1) << 31 } }; |
24 | 0 |
25 | 0 | __m128i multiplied31 = _mm_mul_epu32(aValues, aDivisor); |
26 | 0 | __m128i multiplied42 = _mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor); |
27 | 0 |
28 | 0 | // Add 1 << 31 before shifting or masking the lower 32 bits away, so that the |
29 | 0 | // result is rounded. |
30 | 0 | __m128i p_3_1 = _mm_srli_epi64(_mm_add_epi64(multiplied31, roundingAddition.m), 32); |
31 | 0 | __m128i p4_2_ = _mm_and_si128(_mm_add_epi64(multiplied42, roundingAddition.m), mask); |
32 | 0 | __m128i p4321 = _mm_or_si128(p_3_1, p4_2_); |
33 | 0 | return p4321; |
34 | 0 | } |
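
Divide performs four rounded 32-bit divisions at once by multiplying with a precomputed fixed-point reciprocal: per lane it computes (value * reciprocal + 2^31) >> 32, where the 2^31 addend rounds to nearest rather than truncating. A scalar sketch of the same arithmetic, assuming aReciprocal == (1 << 32) / divisor as set up in BoxBlur_SSE2 below (hypothetical helper, illustration only):

```cpp
#include <cstdint>

// Rounded division by a constant via its 32.32 fixed-point reciprocal.
// Mirrors one lane of the SSE2 Divide() above.
uint32_t DivideScalar(uint32_t aValue, uint32_t aReciprocal)
{
  uint64_t product = uint64_t(aValue) * aReciprocal;        // 64-bit intermediate
  return uint32_t((product + (uint64_t(1) << 31)) >> 32);   // round, keep high half
}
```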
35 | | |
36 | | MOZ_ALWAYS_INLINE |
37 | | __m128i BlurFourPixels(const __m128i& aTopLeft, const __m128i& aTopRight, |
38 | | const __m128i& aBottomRight, const __m128i& aBottomLeft, |
39 | | const __m128i& aDivisor) |
40 | 0 | { |
41 | 0 | __m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(aBottomRight, aTopRight), aBottomLeft), aTopLeft); |
42 | 0 | return Divide(values, aDivisor); |
43 | 0 | } |
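
BlurFourPixels relies on the summed-area-table identity: the total of any axis-aligned box is four corner lookups in the integral image, added and subtracted so the overlapping regions cancel. A scalar sketch under the assumption of inclusive prefix sums (illustrative names, not part of this file):

```cpp
// Sum of the box spanning rows (aTop, aBottom] and columns (aLeft, aRight]
// of the source, read from an integral image with the given 32-bit stride.
uint32_t BoxSum(const uint32_t* aIntegral, size_t aStride,
                int aLeft, int aTop, int aRight, int aBottom)
{
  return aIntegral[aBottom * aStride + aRight]   // bottom-right
       - aIntegral[aTop * aStride + aRight]      // top-right
       - aIntegral[aBottom * aStride + aLeft]    // bottom-left
       + aIntegral[aTop * aStride + aLeft];      // top-left
}
```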
44 | | |
45 | | MOZ_ALWAYS_INLINE |
46 | | void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource, |
47 | | int32_t aSourceWidth, int32_t aLeftInflation, |
48 | | int32_t aRightInflation) |
49 | 0 | { |
50 | 0 | int32_t currentRowSum = 0; |
51 | 0 |
52 | 0 | for (int x = 0; x < aLeftInflation; x++) { |
53 | 0 | currentRowSum += aSource[0]; |
54 | 0 | aDest[x] = currentRowSum; |
55 | 0 | } |
56 | 0 | for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) { |
57 | 0 | currentRowSum += aSource[(x - aLeftInflation)]; |
58 | 0 | aDest[x] = currentRowSum; |
59 | 0 | } |
60 | 0 | for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) { |
61 | 0 | currentRowSum += aSource[aSourceWidth - 1]; |
62 | 0 | aDest[x] = currentRowSum; |
63 | 0 | } |
64 | 0 | } |
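
LoadIntegralRowFromRow replicates the edge pixels into the left and right inflation zones while accumulating a running prefix sum, so box lookups near the borders see an edge-extended image. A worked example with hypothetical values:

```cpp
// aSource = { 10, 20 }, aSourceWidth = 2, aLeftInflation = 2, aRightInflation = 1
// aDest   = { 10, 20, 30, 50, 70 }
//   10, 20 : aSource[0] replicated twice and accumulated
//   30, 50 : the source pixels accumulated
//   70     : aSource[1] replicated once and accumulated
```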
65 | | |
66 | | // This function calculates an integral of four pixels stored in the 4 |
67 | | // 32-bit integers on aPixels. i.e. for { 30, 50, 80, 100 } this returns |
68 | | // { 30, 80, 160, 260 }. This seems to be the fastest way to do this after |
69 | | // much testing. |
70 | | MOZ_ALWAYS_INLINE |
71 | | __m128i AccumulatePixelSums(__m128i aPixels) |
72 | 0 | { |
73 | 0 | __m128i sumPixels = aPixels; |
74 | 0 | __m128i currentPixels = _mm_slli_si128(aPixels, 4); |
75 | 0 | sumPixels = _mm_add_epi32(sumPixels, currentPixels); |
76 | 0 | currentPixels = _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels); |
77 | 0 |
78 | 0 | return _mm_add_epi32(sumPixels, currentPixels); |
79 | 0 | } |
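
The two vector steps are a log-time inclusive prefix sum: add the neighbour one lane below (the 4-byte _mm_slli_si128), then add the running total from two lanes below (the _mm_unpacklo_epi64 against zero). A scalar sketch of the same two steps (illustration only):

```cpp
#include <cstdint>

// In-place inclusive prefix sum of four lanes, mirroring the shifts above.
void AccumulatePixelSumsScalar(uint32_t v[4])
{
  // Step 1: each lane adds the lane one below it.
  v[3] += v[2]; v[2] += v[1]; v[1] += v[0];
  // Step 2: the top two lanes add the running sum from two lanes below.
  v[3] += v[1]; v[2] += v[0];
  // { 30, 50, 80, 100 } -> { 30, 80, 130, 180 } -> { 30, 80, 160, 260 }
}
```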
80 | | |
81 | | MOZ_ALWAYS_INLINE void |
82 | | GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation, |
83 | | int32_t aTopInflation, int32_t aBottomInflation, |
84 | | uint32_t *aIntegralImage, size_t aIntegralImageStride, |
85 | | uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize) |
86 | 0 | { |
87 | 0 | MOZ_ASSERT(!(aLeftInflation & 3)); |
88 | 0 |
89 | 0 | uint32_t stride32bit = aIntegralImageStride / 4; |
90 | 0 |
91 | 0 | IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation, |
92 | 0 | aSize.height + aTopInflation + aBottomInflation); |
93 | 0 |
94 | 0 | LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation); |
95 | 0 |
96 | 0 | for (int y = 1; y < aTopInflation + 1; y++) { |
97 | 0 | uint32_t *intRow = aIntegralImage + (y * stride32bit); |
98 | 0 | uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit; |
99 | 0 | uint32_t *intFirstRow = aIntegralImage; |
100 | 0 |
101 | 0 | for (int x = 0; x < integralImageSize.width; x += 4) { |
102 | 0 | __m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x)); |
103 | 0 | __m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x)); |
104 | 0 | _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow)); |
105 | 0 | } |
106 | 0 | } |
107 | 0 |
108 | 0 | for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) { |
109 | 0 | __m128i currentRowSum = _mm_setzero_si128(); |
110 | 0 | uint32_t *intRow = aIntegralImage + (y * stride32bit); |
111 | 0 | uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit; |
112 | 0 | uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation); |
113 | 0 |
114 | 0 | uint32_t pixel = sourceRow[0]; |
115 | 0 | for (int x = 0; x < aLeftInflation; x += 4) { |
116 | 0 | __m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0))); |
117 | 0 |
118 | 0 | sumPixels = _mm_add_epi32(sumPixels, currentRowSum); |
119 | 0 |
120 | 0 | currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3)); |
121 | 0 |
122 | 0 | _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x)))); |
123 | 0 | } |
124 | 0 | for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) { |
125 | 0 | uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation)); |
126 | 0 |
127 | 0 | // It's important to shuffle here. When we exit this loop currentRowSum |
128 | 0 | // has to be set to sumPixels, so that the following loop can get the |
129 | 0 | // correct pixel for the currentRowSum. The highest order pixel in |
130 | 0 | // currentRowSum could've originated from accumulation in the stride. |
131 | 0 | currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3)); |
132 | 0 |
133 | 0 | __m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8( _mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128())); |
134 | 0 | sumPixels = _mm_add_epi32(sumPixels, currentRowSum); |
135 | 0 |
136 | 0 | currentRowSum = sumPixels; |
137 | 0 |
138 | 0 | _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x)))); |
139 | 0 | } |
140 | 0 |
141 | 0 | pixel = sourceRow[aSize.width - 1]; |
142 | 0 | int x = (aSize.width + aLeftInflation); |
143 | 0 | if ((aSize.width & 3)) { |
144 | 0 | // Deal with unaligned portion. Get the correct pixel from currentRowSum, |
145 | 0 | // see explanation above. |
146 | 0 | uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
147 | 0 | for (; x < integralImageSize.width; x++) { |
148 | 0 | // We could be unaligned here! |
149 | 0 | if (!(x & 3)) { |
150 | 0 | // aligned! |
151 | 0 | currentRowSum = _mm_set1_epi32(intCurrentRowSum); |
152 | 0 | break; |
153 | 0 | } |
154 | 0 | intCurrentRowSum += pixel; |
155 | 0 | intRow[x] = intPrevRow[x] + intCurrentRowSum; |
156 | 0 | } |
157 | 0 | } else { |
158 | 0 | currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3)); |
159 | 0 | } |
160 | 0 | for (; x < integralImageSize.width; x += 4) { |
161 | 0 | __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel)); |
162 | 0 |
163 | 0 | sumPixels = _mm_add_epi32(sumPixels, currentRowSum); |
164 | 0 |
165 | 0 | currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3)); |
166 | 0 |
167 | 0 | _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x)))); |
168 | 0 | } |
169 | 0 | } |
170 | 0 |
171 | 0 | if (aBottomInflation) { |
172 | 0 | // Store the last valid row of our source image in the last row of |
173 | 0 | // our integral image. This will be overwritten with the correct values |
174 | 0 | // in the upcoming loop. |
175 | 0 | LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit, |
176 | 0 | aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation); |
177 | 0 |
178 | 0 |
179 | 0 | for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) { |
180 | 0 | __m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit)); |
181 | 0 | __m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit); |
182 | 0 | __m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit); |
183 | 0 |
184 | 0 | for (int x = 0; x < integralImageSize.width; x += 4) { |
185 | 0 | _mm_store_si128(intRow + (x / 4), |
186 | 0 | _mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)), |
187 | 0 | _mm_load_si128(intPrevRow + (x / 4)))); |
188 | 0 | } |
189 | 0 | } |
190 | 0 | } |
191 | 0 | } |
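
Stripped of the SIMD and of the edge-replicating inflation zones, this routine builds a standard integral image: each entry is the sum of all source pixels above and to the left of it, inclusive. A scalar reference sketch (illustrative, not the shipped code path):

```cpp
#include <cstdint>
#include <cstddef>

void GenerateIntegralImageScalar(uint32_t* aIntegral, size_t aStride32,
                                 const uint8_t* aSource, size_t aSourceStride,
                                 int aWidth, int aHeight)
{
  for (int y = 0; y < aHeight; y++) {
    uint32_t rowSum = 0;  // running horizontal prefix sum for this row
    for (int x = 0; x < aWidth; x++) {
      rowSum += aSource[y * aSourceStride + x];
      uint32_t above = (y > 0) ? aIntegral[(y - 1) * aStride32 + x] : 0;
      aIntegral[y * aStride32 + x] = rowSum + above;
    }
  }
}
```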
192 | | |
193 | | /** |
194 | | * Attempt to do an in-place box blur using an integral image. |
195 | | */ |
196 | | void |
197 | | AlphaBoxBlur::BoxBlur_SSE2(uint8_t* aData, |
198 | | int32_t aLeftLobe, |
199 | | int32_t aRightLobe, |
200 | | int32_t aTopLobe, |
201 | | int32_t aBottomLobe, |
202 | | uint32_t *aIntegralImage, |
203 | | size_t aIntegralImageStride) const |
204 | 0 | { |
205 | 0 | IntSize size = GetSize(); |
206 | 0 |
207 | 0 | MOZ_ASSERT(size.height > 0); |
208 | 0 |
209 | 0 | // Our 'left' or 'top' lobe will include the current pixel. i.e. when |
210 | 0 | // looking at an integral image the value of a pixel at 'x,y' is calculated |
211 | 0 | // using the value of the integral image values above/below that. |
212 | 0 | aLeftLobe++; |
213 | 0 | aTopLobe++; |
214 | 0 | int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe); |
215 | 0 |
216 | 0 | MOZ_ASSERT(boxSize > 0); |
217 | 0 |
218 | 0 | if (boxSize == 1) { |
219 | 0 | return; |
220 | 0 | } |
221 | 0 | |
222 | 0 | uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize); |
223 | 0 |
224 | 0 | uint32_t stride32bit = aIntegralImageStride / 4; |
225 | 0 | int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value(); |
226 | 0 |
227 | 0 | GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe, |
228 | 0 | aIntegralImage, aIntegralImageStride, aData, |
229 | 0 | mStride, size); |
230 | 0 |
231 | 0 | __m128i divisor = _mm_set1_epi32(reciprocal); |
232 | 0 |
233 | 0 | // This points to the start of the rectangle within the IntegralImage that overlaps |
234 | 0 | // the surface being blurred. |
235 | 0 | uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation; |
236 | 0 |
237 | 0 | IntRect skipRect = mSkipRect; |
238 | 0 | int32_t stride = mStride; |
239 | 0 | uint8_t *data = aData; |
240 | 0 | for (int32_t y = 0; y < size.height; y++) { |
241 | 0 | // Not using ContainsY(y) because we do not skip y == skipRect.Y() |
242 | 0 | // although that may not be done on purpose |
243 | 0 | bool inSkipRectY = y > skipRect.Y() && y < skipRect.YMost(); |
244 | 0 |
245 | 0 | uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe); |
246 | 0 | uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe); |
247 | 0 | uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe); |
248 | 0 | uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe); |
249 | 0 |
250 | 0 | int32_t x = 0; |
251 | 0 | // Process 16 pixels at a time for as long as possible. |
252 | 0 | for (; x <= size.width - 16; x += 16) { |
253 | 0 | // Not using ContainsX(x) because we do not skip x == skipRect.X() |
254 | 0 | // although that may not be done on purpose |
255 | 0 | if (inSkipRectY && x > skipRect.X() && x < skipRect.XMost()) { |
256 | 0 | x = skipRect.XMost() - 16; |
257 | 0 | // Trigger early jump on coming loop iterations, this will be reset |
258 | 0 | // next line anyway. |
259 | 0 | inSkipRectY = false; |
260 | 0 | continue; |
261 | 0 | } |
262 | 0 | |
263 | 0 | __m128i topLeft; |
264 | 0 | __m128i topRight; |
265 | 0 | __m128i bottomRight; |
266 | 0 | __m128i bottomLeft; |
267 | 0 |
268 | 0 | topLeft = loadUnaligned128((__m128i*)(topLeftBase + x)); |
269 | 0 | topRight = loadUnaligned128((__m128i*)(topRightBase + x)); |
270 | 0 | bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x)); |
271 | 0 | bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x)); |
272 | 0 | __m128i result1 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor); |
273 | 0 |
274 | 0 | topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 4)); |
275 | 0 | topRight = loadUnaligned128((__m128i*)(topRightBase + x + 4)); |
276 | 0 | bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 4)); |
277 | 0 | bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 4)); |
278 | 0 | __m128i result2 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor); |
279 | 0 |
280 | 0 | topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 8)); |
281 | 0 | topRight = loadUnaligned128((__m128i*)(topRightBase + x + 8)); |
282 | 0 | bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 8)); |
283 | 0 | bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 8)); |
284 | 0 | __m128i result3 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor); |
285 | 0 |
286 | 0 | topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 12)); |
287 | 0 | topRight = loadUnaligned128((__m128i*)(topRightBase + x + 12)); |
288 | 0 | bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 12)); |
289 | 0 | bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 12)); |
290 | 0 | __m128i result4 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor); |
291 | 0 |
292 | 0 | __m128i final = _mm_packus_epi16(_mm_packs_epi32(result1, result2), _mm_packs_epi32(result3, result4)); |
293 | 0 |
294 | 0 | _mm_storeu_si128((__m128i*)(data + stride * y + x), final); |
295 | 0 | } |
296 | 0 |
297 | 0 | // Process the remaining pixels 4 bytes at a time. |
298 | 0 | for (; x < size.width; x += 4) { |
299 | 0 | // Not using ContainsX(x) because we do not skip x == skipRect.X()
300 | 0 | // although that may not be done on purpose |
301 | 0 | if (inSkipRectY && x > skipRect.X() && x < skipRect.XMost()) { |
302 | 0 | x = skipRect.XMost() - 4; |
303 | 0 | // Trigger early jump on coming loop iterations, this will be reset |
304 | 0 | // next line anyway. |
305 | 0 | inSkipRectY = false; |
306 | 0 | continue; |
307 | 0 | } |
308 | 0 | __m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x)); |
309 | 0 | __m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x)); |
310 | 0 | __m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x)); |
311 | 0 | __m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x)); |
312 | 0 |
313 | 0 | __m128i result = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor); |
314 | 0 | __m128i final = _mm_packus_epi16(_mm_packs_epi32(result, _mm_setzero_si128()), _mm_setzero_si128()); |
315 | 0 |
316 | 0 | *(uint32_t*)(data + stride * y + x) = _mm_cvtsi128_si32(final); |
317 | 0 | } |
318 | 0 | } |
319 | 0 |
320 | 0 | } |
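
The reciprocal computed near the top of BoxBlur_SSE2 is what Divide consumes: (uint64_t(1) << 32) / boxSize turns each per-pixel division by the box area into a multiply and a shift. A compile-time sanity sketch with hypothetical numbers (a 3x3 box):

```cpp
#include <cstdint>

constexpr uint64_t kBoxSize = 9;                                  // 3x3 box
constexpr uint64_t kReciprocal = (uint64_t(1) << 32) / kBoxSize;  // 477218588
// A box sum of 900 divides back to 100, matching 900 / 9 with rounding.
constexpr uint32_t kBlurred =
    uint32_t((900 * kReciprocal + (uint64_t(1) << 31)) >> 32);
static_assert(kBlurred == 100, "rounded reciprocal division matches");
```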
321 | | |
322 | | } // namespace gfx |
323 | | } // namespace mozilla |