/src/mozilla-central/gfx/thebes/gfxAlphaRecoverySSE2.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
2 | | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
6 | | #include "gfxAlphaRecovery.h" |
7 | | #include "gfxImageSurface.h" |
8 | | #include <emmintrin.h> |
9 | | |
10 | | // This file should only be compiled on x86 and x64 systems. Additionally, |
11 | | // you'll need to compile it with -msse2 if you're using GCC on x86. |
12 | | |
// 16-byte-aligned constant masks for the SSE2 loops below.  The alignment
// matters: they are loaded with _mm_load_si128, which requires a 16-byte
// aligned address.  Per 32-bit (A)RGB pixel, 0x0000ff00 selects the green
// channel and 0xff000000 selects the alpha channel.  The alignment
// annotation is compiler-specific, hence the three-way #if.
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif
28 | | |
// Recover the alpha channel of |blackSurf| in place, given the same scene
// rendered over black (|blackSurf|) and over white (|whiteSurf|), using
// SSE2 packed operations.  Each pixel undergoes the same computation as the
// scalar RecoverPixel (see the comment in the SIMD loop below): alpha is
// derived from the difference of the green channels, and the color bytes of
// the black rendering are preserved.
//
// Returns false — leaving |blackSurf| untouched — when the two surfaces
// cannot be walked with matching 16-byte-aligned SSE2 loads/stores; the
// caller is then expected to fall back to the scalar path.
bool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    mozilla::gfx::IntSize size = blackSurf->GetSize();

    // Both surfaces must have identical dimensions and be 32-bit
    // ARGB/XRGB; anything else cannot be handled here.
    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
         blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
        (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
         whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
        return false;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    // The prologue below aligns blackData to 16 bytes and then advances both
    // pointers in lockstep.  That only keeps whiteData aligned too if both
    // pointers start at the same offset mod 16 and the strides agree mod 16.
    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return false;
    }

    // Aligned loads of the channel masks (see file-scope definitions).
    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (int32_t i = 0; i < size.height; ++i) {
        int32_t j = 0;
        // Process single pixels until blackData reaches 16-byte alignment
        // (or the row ends).  |j| counts pixels consumed in this row.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever registry
        // management and makes it about 5% faster than with only the 4 pixel
        // at a time loop.
        // NOTE(review): the `<` (rather than `<=`) bounds here and below mean
        // an exactly-8 (or exactly-4) pixel remainder is handed to the
        // narrower loops — correct, just slightly conservative.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            // Per green byte this computes 0xff - (whiteG - blackG), i.e. the
            // recovered alpha, leaving it in the green byte position.
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store since that
            // does an unaligned storage. We have the black pixel in a register
            // anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            // Shift left by 2 bytes: moves each pixel's green byte (offset 1)
            // into its alpha byte (offset 3); the alphaMask AND below discards
            // the bytes that crossed pixel boundaries.
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        // Same recovery, 4 pixels (one XMM register) at a time.
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // Skip any padding between the last pixel written and the next row.
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return true;
}
132 | | |
// Return (aX + aStride * aY) modulo 2^aAlignToLog2 — the byte offset of
// position <aX, aY> (rows aStride bytes apart) from the nearest preceding
// (1 << aAlignToLog2)-byte boundary.  Zero means the position is aligned.
static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    const int32_t alignMask = (1 << aAlignToLog2) - 1;
    const int32_t byteOffset = aX + aStride * aY;
    return byteOffset & alignMask;
}
138 | | |
139 | | /*static*/ mozilla::gfx::IntRect |
140 | | gfxAlphaRecovery::AlignRectForSubimageRecovery(const mozilla::gfx::IntRect& aRect, |
141 | | gfxImageSurface* aSurface) |
142 | 0 | { |
143 | 0 | NS_ASSERTION(mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(), |
144 | 0 | "Thebes grew support for non-ARGB32 COLOR_ALPHA?"); |
145 | 0 | static const int32_t kByteAlignLog2 = GoodAlignmentLog2(); |
146 | 0 | static const int32_t bpp = 4; |
147 | 0 | static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp; |
148 | 0 | // |
149 | 0 | // We're going to create a subimage of the surface with size |
150 | 0 | // <sw,sh> for alpha recovery, and want a SIMD fast-path. The |
151 | 0 | // rect <x,y, w,h> /needs/ to be redrawn, but it might not be |
152 | 0 | // properly aligned for SIMD. So we want to find a rect <x',y', |
153 | 0 | // w',h'> that's a superset of what needs to be redrawn but is |
154 | 0 | // properly aligned. Proper alignment is |
155 | 0 | // |
156 | 0 | // BPP * (x' + y' * sw) \cong 0 (mod ALIGN) |
157 | 0 | // BPP * w' \cong BPP * sw (mod ALIGN) |
158 | 0 | // |
159 | 0 | // (We assume the pixel at surface <0,0> is already ALIGN'd.) |
160 | 0 | // That rect (obviously) has to fit within the surface bounds, and |
161 | 0 | // we should also minimize the extra pixels redrawn only for |
162 | 0 | // alignment's sake. So we also want |
163 | 0 | // |
164 | 0 | // minimize <x',y', w',h'> |
165 | 0 | // 0 <= x' <= x |
166 | 0 | // 0 <= y' <= y |
167 | 0 | // w <= w' <= sw |
168 | 0 | // h <= h' <= sh |
169 | 0 | // |
170 | 0 | // This is a messy integer non-linear programming problem, except |
171 | 0 | // ... we can assume that ALIGN/BPP is a very small constant. So, |
172 | 0 | // brute force is viable. The algorithm below will find a |
173 | 0 | // solution if one exists, but isn't guaranteed to find the |
174 | 0 | // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at |
175 | 0 | // most 64 iterations below). In what's likely the common case, |
176 | 0 | // an already-aligned rectangle, it only needs 1 iteration. |
177 | 0 | // |
178 | 0 | // Is this alignment worth doing? Recovering alpha will take work |
179 | 0 | // proportional to w*h (assuming alpha recovery computation isn't |
180 | 0 | // memory bound). This analysis can lead to O(w+h) extra work |
181 | 0 | // (with small constants). In exchange, we expect to shave off a |
182 | 0 | // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as |
183 | 0 | // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We |
184 | 0 | // only really care about the w*h >> w+h case anyway; others |
185 | 0 | // should be fast enough even with the overhead. (Unless the cost |
186 | 0 | // of repainting the expanded rect is high, but in that case |
187 | 0 | // SIMD-ized alpha recovery won't make a difference so this code |
188 | 0 | // shouldn't be called.) |
189 | 0 | // |
190 | 0 | mozilla::gfx::IntSize surfaceSize = aSurface->GetSize(); |
191 | 0 | const int32_t stride = bpp * surfaceSize.width; |
192 | 0 | if (stride != aSurface->Stride()) { |
193 | 0 | NS_WARNING("Unexpected stride, falling back on slow alpha recovery"); |
194 | 0 | return aRect; |
195 | 0 | } |
196 | 0 |
|
197 | 0 | const int32_t x = aRect.X(), y = aRect.Y(), w = aRect.Width(), h = aRect.Height(); |
198 | 0 | const int32_t r = x + w; |
199 | 0 | const int32_t sw = surfaceSize.width; |
200 | 0 | const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride); |
201 | 0 |
|
202 | 0 | // The outer two loops below keep the rightmost (|r| above) and |
203 | 0 | // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we |
204 | 0 | // return only a superset of the original rect. These loops |
205 | 0 | // search for an aligned top-left pixel by trying to expand <x,y> |
206 | 0 | // left and up by <dx,dy> pixels, respectively. |
207 | 0 | // |
208 | 0 | // Then if a properly-aligned top-left pixel is found, the |
209 | 0 | // innermost loop tries to find an aligned stride by moving the |
210 | 0 | // rightmost pixel rightward by dr. |
211 | 0 | int32_t dx, dy, dr; |
212 | 0 | for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) { |
213 | 0 | for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) { |
214 | 0 | if (0 != ByteAlignment(kByteAlignLog2, |
215 | 0 | bpp * (x - dx), y - dy, stride)) { |
216 | 0 | continue; |
217 | 0 | } |
218 | 0 | for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) { |
219 | 0 | if (strideAlign == ByteAlignment(kByteAlignLog2, |
220 | 0 | bpp * (w + dr + dx))) { |
221 | 0 | goto FOUND_SOLUTION; |
222 | 0 | } |
223 | 0 | } |
224 | 0 | } |
225 | 0 | } |
226 | 0 |
|
227 | 0 | // Didn't find a solution. |
228 | 0 | return aRect; |
229 | 0 | |
230 | 0 | FOUND_SOLUTION: |
231 | 0 | mozilla::gfx::IntRect solution = mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy); |
232 | 0 | MOZ_ASSERT(mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution), |
233 | 0 | "'Solution' extends outside surface bounds!"); |
234 | 0 | return solution; |
235 | 0 | } |