Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/gfx/thebes/gfxAlphaRecoverySSE2.cpp
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2
 * This Source Code Form is subject to the terms of the Mozilla Public
3
 * License, v. 2.0. If a copy of the MPL was not distributed with this
4
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6
#include "gfxAlphaRecovery.h"
7
#include "gfxImageSurface.h"
8
#include <emmintrin.h>
9
10
// This file should only be compiled on x86 and x64 systems.  Additionally,
11
// you'll need to compile it with -msse2 if you're using GCC on x86.
12
13
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
14
__declspec(align(16)) static uint32_t greenMaski[] =
15
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
16
__declspec(align(16)) static uint32_t alphaMaski[] =
17
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
18
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
19
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
20
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
21
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
22
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
23
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
24
#pragma align 16 (greenMaski, alphaMaski)
25
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
26
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
27
#endif
28
29
bool
30
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
31
                                   const gfxImageSurface* whiteSurf)
32
0
{
33
0
    mozilla::gfx::IntSize size = blackSurf->GetSize();
34
0
35
0
    if (size != whiteSurf->GetSize() ||
36
0
        (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
37
0
         blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
38
0
        (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
39
0
         whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
40
0
        return false;
41
0
42
0
    blackSurf->Flush();
43
0
    whiteSurf->Flush();
44
0
45
0
    unsigned char* blackData = blackSurf->Data();
46
0
    unsigned char* whiteData = whiteSurf->Data();
47
0
48
0
    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
49
0
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
50
0
        // Cannot keep these in alignment.
51
0
        return false;
52
0
    }
53
0
54
0
    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
55
0
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
56
0
57
0
    for (int32_t i = 0; i < size.height; ++i) {
58
0
        int32_t j = 0;
59
0
        // Loop single pixels until at 16-byte alignment (mask 0xf below;
        // required by the aligned _mm_load_si128/_mm_store_si128 that follow).
60
0
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
61
0
            *((uint32_t*)blackData) =
62
0
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
63
0
                             *reinterpret_cast<uint32_t*>(whiteData));
64
0
            blackData += 4;
65
0
            whiteData += 4;
66
0
            j++;
67
0
        }
68
0
        // This extra loop allows the compiler to do some more clever register
69
0
        // management and makes it about 5% faster than with only the 4 pixel
70
0
        // at a time loop.
71
0
        for (; j < size.width - 8; j += 8) {
72
0
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
73
0
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
74
0
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
75
0
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
76
0
77
0
            // Execute the same instructions as described in RecoverPixel, only
78
0
            // using an SSE2 packed saturated subtract.
79
0
            white1 = _mm_subs_epu8(white1, black1);
80
0
            white2 = _mm_subs_epu8(white2, black2);
81
0
            white1 = _mm_subs_epu8(greenMask, white1);
82
0
            white2 = _mm_subs_epu8(greenMask, white2);
83
0
            // Producing the final black pixel in an XMM register and storing
84
0
            // that is actually faster than doing a masked store since that
85
0
            // does an unaligned storage. We have the black pixel in a register
86
0
            // anyway.
87
0
            black1 = _mm_andnot_si128(alphaMask, black1);
88
0
            black2 = _mm_andnot_si128(alphaMask, black2);
89
0
            white1 = _mm_slli_si128(white1, 2);
90
0
            white2 = _mm_slli_si128(white2, 2);
91
0
            white1 = _mm_and_si128(alphaMask, white1);
92
0
            white2 = _mm_and_si128(alphaMask, white2);
93
0
            black1 = _mm_or_si128(white1, black1);
94
0
            black2 = _mm_or_si128(white2, black2);
95
0
96
0
            _mm_store_si128((__m128i*)blackData, black1);
97
0
            _mm_store_si128((__m128i*)(blackData + 16), black2);
98
0
            blackData += 32;
99
0
            whiteData += 32;
100
0
        }
101
0
        for (; j < size.width - 4; j += 4) {
102
0
            __m128i black = _mm_load_si128((__m128i*)blackData);
103
0
            __m128i white = _mm_load_si128((__m128i*)whiteData);
104
0
105
0
            white = _mm_subs_epu8(white, black);
106
0
            white = _mm_subs_epu8(greenMask, white);
107
0
            black = _mm_andnot_si128(alphaMask, black);
108
0
            white = _mm_slli_si128(white, 2);
109
0
            white = _mm_and_si128(alphaMask, white);
110
0
            black = _mm_or_si128(white, black);
111
0
            _mm_store_si128((__m128i*)blackData, black);
112
0
            blackData += 16;
113
0
            whiteData += 16;
114
0
        }
115
0
        // Loop single pixels until we're done.
116
0
        while (j < size.width) {
117
0
            *((uint32_t*)blackData) =
118
0
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
119
0
                             *reinterpret_cast<uint32_t*>(whiteData));
120
0
            blackData += 4;
121
0
            whiteData += 4;
122
0
            j++;
123
0
        }
124
0
        blackData += blackSurf->Stride() - j * 4;
125
0
        whiteData += whiteSurf->Stride() - j * 4;
126
0
    }
127
0
128
0
    blackSurf->MarkDirty();
129
0
130
0
    return true;
131
0
}
132
133
static int32_t
134
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
135
0
{
136
0
    return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
137
0
}
138
139
/*static*/ mozilla::gfx::IntRect
140
gfxAlphaRecovery::AlignRectForSubimageRecovery(const mozilla::gfx::IntRect& aRect,
141
                                               gfxImageSurface* aSurface)
142
0
{
143
0
    NS_ASSERTION(mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(),
144
0
                 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
145
0
    static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
146
0
    static const int32_t bpp = 4;
147
0
    static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
148
0
    //
149
0
    // We're going to create a subimage of the surface with size
150
0
    // <sw,sh> for alpha recovery, and want a SIMD fast-path.  The
151
0
    // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
152
0
    // properly aligned for SIMD.  So we want to find a rect <x',y',
153
0
    // w',h'> that's a superset of what needs to be redrawn but is
154
0
    // properly aligned.  Proper alignment is
155
0
    //
156
0
    //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
157
0
    //   BPP * w'             \cong BPP * sw  (mod ALIGN)
158
0
    //
159
0
    // (We assume the pixel at surface <0,0> is already ALIGN'd.)
160
0
    // That rect (obviously) has to fit within the surface bounds, and
161
0
    // we should also minimize the extra pixels redrawn only for
162
0
    // alignment's sake.  So we also want
163
0
    //
164
0
    //  minimize <x',y', w',h'>
165
0
    //   0 <= x' <= x
166
0
    //   0 <= y' <= y
167
0
    //   w <= w' <= sw
168
0
    //   h <= h' <= sh
169
0
    //
170
0
    // This is a messy integer non-linear programming problem, except
171
0
    // ... we can assume that ALIGN/BPP is a very small constant.  So,
172
0
    // brute force is viable.  The algorithm below will find a
173
0
    // solution if one exists, but isn't guaranteed to find the
174
0
    // minimum solution.  (For SSE2, ALIGN/BPP = 4, so it'll do at
175
0
    // most 64 iterations below).  In what's likely the common case,
176
0
    // an already-aligned rectangle, it only needs 1 iteration.
177
0
    //
178
0
    // Is this alignment worth doing?  Recovering alpha will take work
179
0
    // proportional to w*h (assuming alpha recovery computation isn't
180
0
    // memory bound).  This analysis can lead to O(w+h) extra work
181
0
    // (with small constants).  In exchange, we expect to shave off a
182
0
    // ALIGN/BPP constant by using SIMD-ized alpha recovery.  So as
183
0
    // w*h diverges from w+h, the win factor approaches ALIGN/BPP.  We
184
0
    // only really care about the w*h >> w+h case anyway; others
185
0
    // should be fast enough even with the overhead.  (Unless the cost
186
0
    // of repainting the expanded rect is high, but in that case
187
0
    // SIMD-ized alpha recovery won't make a difference so this code
188
0
    // shouldn't be called.)
189
0
    //
190
0
    mozilla::gfx::IntSize surfaceSize = aSurface->GetSize();
191
0
    const int32_t stride = bpp * surfaceSize.width;
192
0
    if (stride != aSurface->Stride()) {
193
0
        NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
194
0
        return aRect;
195
0
    }
196
0
197
0
    const int32_t x = aRect.X(), y = aRect.Y(), w = aRect.Width(), h = aRect.Height();
198
0
    const int32_t r = x + w;
199
0
    const int32_t sw = surfaceSize.width;
200
0
    const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);
201
0
202
0
    // The outer two loops below keep the rightmost (|r| above) and
203
0
    // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
204
0
    // return only a superset of the original rect.  These loops
205
0
    // search for an aligned top-left pixel by trying to expand <x,y>
206
0
    // left and up by <dx,dy> pixels, respectively.
207
0
    //
208
0
    // Then if a properly-aligned top-left pixel is found, the
209
0
    // innermost loop tries to find an aligned stride by moving the
210
0
    // rightmost pixel rightward by dr.
211
0
    int32_t dx, dy, dr;
212
0
    for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
213
0
        for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
214
0
            if (0 != ByteAlignment(kByteAlignLog2,
215
0
                                   bpp * (x - dx), y - dy, stride)) {
216
0
                continue;
217
0
            }
218
0
            for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
219
0
                if (strideAlign == ByteAlignment(kByteAlignLog2,
220
0
                                                 bpp * (w + dr + dx))) {
221
0
                    goto FOUND_SOLUTION;
222
0
                }
223
0
            }
224
0
        }
225
0
    }
226
0
227
0
    // Didn't find a solution.
228
0
    return aRect;
229
0
230
0
FOUND_SOLUTION:
231
0
    mozilla::gfx::IntRect solution = mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy);
232
0
    MOZ_ASSERT(mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution),
233
0
               "'Solution' extends outside surface bounds!");
234
0
    return solution;
235
0
}