/src/mozilla-central/gfx/thebes/gfxAlphaRecoverySSE2.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
2 | | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
6 | | #include "gfxAlphaRecovery.h" |
7 | | #include "gfxImageSurface.h" |
8 | | #include <emmintrin.h> |
9 | | |
10 | | // This file should only be compiled on x86 and x64 systems. Additionally, |
11 | | // you'll need to compile it with -msse2 if you're using GCC on x86. |
12 | | |
// 16-byte-aligned constant masks for the SSE2 loops below.  The alignment
// matters: they are loaded with _mm_load_si128, which requires a 16-byte
// aligned address.  Per 32-bit (A)RGB pixel, 0x0000ff00 selects the green
// channel and 0xff000000 selects the alpha channel.  The alignment
// annotation is compiler-specific, hence the three-way #if.
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif
28 | | |
// Recover the alpha channel of |blackSurf| in place, given the same scene
// rendered over black (|blackSurf|) and over white (|whiteSurf|), using
// SSE2 packed operations.  Each pixel undergoes the same computation as the
// scalar RecoverPixel (see the comment in the SIMD loop below): alpha is
// derived from the difference of the green channels, and the color bytes of
// the black rendering are preserved.
//
// Returns false — leaving |blackSurf| untouched — when the two surfaces
// cannot be walked with matching 16-byte-aligned SSE2 loads/stores; the
// caller is then expected to fall back to the scalar path.
bool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    mozilla::gfx::IntSize size = blackSurf->GetSize();

    // Both surfaces must have identical dimensions and be 32-bit
    // ARGB/XRGB; anything else cannot be handled here.
    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
         blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
        (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
         whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
        return false;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    // The prologue below aligns blackData to 16 bytes and then advances both
    // pointers in lockstep.  That only keeps whiteData aligned too if both
    // pointers start at the same offset mod 16 and the strides agree mod 16.
    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return false;
    }

    // Aligned loads of the channel masks (see file-scope definitions).
    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (int32_t i = 0; i < size.height; ++i) {
        int32_t j = 0;
        // Process single pixels until blackData reaches 16-byte alignment
        // (or the row ends).  |j| counts pixels consumed in this row.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever registry
        // management and makes it about 5% faster than with only the 4 pixel
        // at a time loop.
        // NOTE(review): the `<` (rather than `<=`) bounds here and below mean
        // an exactly-8 (or exactly-4) pixel remainder is handed to the
        // narrower loops — correct, just slightly conservative.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            // Per green byte this computes 0xff - (whiteG - blackG), i.e. the
            // recovered alpha, leaving it in the green byte position.
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store since that
            // does an unaligned storage. We have the black pixel in a register
            // anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            // Shift left by 2 bytes: moves each pixel's green byte (offset 1)
            // into its alpha byte (offset 3); the alphaMask AND below discards
            // the bytes that crossed pixel boundaries.
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        // Same recovery, 4 pixels (one XMM register) at a time.
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // Skip any padding between the last pixel written and the next row.
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return true;
}
132 | | |
// Return (aX + aStride * aY) modulo 2^aAlignToLog2 — the byte offset of
// position <aX, aY> (rows aStride bytes apart) from the nearest preceding
// (1 << aAlignToLog2)-byte boundary.  Zero means the position is aligned.
static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    const int32_t alignMask = (1 << aAlignToLog2) - 1;
    const int32_t byteOffset = aX + aStride * aY;
    return byteOffset & alignMask;
}
138 | | |
139 | | /*static*/ mozilla::gfx::IntRect |
140 | | gfxAlphaRecovery::AlignRectForSubimageRecovery(const mozilla::gfx::IntRect& aRect, |
141 | | gfxImageSurface* aSurface) |
142 | 0 | { |
143 | 0 | NS_ASSERTION(mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(), |
144 | 0 | "Thebes grew support for non-ARGB32 COLOR_ALPHA?"); |
145 | 0 | static const int32_t kByteAlignLog2 = GoodAlignmentLog2(); |
146 | 0 | static const int32_t bpp = 4; |
147 | 0 | static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp; |
148 | 0 | // |
149 | 0 | // We're going to create a subimage of the surface with size |
150 | 0 | // <sw,sh> for alpha recovery, and want a SIMD fast-path. The |
151 | 0 | // rect <x,y, w,h> /needs/ to be redrawn, but it might not be |
152 | 0 | // properly aligned for SIMD. So we want to find a rect <x',y', |
153 | 0 | // w',h'> that's a superset of what needs to be redrawn but is |
154 | 0 | // properly aligned. Proper alignment is |
155 | 0 | // |
156 | 0 | // BPP * (x' + y' * sw) \cong 0 (mod ALIGN) |
157 | 0 | // BPP * w' \cong BPP * sw (mod ALIGN) |
158 | 0 | // |
159 | 0 | // (We assume the pixel at surface <0,0> is already ALIGN'd.) |
160 | 0 | // That rect (obviously) has to fit within the surface bounds, and |
161 | 0 | // we should also minimize the extra pixels redrawn only for |
162 | 0 | // alignment's sake. So we also want |
163 | 0 | // |
164 | 0 | // minimize <x',y', w',h'> |
165 | 0 | // 0 <= x' <= x |
166 | 0 | // 0 <= y' <= y |
167 | 0 | // w <= w' <= sw |
168 | 0 | // h <= h' <= sh |
169 | 0 | // |
170 | 0 | // This is a messy integer non-linear programming problem, except |
171 | 0 | // ... we can assume that ALIGN/BPP is a very small constant. So, |
172 | 0 | // brute force is viable. The algorithm below will find a |
173 | 0 | // solution if one exists, but isn't guaranteed to find the |
174 | 0 | // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at |
175 | 0 | // most 64 iterations below). In what's likely the common case, |
176 | 0 | // an already-aligned rectangle, it only needs 1 iteration. |
177 | 0 | // |
178 | 0 | // Is this alignment worth doing? Recovering alpha will take work |
179 | 0 | // proportional to w*h (assuming alpha recovery computation isn't |
180 | 0 | // memory bound). This analysis can lead to O(w+h) extra work |
181 | 0 | // (with small constants). In exchange, we expect to shave off a |
182 | 0 | // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as |
183 | 0 | // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We |
184 | 0 | // only really care about the w*h >> w+h case anyway; others |
185 | 0 | // should be fast enough even with the overhead. (Unless the cost |
186 | 0 | // of repainting the expanded rect is high, but in that case |
187 | 0 | // SIMD-ized alpha recovery won't make a difference so this code |
188 | 0 | // shouldn't be called.) |
189 | 0 | // |
190 | 0 | mozilla::gfx::IntSize surfaceSize = aSurface->GetSize(); |
191 | 0 | const int32_t stride = bpp * surfaceSize.width; |
192 | 0 | if (stride != aSurface->Stride()) { |
193 | 0 | NS_WARNING("Unexpected stride, falling back on slow alpha recovery"); |
194 | 0 | return aRect; |
195 | 0 | } |
196 | 0 |
|
197 | 0 | const int32_t x = aRect.X(), y = aRect.Y(), w = aRect.Width(), h = aRect.Height(); |
198 | 0 | const int32_t r = x + w; |
199 | 0 | const int32_t sw = surfaceSize.width; |
200 | 0 | const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride); |
201 | 0 |
|
202 | 0 | // The outer two loops below keep the rightmost (|r| above) and |
203 | 0 | // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we |
204 | 0 | // return only a superset of the original rect. These loops |
205 | 0 | // search for an aligned top-left pixel by trying to expand <x,y> |
206 | 0 | // left and up by <dx,dy> pixels, respectively. |
207 | 0 | // |
208 | 0 | // Then if a properly-aligned top-left pixel is found, the |
209 | 0 | // innermost loop tries to find an aligned stride by moving the |
210 | 0 | // rightmost pixel rightward by dr. |
211 | 0 | int32_t dx, dy, dr; |
212 | 0 | for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) { |
213 | 0 | for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) { |
214 | 0 | if (0 != ByteAlignment(kByteAlignLog2, |
215 | 0 | bpp * (x - dx), y - dy, stride)) { |
216 | 0 | continue; |
217 | 0 | } |
218 | 0 | for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) { |
219 | 0 | if (strideAlign == ByteAlignment(kByteAlignLog2, |
220 | 0 | bpp * (w + dr + dx))) { |
221 | 0 | goto FOUND_SOLUTION; |
222 | 0 | } |
223 | 0 | } |
224 | 0 | } |
225 | 0 | } |
226 | 0 |
|
227 | 0 | // Didn't find a solution. |
228 | 0 | return aRect; |
229 | 0 | |
230 | 0 | FOUND_SOLUTION: |
231 | 0 | mozilla::gfx::IntRect solution = mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy); |
232 | 0 | MOZ_ASSERT(mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution), |
233 | 0 | "'Solution' extends outside surface bounds!"); |
234 | 0 | return solution; |
235 | 0 | } |