Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/gfx/2d/SwizzleSSE2.cpp
Every instrumented line in this file has an execution count of 0; nothing in SwizzleSSE2.cpp was exercised in this run. The source follows.
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Swizzle.h"

#include <emmintrin.h>

namespace mozilla {
namespace gfx {

// Load 1-3 pixels into a 4 pixel vector.
static MOZ_ALWAYS_INLINE __m128i
LoadRemainder_SSE2(const uint8_t* aSrc, size_t aLength)
{
  __m128i px;
  if (aLength >= 2) {
    // Load first 2 pixels
    px = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(aSrc));
    // Load third pixel
    if (aLength >= 3) {
      px = _mm_unpacklo_epi64(px,
                              _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc + 2 * 4)));
    }
  } else {
    // Load single pixel
    px = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc));
  }
  return px;
}

// Store 1-3 pixels from a vector into memory without overwriting.
static MOZ_ALWAYS_INLINE void
StoreRemainder_SSE2(uint8_t* aDst, size_t aLength, const __m128i& aSrc)
{
  if (aLength >= 2) {
    // Store first 2 pixels
    _mm_storel_epi64(reinterpret_cast<__m128i*>(aDst), aSrc);
    // Store third pixel
    if (aLength >= 3) {
      *reinterpret_cast<uint32_t*>(aDst + 2 * 4) =
        _mm_cvtsi128_si32(_mm_srli_si128(aSrc, 2 * 4));
    }
  } else {
    // Store single pixel
    *reinterpret_cast<uint32_t*>(aDst) = _mm_cvtsi128_si32(aSrc);
  }
}
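As a reading aid (not part of the source file), the bytes each remainder length touches, showing that neither helper ever accesses past aLength * 4 bytes of the row:

// aLength == 1: _mm_cvtsi32_si128 / _mm_cvtsi128_si32       -> bytes [0, 4)
// aLength == 2: _mm_loadl_epi64   / _mm_storel_epi64         -> bytes [0, 8)
// aLength == 3: the above plus a 32-bit access at offset 8   -> bytes [0, 12)
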
// Premultiply vector of 4 pixels using splayed math.
template<bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i
PremultiplyVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B with mask.
  const __m128i mask = _mm_set1_epi32(0x00FF00FF);
  __m128i rb = _mm_and_si128(mask, aSrc);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }
  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  __m128i alphas = _mm_shufflelo_epi16(ga, _MM_SHUFFLE(3, 3, 1, 1));
  alphas = _mm_shufflehi_epi16(alphas, _MM_SHUFFLE(3, 3, 1, 1));

  // rb = rb*a + 255; rb += rb >> 8;
  rb = _mm_add_epi16(_mm_mullo_epi16(rb, alphas), mask);
  rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8));

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0x00FF0000));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = _mm_add_epi16(_mm_mullo_epi16(ga, alphas), mask);
  ga = _mm_add_epi16(ga, _mm_srli_epi16(ga, 8));
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  rb = _mm_srli_epi16(rb, 8);
  ga = _mm_andnot_si128(mask, ga);
  return _mm_or_si128(rb, ga);
}
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::PremultiplyVector_SSE2<false, false>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::PremultiplyVector_SSE2<false, true>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::PremultiplyVector_SSE2<true, false>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::PremultiplyVector_SSE2<true, true>(long long __vector(2) const&)
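Per 16-bit lane, the premultiply step above computes c*a + 0xFF, adds the result shifted right by 8, and keeps the high byte. A scalar model of that arithmetic (a sketch of my own, not part of the source file; the function name is made up) shows it yields ceil(c*a/255) with no division:

#include <cstdint>

// Scalar model of the SSE2 fixed-point step: returns ceil(c * a / 255).
static inline uint8_t PremultiplyChannel_Scalar(uint8_t c, uint8_t a)
{
  uint32_t t = uint32_t(c) * a + 255; // _mm_mullo_epi16, then _mm_add_epi16 with the 0x00FF mask
  t += t >> 8;                        // _mm_add_epi16 with _mm_srli_epi16(t, 8)
  return uint8_t(t >> 8);             // high byte, taken during the final recombine
}
// Example: c = 0x80, a = 0x80 -> t = 16639, t += 64 -> 16703, 16703 >> 8 = 65,
// versus the exact 0x80 * 0x80 / 255 = 64.25, rounded up.
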
template<bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                      uint8_t* aDst, int32_t aDstGap,
                      IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
Unexecuted instantiation: void mozilla::gfx::Premultiply_SSE2<false, false>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Premultiply_SSE2<false, true>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Premultiply_SSE2<true, false>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Premultiply_SSE2<true, true>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
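The remainder pixels are processed without advancing aSrc or aDst, which is why the remainder bytes are folded into the stride gaps up front. A worked example with made-up numbers (not from the source):

// width = 7:
//   alignedRow = 4 * (7 & ~3) = 16 bytes  -> one 4-pixel vector iteration
//   remainder  = 7 & 3        = 3 pixels  -> handled by Load/StoreRemainder_SSE2
// The remainder path leaves aSrc/aDst where the vector loop stopped, so
// aSrcGap/aDstGap are pre-incremented by 4 * 3 = 12 bytes and the per-row
// "aSrc += aSrcGap" advance lands exactly on the first pixel of the next row.
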
// Force instantiation of premultiply variants here.
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// This generates a table of fixed-point reciprocals representing 1/alpha,
// similar to the fallback implementation. However, the reciprocal must fit
// in 16 bits to multiply cheaply. Observe that reciprocals of smaller alphas
// require more bits than those of larger alphas. We take advantage of this by
// shifting the reciprocal down by 8 bits when the alpha value is less than
// 0x20, and by 3 bits otherwise. This is easy to undo later by multiplying the
// color component to be unpremultiplied by 0x100 or 8, respectively.
// The 16 bit reciprocal is duplicated into both words of a uint32_t here to
// reduce unpacking overhead.
#define UNPREMULQ_SSE2(x) (0x10001U * (0xFF0220U / ((x) * ((x) < 0x20 ? 0x100 : 8))))
#define UNPREMULQ_SSE2_2(x) UNPREMULQ_SSE2(x), UNPREMULQ_SSE2((x) + 1)
#define UNPREMULQ_SSE2_4(x) UNPREMULQ_SSE2_2(x), UNPREMULQ_SSE2_2((x) + 2)
#define UNPREMULQ_SSE2_8(x) UNPREMULQ_SSE2_4(x), UNPREMULQ_SSE2_4((x) + 4)
#define UNPREMULQ_SSE2_16(x) UNPREMULQ_SSE2_8(x), UNPREMULQ_SSE2_8((x) + 8)
#define UNPREMULQ_SSE2_32(x) UNPREMULQ_SSE2_16(x), UNPREMULQ_SSE2_16((x) + 16)
static const uint32_t sUnpremultiplyTable_SSE2[256] =
{
  0, UNPREMULQ_SSE2(1), UNPREMULQ_SSE2_2(2), UNPREMULQ_SSE2_4(4),
  UNPREMULQ_SSE2_8(8), UNPREMULQ_SSE2_16(16), UNPREMULQ_SSE2_32(32),
  UNPREMULQ_SSE2_32(64), UNPREMULQ_SSE2_32(96), UNPREMULQ_SSE2_32(128),
  UNPREMULQ_SSE2_32(160), UNPREMULQ_SSE2_32(192), UNPREMULQ_SSE2_32(224)
};
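For a given alpha, the table entry holds the scaled reciprocal duplicated in both 16-bit halves, and the scaling is undone by multiplying the premultiplied channel by 0x100 or 8 before the high-word multiply. A scalar model of one lookup (a sketch of my own, not part of the source file; the function name is made up and it assumes valid premultiplied input with c <= a):

#include <cstdint>

// Scalar model of one reciprocal-table unpremultiply: roughly c * 255 / a.
static inline uint8_t UnpremultiplyChannel_Scalar(uint8_t c, uint8_t a)
{
  uint32_t q = sUnpremultiplyTable_SSE2[a] & 0xFFFF;  // either 16-bit half of the entry
  uint32_t scale = a < 0x20 ? 0x100 : 8;              // undo the table's pre-shift
  return uint8_t((uint32_t(c) * scale * q) >> 16);    // _mm_mulhi_epu16 keeps the high word
}
// Example: a = 0x80, c = 0x40 -> q = 0xFF0220 / (0x80 * 8) = 16320,
// (0x40 * 8 * 16320) >> 16 = 127, versus the exact 0x40 * 255 / 0x80 = 127.5.
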
// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
template<bool aSwapRB>
static MOZ_ALWAYS_INLINE __m128i
UnpremultiplyVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B with mask.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }

  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = _mm_extract_epi16(ga, 1);
  int a2 = _mm_extract_epi16(ga, 3);
  int a3 = _mm_extract_epi16(ga, 5);
  int a4 = _mm_extract_epi16(ga, 7);

  // Load the 16 bit reciprocals from the table for each alpha.
  // The reciprocals are duplicated in each uint32_t entry.
  // Unpack them to a final vector of duplicated reciprocals of
  // the form Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4.
  __m128i q12 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a1]),
                                   _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a2]));
  __m128i q34 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a3]),
                                   _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a4]));
  __m128i q1234 = _mm_unpacklo_epi64(q12, q34);

  // Check if the alphas are less than 0x20, so that we can undo
  // scaling of the reciprocals as appropriate.
  __m128i scale = _mm_cmplt_epi32(ga, _mm_set1_epi32(0x00200000));
  // Produce scale factors by ((a < 0x20) ^ 8) & 0x108,
  // such that scale is 0x100 if < 0x20, and 8 otherwise.
  scale = _mm_xor_si128(scale, _mm_set1_epi16(8));
  scale = _mm_and_si128(scale, _mm_set1_epi16(0x108));
  // Isolate G now so that we don't accidentally unpremultiply A.
  ga = _mm_and_si128(ga, _mm_set1_epi32(0x000000FF));

  // Scale R, B, and G as required depending on reciprocal precision.
  rb = _mm_mullo_epi16(rb, scale);
  ga = _mm_mullo_epi16(ga, scale);

  // Multiply R, B, and G by the reciprocal, only taking the high word
  // to effectively shift right by 16.
  rb = _mm_mulhi_epu16(rb, q1234);
  ga = _mm_mulhi_epu16(ga, q1234);

  // Combine back to final pixel with rb | (ga << 8) | (aSrc & 0xFF000000),
  // which will add back on the original alpha value unchanged.
  ga = _mm_slli_si128(ga, 1);
  ga = _mm_or_si128(ga, _mm_and_si128(aSrc, _mm_set1_epi32(0xFF000000)));
  return _mm_or_si128(rb, ga);
}
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::UnpremultiplyVector_SSE2<false>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::UnpremultiplyVector_SSE2<true>(long long __vector(2) const&)
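The scale selection relies on the comparison filling each matching 16-bit word with ones. A worked trace of the bit trick for the two cases (my reading, not part of the source file):

// a <  0x20: _mm_cmplt_epi32 yields 0xFFFF per word; 0xFFFF ^ 8 = 0xFFF7;
//            0xFFF7 & 0x108 = 0x100 -> channel is multiplied by 0x100.
// a >= 0x20: the comparison yields 0x0000;           0x0000 ^ 8 = 0x0008;
//            0x0008 & 0x108 = 0x008 -> channel is multiplied by 8.
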
template<bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                        uint8_t* aDst, int32_t aDstGap,
                        IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
Unexecuted instantiation: void mozilla::gfx::Unpremultiply_SSE2<false>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Unpremultiply_SSE2<true>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
// Force instantiation of unpremultiply variants here.
template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// Swizzle a vector of 4 pixels providing swaps and opaquifying.
template<bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B.
  rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  // Isolate G and A.
  __m128i ga = _mm_and_si128(aSrc, _mm_set1_epi32(0xFF00FF00));
  // Force alpha to 255 if necessary.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }
  // Combine everything back together.
  return _mm_or_si128(rb, ga);
}
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::SwizzleVector_SSE2<true, false>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::SwizzleVector_SSE2<true, true>(long long __vector(2) const&)
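Traced on a single pixel value, the swizzle amounts to exchanging the R and B bytes within each 32-bit lane. A worked example of my own, not from the source:

// Pixel 0xAARRGGBB:
//   rb = 0xAARRGGBB & 0x00FF00FF           = 0x00RR00BB
//   word swap via _MM_SHUFFLE(2, 3, 0, 1)  = 0x00BB00RR
//   ga = 0xAARRGGBB & 0xFF00FF00           = 0xAA00GG00
//   rb | ga                                = 0xAABBGGRR  (R and B exchanged)
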
#if 0
// These specializations currently do not profile faster than the generic versions,
// so disable them for now.

// Optimized implementations for when there is no R and B swap.
template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, true>(const __m128i& aSrc)
{
  // Force alpha to 255.
  return _mm_or_si128(aSrc, _mm_set1_epi32(0xFF000000));
}

template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, false>(const __m128i& aSrc)
{
  return aSrc;
}
#endif

template<bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                  uint8_t* aDst, int32_t aDstGap,
                  IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
Unexecuted instantiation: void mozilla::gfx::Swizzle_SSE2<true, false>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Swizzle_SSE2<true, true>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
// Force instantiation of swizzle variants here.
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

} // namespace gfx
} // namespace mozilla