Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/gfx/2d/SwizzleSSE2.cpp
Every instrumented line in this file has an execution count of 0; nothing in SwizzleSSE2.cpp was exercised in this run. The source follows.
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Swizzle.h"

#include <emmintrin.h>

namespace mozilla {
namespace gfx {

// Load 1-3 pixels into a 4 pixel vector.
static MOZ_ALWAYS_INLINE __m128i
LoadRemainder_SSE2(const uint8_t* aSrc, size_t aLength)
{
  __m128i px;
  if (aLength >= 2) {
    // Load first 2 pixels
    px = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(aSrc));
    // Load third pixel
    if (aLength >= 3) {
      px = _mm_unpacklo_epi64(px,
                              _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc + 2 * 4)));
    }
  } else {
    // Load single pixel
    px = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc));
  }
  return px;
}

// Store 1-3 pixels from a vector into memory without overwriting.
static MOZ_ALWAYS_INLINE void
StoreRemainder_SSE2(uint8_t* aDst, size_t aLength, const __m128i& aSrc)
{
  if (aLength >= 2) {
    // Store first 2 pixels
    _mm_storel_epi64(reinterpret_cast<__m128i*>(aDst), aSrc);
    // Store third pixel
    if (aLength >= 3) {
      *reinterpret_cast<uint32_t*>(aDst + 2 * 4) =
        _mm_cvtsi128_si32(_mm_srli_si128(aSrc, 2 * 4));
    }
  } else {
    // Store single pixel
    *reinterpret_cast<uint32_t*>(aDst) = _mm_cvtsi128_si32(aSrc);
  }
}
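As a reading aid (not part of the source file), the bytes each remainder length touches, showing that neither helper ever accesses past aLength * 4 bytes of the row:

// aLength == 1: _mm_cvtsi32_si128 / _mm_cvtsi128_si32       -> bytes [0, 4)
// aLength == 2: _mm_loadl_epi64   / _mm_storel_epi64         -> bytes [0, 8)
// aLength == 3: the above plus a 32-bit access at offset 8   -> bytes [0, 12)
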
// Premultiply vector of 4 pixels using splayed math.
template<bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i
PremultiplyVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B with mask.
  const __m128i mask = _mm_set1_epi32(0x00FF00FF);
  __m128i rb = _mm_and_si128(mask, aSrc);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }
  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  __m128i alphas = _mm_shufflelo_epi16(ga, _MM_SHUFFLE(3, 3, 1, 1));
  alphas = _mm_shufflehi_epi16(alphas, _MM_SHUFFLE(3, 3, 1, 1));

  // rb = rb*a + 255; rb += rb >> 8;
  rb = _mm_add_epi16(_mm_mullo_epi16(rb, alphas), mask);
  rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8));

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0x00FF0000));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = _mm_add_epi16(_mm_mullo_epi16(ga, alphas), mask);
  ga = _mm_add_epi16(ga, _mm_srli_epi16(ga, 8));
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  rb = _mm_srli_epi16(rb, 8);
  ga = _mm_andnot_si128(mask, ga);
  return _mm_or_si128(rb, ga);
}
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::PremultiplyVector_SSE2<false, false>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::PremultiplyVector_SSE2<false, true>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::PremultiplyVector_SSE2<true, false>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::PremultiplyVector_SSE2<true, true>(long long __vector(2) const&)
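Per 16-bit lane, the premultiply step above computes c*a + 0xFF, adds the result shifted right by 8, and keeps the high byte. A scalar model of that arithmetic (a sketch of my own, not part of the source file; the function name is made up) shows it yields ceil(c*a/255) with no division:

#include <cstdint>

// Scalar model of the SSE2 fixed-point step: returns ceil(c * a / 255).
static inline uint8_t PremultiplyChannel_Scalar(uint8_t c, uint8_t a)
{
  uint32_t t = uint32_t(c) * a + 255; // _mm_mullo_epi16, then _mm_add_epi16 with the 0x00FF mask
  t += t >> 8;                        // _mm_add_epi16 with _mm_srli_epi16(t, 8)
  return uint8_t(t >> 8);             // high byte, taken during the final recombine
}
// Example: c = 0x80, a = 0x80 -> t = 16639, t += 64 -> 16703, 16703 >> 8 = 65,
// versus the exact 0x80 * 0x80 / 255 = 64.25, rounded up.
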
template<bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                      uint8_t* aDst, int32_t aDstGap,
                      IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
Unexecuted instantiation: void mozilla::gfx::Premultiply_SSE2<false, false>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Premultiply_SSE2<false, true>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Premultiply_SSE2<true, false>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Premultiply_SSE2<true, true>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
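The remainder pixels are processed without advancing aSrc or aDst, which is why the remainder bytes are folded into the stride gaps up front. A worked example with made-up numbers (not from the source):

// width = 7:
//   alignedRow = 4 * (7 & ~3) = 16 bytes  -> one 4-pixel vector iteration
//   remainder  = 7 & 3        = 3 pixels  -> handled by Load/StoreRemainder_SSE2
// The remainder path leaves aSrc/aDst where the vector loop stopped, so
// aSrcGap/aDstGap are pre-incremented by 4 * 3 = 12 bytes and the per-row
// "aSrc += aSrcGap" advance lands exactly on the first pixel of the next row.
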
// Force instantiation of premultiply variants here.
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// This generates a table of fixed-point reciprocals representing 1/alpha,
// similar to the fallback implementation. However, the reciprocal must fit
// in 16 bits to multiply cheaply. Observe that reciprocals of smaller alphas
// require more bits than those of larger alphas. We take advantage of this by
// shifting the reciprocal down by 8 bits when the alpha value is less than
// 0x20, and by 3 bits otherwise. This is easy to undo later by multiplying the
// color component to be unpremultiplied by 0x100 or 8, respectively.
// The 16 bit reciprocal is duplicated into both words of a uint32_t here to
// reduce unpacking overhead.
#define UNPREMULQ_SSE2(x) (0x10001U * (0xFF0220U / ((x) * ((x) < 0x20 ? 0x100 : 8))))
#define UNPREMULQ_SSE2_2(x) UNPREMULQ_SSE2(x), UNPREMULQ_SSE2((x) + 1)
#define UNPREMULQ_SSE2_4(x) UNPREMULQ_SSE2_2(x), UNPREMULQ_SSE2_2((x) + 2)
#define UNPREMULQ_SSE2_8(x) UNPREMULQ_SSE2_4(x), UNPREMULQ_SSE2_4((x) + 4)
#define UNPREMULQ_SSE2_16(x) UNPREMULQ_SSE2_8(x), UNPREMULQ_SSE2_8((x) + 8)
#define UNPREMULQ_SSE2_32(x) UNPREMULQ_SSE2_16(x), UNPREMULQ_SSE2_16((x) + 16)
static const uint32_t sUnpremultiplyTable_SSE2[256] =
{
  0, UNPREMULQ_SSE2(1), UNPREMULQ_SSE2_2(2), UNPREMULQ_SSE2_4(4),
  UNPREMULQ_SSE2_8(8), UNPREMULQ_SSE2_16(16), UNPREMULQ_SSE2_32(32),
  UNPREMULQ_SSE2_32(64), UNPREMULQ_SSE2_32(96), UNPREMULQ_SSE2_32(128),
  UNPREMULQ_SSE2_32(160), UNPREMULQ_SSE2_32(192), UNPREMULQ_SSE2_32(224)
};
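For a given alpha, the table entry holds the scaled reciprocal duplicated in both 16-bit halves, and the scaling is undone by multiplying the premultiplied channel by 0x100 or 8 before the high-word multiply. A scalar model of one lookup (a sketch of my own, not part of the source file; the function name is made up and it assumes valid premultiplied input with c <= a):

#include <cstdint>

// Scalar model of one reciprocal-table unpremultiply: roughly c * 255 / a.
static inline uint8_t UnpremultiplyChannel_Scalar(uint8_t c, uint8_t a)
{
  uint32_t q = sUnpremultiplyTable_SSE2[a] & 0xFFFF;  // either 16-bit half of the entry
  uint32_t scale = a < 0x20 ? 0x100 : 8;              // undo the table's pre-shift
  return uint8_t((uint32_t(c) * scale * q) >> 16);    // _mm_mulhi_epu16 keeps the high word
}
// Example: a = 0x80, c = 0x40 -> q = 0xFF0220 / (0x80 * 8) = 16320,
// (0x40 * 8 * 16320) >> 16 = 127, versus the exact 0x40 * 255 / 0x80 = 127.5.
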
// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
template<bool aSwapRB>
static MOZ_ALWAYS_INLINE __m128i
UnpremultiplyVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B with mask.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }

  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = _mm_extract_epi16(ga, 1);
  int a2 = _mm_extract_epi16(ga, 3);
  int a3 = _mm_extract_epi16(ga, 5);
  int a4 = _mm_extract_epi16(ga, 7);

  // Load the 16 bit reciprocals from the table for each alpha.
  // The reciprocals are duplicated in each uint32_t entry.
  // Unpack them to a final vector of duplicated reciprocals of
  // the form Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4.
  __m128i q12 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a1]),
                                   _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a2]));
  __m128i q34 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a3]),
                                   _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a4]));
  __m128i q1234 = _mm_unpacklo_epi64(q12, q34);

  // Check if the alphas are less than 0x20, so that we can undo
  // scaling of the reciprocals as appropriate.
  __m128i scale = _mm_cmplt_epi32(ga, _mm_set1_epi32(0x00200000));
  // Produce scale factors by ((a < 0x20) ^ 8) & 0x108,
  // such that scale is 0x100 if < 0x20, and 8 otherwise.
  scale = _mm_xor_si128(scale, _mm_set1_epi16(8));
  scale = _mm_and_si128(scale, _mm_set1_epi16(0x108));
  // Isolate G now so that we don't accidentally unpremultiply A.
  ga = _mm_and_si128(ga, _mm_set1_epi32(0x000000FF));

  // Scale R, B, and G as required depending on reciprocal precision.
  rb = _mm_mullo_epi16(rb, scale);
  ga = _mm_mullo_epi16(ga, scale);

  // Multiply R, B, and G by the reciprocal, only taking the high word
  // to effectively shift right by 16.
  rb = _mm_mulhi_epu16(rb, q1234);
  ga = _mm_mulhi_epu16(ga, q1234);

  // Combine back to final pixel with rb | (ga << 8) | (aSrc & 0xFF000000),
  // which will add back on the original alpha value unchanged.
  ga = _mm_slli_si128(ga, 1);
  ga = _mm_or_si128(ga, _mm_and_si128(aSrc, _mm_set1_epi32(0xFF000000)));
  return _mm_or_si128(rb, ga);
}
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::UnpremultiplyVector_SSE2<false>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::UnpremultiplyVector_SSE2<true>(long long __vector(2) const&)
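The scale selection relies on the comparison filling each matching 16-bit word with ones. A worked trace of the bit trick for the two cases (my reading, not part of the source file):

// a <  0x20: _mm_cmplt_epi32 yields 0xFFFF per word; 0xFFFF ^ 8 = 0xFFF7;
//            0xFFF7 & 0x108 = 0x100 -> channel is multiplied by 0x100.
// a >= 0x20: the comparison yields 0x0000;           0x0000 ^ 8 = 0x0008;
//            0x0008 & 0x108 = 0x008 -> channel is multiplied by 8.
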
template<bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                        uint8_t* aDst, int32_t aDstGap,
                        IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
Unexecuted instantiation: void mozilla::gfx::Unpremultiply_SSE2<false>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Unpremultiply_SSE2<true>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
// Force instantiation of unpremultiply variants here.
template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// Swizzle a vector of 4 pixels providing swaps and opaquifying.
template<bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B.
  rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  // Isolate G and A.
  __m128i ga = _mm_and_si128(aSrc, _mm_set1_epi32(0xFF00FF00));
  // Force alpha to 255 if necessary.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }
  // Combine everything back together.
  return _mm_or_si128(rb, ga);
}
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::SwizzleVector_SSE2<true, false>(long long __vector(2) const&)
Unexecuted instantiation: SwizzleSSE2.cpp:long long __vector(2) mozilla::gfx::SwizzleVector_SSE2<true, true>(long long __vector(2) const&)
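Traced on a single pixel value, the swizzle amounts to exchanging the R and B bytes within each 32-bit lane. A worked example of my own, not from the source:

// Pixel 0xAARRGGBB:
//   rb = 0xAARRGGBB & 0x00FF00FF           = 0x00RR00BB
//   word swap via _MM_SHUFFLE(2, 3, 0, 1)  = 0x00BB00RR
//   ga = 0xAARRGGBB & 0xFF00FF00           = 0xAA00GG00
//   rb | ga                                = 0xAABBGGRR  (R and B exchanged)
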
#if 0
// These specializations currently do not profile faster than the generic versions,
// so disable them for now.

// Optimized implementations for when there is no R and B swap.
template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, true>(const __m128i& aSrc)
{
  // Force alpha to 255.
  return _mm_or_si128(aSrc, _mm_set1_epi32(0xFF000000));
}

template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, false>(const __m128i& aSrc)
{
  return aSrc;
}
#endif

template<bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                  uint8_t* aDst, int32_t aDstGap,
                  IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
Unexecuted instantiation: void mozilla::gfx::Swizzle_SSE2<true, false>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
Unexecuted instantiation: void mozilla::gfx::Swizzle_SSE2<true, true>(unsigned char const*, int, unsigned char*, int, mozilla::gfx::IntSizeTyped<mozilla::gfx::UnknownUnits>)
// Force instantiation of swizzle variants here.
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

} // namespace gfx
} // namespace mozilla