Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/gfx/2d/ImageScalingSSE2.cpp
Every instrumented line in this file has an execution count of 0.
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 * R = S + 2C, or in the case of a and b: (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * pairwise carries ((a & b) | (a & c) | (b & c)), since the sum of 3 bits
 * can only ever have a carry of one.
 *
 * We then observe that the average is ((carry << 1) + sum) >> 1, or,
 * assuming overflows and underflows are avoided, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4-input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or to reverse the proof:
 * avg = ((sum >> 1) + carry + (d >> 1)) >> 1
 * avg = (((a + b + c) >> 1) + (d >> 1)) >> 1
 * avg = (a + b + c + d) >> 2
 *
 * An additional fact used in the SSE versions is that we can trivially
 * convert a rounded average to a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
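
/* A minimal scalar sketch of the rounded-to-truncated conversion above. The
 * helper name TruncatedAvgViaRoundedAvg is illustrative only; it mirrors what
 * the complemented _mm_avg_epu8 calls below do for a single byte lane.
 */
MOZ_ALWAYS_INLINE uint8_t TruncatedAvgViaRoundedAvg(uint8_t a, uint8_t b)
{
  uint8_t na = static_cast<uint8_t>(~a);
  uint8_t nb = static_cast<uint8_t>(~b);
  // Rounded average of the complements, as _mm_avg_epu8 computes per lane.
  uint8_t roundedOfComplements = static_cast<uint8_t>((na + nb + 1) >> 1);
  // Complementing again yields the truncated average: e.g. for a = 10, b = 13
  // this returns 11, where the rounded average would have been 12.
  return static_cast<uint8_t>(~roundedOfComplements);
}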

MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
{
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here; MSVC does not allow passing more than 3
 * __m128i arguments on the stack, and it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010. It does -not- inline
 * with just the inline directive.
 */
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile time constant.
#define shuffle_si128(arga, argb, imm) \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)));

  // Deinterleave the two rows: afterwards *a/*b hold the even/odd pixels of
  // the upper row and *c/*d the even/odd pixels of the lower row, so each
  // 32-bit lane sees the four pixels of one 2x2 block.
  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

  // Carry-save addition of the first three pixels: sum holds the low bits,
  // carry the pairwise carries (see the comment block above).
  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  // Complement before and after _mm_avg_epu8 to turn its rounded average
  // into a truncated one.
  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}
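
// Concretely, if the upper row holds pixels P0..P7 and the lower row holds
// Q0..Q7 (names used for illustration only), the i-th output pixel is, per
// channel, the truncated average of one 2x2 block:
//   out[i] = (P[2i] + P[2i+1] + Q[2i] + Q[2i+1]) >> 2,  for i = 0..3.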

MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
{
  // Truncated per-channel average of two rows of 4 pixels, via the
  // complement trick.
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
{
  // Split the 8 input pixels into their odd and even columns, then take the
  // truncated average: each output pixel averages two horizontally adjacent
  // source pixels.
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Without a byte-based average instruction we mask off the low bit of each
  // byte before shifting, so that no bit can leak from one channel into the
  // one below it.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}
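
// For example (illustrative values): Avg2x2(0x10203040, 0x12223242,
// 0x14243444, 0x16263646) == 0x13233343 -- each channel is the truncated
// mean of the corresponding four input channels.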

// Simple 2-pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
{
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}

namespace mozilla {
namespace gfx {

void
ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                  const IntSize &aSourceSize, uint8_t *aDest,
                                  uint32_t aDestStride)
{
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)upperRow + 1);
        __m128i c = _mm_load_si128((__m128i*)lowerRow);
        __m128i d = _mm_load_si128((__m128i*)lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'SIMD' implementation for this.
    //
    // Potentially we only have to do this in the last row, since overflowing
    // by 8 pixels in an earlier row appears to be harmless: it doesn't touch
    // invalid memory, even when reading and writing to the same surface. In
    // practice we only do this when doing an additional downscale pass, and
    // in that situation we have unused stride to write into harmlessly.
    // I do not believe the additional code complexity would be worth it, though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}
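
// A minimal scalar sketch of what HalfImage2D_SSE2 computes, for reference;
// the name HalfImage2D_ScalarSketch is illustrative and not part of the
// ImageHalfScaler interface. Every 2x2 block of source pixels becomes one
// destination pixel holding the per-channel truncated average.
static void
HalfImage2D_ScalarSketch(uint8_t *aSource, int32_t aSourceStride,
                         const IntSize &aSourceSize, uint8_t *aDest,
                         uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    uint32_t *dest = (uint32_t*)(aDest + (y / 2) * aDestStride);
    const uint32_t *upper = (const uint32_t*)(aSource + y * aSourceStride);
    const uint32_t *lower = (const uint32_t*)(aSource + (y + 1) * aSourceStride);

    for (int x = 0; x < aSourceSize.width; x += 2) {
      *dest++ = Avg2x2(upper[x], upper[x + 1], lower[x], lower[x + 1]);
    }
  }
}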

void
ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                        const IntSize &aSourceSize, uint8_t *aDest,
                                        uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // Only the upper row is 16-byte aligned; load the lower row unaligned.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in the previous function.
    for (; x < aSourceSize.width; x++) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}
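
// In the vertical path the width is preserved and only the height is halved:
// per channel, dest(x, y) is the truncated average of src(x, 2 * y) and
// src(x, 2 * y + 1).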

void
ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                          const IntSize &aSourceSize, uint8_t *aDest,
                                          uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in the previous function.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}
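
// Likewise, the horizontal path halves only the width: per channel,
// dest(x, y) is the truncated average of src(2 * x, y) and src(2 * x + 1, y).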

} // namespace gfx
} // namespace mozilla