Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/gfx/2d/SIMD.h
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
3
/* This Source Code Form is subject to the terms of the Mozilla Public
4
 * License, v. 2.0. If a copy of the MPL was not distributed with this
5
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7
#ifndef _MOZILLA_GFX_SIMD_H_
8
#define _MOZILLA_GFX_SIMD_H_
9
10
/**
11
 * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
12
 * if they want access to the SSE2 functions.
13
 */
14
15
#ifdef SIMD_COMPILE_SSE2
16
#include <emmintrin.h>
#include <xmmintrin.h>
17
#endif
18
19
namespace mozilla {
20
namespace gfx {
21
22
namespace simd {
23
24
template<typename u8x16_t>
25
u8x16_t Load8(const uint8_t* aSource);
26
27
template<typename u8x16_t>
28
u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
29
              uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p);
30
31
template<typename u8x16_t>
32
u8x16_t FromZero8();
33
34
template<typename i16x8_t>
35
i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h);
36
37
template<typename u16x8_t>
38
u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h);
39
40
template<typename i16x8_t>
41
i16x8_t FromI16(int16_t a);
42
43
template<typename u16x8_t>
44
u16x8_t FromU16(uint16_t a);
45
46
template<typename i32x4_t>
47
i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);
48
49
template<typename i32x4_t>
50
i32x4_t From32(int32_t a);
51
52
template<typename f32x4_t>
53
f32x4_t FromF32(float a, float b, float c, float d);
54
55
template<typename f32x4_t>
56
f32x4_t FromF32(float a);
57
58
// All SIMD backends overload these functions for their SIMD types:
59
60
#if 0
61
62
// Store 16 bytes to a 16-byte aligned address
63
void Store8(uint8_t* aTarget, u8x16_t aM);
64
65
// Fixed shifts
66
template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
67
template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);
68
69
i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
70
i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
71
i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
72
i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
73
u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
74
u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
75
i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
76
i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);
77
78
// Truncating i16 -> i16 multiplication
79
i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);
80
81
// Long multiplication i16 -> i32
82
// aFactorsA1B1 = (a1[4] b1[4])
83
// aFactorsA2B2 = (a2[4] b2[4])
84
// aProductA = a1 * a2, aProductB = b1 * b2
85
void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
86
                         i32x4_t& aProductA, i32x4_t& aProductB);
87
88
// Long multiplication + pairwise addition i16 -> i32
89
// See the scalar implementation for specifics.
90
i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
91
i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);
92
93
// Set all four 32-bit components to the value of the component at aIndex.
94
template<int8_t aIndex>
95
i32x4_t Splat32(i32x4_t aM);
96
97
// Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
98
// re-interpret the result as sixteen 8-bit values.
99
template<int8_t aIndex>
100
u8x16_t Splat32On8(u8x16_t aM);
101
102
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
103
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
104
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);
105
106
u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
107
u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
108
i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
109
i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
110
i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);
111
112
i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
113
i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
114
u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
115
u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);
116
117
i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
118
u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
119
u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4);
120
121
i32x4_t FastDivideBy255(i32x4_t m);
122
i16x8_t FastDivideBy255_16(i16x8_t m);
123
124
#endif
125
126
// Scalar
127
128
struct Scalaru8x16_t {
129
  uint8_t u8[16];
130
};
131
132
union Scalari16x8_t {
133
  int16_t i16[8];
134
  uint16_t u16[8];
135
};
136
137
typedef Scalari16x8_t Scalaru16x8_t;
138
139
struct Scalari32x4_t {
140
  int32_t i32[4];
141
};
142
143
struct Scalarf32x4_t {
144
  float f32[4];
145
};
146
147
template<>
148
inline Scalaru8x16_t
149
Load8<Scalaru8x16_t>(const uint8_t* aSource)
150
0
{
151
0
  return *(Scalaru8x16_t*)aSource;
152
0
}
153
154
inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM)
155
0
{
156
0
  *(Scalaru8x16_t*)aTarget = aM;
157
0
}
158
159
template<>
160
inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
161
                                          uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
162
0
{
163
0
  Scalaru8x16_t _m;
164
0
  _m.u8[0] = a;
165
0
  _m.u8[1] = b;
166
0
  _m.u8[2] = c;
167
0
  _m.u8[3] = d;
168
0
  _m.u8[4] = e;
169
0
  _m.u8[5] = f;
170
0
  _m.u8[6] = g;
171
0
  _m.u8[7] = h;
172
0
  _m.u8[8+0] = i;
173
0
  _m.u8[8+1] = j;
174
0
  _m.u8[8+2] = k;
175
0
  _m.u8[8+3] = l;
176
0
  _m.u8[8+4] = m;
177
0
  _m.u8[8+5] = n;
178
0
  _m.u8[8+6] = o;
179
0
  _m.u8[8+7] = p;
180
0
  return _m;
181
0
}
182
183
template<>
184
inline Scalaru8x16_t FromZero8<Scalaru8x16_t>()
185
0
{
186
0
  return From8<Scalaru8x16_t>(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
187
0
}
188
189
template<>
190
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
191
0
{
192
0
  Scalari16x8_t m;
193
0
  m.i16[0] = a;
194
0
  m.i16[1] = b;
195
0
  m.i16[2] = c;
196
0
  m.i16[3] = d;
197
0
  m.i16[4] = e;
198
0
  m.i16[5] = f;
199
0
  m.i16[6] = g;
200
0
  m.i16[7] = h;
201
0
  return m;
202
0
}
203
204
template<>
205
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
206
0
{
207
0
  Scalaru16x8_t m;
208
0
  m.u16[0] = a;
209
0
  m.u16[1] = b;
210
0
  m.u16[2] = c;
211
0
  m.u16[3] = d;
212
0
  m.u16[4] = e;
213
0
  m.u16[5] = f;
214
0
  m.u16[6] = g;
215
0
  m.u16[7] = h;
216
0
  return m;
217
0
}
218
219
template<>
220
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a)
221
0
{
222
0
  return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
223
0
}
224
225
template<>
226
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a)
227
0
{
228
0
  return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
229
0
}
230
231
template<>
232
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c, int32_t d)
233
0
{
234
0
  Scalari32x4_t m;
235
0
  m.i32[0] = a;
236
0
  m.i32[1] = b;
237
0
  m.i32[2] = c;
238
0
  m.i32[3] = d;
239
0
  return m;
240
0
}
241
242
template<>
243
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c, float d)
244
0
{
245
0
  Scalarf32x4_t m;
246
0
  m.f32[0] = a;
247
0
  m.f32[1] = b;
248
0
  m.f32[2] = c;
249
0
  m.f32[3] = d;
250
0
  return m;
251
0
}
252
253
template<>
254
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a)
255
0
{
256
0
  return FromF32<Scalarf32x4_t>(a, a, a, a);
257
0
}
258
259
template<>
260
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a)
261
0
{
262
0
  return From32<Scalari32x4_t>(a, a, a, a);
263
0
}
264
265
template<int32_t aNumberOfBits>
266
inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM)
267
{
268
  return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits, uint16_t(aM.i16[1]) >> aNumberOfBits,
269
                               uint16_t(aM.i16[2]) >> aNumberOfBits, uint16_t(aM.i16[3]) >> aNumberOfBits,
270
                               uint16_t(aM.i16[4]) >> aNumberOfBits, uint16_t(aM.i16[5]) >> aNumberOfBits,
271
                               uint16_t(aM.i16[6]) >> aNumberOfBits, uint16_t(aM.i16[7]) >> aNumberOfBits);
272
}
273
274
template<int32_t aNumberOfBits>
275
inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM)
276
0
{
277
0
  return From32<Scalari32x4_t>(aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
278
0
                               aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
279
0
}
280
281
inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
282
0
{
283
0
  return FromU16<Scalaru16x8_t>(aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1],
284
0
                               aM1.u16[2] + aM2.u16[2], aM1.u16[3] + aM2.u16[3],
285
0
                               aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
286
0
                               aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
287
0
}
288
289
inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2)
290
0
{
291
0
  return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1],
292
0
                               aM1.i32[2] + aM2.i32[2], aM1.i32[3] + aM2.i32[3]);
293
0
}
294
295
inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
296
0
{
297
0
  return FromU16<Scalaru16x8_t>(aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1],
298
0
                               aM1.u16[2] - aM2.u16[2], aM1.u16[3] - aM2.u16[3],
299
0
                               aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
300
0
                               aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
301
0
}
302
303
inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2)
304
0
{
305
0
  return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1],
306
0
                               aM1.i32[2] - aM2.i32[2], aM1.i32[3] - aM2.i32[3]);
307
0
}
308
309
inline int32_t
310
umin(int32_t a, int32_t b)
311
0
{
312
0
  return a - ((a - b) & -(a > b));
313
0
}
314
315
inline int32_t
316
umax(int32_t a, int32_t b)
317
0
{
318
0
  return a - ((a - b) & -(a < b));
319
0
}
320
321
inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
322
0
{
323
0
  return From8<Scalaru8x16_t>(umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
324
0
                              umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
325
0
                              umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
326
0
                              umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
327
0
                              umin(aM1.u8[8+0], aM2.u8[8+0]), umin(aM1.u8[8+1], aM2.u8[8+1]),
328
0
                              umin(aM1.u8[8+2], aM2.u8[8+2]), umin(aM1.u8[8+3], aM2.u8[8+3]),
329
0
                              umin(aM1.u8[8+4], aM2.u8[8+4]), umin(aM1.u8[8+5], aM2.u8[8+5]),
330
0
                              umin(aM1.u8[8+6], aM2.u8[8+6]), umin(aM1.u8[8+7], aM2.u8[8+7]));
331
0
}
332
333
inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
334
0
{
335
0
  return From8<Scalaru8x16_t>(umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
336
0
                              umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
337
0
                              umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
338
0
                              umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
339
0
                              umax(aM1.u8[8+0], aM2.u8[8+0]), umax(aM1.u8[8+1], aM2.u8[8+1]),
340
0
                              umax(aM1.u8[8+2], aM2.u8[8+2]), umax(aM1.u8[8+3], aM2.u8[8+3]),
341
0
                              umax(aM1.u8[8+4], aM2.u8[8+4]), umax(aM1.u8[8+5], aM2.u8[8+5]),
342
0
                              umax(aM1.u8[8+6], aM2.u8[8+6]), umax(aM1.u8[8+7], aM2.u8[8+7]));
343
0
}
344
345
inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2)
346
0
{
347
0
  return From32<Scalari32x4_t>(umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
348
0
                               umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
349
0
}
350
351
inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2)
352
0
{
353
0
  return From32<Scalari32x4_t>(umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
354
0
                               umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
355
0
}
356
357
inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
358
0
{
359
0
  return FromU16<Scalaru16x8_t>(uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])), uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
360
0
                                uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])), uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
361
0
                                uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])), uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
362
0
                                uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])), uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
363
0
}
364
365
inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
366
                                Scalari16x8_t aFactorsA2B2,
367
                                Scalari32x4_t& aProductA,
368
                                Scalari32x4_t& aProductB)
369
0
{
370
0
  aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
371
0
                                    aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
372
0
                                    aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
373
0
                                    aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
374
0
  aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
375
0
                                    aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
376
0
                                    aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
377
0
                                    aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
378
0
}
379
380
inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
381
                                        Scalari16x8_t aFactorsB)
382
0
{
383
0
  return From32<Scalari32x4_t>(aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
384
0
                               aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
385
0
                               aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
386
0
                               aFactorsA.i16[6] * aFactorsB.i16[6] + aFactorsA.i16[7] * aFactorsB.i16[7]);
387
0
}
388
389
template<int8_t aIndex>
390
inline void AssertIndex()
391
0
{
392
0
  static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
393
0
                "Invalid splat index");
394
0
}
Unexecuted instantiation: void mozilla::gfx::simd::AssertIndex<(signed char)3>()
Unexecuted instantiation: void mozilla::gfx::simd::AssertIndex<(signed char)2>()
Unexecuted instantiation: void mozilla::gfx::simd::AssertIndex<(signed char)0>()
Unexecuted instantiation: void mozilla::gfx::simd::AssertIndex<(signed char)1>()
395
396
template<int8_t aIndex>
397
inline Scalari32x4_t Splat32(Scalari32x4_t aM)
398
{
399
  AssertIndex<aIndex>();
400
  return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex],
401
                               aM.i32[aIndex], aM.i32[aIndex]);
402
}
403
404
template<int8_t i>
405
inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM)
406
0
{
407
0
  AssertIndex<i>();
408
0
  return From8<Scalaru8x16_t>(aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
409
0
                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
410
0
                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
411
0
                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3]);
412
0
}
Unexecuted instantiation: mozilla::gfx::simd::Scalaru8x16_t mozilla::gfx::simd::Splat32On8<(signed char)0>(mozilla::gfx::simd::Scalaru8x16_t)
Unexecuted instantiation: mozilla::gfx::simd::Scalaru8x16_t mozilla::gfx::simd::Splat32On8<(signed char)1>(mozilla::gfx::simd::Scalaru8x16_t)
Unexecuted instantiation: mozilla::gfx::simd::Scalaru8x16_t mozilla::gfx::simd::Splat32On8<(signed char)2>(mozilla::gfx::simd::Scalaru8x16_t)
Unexecuted instantiation: mozilla::gfx::simd::Scalaru8x16_t mozilla::gfx::simd::Splat32On8<(signed char)3>(mozilla::gfx::simd::Scalaru8x16_t)
413
414
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
415
inline Scalari32x4_t Shuffle32(Scalari32x4_t aM)
416
{
417
  AssertIndex<i0>();
418
  AssertIndex<i1>();
419
  AssertIndex<i2>();
420
  AssertIndex<i3>();
421
  Scalari32x4_t m = aM;
422
  m.i32[0] = aM.i32[i3];
423
  m.i32[1] = aM.i32[i2];
424
  m.i32[2] = aM.i32[i1];
425
  m.i32[3] = aM.i32[i0];
426
  return m;
427
}
428
429
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
430
inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM)
431
0
{
432
0
  AssertIndex<i0>();
433
0
  AssertIndex<i1>();
434
0
  AssertIndex<i2>();
435
0
  AssertIndex<i3>();
436
0
  Scalari16x8_t m = aM;
437
0
  m.i16[0] = aM.i16[i3];
438
0
  m.i16[1] = aM.i16[i2];
439
0
  m.i16[2] = aM.i16[i1];
440
0
  m.i16[3] = aM.i16[i0];
441
0
  return m;
442
0
}
Unexecuted instantiation: mozilla::gfx::simd::Scalari16x8_t mozilla::gfx::simd::ShuffleLo16<(signed char)1, (signed char)0, (signed char)1, (signed char)0>(mozilla::gfx::simd::Scalari16x8_t)
Unexecuted instantiation: mozilla::gfx::simd::Scalari16x8_t mozilla::gfx::simd::ShuffleLo16<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(mozilla::gfx::simd::Scalari16x8_t)
443
444
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
445
inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM)
446
0
{
447
0
  AssertIndex<i0>();
448
0
  AssertIndex<i1>();
449
0
  AssertIndex<i2>();
450
0
  AssertIndex<i3>();
451
0
  Scalari16x8_t m = aM;
452
0
  m.i16[4 + 0] = aM.i16[4 + i3];
453
0
  m.i16[4 + 1] = aM.i16[4 + i2];
454
0
  m.i16[4 + 2] = aM.i16[4 + i1];
455
0
  m.i16[4 + 3] = aM.i16[4 + i0];
456
0
  return m;
457
0
}
Unexecuted instantiation: mozilla::gfx::simd::Scalari16x8_t mozilla::gfx::simd::ShuffleHi16<(signed char)1, (signed char)0, (signed char)1, (signed char)0>(mozilla::gfx::simd::Scalari16x8_t)
Unexecuted instantiation: mozilla::gfx::simd::Scalari16x8_t mozilla::gfx::simd::ShuffleHi16<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(mozilla::gfx::simd::Scalari16x8_t)
458
459
template<int8_t aIndexLo, int8_t aIndexHi>
460
inline Scalaru16x8_t Splat16(Scalaru16x8_t aM)
461
0
{
462
0
  AssertIndex<aIndexLo>();
463
0
  AssertIndex<aIndexHi>();
464
0
  Scalaru16x8_t m;
465
0
  int16_t chosenValueLo = aM.u16[aIndexLo];
466
0
  m.u16[0] = chosenValueLo;
467
0
  m.u16[1] = chosenValueLo;
468
0
  m.u16[2] = chosenValueLo;
469
0
  m.u16[3] = chosenValueLo;
470
0
  int16_t chosenValueHi = aM.u16[4 + aIndexHi];
471
0
  m.u16[4] = chosenValueHi;
472
0
  m.u16[5] = chosenValueHi;
473
0
  m.u16[6] = chosenValueHi;
474
0
  m.u16[7] = chosenValueHi;
475
0
  return m;
476
0
}
477
478
inline Scalaru8x16_t
479
InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2)
480
0
{
481
0
  return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1],
482
0
                              m1.u8[2], m2.u8[2], m1.u8[3], m2.u8[3],
483
0
                              m1.u8[4], m2.u8[4], m1.u8[5], m2.u8[5],
484
0
                              m1.u8[6], m2.u8[6], m1.u8[7], m2.u8[7]);
485
0
}
486
487
inline Scalaru8x16_t
488
InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2)
489
0
{
490
0
  return From8<Scalaru8x16_t>(m1.u8[8+0], m2.u8[8+0], m1.u8[8+1], m2.u8[8+1],
491
0
                              m1.u8[8+2], m2.u8[8+2], m1.u8[8+3], m2.u8[8+3],
492
0
                              m1.u8[8+4], m2.u8[8+4], m1.u8[8+5], m2.u8[8+5],
493
0
                              m1.u8[8+6], m2.u8[8+6], m1.u8[8+7], m2.u8[8+7]);
494
0
}
495
496
inline Scalaru16x8_t
497
InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2)
498
0
{
499
0
  return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
500
0
                               m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
501
0
}
502
503
inline Scalaru16x8_t
504
InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2)
505
0
{
506
0
  return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
507
0
                               m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
508
0
}
509
510
inline Scalari32x4_t
511
InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2)
512
0
{
513
0
  return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
514
0
}
515
516
inline Scalari16x8_t
517
UnpackLo8x8ToI16x8(Scalaru8x16_t aM)
518
0
{
519
0
  Scalari16x8_t m;
520
0
  m.i16[0] = aM.u8[0];
521
0
  m.i16[1] = aM.u8[1];
522
0
  m.i16[2] = aM.u8[2];
523
0
  m.i16[3] = aM.u8[3];
524
0
  m.i16[4] = aM.u8[4];
525
0
  m.i16[5] = aM.u8[5];
526
0
  m.i16[6] = aM.u8[6];
527
0
  m.i16[7] = aM.u8[7];
528
0
  return m;
529
0
}
530
531
inline Scalari16x8_t
532
UnpackHi8x8ToI16x8(Scalaru8x16_t aM)
533
0
{
534
0
  Scalari16x8_t m;
535
0
  m.i16[0] = aM.u8[8+0];
536
0
  m.i16[1] = aM.u8[8+1];
537
0
  m.i16[2] = aM.u8[8+2];
538
0
  m.i16[3] = aM.u8[8+3];
539
0
  m.i16[4] = aM.u8[8+4];
540
0
  m.i16[5] = aM.u8[8+5];
541
0
  m.i16[6] = aM.u8[8+6];
542
0
  m.i16[7] = aM.u8[8+7];
543
0
  return m;
544
0
}
545
546
inline Scalaru16x8_t
547
UnpackLo8x8ToU16x8(Scalaru8x16_t aM)
548
0
{
549
0
  return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]), uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
550
0
                                uint16_t(aM.u8[4]), uint16_t(aM.u8[5]), uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
551
0
}
552
553
inline Scalaru16x8_t
554
UnpackHi8x8ToU16x8(Scalaru8x16_t aM)
555
0
{
556
0
  return FromU16<Scalaru16x8_t>(aM.u8[8+0], aM.u8[8+1], aM.u8[8+2], aM.u8[8+3],
557
0
                                aM.u8[8+4], aM.u8[8+5], aM.u8[8+6], aM.u8[8+7]);
558
0
}
559
560
template<uint8_t aNumBytes>
561
inline Scalaru8x16_t
562
Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678)
563
{
564
  Scalaru8x16_t m;
565
  for (uint8_t i = 0; i < 16; i++) {
566
    uint8_t sourceByte = i + aNumBytes;
567
    m.u8[i] = sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
568
  }
569
  return m;
570
}
571
572
template<typename T>
573
inline int16_t
574
SaturateTo16(T a)
575
0
{
576
0
  return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN);
577
0
}
578
579
inline Scalari16x8_t
580
PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2)
581
0
{
582
0
  Scalari16x8_t m;
583
0
  m.i16[0] = SaturateTo16(m1.i32[0]);
584
0
  m.i16[1] = SaturateTo16(m1.i32[1]);
585
0
  m.i16[2] = SaturateTo16(m1.i32[2]);
586
0
  m.i16[3] = SaturateTo16(m1.i32[3]);
587
0
  m.i16[4] = SaturateTo16(m2.i32[0]);
588
0
  m.i16[5] = SaturateTo16(m2.i32[1]);
589
0
  m.i16[6] = SaturateTo16(m2.i32[2]);
590
0
  m.i16[7] = SaturateTo16(m2.i32[3]);
591
0
  return m;
592
0
}
593
594
template<typename T>
595
inline uint16_t
596
SaturateToU16(T a)
597
0
{
598
0
  return uint16_t(umin(a & -(a >= 0), INT16_MAX));
599
0
}
600
601
inline Scalaru16x8_t
602
PackAndSaturate32ToU16(Scalari32x4_t m1, Scalari32x4_t m2)
603
0
{
604
0
  Scalaru16x8_t m;
605
0
  m.u16[0] = SaturateToU16(m1.i32[0]);
606
0
  m.u16[1] = SaturateToU16(m1.i32[1]);
607
0
  m.u16[2] = SaturateToU16(m1.i32[2]);
608
0
  m.u16[3] = SaturateToU16(m1.i32[3]);
609
0
  m.u16[4] = SaturateToU16(m2.i32[0]);
610
0
  m.u16[5] = SaturateToU16(m2.i32[1]);
611
0
  m.u16[6] = SaturateToU16(m2.i32[2]);
612
0
  m.u16[7] = SaturateToU16(m2.i32[3]);
613
0
  return m;
614
0
}
615
616
template<typename T>
617
inline uint8_t
618
SaturateTo8(T a)
619
0
{
620
0
  return uint8_t(umin(a & -(a >= 0), 255));
621
0
}
Unexecuted instantiation: unsigned char mozilla::gfx::simd::SaturateTo8<int>(int)
Unexecuted instantiation: unsigned char mozilla::gfx::simd::SaturateTo8<short>(short)
622
623
inline Scalaru8x16_t
624
PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2, Scalari32x4_t m3, const Scalari32x4_t& m4)
625
0
{
626
0
  Scalaru8x16_t m;
627
0
  m.u8[0]  = SaturateTo8(m1.i32[0]);
628
0
  m.u8[1]  = SaturateTo8(m1.i32[1]);
629
0
  m.u8[2]  = SaturateTo8(m1.i32[2]);
630
0
  m.u8[3]  = SaturateTo8(m1.i32[3]);
631
0
  m.u8[4]  = SaturateTo8(m2.i32[0]);
632
0
  m.u8[5]  = SaturateTo8(m2.i32[1]);
633
0
  m.u8[6]  = SaturateTo8(m2.i32[2]);
634
0
  m.u8[7]  = SaturateTo8(m2.i32[3]);
635
0
  m.u8[8]  = SaturateTo8(m3.i32[0]);
636
0
  m.u8[9]  = SaturateTo8(m3.i32[1]);
637
0
  m.u8[10] = SaturateTo8(m3.i32[2]);
638
0
  m.u8[11] = SaturateTo8(m3.i32[3]);
639
0
  m.u8[12] = SaturateTo8(m4.i32[0]);
640
0
  m.u8[13] = SaturateTo8(m4.i32[1]);
641
0
  m.u8[14] = SaturateTo8(m4.i32[2]);
642
0
  m.u8[15] = SaturateTo8(m4.i32[3]);
643
0
  return m;
644
0
}
645
646
inline Scalaru8x16_t
647
PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2)
648
0
{
649
0
  Scalaru8x16_t m;
650
0
  m.u8[0]  = SaturateTo8(m1.i16[0]);
651
0
  m.u8[1]  = SaturateTo8(m1.i16[1]);
652
0
  m.u8[2]  = SaturateTo8(m1.i16[2]);
653
0
  m.u8[3]  = SaturateTo8(m1.i16[3]);
654
0
  m.u8[4]  = SaturateTo8(m1.i16[4]);
655
0
  m.u8[5]  = SaturateTo8(m1.i16[5]);
656
0
  m.u8[6]  = SaturateTo8(m1.i16[6]);
657
0
  m.u8[7]  = SaturateTo8(m1.i16[7]);
658
0
  m.u8[8]  = SaturateTo8(m2.i16[0]);
659
0
  m.u8[9]  = SaturateTo8(m2.i16[1]);
660
0
  m.u8[10] = SaturateTo8(m2.i16[2]);
661
0
  m.u8[11] = SaturateTo8(m2.i16[3]);
662
0
  m.u8[12] = SaturateTo8(m2.i16[4]);
663
0
  m.u8[13] = SaturateTo8(m2.i16[5]);
664
0
  m.u8[14] = SaturateTo8(m2.i16[6]);
665
0
  m.u8[15] = SaturateTo8(m2.i16[7]);
666
0
  return m;
667
0
}
668
669
// Fast approximate division by 255. It has the property that
670
// for all 0 <= n <= 255*255, FAST_DIVIDE_BY_255(n) == n/255.
671
// But it only uses two adds and two shifts instead of an
672
// integer division (which is expensive on many processors).
673
//
674
// equivalent to v/255
675
template<class B, class A>
676
inline B FastDivideBy255(A v)
677
0
{
678
0
  return ((v << 8) + v + 255) >> 16;
679
0
}
Unexecuted instantiation: unsigned short mozilla::gfx::simd::FastDivideBy255<unsigned short, int>(int)
Unexecuted instantiation: int mozilla::gfx::simd::FastDivideBy255<int, int>(int)
680
681
inline Scalaru16x8_t
682
FastDivideBy255_16(Scalaru16x8_t m)
683
0
{
684
0
  return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
685
0
                                FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
686
0
                                FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
687
0
                                FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
688
0
                                FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
689
0
                                FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
690
0
                                FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
691
0
                                FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
692
0
}
693
694
inline Scalari32x4_t
695
FastDivideBy255(Scalari32x4_t m)
696
0
{
697
0
  return From32<Scalari32x4_t>(FastDivideBy255<int32_t>(m.i32[0]),
698
0
                               FastDivideBy255<int32_t>(m.i32[1]),
699
0
                               FastDivideBy255<int32_t>(m.i32[2]),
700
0
                               FastDivideBy255<int32_t>(m.i32[3]));
701
0
}
702
703
inline Scalaru8x16_t
704
Pick(Scalaru8x16_t mask, Scalaru8x16_t a, Scalaru8x16_t b)
705
0
{
706
0
  return From8<Scalaru8x16_t>((a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
707
0
                              (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
708
0
                              (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
709
0
                              (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
710
0
                              (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
711
0
                              (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
712
0
                              (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
713
0
                              (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
714
0
                              (a.u8[8+0] & (~mask.u8[8+0])) | (b.u8[8+0] & mask.u8[8+0]),
715
0
                              (a.u8[8+1] & (~mask.u8[8+1])) | (b.u8[8+1] & mask.u8[8+1]),
716
0
                              (a.u8[8+2] & (~mask.u8[8+2])) | (b.u8[8+2] & mask.u8[8+2]),
717
0
                              (a.u8[8+3] & (~mask.u8[8+3])) | (b.u8[8+3] & mask.u8[8+3]),
718
0
                              (a.u8[8+4] & (~mask.u8[8+4])) | (b.u8[8+4] & mask.u8[8+4]),
719
0
                              (a.u8[8+5] & (~mask.u8[8+5])) | (b.u8[8+5] & mask.u8[8+5]),
720
0
                              (a.u8[8+6] & (~mask.u8[8+6])) | (b.u8[8+6] & mask.u8[8+6]),
721
0
                              (a.u8[8+7] & (~mask.u8[8+7])) | (b.u8[8+7] & mask.u8[8+7]));
722
0
}
723
724
inline Scalari32x4_t
725
Pick(Scalari32x4_t mask, Scalari32x4_t a, Scalari32x4_t b)
726
0
{
727
0
  return From32<Scalari32x4_t>((a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
728
0
                               (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
729
0
                               (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
730
0
                               (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
731
0
}
732
733
inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t)
734
0
{
735
0
  return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
736
0
                                a.f32[1] + (b.f32[1] - a.f32[1]) * t,
737
0
                                a.f32[2] + (b.f32[2] - a.f32[2]) * t,
738
0
                                a.f32[3] + (b.f32[3] - a.f32[3]) * t);
739
0
}
740
741
inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa, float wb)
742
0
{
743
0
  return FromF32<Scalarf32x4_t>(a.f32[0] * wa + b.f32[0] * wb,
744
0
                                a.f32[1] * wa + b.f32[1] * wb,
745
0
                                a.f32[2] * wa + b.f32[2] * wb,
746
0
                                a.f32[3] * wa + b.f32[3] * wb);
747
0
}
748
749
inline Scalarf32x4_t AbsF32(Scalarf32x4_t a)
750
0
{
751
0
  return FromF32<Scalarf32x4_t>(fabs(a.f32[0]),
752
0
                                fabs(a.f32[1]),
753
0
                                fabs(a.f32[2]),
754
0
                                fabs(a.f32[3]));
755
0
}
756
757
inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b)
758
0
{
759
0
  return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0],
760
0
                                a.f32[1] + b.f32[1],
761
0
                                a.f32[2] + b.f32[2],
762
0
                                a.f32[3] + b.f32[3]);
763
0
}
764
765
inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b)
766
0
{
767
0
  return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0],
768
0
                                a.f32[1] * b.f32[1],
769
0
                                a.f32[2] * b.f32[2],
770
0
                                a.f32[3] * b.f32[3]);
771
0
}
772
773
inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b)
774
0
{
775
0
  return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0],
776
0
                                a.f32[1] / b.f32[1],
777
0
                                a.f32[2] / b.f32[2],
778
0
                                a.f32[3] / b.f32[3]);
779
0
}
780
781
template<uint8_t aIndex>
782
inline Scalarf32x4_t SplatF32(Scalarf32x4_t m)
783
0
{
784
0
  AssertIndex<aIndex>();
785
0
  return FromF32<Scalarf32x4_t>(m.f32[aIndex],
786
0
                                m.f32[aIndex],
787
0
                                m.f32[aIndex],
788
0
                                m.f32[aIndex]);
789
0
}
790
791
// Convert each float lane to int32, rounding halfway cases toward +infinity
// (floor(x + 0.5)).
inline Scalari32x4_t F32ToI32(Scalarf32x4_t m)
{
  int32_t lane[4];
  for (int i = 0; i < 4; i++) {
    lane[i] = int32_t(floor(m.f32[i] + 0.5f));
  }
  return From32<Scalari32x4_t>(lane[0], lane[1], lane[2], lane[3]);
}
798
799
#ifdef SIMD_COMPILE_SSE2
800
801
// SSE2
802
803
template<>
804
inline __m128i
805
Load8<__m128i>(const uint8_t* aSource)
806
0
{
807
0
  return _mm_load_si128((const __m128i*)aSource);
808
0
}
809
810
// Aligned 16-byte store; aTarget must be 16-byte aligned.
inline void Store8(uint8_t* aTarget, __m128i aM)
{
  __m128i* dest = reinterpret_cast<__m128i*>(aTarget);
  _mm_store_si128(dest, aM);
}
814
815
template<>
816
inline __m128i FromZero8<__m128i>()
817
0
{
818
0
  return _mm_setzero_si128();
819
0
}
820
821
template<>
822
inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
823
                              uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
824
0
{
825
0
  return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (e << 8) + f, (h << 8) + g,
826
0
                        (j << 8) + i, (l << 8) + k, (m << 8) + n, (p << 8) + o);
827
0
}
828
829
template<>
830
inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
831
0
{
832
0
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
833
0
}
834
835
template<>
836
inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
837
0
{
838
0
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
839
0
}
840
841
template<>
842
inline __m128i FromI16<__m128i>(int16_t a)
843
0
{
844
0
  return _mm_set1_epi16(a);
845
0
}
846
847
template<>
848
inline __m128i FromU16<__m128i>(uint16_t a)
849
0
{
850
0
  return _mm_set1_epi16((int16_t)a);
851
0
}
852
853
template<>
854
inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d)
855
0
{
856
0
  return _mm_setr_epi32(a, b, c, d);
857
0
}
858
859
template<>
860
inline __m128i From32<__m128i>(int32_t a)
861
0
{
862
0
  return _mm_set1_epi32(a);
863
0
}
864
865
template<>
866
inline __m128 FromF32<__m128>(float a, float b, float c, float d)
867
0
{
868
0
  return _mm_setr_ps(a, b, c, d);
869
0
}
870
871
template<>
872
inline __m128 FromF32<__m128>(float a)
873
0
{
874
0
  return _mm_set1_ps(a);
875
0
}
876
877
// Logical (zero-filling) right shift of each 16-bit lane.
template<int32_t aNumberOfBits>
inline __m128i ShiftRight16(__m128i aM)
{
  const __m128i shifted = _mm_srli_epi16(aM, aNumberOfBits);
  return shifted;
}
882
883
// Arithmetic (sign-preserving) right shift of each 32-bit lane.
template<int32_t aNumberOfBits>
inline __m128i ShiftRight32(__m128i aM)
{
  const __m128i shifted = _mm_srai_epi32(aM, aNumberOfBits);
  return shifted;
}
888
889
// Lane-wise addition of eight 16-bit integers; overflow wraps.
inline __m128i Add16(__m128i aM1, __m128i aM2)
{
  const __m128i sum = _mm_add_epi16(aM1, aM2);
  return sum;
}
893
894
// Lane-wise addition of four 32-bit integers; overflow wraps.
inline __m128i Add32(__m128i aM1, __m128i aM2)
{
  const __m128i sum = _mm_add_epi32(aM1, aM2);
  return sum;
}
898
899
// Lane-wise subtraction (aM1 - aM2) of eight 16-bit integers; overflow wraps.
inline __m128i Sub16(__m128i aM1, __m128i aM2)
{
  const __m128i diff = _mm_sub_epi16(aM1, aM2);
  return diff;
}
903
904
// Lane-wise subtraction (aM1 - aM2) of four 32-bit integers; overflow wraps.
inline __m128i Sub32(__m128i aM1, __m128i aM2)
{
  const __m128i diff = _mm_sub_epi32(aM1, aM2);
  return diff;
}
908
909
// Lane-wise minimum of sixteen unsigned 8-bit integers.
inline __m128i Min8(__m128i aM1, __m128i aM2)
{
  const __m128i lesser = _mm_min_epu8(aM1, aM2);
  return lesser;
}
913
914
// Lane-wise maximum of sixteen unsigned 8-bit integers.
inline __m128i Max8(__m128i aM1, __m128i aM2)
{
  const __m128i greater = _mm_max_epu8(aM1, aM2);
  return greater;
}
918
919
// Lane-wise signed 32-bit minimum. SSE2 has no min_epi32 (that is SSE4.1), so
// this selects via compare-and-mask: where m1 > m2, subtract (m1 - m2) from m1
// to yield m2; elsewhere keep m1.
inline __m128i Min32(__m128i aM1, __m128i aM2)
{
  const __m128i diff = _mm_sub_epi32(aM1, aM2);
  const __m128i firstIsGreater = _mm_cmpgt_epi32(aM1, aM2);
  return _mm_sub_epi32(aM1, _mm_and_si128(diff, firstIsGreater));
}
925
926
// Lane-wise signed 32-bit maximum. SSE2 has no max_epi32 (that is SSE4.1), so
// this selects via compare-and-mask: where m2 > m1, subtract (m1 - m2) from m1
// to yield m2; elsewhere keep m1.
inline __m128i Max32(__m128i aM1, __m128i aM2)
{
  const __m128i diff = _mm_sub_epi32(aM1, aM2);
  const __m128i secondIsGreater = _mm_cmpgt_epi32(aM2, aM1);
  return _mm_sub_epi32(aM1, _mm_and_si128(diff, secondIsGreater));
}
932
933
// Lane-wise 16-bit multiply keeping the low 16 bits of each product.
inline __m128i Mul16(__m128i aM1, __m128i aM2)
{
  const __m128i product = _mm_mullo_epi16(aM1, aM2);
  return product;
}
937
938
// Lane-wise 16-bit multiply keeping the low 16 bits of each product. The low
// half is identical for signed and unsigned operands, so mullo_epi16 works
// for unsigned lanes as well.
inline __m128i MulU16(__m128i aM1, __m128i aM2)
{
  const __m128i product = _mm_mullo_epi16(aM1, aM2);
  return product;
}
942
943
// Multiply eight pairs of signed 16-bit lanes into eight full 32-bit
// products. The low and high halves of each product come from mullo/mulhi;
// interleaving them reassembles the 32-bit results: lanes 0-3 go to
// aProductA, lanes 4-7 to aProductB.
inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1,
                                __m128i aFactorsA2B2,
                                __m128i& aProductA,
                                __m128i& aProductB)
{
  const __m128i lowHalves = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
  const __m128i highHalves = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
  aProductA = _mm_unpacklo_epi16(lowHalves, highHalves);
  aProductB = _mm_unpackhi_epi16(lowHalves, highHalves);
}
953
954
// Multiply corresponding signed 16-bit lanes and horizontally add adjacent
// pairs of products into four 32-bit lanes (pmaddwd).
inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA,
                                  __m128i aFactorsB)
{
  const __m128i pairSums = _mm_madd_epi16(aFactorsA, aFactorsB);
  return pairSums;
}
959
960
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
961
inline __m128i Shuffle32(__m128i aM)
962
0
{
963
0
  AssertIndex<i0>();
964
0
  AssertIndex<i1>();
965
0
  AssertIndex<i2>();
966
0
  AssertIndex<i3>();
967
0
  return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
968
0
}
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)0, (signed char)0, (signed char)0, (signed char)0>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)1, (signed char)1, (signed char)1, (signed char)1>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)2, (signed char)2, (signed char)2, (signed char)2>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)3, (signed char)3, (signed char)3, (signed char)3>(long long __vector(2))
969
970
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
971
inline __m128i ShuffleLo16(__m128i aM)
972
0
{
973
0
  AssertIndex<i0>();
974
0
  AssertIndex<i1>();
975
0
  AssertIndex<i2>();
976
0
  AssertIndex<i3>();
977
0
  return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
978
0
}
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleLo16<(signed char)1, (signed char)0, (signed char)1, (signed char)0>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleLo16<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleLo16<(signed char)3, (signed char)3, (signed char)3, (signed char)3>(long long __vector(2))
979
980
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
981
inline __m128i ShuffleHi16(__m128i aM)
982
0
{
983
0
  AssertIndex<i0>();
984
0
  AssertIndex<i1>();
985
0
  AssertIndex<i2>();
986
0
  AssertIndex<i3>();
987
0
  return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
988
0
}
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleHi16<(signed char)1, (signed char)0, (signed char)1, (signed char)0>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleHi16<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleHi16<(signed char)3, (signed char)3, (signed char)3, (signed char)3>(long long __vector(2))
989
990
template<int8_t aIndex>
991
inline __m128i Splat32(__m128i aM)
992
{
993
  return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
994
}
995
996
template<int8_t aIndex>
997
inline __m128i Splat32On8(__m128i aM)
998
0
{
999
0
  return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
1000
0
}
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Splat32On8<(signed char)0>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Splat32On8<(signed char)1>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Splat32On8<(signed char)2>(long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Splat32On8<(signed char)3>(long long __vector(2))
1001
1002
template<int8_t aIndexLo, int8_t aIndexHi>
1003
inline __m128i Splat16(__m128i aM)
1004
0
{
1005
0
  AssertIndex<aIndexLo>();
1006
0
  AssertIndex<aIndexHi>();
1007
0
  return ShuffleHi16<aIndexHi,aIndexHi,aIndexHi,aIndexHi>(
1008
0
           ShuffleLo16<aIndexLo,aIndexLo,aIndexLo,aIndexLo>(aM));
1009
0
}
1010
1011
// Zero-extend the low eight bytes of m into eight 16-bit lanes.
inline __m128i
UnpackLo8x8ToI16x8(__m128i m)
{
  return _mm_unpacklo_epi8(m, _mm_setzero_si128());
}
1017
1018
// Zero-extend the high eight bytes of m into eight 16-bit lanes.
inline __m128i
UnpackHi8x8ToI16x8(__m128i m)
{
  return _mm_unpackhi_epi8(m, _mm_setzero_si128());
}
1024
1025
// Zero-extend the low eight bytes of m into eight unsigned 16-bit lanes
// (same operation as the I16 variant; bytes always fit unsigned).
inline __m128i
UnpackLo8x8ToU16x8(__m128i m)
{
  return _mm_unpacklo_epi8(m, _mm_setzero_si128());
}
1031
1032
// Zero-extend the high eight bytes of m into eight unsigned 16-bit lanes.
inline __m128i
UnpackHi8x8ToU16x8(__m128i m)
{
  return _mm_unpackhi_epi8(m, _mm_setzero_si128());
}
1038
1039
// Interleave the low eight bytes of m1 and m2: m1[0], m2[0], m1[1], m2[1], ...
inline __m128i
InterleaveLo8(__m128i m1, __m128i m2)
{
  const __m128i interleaved = _mm_unpacklo_epi8(m1, m2);
  return interleaved;
}
1044
1045
// Interleave the high eight bytes of m1 and m2: m1[8], m2[8], m1[9], m2[9], ...
inline __m128i
InterleaveHi8(__m128i m1, __m128i m2)
{
  const __m128i interleaved = _mm_unpackhi_epi8(m1, m2);
  return interleaved;
}
1050
1051
// Interleave the low four 16-bit lanes of m1 and m2.
inline __m128i
InterleaveLo16(__m128i m1, __m128i m2)
{
  const __m128i interleaved = _mm_unpacklo_epi16(m1, m2);
  return interleaved;
}
1056
1057
// Interleave the high four 16-bit lanes of m1 and m2.
inline __m128i
InterleaveHi16(__m128i m1, __m128i m2)
{
  const __m128i interleaved = _mm_unpackhi_epi16(m1, m2);
  return interleaved;
}
1062
1063
// Interleave the low two 32-bit lanes of m1 and m2.
inline __m128i
InterleaveLo32(__m128i m1, __m128i m2)
{
  const __m128i interleaved = _mm_unpacklo_epi32(m1, m2);
  return interleaved;
}
1068
1069
// Treat (a1234, a5678) as one 32-byte stream and return the 16 bytes starting
// aNumBytes into it: the tail of a1234 joined with the head of a5678.
template<uint8_t aNumBytes>
inline __m128i
Rotate8(__m128i a1234, __m128i a5678)
{
  const __m128i tailOfFirst = _mm_srli_si128(a1234, aNumBytes);
  const __m128i headOfSecond = _mm_slli_si128(a5678, 16 - aNumBytes);
  return _mm_or_si128(tailOfFirst, headOfSecond);
}
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Rotate8<(unsigned char)4>(long long __vector(2), long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Rotate8<(unsigned char)8>(long long __vector(2), long long __vector(2))
Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Rotate8<(unsigned char)12>(long long __vector(2), long long __vector(2))
1075
1076
// Pack eight 32-bit lanes into eight signed 16-bit lanes, saturating to
// [-32768, 32767].
inline __m128i
PackAndSaturate32To16(__m128i m1, __m128i m2)
{
  const __m128i packed = _mm_packs_epi32(m1, m2);
  return packed;
}
1081
1082
// Pack eight 32-bit lanes into eight 16-bit lanes. SSE2 has no unsigned
// 32->16 pack (packus_epi32 is SSE4.1), so this uses signed saturation;
// NOTE(review): values above 32767 therefore clamp to 32767, not 65535 —
// presumably callers only pass values in the signed 16-bit range.
inline __m128i
PackAndSaturate32ToU16(__m128i m1, __m128i m2)
{
  const __m128i packed = _mm_packs_epi32(m1, m2);
  return packed;
}
1087
1088
// Pack sixteen 32-bit lanes into sixteen unsigned bytes, saturating to
// [0, 255]: first 32->16 with signed saturation, then 16->8 with unsigned
// saturation.
inline __m128i
PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3, const __m128i& m4)
{
  const __m128i firstHalf = _mm_packs_epi32(m1, m2);
  const __m128i secondHalf = _mm_packs_epi32(m3, m4);
  return _mm_packus_epi16(firstHalf, secondHalf);
}
1098
1099
// Pack sixteen signed 16-bit lanes into sixteen unsigned bytes, saturating
// to [0, 255].
inline __m128i
PackAndSaturate16To8(__m128i m1, __m128i m2)
{
  const __m128i packed = _mm_packus_epi16(m1, m2);
  return packed;
}
1105
1106
// Approximate division of each 32-bit lane by 255 using the identity
// x / 255 ~= (x * 257 + 255) >> 16, computed as ((x << 8) + x + 255) >> 16.
inline __m128i
FastDivideBy255(__m128i m)
{
  const __m128i timés256 = _mm_slli_epi32(m, 8);
  const __m128i biased = _mm_add_epi32(m, _mm_set1_epi32(255));
  const __m128i numerator = _mm_add_epi32(timés256, biased);
  return _mm_srai_epi32(numerator, 16);
}
1116
1117
inline __m128i
1118
FastDivideBy255_16(__m128i m)
1119
0
{
1120
0
  __m128i zero = _mm_set1_epi16(0);
1121
0
  __m128i lo = _mm_unpacklo_epi16(m, zero);
1122
0
  __m128i hi = _mm_unpackhi_epi16(m, zero);
1123
0
  return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
1124
0
}
1125
1126
// Bitwise select: result takes bits of b where mask bits are 1 and bits of a
// where mask bits are 0.
inline __m128i
Pick(__m128i mask, __m128i a, __m128i b)
{
  const __m128i fromA = _mm_andnot_si128(mask, a);
  const __m128i fromB = _mm_and_si128(mask, b);
  return _mm_or_si128(fromA, fromB);
}
1131
1132
// Lane-wise linear interpolation: a + (b - a) * t.
inline __m128 MixF32(__m128 a, __m128 b, float t)
{
  const __m128 delta = _mm_sub_ps(b, a);
  const __m128 scaled = _mm_mul_ps(delta, _mm_set1_ps(t));
  return _mm_add_ps(a, scaled);
}
1136
1137
// Lane-wise weighted sum: a * wa + b * wb.
inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb)
{
  const __m128 weightedA = _mm_mul_ps(a, _mm_set1_ps(wa));
  const __m128 weightedB = _mm_mul_ps(b, _mm_set1_ps(wb));
  return _mm_add_ps(weightedA, weightedB);
}
1141
1142
// Lane-wise absolute value, computed as max(-a, a).
inline __m128 AbsF32(__m128 a)
{
  const __m128 negated = _mm_sub_ps(_mm_setzero_ps(), a);
  return _mm_max_ps(negated, a);
}
1146
1147
// Lane-wise float addition.
inline __m128 AddF32(__m128 a, __m128 b)
{
  const __m128 sum = _mm_add_ps(a, b);
  return sum;
}
1151
1152
// Lane-wise float multiplication.
inline __m128 MulF32(__m128 a, __m128 b)
{
  const __m128 product = _mm_mul_ps(a, b);
  return product;
}
1156
1157
// Lane-wise float division.
inline __m128 DivF32(__m128 a, __m128 b)
{
  const __m128 quotient = _mm_div_ps(a, b);
  return quotient;
}
1161
1162
template<uint8_t aIndex>
1163
inline __m128 SplatF32(__m128 m)
1164
0
{
1165
0
  AssertIndex<aIndex>();
1166
0
  return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
1167
0
}
1168
1169
// Convert each float lane to int32 using the current rounding mode
// (round-to-nearest-even by default).
inline __m128i F32ToI32(__m128 m)
{
  const __m128i converted = _mm_cvtps_epi32(m);
  return converted;
}
1173
1174
#endif // SIMD_COMPILE_SSE2
1175
1176
} // namespace simd
1177
1178
} // namespace gfx
1179
} // namespace mozilla
1180
1181
#endif // _MOZILLA_GFX_SIMD_H_