/src/mozilla-central/gfx/2d/SIMD.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* vim: set ts=8 sts=2 et sw=2 tw=80: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | |
7 | | #ifndef _MOZILLA_GFX_SIMD_H_ |
8 | | #define _MOZILLA_GFX_SIMD_H_ |
9 | | |
10 | | /** |
11 | | * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it |
12 | | * if they want access to the SSE2 functions. |
13 | | */ |
14 | | |
#include <math.h> /* fabs / floor used by the scalar float implementations */

#ifdef SIMD_COMPILE_SSE2
#include <xmmintrin.h>
// The SSE2 intrinsics used below (_mm_load_si128, _mm_setzero_si128, ...)
// are declared in emmintrin.h; xmmintrin.h alone only guarantees the SSE1
// set on all toolchains.
#include <emmintrin.h>
#endif
18 | | |
19 | | namespace mozilla { |
20 | | namespace gfx { |
21 | | |
22 | | namespace simd { |
23 | | |
// Load 16 bytes from aSource into a vector of sixteen u8 lanes.
template<typename u8x16_t>
u8x16_t Load8(const uint8_t* aSource);

// Build a sixteen-lane u8 vector from sixteen scalar bytes (lane order a..p).
template<typename u8x16_t>
u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
              uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p);

// All-zero sixteen-lane u8 vector.
template<typename u8x16_t>
u8x16_t FromZero8();

// Build an eight-lane i16 vector from eight scalars.
template<typename i16x8_t>
i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h);

// Build an eight-lane u16 vector from eight scalars.
template<typename u16x8_t>
u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h);

// Broadcast a single i16 value to all eight lanes.
template<typename i16x8_t>
i16x8_t FromI16(int16_t a);

// Broadcast a single u16 value to all eight lanes.
template<typename u16x8_t>
u16x8_t FromU16(uint16_t a);

// Build a four-lane i32 vector from four scalars.
template<typename i32x4_t>
i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);

// Broadcast a single i32 value to all four lanes.
template<typename i32x4_t>
i32x4_t From32(int32_t a);

// Build a four-lane float vector from four scalars.
template<typename f32x4_t>
f32x4_t FromF32(float a, float b, float c, float d);

// Broadcast a single float value to all four lanes.
template<typename f32x4_t>
f32x4_t FromF32(float a);
57 | | |
58 | | // All SIMD backends overload these functions for their SIMD types: |
59 | | |
60 | | #if 0 |
61 | | |
62 | | // Store 16 bytes to a 16-byte aligned address |
63 | | void Store8(uint8_t* aTarget, u8x16_t aM); |
64 | | |
65 | | // Fixed shifts |
66 | | template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM); |
67 | | template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM); |
68 | | |
69 | | i16x8_t Add16(i16x8_t aM1, i16x8_t aM2); |
70 | | i32x4_t Add32(i32x4_t aM1, i32x4_t aM2); |
71 | | i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2); |
72 | | i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2); |
u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
75 | | i32x4_t Min32(i32x4_t aM1, i32x4_t aM2); |
76 | | i32x4_t Max32(i32x4_t aM1, i32x4_t aM2); |
77 | | |
78 | | // Truncating i16 -> i16 multiplication |
79 | | i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2); |
80 | | |
81 | | // Long multiplication i16 -> i32 |
82 | | // aFactorsA1B1 = (a1[4] b1[4]) |
83 | | // aFactorsA2B2 = (a2[4] b2[4]) |
84 | | // aProductA = a1 * a2, aProductB = b1 * b2 |
85 | | void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2, |
86 | | i32x4_t& aProductA, i32x4_t& aProductB); |
87 | | |
88 | | // Long multiplication + pairwise addition i16 -> i32 |
89 | | // See the scalar implementation for specifics. |
90 | | i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB); |
91 | | i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB); |
92 | | |
93 | | // Set all four 32-bit components to the value of the component at aIndex. |
94 | | template<int8_t aIndex> |
95 | | i32x4_t Splat32(i32x4_t aM); |
96 | | |
97 | | // Interpret the input as four 32-bit values, apply Splat32<aIndex> on them, |
98 | | // re-interpret the result as sixteen 8-bit values. |
99 | | template<int8_t aIndex> |
100 | | u8x16_t Splat32On8(u8x16_t aM); |
101 | | |
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);
105 | | |
106 | | u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2); |
107 | | u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2); |
108 | | i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2); |
109 | | i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2); |
110 | | i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2); |
111 | | |
112 | | i16x8_t UnpackLo8x8ToI16x8(u8x16_t m); |
113 | | i16x8_t UnpackHi8x8ToI16x8(u8x16_t m); |
114 | | u16x8_t UnpackLo8x8ToU16x8(u8x16_t m); |
115 | | u16x8_t UnpackHi8x8ToU16x8(u8x16_t m); |
116 | | |
117 | | i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2); |
118 | | u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2); |
119 | | u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4); |
120 | | |
i32x4_t FastDivideBy255(i32x4_t m);
i16x8_t FastDivideBy255_16(i16x8_t m);
123 | | |
124 | | #endif |
125 | | |
126 | | // Scalar |
127 | | |
// Sixteen unsigned byte lanes (scalar stand-in for an SSE __m128i of u8).
struct Scalaru8x16_t {
  uint8_t u8[16];
};

// Eight 16-bit lanes, viewable as either signed or unsigned through the
// union members; the signed/unsigned flavors share one representation.
union Scalari16x8_t {
  int16_t i16[8];
  uint16_t u16[8];
};

// The unsigned 16-bit vector is the same type as the signed one; callers
// select the view via the union member they touch.
typedef Scalari16x8_t Scalaru16x8_t;

// Four signed 32-bit lanes.
struct Scalari32x4_t {
  int32_t i32[4];
};

// Four single-precision float lanes.
struct Scalarf32x4_t {
  float f32[4];
};
146 | | |
147 | | template<> |
148 | | inline Scalaru8x16_t |
149 | | Load8<Scalaru8x16_t>(const uint8_t* aSource) |
150 | 0 | { |
151 | 0 | return *(Scalaru8x16_t*)aSource; |
152 | 0 | } |
153 | | |
154 | | inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM) |
155 | 0 | { |
156 | 0 | *(Scalaru8x16_t*)aTarget = aM; |
157 | 0 | } |
158 | | |
159 | | template<> |
160 | | inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
161 | | uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) |
162 | 0 | { |
163 | 0 | Scalaru8x16_t _m; |
164 | 0 | _m.u8[0] = a; |
165 | 0 | _m.u8[1] = b; |
166 | 0 | _m.u8[2] = c; |
167 | 0 | _m.u8[3] = d; |
168 | 0 | _m.u8[4] = e; |
169 | 0 | _m.u8[5] = f; |
170 | 0 | _m.u8[6] = g; |
171 | 0 | _m.u8[7] = h; |
172 | 0 | _m.u8[8+0] = i; |
173 | 0 | _m.u8[8+1] = j; |
174 | 0 | _m.u8[8+2] = k; |
175 | 0 | _m.u8[8+3] = l; |
176 | 0 | _m.u8[8+4] = m; |
177 | 0 | _m.u8[8+5] = n; |
178 | 0 | _m.u8[8+6] = o; |
179 | 0 | _m.u8[8+7] = p; |
180 | 0 | return _m; |
181 | 0 | } |
182 | | |
183 | | template<> |
184 | | inline Scalaru8x16_t FromZero8<Scalaru8x16_t>() |
185 | 0 | { |
186 | 0 | return From8<Scalaru8x16_t>(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); |
187 | 0 | } |
188 | | |
189 | | template<> |
190 | | inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h) |
191 | 0 | { |
192 | 0 | Scalari16x8_t m; |
193 | 0 | m.i16[0] = a; |
194 | 0 | m.i16[1] = b; |
195 | 0 | m.i16[2] = c; |
196 | 0 | m.i16[3] = d; |
197 | 0 | m.i16[4] = e; |
198 | 0 | m.i16[5] = f; |
199 | 0 | m.i16[6] = g; |
200 | 0 | m.i16[7] = h; |
201 | 0 | return m; |
202 | 0 | } |
203 | | |
204 | | template<> |
205 | | inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) |
206 | 0 | { |
207 | 0 | Scalaru16x8_t m; |
208 | 0 | m.u16[0] = a; |
209 | 0 | m.u16[1] = b; |
210 | 0 | m.u16[2] = c; |
211 | 0 | m.u16[3] = d; |
212 | 0 | m.u16[4] = e; |
213 | 0 | m.u16[5] = f; |
214 | 0 | m.u16[6] = g; |
215 | 0 | m.u16[7] = h; |
216 | 0 | return m; |
217 | 0 | } |
218 | | |
219 | | template<> |
220 | | inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a) |
221 | 0 | { |
222 | 0 | return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a); |
223 | 0 | } |
224 | | |
225 | | template<> |
226 | | inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a) |
227 | 0 | { |
228 | 0 | return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a); |
229 | 0 | } |
230 | | |
231 | | template<> |
232 | | inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c, int32_t d) |
233 | 0 | { |
234 | 0 | Scalari32x4_t m; |
235 | 0 | m.i32[0] = a; |
236 | 0 | m.i32[1] = b; |
237 | 0 | m.i32[2] = c; |
238 | 0 | m.i32[3] = d; |
239 | 0 | return m; |
240 | 0 | } |
241 | | |
242 | | template<> |
243 | | inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c, float d) |
244 | 0 | { |
245 | 0 | Scalarf32x4_t m; |
246 | 0 | m.f32[0] = a; |
247 | 0 | m.f32[1] = b; |
248 | 0 | m.f32[2] = c; |
249 | 0 | m.f32[3] = d; |
250 | 0 | return m; |
251 | 0 | } |
252 | | |
253 | | template<> |
254 | | inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a) |
255 | 0 | { |
256 | 0 | return FromF32<Scalarf32x4_t>(a, a, a, a); |
257 | 0 | } |
258 | | |
259 | | template<> |
260 | | inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a) |
261 | 0 | { |
262 | 0 | return From32<Scalari32x4_t>(a, a, a, a); |
263 | 0 | } |
264 | | |
265 | | template<int32_t aNumberOfBits> |
266 | | inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM) |
267 | | { |
268 | | return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits, uint16_t(aM.i16[1]) >> aNumberOfBits, |
269 | | uint16_t(aM.i16[2]) >> aNumberOfBits, uint16_t(aM.i16[3]) >> aNumberOfBits, |
270 | | uint16_t(aM.i16[4]) >> aNumberOfBits, uint16_t(aM.i16[5]) >> aNumberOfBits, |
271 | | uint16_t(aM.i16[6]) >> aNumberOfBits, uint16_t(aM.i16[7]) >> aNumberOfBits); |
272 | | } |
273 | | |
274 | | template<int32_t aNumberOfBits> |
275 | | inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM) |
276 | 0 | { |
277 | 0 | return From32<Scalari32x4_t>(aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits, |
278 | 0 | aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits); |
279 | 0 | } |
280 | | |
281 | | inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) |
282 | 0 | { |
283 | 0 | return FromU16<Scalaru16x8_t>(aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1], |
284 | 0 | aM1.u16[2] + aM2.u16[2], aM1.u16[3] + aM2.u16[3], |
285 | 0 | aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5], |
286 | 0 | aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]); |
287 | 0 | } |
288 | | |
289 | | inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2) |
290 | 0 | { |
291 | 0 | return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1], |
292 | 0 | aM1.i32[2] + aM2.i32[2], aM1.i32[3] + aM2.i32[3]); |
293 | 0 | } |
294 | | |
295 | | inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) |
296 | 0 | { |
297 | 0 | return FromU16<Scalaru16x8_t>(aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1], |
298 | 0 | aM1.u16[2] - aM2.u16[2], aM1.u16[3] - aM2.u16[3], |
299 | 0 | aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5], |
300 | 0 | aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]); |
301 | 0 | } |
302 | | |
303 | | inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2) |
304 | 0 | { |
305 | 0 | return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1], |
306 | 0 | aM1.i32[2] - aM2.i32[2], aM1.i32[3] - aM2.i32[3]); |
307 | 0 | } |
308 | | |
// Scalar min of two int32_t values.
//
// The previous implementation used the branchless identity
// a - ((a - b) & -(a > b)), which computes a - b and therefore hits
// signed-integer overflow (undefined behavior, and wrong results in
// practice) whenever the operands are far apart — e.g. umin(INT32_MAX, -1).
// A plain conditional is well-defined for every input pair and compiles to
// a cmov on modern compilers anyway.
//
// NOTE(review): despite the "u" prefix the operands are signed; the names
// are kept for source compatibility with existing callers.
inline int32_t
umin(int32_t a, int32_t b)
{
  return a < b ? a : b;
}

// Scalar max of two int32_t values (see umin for rationale).
inline int32_t
umax(int32_t a, int32_t b)
{
  return a > b ? a : b;
}
320 | | |
321 | | inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) |
322 | 0 | { |
323 | 0 | return From8<Scalaru8x16_t>(umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]), |
324 | 0 | umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]), |
325 | 0 | umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]), |
326 | 0 | umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]), |
327 | 0 | umin(aM1.u8[8+0], aM2.u8[8+0]), umin(aM1.u8[8+1], aM2.u8[8+1]), |
328 | 0 | umin(aM1.u8[8+2], aM2.u8[8+2]), umin(aM1.u8[8+3], aM2.u8[8+3]), |
329 | 0 | umin(aM1.u8[8+4], aM2.u8[8+4]), umin(aM1.u8[8+5], aM2.u8[8+5]), |
330 | 0 | umin(aM1.u8[8+6], aM2.u8[8+6]), umin(aM1.u8[8+7], aM2.u8[8+7])); |
331 | 0 | } |
332 | | |
333 | | inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) |
334 | 0 | { |
335 | 0 | return From8<Scalaru8x16_t>(umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]), |
336 | 0 | umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]), |
337 | 0 | umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]), |
338 | 0 | umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]), |
339 | 0 | umax(aM1.u8[8+0], aM2.u8[8+0]), umax(aM1.u8[8+1], aM2.u8[8+1]), |
340 | 0 | umax(aM1.u8[8+2], aM2.u8[8+2]), umax(aM1.u8[8+3], aM2.u8[8+3]), |
341 | 0 | umax(aM1.u8[8+4], aM2.u8[8+4]), umax(aM1.u8[8+5], aM2.u8[8+5]), |
342 | 0 | umax(aM1.u8[8+6], aM2.u8[8+6]), umax(aM1.u8[8+7], aM2.u8[8+7])); |
343 | 0 | } |
344 | | |
345 | | inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2) |
346 | 0 | { |
347 | 0 | return From32<Scalari32x4_t>(umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]), |
348 | 0 | umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3])); |
349 | 0 | } |
350 | | |
351 | | inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2) |
352 | 0 | { |
353 | 0 | return From32<Scalari32x4_t>(umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]), |
354 | 0 | umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3])); |
355 | 0 | } |
356 | | |
357 | | inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) |
358 | 0 | { |
359 | 0 | return FromU16<Scalaru16x8_t>(uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])), uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])), |
360 | 0 | uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])), uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])), |
361 | 0 | uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])), uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])), |
362 | 0 | uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])), uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7]))); |
363 | 0 | } |
364 | | |
365 | | inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1, |
366 | | Scalari16x8_t aFactorsA2B2, |
367 | | Scalari32x4_t& aProductA, |
368 | | Scalari32x4_t& aProductB) |
369 | 0 | { |
370 | 0 | aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0], |
371 | 0 | aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1], |
372 | 0 | aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2], |
373 | 0 | aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]); |
374 | 0 | aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4], |
375 | 0 | aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5], |
376 | 0 | aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6], |
377 | 0 | aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]); |
378 | 0 | } |
379 | | |
380 | | inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA, |
381 | | Scalari16x8_t aFactorsB) |
382 | 0 | { |
383 | 0 | return From32<Scalari32x4_t>(aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1], |
384 | 0 | aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3], |
385 | 0 | aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5], |
386 | 0 | aFactorsA.i16[6] * aFactorsB.i16[6] + aFactorsA.i16[7] * aFactorsB.i16[7]); |
387 | 0 | } |
388 | | |
389 | | template<int8_t aIndex> |
390 | | inline void AssertIndex() |
391 | 0 | { |
392 | 0 | static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3, |
393 | 0 | "Invalid splat index"); |
394 | 0 | } Unexecuted instantiation: void mozilla::gfx::simd::AssertIndex<(signed char)3>() Unexecuted instantiation: void mozilla::gfx::simd::AssertIndex<(signed char)2>() Unexecuted instantiation: void mozilla::gfx::simd::AssertIndex<(signed char)0>() Unexecuted instantiation: void mozilla::gfx::simd::AssertIndex<(signed char)1>() |
395 | | |
396 | | template<int8_t aIndex> |
397 | | inline Scalari32x4_t Splat32(Scalari32x4_t aM) |
398 | | { |
399 | | AssertIndex<aIndex>(); |
400 | | return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex], |
401 | | aM.i32[aIndex], aM.i32[aIndex]); |
402 | | } |
403 | | |
404 | | template<int8_t i> |
405 | | inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM) |
406 | 0 | { |
407 | 0 | AssertIndex<i>(); |
408 | 0 | return From8<Scalaru8x16_t>(aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3], |
409 | 0 | aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3], |
410 | 0 | aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3], |
411 | 0 | aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3]); |
412 | 0 | } Unexecuted instantiation: mozilla::gfx::simd::Scalaru8x16_t mozilla::gfx::simd::Splat32On8<(signed char)0>(mozilla::gfx::simd::Scalaru8x16_t) Unexecuted instantiation: mozilla::gfx::simd::Scalaru8x16_t mozilla::gfx::simd::Splat32On8<(signed char)1>(mozilla::gfx::simd::Scalaru8x16_t) Unexecuted instantiation: mozilla::gfx::simd::Scalaru8x16_t mozilla::gfx::simd::Splat32On8<(signed char)2>(mozilla::gfx::simd::Scalaru8x16_t) Unexecuted instantiation: mozilla::gfx::simd::Scalaru8x16_t mozilla::gfx::simd::Splat32On8<(signed char)3>(mozilla::gfx::simd::Scalaru8x16_t) |
413 | | |
414 | | template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
415 | | inline Scalari32x4_t Shuffle32(Scalari32x4_t aM) |
416 | | { |
417 | | AssertIndex<i0>(); |
418 | | AssertIndex<i1>(); |
419 | | AssertIndex<i2>(); |
420 | | AssertIndex<i3>(); |
421 | | Scalari32x4_t m = aM; |
422 | | m.i32[0] = aM.i32[i3]; |
423 | | m.i32[1] = aM.i32[i2]; |
424 | | m.i32[2] = aM.i32[i1]; |
425 | | m.i32[3] = aM.i32[i0]; |
426 | | return m; |
427 | | } |
428 | | |
429 | | template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
430 | | inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM) |
431 | 0 | { |
432 | 0 | AssertIndex<i0>(); |
433 | 0 | AssertIndex<i1>(); |
434 | 0 | AssertIndex<i2>(); |
435 | 0 | AssertIndex<i3>(); |
436 | 0 | Scalari16x8_t m = aM; |
437 | 0 | m.i16[0] = aM.i16[i3]; |
438 | 0 | m.i16[1] = aM.i16[i2]; |
439 | 0 | m.i16[2] = aM.i16[i1]; |
440 | 0 | m.i16[3] = aM.i16[i0]; |
441 | 0 | return m; |
442 | 0 | } Unexecuted instantiation: mozilla::gfx::simd::Scalari16x8_t mozilla::gfx::simd::ShuffleLo16<(signed char)1, (signed char)0, (signed char)1, (signed char)0>(mozilla::gfx::simd::Scalari16x8_t) Unexecuted instantiation: mozilla::gfx::simd::Scalari16x8_t mozilla::gfx::simd::ShuffleLo16<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(mozilla::gfx::simd::Scalari16x8_t) |
443 | | |
444 | | template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
445 | | inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM) |
446 | 0 | { |
447 | 0 | AssertIndex<i0>(); |
448 | 0 | AssertIndex<i1>(); |
449 | 0 | AssertIndex<i2>(); |
450 | 0 | AssertIndex<i3>(); |
451 | 0 | Scalari16x8_t m = aM; |
452 | 0 | m.i16[4 + 0] = aM.i16[4 + i3]; |
453 | 0 | m.i16[4 + 1] = aM.i16[4 + i2]; |
454 | 0 | m.i16[4 + 2] = aM.i16[4 + i1]; |
455 | 0 | m.i16[4 + 3] = aM.i16[4 + i0]; |
456 | 0 | return m; |
457 | 0 | } Unexecuted instantiation: mozilla::gfx::simd::Scalari16x8_t mozilla::gfx::simd::ShuffleHi16<(signed char)1, (signed char)0, (signed char)1, (signed char)0>(mozilla::gfx::simd::Scalari16x8_t) Unexecuted instantiation: mozilla::gfx::simd::Scalari16x8_t mozilla::gfx::simd::ShuffleHi16<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(mozilla::gfx::simd::Scalari16x8_t) |
458 | | |
459 | | template<int8_t aIndexLo, int8_t aIndexHi> |
460 | | inline Scalaru16x8_t Splat16(Scalaru16x8_t aM) |
461 | 0 | { |
462 | 0 | AssertIndex<aIndexLo>(); |
463 | 0 | AssertIndex<aIndexHi>(); |
464 | 0 | Scalaru16x8_t m; |
465 | 0 | int16_t chosenValueLo = aM.u16[aIndexLo]; |
466 | 0 | m.u16[0] = chosenValueLo; |
467 | 0 | m.u16[1] = chosenValueLo; |
468 | 0 | m.u16[2] = chosenValueLo; |
469 | 0 | m.u16[3] = chosenValueLo; |
470 | 0 | int16_t chosenValueHi = aM.u16[4 + aIndexHi]; |
471 | 0 | m.u16[4] = chosenValueHi; |
472 | 0 | m.u16[5] = chosenValueHi; |
473 | 0 | m.u16[6] = chosenValueHi; |
474 | 0 | m.u16[7] = chosenValueHi; |
475 | 0 | return m; |
476 | 0 | } |
477 | | |
478 | | inline Scalaru8x16_t |
479 | | InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2) |
480 | 0 | { |
481 | 0 | return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1], |
482 | 0 | m1.u8[2], m2.u8[2], m1.u8[3], m2.u8[3], |
483 | 0 | m1.u8[4], m2.u8[4], m1.u8[5], m2.u8[5], |
484 | 0 | m1.u8[6], m2.u8[6], m1.u8[7], m2.u8[7]); |
485 | 0 | } |
486 | | |
487 | | inline Scalaru8x16_t |
488 | | InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2) |
489 | 0 | { |
490 | 0 | return From8<Scalaru8x16_t>(m1.u8[8+0], m2.u8[8+0], m1.u8[8+1], m2.u8[8+1], |
491 | 0 | m1.u8[8+2], m2.u8[8+2], m1.u8[8+3], m2.u8[8+3], |
492 | 0 | m1.u8[8+4], m2.u8[8+4], m1.u8[8+5], m2.u8[8+5], |
493 | 0 | m1.u8[8+6], m2.u8[8+6], m1.u8[8+7], m2.u8[8+7]); |
494 | 0 | } |
495 | | |
496 | | inline Scalaru16x8_t |
497 | | InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2) |
498 | 0 | { |
499 | 0 | return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1], |
500 | 0 | m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]); |
501 | 0 | } |
502 | | |
503 | | inline Scalaru16x8_t |
504 | | InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2) |
505 | 0 | { |
506 | 0 | return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5], |
507 | 0 | m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]); |
508 | 0 | } |
509 | | |
510 | | inline Scalari32x4_t |
511 | | InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2) |
512 | 0 | { |
513 | 0 | return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]); |
514 | 0 | } |
515 | | |
516 | | inline Scalari16x8_t |
517 | | UnpackLo8x8ToI16x8(Scalaru8x16_t aM) |
518 | 0 | { |
519 | 0 | Scalari16x8_t m; |
520 | 0 | m.i16[0] = aM.u8[0]; |
521 | 0 | m.i16[1] = aM.u8[1]; |
522 | 0 | m.i16[2] = aM.u8[2]; |
523 | 0 | m.i16[3] = aM.u8[3]; |
524 | 0 | m.i16[4] = aM.u8[4]; |
525 | 0 | m.i16[5] = aM.u8[5]; |
526 | 0 | m.i16[6] = aM.u8[6]; |
527 | 0 | m.i16[7] = aM.u8[7]; |
528 | 0 | return m; |
529 | 0 | } |
530 | | |
531 | | inline Scalari16x8_t |
532 | | UnpackHi8x8ToI16x8(Scalaru8x16_t aM) |
533 | 0 | { |
534 | 0 | Scalari16x8_t m; |
535 | 0 | m.i16[0] = aM.u8[8+0]; |
536 | 0 | m.i16[1] = aM.u8[8+1]; |
537 | 0 | m.i16[2] = aM.u8[8+2]; |
538 | 0 | m.i16[3] = aM.u8[8+3]; |
539 | 0 | m.i16[4] = aM.u8[8+4]; |
540 | 0 | m.i16[5] = aM.u8[8+5]; |
541 | 0 | m.i16[6] = aM.u8[8+6]; |
542 | 0 | m.i16[7] = aM.u8[8+7]; |
543 | 0 | return m; |
544 | 0 | } |
545 | | |
546 | | inline Scalaru16x8_t |
547 | | UnpackLo8x8ToU16x8(Scalaru8x16_t aM) |
548 | 0 | { |
549 | 0 | return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]), uint16_t(aM.u8[2]), uint16_t(aM.u8[3]), |
550 | 0 | uint16_t(aM.u8[4]), uint16_t(aM.u8[5]), uint16_t(aM.u8[6]), uint16_t(aM.u8[7])); |
551 | 0 | } |
552 | | |
553 | | inline Scalaru16x8_t |
554 | | UnpackHi8x8ToU16x8(Scalaru8x16_t aM) |
555 | 0 | { |
556 | 0 | return FromU16<Scalaru16x8_t>(aM.u8[8+0], aM.u8[8+1], aM.u8[8+2], aM.u8[8+3], |
557 | 0 | aM.u8[8+4], aM.u8[8+5], aM.u8[8+6], aM.u8[8+7]); |
558 | 0 | } |
559 | | |
// Extract 16 consecutive bytes starting at offset aNumBytes within the
// 32-byte concatenation (a1234 ++ a5678): with aNumBytes == 0 the result is
// a1234, with 16 it is a5678 (an alignr-style byte shift across two vectors,
// not an in-place rotate despite the name).
// NOTE(review): assumes aNumBytes <= 16 — for larger values sourceByte - 16
// would index past the end of a5678; confirm callers never exceed 16.
template<uint8_t aNumBytes>
inline Scalaru8x16_t
Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678)
{
  Scalaru8x16_t m;
  for (uint8_t i = 0; i < 16; i++) {
    // Bytes past the end of a1234 come from the start of a5678.
    uint8_t sourceByte = i + aNumBytes;
    m.u8[i] = sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
  }
  return m;
}
571 | | |
// Clamp a value into [INT16_MIN, INT16_MAX] and narrow it to int16_t.
template<typename T>
inline int16_t
SaturateTo16(T a)
{
  if (a < INT16_MIN) {
    return INT16_MIN;
  }
  if (a > INT16_MAX) {
    return INT16_MAX;
  }
  return int16_t(a);
}
578 | | |
579 | | inline Scalari16x8_t |
580 | | PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2) |
581 | 0 | { |
582 | 0 | Scalari16x8_t m; |
583 | 0 | m.i16[0] = SaturateTo16(m1.i32[0]); |
584 | 0 | m.i16[1] = SaturateTo16(m1.i32[1]); |
585 | 0 | m.i16[2] = SaturateTo16(m1.i32[2]); |
586 | 0 | m.i16[3] = SaturateTo16(m1.i32[3]); |
587 | 0 | m.i16[4] = SaturateTo16(m2.i32[0]); |
588 | 0 | m.i16[5] = SaturateTo16(m2.i32[1]); |
589 | 0 | m.i16[6] = SaturateTo16(m2.i32[2]); |
590 | 0 | m.i16[7] = SaturateTo16(m2.i32[3]); |
591 | 0 | return m; |
592 | 0 | } |
593 | | |
// Clamp a signed value to [0, INT16_MAX] and return it as uint16_t.
// NOTE(review): the upper bound is INT16_MAX (32767), not UINT16_MAX —
// presumably to match the vector backend's signed-pack behavior; confirm
// before relying on the full unsigned 16-bit range.
template<typename T>
inline uint16_t
SaturateToU16(T a)
{
  // a & -(a >= 0) is a branchless max(a, 0): the mask -(a >= 0) is all-ones
  // when a is non-negative and zero otherwise.
  return uint16_t(umin(a & -(a >= 0), INT16_MAX));
}
600 | | |
601 | | inline Scalaru16x8_t |
602 | | PackAndSaturate32ToU16(Scalari32x4_t m1, Scalari32x4_t m2) |
603 | 0 | { |
604 | 0 | Scalaru16x8_t m; |
605 | 0 | m.u16[0] = SaturateToU16(m1.i32[0]); |
606 | 0 | m.u16[1] = SaturateToU16(m1.i32[1]); |
607 | 0 | m.u16[2] = SaturateToU16(m1.i32[2]); |
608 | 0 | m.u16[3] = SaturateToU16(m1.i32[3]); |
609 | 0 | m.u16[4] = SaturateToU16(m2.i32[0]); |
610 | 0 | m.u16[5] = SaturateToU16(m2.i32[1]); |
611 | 0 | m.u16[6] = SaturateToU16(m2.i32[2]); |
612 | 0 | m.u16[7] = SaturateToU16(m2.i32[3]); |
613 | 0 | return m; |
614 | 0 | } |
615 | | |
// Clamp a signed value to [0, 255] and return it as uint8_t.
template<typename T>
inline uint8_t
SaturateTo8(T a)
{
  // a & -(a >= 0) zeroes negative inputs without a branch; umin then caps
  // the result at 255.
  return uint8_t(umin(a & -(a >= 0), 255));
}
622 | | |
623 | | inline Scalaru8x16_t |
624 | | PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2, Scalari32x4_t m3, const Scalari32x4_t& m4) |
625 | 0 | { |
626 | 0 | Scalaru8x16_t m; |
627 | 0 | m.u8[0] = SaturateTo8(m1.i32[0]); |
628 | 0 | m.u8[1] = SaturateTo8(m1.i32[1]); |
629 | 0 | m.u8[2] = SaturateTo8(m1.i32[2]); |
630 | 0 | m.u8[3] = SaturateTo8(m1.i32[3]); |
631 | 0 | m.u8[4] = SaturateTo8(m2.i32[0]); |
632 | 0 | m.u8[5] = SaturateTo8(m2.i32[1]); |
633 | 0 | m.u8[6] = SaturateTo8(m2.i32[2]); |
634 | 0 | m.u8[7] = SaturateTo8(m2.i32[3]); |
635 | 0 | m.u8[8] = SaturateTo8(m3.i32[0]); |
636 | 0 | m.u8[9] = SaturateTo8(m3.i32[1]); |
637 | 0 | m.u8[10] = SaturateTo8(m3.i32[2]); |
638 | 0 | m.u8[11] = SaturateTo8(m3.i32[3]); |
639 | 0 | m.u8[12] = SaturateTo8(m4.i32[0]); |
640 | 0 | m.u8[13] = SaturateTo8(m4.i32[1]); |
641 | 0 | m.u8[14] = SaturateTo8(m4.i32[2]); |
642 | 0 | m.u8[15] = SaturateTo8(m4.i32[3]); |
643 | 0 | return m; |
644 | 0 | } |
645 | | |
646 | | inline Scalaru8x16_t |
647 | | PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2) |
648 | 0 | { |
649 | 0 | Scalaru8x16_t m; |
650 | 0 | m.u8[0] = SaturateTo8(m1.i16[0]); |
651 | 0 | m.u8[1] = SaturateTo8(m1.i16[1]); |
652 | 0 | m.u8[2] = SaturateTo8(m1.i16[2]); |
653 | 0 | m.u8[3] = SaturateTo8(m1.i16[3]); |
654 | 0 | m.u8[4] = SaturateTo8(m1.i16[4]); |
655 | 0 | m.u8[5] = SaturateTo8(m1.i16[5]); |
656 | 0 | m.u8[6] = SaturateTo8(m1.i16[6]); |
657 | 0 | m.u8[7] = SaturateTo8(m1.i16[7]); |
658 | 0 | m.u8[8] = SaturateTo8(m2.i16[0]); |
659 | 0 | m.u8[9] = SaturateTo8(m2.i16[1]); |
660 | 0 | m.u8[10] = SaturateTo8(m2.i16[2]); |
661 | 0 | m.u8[11] = SaturateTo8(m2.i16[3]); |
662 | 0 | m.u8[12] = SaturateTo8(m2.i16[4]); |
663 | 0 | m.u8[13] = SaturateTo8(m2.i16[5]); |
664 | 0 | m.u8[14] = SaturateTo8(m2.i16[6]); |
665 | 0 | m.u8[15] = SaturateTo8(m2.i16[7]); |
666 | 0 | return m; |
667 | 0 | } |
668 | | |
// Fast approximate division by 255. For every 0 <= v <= 255*255,
// FastDivideBy255(v) == v/255, computed with two adds and two shifts
// instead of an integer division (which is expensive on many processors).
template<class B, class A>
inline B FastDivideBy255(A v)
{
  // (v << 8) + v is v * 257; adding 255 biases the value so the truncating
  // >> 16 yields exactly v / 255 over the documented input range.
  auto biased = (v << 8) + v + 255;
  return B(biased >> 16);
}
680 | | |
681 | | inline Scalaru16x8_t |
682 | | FastDivideBy255_16(Scalaru16x8_t m) |
683 | 0 | { |
684 | 0 | return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])), |
685 | 0 | FastDivideBy255<uint16_t>(int32_t(m.u16[1])), |
686 | 0 | FastDivideBy255<uint16_t>(int32_t(m.u16[2])), |
687 | 0 | FastDivideBy255<uint16_t>(int32_t(m.u16[3])), |
688 | 0 | FastDivideBy255<uint16_t>(int32_t(m.u16[4])), |
689 | 0 | FastDivideBy255<uint16_t>(int32_t(m.u16[5])), |
690 | 0 | FastDivideBy255<uint16_t>(int32_t(m.u16[6])), |
691 | 0 | FastDivideBy255<uint16_t>(int32_t(m.u16[7]))); |
692 | 0 | } |
693 | | |
694 | | inline Scalari32x4_t |
695 | | FastDivideBy255(Scalari32x4_t m) |
696 | 0 | { |
697 | 0 | return From32<Scalari32x4_t>(FastDivideBy255<int32_t>(m.i32[0]), |
698 | 0 | FastDivideBy255<int32_t>(m.i32[1]), |
699 | 0 | FastDivideBy255<int32_t>(m.i32[2]), |
700 | 0 | FastDivideBy255<int32_t>(m.i32[3])); |
701 | 0 | } |
702 | | |
703 | | inline Scalaru8x16_t |
704 | | Pick(Scalaru8x16_t mask, Scalaru8x16_t a, Scalaru8x16_t b) |
705 | 0 | { |
706 | 0 | return From8<Scalaru8x16_t>((a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]), |
707 | 0 | (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]), |
708 | 0 | (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]), |
709 | 0 | (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]), |
710 | 0 | (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]), |
711 | 0 | (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]), |
712 | 0 | (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]), |
713 | 0 | (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]), |
714 | 0 | (a.u8[8+0] & (~mask.u8[8+0])) | (b.u8[8+0] & mask.u8[8+0]), |
715 | 0 | (a.u8[8+1] & (~mask.u8[8+1])) | (b.u8[8+1] & mask.u8[8+1]), |
716 | 0 | (a.u8[8+2] & (~mask.u8[8+2])) | (b.u8[8+2] & mask.u8[8+2]), |
717 | 0 | (a.u8[8+3] & (~mask.u8[8+3])) | (b.u8[8+3] & mask.u8[8+3]), |
718 | 0 | (a.u8[8+4] & (~mask.u8[8+4])) | (b.u8[8+4] & mask.u8[8+4]), |
719 | 0 | (a.u8[8+5] & (~mask.u8[8+5])) | (b.u8[8+5] & mask.u8[8+5]), |
720 | 0 | (a.u8[8+6] & (~mask.u8[8+6])) | (b.u8[8+6] & mask.u8[8+6]), |
721 | 0 | (a.u8[8+7] & (~mask.u8[8+7])) | (b.u8[8+7] & mask.u8[8+7])); |
722 | 0 | } |
723 | | |
724 | | inline Scalari32x4_t |
725 | | Pick(Scalari32x4_t mask, Scalari32x4_t a, Scalari32x4_t b) |
726 | 0 | { |
727 | 0 | return From32<Scalari32x4_t>((a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]), |
728 | 0 | (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]), |
729 | 0 | (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]), |
730 | 0 | (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3])); |
731 | 0 | } |
732 | | |
733 | | inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t) |
734 | 0 | { |
735 | 0 | return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t, |
736 | 0 | a.f32[1] + (b.f32[1] - a.f32[1]) * t, |
737 | 0 | a.f32[2] + (b.f32[2] - a.f32[2]) * t, |
738 | 0 | a.f32[3] + (b.f32[3] - a.f32[3]) * t); |
739 | 0 | } |
740 | | |
741 | | inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa, float wb) |
742 | 0 | { |
743 | 0 | return FromF32<Scalarf32x4_t>(a.f32[0] * wa + b.f32[0] * wb, |
744 | 0 | a.f32[1] * wa + b.f32[1] * wb, |
745 | 0 | a.f32[2] * wa + b.f32[2] * wb, |
746 | 0 | a.f32[3] * wa + b.f32[3] * wb); |
747 | 0 | } |
748 | | |
749 | | inline Scalarf32x4_t AbsF32(Scalarf32x4_t a) |
750 | 0 | { |
751 | 0 | return FromF32<Scalarf32x4_t>(fabs(a.f32[0]), |
752 | 0 | fabs(a.f32[1]), |
753 | 0 | fabs(a.f32[2]), |
754 | 0 | fabs(a.f32[3])); |
755 | 0 | } |
756 | | |
757 | | inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b) |
758 | 0 | { |
759 | 0 | return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0], |
760 | 0 | a.f32[1] + b.f32[1], |
761 | 0 | a.f32[2] + b.f32[2], |
762 | 0 | a.f32[3] + b.f32[3]); |
763 | 0 | } |
764 | | |
765 | | inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b) |
766 | 0 | { |
767 | 0 | return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0], |
768 | 0 | a.f32[1] * b.f32[1], |
769 | 0 | a.f32[2] * b.f32[2], |
770 | 0 | a.f32[3] * b.f32[3]); |
771 | 0 | } |
772 | | |
773 | | inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b) |
774 | 0 | { |
775 | 0 | return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0], |
776 | 0 | a.f32[1] / b.f32[1], |
777 | 0 | a.f32[2] / b.f32[2], |
778 | 0 | a.f32[3] / b.f32[3]); |
779 | 0 | } |
780 | | |
781 | | template<uint8_t aIndex> |
782 | | inline Scalarf32x4_t SplatF32(Scalarf32x4_t m) |
783 | 0 | { |
784 | 0 | AssertIndex<aIndex>(); |
785 | 0 | return FromF32<Scalarf32x4_t>(m.f32[aIndex], |
786 | 0 | m.f32[aIndex], |
787 | 0 | m.f32[aIndex], |
788 | 0 | m.f32[aIndex]); |
789 | 0 | } |
790 | | |
791 | | inline Scalari32x4_t F32ToI32(Scalarf32x4_t m) |
792 | 0 | { |
793 | 0 | return From32<Scalari32x4_t>(int32_t(floor(m.f32[0] + 0.5f)), |
794 | 0 | int32_t(floor(m.f32[1] + 0.5f)), |
795 | 0 | int32_t(floor(m.f32[2] + 0.5f)), |
796 | 0 | int32_t(floor(m.f32[3] + 0.5f))); |
797 | 0 | } |
798 | | |
799 | | #ifdef SIMD_COMPILE_SSE2 |
800 | | |
801 | | // SSE2 |
802 | | |
803 | | template<> |
804 | | inline __m128i |
805 | | Load8<__m128i>(const uint8_t* aSource) |
806 | 0 | { |
807 | 0 | return _mm_load_si128((const __m128i*)aSource); |
808 | 0 | } |
809 | | |
inline void Store8(uint8_t* aTarget, __m128i aM)
{
  // Aligned 128-bit store: aTarget must be 16-byte aligned.
  __m128i* dest = reinterpret_cast<__m128i*>(aTarget);
  _mm_store_si128(dest, aM);
}
814 | | |
815 | | template<> |
816 | | inline __m128i FromZero8<__m128i>() |
817 | 0 | { |
818 | 0 | return _mm_setzero_si128(); |
819 | 0 | } |
820 | | |
821 | | template<> |
822 | | inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
823 | | uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) |
824 | 0 | { |
825 | 0 | return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (e << 8) + f, (h << 8) + g, |
826 | 0 | (j << 8) + i, (l << 8) + k, (m << 8) + n, (p << 8) + o); |
827 | 0 | } |
828 | | |
829 | | template<> |
830 | | inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h) |
831 | 0 | { |
832 | 0 | return _mm_setr_epi16(a, b, c, d, e, f, g, h); |
833 | 0 | } |
834 | | |
835 | | template<> |
836 | | inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) |
837 | 0 | { |
838 | 0 | return _mm_setr_epi16(a, b, c, d, e, f, g, h); |
839 | 0 | } |
840 | | |
841 | | template<> |
842 | | inline __m128i FromI16<__m128i>(int16_t a) |
843 | 0 | { |
844 | 0 | return _mm_set1_epi16(a); |
845 | 0 | } |
846 | | |
847 | | template<> |
848 | | inline __m128i FromU16<__m128i>(uint16_t a) |
849 | 0 | { |
850 | 0 | return _mm_set1_epi16((int16_t)a); |
851 | 0 | } |
852 | | |
853 | | template<> |
854 | | inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d) |
855 | 0 | { |
856 | 0 | return _mm_setr_epi32(a, b, c, d); |
857 | 0 | } |
858 | | |
859 | | template<> |
860 | | inline __m128i From32<__m128i>(int32_t a) |
861 | 0 | { |
862 | 0 | return _mm_set1_epi32(a); |
863 | 0 | } |
864 | | |
865 | | template<> |
866 | | inline __m128 FromF32<__m128>(float a, float b, float c, float d) |
867 | 0 | { |
868 | 0 | return _mm_setr_ps(a, b, c, d); |
869 | 0 | } |
870 | | |
871 | | template<> |
872 | | inline __m128 FromF32<__m128>(float a) |
873 | 0 | { |
874 | 0 | return _mm_set1_ps(a); |
875 | 0 | } |
876 | | |
template<int32_t aNumberOfBits>
inline __m128i ShiftRight16(__m128i aM)
{
  // Lane-wise LOGICAL right shift of eight 16-bit lanes (zero-fill).
  // NOTE(review): ShiftRight32 below uses an ARITHMETIC shift -- the
  // asymmetry looks deliberate (16-bit lanes hold unsigned channel data),
  // but confirm before relying on it.
  const __m128i shifted = _mm_srli_epi16(aM, aNumberOfBits);
  return shifted;
}
882 | | |
template<int32_t aNumberOfBits>
inline __m128i ShiftRight32(__m128i aM)
{
  // Lane-wise ARITHMETIC right shift of four 32-bit lanes (sign-extending).
  const __m128i shifted = _mm_srai_epi32(aM, aNumberOfBits);
  return shifted;
}
888 | | |
inline __m128i Add16(__m128i aM1, __m128i aM2)
{
  // Lane-wise 16-bit addition (wraps on overflow, no saturation).
  const __m128i sum = _mm_add_epi16(aM1, aM2);
  return sum;
}
893 | | |
inline __m128i Add32(__m128i aM1, __m128i aM2)
{
  // Lane-wise 32-bit addition (wraps on overflow, no saturation).
  const __m128i sum = _mm_add_epi32(aM1, aM2);
  return sum;
}
898 | | |
inline __m128i Sub16(__m128i aM1, __m128i aM2)
{
  // Lane-wise 16-bit subtraction aM1 - aM2 (wraps on overflow).
  const __m128i diff = _mm_sub_epi16(aM1, aM2);
  return diff;
}
903 | | |
inline __m128i Sub32(__m128i aM1, __m128i aM2)
{
  // Lane-wise 32-bit subtraction aM1 - aM2 (wraps on overflow).
  const __m128i diff = _mm_sub_epi32(aM1, aM2);
  return diff;
}
908 | | |
inline __m128i Min8(__m128i aM1, __m128i aM2)
{
  // Lane-wise UNSIGNED 8-bit minimum.
  const __m128i smaller = _mm_min_epu8(aM1, aM2);
  return smaller;
}
913 | | |
inline __m128i Max8(__m128i aM1, __m128i aM2)
{
  // Lane-wise UNSIGNED 8-bit maximum.
  const __m128i larger = _mm_max_epu8(aM1, aM2);
  return larger;
}
918 | | |
inline __m128i Min32(__m128i aM1, __m128i aM2)
{
  // Lane-wise SIGNED 32-bit minimum (_mm_min_epi32 is SSE4.1, so emulate):
  // min(a, b) = a - ((a - b) & (a > b)) -- subtract the difference only in
  // lanes where a is the larger value.
  // NOTE(review): a - b can wrap if the operands differ by more than
  // INT32_MAX; presumably callers keep magnitudes small -- confirm.
  __m128i diff = _mm_sub_epi32(aM1, aM2);
  __m128i firstIsGreater = _mm_cmpgt_epi32(aM1, aM2);
  return _mm_sub_epi32(aM1, _mm_and_si128(diff, firstIsGreater));
}
925 | | |
inline __m128i Max32(__m128i aM1, __m128i aM2)
{
  // Lane-wise SIGNED 32-bit maximum (_mm_max_epi32 is SSE4.1, so emulate):
  // max(a, b) = a - ((a - b) & (b > a)) -- in lanes where b is larger,
  // a - (a - b) == b; elsewhere a is returned unchanged.
  // NOTE(review): a - b can wrap if the operands differ by more than
  // INT32_MAX; presumably callers keep magnitudes small -- confirm.
  __m128i diff = _mm_sub_epi32(aM1, aM2);
  __m128i secondIsGreater = _mm_cmpgt_epi32(aM2, aM1);
  return _mm_sub_epi32(aM1, _mm_and_si128(diff, secondIsGreater));
}
932 | | |
inline __m128i Mul16(__m128i aM1, __m128i aM2)
{
  // Lane-wise 16-bit multiply keeping the low 16 bits of each product.
  const __m128i product = _mm_mullo_epi16(aM1, aM2);
  return product;
}
937 | | |
inline __m128i MulU16(__m128i aM1, __m128i aM2)
{
  // Unsigned variant of Mul16.  The low 16 bits of a product are identical
  // for signed and unsigned interpretations, so the same intrinsic serves.
  const __m128i product = _mm_mullo_epi16(aM1, aM2);
  return product;
}
942 | | |
inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1,
                                __m128i aFactorsA2B2,
                                __m128i& aProductA,
                                __m128i& aProductB)
{
  // Multiplies eight pairs of signed 16-bit factors into eight full 32-bit
  // products: aProductA gets the products of lanes 0-3, aProductB of 4-7.
  const __m128i lowHalves = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
  const __m128i highHalves = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
  // Interleaving low and high halves reassembles each 32-bit product.
  aProductA = _mm_unpacklo_epi16(lowHalves, highHalves);
  aProductB = _mm_unpackhi_epi16(lowHalves, highHalves);
}
953 | | |
inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA,
                                  __m128i aFactorsB)
{
  // Multiply eight signed 16-bit lane pairs and horizontally add adjacent
  // 32-bit products, yielding four 32-bit sums (pmaddwd).
  const __m128i sums = _mm_madd_epi16(aFactorsA, aFactorsB);
  return sums;
}
959 | | |
960 | | template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
961 | | inline __m128i Shuffle32(__m128i aM) |
962 | 0 | { |
963 | 0 | AssertIndex<i0>(); |
964 | 0 | AssertIndex<i1>(); |
965 | 0 | AssertIndex<i2>(); |
966 | 0 | AssertIndex<i3>(); |
967 | 0 | return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3)); |
968 | 0 | } Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)0, (signed char)0, (signed char)0, (signed char)0>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)1, (signed char)1, (signed char)1, (signed char)1>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)2, (signed char)2, (signed char)2, (signed char)2>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Shuffle32<(signed char)3, (signed char)3, (signed char)3, (signed char)3>(long long __vector(2)) |
969 | | |
970 | | template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
971 | | inline __m128i ShuffleLo16(__m128i aM) |
972 | 0 | { |
973 | 0 | AssertIndex<i0>(); |
974 | 0 | AssertIndex<i1>(); |
975 | 0 | AssertIndex<i2>(); |
976 | 0 | AssertIndex<i3>(); |
977 | 0 | return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3)); |
978 | 0 | } Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleLo16<(signed char)1, (signed char)0, (signed char)1, (signed char)0>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleLo16<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleLo16<(signed char)3, (signed char)3, (signed char)3, (signed char)3>(long long __vector(2)) |
979 | | |
980 | | template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
981 | | inline __m128i ShuffleHi16(__m128i aM) |
982 | 0 | { |
983 | 0 | AssertIndex<i0>(); |
984 | 0 | AssertIndex<i1>(); |
985 | 0 | AssertIndex<i2>(); |
986 | 0 | AssertIndex<i3>(); |
987 | 0 | return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3)); |
988 | 0 | } Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleHi16<(signed char)1, (signed char)0, (signed char)1, (signed char)0>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleHi16<(signed char)3, (signed char)2, (signed char)3, (signed char)2>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::ShuffleHi16<(signed char)3, (signed char)3, (signed char)3, (signed char)3>(long long __vector(2)) |
989 | | |
990 | | template<int8_t aIndex> |
991 | | inline __m128i Splat32(__m128i aM) |
992 | | { |
993 | | return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM); |
994 | | } |
995 | | |
996 | | template<int8_t aIndex> |
997 | | inline __m128i Splat32On8(__m128i aM) |
998 | 0 | { |
999 | 0 | return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM); |
1000 | 0 | } Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Splat32On8<(signed char)0>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Splat32On8<(signed char)1>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Splat32On8<(signed char)2>(long long __vector(2)) Unexecuted instantiation: long long __vector(2) mozilla::gfx::simd::Splat32On8<(signed char)3>(long long __vector(2)) |
1001 | | |
1002 | | template<int8_t aIndexLo, int8_t aIndexHi> |
1003 | | inline __m128i Splat16(__m128i aM) |
1004 | 0 | { |
1005 | 0 | AssertIndex<aIndexLo>(); |
1006 | 0 | AssertIndex<aIndexHi>(); |
1007 | 0 | return ShuffleHi16<aIndexHi,aIndexHi,aIndexHi,aIndexHi>( |
1008 | 0 | ShuffleLo16<aIndexLo,aIndexLo,aIndexLo,aIndexLo>(aM)); |
1009 | 0 | } |
1010 | | |
inline __m128i
UnpackLo8x8ToI16x8(__m128i m)
{
  // Zero-extend the low eight u8 lanes into eight 16-bit lanes.
  return _mm_unpacklo_epi8(m, _mm_setzero_si128());
}
1017 | | |
inline __m128i
UnpackHi8x8ToI16x8(__m128i m)
{
  // Zero-extend the high eight u8 lanes into eight 16-bit lanes.
  return _mm_unpackhi_epi8(m, _mm_setzero_si128());
}
1024 | | |
inline __m128i
UnpackLo8x8ToU16x8(__m128i m)
{
  // Zero-extend the low eight u8 lanes into eight u16 lanes (same operation
  // as the I16 variant; the distinction is only in how callers interpret it).
  return _mm_unpacklo_epi8(m, _mm_setzero_si128());
}
1031 | | |
inline __m128i
UnpackHi8x8ToU16x8(__m128i m)
{
  // Zero-extend the high eight u8 lanes into eight u16 lanes (same operation
  // as the I16 variant; the distinction is only in how callers interpret it).
  return _mm_unpackhi_epi8(m, _mm_setzero_si128());
}
1038 | | |
inline __m128i
InterleaveLo8(__m128i m1, __m128i m2)
{
  // Interleave the low eight bytes of each input: m1[0], m2[0], m1[1], ...
  const __m128i mixed = _mm_unpacklo_epi8(m1, m2);
  return mixed;
}
1044 | | |
inline __m128i
InterleaveHi8(__m128i m1, __m128i m2)
{
  // Interleave the high eight bytes of each input: m1[8], m2[8], m1[9], ...
  const __m128i mixed = _mm_unpackhi_epi8(m1, m2);
  return mixed;
}
1050 | | |
inline __m128i
InterleaveLo16(__m128i m1, __m128i m2)
{
  // Interleave the low four 16-bit lanes: m1[0], m2[0], m1[1], m2[1], ...
  const __m128i mixed = _mm_unpacklo_epi16(m1, m2);
  return mixed;
}
1056 | | |
inline __m128i
InterleaveHi16(__m128i m1, __m128i m2)
{
  // Interleave the high four 16-bit lanes: m1[4], m2[4], m1[5], m2[5], ...
  const __m128i mixed = _mm_unpackhi_epi16(m1, m2);
  return mixed;
}
1062 | | |
inline __m128i
InterleaveLo32(__m128i m1, __m128i m2)
{
  // Interleave the low two 32-bit lanes: m1[0], m2[0], m1[1], m2[1].
  const __m128i mixed = _mm_unpacklo_epi32(m1, m2);
  return mixed;
}
1068 | | |
template<uint8_t aNumBytes>
inline __m128i
Rotate8(__m128i a1234, __m128i a5678)
{
  // Extract 16 bytes from the 32-byte concatenation [a1234 | a5678],
  // starting at byte offset aNumBytes (an SSE2 stand-in for the SSSE3
  // _mm_alignr_epi8).
  const __m128i tailOfFirst = _mm_srli_si128(a1234, aNumBytes);
  const __m128i headOfSecond = _mm_slli_si128(a5678, 16 - aNumBytes);
  return _mm_or_si128(tailOfFirst, headOfSecond);
}
1075 | | |
inline __m128i
PackAndSaturate32To16(__m128i m1, __m128i m2)
{
  // Narrow eight 32-bit lanes to eight signed 16-bit lanes, clamping to
  // [-32768, 32767].
  const __m128i packed = _mm_packs_epi32(m1, m2);
  return packed;
}
1081 | | |
inline __m128i
PackAndSaturate32ToU16(__m128i m1, __m128i m2)
{
  // Narrow eight 32-bit lanes to eight 16-bit lanes.  NOTE(review): despite
  // the name this uses SIGNED saturation (_mm_packus_epi32 needs SSE4.1), so
  // inputs above 32767 clamp to 32767, not 65535 -- presumably callers stay
  // in [0, 32767]; confirm.
  const __m128i packed = _mm_packs_epi32(m1, m2);
  return packed;
}
1087 | | |
inline __m128i
PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3, const __m128i& m4)
{
  // Narrow sixteen 32-bit lanes to sixteen unsigned bytes, clamping to
  // [0, 255].  (m4 is passed by const& -- presumably to sidestep compiler
  // limits on by-value __m128i parameters; confirm before changing.)
  // Step 1: narrow to 16-bit with signed saturation.
  const __m128i first8 = _mm_packs_epi32(m1, m2);
  const __m128i second8 = _mm_packs_epi32(m3, m4);
  // Step 2: narrow to 8-bit with unsigned saturation.
  return _mm_packus_epi16(first8, second8);
}
1098 | | |
1099 | | inline __m128i |
1100 | | PackAndSaturate16To8(__m128i m1, __m128i m2) |
1101 | 0 | { |
1102 | 0 | // Pack into 16 8bit unsigned integers (saturating). |
1103 | 0 | return _mm_packus_epi16(m1, m2); |
1104 | 0 | } |
1105 | | |
1106 | | inline __m128i |
1107 | | FastDivideBy255(__m128i m) |
1108 | 0 | { |
1109 | 0 | // v = m << 8 |
1110 | 0 | __m128i v = _mm_slli_epi32(m, 8); |
1111 | 0 | // v = v + (m + (255,255,255,255)) |
1112 | 0 | v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255))); |
1113 | 0 | // v = v >> 16 |
1114 | 0 | return _mm_srai_epi32(v, 16); |
1115 | 0 | } |
1116 | | |
1117 | | inline __m128i |
1118 | | FastDivideBy255_16(__m128i m) |
1119 | 0 | { |
1120 | 0 | __m128i zero = _mm_set1_epi16(0); |
1121 | 0 | __m128i lo = _mm_unpacklo_epi16(m, zero); |
1122 | 0 | __m128i hi = _mm_unpackhi_epi16(m, zero); |
1123 | 0 | return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi)); |
1124 | 0 | } |
1125 | | |
1126 | | inline __m128i |
1127 | | Pick(__m128i mask, __m128i a, __m128i b) |
1128 | 0 | { |
1129 | 0 | return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b)); |
1130 | 0 | } |
1131 | | |
1132 | | inline __m128 MixF32(__m128 a, __m128 b, float t) |
1133 | 0 | { |
1134 | 0 | return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t))); |
1135 | 0 | } |
1136 | | |
1137 | | inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb) |
1138 | 0 | { |
1139 | 0 | return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)), _mm_mul_ps(b, _mm_set1_ps(wb))); |
1140 | 0 | } |
1141 | | |
1142 | | inline __m128 AbsF32(__m128 a) |
1143 | 0 | { |
1144 | 0 | return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a); |
1145 | 0 | } |
1146 | | |
1147 | | inline __m128 AddF32(__m128 a, __m128 b) |
1148 | 0 | { |
1149 | 0 | return _mm_add_ps(a, b); |
1150 | 0 | } |
1151 | | |
1152 | | inline __m128 MulF32(__m128 a, __m128 b) |
1153 | 0 | { |
1154 | 0 | return _mm_mul_ps(a, b); |
1155 | 0 | } |
1156 | | |
1157 | | inline __m128 DivF32(__m128 a, __m128 b) |
1158 | 0 | { |
1159 | 0 | return _mm_div_ps(a, b); |
1160 | 0 | } |
1161 | | |
1162 | | template<uint8_t aIndex> |
1163 | | inline __m128 SplatF32(__m128 m) |
1164 | 0 | { |
1165 | 0 | AssertIndex<aIndex>(); |
1166 | 0 | return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex)); |
1167 | 0 | } |
1168 | | |
1169 | | inline __m128i F32ToI32(__m128 m) |
1170 | 0 | { |
1171 | 0 | return _mm_cvtps_epi32(m); |
1172 | 0 | } |
1173 | | |
1174 | | #endif // SIMD_COMPILE_SSE2 |
1175 | | |
1176 | | } // namespace simd |
1177 | | |
1178 | | } // namespace gfx |
1179 | | } // namespace mozilla |
1180 | | |
1181 | | #endif // _MOZILLA_GFX_SIMD_H_ |