/src/mozilla-central/dom/media/webaudio/AudioNodeEngineSSE2.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this file, |
4 | | * You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
6 | | #include "AudioNodeEngineSSE2.h" |
7 | | #include "AlignmentUtils.h" |
8 | | #include <emmintrin.h> |
9 | | |
10 | | |
11 | | namespace mozilla { |
12 | | void |
13 | | AudioBufferAddWithScale_SSE(const float* aInput, |
14 | | float aScale, |
15 | | float* aOutput, |
16 | | uint32_t aSize) |
17 | 0 | { |
18 | 0 | __m128 vin0, vin1, vin2, vin3, |
19 | 0 | vscaled0, vscaled1, vscaled2, vscaled3, |
20 | 0 | vout0, vout1, vout2, vout3, |
21 | 0 | vgain; |
22 | 0 |
|
23 | 0 | ASSERT_ALIGNED16(aInput); |
24 | 0 | ASSERT_ALIGNED16(aOutput); |
25 | 0 | ASSERT_MULTIPLE16(aSize); |
26 | 0 |
|
27 | 0 | vgain = _mm_load1_ps(&aScale); |
28 | 0 |
|
29 | 0 | for (unsigned i = 0; i < aSize; i+=16) { |
30 | 0 | vin0 = _mm_load_ps(&aInput[i]); |
31 | 0 | vin1 = _mm_load_ps(&aInput[i + 4]); |
32 | 0 | vin2 = _mm_load_ps(&aInput[i + 8]); |
33 | 0 | vin3 = _mm_load_ps(&aInput[i + 12]); |
34 | 0 |
|
35 | 0 | vscaled0 = _mm_mul_ps(vin0, vgain); |
36 | 0 | vscaled1 = _mm_mul_ps(vin1, vgain); |
37 | 0 | vscaled2 = _mm_mul_ps(vin2, vgain); |
38 | 0 | vscaled3 = _mm_mul_ps(vin3, vgain); |
39 | 0 |
|
40 | 0 | vin0 = _mm_load_ps(&aOutput[i]); |
41 | 0 | vin1 = _mm_load_ps(&aOutput[i + 4]); |
42 | 0 | vin2 = _mm_load_ps(&aOutput[i + 8]); |
43 | 0 | vin3 = _mm_load_ps(&aOutput[i + 12]); |
44 | 0 |
|
45 | 0 | vout0 = _mm_add_ps(vin0, vscaled0); |
46 | 0 | vout1 = _mm_add_ps(vin1, vscaled1); |
47 | 0 | vout2 = _mm_add_ps(vin2, vscaled2); |
48 | 0 | vout3 = _mm_add_ps(vin3, vscaled3); |
49 | 0 |
|
50 | 0 | _mm_store_ps(&aOutput[i], vout0); |
51 | 0 | _mm_store_ps(&aOutput[i + 4], vout1); |
52 | 0 | _mm_store_ps(&aOutput[i + 8], vout2); |
53 | 0 | _mm_store_ps(&aOutput[i + 12], vout3); |
54 | 0 | } |
55 | 0 | } |
56 | | |
57 | | void |
58 | | AudioBlockCopyChannelWithScale_SSE(const float* aInput, |
59 | | float aScale, |
60 | | float* aOutput) |
61 | 0 | { |
62 | 0 | __m128 vin0, vin1, vin2, vin3, |
63 | 0 | vout0, vout1, vout2, vout3; |
64 | 0 |
|
65 | 0 | ASSERT_ALIGNED16(aInput); |
66 | 0 | ASSERT_ALIGNED16(aOutput); |
67 | 0 |
|
68 | 0 | __m128 vgain = _mm_load1_ps(&aScale); |
69 | 0 |
|
70 | 0 | for (unsigned i = 0 ; i < WEBAUDIO_BLOCK_SIZE; i+=16) { |
71 | 0 | vin0 = _mm_load_ps(&aInput[i]); |
72 | 0 | vin1 = _mm_load_ps(&aInput[i + 4]); |
73 | 0 | vin2 = _mm_load_ps(&aInput[i + 8]); |
74 | 0 | vin3 = _mm_load_ps(&aInput[i + 12]); |
75 | 0 | vout0 = _mm_mul_ps(vin0, vgain); |
76 | 0 | vout1 = _mm_mul_ps(vin1, vgain); |
77 | 0 | vout2 = _mm_mul_ps(vin2, vgain); |
78 | 0 | vout3 = _mm_mul_ps(vin3, vgain); |
79 | 0 | _mm_store_ps(&aOutput[i], vout0); |
80 | 0 | _mm_store_ps(&aOutput[i + 4], vout1); |
81 | 0 | _mm_store_ps(&aOutput[i + 8], vout2); |
82 | 0 | _mm_store_ps(&aOutput[i + 12], vout3); |
83 | 0 | } |
84 | 0 | } |
85 | | |
86 | | void |
87 | | AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE], |
88 | | const float aScale[WEBAUDIO_BLOCK_SIZE], |
89 | | float aOutput[WEBAUDIO_BLOCK_SIZE]) |
90 | 0 | { |
91 | 0 | __m128 vin0, vin1, vin2, vin3, |
92 | 0 | vscaled0, vscaled1, vscaled2, vscaled3, |
93 | 0 | vout0, vout1, vout2, vout3; |
94 | 0 |
|
95 | 0 | ASSERT_ALIGNED16(aInput); |
96 | 0 | ASSERT_ALIGNED16(aScale); |
97 | 0 | ASSERT_ALIGNED16(aOutput); |
98 | 0 |
|
99 | 0 | for (unsigned i = 0 ; i < WEBAUDIO_BLOCK_SIZE; i+=16) { |
100 | 0 | vscaled0 = _mm_load_ps(&aScale[i]); |
101 | 0 | vscaled1 = _mm_load_ps(&aScale[i+4]); |
102 | 0 | vscaled2 = _mm_load_ps(&aScale[i+8]); |
103 | 0 | vscaled3 = _mm_load_ps(&aScale[i+12]); |
104 | 0 |
|
105 | 0 | vin0 = _mm_load_ps(&aInput[i]); |
106 | 0 | vin1 = _mm_load_ps(&aInput[i + 4]); |
107 | 0 | vin2 = _mm_load_ps(&aInput[i + 8]); |
108 | 0 | vin3 = _mm_load_ps(&aInput[i + 12]); |
109 | 0 |
|
110 | 0 | vout0 = _mm_mul_ps(vin0, vscaled0); |
111 | 0 | vout1 = _mm_mul_ps(vin1, vscaled1); |
112 | 0 | vout2 = _mm_mul_ps(vin2, vscaled2); |
113 | 0 | vout3 = _mm_mul_ps(vin3, vscaled3); |
114 | 0 |
|
115 | 0 | _mm_store_ps(&aOutput[i], vout0); |
116 | 0 | _mm_store_ps(&aOutput[i + 4], vout1); |
117 | 0 | _mm_store_ps(&aOutput[i + 8], vout2); |
118 | 0 | _mm_store_ps(&aOutput[i + 12], vout3); |
119 | 0 | } |
120 | 0 | } |
121 | | |
122 | | void |
123 | | AudioBufferInPlaceScale_SSE(float* aBlock, |
124 | | float aScale, |
125 | | uint32_t aSize) |
126 | 0 | { |
127 | 0 | __m128 vout0, vout1, vout2, vout3, |
128 | 0 | vin0, vin1, vin2, vin3; |
129 | 0 |
|
130 | 0 | ASSERT_ALIGNED16(aBlock); |
131 | 0 | ASSERT_MULTIPLE16(aSize); |
132 | 0 |
|
133 | 0 | __m128 vgain = _mm_load1_ps(&aScale); |
134 | 0 |
|
135 | 0 | for (unsigned i = 0; i < aSize; i+=16) { |
136 | 0 | vin0 = _mm_load_ps(&aBlock[i]); |
137 | 0 | vin1 = _mm_load_ps(&aBlock[i + 4]); |
138 | 0 | vin2 = _mm_load_ps(&aBlock[i + 8]); |
139 | 0 | vin3 = _mm_load_ps(&aBlock[i + 12]); |
140 | 0 | vout0 = _mm_mul_ps(vin0, vgain); |
141 | 0 | vout1 = _mm_mul_ps(vin1, vgain); |
142 | 0 | vout2 = _mm_mul_ps(vin2, vgain); |
143 | 0 | vout3 = _mm_mul_ps(vin3, vgain); |
144 | 0 | _mm_store_ps(&aBlock[i], vout0); |
145 | 0 | _mm_store_ps(&aBlock[i + 4], vout1); |
146 | 0 | _mm_store_ps(&aBlock[i + 8], vout2); |
147 | 0 | _mm_store_ps(&aBlock[i + 12], vout3); |
148 | 0 | } |
149 | 0 | } |
150 | | |
151 | | void |
152 | | AudioBufferInPlaceScale_SSE(float* aBlock, |
153 | | float* aScale, |
154 | | uint32_t aSize) |
155 | 0 | { |
156 | 0 | __m128 vout0, vout1, vout2, vout3, |
157 | 0 | vgain0, vgain1, vgain2, vgain3, |
158 | 0 | vin0, vin1, vin2, vin3; |
159 | 0 |
|
160 | 0 | ASSERT_ALIGNED16(aBlock); |
161 | 0 | ASSERT_MULTIPLE16(aSize); |
162 | 0 |
|
163 | 0 | for (unsigned i = 0; i < aSize; i+=16) { |
164 | 0 | vin0 = _mm_load_ps(&aBlock[i]); |
165 | 0 | vin1 = _mm_load_ps(&aBlock[i + 4]); |
166 | 0 | vin2 = _mm_load_ps(&aBlock[i + 8]); |
167 | 0 | vin3 = _mm_load_ps(&aBlock[i + 12]); |
168 | 0 | vgain0 = _mm_load_ps(&aScale[i]); |
169 | 0 | vgain1 = _mm_load_ps(&aScale[i + 4]); |
170 | 0 | vgain2 = _mm_load_ps(&aScale[i + 8]); |
171 | 0 | vgain3 = _mm_load_ps(&aScale[i + 12]); |
172 | 0 | vout0 = _mm_mul_ps(vin0, vgain0); |
173 | 0 | vout1 = _mm_mul_ps(vin1, vgain1); |
174 | 0 | vout2 = _mm_mul_ps(vin2, vgain2); |
175 | 0 | vout3 = _mm_mul_ps(vin3, vgain3); |
176 | 0 | _mm_store_ps(&aBlock[i], vout0); |
177 | 0 | _mm_store_ps(&aBlock[i + 4], vout1); |
178 | 0 | _mm_store_ps(&aBlock[i + 8], vout2); |
179 | 0 | _mm_store_ps(&aBlock[i + 12], vout3); |
180 | 0 | } |
181 | 0 | } |
182 | | |
183 | | void |
184 | | AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE], |
185 | | const float aInputR[WEBAUDIO_BLOCK_SIZE], |
186 | | float aGainL, float aGainR, bool aIsOnTheLeft, |
187 | | float aOutputL[WEBAUDIO_BLOCK_SIZE], |
188 | | float aOutputR[WEBAUDIO_BLOCK_SIZE]) |
189 | 0 | { |
190 | 0 | __m128 vinl0, vinr0, vinl1, vinr1, |
191 | 0 | vout0, vout1, |
192 | 0 | vscaled0, vscaled1, |
193 | 0 | vgainl, vgainr; |
194 | 0 |
|
195 | 0 | ASSERT_ALIGNED16(aInputL); |
196 | 0 | ASSERT_ALIGNED16(aInputR); |
197 | 0 | ASSERT_ALIGNED16(aOutputL); |
198 | 0 | ASSERT_ALIGNED16(aOutputR); |
199 | 0 |
|
200 | 0 | vgainl = _mm_load1_ps(&aGainL); |
201 | 0 | vgainr = _mm_load1_ps(&aGainR); |
202 | 0 |
|
203 | 0 | if (aIsOnTheLeft) { |
204 | 0 | for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=8) { |
205 | 0 | vinl0 = _mm_load_ps(&aInputL[i]); |
206 | 0 | vinr0 = _mm_load_ps(&aInputR[i]); |
207 | 0 | vinl1 = _mm_load_ps(&aInputL[i+4]); |
208 | 0 | vinr1 = _mm_load_ps(&aInputR[i+4]); |
209 | 0 |
|
210 | 0 | /* left channel : aOutputL = aInputL + aInputR * gainL */ |
211 | 0 | vscaled0 = _mm_mul_ps(vinr0, vgainl); |
212 | 0 | vscaled1 = _mm_mul_ps(vinr1, vgainl); |
213 | 0 | vout0 = _mm_add_ps(vscaled0, vinl0); |
214 | 0 | vout1 = _mm_add_ps(vscaled1, vinl1); |
215 | 0 | _mm_store_ps(&aOutputL[i], vout0); |
216 | 0 | _mm_store_ps(&aOutputL[i+4], vout1); |
217 | 0 |
|
218 | 0 | /* right channel : aOutputR = aInputR * gainR */ |
219 | 0 | vscaled0 = _mm_mul_ps(vinr0, vgainr); |
220 | 0 | vscaled1 = _mm_mul_ps(vinr1, vgainr); |
221 | 0 | _mm_store_ps(&aOutputR[i], vscaled0); |
222 | 0 | _mm_store_ps(&aOutputR[i+4], vscaled1); |
223 | 0 | } |
224 | 0 | } else { |
225 | 0 | for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=8) { |
226 | 0 | vinl0 = _mm_load_ps(&aInputL[i]); |
227 | 0 | vinr0 = _mm_load_ps(&aInputR[i]); |
228 | 0 | vinl1 = _mm_load_ps(&aInputL[i+4]); |
229 | 0 | vinr1 = _mm_load_ps(&aInputR[i+4]); |
230 | 0 |
|
231 | 0 | /* left channel : aInputL * gainL */ |
232 | 0 | vscaled0 = _mm_mul_ps(vinl0, vgainl); |
233 | 0 | vscaled1 = _mm_mul_ps(vinl1, vgainl); |
234 | 0 | _mm_store_ps(&aOutputL[i], vscaled0); |
235 | 0 | _mm_store_ps(&aOutputL[i+4], vscaled1); |
236 | 0 |
|
237 | 0 | /* right channel: aOutputR = aInputR + aInputL * gainR */ |
238 | 0 | vscaled0 = _mm_mul_ps(vinl0, vgainr); |
239 | 0 | vscaled1 = _mm_mul_ps(vinl1, vgainr); |
240 | 0 | vout0 = _mm_add_ps(vscaled0, vinr0); |
241 | 0 | vout1 = _mm_add_ps(vscaled1, vinr1); |
242 | 0 | _mm_store_ps(&aOutputR[i], vout0); |
243 | 0 | _mm_store_ps(&aOutputR[i+4], vout1); |
244 | 0 | } |
245 | 0 | } |
246 | 0 | } |
247 | | |
248 | | void BufferComplexMultiply_SSE(const float* aInput, |
249 | | const float* aScale, |
250 | | float* aOutput, |
251 | | uint32_t aSize) |
252 | 0 | { |
253 | 0 | unsigned i; |
254 | 0 | __m128 in0, in1, in2, in3, |
255 | 0 | outreal0, outreal1, outreal2, outreal3, |
256 | 0 | outimag0, outimag1, outimag2, outimag3; |
257 | 0 |
|
258 | 0 | ASSERT_ALIGNED16(aInput); |
259 | 0 | ASSERT_ALIGNED16(aScale); |
260 | 0 | ASSERT_ALIGNED16(aOutput); |
261 | 0 | ASSERT_MULTIPLE16(aSize); |
262 | 0 |
|
263 | 0 | for (i = 0; i < aSize * 2; i += 16) { |
264 | 0 | in0 = _mm_load_ps(&aInput[i]); |
265 | 0 | in1 = _mm_load_ps(&aInput[i + 4]); |
266 | 0 | in2 = _mm_load_ps(&aInput[i + 8]); |
267 | 0 | in3 = _mm_load_ps(&aInput[i + 12]); |
268 | 0 |
|
269 | 0 | outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0)); |
270 | 0 | outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1)); |
271 | 0 | outreal2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0)); |
272 | 0 | outimag2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1)); |
273 | 0 |
|
274 | 0 | in0 = _mm_load_ps(&aScale[i]); |
275 | 0 | in1 = _mm_load_ps(&aScale[i + 4]); |
276 | 0 | in2 = _mm_load_ps(&aScale[i + 8]); |
277 | 0 | in3 = _mm_load_ps(&aScale[i + 12]); |
278 | 0 |
|
279 | 0 | outreal1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0)); |
280 | 0 | outimag1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1)); |
281 | 0 | outreal3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0)); |
282 | 0 | outimag3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1)); |
283 | 0 |
|
284 | 0 | in0 = _mm_sub_ps(_mm_mul_ps(outreal0, outreal1), |
285 | 0 | _mm_mul_ps(outimag0, outimag1)); |
286 | 0 | in1 = _mm_add_ps(_mm_mul_ps(outreal0, outimag1), |
287 | 0 | _mm_mul_ps(outimag0, outreal1)); |
288 | 0 | in2 = _mm_sub_ps(_mm_mul_ps(outreal2, outreal3), |
289 | 0 | _mm_mul_ps(outimag2, outimag3)); |
290 | 0 | in3 = _mm_add_ps(_mm_mul_ps(outreal2, outimag3), |
291 | 0 | _mm_mul_ps(outimag2, outreal3)); |
292 | 0 |
|
293 | 0 | outreal0 = _mm_unpacklo_ps(in0, in1); |
294 | 0 | outreal1 = _mm_unpackhi_ps(in0, in1); |
295 | 0 | outreal2 = _mm_unpacklo_ps(in2, in3); |
296 | 0 | outreal3 = _mm_unpackhi_ps(in2, in3); |
297 | 0 |
|
298 | 0 | _mm_store_ps(&aOutput[i], outreal0); |
299 | 0 | _mm_store_ps(&aOutput[i + 4], outreal1); |
300 | 0 | _mm_store_ps(&aOutput[i + 8], outreal2); |
301 | 0 | _mm_store_ps(&aOutput[i + 12], outreal3); |
302 | 0 | } |
303 | 0 | } |
304 | | |
305 | | float |
306 | | AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength) |
307 | 0 | { |
308 | 0 | unsigned i; |
309 | 0 | __m128 in0, in1, in2, in3, |
310 | 0 | acc0, acc1, acc2, acc3; |
311 | 0 | float out[4]; |
312 | 0 |
|
313 | 0 | ASSERT_ALIGNED16(aInput); |
314 | 0 | ASSERT_MULTIPLE16(aLength); |
315 | 0 |
|
316 | 0 | acc0 = _mm_setzero_ps(); |
317 | 0 | acc1 = _mm_setzero_ps(); |
318 | 0 | acc2 = _mm_setzero_ps(); |
319 | 0 | acc3 = _mm_setzero_ps(); |
320 | 0 |
|
321 | 0 | for (i = 0; i < aLength; i+=16) { |
322 | 0 | in0 = _mm_load_ps(&aInput[i]); |
323 | 0 | in1 = _mm_load_ps(&aInput[i + 4]); |
324 | 0 | in2 = _mm_load_ps(&aInput[i + 8]); |
325 | 0 | in3 = _mm_load_ps(&aInput[i + 12]); |
326 | 0 |
|
327 | 0 | in0 = _mm_mul_ps(in0, in0); |
328 | 0 | in1 = _mm_mul_ps(in1, in1); |
329 | 0 | in2 = _mm_mul_ps(in2, in2); |
330 | 0 | in3 = _mm_mul_ps(in3, in3); |
331 | 0 |
|
332 | 0 | acc0 = _mm_add_ps(acc0, in0); |
333 | 0 | acc1 = _mm_add_ps(acc1, in1); |
334 | 0 | acc2 = _mm_add_ps(acc2, in2); |
335 | 0 | acc3 = _mm_add_ps(acc3, in3); |
336 | 0 | } |
337 | 0 |
|
338 | 0 | acc0 = _mm_add_ps(acc0, acc1); |
339 | 0 | acc0 = _mm_add_ps(acc0, acc2); |
340 | 0 | acc0 = _mm_add_ps(acc0, acc3); |
341 | 0 |
|
342 | 0 | _mm_store_ps(out, acc0); |
343 | 0 |
|
344 | 0 | return out[0] + out[1] + out[2] + out[3]; |
345 | 0 | } |
346 | | |
347 | | } |