/src/mozilla-central/dom/media/webaudio/AudioNodeEngineSSE2.cpp

Source (jump to first uncovered line)
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "AudioNodeEngineSSE2.h"
#include "AlignmentUtils.h"
#include <emmintrin.h>


namespace mozilla {
/*
 * Mixes a scaled copy of aInput into aOutput. For every sample:
 *   aOutput[i] = aOutput[i] + aInput[i] * aScale
 *
 * Both buffers are accessed with aligned SSE loads/stores, and each loop
 * iteration consumes 16 floats with no tail loop, which is what the
 * alignment and size assertions below check for.
 */
void
AudioBufferAddWithScale_SSE(const float* aInput,
                            float aScale,
                            float* aOutput,
                            uint32_t aSize)
{
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  /* Broadcast the scalar gain into all four lanes once, outside the loop. */
  const __m128 gain = _mm_set1_ps(aScale);

  for (unsigned base = 0; base < aSize; base += 16) {
    const float* src = aInput + base;
    float* dst = aOutput + base;

    /* Scale four vectors (16 samples) of input. */
    const __m128 scaledA = _mm_mul_ps(_mm_load_ps(src), gain);
    const __m128 scaledB = _mm_mul_ps(_mm_load_ps(src + 4), gain);
    const __m128 scaledC = _mm_mul_ps(_mm_load_ps(src + 8), gain);
    const __m128 scaledD = _mm_mul_ps(_mm_load_ps(src + 12), gain);

    /* Read the current output samples and accumulate the scaled input. */
    const __m128 mixedA = _mm_add_ps(_mm_load_ps(dst), scaledA);
    const __m128 mixedB = _mm_add_ps(_mm_load_ps(dst + 4), scaledB);
    const __m128 mixedC = _mm_add_ps(_mm_load_ps(dst + 8), scaledC);
    const __m128 mixedD = _mm_add_ps(_mm_load_ps(dst + 12), scaledD);

    /* Only write back once everything for this group has been read. */
    _mm_store_ps(dst, mixedA);
    _mm_store_ps(dst + 4, mixedB);
    _mm_store_ps(dst + 8, mixedC);
    _mm_store_ps(dst + 12, mixedD);
  }
}

/*
 * Writes one block of aInput, multiplied by the constant aScale, to aOutput:
 *   aOutput[i] = aInput[i] * aScale   for i in [0, WEBAUDIO_BLOCK_SIZE)
 *
 * Aligned SSE loads/stores are used, so both pointers must be 16-byte
 * aligned. The loop advances 16 floats at a time with no tail handling.
 */
void
AudioBlockCopyChannelWithScale_SSE(const float* aInput,
                                   float aScale,
                                   float* aOutput)
{
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aOutput);

  /* Same gain in every lane. */
  const __m128 gain = _mm_set1_ps(aScale);

  for (unsigned base = 0; base < WEBAUDIO_BLOCK_SIZE; base += 16) {
    const float* src = aInput + base;
    float* dst = aOutput + base;

    /* Read and scale all 16 samples before writing any of them back. */
    const __m128 scaledA = _mm_mul_ps(_mm_load_ps(src), gain);
    const __m128 scaledB = _mm_mul_ps(_mm_load_ps(src + 4), gain);
    const __m128 scaledC = _mm_mul_ps(_mm_load_ps(src + 8), gain);
    const __m128 scaledD = _mm_mul_ps(_mm_load_ps(src + 12), gain);

    _mm_store_ps(dst, scaledA);
    _mm_store_ps(dst + 4, scaledB);
    _mm_store_ps(dst + 8, scaledC);
    _mm_store_ps(dst + 12, scaledD);
  }
}

/*
 * Per-sample gain variant: multiplies one block of aInput element-wise by
 * the gain curve in aScale and writes the product to aOutput:
 *   aOutput[i] = aInput[i] * aScale[i]   for i in [0, WEBAUDIO_BLOCK_SIZE)
 *
 * All three buffers are accessed with aligned SSE loads/stores and must be
 * 16-byte aligned. The loop advances 16 floats at a time with no tail
 * handling.
 */
void
AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
                                   const float aScale[WEBAUDIO_BLOCK_SIZE],
                                   float aOutput[WEBAUDIO_BLOCK_SIZE])
{
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aScale);
  ASSERT_ALIGNED16(aOutput);

  for (unsigned base = 0; base < WEBAUDIO_BLOCK_SIZE; base += 16) {
    const float* gains = aScale + base;
    const float* src = aInput + base;
    float* dst = aOutput + base;

    /* Four vectors of per-sample gains... */
    const __m128 gainA = _mm_load_ps(gains);
    const __m128 gainB = _mm_load_ps(gains + 4);
    const __m128 gainC = _mm_load_ps(gains + 8);
    const __m128 gainD = _mm_load_ps(gains + 12);

    /* ...applied lane by lane to the matching input samples. */
    const __m128 productA = _mm_mul_ps(_mm_load_ps(src), gainA);
    const __m128 productB = _mm_mul_ps(_mm_load_ps(src + 4), gainB);
    const __m128 productC = _mm_mul_ps(_mm_load_ps(src + 8), gainC);
    const __m128 productD = _mm_mul_ps(_mm_load_ps(src + 12), gainD);

    _mm_store_ps(dst, productA);
    _mm_store_ps(dst + 4, productB);
    _mm_store_ps(dst + 8, productC);
    _mm_store_ps(dst + 12, productD);
  }
}

/*
 * Multiplies every sample of aBlock by the constant aScale, in place:
 *   aBlock[i] = aBlock[i] * aScale   for i in [0, aSize)
 *
 * aBlock is accessed with aligned SSE loads/stores and must be 16-byte
 * aligned. Each iteration handles 16 floats and there is no tail loop, so
 * aSize is expected to be a multiple of 16.
 */
void
AudioBufferInPlaceScale_SSE(float* aBlock,
                            float aScale,
                            uint32_t aSize)
{
  ASSERT_ALIGNED16(aBlock);
  ASSERT_MULTIPLE16(aSize);

  /* Same gain in every lane. */
  const __m128 gain = _mm_set1_ps(aScale);

  for (unsigned base = 0; base < aSize; base += 16) {
    float* samples = aBlock + base;

    const __m128 scaledA = _mm_mul_ps(_mm_load_ps(samples), gain);
    const __m128 scaledB = _mm_mul_ps(_mm_load_ps(samples + 4), gain);
    const __m128 scaledC = _mm_mul_ps(_mm_load_ps(samples + 8), gain);
    const __m128 scaledD = _mm_mul_ps(_mm_load_ps(samples + 12), gain);

    /* Overwrite the samples that were just read. */
    _mm_store_ps(samples, scaledA);
    _mm_store_ps(samples + 4, scaledB);
    _mm_store_ps(samples + 8, scaledC);
    _mm_store_ps(samples + 12, scaledD);
  }
}

/*
 * Per-sample gain variant: multiplies aBlock element-wise by aScale, in
 * place:
 *   aBlock[i] = aBlock[i] * aScale[i]   for i in [0, aSize)
 *
 * Both aBlock and aScale are read with aligned SSE loads (_mm_load_ps), so
 * both must be 16-byte aligned. Each iteration handles 16 floats and there
 * is no tail loop, so aSize is expected to be a multiple of 16.
 */
void
AudioBufferInPlaceScale_SSE(float* aBlock,
                            float* aScale,
                            uint32_t aSize)
{
  __m128 vout0, vout1, vout2, vout3,
         vgain0, vgain1, vgain2, vgain3,
         vin0, vin1, vin2, vin3;

  ASSERT_ALIGNED16(aBlock);
  /* aScale goes through _mm_load_ps just like aBlock, so it carries the
   * same alignment requirement and is checked the same way. */
  ASSERT_ALIGNED16(aScale);
  ASSERT_MULTIPLE16(aSize);

  for (unsigned i = 0; i < aSize; i+=16) {
    /* 16 samples to be scaled... */
    vin0 = _mm_load_ps(&aBlock[i]);
    vin1 = _mm_load_ps(&aBlock[i + 4]);
    vin2 = _mm_load_ps(&aBlock[i + 8]);
    vin3 = _mm_load_ps(&aBlock[i + 12]);
    /* ...and the 16 matching per-sample gains. */
    vgain0 = _mm_load_ps(&aScale[i]);
    vgain1 = _mm_load_ps(&aScale[i + 4]);
    vgain2 = _mm_load_ps(&aScale[i + 8]);
    vgain3 = _mm_load_ps(&aScale[i + 12]);
    vout0 = _mm_mul_ps(vin0, vgain0);
    vout1 = _mm_mul_ps(vin1, vgain1);
    vout2 = _mm_mul_ps(vin2, vgain2);
    vout3 = _mm_mul_ps(vin3, vgain3);
    /* Write the products back over the samples that were read. */
    _mm_store_ps(&aBlock[i], vout0);
    _mm_store_ps(&aBlock[i + 4], vout1);
    _mm_store_ps(&aBlock[i + 8], vout2);
    _mm_store_ps(&aBlock[i + 12], vout3);
  }
}

/*
 * Pans one stereo block (WEBAUDIO_BLOCK_SIZE samples per channel).
 *
 * When aIsOnTheLeft is true:
 *   aOutputL[i] = aInputL[i] + aInputR[i] * aGainL
 *   aOutputR[i] = aInputR[i] * aGainR
 * Otherwise:
 *   aOutputL[i] = aInputL[i] * aGainL
 *   aOutputR[i] = aInputR[i] + aInputL[i] * aGainR
 *
 * All four buffers are accessed with aligned SSE loads/stores and must be
 * 16-byte aligned. Each loop iteration handles 8 samples per channel.
 */
void
AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                float aGainL, float aGainR, bool aIsOnTheLeft,
                                float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                float aOutputR[WEBAUDIO_BLOCK_SIZE])
{
  ASSERT_ALIGNED16(aInputL);
  ASSERT_ALIGNED16(aInputR);
  ASSERT_ALIGNED16(aOutputL);
  ASSERT_ALIGNED16(aOutputR);

  /* Broadcast each gain into all four lanes. */
  const __m128 gainLeft = _mm_set1_ps(aGainL);
  const __m128 gainRight = _mm_set1_ps(aGainR);

  if (!aIsOnTheLeft) {
    for (unsigned base = 0; base < WEBAUDIO_BLOCK_SIZE; base += 8) {
      const __m128 leftLo = _mm_load_ps(aInputL + base);
      const __m128 rightLo = _mm_load_ps(aInputR + base);
      const __m128 leftHi = _mm_load_ps(aInputL + base + 4);
      const __m128 rightHi = _mm_load_ps(aInputR + base + 4);

      /* left channel : aOutputL = aInputL * gainL */
      _mm_store_ps(aOutputL + base, _mm_mul_ps(leftLo, gainLeft));
      _mm_store_ps(aOutputL + base + 4, _mm_mul_ps(leftHi, gainLeft));

      /* right channel : aOutputR = aInputR + aInputL * gainR */
      const __m128 bleedLo = _mm_mul_ps(leftLo, gainRight);
      const __m128 bleedHi = _mm_mul_ps(leftHi, gainRight);
      _mm_store_ps(aOutputR + base, _mm_add_ps(bleedLo, rightLo));
      _mm_store_ps(aOutputR + base + 4, _mm_add_ps(bleedHi, rightHi));
    }
    return;
  }

  for (unsigned base = 0; base < WEBAUDIO_BLOCK_SIZE; base += 8) {
    const __m128 leftLo = _mm_load_ps(aInputL + base);
    const __m128 rightLo = _mm_load_ps(aInputR + base);
    const __m128 leftHi = _mm_load_ps(aInputL + base + 4);
    const __m128 rightHi = _mm_load_ps(aInputR + base + 4);

    /* left channel : aOutputL = aInputL + aInputR * gainL */
    const __m128 bleedLo = _mm_mul_ps(rightLo, gainLeft);
    const __m128 bleedHi = _mm_mul_ps(rightHi, gainLeft);
    _mm_store_ps(aOutputL + base, _mm_add_ps(bleedLo, leftLo));
    _mm_store_ps(aOutputL + base + 4, _mm_add_ps(bleedHi, leftHi));

    /* right channel : aOutputR = aInputR * gainR */
    _mm_store_ps(aOutputR + base, _mm_mul_ps(rightLo, gainRight));
    _mm_store_ps(aOutputR + base + 4, _mm_mul_ps(rightHi, gainRight));
  }
}

/*
 * Element-wise complex multiplication of two buffers of interleaved
 * (real, imaginary) float pairs:
 *   aOutput[k] = aInput[k] * aScale[k]   for k in [0, aSize)
 * where each complex value occupies two consecutive floats, so aSize * 2
 * floats are read from each input and written to aOutput.
 *
 * All three buffers are accessed with aligned SSE loads/stores and must be
 * 16-byte aligned. Each iteration handles 16 floats (8 complex values) and
 * there is no tail loop.
 */
void
BufferComplexMultiply_SSE(const float* aInput,
                          const float* aScale,
                          float* aOutput,
                          uint32_t aSize)
{
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aScale);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  /* Two floats per complex value. */
  const uint32_t floatCount = aSize * 2;

  for (unsigned base = 0; base < floatCount; base += 16) {
    const float* lhs = aInput + base;
    const float* rhs = aScale + base;
    float* dst = aOutput + base;

    /* De-interleave aInput: even lanes are real parts, odd lanes are
     * imaginary parts. Each shuffle gathers four values from two vectors. */
    const __m128 lhs0 = _mm_load_ps(lhs);
    const __m128 lhs1 = _mm_load_ps(lhs + 4);
    const __m128 lhs2 = _mm_load_ps(lhs + 8);
    const __m128 lhs3 = _mm_load_ps(lhs + 12);
    const __m128 lhsRealLo = _mm_shuffle_ps(lhs0, lhs1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 lhsImagLo = _mm_shuffle_ps(lhs0, lhs1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 lhsRealHi = _mm_shuffle_ps(lhs2, lhs3, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 lhsImagHi = _mm_shuffle_ps(lhs2, lhs3, _MM_SHUFFLE(3, 1, 3, 1));

    /* Same split for aScale. */
    const __m128 rhs0 = _mm_load_ps(rhs);
    const __m128 rhs1 = _mm_load_ps(rhs + 4);
    const __m128 rhs2 = _mm_load_ps(rhs + 8);
    const __m128 rhs3 = _mm_load_ps(rhs + 12);
    const __m128 rhsRealLo = _mm_shuffle_ps(rhs0, rhs1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 rhsImagLo = _mm_shuffle_ps(rhs0, rhs1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 rhsRealHi = _mm_shuffle_ps(rhs2, rhs3, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 rhsImagHi = _mm_shuffle_ps(rhs2, rhs3, _MM_SHUFFLE(3, 1, 3, 1));

    /* (a + bi)(c + di) = (ac - bd) + (ad + bc)i */
    const __m128 prodRealLo = _mm_sub_ps(_mm_mul_ps(lhsRealLo, rhsRealLo),
                                         _mm_mul_ps(lhsImagLo, rhsImagLo));
    const __m128 prodImagLo = _mm_add_ps(_mm_mul_ps(lhsRealLo, rhsImagLo),
                                         _mm_mul_ps(lhsImagLo, rhsRealLo));
    const __m128 prodRealHi = _mm_sub_ps(_mm_mul_ps(lhsRealHi, rhsRealHi),
                                         _mm_mul_ps(lhsImagHi, rhsImagHi));
    const __m128 prodImagHi = _mm_add_ps(_mm_mul_ps(lhsRealHi, rhsImagHi),
                                         _mm_mul_ps(lhsImagHi, rhsRealHi));

    /* Re-interleave real/imaginary lanes back into (re, im) pairs. */
    _mm_store_ps(dst, _mm_unpacklo_ps(prodRealLo, prodImagLo));
    _mm_store_ps(dst + 4, _mm_unpackhi_ps(prodRealLo, prodImagLo));
    _mm_store_ps(dst + 8, _mm_unpacklo_ps(prodRealHi, prodImagHi));
    _mm_store_ps(dst + 12, _mm_unpackhi_ps(prodRealHi, prodImagHi));
  }
}

/*
 * Returns the sum of the squares of the first aLength samples of aInput.
 *
 * aInput is read with aligned SSE loads and must be 16-byte aligned. Each
 * iteration handles 16 floats and there is no tail loop, so aLength is
 * expected to be a multiple of 16.
 *
 * Four independent vector accumulators are kept (one per load in the loop
 * body) and only combined after the loop, so the floating-point summation
 * order differs from a plain sequential scalar loop.
 */
float
AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength)
{
  unsigned i;
  __m128 in0, in1, in2, in3,
         acc0, acc1, acc2, acc3;
  float out[4];

  ASSERT_ALIGNED16(aInput);
  ASSERT_MULTIPLE16(aLength);

  acc0 = _mm_setzero_ps();
  acc1 = _mm_setzero_ps();
  acc2 = _mm_setzero_ps();
  acc3 = _mm_setzero_ps();

  for (i = 0; i < aLength; i+=16) {
    in0 = _mm_load_ps(&aInput[i]);
    in1 = _mm_load_ps(&aInput[i + 4]);
    in2 = _mm_load_ps(&aInput[i + 8]);
    in3 = _mm_load_ps(&aInput[i + 12]);

    /* Square each sample... */
    in0 = _mm_mul_ps(in0, in0);
    in1 = _mm_mul_ps(in1, in1);
    in2 = _mm_mul_ps(in2, in2);
    in3 = _mm_mul_ps(in3, in3);

    /* ...and add it to the running per-lane totals. */
    acc0 = _mm_add_ps(acc0, in0);
    acc1 = _mm_add_ps(acc1, in1);
    acc2 = _mm_add_ps(acc2, in2);
    acc3 = _mm_add_ps(acc3, in3);
  }

  /* Fold the four accumulators into one vector of four partial sums. */
  acc0 = _mm_add_ps(acc0, acc1);
  acc0 = _mm_add_ps(acc0, acc2);
  acc0 = _mm_add_ps(acc0, acc3);

  /* `out` is a plain float array, which is only guaranteed 4-byte
   * alignment, so the aligned _mm_store_ps must not be used on it. */
  _mm_storeu_ps(out, acc0);

  /* Horizontal sum of the four lanes. */
  return out[0] + out[1] + out[2] + out[3];
}

}

Coverage Report

Created: 2018-09-25 14:53

Line	Count	Source (jump to first uncovered line)
1		/* -- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -- */
2		/* this source code form is subject to the terms of the mozilla public
3		* license, v. 2.0. if a copy of the mpl was not distributed with this file,
4		* You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6		#include "AudioNodeEngineSSE2.h"
7		#include "AlignmentUtils.h"
8		#include <emmintrin.h>
9
10
11		namespace mozilla {
12		void
13		AudioBufferAddWithScale_SSE(const float* aInput,
14		float aScale,
15		float* aOutput,
16		uint32_t aSize)
17	0	{
18	0	__m128 vin0, vin1, vin2, vin3,
19	0	vscaled0, vscaled1, vscaled2, vscaled3,
20	0	vout0, vout1, vout2, vout3,
21	0	vgain;
22	0
23	0	ASSERT_ALIGNED16(aInput);
24	0	ASSERT_ALIGNED16(aOutput);
25	0	ASSERT_MULTIPLE16(aSize);
26	0
27	0	vgain = _mm_load1_ps(&aScale);
28	0
29	0	for (unsigned i = 0; i < aSize; i+=16) {
30	0	vin0 = _mm_load_ps(&aInput[i]);
31	0	vin1 = _mm_load_ps(&aInput[i + 4]);
32	0	vin2 = _mm_load_ps(&aInput[i + 8]);
33	0	vin3 = _mm_load_ps(&aInput[i + 12]);
34	0
35	0	vscaled0 = _mm_mul_ps(vin0, vgain);
36	0	vscaled1 = _mm_mul_ps(vin1, vgain);
37	0	vscaled2 = _mm_mul_ps(vin2, vgain);
38	0	vscaled3 = _mm_mul_ps(vin3, vgain);
39	0
40	0	vin0 = _mm_load_ps(&aOutput[i]);
41	0	vin1 = _mm_load_ps(&aOutput[i + 4]);
42	0	vin2 = _mm_load_ps(&aOutput[i + 8]);
43	0	vin3 = _mm_load_ps(&aOutput[i + 12]);
44	0
45	0	vout0 = _mm_add_ps(vin0, vscaled0);
46	0	vout1 = _mm_add_ps(vin1, vscaled1);
47	0	vout2 = _mm_add_ps(vin2, vscaled2);
48	0	vout3 = _mm_add_ps(vin3, vscaled3);
49	0
50	0	_mm_store_ps(&aOutput[i], vout0);
51	0	_mm_store_ps(&aOutput[i + 4], vout1);
52	0	_mm_store_ps(&aOutput[i + 8], vout2);
53	0	_mm_store_ps(&aOutput[i + 12], vout3);
54	0	}
55	0	}
56
57		void
58		AudioBlockCopyChannelWithScale_SSE(const float* aInput,
59		float aScale,
60		float* aOutput)
61	0	{
62	0	__m128 vin0, vin1, vin2, vin3,
63	0	vout0, vout1, vout2, vout3;
64	0
65	0	ASSERT_ALIGNED16(aInput);
66	0	ASSERT_ALIGNED16(aOutput);
67	0
68	0	__m128 vgain = _mm_load1_ps(&aScale);
69	0
70	0	for (unsigned i = 0 ; i < WEBAUDIO_BLOCK_SIZE; i+=16) {
71	0	vin0 = _mm_load_ps(&aInput[i]);
72	0	vin1 = _mm_load_ps(&aInput[i + 4]);
73	0	vin2 = _mm_load_ps(&aInput[i + 8]);
74	0	vin3 = _mm_load_ps(&aInput[i + 12]);
75	0	vout0 = _mm_mul_ps(vin0, vgain);
76	0	vout1 = _mm_mul_ps(vin1, vgain);
77	0	vout2 = _mm_mul_ps(vin2, vgain);
78	0	vout3 = _mm_mul_ps(vin3, vgain);
79	0	_mm_store_ps(&aOutput[i], vout0);
80	0	_mm_store_ps(&aOutput[i + 4], vout1);
81	0	_mm_store_ps(&aOutput[i + 8], vout2);
82	0	_mm_store_ps(&aOutput[i + 12], vout3);
83	0	}
84	0	}
85
86		void
87		AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
88		const float aScale[WEBAUDIO_BLOCK_SIZE],
89		float aOutput[WEBAUDIO_BLOCK_SIZE])
90	0	{
91	0	__m128 vin0, vin1, vin2, vin3,
92	0	vscaled0, vscaled1, vscaled2, vscaled3,
93	0	vout0, vout1, vout2, vout3;
94	0
95	0	ASSERT_ALIGNED16(aInput);
96	0	ASSERT_ALIGNED16(aScale);
97	0	ASSERT_ALIGNED16(aOutput);
98	0
99	0	for (unsigned i = 0 ; i < WEBAUDIO_BLOCK_SIZE; i+=16) {
100	0	vscaled0 = _mm_load_ps(&aScale[i]);
101	0	vscaled1 = _mm_load_ps(&aScale[i+4]);
102	0	vscaled2 = _mm_load_ps(&aScale[i+8]);
103	0	vscaled3 = _mm_load_ps(&aScale[i+12]);
104	0
105	0	vin0 = _mm_load_ps(&aInput[i]);
106	0	vin1 = _mm_load_ps(&aInput[i + 4]);
107	0	vin2 = _mm_load_ps(&aInput[i + 8]);
108	0	vin3 = _mm_load_ps(&aInput[i + 12]);
109	0
110	0	vout0 = _mm_mul_ps(vin0, vscaled0);
111	0	vout1 = _mm_mul_ps(vin1, vscaled1);
112	0	vout2 = _mm_mul_ps(vin2, vscaled2);
113	0	vout3 = _mm_mul_ps(vin3, vscaled3);
114	0
115	0	_mm_store_ps(&aOutput[i], vout0);
116	0	_mm_store_ps(&aOutput[i + 4], vout1);
117	0	_mm_store_ps(&aOutput[i + 8], vout2);
118	0	_mm_store_ps(&aOutput[i + 12], vout3);
119	0	}
120	0	}
121
122		void
123		AudioBufferInPlaceScale_SSE(float* aBlock,
124		float aScale,
125		uint32_t aSize)
126	0	{
127	0	__m128 vout0, vout1, vout2, vout3,
128	0	vin0, vin1, vin2, vin3;
129	0
130	0	ASSERT_ALIGNED16(aBlock);
131	0	ASSERT_MULTIPLE16(aSize);
132	0
133	0	__m128 vgain = _mm_load1_ps(&aScale);
134	0
135	0	for (unsigned i = 0; i < aSize; i+=16) {
136	0	vin0 = _mm_load_ps(&aBlock[i]);
137	0	vin1 = _mm_load_ps(&aBlock[i + 4]);
138	0	vin2 = _mm_load_ps(&aBlock[i + 8]);
139	0	vin3 = _mm_load_ps(&aBlock[i + 12]);
140	0	vout0 = _mm_mul_ps(vin0, vgain);
141	0	vout1 = _mm_mul_ps(vin1, vgain);
142	0	vout2 = _mm_mul_ps(vin2, vgain);
143	0	vout3 = _mm_mul_ps(vin3, vgain);
144	0	_mm_store_ps(&aBlock[i], vout0);
145	0	_mm_store_ps(&aBlock[i + 4], vout1);
146	0	_mm_store_ps(&aBlock[i + 8], vout2);
147	0	_mm_store_ps(&aBlock[i + 12], vout3);
148	0	}
149	0	}
150
151		void
152		AudioBufferInPlaceScale_SSE(float* aBlock,
153		float* aScale,
154		uint32_t aSize)
155	0	{
156	0	__m128 vout0, vout1, vout2, vout3,
157	0	vgain0, vgain1, vgain2, vgain3,
158	0	vin0, vin1, vin2, vin3;
159	0
160	0	ASSERT_ALIGNED16(aBlock);
161	0	ASSERT_MULTIPLE16(aSize);
162	0
163	0	for (unsigned i = 0; i < aSize; i+=16) {
164	0	vin0 = _mm_load_ps(&aBlock[i]);
165	0	vin1 = _mm_load_ps(&aBlock[i + 4]);
166	0	vin2 = _mm_load_ps(&aBlock[i + 8]);
167	0	vin3 = _mm_load_ps(&aBlock[i + 12]);
168	0	vgain0 = _mm_load_ps(&aScale[i]);
169	0	vgain1 = _mm_load_ps(&aScale[i + 4]);
170	0	vgain2 = _mm_load_ps(&aScale[i + 8]);
171	0	vgain3 = _mm_load_ps(&aScale[i + 12]);
172	0	vout0 = _mm_mul_ps(vin0, vgain0);
173	0	vout1 = _mm_mul_ps(vin1, vgain1);
174	0	vout2 = _mm_mul_ps(vin2, vgain2);
175	0	vout3 = _mm_mul_ps(vin3, vgain3);
176	0	_mm_store_ps(&aBlock[i], vout0);
177	0	_mm_store_ps(&aBlock[i + 4], vout1);
178	0	_mm_store_ps(&aBlock[i + 8], vout2);
179	0	_mm_store_ps(&aBlock[i + 12], vout3);
180	0	}
181	0	}
182
183		void
184		AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
185		const float aInputR[WEBAUDIO_BLOCK_SIZE],
186		float aGainL, float aGainR, bool aIsOnTheLeft,
187		float aOutputL[WEBAUDIO_BLOCK_SIZE],
188		float aOutputR[WEBAUDIO_BLOCK_SIZE])
189	0	{
190	0	__m128 vinl0, vinr0, vinl1, vinr1,
191	0	vout0, vout1,
192	0	vscaled0, vscaled1,
193	0	vgainl, vgainr;
194	0
195	0	ASSERT_ALIGNED16(aInputL);
196	0	ASSERT_ALIGNED16(aInputR);
197	0	ASSERT_ALIGNED16(aOutputL);
198	0	ASSERT_ALIGNED16(aOutputR);
199	0
200	0	vgainl = _mm_load1_ps(&aGainL);
201	0	vgainr = _mm_load1_ps(&aGainR);
202	0
203	0	if (aIsOnTheLeft) {
204	0	for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=8) {
205	0	vinl0 = _mm_load_ps(&aInputL[i]);
206	0	vinr0 = _mm_load_ps(&aInputR[i]);
207	0	vinl1 = _mm_load_ps(&aInputL[i+4]);
208	0	vinr1 = _mm_load_ps(&aInputR[i+4]);
209	0
210	0	/* left channel : aOutputL = aInputL + aInputR * gainL */
211	0	vscaled0 = _mm_mul_ps(vinr0, vgainl);
212	0	vscaled1 = _mm_mul_ps(vinr1, vgainl);
213	0	vout0 = _mm_add_ps(vscaled0, vinl0);
214	0	vout1 = _mm_add_ps(vscaled1, vinl1);
215	0	_mm_store_ps(&aOutputL[i], vout0);
216	0	_mm_store_ps(&aOutputL[i+4], vout1);
217	0
218	0	/* right channel : aOutputR = aInputR * gainR */
219	0	vscaled0 = _mm_mul_ps(vinr0, vgainr);
220	0	vscaled1 = _mm_mul_ps(vinr1, vgainr);
221	0	_mm_store_ps(&aOutputR[i], vscaled0);
222	0	_mm_store_ps(&aOutputR[i+4], vscaled1);
223	0	}
224	0	} else {
225	0	for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=8) {
226	0	vinl0 = _mm_load_ps(&aInputL[i]);
227	0	vinr0 = _mm_load_ps(&aInputR[i]);
228	0	vinl1 = _mm_load_ps(&aInputL[i+4]);
229	0	vinr1 = _mm_load_ps(&aInputR[i+4]);
230	0
231	0	/* left channel : aInputL * gainL */
232	0	vscaled0 = _mm_mul_ps(vinl0, vgainl);
233	0	vscaled1 = _mm_mul_ps(vinl1, vgainl);
234	0	_mm_store_ps(&aOutputL[i], vscaled0);
235	0	_mm_store_ps(&aOutputL[i+4], vscaled1);
236	0
237	0	/* right channel: aOutputR = aInputR + aInputL * gainR */
238	0	vscaled0 = _mm_mul_ps(vinl0, vgainr);
239	0	vscaled1 = _mm_mul_ps(vinl1, vgainr);
240	0	vout0 = _mm_add_ps(vscaled0, vinr0);
241	0	vout1 = _mm_add_ps(vscaled1, vinr1);
242	0	_mm_store_ps(&aOutputR[i], vout0);
243	0	_mm_store_ps(&aOutputR[i+4], vout1);
244	0	}
245	0	}
246	0	}
247
248		void BufferComplexMultiply_SSE(const float* aInput,
249		const float* aScale,
250		float* aOutput,
251		uint32_t aSize)
252	0	{
253	0	unsigned i;
254	0	__m128 in0, in1, in2, in3,
255	0	outreal0, outreal1, outreal2, outreal3,
256	0	outimag0, outimag1, outimag2, outimag3;
257	0
258	0	ASSERT_ALIGNED16(aInput);
259	0	ASSERT_ALIGNED16(aScale);
260	0	ASSERT_ALIGNED16(aOutput);
261	0	ASSERT_MULTIPLE16(aSize);
262	0
263	0	for (i = 0; i < aSize * 2; i += 16) {
264	0	in0 = _mm_load_ps(&aInput[i]);
265	0	in1 = _mm_load_ps(&aInput[i + 4]);
266	0	in2 = _mm_load_ps(&aInput[i + 8]);
267	0	in3 = _mm_load_ps(&aInput[i + 12]);
268	0
269	0	outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
270	0	outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
271	0	outreal2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
272	0	outimag2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
273	0
274	0	in0 = _mm_load_ps(&aScale[i]);
275	0	in1 = _mm_load_ps(&aScale[i + 4]);
276	0	in2 = _mm_load_ps(&aScale[i + 8]);
277	0	in3 = _mm_load_ps(&aScale[i + 12]);
278	0
279	0	outreal1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
280	0	outimag1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
281	0	outreal3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
282	0	outimag3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
283	0
284	0	in0 = _mm_sub_ps(_mm_mul_ps(outreal0, outreal1),
285	0	_mm_mul_ps(outimag0, outimag1));
286	0	in1 = _mm_add_ps(_mm_mul_ps(outreal0, outimag1),
287	0	_mm_mul_ps(outimag0, outreal1));
288	0	in2 = _mm_sub_ps(_mm_mul_ps(outreal2, outreal3),
289	0	_mm_mul_ps(outimag2, outimag3));
290	0	in3 = _mm_add_ps(_mm_mul_ps(outreal2, outimag3),
291	0	_mm_mul_ps(outimag2, outreal3));
292	0
293	0	outreal0 = _mm_unpacklo_ps(in0, in1);
294	0	outreal1 = _mm_unpackhi_ps(in0, in1);
295	0	outreal2 = _mm_unpacklo_ps(in2, in3);
296	0	outreal3 = _mm_unpackhi_ps(in2, in3);
297	0
298	0	_mm_store_ps(&aOutput[i], outreal0);
299	0	_mm_store_ps(&aOutput[i + 4], outreal1);
300	0	_mm_store_ps(&aOutput[i + 8], outreal2);
301	0	_mm_store_ps(&aOutput[i + 12], outreal3);
302	0	}
303	0	}
304
305		float
306		AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength)
307	0	{
308	0	unsigned i;
309	0	__m128 in0, in1, in2, in3,
310	0	acc0, acc1, acc2, acc3;
311	0	float out[4];
312	0
313	0	ASSERT_ALIGNED16(aInput);
314	0	ASSERT_MULTIPLE16(aLength);
315	0
316	0	acc0 = _mm_setzero_ps();
317	0	acc1 = _mm_setzero_ps();
318	0	acc2 = _mm_setzero_ps();
319	0	acc3 = _mm_setzero_ps();
320	0
321	0	for (i = 0; i < aLength; i+=16) {
322	0	in0 = _mm_load_ps(&aInput[i]);
323	0	in1 = _mm_load_ps(&aInput[i + 4]);
324	0	in2 = _mm_load_ps(&aInput[i + 8]);
325	0	in3 = _mm_load_ps(&aInput[i + 12]);
326	0
327	0	in0 = _mm_mul_ps(in0, in0);
328	0	in1 = _mm_mul_ps(in1, in1);
329	0	in2 = _mm_mul_ps(in2, in2);
330	0	in3 = _mm_mul_ps(in3, in3);
331	0
332	0	acc0 = _mm_add_ps(acc0, in0);
333	0	acc1 = _mm_add_ps(acc1, in1);
334	0	acc2 = _mm_add_ps(acc2, in2);
335	0	acc3 = _mm_add_ps(acc3, in3);
336	0	}
337	0
338	0	acc0 = _mm_add_ps(acc0, acc1);
339	0	acc0 = _mm_add_ps(acc0, acc2);
340	0	acc0 = _mm_add_ps(acc0, acc3);
341	0
342	0	_mm_store_ps(out, acc0);
343	0
344	0	return out[0] + out[1] + out[2] + out[3];
345	0	}
346
347		}