/src/opus/celt/x86/pitch_sse4_1.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright (c) 2014, Cisco Systems, INC |
2 | | Written by XiangMingZhu WeiZhou MinPeng YanWang |
3 | | |
4 | | Redistribution and use in source and binary forms, with or without |
5 | | modification, are permitted provided that the following conditions |
6 | | are met: |
7 | | |
8 | | - Redistributions of source code must retain the above copyright |
9 | | notice, this list of conditions and the following disclaimer. |
10 | | |
11 | | - Redistributions in binary form must reproduce the above copyright |
12 | | notice, this list of conditions and the following disclaimer in the |
13 | | documentation and/or other materials provided with the distribution. |
14 | | |
15 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
16 | | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
17 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
18 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
19 | | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
20 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
21 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
22 | | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
23 | | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
24 | | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | */ |
27 | | |
28 | | #ifdef HAVE_CONFIG_H |
29 | | #include "config.h" |
30 | | #endif |
31 | | |
32 | | #include <xmmintrin.h> |
33 | | #include <emmintrin.h> |
34 | | |
35 | | #include "macros.h" |
36 | | #include "celt_lpc.h" |
37 | | #include "stack_alloc.h" |
38 | | #include "mathops.h" |
39 | | #include "pitch.h" |
40 | | |
41 | | #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) |
42 | | #include <smmintrin.h> |
43 | | #include "x86cpu.h" |
44 | | |
45 | | opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, |
46 | | int N) |
47 | 549M | { |
48 | 549M | opus_int i, dataSize16; |
49 | 549M | opus_int32 sum; |
50 | 549M | __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; |
51 | 549M | __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; |
52 | 549M | __m128i inVec1_3210, inVec2_3210; |
53 | | |
54 | 549M | sum = 0; |
55 | 549M | dataSize16 = N & ~15; |
56 | | |
57 | 549M | acc1 = _mm_setzero_si128(); |
58 | 549M | acc2 = _mm_setzero_si128(); |
59 | | |
60 | 2.39G | for (i=0;i<dataSize16;i+=16) { |
61 | 1.84G | inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); |
62 | 1.84G | inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); |
63 | | |
64 | 1.84G | inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); |
65 | 1.84G | inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); |
66 | | |
67 | 1.84G | inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); |
68 | 1.84G | inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); |
69 | | |
70 | 1.84G | acc1 = _mm_add_epi32(acc1, inVec1_76543210); |
71 | 1.84G | acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); |
72 | 1.84G | } |
73 | | |
74 | 549M | acc1 = _mm_add_epi32(acc1, acc2); |
75 | | |
76 | 549M | if (N - i >= 8) |
77 | 311M | { |
78 | 311M | inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); |
79 | 311M | inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); |
80 | | |
81 | 311M | inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); |
82 | | |
83 | 311M | acc1 = _mm_add_epi32(acc1, inVec1_76543210); |
84 | 311M | i += 8; |
85 | 311M | } |
86 | | |
87 | 549M | if (N - i >= 4) |
88 | 177M | { |
89 | 177M | inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); |
90 | 177M | inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); |
91 | | |
92 | 177M | inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); |
93 | | |
94 | 177M | acc1 = _mm_add_epi32(acc1, inVec1_3210); |
95 | 177M | i += 4; |
96 | 177M | } |
97 | | |
98 | 549M | acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); |
99 | 549M | acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); |
100 | | |
101 | 549M | sum += _mm_cvtsi128_si32(acc1); |
102 | | |
103 | 911M | for (;i<N;i++) |
104 | 362M | { |
105 | 362M | sum = silk_SMLABB(sum, x[i], y[i]); |
106 | 362M | } |
107 | | |
108 | 549M | return sum; |
109 | 549M | } |
110 | | |
111 | | void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) |
112 | 469M | { |
113 | 469M | int j; |
114 | | |
115 | 469M | __m128i vecX, vecX0, vecX1, vecX2, vecX3; |
116 | 469M | __m128i vecY0, vecY1, vecY2, vecY3; |
117 | 469M | __m128i sum0, sum1, sum2, sum3, vecSum; |
118 | 469M | __m128i initSum; |
119 | | |
120 | 469M | #ifdef OPUS_CHECK_ASM |
121 | 469M | opus_val32 sum_c[4]; |
122 | 2.34G | for (j=0;j<4;j++) { |
123 | 1.87G | sum_c[j] = sum[j]; |
124 | 1.87G | } |
125 | 469M | xcorr_kernel_c(x, y, sum_c, len); |
126 | 469M | #endif |
127 | | |
128 | 469M | celt_assert(len >= 3); |
129 | | |
130 | 469M | sum0 = _mm_setzero_si128(); |
131 | 469M | sum1 = _mm_setzero_si128(); |
132 | 469M | sum2 = _mm_setzero_si128(); |
133 | 469M | sum3 = _mm_setzero_si128(); |
134 | | |
135 | 6.19G | for (j=0;j<(len-7);j+=8) |
136 | 5.72G | { |
137 | 5.72G | vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); |
138 | 5.72G | vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); |
139 | 5.72G | vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); |
140 | 5.72G | vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); |
141 | 5.72G | vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); |
142 | | |
143 | 5.72G | sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); |
144 | 5.72G | sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); |
145 | 5.72G | sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); |
146 | 5.72G | sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); |
147 | 5.72G | } |
148 | | |
149 | 469M | sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); |
150 | 469M | sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); |
151 | | |
152 | 469M | sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); |
153 | 469M | sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); |
154 | | |
155 | 469M | sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); |
156 | 469M | sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); |
157 | | |
158 | 469M | sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); |
159 | 469M | sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); |
160 | | |
161 | 469M | vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), |
162 | 469M | _mm_unpacklo_epi32(sum2, sum3)); |
163 | | |
164 | 643M | for (;j<(len-3);j+=4) |
165 | 173M | { |
166 | 173M | vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); |
167 | 173M | vecX0 = _mm_shuffle_epi32(vecX, 0x00); |
168 | 173M | vecX1 = _mm_shuffle_epi32(vecX, 0x55); |
169 | 173M | vecX2 = _mm_shuffle_epi32(vecX, 0xaa); |
170 | 173M | vecX3 = _mm_shuffle_epi32(vecX, 0xff); |
171 | | |
172 | 173M | vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); |
173 | 173M | vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); |
174 | 173M | vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); |
175 | 173M | vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); |
176 | | |
177 | 173M | sum0 = _mm_mullo_epi32(vecX0, vecY0); |
178 | 173M | sum1 = _mm_mullo_epi32(vecX1, vecY1); |
179 | 173M | sum2 = _mm_mullo_epi32(vecX2, vecY2); |
180 | 173M | sum3 = _mm_mullo_epi32(vecX3, vecY3); |
181 | | |
182 | 173M | sum0 = _mm_add_epi32(sum0, sum1); |
183 | 173M | sum2 = _mm_add_epi32(sum2, sum3); |
184 | 173M | vecSum = _mm_add_epi32(vecSum, sum0); |
185 | 173M | vecSum = _mm_add_epi32(vecSum, sum2); |
186 | 173M | } |
187 | | |
188 | 469M | vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]); |
189 | 469M | if (len - j == 3) |
190 | 0 | { |
191 | 0 | vecX0 = _mm_shuffle_epi32(vecX, 0x55); |
192 | 0 | vecX1 = _mm_shuffle_epi32(vecX, 0xaa); |
193 | 0 | vecX2 = _mm_shuffle_epi32(vecX, 0xff); |
194 | |
|
195 | 0 | vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); |
196 | 0 | vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); |
197 | 0 | vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); |
198 | |
|
199 | 0 | sum0 = _mm_mullo_epi32(vecX0, vecY0); |
200 | 0 | sum1 = _mm_mullo_epi32(vecX1, vecY1); |
201 | 0 | sum2 = _mm_mullo_epi32(vecX2, vecY2); |
202 | |
|
203 | 0 | vecSum = _mm_add_epi32(vecSum, sum0); |
204 | 0 | vecSum = _mm_add_epi32(vecSum, sum1); |
205 | 0 | vecSum = _mm_add_epi32(vecSum, sum2); |
206 | 0 | } |
207 | 469M | else if (len - j == 2) |
208 | 78.7M | { |
209 | 78.7M | vecX0 = _mm_shuffle_epi32(vecX, 0xaa); |
210 | 78.7M | vecX1 = _mm_shuffle_epi32(vecX, 0xff); |
211 | | |
212 | 78.7M | vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); |
213 | 78.7M | vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); |
214 | | |
215 | 78.7M | sum0 = _mm_mullo_epi32(vecX0, vecY0); |
216 | 78.7M | sum1 = _mm_mullo_epi32(vecX1, vecY1); |
217 | | |
218 | 78.7M | vecSum = _mm_add_epi32(vecSum, sum0); |
219 | 78.7M | vecSum = _mm_add_epi32(vecSum, sum1); |
220 | 78.7M | } |
221 | 390M | else if (len - j == 1) |
222 | 0 | { |
223 | 0 | vecX0 = _mm_shuffle_epi32(vecX, 0xff); |
224 | |
|
225 | 0 | vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); |
226 | |
|
227 | 0 | sum0 = _mm_mullo_epi32(vecX0, vecY0); |
228 | |
|
229 | 0 | vecSum = _mm_add_epi32(vecSum, sum0); |
230 | 0 | } |
231 | | |
232 | 469M | initSum = _mm_loadu_si128((__m128i *)(&sum[0])); |
233 | 469M | initSum = _mm_add_epi32(initSum, vecSum); |
234 | 469M | _mm_storeu_si128((__m128i *)sum, initSum); |
235 | | |
236 | 469M | #ifdef OPUS_CHECK_ASM |
237 | 469M | celt_assert(!memcmp(sum_c, sum, sizeof(sum_c))); |
238 | 469M | #endif |
239 | 469M | } |
240 | | #endif |