/src/opus/silk/x86/NSQ_del_dec_avx2.c
Line | Count | Source |
1 | | /*********************************************************************** |
2 | | Copyright (c) 2021 Google Inc. |
3 | | Redistribution and use in source and binary forms, with or without |
4 | | modification, are permitted provided that the following conditions |
5 | | are met: |
6 | | - Redistributions of source code must retain the above copyright notice, |
7 | | this list of conditions and the following disclaimer. |
8 | | - Redistributions in binary form must reproduce the above copyright |
9 | | notice, this list of conditions and the following disclaimer in the |
10 | | documentation and/or other materials provided with the distribution. |
11 | | - Neither the name of Internet Society, IETF or IETF Trust, nor the |
12 | | names of specific contributors, may be used to endorse or promote |
13 | | products derived from this software without specific prior written |
14 | | permission. |
15 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
16 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
17 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
18 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
19 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
20 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
21 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
22 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
23 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
24 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
25 | | POSSIBILITY OF SUCH DAMAGE. |
26 | | ***********************************************************************/ |
27 | | |
28 | | #ifdef HAVE_CONFIG_H |
29 | | #include "config.h" |
30 | | #endif |
31 | | |
32 | | #ifdef OPUS_CHECK_ASM |
33 | | #include <string.h> |
34 | | #endif |
35 | | |
36 | | #include "opus_defines.h" |
37 | | #include <immintrin.h> |
38 | | |
39 | | #include "main.h" |
40 | | #include "stack_alloc.h" |
41 | | #include "NSQ.h" |
42 | | #include "celt/x86/x86cpu.h" |
43 | | |
44 | | /* Returns TRUE if all assumptions met */ |
45 | | static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC) |
46 | 61.5M | { |
47 | | /* This optimization is based on these assumptions */ |
48 | | /* These assumptions are fundamental and hence assert are */ |
49 | | /* used. Should any assert triggers, we have to re-visit */ |
50 | | /* all related code to make sure it still functions the */ |
51 | | /* same as the C implementation. */ |
52 | 61.5M | silk_assert(MAX_DEL_DEC_STATES <= 4 && |
53 | 61.5M | MAX_FRAME_LENGTH % 4 == 0 && |
54 | 61.5M | MAX_SUB_FRAME_LENGTH % 4 == 0 && |
55 | 61.5M | LTP_MEM_LENGTH_MS % 4 == 0 ); |
56 | 61.5M | silk_assert(psEncC->fs_kHz == 8 || |
57 | 61.5M | psEncC->fs_kHz == 12 || |
58 | 61.5M | psEncC->fs_kHz == 16 ); |
59 | 61.5M | silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR && |
60 | 61.5M | psEncC->nb_subfr > 0 ); |
61 | 61.5M | silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES && |
62 | 61.5M | psEncC->nStatesDelayedDecision > 0 ); |
63 | 61.5M | silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS); |
64 | | |
65 | | /* Regressions were observed on certain AMD Zen CPUs when */ |
66 | | /* nStatesDelayedDecision is 1 or 2. Ideally we should detect */ |
67 | | /* these CPUs and enable this optimization on others; however, */ |
68 | | /* there is no good way to do so under current OPUS framework. */ |
69 | 61.5M | return psEncC->nStatesDelayedDecision == 3 || |
70 | 47.7M | psEncC->nStatesDelayedDecision == 4; |
71 | 61.5M | } |
72 | | |
73 | | /* Intrinsics not defined on MSVC */ |
74 | | #ifdef _MSC_VER |
75 | | #include <intsafe.h> |
76 | | static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res) |
77 | | { |
78 | | *res = a+b; |
79 | | return (*res ^ a) & (*res ^ b) & 0x80000000; |
80 | | } |
81 | | static inline int __builtin_ctz(unsigned int x) |
82 | | { |
83 | | DWORD res = 0; |
84 | | return _BitScanForward(&res, x) ? res : 32; |
85 | | } |
86 | | #endif |
87 | | |
88 | | static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num) |
89 | 208G | { |
90 | 208G | return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1))); |
91 | 208G | } |
92 | | |
93 | | static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num) |
94 | 4.39G | { |
95 | 4.39G | num = num > silk_int16_MAX ? silk_int16_MAX : num; |
96 | 4.39G | num = num < silk_int16_MIN ? silk_int16_MIN : num; |
97 | 4.39G | return num; |
98 | 4.39G | } |
99 | | |
100 | | static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits) |
101 | 4.48G | { |
102 | 4.48G | silk_assert(bits > 0 && bits < 31); |
103 | 4.48G | a += 1 << (bits-1); |
104 | 4.48G | return a >> bits; |
105 | 4.48G | } |
106 | | |
107 | | static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits) |
108 | 4.32G | { |
109 | | #ifndef OPUS_CHECK_ASM |
110 | | opus_int64 t; |
111 | | #endif |
112 | 4.32G | silk_assert(bits > 0 && bits < 63); |
113 | | #ifdef OPUS_CHECK_ASM |
114 | 1.08G | return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits); |
115 | | #else |
116 | | /* This code is more correct, but it won't overflow like the C code in some rare cases. */ |
117 | 3.24G | silk_assert(bits > 0 && bits < 63); |
118 | 3.24G | t = ((opus_int64)a) * ((opus_int64)b); |
119 | 3.24G | bits += 16; |
120 | 3.24G | t += 1ull << (bits-1); |
121 | 3.24G | return t >> bits; |
122 | | #endif |
123 | 3.24G | } NSQ_del_dec_avx2.c:silk_sar_round_smulww Line | Count | Source | 108 | 1.08G | { | 109 | | #ifndef OPUS_CHECK_ASM | 110 | | opus_int64 t; | 111 | | #endif | 112 | 1.08G | silk_assert(bits > 0 && bits < 63); | 113 | 1.08G | #ifdef OPUS_CHECK_ASM | 114 | 1.08G | return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits); | 115 | | #else | 116 | | /* This code is more correct, but it won't overflow like the C code in some rare cases. */ | 117 | | silk_assert(bits > 0 && bits < 63); | 118 | | t = ((opus_int64)a) * ((opus_int64)b); | 119 | | bits += 16; | 120 | | t += 1ull << (bits-1); | 121 | | return t >> bits; | 122 | | #endif | 123 | 1.08G | } |
NSQ_del_dec_avx2.c:silk_sar_round_smulww Line | Count | Source | 108 | 3.24G | { | 109 | 3.24G | #ifndef OPUS_CHECK_ASM | 110 | 3.24G | opus_int64 t; | 111 | 3.24G | #endif | 112 | 3.24G | silk_assert(bits > 0 && bits < 63); | 113 | | #ifdef OPUS_CHECK_ASM | 114 | | return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits); | 115 | | #else | 116 | | /* This code is more correct, but it won't overflow like the C code in some rare cases. */ | 117 | 3.24G | silk_assert(bits > 0 && bits < 63); | 118 | 3.24G | t = ((opus_int64)a) * ((opus_int64)b); | 119 | 3.24G | bits += 16; | 120 | 3.24G | t += 1ull << (bits-1); | 121 | 3.24G | return t >> bits; | 122 | 3.24G | #endif | 123 | 3.24G | } |
|
124 | | |
125 | | static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b) |
126 | 183M | { |
127 | 183M | opus_int32 sum; |
128 | 183M | if (__builtin_sadd_overflow(a, b, &sum)) |
129 | 14.7k | { |
130 | 14.7k | return a >= 0 ? silk_int32_MAX : silk_int32_MIN; |
131 | 14.7k | } |
132 | 183M | return sum; |
133 | 183M | } |
134 | | |
135 | | static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits) |
136 | 8.65G | { |
137 | 8.65G | silk_assert(bits > 0 && bits < 31); |
138 | 8.65G | return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits); |
139 | 8.65G | } |
140 | | |
141 | | /* add/subtract with output saturated */ |
142 | | static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b) |
143 | 4.32G | { |
144 | 4.32G | __m128i r = _mm_add_epi32(a, b); |
145 | 4.32G | __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r)); /* OF = (sum ^ a) & (sum ^ b) */ |
146 | 4.32G | __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ |
147 | 4.32G | return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); |
148 | 4.32G | } |
149 | | static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b) |
150 | 4.32G | { |
151 | 4.32G | __m128i r = _mm_sub_epi32(a, b); |
152 | 4.32G | __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r)); /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */ |
153 | 4.32G | __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ |
154 | 4.32G | return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); |
155 | 4.32G | } |
156 | | static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b) |
157 | 4.32G | { |
158 | 4.32G | __m256i r = _mm256_sub_epi32(a, b); |
159 | 4.32G | __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r)); /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */ |
160 | 4.32G | __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ |
161 | 4.32G | return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31)); |
162 | 4.32G | } |
163 | | |
164 | | static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2) |
165 | 4.32G | { |
166 | 4.32G | opus_int32 lo = limit1 < limit2 ? limit1 : limit2; |
167 | 4.32G | opus_int32 hi = limit1 > limit2 ? limit1 : limit2; |
168 | | |
169 | 4.32G | num = _mm_min_epi32(num, _mm_set1_epi32(hi)); |
170 | 4.32G | num = _mm_max_epi32(num, _mm_set1_epi32(lo)); |
171 | 4.32G | return num; |
172 | 4.32G | } |
173 | | |
174 | | /* cond < 0 ? -num : num */ |
175 | | static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond) |
176 | 4.34G | { |
177 | 4.34G | return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1))); |
178 | 4.34G | } |
179 | | static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond) |
180 | 4.32G | { |
181 | 4.32G | return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1))); |
182 | 4.32G | } |
183 | | |
184 | | /* (a32 * b32) >> 16 */ |
185 | | static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b) |
186 | 1.04G | { |
187 | 1.04G | return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16)); |
188 | 1.04G | } |
189 | | |
190 | | /* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */ |
191 | | static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b) |
192 | 201G | { |
193 | 201G | return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32((opus_uint32)b<<16))); |
194 | 201G | } |
195 | | |
196 | | /* (opus_int32)((opus_int16)(a3))) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */ |
197 | | static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b) |
198 | 8.65G | { |
199 | 8.65G | const char FF = (char)0xFF; |
200 | 8.65G | __m256i msk = _mm256_set_epi8( |
201 | 8.65G | FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0, |
202 | 8.65G | FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0); |
203 | 8.65G | __m256i lo = _mm256_mullo_epi16(a, b); |
204 | 8.65G | __m256i hi = _mm256_mulhi_epi16(a, b); |
205 | 8.65G | lo = _mm256_shuffle_epi8(lo, msk); |
206 | 8.65G | hi = _mm256_shuffle_epi8(hi, msk); |
207 | 8.65G | return _mm256_unpacklo_epi16(lo, hi); |
208 | 8.65G | } |
209 | | |
210 | | static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v) |
211 | 75.5M | { |
212 | 75.5M | v = _mm256_shuffle_epi32(v, 0x1B); |
213 | 75.5M | v = _mm256_permute4x64_epi64(v, 0x4E); |
214 | 75.5M | return v; |
215 | 75.5M | } |
216 | | |
217 | | static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v) |
218 | 66.7M | { |
219 | 66.7M | __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0)); |
220 | 66.7M | sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); |
221 | 66.7M | sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); |
222 | 66.7M | return _mm_cvtsi128_si32(sum); |
223 | 66.7M | } |
224 | | |
225 | | static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num) |
226 | 8.68G | { |
227 | 8.68G | num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */ |
228 | 8.68G | num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */ |
229 | 8.68G | return num; |
230 | 8.68G | } |
231 | | |
232 | | static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num) |
233 | 4.32G | { |
234 | 4.32G | num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2310 */ |
235 | 4.32G | num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */ |
236 | 4.32G | return num; |
237 | 4.32G | } |
238 | | |
239 | | static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask) |
240 | 8.68G | { |
241 | 8.68G | num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask); |
242 | 8.68G | return silk_mm_hmin_epi32(num); |
243 | 8.68G | } |
244 | | |
245 | | static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask) |
246 | 4.32G | { |
247 | 4.32G | num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask); |
248 | 4.32G | return silk_mm_hmax_epi32(num); |
249 | 4.32G | } |
250 | | |
251 | | static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed) |
252 | 4.32G | { |
253 | 4.32G | seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER)); |
254 | 4.32G | seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT)); |
255 | 4.32G | return seed; |
256 | 4.32G | } |
257 | | |
258 | | static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b) |
259 | 6.34G | { |
260 | 6.34G | unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111; |
261 | 6.34G | silk_assert(mask != 0); |
262 | 6.34G | return __builtin_ctz(mask) >> 2; |
263 | 6.34G | } |
264 | | |
265 | | static __m128i silk_index_to_selector(opus_int32 index) |
266 | 5.35G | { |
267 | 5.35G | silk_assert(index < 4); |
268 | 5.35G | index <<= 2; |
269 | 5.35G | return _mm_set_epi8( |
270 | 5.35G | index + 3, index + 2, index + 1, index + 0, |
271 | 5.35G | index + 3, index + 2, index + 1, index + 0, |
272 | 5.35G | index + 3, index + 2, index + 1, index + 0, |
273 | 5.35G | index + 3, index + 2, index + 1, index + 0); |
274 | 5.35G | } |
275 | | |
276 | | static opus_int32 silk_select_winner(__m128i num, __m128i selector) |
277 | 17.3G | { |
278 | 17.3G | return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector)); |
279 | 17.3G | } |
280 | | |
/* One entry of the delayed-decision sample buffer (NSQ_del_dec_struct.Samples). */
/* Every field is a 128-bit vector; the code in this file reads individual       */
/* 32-bit lanes out of Q_Q10, Xq_Q14 and Shape_Q14 with silk_select_winner(),    */
/* one lane per delayed-decision state.                                          */
typedef struct
{
    __m128i RandState;
    __m128i Q_Q10;     /* winner lane is rounded down by 10 bits to form pulses[] */
    __m128i Xq_Q14;    /* winner lane is scaled by the gain and saturated into xq[] */
    __m128i Pred_Q15;
    __m128i Shape_Q14; /* winner lane is written back to NSQ->sLTP_shp_Q14[] */
} NSQ_del_dec_sample_struct;
289 | | |
/* Vectorised delayed-decision state: each __m128i carries one 32-bit value */
/* per delayed-decision state (up to four).  The entry point initialises    */
/* most fields by broadcasting the matching silk_nsq_state value to all     */
/* lanes and copies the winning lane back at the end of the frame.          */
typedef struct
{
    __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH]; /* first NSQ_LPC_BUF_LENGTH entries mirror NSQ->sLPC_Q14 */
    __m128i LF_AR_Q14; /* mirrors NSQ->sLF_AR_shp_Q14 */
    __m128i Seed;      /* initialised to (psIndices->Seed + lane index) & 3 */
    __m128i SeedInit;  /* copy of the initial Seed; the winner lane becomes psIndices->Seed */
    __m128i RD_Q10;    /* starts at zero; the minimum over active lanes picks the winner */
    __m128i Diff_Q14;  /* mirrors NSQ->sDiff_shp_Q14 */
    __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER]; /* mirrors NSQ->sAR2_Q14 */
    NSQ_del_dec_sample_struct Samples[DECISION_DELAY]; /* indexed modulo DECISION_DELAY by the entry point */
} NSQ_del_dec_struct;
301 | | |
302 | | static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( |
303 | | const silk_encoder_state *psEncC, /* I Encoder State */ |
304 | | silk_nsq_state *NSQ, /* I/O NSQ state */ |
305 | | NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ |
306 | | const opus_int16 x16[], /* I Input */ |
307 | | opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O Input scaled with 1/Gain in Q10 */ |
308 | | const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ |
309 | | opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ |
310 | | opus_int subfr, /* I Subframe number */ |
311 | | const opus_int LTP_scale_Q14, /* I LTP state scaling */ |
312 | | const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I */ |
313 | | const opus_int pitchL[MAX_NB_SUBFR], /* I Pitch lag */ |
314 | | const opus_int signal_type, /* I Signal type */ |
315 | | const opus_int decisionDelay /* I Decision delay */ |
316 | | ); |
317 | | |
318 | | /*******************************************/ |
319 | | /* LPC analysis filter */ |
320 | | /* NB! State is kept internally and the */ |
321 | | /* filter always starts with zero state */ |
322 | | /* first d output samples are set to zero */ |
323 | | /*******************************************/ |
324 | | static OPUS_INLINE void silk_LPC_analysis_filter_avx2( |
325 | | opus_int16 *out, /* O Output signal */ |
326 | | const opus_int16 *in, /* I Input signal */ |
327 | | const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ |
328 | | const opus_int32 len, /* I Signal length */ |
329 | | const opus_int32 order /* I Filter order */ |
330 | | ); |
331 | | |
332 | | /******************************************/ |
333 | | /* Noise shape quantizer for one subframe */ |
334 | | /******************************************/ |
335 | | static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2( |
336 | | silk_nsq_state *NSQ, /* I/O NSQ state */ |
337 | | NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ |
338 | | opus_int signalType, /* I Signal type */ |
339 | | const opus_int32 x_Q10[], /* I */ |
340 | | opus_int8 pulses[], /* O */ |
341 | | opus_int16 xq[], /* O */ |
342 | | opus_int32 sLTP_Q15[], /* I/O LTP filter state */ |
343 | | opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ |
344 | | const opus_int16 a_Q12[], /* I Short term prediction coefs */ |
345 | | const opus_int16 b_Q14[], /* I Long term prediction coefs */ |
346 | | const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ |
347 | | opus_int lag, /* I Pitch lag */ |
348 | | opus_int32 HarmShapeFIRPacked_Q14, /* I */ |
349 | | opus_int Tilt_Q14, /* I Spectral tilt */ |
350 | | opus_int32 LF_shp_Q14, /* I */ |
351 | | opus_int32 Gain_Q16, /* I */ |
352 | | opus_int Lambda_Q10, /* I */ |
353 | | opus_int offset_Q10, /* I */ |
354 | | opus_int length, /* I Input length */ |
355 | | opus_int subfr, /* I Subframe number */ |
356 | | opus_int shapingLPCOrder, /* I Shaping LPC filter order */ |
357 | | opus_int predictLPCOrder, /* I Prediction filter order */ |
358 | | opus_int warping_Q16, /* I */ |
359 | | __m128i MaskDelDec, /* I Mask of states in decision tree */ |
360 | | opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ |
361 | | opus_int decisionDelay /* I */ |
362 | | ); |
363 | | |
364 | | void silk_NSQ_del_dec_avx2( |
365 | | const silk_encoder_state *psEncC, /* I Encoder State */ |
366 | | silk_nsq_state *NSQ, /* I/O NSQ state */ |
367 | | SideInfoIndices *psIndices, /* I/O Quantization Indices */ |
368 | | const opus_int16 x16[], /* I Input */ |
369 | | opus_int8 pulses[], /* O Quantized pulse signal */ |
370 | | const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ |
371 | | const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], /* I Long term prediction coefs */ |
372 | | const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I Noise shaping coefs */ |
373 | | const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I Long term shaping coefs */ |
374 | | const opus_int Tilt_Q14[MAX_NB_SUBFR], /* I Spectral tilt */ |
375 | | const opus_int32 LF_shp_Q14[MAX_NB_SUBFR], /* I Low frequency shaping coefs */ |
376 | | const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I Quantization step sizes */ |
377 | | const opus_int32 pitchL[MAX_NB_SUBFR], /* I Pitch lags */ |
378 | | const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ |
379 | | const opus_int LTP_scale_Q14 /* I LTP state scaling */ |
380 | | ) |
381 | 61.5M | { |
382 | | #ifdef OPUS_CHECK_ASM |
383 | | silk_nsq_state NSQ_c; |
384 | | SideInfoIndices psIndices_c; |
385 | | opus_int8 pulses_c[MAX_FRAME_LENGTH]; |
386 | | const opus_int8 *const pulses_a = pulses; |
387 | | |
388 | 18.6M | silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c)); |
389 | 18.6M | silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c)); |
390 | 18.6M | silk_memcpy(pulses_c, pulses, sizeof(pulses_c)); |
391 | | silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, |
392 | | pitchL, Lambda_Q10, LTP_scale_Q14); |
393 | | #endif |
394 | | |
395 | 61.5M | if (!verify_assumptions(psEncC)) |
396 | 36.4M | { |
397 | 36.4M | silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14); |
398 | 36.4M | return; |
399 | 36.4M | } |
400 | | |
401 | 25.1M | opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; |
402 | 25.1M | opus_int last_smple_idx, smpl_buf_idx, decisionDelay; |
403 | 25.1M | const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; |
404 | 25.1M | opus_int16 *pxq; |
405 | 25.1M | VARDECL(opus_int32, sLTP_Q15); |
406 | 25.1M | VARDECL(opus_int16, sLTP); |
407 | 25.1M | opus_int32 HarmShapeFIRPacked_Q14; |
408 | 25.1M | opus_int offset_Q10; |
409 | 25.1M | opus_int32 Gain_Q10; |
410 | 25.1M | opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH]; |
411 | 25.1M | opus_int32 delayedGain_Q10[DECISION_DELAY]; |
412 | 25.1M | NSQ_del_dec_struct psDelDec = {0}; |
413 | 25.1M | NSQ_del_dec_sample_struct *psSample; |
414 | 25.1M | __m128i RDmin_Q10, MaskDelDec, Winner_selector; |
415 | 25.1M | SAVE_STACK; |
416 | | |
417 | 25.1M | MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3))); |
418 | | |
419 | | /* Set unvoiced lag to the previous one, overwrite later for voiced */ |
420 | 25.1M | lag = NSQ->lagPrev; |
421 | | |
422 | 25.1M | silk_assert(NSQ->prev_gain_Q16 != 0); |
423 | 25.1M | psDelDec.Seed = _mm_and_si128( |
424 | 25.1M | _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)), |
425 | 25.1M | _mm_set1_epi32(3)); |
426 | 25.1M | psDelDec.SeedInit = psDelDec.Seed; |
427 | 25.1M | psDelDec.RD_Q10 = _mm_setzero_si128(); |
428 | 25.1M | psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14); |
429 | 25.1M | psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14); |
430 | 25.1M | psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]); |
431 | 427M | for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) |
432 | 402M | { |
433 | 402M | psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]); |
434 | 402M | } |
435 | 628M | for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) |
436 | 603M | { |
437 | 603M | psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]); |
438 | 603M | } |
439 | | |
440 | 25.1M | offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType]; |
441 | 25.1M | smpl_buf_idx = 0; /* index of oldest samples */ |
442 | | |
443 | 25.1M | decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length); |
444 | | |
445 | | /* For voiced frames limit the decision delay to lower than the pitch lag */ |
446 | 25.1M | if (psIndices->signalType == TYPE_VOICED) |
447 | 687k | { |
448 | 3.14M | for (k = 0; k < psEncC->nb_subfr; k++) |
449 | 2.46M | { |
450 | 2.46M | decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1); |
451 | 2.46M | } |
452 | 687k | } |
453 | 24.4M | else |
454 | 24.4M | { |
455 | 24.4M | if (lag > 0) |
456 | 431k | { |
457 | 431k | decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1); |
458 | 431k | } |
459 | 24.4M | } |
460 | | |
461 | 25.1M | if (psIndices->NLSFInterpCoef_Q2 == 4) |
462 | 24.7M | { |
463 | 24.7M | LSF_interpolation_flag = 0; |
464 | 24.7M | } |
465 | 430k | else |
466 | 430k | { |
467 | 430k | LSF_interpolation_flag = 1; |
468 | 430k | } |
469 | | |
470 | 25.1M | ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32); |
471 | 25.1M | ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16); |
472 | | /* Set up pointers to start of sub frame */ |
473 | 25.1M | pxq = &NSQ->xq[psEncC->ltp_mem_length]; |
474 | 25.1M | NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; |
475 | 25.1M | NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; |
476 | 25.1M | subfr = 0; |
477 | 112M | for (k = 0; k < psEncC->nb_subfr; k++) |
478 | 87.6M | { |
479 | 87.6M | A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER]; |
480 | 87.6M | B_Q14 = &LTPCoef_Q14[k * LTP_ORDER]; |
481 | 87.6M | AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER]; |
482 | | |
483 | | /* Noise shape parameters */ |
484 | 87.6M | silk_assert(HarmShapeGain_Q14[k] >= 0); |
485 | 87.6M | HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); |
486 | 87.6M | HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); |
487 | | |
488 | 87.6M | NSQ->rewhite_flag = 0; |
489 | 87.6M | if (psIndices->signalType == TYPE_VOICED) |
490 | 2.46M | { |
491 | | /* Voiced */ |
492 | 2.46M | lag = pitchL[k]; |
493 | | |
494 | | /* Re-whitening */ |
495 | 2.46M | if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0) |
496 | 823k | { |
497 | 823k | if (k == 2) |
498 | 136k | { |
499 | | /* RESET DELAYED DECISIONS */ |
500 | | /* Find winner */ |
501 | 136k | RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); |
502 | 136k | Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10); |
503 | 136k | Winner_selector = silk_index_to_selector(Winner_ind); |
504 | 136k | psDelDec.RD_Q10 = _mm_add_epi32( |
505 | 136k | psDelDec.RD_Q10, |
506 | 136k | _mm_blendv_epi8( |
507 | 136k | _mm_set1_epi32(silk_int32_MAX >> 4), |
508 | 136k | _mm_setzero_si128(), |
509 | 136k | _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3))))); |
510 | | |
511 | | /* Copy final part of signals from winner state to output and long-term filter states */ |
512 | 136k | last_smple_idx = smpl_buf_idx + decisionDelay; |
513 | 4.13M | for (i = 0; i < decisionDelay; i++) |
514 | 4.00M | { |
515 | 4.00M | last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; |
516 | 4.00M | psSample = &psDelDec.Samples[last_smple_idx]; |
517 | 4.00M | pulses[i - decisionDelay] = |
518 | 4.00M | (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); |
519 | 4.00M | pxq[i - decisionDelay] = |
520 | 4.00M | silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14)); |
521 | 4.00M | NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = |
522 | 4.00M | silk_select_winner(psSample->Shape_Q14, Winner_selector); |
523 | 4.00M | } |
524 | | |
525 | 136k | subfr = 0; |
526 | 136k | } |
527 | | |
528 | | /* Rewhiten with new A coefs */ |
529 | 823k | start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; |
530 | 823k | silk_assert(start_idx > 0); |
531 | | |
532 | 823k | silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length], |
533 | 823k | A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder); |
534 | | |
535 | 823k | NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; |
536 | 823k | NSQ->rewhite_flag = 1; |
537 | 823k | } |
538 | 2.46M | } |
539 | | |
540 | 87.6M | silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, |
541 | 87.6M | LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay); |
542 | | |
543 | 87.6M | silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, |
544 | 87.6M | delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k], |
545 | 87.6M | Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, |
546 | 87.6M | psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay); |
547 | | |
548 | 87.6M | x16 += psEncC->subfr_length; |
549 | 87.6M | pulses += psEncC->subfr_length; |
550 | 87.6M | pxq += psEncC->subfr_length; |
551 | 87.6M | } |
552 | | |
553 | | /* Find winner */ |
554 | 25.1M | RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); |
555 | 25.1M | Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10)); |
556 | | |
557 | | /* Copy final part of signals from winner state to output and long-term filter states */ |
558 | 25.1M | psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector); |
559 | 25.1M | last_smple_idx = smpl_buf_idx + decisionDelay; |
560 | 25.1M | Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6; |
561 | 1.02G | for (i = 0; i < decisionDelay; i++) |
562 | 1.00G | { |
563 | 1.00G | last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; |
564 | 1.00G | psSample = &psDelDec.Samples[last_smple_idx]; |
565 | | |
566 | 1.00G | pulses[i - decisionDelay] = |
567 | 1.00G | (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); |
568 | 1.00G | pxq[i - decisionDelay] = |
569 | 1.00G | silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8)); |
570 | 1.00G | NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = |
571 | 1.00G | silk_select_winner(psSample->Shape_Q14, Winner_selector); |
572 | 1.00G | } |
573 | 427M | for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) |
574 | 402M | { |
575 | 402M | NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector); |
576 | 402M | } |
577 | 628M | for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) |
578 | 603M | { |
579 | 603M | NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector); |
580 | 603M | } |
581 | | |
582 | | /* Update states */ |
583 | 25.1M | NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector); |
584 | 25.1M | NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector); |
585 | 25.1M | NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1]; |
586 | | |
587 | | /* Save quantized speech signal */ |
588 | 25.1M | silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16)); |
589 | 25.1M | silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32)); |
590 | | |
591 | | #ifdef OPUS_CHECK_ASM |
592 | 6.78M | silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c))); |
593 | 6.78M | silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c))); |
594 | 6.78M | silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c))); |
595 | 6.78M | #endif |
596 | | |
597 | 6.78M | RESTORE_STACK; |
598 | 6.78M | } Line | Count | Source | 381 | 18.6M | { | 382 | 18.6M | #ifdef OPUS_CHECK_ASM | 383 | 18.6M | silk_nsq_state NSQ_c; | 384 | 18.6M | SideInfoIndices psIndices_c; | 385 | 18.6M | opus_int8 pulses_c[MAX_FRAME_LENGTH]; | 386 | 18.6M | const opus_int8 *const pulses_a = pulses; | 387 | | | 388 | 18.6M | silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c)); | 389 | 18.6M | silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c)); | 390 | 18.6M | silk_memcpy(pulses_c, pulses, sizeof(pulses_c)); | 391 | 18.6M | silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, | 392 | 18.6M | pitchL, Lambda_Q10, LTP_scale_Q14); | 393 | 18.6M | #endif | 394 | | | 395 | 18.6M | if (!verify_assumptions(psEncC)) | 396 | 11.8M | { | 397 | 11.8M | silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14); | 398 | 11.8M | return; | 399 | 11.8M | } | 400 | | | 401 | 6.78M | opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; | 402 | 6.78M | opus_int last_smple_idx, smpl_buf_idx, decisionDelay; | 403 | 6.78M | const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; | 404 | 6.78M | opus_int16 *pxq; | 405 | 6.78M | VARDECL(opus_int32, sLTP_Q15); | 406 | 6.78M | VARDECL(opus_int16, sLTP); | 407 | 6.78M | opus_int32 HarmShapeFIRPacked_Q14; | 408 | 6.78M | opus_int offset_Q10; | 409 | 6.78M | opus_int32 Gain_Q10; | 410 | 6.78M | opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH]; | 411 | 6.78M | opus_int32 delayedGain_Q10[DECISION_DELAY]; | 412 | 6.78M | NSQ_del_dec_struct psDelDec = {0}; | 413 | 6.78M | NSQ_del_dec_sample_struct *psSample; | 414 | 6.78M | __m128i RDmin_Q10, MaskDelDec, Winner_selector; | 415 | 6.78M | SAVE_STACK; | 416 | | | 417 | 6.78M | MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3))); | 418 | | | 
419 | | /* Set unvoiced lag to the previous one, overwrite later for voiced */ | 420 | 6.78M | lag = NSQ->lagPrev; | 421 | | | 422 | 6.78M | silk_assert(NSQ->prev_gain_Q16 != 0); | 423 | 6.78M | psDelDec.Seed = _mm_and_si128( | 424 | 6.78M | _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)), | 425 | 6.78M | _mm_set1_epi32(3)); | 426 | 6.78M | psDelDec.SeedInit = psDelDec.Seed; | 427 | 6.78M | psDelDec.RD_Q10 = _mm_setzero_si128(); | 428 | 6.78M | psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14); | 429 | 6.78M | psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14); | 430 | 6.78M | psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]); | 431 | 115M | for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) | 432 | 108M | { | 433 | 108M | psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]); | 434 | 108M | } | 435 | 169M | for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) | 436 | 162M | { | 437 | 162M | psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]); | 438 | 162M | } | 439 | | | 440 | 6.78M | offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType]; | 441 | 6.78M | smpl_buf_idx = 0; /* index of oldest samples */ | 442 | | | 443 | 6.78M | decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length); | 444 | | | 445 | | /* For voiced frames limit the decision delay to lower than the pitch lag */ | 446 | 6.78M | if (psIndices->signalType == TYPE_VOICED) | 447 | 180k | { | 448 | 829k | for (k = 0; k < psEncC->nb_subfr; k++) | 449 | 648k | { | 450 | 648k | decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1); | 451 | 648k | } | 452 | 180k | } | 453 | 6.60M | else | 454 | 6.60M | { | 455 | 6.60M | if (lag > 0) | 456 | 198k | { | 457 | 198k | decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1); | 458 | 198k | } | 459 | 6.60M | } | 460 | | | 461 | 6.78M | if (psIndices->NLSFInterpCoef_Q2 == 4) | 462 | 6.66M | { | 463 | 6.66M | 
LSF_interpolation_flag = 0; | 464 | 6.66M | } | 465 | 123k | else | 466 | 123k | { | 467 | 123k | LSF_interpolation_flag = 1; | 468 | 123k | } | 469 | | | 470 | 6.78M | ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32); | 471 | 6.78M | ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16); | 472 | | /* Set up pointers to start of sub frame */ | 473 | 6.78M | pxq = &NSQ->xq[psEncC->ltp_mem_length]; | 474 | 6.78M | NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; | 475 | 6.78M | NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; | 476 | 6.78M | subfr = 0; | 477 | 31.1M | for (k = 0; k < psEncC->nb_subfr; k++) | 478 | 24.3M | { | 479 | 24.3M | A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER]; | 480 | 24.3M | B_Q14 = <PCoef_Q14[k * LTP_ORDER]; | 481 | 24.3M | AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER]; | 482 | | | 483 | | /* Noise shape parameters */ | 484 | 24.3M | silk_assert(HarmShapeGain_Q14[k] >= 0); | 485 | 24.3M | HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); | 486 | 24.3M | HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); | 487 | | | 488 | 24.3M | NSQ->rewhite_flag = 0; | 489 | 24.3M | if (psIndices->signalType == TYPE_VOICED) | 490 | 648k | { | 491 | | /* Voiced */ | 492 | 648k | lag = pitchL[k]; | 493 | | | 494 | | /* Re-whitening */ | 495 | 648k | if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0) | 496 | 225k | { | 497 | 225k | if (k == 2) | 498 | 45.3k | { | 499 | | /* RESET DELAYED DECISIONS */ | 500 | | /* Find winner */ | 501 | 45.3k | RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); | 502 | 45.3k | Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10); | 503 | 45.3k | Winner_selector = silk_index_to_selector(Winner_ind); | 504 | 45.3k | psDelDec.RD_Q10 = _mm_add_epi32( | 505 | 45.3k | psDelDec.RD_Q10, | 506 | 45.3k | _mm_blendv_epi8( | 507 | 45.3k | _mm_set1_epi32(silk_int32_MAX 
>> 4), | 508 | 45.3k | _mm_setzero_si128(), | 509 | 45.3k | _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3))))); | 510 | | | 511 | | /* Copy final part of signals from winner state to output and long-term filter states */ | 512 | 45.3k | last_smple_idx = smpl_buf_idx + decisionDelay; | 513 | 1.26M | for (i = 0; i < decisionDelay; i++) | 514 | 1.22M | { | 515 | 1.22M | last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; | 516 | 1.22M | psSample = &psDelDec.Samples[last_smple_idx]; | 517 | 1.22M | pulses[i - decisionDelay] = | 518 | 1.22M | (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); | 519 | 1.22M | pxq[i - decisionDelay] = | 520 | 1.22M | silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14)); | 521 | 1.22M | NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = | 522 | 1.22M | silk_select_winner(psSample->Shape_Q14, Winner_selector); | 523 | 1.22M | } | 524 | | | 525 | 45.3k | subfr = 0; | 526 | 45.3k | } | 527 | | | 528 | | /* Rewhiten with new A coefs */ | 529 | 225k | start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; | 530 | 225k | silk_assert(start_idx > 0); | 531 | | | 532 | 225k | silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length], | 533 | 225k | A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder); | 534 | | | 535 | 225k | NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; | 536 | 225k | NSQ->rewhite_flag = 1; | 537 | 225k | } | 538 | 648k | } | 539 | | | 540 | 24.3M | silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, | 541 | 24.3M | LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay); | 542 | | | 543 | 24.3M | silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, | 544 | 24.3M | 
delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k], | 545 | 24.3M | Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, | 546 | 24.3M | psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay); | 547 | | | 548 | 24.3M | x16 += psEncC->subfr_length; | 549 | 24.3M | pulses += psEncC->subfr_length; | 550 | 24.3M | pxq += psEncC->subfr_length; | 551 | 24.3M | } | 552 | | | 553 | | /* Find winner */ | 554 | 6.78M | RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); | 555 | 6.78M | Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10)); | 556 | | | 557 | | /* Copy final part of signals from winner state to output and long-term filter states */ | 558 | 6.78M | psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector); | 559 | 6.78M | last_smple_idx = smpl_buf_idx + decisionDelay; | 560 | 6.78M | Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6; | 561 | 276M | for (i = 0; i < decisionDelay; i++) | 562 | 269M | { | 563 | 269M | last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; | 564 | 269M | psSample = &psDelDec.Samples[last_smple_idx]; | 565 | | | 566 | 269M | pulses[i - decisionDelay] = | 567 | 269M | (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); | 568 | 269M | pxq[i - decisionDelay] = | 569 | 269M | silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8)); | 570 | 269M | NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = | 571 | 269M | silk_select_winner(psSample->Shape_Q14, Winner_selector); | 572 | 269M | } | 573 | 115M | for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) | 574 | 108M | { | 575 | 108M | NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector); | 576 | 108M | } | 577 | 169M | for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) | 
578 | 162M | { | 579 | 162M | NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector); | 580 | 162M | } | 581 | | | 582 | | /* Update states */ | 583 | 6.78M | NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector); | 584 | 6.78M | NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector); | 585 | 6.78M | NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1]; | 586 | | | 587 | | /* Save quantized speech signal */ | 588 | 6.78M | silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16)); | 589 | 6.78M | silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32)); | 590 | | | 591 | 6.78M | #ifdef OPUS_CHECK_ASM | 592 | 6.78M | silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c))); | 593 | 6.78M | silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c))); | 594 | 6.78M | silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c))); | 595 | 6.78M | #endif | 596 | | | 597 | 6.78M | RESTORE_STACK; | 598 | 6.78M | } |
Line | Count | Source | 381 | 42.9M | { | 382 | | #ifdef OPUS_CHECK_ASM | 383 | | silk_nsq_state NSQ_c; | 384 | | SideInfoIndices psIndices_c; | 385 | | opus_int8 pulses_c[MAX_FRAME_LENGTH]; | 386 | | const opus_int8 *const pulses_a = pulses; | 387 | | | 388 | | silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c)); | 389 | | silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c)); | 390 | | silk_memcpy(pulses_c, pulses, sizeof(pulses_c)); | 391 | | silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, | 392 | | pitchL, Lambda_Q10, LTP_scale_Q14); | 393 | | #endif | 394 | | | 395 | 42.9M | if (!verify_assumptions(psEncC)) | 396 | 24.5M | { | 397 | 24.5M | silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14); | 398 | 24.5M | return; | 399 | 24.5M | } | 400 | | | 401 | 18.3M | opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; | 402 | 18.3M | opus_int last_smple_idx, smpl_buf_idx, decisionDelay; | 403 | 18.3M | const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; | 404 | 18.3M | opus_int16 *pxq; | 405 | 18.3M | VARDECL(opus_int32, sLTP_Q15); | 406 | 18.3M | VARDECL(opus_int16, sLTP); | 407 | 18.3M | opus_int32 HarmShapeFIRPacked_Q14; | 408 | 18.3M | opus_int offset_Q10; | 409 | 18.3M | opus_int32 Gain_Q10; | 410 | 18.3M | opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH]; | 411 | 18.3M | opus_int32 delayedGain_Q10[DECISION_DELAY]; | 412 | 18.3M | NSQ_del_dec_struct psDelDec = {0}; | 413 | 18.3M | NSQ_del_dec_sample_struct *psSample; | 414 | 18.3M | __m128i RDmin_Q10, MaskDelDec, Winner_selector; | 415 | 18.3M | SAVE_STACK; | 416 | | | 417 | 18.3M | MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3))); | 418 | | | 419 | | /* Set unvoiced lag to the previous one, overwrite later for voiced */ | 420 
| 18.3M | lag = NSQ->lagPrev; | 421 | | | 422 | 18.3M | silk_assert(NSQ->prev_gain_Q16 != 0); | 423 | 18.3M | psDelDec.Seed = _mm_and_si128( | 424 | 18.3M | _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)), | 425 | 18.3M | _mm_set1_epi32(3)); | 426 | 18.3M | psDelDec.SeedInit = psDelDec.Seed; | 427 | 18.3M | psDelDec.RD_Q10 = _mm_setzero_si128(); | 428 | 18.3M | psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14); | 429 | 18.3M | psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14); | 430 | 18.3M | psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]); | 431 | 312M | for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) | 432 | 293M | { | 433 | 293M | psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]); | 434 | 293M | } | 435 | 458M | for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) | 436 | 440M | { | 437 | 440M | psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]); | 438 | 440M | } | 439 | | | 440 | 18.3M | offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType]; | 441 | 18.3M | smpl_buf_idx = 0; /* index of oldest samples */ | 442 | | | 443 | 18.3M | decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length); | 444 | | | 445 | | /* For voiced frames limit the decision delay to lower than the pitch lag */ | 446 | 18.3M | if (psIndices->signalType == TYPE_VOICED) | 447 | 506k | { | 448 | 2.31M | for (k = 0; k < psEncC->nb_subfr; k++) | 449 | 1.81M | { | 450 | 1.81M | decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1); | 451 | 1.81M | } | 452 | 506k | } | 453 | 17.8M | else | 454 | 17.8M | { | 455 | 17.8M | if (lag > 0) | 456 | 233k | { | 457 | 233k | decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1); | 458 | 233k | } | 459 | 17.8M | } | 460 | | | 461 | 18.3M | if (psIndices->NLSFInterpCoef_Q2 == 4) | 462 | 18.0M | { | 463 | 18.0M | LSF_interpolation_flag = 0; | 464 | 18.0M | } | 465 | 306k | else | 466 | 306k | { 
| 467 | 306k | LSF_interpolation_flag = 1; | 468 | 306k | } | 469 | | | 470 | 18.3M | ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32); | 471 | 18.3M | ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16); | 472 | | /* Set up pointers to start of sub frame */ | 473 | 18.3M | pxq = &NSQ->xq[psEncC->ltp_mem_length]; | 474 | 18.3M | NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; | 475 | 18.3M | NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; | 476 | 18.3M | subfr = 0; | 477 | 81.6M | for (k = 0; k < psEncC->nb_subfr; k++) | 478 | 63.2M | { | 479 | 63.2M | A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER]; | 480 | 63.2M | B_Q14 = <PCoef_Q14[k * LTP_ORDER]; | 481 | 63.2M | AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER]; | 482 | | | 483 | | /* Noise shape parameters */ | 484 | 63.2M | silk_assert(HarmShapeGain_Q14[k] >= 0); | 485 | 63.2M | HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); | 486 | 63.2M | HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); | 487 | | | 488 | 63.2M | NSQ->rewhite_flag = 0; | 489 | 63.2M | if (psIndices->signalType == TYPE_VOICED) | 490 | 1.81M | { | 491 | | /* Voiced */ | 492 | 1.81M | lag = pitchL[k]; | 493 | | | 494 | | /* Re-whitening */ | 495 | 1.81M | if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0) | 496 | 598k | { | 497 | 598k | if (k == 2) | 498 | 91.4k | { | 499 | | /* RESET DELAYED DECISIONS */ | 500 | | /* Find winner */ | 501 | 91.4k | RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); | 502 | 91.4k | Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10); | 503 | 91.4k | Winner_selector = silk_index_to_selector(Winner_ind); | 504 | 91.4k | psDelDec.RD_Q10 = _mm_add_epi32( | 505 | 91.4k | psDelDec.RD_Q10, | 506 | 91.4k | _mm_blendv_epi8( | 507 | 91.4k | _mm_set1_epi32(silk_int32_MAX >> 4), | 508 | 91.4k | _mm_setzero_si128(), | 509 | 91.4k | 
_mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3))))); | 510 | | | 511 | | /* Copy final part of signals from winner state to output and long-term filter states */ | 512 | 91.4k | last_smple_idx = smpl_buf_idx + decisionDelay; | 513 | 2.87M | for (i = 0; i < decisionDelay; i++) | 514 | 2.78M | { | 515 | 2.78M | last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; | 516 | 2.78M | psSample = &psDelDec.Samples[last_smple_idx]; | 517 | 2.78M | pulses[i - decisionDelay] = | 518 | 2.78M | (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); | 519 | 2.78M | pxq[i - decisionDelay] = | 520 | 2.78M | silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14)); | 521 | 2.78M | NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = | 522 | 2.78M | silk_select_winner(psSample->Shape_Q14, Winner_selector); | 523 | 2.78M | } | 524 | | | 525 | 91.4k | subfr = 0; | 526 | 91.4k | } | 527 | | | 528 | | /* Rewhiten with new A coefs */ | 529 | 598k | start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; | 530 | 598k | silk_assert(start_idx > 0); | 531 | | | 532 | 598k | silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length], | 533 | 598k | A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder); | 534 | | | 535 | 598k | NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; | 536 | 598k | NSQ->rewhite_flag = 1; | 537 | 598k | } | 538 | 1.81M | } | 539 | | | 540 | 63.2M | silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, | 541 | 63.2M | LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay); | 542 | | | 543 | 63.2M | silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, | 544 | 63.2M | delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, 
Tilt_Q14[k], LF_shp_Q14[k], | 545 | 63.2M | Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, | 546 | 63.2M | psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay); | 547 | | | 548 | 63.2M | x16 += psEncC->subfr_length; | 549 | 63.2M | pulses += psEncC->subfr_length; | 550 | 63.2M | pxq += psEncC->subfr_length; | 551 | 63.2M | } | 552 | | | 553 | | /* Find winner */ | 554 | 18.3M | RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); | 555 | 18.3M | Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10)); | 556 | | | 557 | | /* Copy final part of signals from winner state to output and long-term filter states */ | 558 | 18.3M | psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector); | 559 | 18.3M | last_smple_idx = smpl_buf_idx + decisionDelay; | 560 | 18.3M | Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6; | 561 | 749M | for (i = 0; i < decisionDelay; i++) | 562 | 730M | { | 563 | 730M | last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; | 564 | 730M | psSample = &psDelDec.Samples[last_smple_idx]; | 565 | | | 566 | 730M | pulses[i - decisionDelay] = | 567 | 730M | (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); | 568 | 730M | pxq[i - decisionDelay] = | 569 | 730M | silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8)); | 570 | 730M | NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = | 571 | 730M | silk_select_winner(psSample->Shape_Q14, Winner_selector); | 572 | 730M | } | 573 | 312M | for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) | 574 | 293M | { | 575 | 293M | NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector); | 576 | 293M | } | 577 | 458M | for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) | 578 | 440M | { | 579 | 440M | NSQ->sAR2_Q14[i] = 
silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector); | 580 | 440M | } | 581 | | | 582 | | /* Update states */ | 583 | 18.3M | NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector); | 584 | 18.3M | NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector); | 585 | 18.3M | NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1]; | 586 | | | 587 | | /* Save quantized speech signal */ | 588 | 18.3M | silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16)); | 589 | 18.3M | silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32)); | 590 | | | 591 | | #ifdef OPUS_CHECK_ASM | 592 | | silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c))); | 593 | | silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c))); | 594 | | silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c))); | 595 | | #endif | 596 | | | 597 | 18.3M | RESTORE_STACK; | 598 | 18.3M | } |
|
599 | | |
/*
 * Short-term prediction evaluated for four 32-bit lanes at once.
 *
 *   buf32  - points at the newest __m128i history entry. Entries buf32[0]
 *            down to buf32[-9] are always read, and buf32[-10] down to
 *            buf32[-15] are read as well when order == 16.
 *   coef16 - prediction coefficients; coef16[k] is paired with buf32[-k].
 *   order  - number of taps; asserted to be 10 or 16.
 *
 * Each tap sign-extends the four 32-bit history values to 64 bits
 * (_mm256_cvtepi32_epi64), multiplies them by the coefficient shifted left
 * by 16 (_mm256_mul_epi32 yields 64-bit products), and adds the products to
 * `out` with _mm256_add_epi32. Because that addition works on independent
 * 32-bit lanes, the upper DWORD of every 64-bit element accumulates the
 * upper halves of the products without any carry from the lower DWORD.
 *
 * The return value is whatever silk_cvtepi64_epi32_high() produces from the
 * accumulator; presumably the upper DWORD of each 64-bit element packed into
 * a __m128i (helper is defined outside this block - TODO confirm).
 * NOTE(review): the caller in this file shifts the result left by 4 and
 * labels that step "Q10 -> Q14", so the result is presumably Q10 when the
 * coefficients are Q12 - confirm against the scalar reference.
 */
600 | | static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order) |
601 | 4.32G | { |
602 | 4.32G | __m256i out; |
603 | 4.32G | silk_assert(order == 10 || order == 16); |
604 | | |
605 | | /* Seed every 32-bit lane with order >> 1. Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ |
606 | 4.32G | out = _mm256_set1_epi32(order >> 1); |
607 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */ |
608 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */ |
609 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */ |
610 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */ |
611 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */ |
612 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */ |
613 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */ |
614 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */ |
615 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */ |
616 | 4.32G | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */ |
617 | | /* Taps 10..15 are accumulated only when order == 16 */ |
618 | 4.32G | if (order == 16) |
619 | 951M | { |
620 | 951M | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */ |
621 | 951M | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */ |
622 | 951M | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */ |
623 | 951M | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */ |
624 | 951M | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */ |
625 | 951M | out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */ |
626 | 951M | } |
627 | 4.32G | return silk_cvtepi64_epi32_high(out); |
628 | 4.32G | } |
629 | | |
630 | | /******************************************/ |
631 | | /* Noise shape quantizer for one subframe */ |
632 | | /******************************************/ |
633 | | static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2( |
634 | | silk_nsq_state *NSQ, /* I/O NSQ state */ |
635 | | NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ |
636 | | opus_int signalType, /* I Signal type */ |
637 | | const opus_int32 x_Q10[], /* I */ |
638 | | opus_int8 pulses[], /* O */ |
639 | | opus_int16 xq[], /* O */ |
640 | | opus_int32 sLTP_Q15[], /* I/O LTP filter state */ |
641 | | opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ |
642 | | const opus_int16 a_Q12[], /* I Short term prediction coefs */ |
643 | | const opus_int16 b_Q14[], /* I Long term prediction coefs */ |
644 | | const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ |
645 | | opus_int lag, /* I Pitch lag */ |
646 | | opus_int32 HarmShapeFIRPacked_Q14, /* I */ |
647 | | opus_int Tilt_Q14, /* I Spectral tilt */ |
648 | | opus_int32 LF_shp_Q14, /* I */ |
649 | | opus_int32 Gain_Q16, /* I */ |
650 | | opus_int Lambda_Q10, /* I */ |
651 | | opus_int offset_Q10, /* I */ |
652 | | opus_int length, /* I Input length */ |
653 | | opus_int subfr, /* I Subframe number */ |
654 | | opus_int shapingLPCOrder, /* I Shaping LPC filter order */ |
655 | | opus_int predictLPCOrder, /* I Prediction filter order */ |
656 | | opus_int warping_Q16, /* I */ |
657 | | __m128i MaskDelDec, /* I Mask of states in decision tree */ |
658 | | opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ |
659 | | opus_int decisionDelay /* I */ |
660 | | ) |
661 | 87.6M | { |
662 | 87.6M | int i; |
663 | 87.6M | opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2]; |
664 | 87.6M | opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2]; |
665 | 87.6M | opus_int32 Gain_Q10 = Gain_Q16 >> 6; |
666 | | |
667 | 4.41G | for (i = 0; i < length; i++) |
668 | 4.32G | { |
669 | | /* Perform common calculations used in all states */ |
670 | | /* NSQ_sample_struct */ |
671 | | /* Low 128 bits => 1st set */ |
672 | | /* High 128 bits => 2nd set */ |
673 | 4.32G | int j; |
674 | 4.32G | __m256i SS_Q_Q10; |
675 | 4.32G | __m256i SS_RD_Q10; |
676 | 4.32G | __m256i SS_xq_Q14; |
677 | 4.32G | __m256i SS_LF_AR_Q14; |
678 | 4.32G | __m256i SS_Diff_Q14; |
679 | 4.32G | __m256i SS_sLTP_shp_Q14; |
680 | 4.32G | __m256i SS_LPC_exc_Q14; |
681 | 4.32G | __m256i exc_Q14; |
682 | 4.32G | __m256i q_Q10, rr_Q10, rd_Q10; |
683 | 4.32G | __m256i mask; |
684 | 4.32G | __m128i LPC_pred_Q14, n_AR_Q14; |
685 | 4.32G | __m128i RDmin_Q10, RDmax_Q10; |
686 | 4.32G | __m128i n_LF_Q14; |
687 | 4.32G | __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10; |
688 | 4.32G | __m128i Winner_rand_state, Winner_selector; |
689 | 4.32G | __m128i tmp0, tmp1; |
690 | 4.32G | NSQ_del_dec_sample_struct *psLastSample, *psSample; |
691 | 4.32G | opus_int32 RDmin_ind, RDmax_ind, last_smple_idx; |
692 | 4.32G | opus_int32 LTP_pred_Q14, n_LTP_Q14; |
693 | | |
694 | | /* Long-term prediction */ |
695 | 4.32G | if (signalType == TYPE_VOICED) |
696 | 115M | { |
697 | | /* Unrolled loop */ |
698 | | /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ |
699 | 115M | LTP_pred_Q14 = 2; |
700 | 115M | LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]); |
701 | 115M | LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]); |
702 | 115M | LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]); |
703 | 115M | LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]); |
704 | 115M | LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]); |
705 | 115M | LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */ |
706 | 115M | pred_lag_ptr++; |
707 | 115M | } |
708 | 4.21G | else |
709 | 4.21G | { |
710 | 4.21G | LTP_pred_Q14 = 0; |
711 | 4.21G | } |
712 | | |
713 | | /* Long-term shaping */ |
714 | 4.32G | if (lag > 0) |
715 | 183M | { |
716 | | /* Symmetric, packed FIR coefficients */ |
717 | 183M | n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]); |
718 | 183M | n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14); |
719 | 183M | n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14); |
720 | 183M | n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */ |
721 | 183M | shp_lag_ptr++; |
722 | 183M | } |
723 | 4.14G | else |
724 | 4.14G | { |
725 | 4.14G | n_LTP_Q14 = 0; |
726 | 4.14G | } |
727 | | |
728 | | /* BEGIN Updating Delayed Decision States */ |
729 | | |
730 | | /* Generate dither */ |
731 | 4.32G | psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed); |
732 | | |
733 | | /* Short-term prediction */ |
734 | 4.32G | LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder); |
735 | 4.32G | LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */ |
736 | | |
737 | | /* Noise shape feedback */ |
738 | 4.32G | silk_assert(shapingLPCOrder > 0); |
739 | 4.32G | silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */ |
740 | | /* Output of lowpass section */ |
741 | 4.32G | tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16)); |
742 | 4.32G | n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1); |
743 | 94.4G | for (j = 0; j < shapingLPCOrder - 1; j++) |
744 | 90.0G | { |
745 | | /* Output of allpass section */ |
746 | 90.0G | tmp1 = psDelDec->sAR2_Q14[j]; |
747 | 90.0G | psDelDec->sAR2_Q14[j] = tmp0; |
748 | 90.0G | n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j])); |
749 | 90.0G | tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16)); |
750 | 90.0G | } |
751 | 4.32G | psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0; |
752 | 4.32G | n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1])); |
753 | | |
754 | 4.32G | n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1); /* Q11 -> Q12 */ |
755 | 4.32G | n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */ |
756 | 4.32G | n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2); /* Q12 -> Q14 */ |
757 | | |
758 | 4.32G | tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */ |
759 | 4.32G | tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16); /* Q12 */ |
760 | 4.32G | n_LF_Q14 = _mm_add_epi32(tmp0, tmp1); /* Q12 */ |
761 | 4.32G | n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2); /* Q12 -> Q14 */ |
762 | | |
763 | | /* Input minus prediction plus noise feedback */ |
764 | | /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ |
765 | 4.32G | tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14); /* Q14 */ |
766 | 4.32G | tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q13 */ |
767 | 4.32G | tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0); /* Q13 */ |
768 | 4.32G | tmp0 = silk_mm_srai_round_epi32(tmp0, 4); /* Q10 */ |
769 | | |
770 | 4.32G | r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */ |
771 | | |
772 | | /* Flip sign depending on dither */ |
773 | 4.32G | r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed); |
774 | 4.32G | r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10); |
775 | | |
776 | | /* Find two quantization level candidates and measure their rate-distortion */ |
777 | 4.32G | q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10)); |
778 | 4.32G | q1_Q0 = _mm_srai_epi32(q1_Q10, 10); |
779 | 4.32G | if (Lambda_Q10 > 2048) |
780 | 16.4M | { |
781 | | /* For aggressive RDO, the bias becomes more than one pulse. */ |
782 | 16.4M | tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */ |
783 | 16.4M | q1_Q0 = _mm_srai_epi32(q1_Q10, 31); |
784 | 16.4M | tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128()); |
785 | 16.4M | tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10); |
786 | 16.4M | q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1); |
787 | 16.4M | } |
788 | | |
789 | 4.32G | tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0); |
790 | 4.32G | q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0); |
791 | 4.32G | q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10)); |
792 | | |
793 | | /* check if q1_Q0 is 0 or -1 */ |
794 | 4.32G | tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0); |
795 | 4.32G | tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128()); |
796 | 4.32G | tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1); |
797 | 4.32G | q2_Q10 = _mm_add_epi32(q1_Q10, tmp0); |
798 | 4.32G | q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10); |
799 | | |
800 | 4.32G | rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10); |
801 | 4.32G | rd_Q10 = _mm256_abs_epi32(q_Q10); |
802 | 4.32G | rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10); |
803 | 4.32G | rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10)); |
804 | 4.32G | rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10); |
805 | 4.32G | rd_Q10 = _mm256_srai_epi32(rd_Q10, 10); |
806 | | |
807 | 4.32G | mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1))); |
808 | 4.32G | SS_RD_Q10 = _mm256_add_epi32( |
809 | 4.32G | _mm256_broadcastsi128_si256(psDelDec->RD_Q10), |
810 | 4.32G | _mm256_blendv_epi8( |
811 | 4.32G | _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1), |
812 | 4.32G | rd_Q10, |
813 | 4.32G | mask)); |
814 | 4.32G | SS_Q_Q10 = _mm256_blendv_epi8( |
815 | 4.32G | _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1), |
816 | 4.32G | q_Q10, |
817 | 4.32G | mask); |
818 | | |
819 | | /* Update states for best and second best quantization */ |
820 | | |
821 | | /* Quantized excitation */ |
822 | 4.32G | exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed)); |
823 | | |
824 | | /* Add predictions */ |
825 | 4.32G | exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14)); |
826 | 4.32G | SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1); |
827 | 4.32G | SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14)); |
828 | | |
829 | | /* Update states */ |
830 | 4.32G | SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4))); |
831 | 4.32G | SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14)); |
832 | 4.32G | SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14)); |
833 | | |
834 | | /* END Updating Delayed Decision States */ |
835 | | |
836 | 4.32G | *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY; |
837 | 4.32G | last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY; |
838 | 4.32G | psLastSample = &psDelDec->Samples[last_smple_idx]; |
839 | | |
840 | | /* Find winner */ |
841 | 4.32G | RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec); |
842 | 4.32G | Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10))); |
843 | | |
844 | | /* Increase RD values of expired states */ |
845 | 4.32G | Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector); |
846 | | |
847 | 4.32G | SS_RD_Q10 = _mm256_blendv_epi8( |
848 | 4.32G | _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)), |
849 | 4.32G | SS_RD_Q10, |
850 | 4.32G | _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state))); |
851 | | |
852 | | /* find worst in first set */ |
853 | 4.32G | RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec); |
854 | | /* find best in second set */ |
855 | 4.32G | RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec); |
856 | | |
857 | | /* Replace a state if best from second set outperforms worst in first set */ |
858 | 4.32G | tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10); |
859 | 4.32G | if (!_mm_test_all_zeros(tmp0, tmp0)) |
860 | 996M | { |
861 | 996M | int t; |
862 | 996M | RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0)); |
863 | 996M | RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1)); |
864 | 996M | tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3))); |
865 | 996M | tmp0 = _mm_blendv_epi8( |
866 | 996M | _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0), |
867 | 996M | silk_index_to_selector(RDmin_ind), |
868 | 996M | tmp1); |
869 | 69.6G | for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++) |
870 | 68.6G | { |
871 | 68.6G | psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0); |
872 | 68.6G | } |
873 | 996M | psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0); |
874 | 996M | psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0); |
875 | 24.9G | for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++) |
876 | 23.9G | { |
877 | 23.9G | psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0); |
878 | 23.9G | } |
879 | 40.8G | for (t = 0; t < DECISION_DELAY; t++) |
880 | 39.8G | { |
881 | 39.8G | psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0); |
882 | 39.8G | psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0); |
883 | 39.8G | psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0); |
884 | 39.8G | psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0); |
885 | 39.8G | psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0); |
886 | 39.8G | } |
887 | 996M | mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1)); |
888 | 996M | SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask); |
889 | 996M | SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask); |
890 | 996M | SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask); |
891 | 996M | SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask); |
892 | 996M | SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask); |
893 | 996M | SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask); |
894 | 996M | SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask); |
895 | 996M | } |
896 | | |
897 | | /* Write samples from winner to output and long-term filter states */ |
898 | 4.32G | if (subfr > 0 || i >= decisionDelay) |
899 | 3.32G | { |
900 | 3.32G | pulses[i - decisionDelay] = |
901 | 3.32G | (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10); |
902 | 3.32G | xq[i - decisionDelay] = |
903 | 3.32G | silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8)); |
904 | 3.32G | NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] = |
905 | 3.32G | silk_select_winner(psLastSample->Shape_Q14, Winner_selector); |
906 | 3.32G | sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] = |
907 | 3.32G | silk_select_winner(psLastSample->Pred_Q15, Winner_selector); |
908 | 3.32G | } |
909 | 4.32G | NSQ->sLTP_shp_buf_idx++; |
910 | 4.32G | NSQ->sLTP_buf_idx++; |
911 | | |
912 | | /* Update states */ |
913 | 4.32G | psSample = &psDelDec->Samples[*smpl_buf_idx]; |
914 | 4.32G | psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10)); |
915 | 4.32G | psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14); |
916 | 4.32G | psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14); |
917 | 4.32G | psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14); |
918 | 4.32G | psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10); |
919 | 4.32G | psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14); |
920 | 4.32G | psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10); |
921 | 4.32G | psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14); |
922 | 4.32G | psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14); |
923 | 4.32G | psSample->RandState = psDelDec->Seed; |
924 | 4.32G | delayedGain_Q10[*smpl_buf_idx] = Gain_Q10; |
925 | 4.32G | } |
926 | | /* Update LPC states */ |
927 | 1.49G | for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) |
928 | 1.40G | { |
929 | 1.40G | psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i]; |
930 | 1.40G | } |
931 | 87.6M | } |
932 | | |
933 | | static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( |
934 | | const silk_encoder_state *psEncC, /* I Encoder State */ |
935 | | silk_nsq_state *NSQ, /* I/O NSQ state */ |
936 | | NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ |
937 | | const opus_int16 x16[], /* I Input */ |
938 | | opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O Input scaled with 1/Gain in Q10 */ |
939 | | const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ |
940 | | opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ |
941 | | opus_int subfr, /* I Subframe number */ |
942 | | const opus_int LTP_scale_Q14, /* I LTP state scaling */ |
943 | | const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I */ |
944 | | const opus_int pitchL[MAX_NB_SUBFR], /* I Pitch lag */ |
945 | | const opus_int signal_type, /* I Signal type */ |
946 | | const opus_int decisionDelay /* I Decision delay */ |
947 | | ) |
948 | 87.6M | { |
949 | 87.6M | int i; |
950 | 87.6M | opus_int lag; |
951 | 87.6M | opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; |
952 | 87.6M | NSQ_del_dec_sample_struct *psSample; |
953 | | |
954 | 87.6M | lag = pitchL[subfr]; |
955 | 87.6M | inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47); |
956 | 87.6M | silk_assert(inv_gain_Q31 != 0); |
957 | | |
958 | | /* Scale input */ |
959 | 87.6M | inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5); |
960 | 1.16G | for (i = 0; i < psEncC->subfr_length; i+=4) |
961 | 1.08G | { |
962 | 1.08G | __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i])); |
963 | 1.08G | x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16); |
964 | 1.08G | _mm_storeu_si128((__m128i*)(void*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x)); |
965 | 1.08G | } |
966 | | |
967 | | /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ |
968 | 87.6M | if (NSQ->rewhite_flag) |
969 | 823k | { |
970 | 823k | if (subfr == 0) |
971 | 687k | { |
972 | | /* Do LTP downscaling */ |
973 | 687k | inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2); |
974 | 687k | } |
975 | 67.5M | for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++) |
976 | 66.7M | { |
977 | 66.7M | silk_assert(i < MAX_FRAME_LENGTH); |
978 | 66.7M | sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]); |
979 | 66.7M | } |
980 | 823k | } |
981 | | |
982 | | /* Adjust for changing gain */ |
983 | 87.6M | if (Gains_Q16[subfr] != NSQ->prev_gain_Q16) |
984 | 6.05M | { |
985 | 6.05M | gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16); |
986 | | |
987 | | /* Scale long-term shaping state */ |
988 | 308M | for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4) |
989 | 302M | { |
990 | 302M | opus_int32 *p = &NSQ->sLTP_shp_Q14[i]; |
991 | 302M | _mm_storeu_si128((__m128i*)(void*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)(void*)p), gain_adj_Q16)); |
992 | 302M | } |
993 | | |
994 | | /* Scale long-term prediction state */ |
995 | 6.05M | if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0) |
996 | 1.13M | { |
997 | 59.1M | for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++) |
998 | 58.0M | { |
999 | 58.0M | sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16; |
1000 | 58.0M | } |
1001 | 1.13M | } |
1002 | | |
1003 | | /* Scale scalar states */ |
1004 | 6.05M | psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16); |
1005 | 6.05M | psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16); |
1006 | | |
1007 | | /* Scale short-term prediction and shaping states */ |
1008 | 102M | for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) |
1009 | 96.8M | { |
1010 | 96.8M | psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16); |
1011 | 96.8M | } |
1012 | 248M | for (i = 0; i < DECISION_DELAY; i++) |
1013 | 242M | { |
1014 | 242M | psSample = &psDelDec->Samples[i]; |
1015 | 242M | psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16); |
1016 | 242M | psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16); |
1017 | 242M | } |
1018 | 151M | for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) |
1019 | 145M | { |
1020 | 145M | psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16); |
1021 | 145M | } |
1022 | | |
1023 | | /* Save inverse gain */ |
1024 | 6.05M | NSQ->prev_gain_Q16 = Gains_Q16[subfr]; |
1025 | 6.05M | } |
1026 | 87.6M | } |
1027 | | |
1028 | | static OPUS_INLINE void silk_LPC_analysis_filter_avx2( |
1029 | | opus_int16 *out, /* O Output signal */ |
1030 | | const opus_int16 *in, /* I Input signal */ |
1031 | | const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ |
1032 | | const opus_int32 len, /* I Signal length */ |
1033 | | const opus_int32 order /* I Filter order */ |
1034 | | ) |
1035 | 823k | { |
1036 | 823k | int i; |
1037 | 823k | opus_int32 out32_Q12, out32; |
1038 | 823k | silk_assert(order == 10 || order == 16); |
1039 | | |
1040 | 67.5M | for(i = order; i < len; i++ ) |
1041 | 66.7M | { |
1042 | 66.7M | const opus_int16 *in_ptr = &in[ i ]; |
1043 | | /* Allowing wrap around so that two wraps can cancel each other. The rare |
1044 | | cases where the result wraps around can only be triggered by invalid streams*/ |
1045 | | |
1046 | 66.7M | __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&in_ptr[-8])); |
1047 | 66.7M | __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)& B[0])); |
1048 | 66.7M | __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v)); |
1049 | 66.7M | if (order > 10) |
1050 | 8.84M | { |
1051 | 8.84M | in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&in_ptr[-16])); |
1052 | 8.84M | B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&B [8])); |
1053 | 8.84M | B_v = silk_mm256_reverse_epi32(B_v); |
1054 | 8.84M | } |
1055 | 57.8M | else |
1056 | 57.8M | { |
1057 | 57.8M | in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10])); |
1058 | 57.8M | B_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B [8])); |
1059 | 57.8M | B_v = _mm256_shuffle_epi32(B_v, 0x01); |
1060 | 57.8M | } |
1061 | 66.7M | sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v)); |
1062 | | |
1063 | 66.7M | out32_Q12 = silk_mm256_hsum_epi32(sum); |
1064 | | |
1065 | | /* Subtract prediction */ |
1066 | 66.7M | out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 ); |
1067 | | |
1068 | | /* Scale to Q0 */ |
1069 | 66.7M | out32 = silk_sar_round_32(out32_Q12, 12); |
1070 | | |
1071 | | /* Saturate output */ |
1072 | 66.7M | out[ i ] = silk_sat16(out32); |
1073 | 66.7M | } |
1074 | | |
1075 | | /* Set first d output samples to zero */ |
1076 | 823k | silk_memset( out, 0, order * sizeof( opus_int16 ) ); |
1077 | 823k | } |