/src/opus/silk/x86/NSQ_del_dec_avx2.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /***********************************************************************  | 
2  |  | Copyright (c) 2021 Google Inc.  | 
3  |  | Redistribution and use in source and binary forms, with or without  | 
4  |  | modification, are permitted provided that the following conditions  | 
5  |  | are met:  | 
6  |  | - Redistributions of source code must retain the above copyright notice,  | 
7  |  | this list of conditions and the following disclaimer.  | 
8  |  | - Redistributions in binary form must reproduce the above copyright  | 
9  |  | notice, this list of conditions and the following disclaimer in the  | 
10  |  | documentation and/or other materials provided with the distribution.  | 
11  |  | - Neither the name of Internet Society, IETF or IETF Trust, nor the  | 
12  |  | names of specific contributors, may be used to endorse or promote  | 
13  |  | products derived from this software without specific prior written  | 
14  |  | permission.  | 
15  |  | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"  | 
16  |  | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE  | 
17  |  | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE  | 
18  |  | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE  | 
19  |  | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR  | 
20  |  | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF  | 
21  |  | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS  | 
22  |  | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN  | 
23  |  | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)  | 
24  |  | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE  | 
25  |  | POSSIBILITY OF SUCH DAMAGE.  | 
26  |  | ***********************************************************************/  | 
27  |  |  | 
28  |  | #ifdef HAVE_CONFIG_H  | 
29  |  | #include "config.h"  | 
30  |  | #endif  | 
31  |  |  | 
32  |  | #ifdef OPUS_CHECK_ASM  | 
33  |  | #include <string.h>  | 
34  |  | #endif  | 
35  |  |  | 
36  |  | #include "opus_defines.h"  | 
37  |  | #include <immintrin.h>  | 
38  |  |  | 
39  |  | #include "main.h"  | 
40  |  | #include "stack_alloc.h"  | 
41  |  | #include "NSQ.h"  | 
42  |  | #include "celt/x86/x86cpu.h"  | 
43  |  |  | 
44  |  | /* Checks the assumptions behind this AVX2 path with silk_assert(), then returns non-zero only when nStatesDelayedDecision is 3 or 4 */  | 
45  |  | static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)  | 
46  | 0  | { | 
47  |  |     /* This optimization is based on these assumptions.       */  | 
48  |  |     /* They are fundamental, hence asserts are used. Should   */  | 
49  |  |     /* any assert trigger, we have to revisit all related     */  | 
50  |  |     /* code to make sure it still functions the same as the   */  | 
51  |  |     /* C implementation.                                      */  | 
52  | 0  |     silk_assert(MAX_DEL_DEC_STATES  <= 4      &&  | 
53  | 0  |                 MAX_FRAME_LENGTH     % 4 == 0 &&  | 
54  | 0  |                 MAX_SUB_FRAME_LENGTH % 4 == 0 &&  | 
55  | 0  |                 LTP_MEM_LENGTH_MS    % 4 == 0 );  | 
56  | 0  |     silk_assert(psEncC->fs_kHz ==  8 ||  | 
57  | 0  |                 psEncC->fs_kHz == 12 ||  | 
58  | 0  |                 psEncC->fs_kHz == 16 );  | 
59  | 0  |     silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR &&  | 
60  | 0  |                 psEncC->nb_subfr > 0             );  | 
61  | 0  |     silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES &&  | 
62  | 0  |                 psEncC->nStatesDelayedDecision > 0                   );  | 
63  | 0  |     silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS);  | 
64  |  |  | 
65  |  |     /* Regressions were observed on certain AMD Zen CPUs when      */  | 
66  |  |     /* nStatesDelayedDecision is 1 or 2. Ideally we should detect  */  | 
67  |  |     /* these CPUs and enable this optimization on others; however, */  | 
68  |  |     /* there is no good way to do so under current OPUS framework. */  | 
69  | 0  |     return psEncC->nStatesDelayedDecision == 3 ||  | 
70  | 0  |            psEncC->nStatesDelayedDecision == 4;  | 
71  | 0  | }  | 
72  |  |  | 
73  |  | /* Intrinsics not defined on MSVC */  | 
74  |  | #ifdef _MSC_VER  | 
75  |  | #include <Intsafe.h>  | 
76  |  | static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res) /* MSVC stand-in for the GCC/Clang builtin: stores the wrapped sum in *res and returns non-zero when a + b overflowed */  | 
77  |  | { | 
78  |  |     *res = (opus_int32)((unsigned int)a + (unsigned int)b); /* add in unsigned arithmetic: signed overflow is undefined, unsigned wraps */  | 
79  |  |     return ((*res ^ a) & (*res ^ b)) < 0; /* overflow iff the sign of the sum differs from the sign of both operands */  | 
80  |  | }  | 
81  |  | static inline int __builtin_ctz(unsigned int x) /* MSVC stand-in for the GCC/Clang builtin, implemented with _BitScanForward */  | 
82  |  | { | 
83  |  |     DWORD res = 0;  | 
84  |  |     return _BitScanForward(&res, x) ? res : 32; /* bit index written by _BitScanForward, or 32 when it returns zero */  | 
85  |  | }  | 
86  |  | #endif  | 
87  |  |  | 
88  |  | static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num) /* Packs 32-bit elements 1, 3, 5 and 7 of num (the upper half of each 64-bit element) into a __m128i */  | 
89  | 0  | { | 
90  | 0  |     return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1))); /* only the low four permuted elements are kept by the cast */  | 
91  | 0  | }  | 
92  |  |  | 
93  |  | static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num) /* Clamps num to [silk_int16_MIN, silk_int16_MAX] and returns it as opus_int16 */  | 
94  | 0  | { | 
95  | 0  |     num = num > silk_int16_MAX ? silk_int16_MAX : num; /* upper clamp */  | 
96  | 0  |     num = num < silk_int16_MIN ? silk_int16_MIN : num; /* lower clamp */  | 
97  | 0  |     return num;  | 
98  | 0  | }  | 
99  |  |  | 
100  |  | static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits) /* Right shift of a by bits with rounding; bits must be in 1..30 */  | 
101  | 0  | { | 
102  | 0  |     silk_assert(bits > 0 && bits < 31);  | 
103  | 0  |     a += 1 << (bits-1); /* add half of the final unit before shifting */  | 
104  | 0  |     return a >> bits;  | 
105  | 0  | }  | 
106  |  |  | 
107  |  | static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits) /* Rounded right shift by bits of the (a * b) >> 16 product; the OPUS_CHECK_ASM build uses the silk_SMULWW/silk_RSHIFT_ROUND macros instead of the 64-bit path */  | 
108  | 0  | { | 
109  | 0  |     silk_assert(bits > 0 && bits < 63);  | 
110  |  | #ifdef OPUS_CHECK_ASM  | 
111  |  |     return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);  | 
112  |  | #else  | 
113  |  |     /* This code is more correct, but it won't overflow like the C code in some rare cases. */  | 
114  | 0  |     silk_assert(bits > 0 && bits < 63); /* NOTE(review): repeats the assert above; since bits grows by 16 below, bits < 48 looks like the bound needed to keep the shift count under 64 - confirm */  | 
115  | 0  |     opus_int64 t = ((opus_int64)a) * ((opus_int64)b); /* full 64-bit product */  | 
116  | 0  |     bits += 16; /* fold the >> 16 of the product into the final shift */  | 
117  | 0  |     t += 1ull << (bits-1); /* rounding offset */  | 
118  | 0  |     return t >> bits;  | 
119  | 0  | #endif  | 
120  | 0  | }  | 
121  |  |  | 
122  |  | static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b) /* Saturating 32-bit add: returns a + b, or silk_int32_MAX / silk_int32_MIN when the sum overflows */  | 
123  | 0  | { | 
124  | 0  |     opus_int32 sum;  | 
125  | 0  |     if (__builtin_sadd_overflow(a, b, &sum))  | 
126  | 0  |     { | 
127  | 0  |         return a >= 0 ? silk_int32_MAX : silk_int32_MIN; /* saturation direction is chosen from the sign of a */  | 
128  | 0  |     }  | 
129  | 0  |     return sum;  | 
130  | 0  | }  | 
131  |  |  | 
132  |  | static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits) /* Per 32-bit element: adds 1 << (bits - 1), then shifts right arithmetically by bits; bits must be in 1..30 */  | 
133  | 0  | { | 
134  | 0  |     silk_assert(bits > 0 && bits < 31);  | 
135  | 0  |     return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits);  | 
136  | 0  | }  | 
137  |  |  | 
138  |  | /* add/subtract with output saturated */  | 
139  |  | static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b) /* Per 32-bit element a + b; elements that overflowed are replaced by SAT */  | 
140  | 0  | { | 
141  | 0  |     __m128i r = _mm_add_epi32(a, b);  | 
142  | 0  |     __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r));           /* OF = (sum ^ a) & (sum ^ b)   */  | 
143  | 0  |     __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */  | 
144  | 0  |     return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); /* the sign bit of OF, spread over the whole element, selects SAT */  | 
145  | 0  | }  | 
146  |  | static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b) /* Per 32-bit element a - b; elements that overflowed are replaced by SAT */  | 
147  | 0  | { | 
148  | 0  |     __m128i r = _mm_sub_epi32(a, b);  | 
149  | 0  |     __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */  | 
150  | 0  |     __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */  | 
151  | 0  |     return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); /* the sign bit of OF, spread over the whole element, selects SAT */  | 
152  | 0  | }  | 
153  |  | static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b) /* 256-bit form of the saturating subtract: per 32-bit element a - b, with overflowed elements replaced by SAT */  | 
154  | 0  | { | 
155  | 0  |     __m256i r = _mm256_sub_epi32(a, b);  | 
156  | 0  |     __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */  | 
157  | 0  |     __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */  | 
158  | 0  |     return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31)); /* the sign bit of OF, spread over the whole element, selects SAT */  | 
159  | 0  | }  | 
160  |  |  | 
161  |  | static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2) /* Clamps each 32-bit element of num to the range spanned by limit1 and limit2, which may be given in either order */  | 
162  | 0  | { | 
163  | 0  |     opus_int32 lo = limit1 < limit2 ? limit1 : limit2; /* smaller of the two limits */  | 
164  | 0  |     opus_int32 hi = limit1 > limit2 ? limit1 : limit2; /* larger of the two limits */  | 
165  |  | 
  | 
166  | 0  |     num = _mm_min_epi32(num, _mm_set1_epi32(hi));  | 
167  | 0  |     num = _mm_max_epi32(num, _mm_set1_epi32(lo));  | 
168  | 0  |     return num;  | 
169  | 0  | }  | 
170  |  |  | 
171  |  | /* cond < 0 ? -num : num */  | 
172  |  | static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond) /* Per 32-bit element: negates num where cond is negative, otherwise passes num through */  | 
173  | 0  | { | 
174  | 0  |     return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1))); /* OR with 1 keeps the control value non-zero without changing its sign; presumably needed because _mm_sign_epi32 zeroes elements whose control is 0 - see the intrinsic's documentation */  | 
175  | 0  | }  | 
176  |  | static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond) /* 256-bit form: per 32-bit element, negates num where cond is negative, otherwise passes num through */  | 
177  | 0  | { | 
178  | 0  |     return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1))); /* OR with 1 keeps the control value non-zero without changing its sign */  | 
179  | 0  | }  | 
180  |  |  | 
181  |  | /* (a32 * b32) >> 16 */  | 
182  |  | static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b) /* Multiplies each sign-extended 32-bit element of a by b in 64 bits and returns the products shifted right by 16, truncated to 32 bits */  | 
183  | 0  | { | 
184  | 0  |     return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16)); /* shifting left by 16 and keeping the upper 32-bit half is equivalent to taking bits 16..47 of the product */  | 
185  | 0  | }  | 
186  |  |  | 
187  |  | /* (a32 * (opus_int32)((opus_int16)(b32))) >> 16, output has to be a 32-bit int */  | 
188  |  | static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b) /* Multiplies each 32-bit element of a by the low 16 bits of b and returns the products shifted right by 16 */  | 
189  | 0  | { | 
190  | 0  |     return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(silk_LSHIFT(b, 16)))); /* b << 16 drops the upper half of b and pre-scales the product, so the upper 32-bit half of each 64-bit result is the wanted value */  | 
191  | 0  | }  | 
192  |  |  | 
193  |  | /* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)), output has to be a 32-bit int */  | 
194  |  | static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b) /* Per 32-bit element: signed product of the low 16 bits of a and the low 16 bits of b, as a 32-bit result */  | 
195  | 0  | { | 
196  | 0  |     const char FF = (char)0xFF; /* shuffle index with the top bit set; used for the bytes that are not needed */  | 
197  | 0  |     __m256i msk = _mm256_set_epi8(  | 
198  | 0  |         FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0,  | 
199  | 0  |         FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0); /* per 128-bit lane: gather the low 16-bit half of every 32-bit element into the low 64 bits */  | 
200  | 0  |     __m256i lo = _mm256_mullo_epi16(a, b); /* low 16 bits of every 16x16 product */  | 
201  | 0  |     __m256i hi = _mm256_mulhi_epi16(a, b); /* high 16 bits of every 16x16 product */  | 
202  | 0  |     lo = _mm256_shuffle_epi8(lo, msk);  | 
203  | 0  |     hi = _mm256_shuffle_epi8(hi, msk);  | 
204  | 0  |     return _mm256_unpacklo_epi16(lo, hi); /* interleave low and high halves back into 32-bit products */  | 
205  | 0  | }  | 
206  |  |  | 
207  |  | static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v) /* Reverses the order of the eight 32-bit elements of v */  | 
208  | 0  | { | 
209  | 0  |     v = _mm256_shuffle_epi32(v, 0x1B); /* reverse the four elements inside each 128-bit half */  | 
210  | 0  |     v = _mm256_permute4x64_epi64(v, 0x4E); /* swap the two 128-bit halves */  | 
211  | 0  |     return v;  | 
212  | 0  | }  | 
213  |  |  | 
214  |  | static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v) /* Returns the sum of the eight 32-bit elements of v, computed with 32-bit adds */  | 
215  | 0  | { | 
216  | 0  |     __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0)); /* 8 -> 4 partial sums */  | 
217  | 0  |     sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); /* add the 64-bit halves swapped */  | 
218  | 0  |     sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); /* add neighbouring elements swapped */  | 
219  | 0  |     return _mm_cvtsi128_si32(sum); /* element 0 now holds the total */  | 
220  | 0  | }  | 
221  |  |  | 
222  |  | static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num) /* Returns a vector whose four 32-bit elements all hold the signed minimum of the elements of num */  | 
223  | 0  | { | 
224  | 0  |     num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */  | 
225  | 0  |     num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */  | 
226  | 0  |     return num;  | 
227  | 0  | }  | 
228  |  |  | 
229  |  | static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num) /* Returns a vector whose four 32-bit elements all hold the signed maximum of the elements of num */  | 
230  | 0  | { | 
231  | 0  |     num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */  | 
232  | 0  |     num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */  | 
233  | 0  |     return num;  | 
234  | 0  | }  | 
235  |  |  | 
236  |  | static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask) /* Horizontal minimum of num that ignores the elements selected by mask */  | 
237  | 0  | { | 
238  | 0  |     num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask); /* masked elements become silk_int32_MAX so they cannot win the minimum */  | 
239  | 0  |     return silk_mm_hmin_epi32(num);  | 
240  | 0  | }  | 
241  |  |  | 
242  |  | static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask) /* Horizontal maximum of num that ignores the elements selected by mask */  | 
243  | 0  | { | 
244  | 0  |     num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask); /* masked elements become silk_int32_MIN so they cannot win the maximum */  | 
245  | 0  |     return silk_mm_hmax_epi32(num);  | 
246  | 0  | }  | 
247  |  |  | 
248  |  | static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed) /* Per 32-bit element: seed * RAND_MULTIPLIER + RAND_INCREMENT, keeping the low 32 bits. Despite the mm256 in the name it operates on a 128-bit vector */  | 
249  | 0  | { | 
250  | 0  |     seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER));  | 
251  | 0  |     seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT));  | 
252  | 0  |     return seed;  | 
253  | 0  | }  | 
254  |  |  | 
255  |  | static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b) /* Returns the index (0..3) of the lowest 32-bit element where a and b are equal; asserts that at least one element matches */  | 
256  | 0  | { | 
257  | 0  |     unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111; /* keep one bit per 32-bit element: bits 0, 4, 8 and 12 */  | 
258  | 0  |     silk_assert(mask != 0);  | 
259  | 0  |     return __builtin_ctz(mask) >> 2; /* bit position / 4 = element index */  | 
260  | 0  | }  | 
261  |  |  | 
262  |  | static __m128i silk_index_to_selector(opus_int32 index) /* Builds a byte-shuffle control whose every 32-bit element holds the byte offsets of 32-bit element number index */  | 
263  | 0  | { | 
264  | 0  |     silk_assert(index < 4);  | 
265  | 0  |     index <<= 2; /* element index -> byte offset of its first byte */  | 
266  | 0  |     return _mm_set_epi8(  | 
267  | 0  |         index + 3, index + 2, index + 1, index + 0,  | 
268  | 0  |         index + 3, index + 2, index + 1, index + 0,  | 
269  | 0  |         index + 3, index + 2, index + 1, index + 0,  | 
270  | 0  |         index + 3, index + 2, index + 1, index + 0);  | 
271  | 0  | }  | 
272  |  |  | 
273  |  | static opus_int32 silk_select_winner(__m128i num, __m128i selector) /* Byte-shuffles num with selector and returns the lowest 32-bit element of the result */  | 
274  | 0  | { | 
275  | 0  |     return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector));  | 
276  | 0  | }  | 
277  |  |  | 
278  |  | typedef struct /* One entry of the delayed-decision sample buffer; each __m128i presumably packs one 32-bit value per delayed-decision state (up to four) - confirm against the quantizer code */  | 
279  |  | { | 
280  |  |     __m128i RandState;  | 
281  |  |     __m128i Q_Q10;  | 
282  |  |     __m128i Xq_Q14;  | 
283  |  |     __m128i Pred_Q15;  | 
284  |  |     __m128i Shape_Q14;  | 
285  |  | } NSQ_del_dec_sample_struct;  | 
286  |  |  | 
287  |  | typedef struct /* Working state of the delayed-decision quantizer, held as __m128i vectors; silk_NSQ_del_dec_avx2 fills several of them by broadcasting scalar NSQ state with _mm_set1_epi32 */  | 
288  |  | { | 
289  |  |     __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH];  | 
290  |  |     __m128i LF_AR_Q14;  | 
291  |  |     __m128i Seed;  | 
292  |  |     __m128i SeedInit;  | 
293  |  |     __m128i RD_Q10;  | 
294  |  |     __m128i Diff_Q14;  | 
295  |  |     __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER];  | 
296  |  |     NSQ_del_dec_sample_struct Samples[DECISION_DELAY]; /* indexed modulo DECISION_DELAY by the caller */  | 
297  |  | } NSQ_del_dec_struct;  | 
298  |  |  | 
299  |  | static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(  | 
300  |  |     const silk_encoder_state *psEncC,          /* I    Encoder State                   */  | 
301  |  |     silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */  | 
302  |  |     NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */  | 
303  |  |     const opus_int16 x16[],                    /* I    Input                           */  | 
304  |  |     opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */  | 
305  |  |     const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */  | 
306  |  |     opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */  | 
307  |  |     opus_int subfr,                            /* I    Subframe number                 */  | 
308  |  |     const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */  | 
309  |  |     const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */  | 
310  |  |     const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */  | 
311  |  |     const opus_int signal_type,                /* I    Signal type                     */  | 
312  |  |     const opus_int decisionDelay               /* I    Decision delay                  */  | 
313  |  | );  | 
314  |  |  | 
315  |  | /*******************************************/  | 
316  |  | /* LPC analysis filter                     */  | 
317  |  | /* NB! State is kept internally and the    */  | 
318  |  | /* filter always starts with zero state    */  | 
319  |  | /* first d output samples are set to zero  */  | 
320  |  | /*******************************************/  | 
321  |  | static OPUS_INLINE void silk_LPC_analysis_filter_avx2(  | 
322  |  |     opus_int16                  *out,               /* O    Output signal                           */  | 
323  |  |     const opus_int16            *in,                /* I    Input signal                            */  | 
324  |  |     const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */  | 
325  |  |     const opus_int32            len,                /* I    Signal length                           */  | 
326  |  |     const opus_int32            order               /* I    Filter order                            */  | 
327  |  | );  | 
328  |  |  | 
329  |  | /******************************************/  | 
330  |  | /* Noise shape quantizer for one subframe */  | 
331  |  | /******************************************/  | 
332  |  | static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(  | 
333  |  |     silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */  | 
334  |  |     NSQ_del_dec_struct psDelDec[],              /* I/O  Delayed decision states            */  | 
335  |  |     opus_int signalType,                        /* I    Signal type                        */  | 
336  |  |     const opus_int32 x_Q10[],                   /* I                                       */  | 
337  |  |     opus_int8 pulses[],                         /* O                                       */  | 
338  |  |     opus_int16 xq[],                            /* O                                       */  | 
339  |  |     opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */  | 
340  |  |     opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */  | 
341  |  |     const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */  | 
342  |  |     const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */  | 
343  |  |     const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */  | 
344  |  |     opus_int lag,                               /* I    Pitch lag                          */  | 
345  |  |     opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */  | 
346  |  |     opus_int Tilt_Q14,                          /* I    Spectral tilt                      */  | 
347  |  |     opus_int32 LF_shp_Q14,                      /* I                                       */  | 
348  |  |     opus_int32 Gain_Q16,                        /* I                                       */  | 
349  |  |     opus_int Lambda_Q10,                        /* I                                       */  | 
350  |  |     opus_int offset_Q10,                        /* I                                       */  | 
351  |  |     opus_int length,                            /* I    Input length                       */  | 
352  |  |     opus_int subfr,                             /* I    Subframe number                    */  | 
353  |  |     opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */  | 
354  |  |     opus_int predictLPCOrder,                   /* I    Prediction filter order            */  | 
355  |  |     opus_int warping_Q16,                       /* I                                       */  | 
356  |  |     __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */  | 
357  |  |     opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */  | 
358  |  |     opus_int decisionDelay                      /* I                                       */  | 
359  |  | );  | 
360  |  |  | 
361  |  | void silk_NSQ_del_dec_avx2(  | 
362  |  |     const silk_encoder_state *psEncC,                            /* I    Encoder State               */  | 
363  |  |     silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */  | 
364  |  |     SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */  | 
365  |  |     const opus_int16 x16[],                                      /* I    Input                       */  | 
366  |  |     opus_int8 pulses[],                                          /* O    Quantized pulse signal      */  | 
367  |  |     const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */  | 
368  |  |     const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */  | 
369  |  |     const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */  | 
370  |  |     const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */  | 
371  |  |     const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */  | 
372  |  |     const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */  | 
373  |  |     const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */  | 
374  |  |     const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */  | 
375  |  |     const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */  | 
376  |  |     const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */  | 
377  |  | )  | 
378  | 0  | { | 
379  |  | #ifdef OPUS_CHECK_ASM  | 
380  |  |     silk_nsq_state NSQ_c;  | 
381  |  |     SideInfoIndices psIndices_c;  | 
382  |  |     opus_int8 pulses_c[MAX_FRAME_LENGTH];  | 
383  |  |     const opus_int8 *const pulses_a = pulses;  | 
384  |  |  | 
385  |  |     silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));  | 
386  |  |     silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));  | 
387  |  |     silk_memcpy(pulses_c, pulses, sizeof(pulses_c));  | 
388  |  |     silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,  | 
389  |  |                        pitchL, Lambda_Q10, LTP_scale_Q14);  | 
390  |  | #endif  | 
391  |  | 
  | 
392  | 0  |     if (!verify_assumptions(psEncC))  | 
393  | 0  |     { | 
394  | 0  |         silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);  | 
395  | 0  |         return;  | 
396  | 0  |     }  | 
397  |  |  | 
398  | 0  |     opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;  | 
399  | 0  |     opus_int last_smple_idx, smpl_buf_idx, decisionDelay;  | 
400  | 0  |     const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;  | 
401  | 0  |     opus_int16 *pxq;  | 
402  | 0  |     VARDECL(opus_int32, sLTP_Q15);  | 
403  | 0  |     VARDECL(opus_int16, sLTP);  | 
404  | 0  |     opus_int32 HarmShapeFIRPacked_Q14;  | 
405  | 0  |     opus_int offset_Q10;  | 
406  | 0  |     opus_int32 Gain_Q10;  | 
407  | 0  |     opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];  | 
408  | 0  |     opus_int32 delayedGain_Q10[DECISION_DELAY];  | 
409  | 0  |     NSQ_del_dec_struct psDelDec = {0}; | 
410  | 0  |     NSQ_del_dec_sample_struct *psSample;  | 
411  | 0  |     __m128i RDmin_Q10, MaskDelDec, Winner_selector;  | 
412  | 0  |     SAVE_STACK;  | 
413  |  | 
  | 
414  | 0  |     MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));  | 
415  |  |  | 
416  |  |     /* Set unvoiced lag to the previous one, overwrite later for voiced */  | 
417  | 0  |     lag = NSQ->lagPrev;  | 
418  |  | 
  | 
419  | 0  |     silk_assert(NSQ->prev_gain_Q16 != 0);  | 
420  | 0  |     psDelDec.Seed = _mm_and_si128(  | 
421  | 0  |         _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),  | 
422  | 0  |         _mm_set1_epi32(3));  | 
423  | 0  |     psDelDec.SeedInit = psDelDec.Seed;  | 
424  | 0  |     psDelDec.RD_Q10 = _mm_setzero_si128();  | 
425  | 0  |     psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);  | 
426  | 0  |     psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);  | 
427  | 0  |     psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);  | 
428  | 0  |     for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)  | 
429  | 0  |     { | 
430  | 0  |         psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);  | 
431  | 0  |     }  | 
432  | 0  |     for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)  | 
433  | 0  |     { | 
434  | 0  |         psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);  | 
435  | 0  |     }  | 
436  |  | 
  | 
437  | 0  |     offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];  | 
438  | 0  |     smpl_buf_idx = 0; /* index of oldest samples */  | 
439  |  | 
  | 
440  | 0  |     decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);  | 
441  |  |  | 
442  |  |     /* For voiced frames limit the decision delay to lower than the pitch lag */  | 
443  | 0  |     if (psIndices->signalType == TYPE_VOICED)  | 
444  | 0  |     { | 
445  | 0  |         for (k = 0; k < psEncC->nb_subfr; k++)  | 
446  | 0  |         { | 
447  | 0  |             decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);  | 
448  | 0  |         }  | 
449  | 0  |     }  | 
450  | 0  |     else  | 
451  | 0  |     { | 
452  | 0  |         if (lag > 0)  | 
453  | 0  |         { | 
454  | 0  |             decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);  | 
455  | 0  |         }  | 
456  | 0  |     }  | 
457  |  | 
  | 
458  | 0  |     if (psIndices->NLSFInterpCoef_Q2 == 4)  | 
459  | 0  |     { | 
460  | 0  |         LSF_interpolation_flag = 0;  | 
461  | 0  |     }  | 
462  | 0  |     else  | 
463  | 0  |     { | 
464  | 0  |         LSF_interpolation_flag = 1;  | 
465  | 0  |     }  | 
466  |  | 
  | 
467  | 0  |     ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);  | 
468  | 0  |     ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);  | 
469  |  |     /* Set up pointers to start of sub frame */  | 
470  | 0  |     pxq = &NSQ->xq[psEncC->ltp_mem_length];  | 
471  | 0  |     NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;  | 
472  | 0  |     NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;  | 
473  | 0  |     subfr = 0;  | 
474  | 0  |     for (k = 0; k < psEncC->nb_subfr; k++)  | 
475  | 0  |     { | 
476  | 0  |         A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];  | 
477  | 0  |         B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];  | 
478  | 0  |         AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];  | 
479  |  |  | 
480  |  |         /* Noise shape parameters */  | 
481  | 0  |         silk_assert(HarmShapeGain_Q14[k] >= 0);  | 
482  | 0  |         HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );  | 
483  | 0  |         HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );  | 
484  |  | 
  | 
485  | 0  |         NSQ->rewhite_flag = 0;  | 
486  | 0  |         if (psIndices->signalType == TYPE_VOICED)  | 
487  | 0  |         { | 
488  |  |             /* Voiced */  | 
489  | 0  |             lag = pitchL[k];  | 
490  |  |  | 
491  |  |             /* Re-whitening */  | 
492  | 0  |             if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)  | 
493  | 0  |             { | 
494  | 0  |                 if (k == 2)  | 
495  | 0  |                 { | 
496  |  |                     /* RESET DELAYED DECISIONS */  | 
497  |  |                     /* Find winner */  | 
498  | 0  |                     RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);  | 
499  | 0  |                     Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);  | 
500  | 0  |                     Winner_selector = silk_index_to_selector(Winner_ind);  | 
501  | 0  |                     psDelDec.RD_Q10 = _mm_add_epi32(  | 
502  | 0  |                         psDelDec.RD_Q10,  | 
503  | 0  |                         _mm_blendv_epi8(  | 
504  | 0  |                             _mm_set1_epi32(silk_int32_MAX >> 4),  | 
505  | 0  |                             _mm_setzero_si128(),  | 
506  | 0  |                             _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));  | 
507  |  |  | 
508  |  |                     /* Copy final part of signals from winner state to output and long-term filter states */  | 
509  | 0  |                     last_smple_idx = smpl_buf_idx + decisionDelay;  | 
510  | 0  |                     for (i = 0; i < decisionDelay; i++)  | 
511  | 0  |                     { | 
512  | 0  |                         last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;  | 
513  | 0  |                         psSample = &psDelDec.Samples[last_smple_idx];  | 
514  | 0  |                         pulses[i - decisionDelay] =  | 
515  | 0  |                             (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);  | 
516  | 0  |                         pxq[i - decisionDelay] =  | 
517  | 0  |                             silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));  | 
518  | 0  |                         NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =  | 
519  | 0  |                             silk_select_winner(psSample->Shape_Q14, Winner_selector);  | 
520  | 0  |                     }  | 
521  |  | 
  | 
522  | 0  |                     subfr = 0;  | 
523  | 0  |                 }  | 
524  |  |  | 
525  |  |                 /* Rewhiten with new A coefs */  | 
526  | 0  |                 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;  | 
527  | 0  |                 silk_assert(start_idx > 0);  | 
528  |  | 
  | 
529  | 0  |                 silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],  | 
530  | 0  |                                               A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);  | 
531  |  | 
  | 
532  | 0  |                 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;  | 
533  | 0  |                 NSQ->rewhite_flag = 1;  | 
534  | 0  |             }  | 
535  | 0  |         }  | 
536  |  | 
  | 
537  | 0  |         silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,  | 
538  | 0  |                                            LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);  | 
539  |  | 
  | 
540  | 0  |         silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,  | 
541  | 0  |                                                 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],  | 
542  | 0  |                                                 Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,  | 
543  | 0  |                                                 psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);  | 
544  |  | 
  | 
545  | 0  |         x16 += psEncC->subfr_length;  | 
546  | 0  |         pulses += psEncC->subfr_length;  | 
547  | 0  |         pxq += psEncC->subfr_length;  | 
548  | 0  |     }  | 
549  |  |  | 
550  |  |     /* Find winner */  | 
551  | 0  |     RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);  | 
552  | 0  |     Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));  | 
553  |  |  | 
554  |  |     /* Copy final part of signals from winner state to output and long-term filter states */  | 
555  | 0  |     psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);  | 
556  | 0  |     last_smple_idx = smpl_buf_idx + decisionDelay;  | 
557  | 0  |     Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;  | 
558  | 0  |     for (i = 0; i < decisionDelay; i++)  | 
559  | 0  |     { | 
560  | 0  |         last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;  | 
561  | 0  |         psSample = &psDelDec.Samples[last_smple_idx];  | 
562  |  | 
  | 
563  | 0  |         pulses[i - decisionDelay] =  | 
564  | 0  |             (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);  | 
565  | 0  |         pxq[i - decisionDelay] =  | 
566  | 0  |             silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));  | 
567  | 0  |         NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =  | 
568  | 0  |             silk_select_winner(psSample->Shape_Q14, Winner_selector);  | 
569  | 0  |     }  | 
570  | 0  |     for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)  | 
571  | 0  |     { | 
572  | 0  |         NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);  | 
573  | 0  |     }  | 
574  | 0  |     for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)  | 
575  | 0  |     { | 
576  | 0  |         NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);  | 
577  | 0  |     }  | 
578  |  |  | 
579  |  |     /* Update states */  | 
580  | 0  |     NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);  | 
581  | 0  |     NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);  | 
582  | 0  |     NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];  | 
583  |  |  | 
584  |  |     /* Save quantized speech signal */  | 
585  | 0  |     silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));  | 
586  | 0  |     silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));  | 
587  |  | 
  | 
588  |  | #ifdef OPUS_CHECK_ASM  | 
589  |  |     silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));  | 
590  |  |     silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));  | 
591  |  |     silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));  | 
592  |  | #endif  | 
593  |  | 
  | 
594  | 0  |     RESTORE_STACK;  | 
595  | 0  | }  | 
596  |  |  | 
597  |  | static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order)  | 
598  | 0  | { | 
599  | 0  |     __m256i out;  | 
600  | 0  |     silk_assert(order == 10 || order == 16);  | 
601  |  |  | 
602  |  |     /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */  | 
603  | 0  |     out = _mm256_set1_epi32(order >> 1);  | 
604  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */  | 
605  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */  | 
606  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */  | 
607  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */  | 
608  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */  | 
609  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */  | 
610  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */  | 
611  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */  | 
612  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */  | 
613  | 0  |     out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */  | 
614  |  | 
  | 
615  | 0  |     if (order == 16)  | 
616  | 0  |     { | 
617  | 0  |         out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */  | 
618  | 0  |         out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */  | 
619  | 0  |         out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */  | 
620  | 0  |         out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */  | 
621  | 0  |         out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */  | 
622  | 0  |         out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */  | 
623  | 0  |     }  | 
624  | 0  |     return silk_cvtepi64_epi32_high(out);  | 
625  | 0  | }  | 
626  |  |  | 
627  |  | /******************************************/  | 
628  |  | /* Noise shape quantizer for one subframe */  | 
629  |  | /******************************************/  | 
/*
 * Runs the delayed-decision quantizer over `length` samples of one subframe.
 * The per-state quantities live in the 32-bit lanes of the __m128i members of
 * psDelDec, so all states are advanced together.
 *
 * For every sample the function:
 *   - computes the scalar long-term prediction (voiced frames only) and the
 *     long-term shaping term, both shared by all states;
 *   - computes the per-state short-term prediction and noise-shaping feedback;
 *   - forms two quantization candidates per state with their rate-distortion
 *     cost; the cheaper candidate goes to the low 128 bits of the SS_* vectors
 *     ("1st set") and the other one to the high 128 bits ("2nd set");
 *   - replaces the worst state of the 1st set by the best state of the 2nd set
 *     when the latter has the lower cost;
 *   - when subfr > 0 or i >= decisionDelay, writes the delayed winner's pulse,
 *     output sample and long-term filter states;
 *   - stores the low 128 bits of the SS_* vectors as the new state.
 *
 * pulses[] and xq[] are written at index i - decisionDelay, i.e. at negative
 * offsets for the first decisionDelay samples when subfr > 0.
 *
 * Side effects visible to the caller: NSQ->sLTP_shp_buf_idx and
 * NSQ->sLTP_buf_idx are incremented once per sample, *smpl_buf_idx is stepped
 * backwards (modulo DECISION_DELAY) once per sample, and on exit entries
 * [length, length + NSQ_LPC_BUF_LENGTH) of psDelDec->sLPC_Q14 are copied to
 * the front of that buffer.
 *
 * MaskDelDec is only forwarded to the masked horizontal min/max helpers
 * (defined elsewhere) -- presumably it limits the search to active states.
 */
630  |  | static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(  | 
631  |  |     silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */  | 
632  |  |     NSQ_del_dec_struct *psDelDec,               /* I/O  Delayed decision states            */  | 
633  |  |     opus_int signalType,                        /* I    Signal type                        */  | 
634  |  |     const opus_int32 x_Q10[],                   /* I                                       */  | 
635  |  |     opus_int8 pulses[],                         /* O                                       */  | 
636  |  |     opus_int16 xq[],                            /* O                                       */  | 
637  |  |     opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */  | 
638  |  |     opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */  | 
639  |  |     const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */  | 
640  |  |     const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */  | 
641  |  |     const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */  | 
642  |  |     opus_int lag,                               /* I    Pitch lag                          */  | 
643  |  |     opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */  | 
644  |  |     opus_int Tilt_Q14,                          /* I    Spectral tilt                      */  | 
645  |  |     opus_int32 LF_shp_Q14,                      /* I                                       */  | 
646  |  |     opus_int32 Gain_Q16,                        /* I                                       */  | 
647  |  |     opus_int Lambda_Q10,                        /* I                                       */  | 
648  |  |     opus_int offset_Q10,                        /* I                                       */  | 
649  |  |     opus_int length,                            /* I    Input length                       */  | 
650  |  |     opus_int subfr,                             /* I    Subframe number                    */  | 
651  |  |     opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */  | 
652  |  |     opus_int predictLPCOrder,                   /* I    Prediction filter order            */  | 
653  |  |     opus_int warping_Q16,                       /* I                                       */  | 
654  |  |     __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */  | 
655  |  |     opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */  | 
656  |  |     opus_int decisionDelay                      /* I                                       */  | 
657  |  | )  | 
658  | 0  | { | 
659  | 0  |     int i;  | 
                /* Both lag pointers start `lag` samples behind the current buffer index, */
                /* offset by half the corresponding FIR length; each is advanced by one   */
                /* per sample in the branch that consumes it.                             */
660  | 0  |     opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2];  | 
661  | 0  |     opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2];  | 
                /* Q16 -> Q10; stored per sample in delayedGain_Q10[] for the delayed output scaling */
662  | 0  |     opus_int32 Gain_Q10 = Gain_Q16 >> 6;  | 
663  |  | 
  | 
664  | 0  |     for (i = 0; i < length; i++)  | 
665  | 0  |     { | 
666  |  |         /* Perform common calculations used in all states */  | 
667  |  |         /* NSQ_sample_struct */  | 
668  |  |         /* Low  128 bits => 1st set */  | 
669  |  |         /* High 128 bits => 2nd set */  | 
670  | 0  |         int j;  | 
671  | 0  |         __m256i SS_Q_Q10;  | 
672  | 0  |         __m256i SS_RD_Q10;  | 
673  | 0  |         __m256i SS_xq_Q14;  | 
674  | 0  |         __m256i SS_LF_AR_Q14;  | 
675  | 0  |         __m256i SS_Diff_Q14;  | 
676  | 0  |         __m256i SS_sLTP_shp_Q14;  | 
677  | 0  |         __m256i SS_LPC_exc_Q14;  | 
678  | 0  |         __m256i exc_Q14;  | 
679  | 0  |         __m256i q_Q10, rr_Q10, rd_Q10;  | 
680  | 0  |         __m256i mask;  | 
681  | 0  |         __m128i LPC_pred_Q14, n_AR_Q14;  | 
682  | 0  |         __m128i RDmin_Q10, RDmax_Q10;  | 
683  | 0  |         __m128i n_LF_Q14;  | 
684  | 0  |         __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10;  | 
685  | 0  |         __m128i Winner_rand_state, Winner_selector;  | 
686  | 0  |         __m128i tmp0, tmp1;  | 
687  | 0  |         NSQ_del_dec_sample_struct *psLastSample, *psSample;  | 
688  | 0  |         opus_int32 RDmin_ind, RDmax_ind, last_smple_idx;  | 
689  | 0  |         opus_int32 LTP_pred_Q14, n_LTP_Q14;  | 
690  |  |  | 
691  |  |         /* Long-term prediction */  | 
692  | 0  |         if (signalType == TYPE_VOICED)  | 
693  | 0  |         { | 
694  |  |             /* Unrolled loop */  | 
695  |  |             /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */  | 
696  | 0  |             LTP_pred_Q14 = 2;  | 
697  | 0  |             LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]);  | 
698  | 0  |             LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]);  | 
699  | 0  |             LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]);  | 
700  | 0  |             LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]);  | 
701  | 0  |             LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]);  | 
702  | 0  |             LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */  | 
703  | 0  |             pred_lag_ptr++;  | 
704  | 0  |         }  | 
705  | 0  |         else  | 
706  | 0  |         { | 
707  | 0  |             LTP_pred_Q14 = 0;  | 
708  | 0  |         }  | 
709  |  |  | 
710  |  |         /* Long-term shaping */  | 
711  | 0  |         if (lag > 0)  | 
712  | 0  |         { | 
713  |  |             /* Symmetric, packed FIR coefficients */  | 
714  | 0  |             n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]);  | 
715  | 0  |             n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14);  | 
716  | 0  |             n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14);  | 
717  | 0  |             n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */  | 
718  | 0  |             shp_lag_ptr++;  | 
719  | 0  |         }  | 
720  | 0  |         else  | 
721  | 0  |         { | 
722  | 0  |             n_LTP_Q14 = 0;  | 
723  | 0  |         }  | 
724  |  |  | 
725  |  |         /* BEGIN Updating Delayed Decision States */  | 
726  |  |  | 
727  |  |         /* Generate dither */  | 
728  | 0  |         psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed);  | 
729  |  |  | 
730  |  |         /* Short-term prediction */  | 
731  | 0  |         LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder);  | 
732  | 0  |         LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */  | 
733  |  |  | 
734  |  |         /* Noise shape feedback */  | 
735  | 0  |         silk_assert(shapingLPCOrder > 0);  | 
736  | 0  |         silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */  | 
737  |  |         /* Output of lowpass section */  | 
738  | 0  |         tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16));  | 
739  | 0  |         n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1);  | 
740  | 0  |         for (j = 0; j < shapingLPCOrder - 1; j++)  | 
741  | 0  |         { | 
742  |  |             /* Output of allpass section */  | 
743  | 0  |             tmp1 = psDelDec->sAR2_Q14[j];  | 
744  | 0  |             psDelDec->sAR2_Q14[j] = tmp0;  | 
745  | 0  |             n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j]));  | 
746  | 0  |             tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16));  | 
747  | 0  |         }  | 
748  | 0  |         psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0;  | 
749  | 0  |         n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1]));  | 
750  |  | 
  | 
751  | 0  |         n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1);                                                  /* Q11 -> Q12 */  | 
752  | 0  |         n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */  | 
753  | 0  |         n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2);                                                  /* Q12 -> Q14 */  | 
754  |  | 
  | 
755  | 0  |         tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */  | 
756  | 0  |         tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16);                  /* Q12 */  | 
757  | 0  |         n_LF_Q14 = _mm_add_epi32(tmp0, tmp1);                                                /* Q12 */  | 
758  | 0  |         n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2);                                              /* Q12 -> Q14 */  | 
759  |  |  | 
760  |  |         /* Input minus prediction plus noise feedback                       */  | 
761  |  |         /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */  | 
762  | 0  |         tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14);              /* Q14 */  | 
763  | 0  |         tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q14 */  | 
764  | 0  |         tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0);                      /* Q14 */  | 
765  | 0  |         tmp0 = silk_mm_srai_round_epi32(tmp0, 4);                      /* Q10 */  | 
766  |  | 
  | 
767  | 0  |         r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */  | 
768  |  |  | 
769  |  |         /* Flip sign depending on dither */  | 
770  | 0  |         r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed);  | 
771  | 0  |         r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10);  | 
772  |  |  | 
773  |  |         /* Find two quantization level candidates and measure their rate-distortion */  | 
774  | 0  |         q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10));  | 
775  | 0  |         q1_Q0 = _mm_srai_epi32(q1_Q10, 10);  | 
776  | 0  |         if (Lambda_Q10 > 2048)  | 
777  | 0  |         { | 
778  |  |             /* For aggressive RDO, the bias becomes more than one pulse. */  | 
779  | 0  |             tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */  | 
780  | 0  |             q1_Q0 = _mm_srai_epi32(q1_Q10, 31);  | 
781  | 0  |             tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128());  | 
782  | 0  |             tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10);  | 
783  | 0  |             q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1);  | 
784  | 0  |         }  | 
785  |  | 
  | 
786  | 0  |         tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0);  | 
787  | 0  |         q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0);  | 
788  | 0  |         q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10));  | 
789  |  |  | 
790  |  |         /* check if q1_Q0 is 0 or -1 */  | 
791  | 0  |         tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0);  | 
792  | 0  |         tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128());  | 
793  | 0  |         tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1);  | 
794  | 0  |         q2_Q10 = _mm_add_epi32(q1_Q10, tmp0);  | 
                    /* Pack both candidates into one vector: q1 in the low 128 bits, q2 in the high 128 bits */
795  | 0  |         q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10);  | 
796  |  | 
  | 
797  | 0  |         rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10);  | 
798  | 0  |         rd_Q10 = _mm256_abs_epi32(q_Q10);  | 
799  | 0  |         rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10);  | 
800  | 0  |         rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10));  | 
801  | 0  |         rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10);  | 
802  | 0  |         rd_Q10 = _mm256_srai_epi32(rd_Q10, 10);  | 
803  |  | 
  | 
                    /* Per lane, mask is all-ones where q1 is cheaper than q2. Blending rd_Q10 with  */
                    /* its half-swapped copy then puts the cheaper candidate in the low 128 bits and */
                    /* the other one in the high 128 bits; the same mask orders SS_Q_Q10.            */
804  | 0  |         mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1)));  | 
805  | 0  |         SS_RD_Q10 = _mm256_add_epi32(  | 
806  | 0  |             _mm256_broadcastsi128_si256(psDelDec->RD_Q10),  | 
807  | 0  |             _mm256_blendv_epi8(  | 
808  | 0  |                 _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1),  | 
809  | 0  |                 rd_Q10,  | 
810  | 0  |                 mask));  | 
811  | 0  |         SS_Q_Q10 = _mm256_blendv_epi8(  | 
812  | 0  |             _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1),  | 
813  | 0  |             q_Q10,  | 
814  | 0  |             mask);  | 
815  |  |  | 
816  |  |         /* Update states for best and second best quantization */  | 
817  |  |  | 
818  |  |         /* Quantized excitation */  | 
819  | 0  |         exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed));  | 
820  |  |  | 
821  |  |         /* Add predictions */  | 
822  | 0  |         exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14));  | 
823  | 0  |         SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1);  | 
824  | 0  |         SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14));  | 
825  |  |  | 
826  |  |         /* Update states */  | 
827  | 0  |         SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4)));  | 
828  | 0  |         SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14));  | 
829  | 0  |         SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14));  | 
830  |  |  | 
831  |  |         /* END Updating Delayed Decision States */  | 
832  |  | 
  | 
                    /* The sample ring buffer is filled backwards: step the newest-sample index down */
                    /* by one (mod DECISION_DELAY); the entry decisionDelay slots ahead of it is the */
                    /* one written out further below.                                                */
833  | 0  |         *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY;  | 
834  | 0  |         last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY;  | 
835  | 0  |         psLastSample = &psDelDec->Samples[last_smple_idx];  | 
836  |  |  | 
837  |  |         /* Find winner */  | 
838  | 0  |         RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec);  | 
839  | 0  |         Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10)));  | 
840  |  |  | 
841  |  |         /* Increase RD values of expired states */  | 
842  | 0  |         Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector);  | 
843  |  | 
  | 
                    /* Lanes whose delayed RandState differs from the winner's get silk_int32_MAX >> 4 */
                    /* added to their RD in both halves; matching lanes keep their RD unchanged.       */
844  | 0  |         SS_RD_Q10 = _mm256_blendv_epi8(  | 
845  | 0  |             _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)),  | 
846  | 0  |             SS_RD_Q10,  | 
847  | 0  |             _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state)));  | 
848  |  |  | 
849  |  |         /* find worst in first set */  | 
850  | 0  |         RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec);  | 
851  |  |         /* find best in second set */  | 
852  | 0  |         RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec);  | 
853  |  |  | 
854  |  |         /* Replace a state if best from second set outperforms worst in first set */  | 
855  | 0  |         tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10);  | 
856  | 0  |         if (!_mm_test_all_zeros(tmp0, tmp0))  | 
857  | 0  |         { | 
858  | 0  |             int t;  | 
859  | 0  |             RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0));  | 
860  | 0  |             RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1));  | 
                        /* tmp1: all-ones in 32-bit lane RDmax_ind, zero elsewhere (byte RDmax_ind set to 0xFF, */
                        /* then each byte sign-extended to a lane).                                             */
                        /* tmp0: byte-shuffle control that is the identity everywhere except lane RDmax_ind,    */
                        /* which takes the selector for RDmin_ind -- so the shuffles below overwrite the state  */
                        /* in lane RDmax_ind with the one from lane RDmin_ind (assuming silk_index_to_selector, */
                        /* defined elsewhere, yields the byte indices of that lane).                            */
861  | 0  |             tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3)));  | 
862  | 0  |             tmp0 = _mm_blendv_epi8(  | 
863  | 0  |                 _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0),  | 
864  | 0  |                 silk_index_to_selector(RDmin_ind),  | 
865  | 0  |                 tmp1);  | 
866  | 0  |             for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++)  | 
867  | 0  |             { | 
868  | 0  |                 psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0);  | 
869  | 0  |             }  | 
870  | 0  |             psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0);  | 
871  | 0  |             psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0);  | 
872  | 0  |             for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++)  | 
873  | 0  |             { | 
874  | 0  |                 psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0);  | 
875  | 0  |             }  | 
876  | 0  |             for (t = 0; t < DECISION_DELAY; t++)  | 
877  | 0  |             { | 
878  | 0  |                 psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0);  | 
879  | 0  |                 psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0);  | 
880  | 0  |                 psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0);  | 
881  | 0  |                 psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0);  | 
882  | 0  |                 psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0);  | 
883  | 0  |             }  | 
                        /* For the freshly computed SS_* vectors the replacement comes from the 2nd set:    */
                        /* element RDmin_ind + 4 is moved into lane RDmax_ind, lanes 0..3 otherwise stay.   */
                        /* Only the low 128 bits of the permuted vectors are read after this block.         */
884  | 0  |             mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1));  | 
885  | 0  |             SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask);  | 
886  | 0  |             SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask);  | 
887  | 0  |             SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask);  | 
888  | 0  |             SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask);  | 
889  | 0  |             SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask);  | 
890  | 0  |             SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask);  | 
891  | 0  |             SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask);  | 
892  | 0  |         }  | 
893  |  |  | 
894  |  |         /* Write samples from winner to output and long-term filter states */  | 
                    /* Skipped only for the first decisionDelay samples of subframe 0; for later subframes */
                    /* the index i - decisionDelay may be negative, i.e. it addresses the previous output. */
895  | 0  |         if (subfr > 0 || i >= decisionDelay)  | 
896  | 0  |         { | 
897  | 0  |             pulses[i - decisionDelay] =  | 
898  | 0  |                 (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10);  | 
899  | 0  |             xq[i - decisionDelay] =  | 
900  | 0  |                 silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8));  | 
901  | 0  |             NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] =  | 
902  | 0  |                 silk_select_winner(psLastSample->Shape_Q14, Winner_selector);  | 
903  | 0  |             sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] =  | 
904  | 0  |                 silk_select_winner(psLastSample->Pred_Q15, Winner_selector);  | 
905  | 0  |         }  | 
906  | 0  |         NSQ->sLTP_shp_buf_idx++;  | 
907  | 0  |         NSQ->sLTP_buf_idx++;  | 
908  |  |  | 
909  |  |         /* Update states */  | 
                    /* Only the low 128 bits (1st set) of the SS_* vectors are kept as the new state */
910  | 0  |         psSample = &psDelDec->Samples[*smpl_buf_idx];  | 
911  | 0  |         psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10));  | 
912  | 0  |         psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14);  | 
913  | 0  |         psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14);  | 
914  | 0  |         psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14);  | 
915  | 0  |         psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10);  | 
916  | 0  |         psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14);  | 
917  | 0  |         psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10);  | 
918  | 0  |         psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14);  | 
919  | 0  |         psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14);  | 
920  | 0  |         psSample->RandState = psDelDec->Seed;  | 
921  | 0  |         delayedGain_Q10[*smpl_buf_idx] = Gain_Q10;  | 
922  | 0  |     }  | 
923  |  |     /* Update LPC states */  | 
                /* Copy entries [length, length + NSQ_LPC_BUF_LENGTH) to the front of sLPC_Q14 */
924  | 0  |     for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)  | 
925  | 0  |     { | 
926  | 0  |         psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i];  | 
927  | 0  |     }  | 
928  | 0  | }  | 
929  |  |  | 
930  |  | static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(  | 
931  |  |     const silk_encoder_state *psEncC,          /* I    Encoder State                   */  | 
932  |  |     silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */  | 
933  |  |     NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */  | 
934  |  |     const opus_int16 x16[],                    /* I    Input                           */  | 
935  |  |     opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */  | 
936  |  |     const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */  | 
937  |  |     opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */  | 
938  |  |     opus_int subfr,                            /* I    Subframe number                 */  | 
939  |  |     const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */  | 
940  |  |     const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */  | 
941  |  |     const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */  | 
942  |  |     const opus_int signal_type,                /* I    Signal type                     */  | 
943  |  |     const opus_int decisionDelay               /* I    Decision delay                  */  | 
944  |  | )  | 
945  | 0  | { | 
946  | 0  |     int i;  | 
947  | 0  |     opus_int lag;  | 
948  | 0  |     opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;  | 
949  | 0  |     NSQ_del_dec_sample_struct *psSample;  | 
950  |  | 
  | 
951  | 0  |     lag = pitchL[subfr];  | 
952  | 0  |     inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);  | 
953  | 0  |     silk_assert(inv_gain_Q31 != 0);  | 
954  |  |  | 
955  |  |     /* Scale input */  | 
956  | 0  |     inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);  | 
957  | 0  |     for (i = 0; i < psEncC->subfr_length; i+=4)  | 
958  | 0  |     { | 
959  | 0  |         __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));  | 
960  | 0  |         x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);  | 
961  | 0  |         _mm_storeu_si128((__m128i*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));  | 
962  | 0  |     }  | 
963  |  |  | 
964  |  |     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */  | 
965  | 0  |     if (NSQ->rewhite_flag)  | 
966  | 0  |     { | 
967  | 0  |         if (subfr == 0)  | 
968  | 0  |         { | 
969  |  |             /* Do LTP downscaling */  | 
970  | 0  |             inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);  | 
971  | 0  |         }  | 
972  | 0  |         for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)  | 
973  | 0  |         { | 
974  | 0  |             silk_assert(i < MAX_FRAME_LENGTH);  | 
975  | 0  |             sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);  | 
976  | 0  |         }  | 
977  | 0  |     }  | 
978  |  |  | 
979  |  |     /* Adjust for changing gain */  | 
980  | 0  |     if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)  | 
981  | 0  |     { | 
982  | 0  |         gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);  | 
983  |  |  | 
984  |  |         /* Scale long-term shaping state */  | 
985  | 0  |         for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)  | 
986  | 0  |         { | 
987  | 0  |       opus_int32 *p = &NSQ->sLTP_shp_Q14[i];  | 
988  | 0  |             _mm_storeu_si128((__m128i*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)p), gain_adj_Q16));  | 
989  | 0  |         }  | 
990  |  |  | 
991  |  |         /* Scale long-term prediction state */  | 
992  | 0  |         if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)  | 
993  | 0  |         { | 
994  | 0  |             for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)  | 
995  | 0  |             { | 
996  | 0  |                 sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;  | 
997  | 0  |             }  | 
998  | 0  |         }  | 
999  |  |  | 
1000  |  |         /* Scale scalar states */  | 
1001  | 0  |         psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16);  | 
1002  | 0  |         psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16);  | 
1003  |  |  | 
1004  |  |         /* Scale short-term prediction and shaping states */  | 
1005  | 0  |         for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)  | 
1006  | 0  |         { | 
1007  | 0  |             psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);  | 
1008  | 0  |         }  | 
1009  | 0  |         for (i = 0; i < DECISION_DELAY; i++)  | 
1010  | 0  |         { | 
1011  | 0  |             psSample = &psDelDec->Samples[i];  | 
1012  | 0  |             psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);  | 
1013  | 0  |             psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);  | 
1014  | 0  |         }  | 
1015  | 0  |         for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)  | 
1016  | 0  |         { | 
1017  | 0  |             psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);  | 
1018  | 0  |         }  | 
1019  |  |  | 
1020  |  |         /* Save inverse gain */  | 
1021  | 0  |         NSQ->prev_gain_Q16 = Gains_Q16[subfr];  | 
1022  | 0  |     }  | 
1023  | 0  | }  | 
1024  |  |  | 
1025  |  | static OPUS_INLINE void silk_LPC_analysis_filter_avx2(  | 
1026  |  |     opus_int16                  *out,               /* O    Output signal                           */  | 
1027  |  |     const opus_int16            *in,                /* I    Input signal                            */  | 
1028  |  |     const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */  | 
1029  |  |     const opus_int32            len,                /* I    Signal length                           */  | 
1030  |  |     const opus_int32            order               /* I    Filter order                            */  | 
1031  |  | )  | 
1032  | 0  | { | 
1033  | 0  |     int i;  | 
1034  | 0  |     opus_int32       out32_Q12, out32;  | 
1035  | 0  |     silk_assert(order == 10 || order == 16);  | 
1036  |  | 
  | 
1037  | 0  |     for(i = order; i < len; i++ )  | 
1038  | 0  |     { | 
1039  | 0  |         const opus_int16 *in_ptr = &in[ i ];  | 
1040  |  |         /* Allowing wrap around so that two wraps can cancel each other. The rare  | 
1041  |  |            cases where the result wraps around can only be triggered by invalid streams*/  | 
1042  |  | 
  | 
1043  | 0  |         __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-8]));  | 
1044  | 0  |         __m256i B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&      B[0]));  | 
1045  | 0  |         __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));  | 
1046  | 0  |         if (order > 10)  | 
1047  | 0  |         { | 
1048  | 0  |             in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-16]));  | 
1049  | 0  |             B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&B       [8]));  | 
1050  | 0  |             B_v  = silk_mm256_reverse_epi32(B_v);  | 
1051  | 0  |         }  | 
1052  | 0  |         else  | 
1053  | 0  |         { | 
1054  | 0  |             in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10]));  | 
1055  | 0  |             B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B       [8]));  | 
1056  | 0  |             B_v  = _mm256_shuffle_epi32(B_v, 0x01);  | 
1057  | 0  |         }  | 
1058  | 0  |         sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v));  | 
1059  |  | 
  | 
1060  | 0  |         out32_Q12 = silk_mm256_hsum_epi32(sum);  | 
1061  |  |  | 
1062  |  |         /* Subtract prediction */  | 
1063  | 0  |         out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 );  | 
1064  |  |  | 
1065  |  |         /* Scale to Q0 */  | 
1066  | 0  |         out32 = silk_sar_round_32(out32_Q12, 12);  | 
1067  |  |  | 
1068  |  |         /* Saturate output */  | 
1069  | 0  |         out[ i ] = silk_sat16(out32);  | 
1070  | 0  |     }  | 
1071  |  |  | 
1072  |  |     /* Set first d output samples to zero */  | 
1073  | 0  |     silk_memset( out, 0, order * sizeof( opus_int16 ) );  | 
1074  | 0  | }  |