Coverage Report

Created: 2024-03-26 07:25

/src/opus/silk/x86/NSQ_del_dec_avx2.c
Line
Count
Source (jump to first uncovered line)
1
/***********************************************************************
2
Copyright (c) 2021 Google Inc.
3
Redistribution and use in source and binary forms, with or without
4
modification, are permitted provided that the following conditions
5
are met:
6
- Redistributions of source code must retain the above copyright notice,
7
this list of conditions and the following disclaimer.
8
- Redistributions in binary form must reproduce the above copyright
9
notice, this list of conditions and the following disclaimer in the
10
documentation and/or other materials provided with the distribution.
11
- Neither the name of Internet Society, IETF or IETF Trust, nor the
12
names of specific contributors, may be used to endorse or promote
13
products derived from this software without specific prior written
14
permission.
15
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
POSSIBILITY OF SUCH DAMAGE.
26
***********************************************************************/
27
28
#ifdef HAVE_CONFIG_H
29
#include "config.h"
30
#endif
31
32
#ifdef OPUS_CHECK_ASM
33
#include <string.h>
34
#endif
35
36
#include "opus_defines.h"
37
#include <immintrin.h>
38
39
#include "main.h"
40
#include "stack_alloc.h"
41
#include "NSQ.h"
42
#include "celt/x86/x86cpu.h"
43
44
/* Decides whether the AVX2 code path may be used for this encoder state.  */
/* The silk_assert() calls record the invariants the vectorised code was   */
/* written against; the return value is non-zero only when                  */
/* nStatesDelayedDecision is 3 or 4. silk_NSQ_del_dec_avx2() falls back to */
/* silk_NSQ_del_dec_c() when this returns zero.                             */
45
static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
46
12.8M
{
47
    /* This optimization is based on these assumptions.          */
48
    /* These assumptions are fundamental and hence asserts are   */
49
    /* used. Should any assert trigger, we have to re-visit      */
50
    /* all related code to make sure it still functions the      */
51
    /* same as the C implementation.                             */
52
12.8M
    silk_assert(MAX_DEL_DEC_STATES  <= 4      &&
53
12.8M
                MAX_FRAME_LENGTH     % 4 == 0 &&
54
12.8M
                MAX_SUB_FRAME_LENGTH % 4 == 0 &&
55
12.8M
                LTP_MEM_LENGTH_MS    % 4 == 0 );
56
12.8M
    silk_assert(psEncC->fs_kHz ==  8 ||
57
12.8M
                psEncC->fs_kHz == 12 ||
58
12.8M
                psEncC->fs_kHz == 16 );
59
12.8M
    silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR &&
60
12.8M
                psEncC->nb_subfr > 0             );
61
12.8M
    silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES &&
62
12.8M
                psEncC->nStatesDelayedDecision > 0                   );
63
12.8M
    silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS);
64
65
    /* Regressions were observed on certain AMD Zen CPUs when      */
66
    /* nStatesDelayedDecision is 1 or 2. Ideally we should detect  */
67
    /* these CPUs and enable this optimization on others; however, */
68
    /* there is no good way to do so under current OPUS framework. */
69
12.8M
    return psEncC->nStatesDelayedDecision == 3 ||
70
12.8M
           psEncC->nStatesDelayedDecision == 4;
71
12.8M
}
72
73
/* Intrinsics not defined on MSVC */
74
#ifdef _MSC_VER
75
#include <Intsafe.h>
76
static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
77
{
78
    *res = a+b;
79
    return (*res ^ a) & (*res ^ b) & 0x80000000;
80
}
81
static inline int __builtin_ctz(unsigned int x)
82
{
83
    DWORD res = 0;
84
    return _BitScanForward(&res, x) ? res : 32;
85
}
86
#endif
87
88
/* Returns the upper 32 bits of each of the four 64-bit lanes of num,     */
/* packed into a 128-bit vector: the permute gathers 32-bit elements      */
/* 1, 3, 5 and 7 into the low half, which the cast then extracts.         */
static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num)
89
45.5G
{
90
45.5G
    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1)));
91
45.5G
}
92
93
/* Clamps num to the range [silk_int16_MIN, silk_int16_MAX] and returns   */
/* it as a 16-bit value.                                                  */
static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num)
94
942M
{
95
942M
    num = num > silk_int16_MAX ? silk_int16_MAX : num;
96
942M
    num = num < silk_int16_MIN ? silk_int16_MIN : num;
97
942M
    return num;
98
942M
}
99
100
/* Right shift with rounding: adds 1 << (bits - 1) to a and then shifts   */
/* right by bits. bits is asserted to be in the range 1..30.              */
static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits)
101
961M
{
102
961M
    silk_assert(bits > 0 && bits < 31);
103
961M
    a += 1 << (bits-1);
104
961M
    return a >> bits;
105
961M
}
106
107
/* Rounded right shift of the product (a * b) >> 16.                       */
/* With OPUS_CHECK_ASM the reference macros silk_SMULWW and                */
/* silk_RSHIFT_ROUND are used; otherwise the full 64-bit product is        */
/* formed, a rounding term is added and the result is shifted right by     */
/* bits + 16 in one step.                                                  */
/* NOTE(review): the assert allows bits up to 62, but the non-CHECK_ASM    */
/* branch shifts a 64-bit value by bits + 16; the callers in this file     */
/* pass 8 and 14.                                                          */
static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits)
108
927M
{
109
927M
    silk_assert(bits > 0 && bits < 63);
110
#ifdef OPUS_CHECK_ASM
111
896M
    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
112
#else
113
    /* This code is more correct, but it won't overflow like the C code in some rare cases. */
114
31.5M
    /* NOTE(review): repeats the assert at the top of the function. */
    silk_assert(bits > 0 && bits < 63);
115
31.5M
    opus_int64 t = ((opus_int64)a) * ((opus_int64)b);
116
31.5M
    /* Fold the ">> 16" of the fixed-point multiply into the final shift. */
    bits += 16;
117
31.5M
    t += 1ull << (bits-1);
118
31.5M
    return t >> bits;
119
#endif
120
31.5M
}
NSQ_del_dec_avx2.c:silk_sar_round_smulww
Line
Count
Source
108
896M
{
109
896M
    silk_assert(bits > 0 && bits < 63);
110
896M
#ifdef OPUS_CHECK_ASM
111
896M
    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
112
#else
113
    /* This code is more correct, but it won't overflow like the C code in some rare cases. */
114
    silk_assert(bits > 0 && bits < 63);
115
    opus_int64 t = ((opus_int64)a) * ((opus_int64)b);
116
    bits += 16;
117
    t += 1ull << (bits-1);
118
    return t >> bits;
119
#endif
120
896M
}
NSQ_del_dec_avx2.c:silk_sar_round_smulww
Line
Count
Source
108
31.5M
{
109
31.5M
    silk_assert(bits > 0 && bits < 63);
110
#ifdef OPUS_CHECK_ASM
111
    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
112
#else
113
    /* This code is more correct, but it won't overflow like the C code in some rare cases. */
114
31.5M
    silk_assert(bits > 0 && bits < 63);
115
31.5M
    opus_int64 t = ((opus_int64)a) * ((opus_int64)b);
116
31.5M
    bits += 16;
117
31.5M
    t += 1ull << (bits-1);
118
31.5M
    return t >> bits;
119
31.5M
#endif
120
31.5M
}
121
122
/* Saturating 32-bit addition: returns a + b, or silk_int32_MAX /         */
/* silk_int32_MIN when the sum overflows.                                 */
static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b)
123
59.9M
{
124
59.9M
    opus_int32 sum;
125
59.9M
    if (__builtin_sadd_overflow(a, b, &sum))
126
7.03k
    {
127
7.03k
        /* Overflow implies a and b share a sign, so the sign of a picks the bound. */
        return a >= 0 ? silk_int32_MAX : silk_int32_MIN;
128
7.03k
    }
129
59.9M
    return sum;
130
59.9M
}
131
132
/* Per-lane rounded arithmetic right shift of four 32-bit values:         */
/* adds 1 << (bits - 1) to every lane, then shifts right by bits.         */
/* bits is asserted to be in the range 1..30.                             */
static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits)
133
1.85G
{
134
1.85G
    silk_assert(bits > 0 && bits < 31);
135
1.85G
    return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits);
136
1.85G
}
137
138
/* add/subtract with output saturated */
/* Per-lane saturating 32-bit addition of a and b.                        */
/* A lane overflowed when the wrapped sum's sign differs from the sign of */
/* both inputs; such lanes are replaced by 0x7FFFFFFF when a >= 0 and by  */
/* 0x80000000 when a < 0.                                                 */
139
static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b)
140
927M
{
141
927M
    __m128i r = _mm_add_epi32(a, b);
142
927M
    __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r));           /* OF = (sum ^ a) & (sum ^ b)   */
143
927M
    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */
144
927M
    /* Broadcast the overflow sign bit across the lane so the byte blend picks whole lanes. */
    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
145
927M
}
146
/* Per-lane saturating 32-bit subtraction a - b.                          */
/* A lane overflowed when the wrapped difference's sign differs from a    */
/* but matches b; such lanes are replaced by 0x7FFFFFFF when a >= 0 and   */
/* by 0x80000000 when a < 0.                                              */
static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b)
147
927M
{
148
927M
    __m128i r = _mm_sub_epi32(a, b);
149
927M
    __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
150
927M
    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
151
927M
    /* Broadcast the overflow sign bit across the lane so the byte blend picks whole lanes. */
    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
152
927M
}
153
/* 256-bit (eight lane) variant of silk_mm_sub_sat_epi32(): per-lane      */
/* saturating 32-bit subtraction a - b using the same overflow test and   */
/* saturation value.                                                      */
static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b)
154
927M
{
155
927M
    __m256i r = _mm256_sub_epi32(a, b);
156
927M
    __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
157
927M
    __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
158
927M
    /* Broadcast the overflow sign bit across the lane so the byte blend picks whole lanes. */
    return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31));
159
927M
}
160
161
/* Clamps every 32-bit lane of num to the closed range spanned by limit1  */
/* and limit2. The limits may be given in either order.                   */
static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2)
162
927M
{
163
927M
    opus_int32 lo = limit1 < limit2 ? limit1 : limit2;
164
927M
    opus_int32 hi = limit1 > limit2 ? limit1 : limit2;
165
166
927M
    num = _mm_min_epi32(num, _mm_set1_epi32(hi));
167
927M
    num = _mm_max_epi32(num, _mm_set1_epi32(lo));
168
927M
    return num;
169
927M
}
170
171
/* cond < 0 ? -num : num */
/* Per 32-bit lane. OR-ing cond with 1 guarantees the control value is    */
/* never zero, so _mm_sign_epi32 never zeroes a lane.                     */
172
static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond)
173
932M
{
174
932M
    return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1)));
175
932M
}
176
/* 256-bit variant of silk_mm_sign_epi32(): per 32-bit lane,              */
/* cond < 0 ? -num : num. OR-ing cond with 1 keeps the control value      */
/* non-zero so no lane is zeroed.                                         */
static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond)
177
927M
{
178
927M
    return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1)));
179
927M
}
180
181
/* (a32 * b32) >> 16 */
/* Applied to each of the four 32-bit lanes of a with the scalar b.       */
/* The lanes are widened to 64 bits, multiplied, shifted left by 16 and   */
/* the upper 32 bits of each 64-bit product are returned, which equals    */
/* bits 16..47 of the full product.                                       */
182
static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b)
183
317M
{
184
317M
    return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16));
185
317M
}
186
187
/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */
/* Applied to each of the four 32-bit lanes of a with the scalar b.       */
/* b is shifted left by 16 first, so only its low 16 bits take part and   */
/* the upper 32 bits of each 64-bit product already hold the result.      */
188
static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b)
189
44.0G
{
190
44.0G
    return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(silk_LSHIFT(b, 16))));
191
44.0G
}
192
193
/* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */
/* Applied to each of the eight 32-bit lanes: the low 16 bits of a and b  */
/* are multiplied as signed values and the full 32-bit product returned.  */
194
static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b)
195
1.85G
{
196
1.85G
    const char FF = (char)0xFF;
197
1.85G
    /* Shuffle control: per 128-bit half, gather the low 16-bit word of   */
    /* every 32-bit lane (bytes 0-1, 4-5, 8-9, 12-13) into the low 64     */
    /* bits; an index of 0xFF zeroes the remaining bytes.                 */
    __m256i msk = _mm256_set_epi8(
198
1.85G
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0,
199
1.85G
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0);
200
1.85G
    /* Low and high 16-bit halves of every 16x16 product. */
    __m256i lo = _mm256_mullo_epi16(a, b);
201
1.85G
    __m256i hi = _mm256_mulhi_epi16(a, b);
202
1.85G
    lo = _mm256_shuffle_epi8(lo, msk);
203
1.85G
    hi = _mm256_shuffle_epi8(hi, msk);
204
1.85G
    /* Interleave the halves back into 32-bit products. */
    return _mm256_unpacklo_epi16(lo, hi);
205
1.85G
}
206
207
/* Reverses the order of the eight 32-bit elements of v.                  */
static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v)
208
18.1M
{
209
18.1M
    /* 0x1B reverses the four elements inside each 128-bit half. */
    v = _mm256_shuffle_epi32(v, 0x1B);
210
18.1M
    /* 0x4E swaps the two 128-bit halves. */
    v = _mm256_permute4x64_epi64(v, 0x4E);
211
18.1M
    return v;
212
18.1M
}
213
214
/* Returns the (wrapping) sum of the eight 32-bit elements of v.          */
static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v)
215
15.0M
{
216
15.0M
    /* Fold 8 -> 4 lanes by adding the upper and lower 128-bit halves. */
    __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0));
217
15.0M
    /* Fold 4 -> 2 -> 1: after these two steps every lane holds the total. */
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E));
218
15.0M
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));
219
15.0M
    return _mm_cvtsi128_si32(sum);
220
15.0M
}
221
222
/* Horizontal signed minimum: returns a vector in which every 32-bit lane */
/* holds the smallest of the four input lanes.                            */
static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num)
223
1.86G
{
224
1.86G
    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
225
1.86G
    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
226
1.86G
    return num;
227
1.86G
}
228
229
/* Horizontal signed maximum: returns a vector in which every 32-bit lane */
/* holds the largest of the four input lanes.                             */
static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num)
230
927M
{
231
927M
    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
232
927M
    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
233
927M
    return num;
234
927M
}
235
236
/* Horizontal minimum that ignores masked lanes: lanes selected by mask   */
/* are first replaced with silk_int32_MAX so they cannot be the minimum   */
/* unless every remaining lane is also silk_int32_MAX.                    */
static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask)
237
1.86G
{
238
1.86G
    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask);
239
1.86G
    return silk_mm_hmin_epi32(num);
240
1.86G
}
241
242
/* Horizontal maximum that ignores masked lanes: lanes selected by mask   */
/* are first replaced with silk_int32_MIN so they cannot be the maximum   */
/* unless every remaining lane is also silk_int32_MIN.                    */
static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask)
243
927M
{
244
927M
    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask);
245
927M
    return silk_mm_hmax_epi32(num);
246
927M
}
247
248
/* Advances four 32-bit random seeds in parallel:                         */
/* seed = seed * RAND_MULTIPLIER + RAND_INCREMENT, per lane (wrapping).   */
/* Note: despite the "mm256" in its name, this helper takes and returns   */
/* a 128-bit vector.                                                      */
static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed)
249
927M
{
250
927M
    seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER));
251
927M
    seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT));
252
927M
    return seed;
253
927M
}
254
255
/* Returns the index (0..3) of the lowest 32-bit lane in which a and b    */
/* are equal. At least one lane must match (asserted).                    */
static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b)
256
1.41G
{
257
1.41G
    /* movemask yields one bit per byte; 0x1111 keeps one bit per 32-bit lane. */
    unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111;
258
1.41G
    silk_assert(mask != 0);
259
1.41G
    /* The kept bits sit at positions 0, 4, 8 and 12, so dividing by 4 gives the lane. */
    return __builtin_ctz(mask) >> 2;
260
1.41G
}
261
262
/* Builds a byte-shuffle control vector for _mm_shuffle_epi8 that copies  */
/* the four bytes of 32-bit lane `index` into every lane of the result.   */
/* index is asserted to be below 4.                                       */
static __m128i silk_index_to_selector(opus_int32 index)
263
1.17G
{
264
1.17G
    silk_assert(index < 4);
265
1.17G
    /* Convert the lane index into the byte offset of that lane. */
    index <<= 2;
266
1.17G
    return _mm_set_epi8(
267
1.17G
        index + 3, index + 2, index + 1, index + 0,
268
1.17G
        index + 3, index + 2, index + 1, index + 0,
269
1.17G
        index + 3, index + 2, index + 1, index + 0,
270
1.17G
        index + 3, index + 2, index + 1, index + 0);
271
1.17G
}
272
273
/* Extracts one 32-bit lane of num as a scalar. selector is a byte        */
/* shuffle control as produced by silk_index_to_selector(); the shuffle   */
/* moves the chosen lane into lane 0, which is then returned.             */
static opus_int32 silk_select_winner(__m128i num, __m128i selector)
274
3.72G
{
275
3.72G
    return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector));
276
3.72G
}
277
278
/* One slot of the delayed-decision sample buffer. Every member is a      */
/* 128-bit vector holding one 32-bit value per delayed-decision state     */
/* (up to four states, one per lane).                                     */
typedef struct
279
{
280
    __m128i RandState;
281
    __m128i Q_Q10;     /* rounded by 10 bits and stored into pulses[] for the winning state           */
282
    __m128i Xq_Q14;    /* scaled by the gain and saturated into the xq output for the winning state   */
283
    __m128i Pred_Q15;
284
    __m128i Shape_Q14; /* copied into NSQ->sLTP_shp_Q14[] for the winning state                       */
285
} NSQ_del_dec_sample_struct;
286
287
/* Working state of all delayed-decision candidates. Every __m128i holds  */
/* one 32-bit value per candidate state (one per lane); the scalar NSQ    */
/* state is broadcast into these vectors on entry and the winning lane is */
/* written back on exit of silk_NSQ_del_dec_avx2().                       */
typedef struct
288
{
289
    __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH];
290
    __m128i LF_AR_Q14;
291
    __m128i Seed;
292
    __m128i SeedInit;  /* copy of the initial Seed; the winner's value is stored to psIndices->Seed */
293
    __m128i RD_Q10;    /* per-state cost; the state with the smallest value is the winner           */
294
    __m128i Diff_Q14;
295
    __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER];
296
    NSQ_del_dec_sample_struct Samples[DECISION_DELAY]; /* indexed modulo DECISION_DELAY */
297
} NSQ_del_dec_struct;
298
299
static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
300
    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
301
    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
302
    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
303
    const opus_int16 x16[],                    /* I    Input                           */
304
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
305
    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
306
    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
307
    opus_int subfr,                            /* I    Subframe number                 */
308
    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
309
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
310
    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
311
    const opus_int signal_type,                /* I    Signal type                     */
312
    const opus_int decisionDelay               /* I    Decision delay                  */
313
);
314
315
/*******************************************/
316
/* LPC analysis filter                     */
317
/* NB! State is kept internally and the    */
318
/* filter always starts with zero state    */
319
/* first d output samples are set to zero  */
320
/*******************************************/
321
static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
322
    opus_int16                  *out,               /* O    Output signal                           */
323
    const opus_int16            *in,                /* I    Input signal                            */
324
    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
325
    const opus_int32            len,                /* I    Signal length                           */
326
    const opus_int32            order               /* I    Filter order                            */
327
);
328
329
/******************************************/
330
/* Noise shape quantizer for one subframe */
331
/******************************************/
332
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
333
    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
334
    NSQ_del_dec_struct psDelDec[],              /* I/O  Delayed decision states            */
335
    opus_int signalType,                        /* I    Signal type                        */
336
    const opus_int32 x_Q10[],                   /* I                                       */
337
    opus_int8 pulses[],                         /* O                                       */
338
    opus_int16 xq[],                            /* O                                       */
339
    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
340
    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
341
    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
342
    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
343
    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
344
    opus_int lag,                               /* I    Pitch lag                          */
345
    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
346
    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
347
    opus_int32 LF_shp_Q14,                      /* I                                       */
348
    opus_int32 Gain_Q16,                        /* I                                       */
349
    opus_int Lambda_Q10,                        /* I                                       */
350
    opus_int offset_Q10,                        /* I                                       */
351
    opus_int length,                            /* I    Input length                       */
352
    opus_int subfr,                             /* I    Subframe number                    */
353
    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
354
    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
355
    opus_int warping_Q16,                       /* I                                       */
356
    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
357
    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
358
    opus_int decisionDelay                      /* I                                       */
359
);
360
361
/* AVX2 implementation of the delayed-decision noise shaping quantizer.     */
/* The candidate states are kept one per 32-bit lane of an __m128i.         */
/* When verify_assumptions() rejects the encoder configuration, the call    */
/* is forwarded unchanged to silk_NSQ_del_dec_c(). When OPUS_CHECK_ASM is   */
/* defined, the C version is additionally run on copies of the state and    */
/* its results are compared against this implementation with silk_assert(). */
void silk_NSQ_del_dec_avx2(
362
    const silk_encoder_state *psEncC,                            /* I    Encoder State               */
363
    silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
364
    SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
365
    const opus_int16 x16[],                                      /* I    Input                       */
366
    opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
367
    const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */
368
    const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
369
    const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
370
    const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
371
    const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
372
    const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
373
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
374
    const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
375
    const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
376
    const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
377
)
378
12.8M
{
379
#ifdef OPUS_CHECK_ASM
    /* Run the reference C implementation on private copies so its output */
    /* can be compared with the AVX2 result at the end of the function.   */
380
    silk_nsq_state NSQ_c;
381
    SideInfoIndices psIndices_c;
382
    opus_int8 pulses_c[MAX_FRAME_LENGTH];
383
    const opus_int8 *const pulses_a = pulses;
384
385
12.5M
    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
386
12.5M
    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
387
12.5M
    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
388
    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
389
                       pitchL, Lambda_Q10, LTP_scale_Q14);
390
#endif
391
392
12.8M
    /* Configurations outside the supported set are handled by the C code. */
    if (!verify_assumptions(psEncC))
393
7.12M
    {
394
7.12M
        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
395
7.12M
        return;
396
7.12M
    }
397
398
5.69M
    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
399
5.69M
    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
400
5.69M
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
401
5.69M
    opus_int16 *pxq;
402
5.69M
    VARDECL(opus_int32, sLTP_Q15);
403
5.69M
    VARDECL(opus_int16, sLTP);
404
5.69M
    opus_int32 HarmShapeFIRPacked_Q14;
405
5.69M
    opus_int offset_Q10;
406
5.69M
    opus_int32 Gain_Q10;
407
5.69M
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
408
5.69M
    opus_int32 delayedGain_Q10[DECISION_DELAY];
409
5.69M
    NSQ_del_dec_struct psDelDec = {0};
410
5.69M
    NSQ_del_dec_sample_struct *psSample;
411
5.69M
    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
412
5.69M
    SAVE_STACK;
413
414
5.69M
    /* Per-lane mask of unused states: each byte of the shifted constant is  */
    /* sign-extended to one 32-bit lane, so lanes with an index of           */
    /* nStatesDelayedDecision or above become all ones.                      */
    /* NOTE(review): the constant is unsigned long, so where that type is    */
    /* 64 bits wide this appears to rely on the conversion to int keeping    */
    /* only the low 32 bits - confirm.                                       */
    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));
415
416
    /* Set unvoiced lag to the previous one, overwrite later for voiced */
417
5.69M
    lag = NSQ->lagPrev;
418
419
5.69M
    silk_assert(NSQ->prev_gain_Q16 != 0);
420
5.69M
    /* Lane i starts with seed (i + psIndices->Seed) & 3. */
    psDelDec.Seed = _mm_and_si128(
421
5.69M
        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
422
5.69M
        _mm_set1_epi32(3));
423
5.69M
    psDelDec.SeedInit = psDelDec.Seed;
424
5.69M
    psDelDec.RD_Q10 = _mm_setzero_si128();
425
5.69M
    /* Broadcast the scalar NSQ state into every lane. */
    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
426
5.69M
    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
427
5.69M
    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
428
96.7M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
429
91.1M
    {
430
91.1M
        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
431
91.1M
    }
432
142M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
433
136M
    {
434
136M
        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
435
136M
    }
436
437
5.69M
    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
438
5.69M
    smpl_buf_idx = 0; /* index of oldest samples */
439
440
5.69M
    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);
441
442
    /* For voiced frames limit the decision delay to lower than the pitch lag */
443
5.69M
    if (psIndices->signalType == TYPE_VOICED)
444
172k
    {
445
792k
        for (k = 0; k < psEncC->nb_subfr; k++)
446
619k
        {
447
619k
            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
448
619k
        }
449
172k
    }
450
5.52M
    else
451
5.52M
    {
452
5.52M
        if (lag > 0)
453
186k
        {
454
186k
            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
455
186k
        }
456
5.52M
    }
457
458
5.69M
    /* The flag is cleared only when NLSFInterpCoef_Q2 equals 4. */
    if (psIndices->NLSFInterpCoef_Q2 == 4)
459
5.57M
    {
460
5.57M
        LSF_interpolation_flag = 0;
461
5.57M
    }
462
121k
    else
463
121k
    {
464
121k
        LSF_interpolation_flag = 1;
465
121k
    }
466
467
5.69M
    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
468
5.69M
    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
469
    /* Set up pointers to start of sub frame */
470
5.69M
    pxq = &NSQ->xq[psEncC->ltp_mem_length];
471
5.69M
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
472
5.69M
    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
473
5.69M
    subfr = 0;
474
24.6M
    for (k = 0; k < psEncC->nb_subfr; k++)
475
18.9M
    {
476
18.9M
        /* Select the coefficient set for this subframe. */
        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
477
18.9M
        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
478
18.9M
        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];
479
480
        /* Noise shape parameters */
481
18.9M
        silk_assert(HarmShapeGain_Q14[k] >= 0);
482
18.9M
        /* Pack gain/4 into the low 16 bits and gain/2 into the high 16 bits. */
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
483
18.9M
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
484
485
18.9M
        NSQ->rewhite_flag = 0;
486
18.9M
        if (psIndices->signalType == TYPE_VOICED)
487
619k
        {
488
            /* Voiced */
489
619k
            lag = pitchL[k];
490
491
            /* Re-whitening */
492
619k
            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
493
215k
            {
494
215k
                if (k == 2)
495
43.3k
                {
496
                    /* RESET DELAYED DECISIONS */
497
                    /* Find winner */
498
43.3k
                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
499
43.3k
                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
500
43.3k
                    Winner_selector = silk_index_to_selector(Winner_ind);
501
43.3k
                    /* Add a large penalty to every lane except the winner's. */
                    psDelDec.RD_Q10 = _mm_add_epi32(
502
43.3k
                        psDelDec.RD_Q10,
503
43.3k
                        _mm_blendv_epi8(
504
43.3k
                            _mm_set1_epi32(silk_int32_MAX >> 4),
505
43.3k
                            _mm_setzero_si128(),
506
43.3k
                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));
507
508
                    /* Copy final part of signals from winner state to output and long-term filter states */
509
43.3k
                    last_smple_idx = smpl_buf_idx + decisionDelay;
510
1.24M
                    for (i = 0; i < decisionDelay; i++)
511
1.19M
                    {
512
1.19M
                        /* Step one slot backwards through the sample buffer (modulo DECISION_DELAY). */
                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
513
1.19M
                        psSample = &psDelDec.Samples[last_smple_idx];
514
1.19M
                        pulses[i - decisionDelay] =
515
1.19M
                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
516
1.19M
                        /* k == 2 here, so Gains_Q16[1] is the gain of the preceding subframe. */
                        pxq[i - decisionDelay] =
517
1.19M
                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
518
1.19M
                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
519
1.19M
                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
520
1.19M
                    }
521
522
43.3k
                    subfr = 0;
523
43.3k
                }
524
525
                /* Rewhiten with new A coefs */
526
215k
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
527
215k
                silk_assert(start_idx > 0);
528
529
215k
                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
530
215k
                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);
531
532
215k
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
533
215k
                NSQ->rewhite_flag = 1;
534
215k
            }
535
619k
        }
536
537
18.9M
        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
538
18.9M
                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);
539
540
18.9M
        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
541
18.9M
                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
542
18.9M
                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
543
18.9M
                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);
544
545
18.9M
        /* Advance the input and output pointers to the next subframe. */
        x16 += psEncC->subfr_length;
546
18.9M
        pulses += psEncC->subfr_length;
547
18.9M
        pxq += psEncC->subfr_length;
548
18.9M
    }
549
550
    /* Find winner */
551
5.69M
    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
552
5.69M
    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));
553
554
    /* Copy final part of signals from winner state to output and long-term filter states */
555
5.69M
    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
556
5.69M
    last_smple_idx = smpl_buf_idx + decisionDelay;
557
5.69M
    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
558
231M
    for (i = 0; i < decisionDelay; i++)
559
225M
    {
560
225M
        /* Step one slot backwards through the sample buffer (modulo DECISION_DELAY). */
        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
561
225M
        psSample = &psDelDec.Samples[last_smple_idx];
562
563
225M
        pulses[i - decisionDelay] =
564
225M
            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
565
225M
        pxq[i - decisionDelay] =
566
225M
            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
567
225M
        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
568
225M
            silk_select_winner(psSample->Shape_Q14, Winner_selector);
569
225M
    }
570
96.7M
    /* Write the winning lane of the filter states back to the scalar NSQ state. */
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
571
91.1M
    {
572
91.1M
        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
573
91.1M
    }
574
142M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
575
136M
    {
576
136M
        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
577
136M
    }
578
579
    /* Update states */
580
5.69M
    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
581
5.69M
    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
582
5.69M
    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];
583
584
    /* Save quantized speech signal */
585
5.69M
    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
586
5.69M
    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));
587
588
#ifdef OPUS_CHECK_ASM
589
5.52M
    /* The AVX2 result must match the reference C result byte for byte. */
    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
590
5.52M
    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
591
5.52M
    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
592
5.52M
#endif
593
594
5.52M
    RESTORE_STACK;
595
5.52M
}
silk_NSQ_del_dec_avx2
Line
Count
Source
378
12.5M
{
379
12.5M
#ifdef OPUS_CHECK_ASM
380
12.5M
    silk_nsq_state NSQ_c;
381
12.5M
    SideInfoIndices psIndices_c;
382
12.5M
    opus_int8 pulses_c[MAX_FRAME_LENGTH];
383
12.5M
    const opus_int8 *const pulses_a = pulses;
384
385
12.5M
    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
386
12.5M
    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
387
12.5M
    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
388
12.5M
    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
389
12.5M
                       pitchL, Lambda_Q10, LTP_scale_Q14);
390
12.5M
#endif
391
392
12.5M
    if (!verify_assumptions(psEncC))
393
7.03M
    {
394
7.03M
        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
395
7.03M
        return;
396
7.03M
    }
397
398
5.52M
    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
399
5.52M
    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
400
5.52M
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
401
5.52M
    opus_int16 *pxq;
402
5.52M
    VARDECL(opus_int32, sLTP_Q15);
403
5.52M
    VARDECL(opus_int16, sLTP);
404
5.52M
    opus_int32 HarmShapeFIRPacked_Q14;
405
5.52M
    opus_int offset_Q10;
406
5.52M
    opus_int32 Gain_Q10;
407
5.52M
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
408
5.52M
    opus_int32 delayedGain_Q10[DECISION_DELAY];
409
5.52M
    NSQ_del_dec_struct psDelDec = {0};
410
5.52M
    NSQ_del_dec_sample_struct *psSample;
411
5.52M
    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
412
5.52M
    SAVE_STACK;
413
414
5.52M
    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));
415
416
    /* Set unvoiced lag to the previous one, overwrite later for voiced */
417
5.52M
    lag = NSQ->lagPrev;
418
419
5.52M
    silk_assert(NSQ->prev_gain_Q16 != 0);
420
5.52M
    psDelDec.Seed = _mm_and_si128(
421
5.52M
        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
422
5.52M
        _mm_set1_epi32(3));
423
5.52M
    psDelDec.SeedInit = psDelDec.Seed;
424
5.52M
    psDelDec.RD_Q10 = _mm_setzero_si128();
425
5.52M
    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
426
5.52M
    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
427
5.52M
    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
428
93.9M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
429
88.4M
    {
430
88.4M
        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
431
88.4M
    }
432
138M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
433
132M
    {
434
132M
        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
435
132M
    }
436
437
5.52M
    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
438
5.52M
    smpl_buf_idx = 0; /* index of oldest samples */
439
440
5.52M
    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);
441
442
    /* For voiced frames limit the decision delay to lower than the pitch lag */
443
5.52M
    if (psIndices->signalType == TYPE_VOICED)
444
141k
    {
445
637k
        for (k = 0; k < psEncC->nb_subfr; k++)
446
496k
        {
447
496k
            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
448
496k
        }
449
141k
    }
450
5.38M
    else
451
5.38M
    {
452
5.38M
        if (lag > 0)
453
108k
        {
454
108k
            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
455
108k
        }
456
5.38M
    }
457
458
5.52M
    if (psIndices->NLSFInterpCoef_Q2 == 4)
459
5.43M
    {
460
5.43M
        LSF_interpolation_flag = 0;
461
5.43M
    }
462
91.8k
    else
463
91.8k
    {
464
91.8k
        LSF_interpolation_flag = 1;
465
91.8k
    }
466
467
5.52M
    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
468
5.52M
    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
469
    /* Set up pointers to start of sub frame */
470
5.52M
    pxq = &NSQ->xq[psEncC->ltp_mem_length];
471
5.52M
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
472
5.52M
    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
473
5.52M
    subfr = 0;
474
23.8M
    for (k = 0; k < psEncC->nb_subfr; k++)
475
18.3M
    {
476
18.3M
        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
477
18.3M
        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
478
18.3M
        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];
479
480
        /* Noise shape parameters */
481
18.3M
        silk_assert(HarmShapeGain_Q14[k] >= 0);
482
18.3M
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
483
18.3M
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
484
485
18.3M
        NSQ->rewhite_flag = 0;
486
18.3M
        if (psIndices->signalType == TYPE_VOICED)
487
496k
        {
488
            /* Voiced */
489
496k
            lag = pitchL[k];
490
491
            /* Re-whitening */
492
496k
            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
493
173k
            {
494
173k
                if (k == 2)
495
31.8k
                {
496
                    /* RESET DELAYED DECISIONS */
497
                    /* Find winner */
498
31.8k
                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
499
31.8k
                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
500
31.8k
                    Winner_selector = silk_index_to_selector(Winner_ind);
501
31.8k
                    psDelDec.RD_Q10 = _mm_add_epi32(
502
31.8k
                        psDelDec.RD_Q10,
503
31.8k
                        _mm_blendv_epi8(
504
31.8k
                            _mm_set1_epi32(silk_int32_MAX >> 4),
505
31.8k
                            _mm_setzero_si128(),
506
31.8k
                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));
507
508
                    /* Copy final part of signals from winner state to output and long-term filter states */
509
31.8k
                    last_smple_idx = smpl_buf_idx + decisionDelay;
510
916k
                    for (i = 0; i < decisionDelay; i++)
511
884k
                    {
512
884k
                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
513
884k
                        psSample = &psDelDec.Samples[last_smple_idx];
514
884k
                        pulses[i - decisionDelay] =
515
884k
                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
516
884k
                        pxq[i - decisionDelay] =
517
884k
                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
518
884k
                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
519
884k
                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
520
884k
                    }
521
522
31.8k
                    subfr = 0;
523
31.8k
                }
524
525
                /* Rewhiten with new A coefs */
526
173k
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
527
173k
                silk_assert(start_idx > 0);
528
529
173k
                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
530
173k
                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);
531
532
173k
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
533
173k
                NSQ->rewhite_flag = 1;
534
173k
            }
535
496k
        }
536
537
18.3M
        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
538
18.3M
                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);
539
540
18.3M
        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
541
18.3M
                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
542
18.3M
                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
543
18.3M
                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);
544
545
18.3M
        x16 += psEncC->subfr_length;
546
18.3M
        pulses += psEncC->subfr_length;
547
18.3M
        pxq += psEncC->subfr_length;
548
18.3M
    }
549
550
    /* Find winner */
551
5.52M
    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
552
5.52M
    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));
553
554
    /* Copy final part of signals from winner state to output and long-term filter states */
555
5.52M
    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
556
5.52M
    last_smple_idx = smpl_buf_idx + decisionDelay;
557
5.52M
    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
558
225M
    for (i = 0; i < decisionDelay; i++)
559
219M
    {
560
219M
        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
561
219M
        psSample = &psDelDec.Samples[last_smple_idx];
562
563
219M
        pulses[i - decisionDelay] =
564
219M
            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
565
219M
        pxq[i - decisionDelay] =
566
219M
            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
567
219M
        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
568
219M
            silk_select_winner(psSample->Shape_Q14, Winner_selector);
569
219M
    }
570
93.9M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
571
88.4M
    {
572
88.4M
        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
573
88.4M
    }
574
138M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
575
132M
    {
576
132M
        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
577
132M
    }
578
579
    /* Update states */
580
5.52M
    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
581
5.52M
    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
582
5.52M
    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];
583
584
    /* Save quantized speech signal */
585
5.52M
    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
586
5.52M
    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));
587
588
5.52M
#ifdef OPUS_CHECK_ASM
589
5.52M
    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
590
5.52M
    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
591
5.52M
    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
592
5.52M
#endif
593
594
5.52M
    RESTORE_STACK;
595
5.52M
}
silk_NSQ_del_dec_avx2
Line
Count
Source
378
257k
{
379
#ifdef OPUS_CHECK_ASM
380
    silk_nsq_state NSQ_c;
381
    SideInfoIndices psIndices_c;
382
    opus_int8 pulses_c[MAX_FRAME_LENGTH];
383
    const opus_int8 *const pulses_a = pulses;
384
385
    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
386
    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
387
    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
388
    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
389
                       pitchL, Lambda_Q10, LTP_scale_Q14);
390
#endif
391
392
257k
    if (!verify_assumptions(psEncC))
393
92.4k
    {
394
92.4k
        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
395
92.4k
        return;
396
92.4k
    }
397
398
165k
    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
399
165k
    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
400
165k
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
401
165k
    opus_int16 *pxq;
402
165k
    VARDECL(opus_int32, sLTP_Q15);
403
165k
    VARDECL(opus_int16, sLTP);
404
165k
    opus_int32 HarmShapeFIRPacked_Q14;
405
165k
    opus_int offset_Q10;
406
165k
    opus_int32 Gain_Q10;
407
165k
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
408
165k
    opus_int32 delayedGain_Q10[DECISION_DELAY];
409
165k
    NSQ_del_dec_struct psDelDec = {0};
410
165k
    NSQ_del_dec_sample_struct *psSample;
411
165k
    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
412
165k
    SAVE_STACK;
413
414
165k
    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));
415
416
    /* Set unvoiced lag to the previous one, overwrite later for voiced */
417
165k
    lag = NSQ->lagPrev;
418
419
165k
    silk_assert(NSQ->prev_gain_Q16 != 0);
420
165k
    psDelDec.Seed = _mm_and_si128(
421
165k
        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
422
165k
        _mm_set1_epi32(3));
423
165k
    psDelDec.SeedInit = psDelDec.Seed;
424
165k
    psDelDec.RD_Q10 = _mm_setzero_si128();
425
165k
    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
426
165k
    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
427
165k
    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
428
2.80M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
429
2.64M
    {
430
2.64M
        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
431
2.64M
    }
432
4.12M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
433
3.96M
    {
434
3.96M
        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
435
3.96M
    }
436
437
165k
    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
438
165k
    smpl_buf_idx = 0; /* index of oldest samples */
439
440
165k
    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);
441
442
    /* For voiced frames limit the decision delay to lower than the pitch lag */
443
165k
    if (psIndices->signalType == TYPE_VOICED)
444
30.9k
    {
445
154k
        for (k = 0; k < psEncC->nb_subfr; k++)
446
123k
        {
447
123k
            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
448
123k
        }
449
30.9k
    }
450
134k
    else
451
134k
    {
452
134k
        if (lag > 0)
453
77.9k
        {
454
77.9k
            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
455
77.9k
        }
456
134k
    }
457
458
165k
    if (psIndices->NLSFInterpCoef_Q2 == 4)
459
135k
    {
460
135k
        LSF_interpolation_flag = 0;
461
135k
    }
462
29.3k
    else
463
29.3k
    {
464
29.3k
        LSF_interpolation_flag = 1;
465
29.3k
    }
466
467
165k
    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
468
165k
    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
469
    /* Set up pointers to start of sub frame */
470
165k
    pxq = &NSQ->xq[psEncC->ltp_mem_length];
471
165k
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
472
165k
    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
473
165k
    subfr = 0;
474
752k
    for (k = 0; k < psEncC->nb_subfr; k++)
475
587k
    {
476
587k
        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
477
587k
        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
478
587k
        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];
479
480
        /* Noise shape parameters */
481
587k
        silk_assert(HarmShapeGain_Q14[k] >= 0);
482
587k
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
483
587k
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
484
485
587k
        NSQ->rewhite_flag = 0;
486
587k
        if (psIndices->signalType == TYPE_VOICED)
487
123k
        {
488
            /* Voiced */
489
123k
            lag = pitchL[k];
490
491
            /* Re-whitening */
492
123k
            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
493
42.4k
            {
494
42.4k
                if (k == 2)
495
11.5k
                {
496
                    /* RESET DELAYED DECISIONS */
497
                    /* Find winner */
498
11.5k
                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
499
11.5k
                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
500
11.5k
                    Winner_selector = silk_index_to_selector(Winner_ind);
501
11.5k
                    psDelDec.RD_Q10 = _mm_add_epi32(
502
11.5k
                        psDelDec.RD_Q10,
503
11.5k
                        _mm_blendv_epi8(
504
11.5k
                            _mm_set1_epi32(silk_int32_MAX >> 4),
505
11.5k
                            _mm_setzero_si128(),
506
11.5k
                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));
507
508
                    /* Copy final part of signals from winner state to output and long-term filter states */
509
11.5k
                    last_smple_idx = smpl_buf_idx + decisionDelay;
510
326k
                    for (i = 0; i < decisionDelay; i++)
511
314k
                    {
512
314k
                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
513
314k
                        psSample = &psDelDec.Samples[last_smple_idx];
514
314k
                        pulses[i - decisionDelay] =
515
314k
                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
516
314k
                        pxq[i - decisionDelay] =
517
314k
                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
518
314k
                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
519
314k
                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
520
314k
                    }
521
522
11.5k
                    subfr = 0;
523
11.5k
                }
524
525
                /* Rewhiten with new A coefs */
526
42.4k
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
527
42.4k
                silk_assert(start_idx > 0);
528
529
42.4k
                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
530
42.4k
                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);
531
532
42.4k
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
533
42.4k
                NSQ->rewhite_flag = 1;
534
42.4k
            }
535
123k
        }
536
537
587k
        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
538
587k
                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);
539
540
587k
        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
541
587k
                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
542
587k
                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
543
587k
                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);
544
545
587k
        x16 += psEncC->subfr_length;
546
587k
        pulses += psEncC->subfr_length;
547
587k
        pxq += psEncC->subfr_length;
548
587k
    }
549
550
    /* Find winner */
551
165k
    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
552
165k
    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));
553
554
    /* Copy final part of signals from winner state to output and long-term filter states */
555
165k
    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
556
165k
    last_smple_idx = smpl_buf_idx + decisionDelay;
557
165k
    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
558
6.33M
    for (i = 0; i < decisionDelay; i++)
559
6.17M
    {
560
6.17M
        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
561
6.17M
        psSample = &psDelDec.Samples[last_smple_idx];
562
563
6.17M
        pulses[i - decisionDelay] =
564
6.17M
            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
565
6.17M
        pxq[i - decisionDelay] =
566
6.17M
            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
567
6.17M
        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
568
6.17M
            silk_select_winner(psSample->Shape_Q14, Winner_selector);
569
6.17M
    }
570
2.80M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
571
2.64M
    {
572
2.64M
        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
573
2.64M
    }
574
4.12M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
575
3.96M
    {
576
3.96M
        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
577
3.96M
    }
578
579
    /* Update states */
580
165k
    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
581
165k
    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
582
165k
    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];
583
584
    /* Save quantized speech signal */
585
165k
    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
586
165k
    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));
587
588
#ifdef OPUS_CHECK_ASM
589
    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
590
    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
591
    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
592
#endif
593
594
165k
    RESTORE_STACK;
595
165k
}
596
597
/*
 * Short-term prediction sum computed on four 32-bit lanes in parallel.
 *
 * For each of the four lanes of the __m128i inputs this accumulates
 *     (order >> 1) + sum over j in [0, order) of high32(buf32[-j] * (coef16[j] << 16))
 * i.e. every product contributes only the upper 32 bits of its 64-bit result.
 *
 * buf32  : pointer to the most recent vector; it is read at offsets 0 down to
 *          -(order - 1), so that many earlier elements must be valid.
 * coef16 : `order` 16-bit coefficients, coef16[0] pairing with buf32[0].
 * order  : must be 10 or 16 (asserted); the last six taps run only for 16.
 *
 * Returns the result of silk_cvtepi64_epi32_high() on the accumulator
 * (presumably the upper dword of each 64-bit lane packed into a __m128i --
 * TODO confirm against that helper's definition).
 * NOTE(review): the four lanes presumably map to the delayed-decision states
 * kept in NSQ_del_dec_struct -- confirm with the caller.
 */
static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order)
598
927M
{
599
927M
    __m256i out;
600
927M
    silk_assert(order == 10 || order == 16);
601
602
    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    /* The accumulator therefore starts at order / 2 in every 32-bit lane instead of zero. */
603
927M
    out = _mm256_set1_epi32(order >> 1);
    /* Each tap: widen the four 32-bit samples to 64-bit lanes, multiply by the
     * coefficient pre-shifted left by 16, then add with a 32-bit lane add so the
     * high dwords accumulate without any carry coming up from the low dwords. */
604
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */
605
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */
606
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */
607
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */
608
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */
609
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */
610
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */
611
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */
612
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */
613
927M
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */
614
615
927M
    /* Taps 10..15 exist only for the order-16 predictor; order 10 stops above. */
    if (order == 16)
616
173M
    {
617
173M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */
618
173M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */
619
173M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */
620
173M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */
621
173M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */
622
173M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */
623
173M
    }
624
927M
    return silk_cvtepi64_epi32_high(out);
625
927M
}
626
627
/******************************************/
628
/* Noise shape quantizer for one subframe */
629
/******************************************/
630
/*
 * Delayed-decision noise shaping quantizer, run over `length` samples.
 *
 * The __m128i members of psDelDec are processed with 32-bit lane operations,
 * so each vector carries four values side by side, one per delayed-decision
 * state; MaskDelDec is handed to the masked horizontal min/max helpers.
 *
 * For every sample i the loop:
 *   1. computes the scalar long-term prediction (only when signalType is
 *      TYPE_VOICED) and the long-term shaping term (only when lag > 0);
 *   2. advances psDelDec->Seed and computes, per state, the short-term
 *      prediction and the noise shaping feedback (updating sAR2_Q14);
 *   3. forms the residual r_Q10 and two quantization candidates q1/q2, and
 *      measures a rate-distortion value for each;
 *   4. arranges the results so that the low 128 bits of the SS_* vectors hold
 *      the cheaper candidate of each state and the high 128 bits the other;
 *   5. picks the winner among the low-half RD values, penalizes states whose
 *      RandState at the delayed sample differs from the winner's, and, if the
 *      best high-half candidate beats the worst low-half one, overwrites that
 *      worst state with it;
 *   6. writes the delayed outputs (pulses, xq, sLTP_shp_Q14, sLTP_Q15) and
 *      stores the new per-state values in psDelDec.
 *
 * After the loop, the NSQ_LPC_BUF_LENGTH entries starting at sLPC_Q14[length]
 * are copied to the start of sLPC_Q14.
 */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
631
    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
632
    NSQ_del_dec_struct *psDelDec,               /* I/O  Delayed decision states            */
633
    opus_int signalType,                        /* I    Signal type                        */
634
    const opus_int32 x_Q10[],                   /* I                                       */
635
    opus_int8 pulses[],                         /* O                                       */
636
    opus_int16 xq[],                            /* O                                       */
637
    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
638
    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
639
    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
640
    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
641
    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
642
    opus_int lag,                               /* I    Pitch lag                          */
643
    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
644
    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
645
    opus_int32 LF_shp_Q14,                      /* I                                       */
646
    opus_int32 Gain_Q16,                        /* I                                       */
647
    opus_int Lambda_Q10,                        /* I                                       */
648
    opus_int offset_Q10,                        /* I                                       */
649
    opus_int length,                            /* I    Input length                       */
650
    opus_int subfr,                             /* I    Subframe number                    */
651
    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
652
    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
653
    opus_int warping_Q16,                       /* I                                       */
654
    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
655
    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
656
    opus_int decisionDelay                      /* I                                       */
657
)
658
18.9M
{
659
18.9M
    int i;
660
18.9M
    opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2];
661
18.9M
    opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2];
662
18.9M
    opus_int32 Gain_Q10 = Gain_Q16 >> 6;
663
664
946M
    for (i = 0; i < length; i++)
665
927M
    {
666
        /* Perform common calculations used in all states */
667
        /* NSQ_sample_struct */
668
        /* Low  128 bits => 1st set */
669
        /* High 128 bits => 2nd set */
670
927M
        int j;
671
927M
        __m256i SS_Q_Q10;
672
927M
        __m256i SS_RD_Q10;
673
927M
        __m256i SS_xq_Q14;
674
927M
        __m256i SS_LF_AR_Q14;
675
927M
        __m256i SS_Diff_Q14;
676
927M
        __m256i SS_sLTP_shp_Q14;
677
927M
        __m256i SS_LPC_exc_Q14;
678
927M
        __m256i exc_Q14;
679
927M
        __m256i q_Q10, rr_Q10, rd_Q10;
680
927M
        __m256i mask;
681
927M
        __m128i LPC_pred_Q14, n_AR_Q14;
682
927M
        __m128i RDmin_Q10, RDmax_Q10;
683
927M
        __m128i n_LF_Q14;
684
927M
        __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10;
685
927M
        __m128i Winner_rand_state, Winner_selector;
686
927M
        __m128i tmp0, tmp1;
687
927M
        NSQ_del_dec_sample_struct *psLastSample, *psSample;
688
927M
        opus_int32 RDmin_ind, RDmax_ind, last_smple_idx;
689
927M
        opus_int32 LTP_pred_Q14, n_LTP_Q14;
690
691
        /* Long-term prediction */
        /* Scalar: the same value is broadcast to every state further down. */
692
927M
        if (signalType == TYPE_VOICED)
693
31.0M
        {
694
            /* Unrolled loop */
695
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
696
31.0M
            LTP_pred_Q14 = 2;
697
31.0M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]);
698
31.0M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]);
699
31.0M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]);
700
31.0M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]);
701
31.0M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]);
702
31.0M
            LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */
703
31.0M
            pred_lag_ptr++;
704
31.0M
        }
705
896M
        else
706
896M
        {
707
896M
            LTP_pred_Q14 = 0;
708
896M
        }
709
710
        /* Long-term shaping */
711
927M
        if (lag > 0)
712
59.9M
        {
713
            /* Symmetric, packed FIR coefficients */
714
59.9M
            n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]);
715
59.9M
            n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14);
716
59.9M
            n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14);
717
59.9M
            n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */
718
59.9M
            shp_lag_ptr++;
719
59.9M
        }
720
867M
        else
721
867M
        {
722
867M
            n_LTP_Q14 = 0;
723
867M
        }
724
725
        /* BEGIN Updating Delayed Decision States */
726
727
        /* Generate dither */
728
927M
        psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed);
729
730
        /* Short-term prediction */
731
927M
        LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder);
732
927M
        LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */
733
734
        /* Noise shape feedback */
735
927M
        silk_assert(shapingLPCOrder > 0);
736
927M
        silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */
737
        /* Output of lowpass section */
738
927M
        tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16));
739
927M
        n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1);
740
20.6G
        for (j = 0; j < shapingLPCOrder - 1; j++)
741
19.7G
        {
742
            /* Output of allpass section */
743
19.7G
            tmp1 = psDelDec->sAR2_Q14[j];
744
19.7G
            psDelDec->sAR2_Q14[j] = tmp0;
745
19.7G
            n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j]));
746
19.7G
            tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16));
747
19.7G
        }
748
927M
        psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0;
749
927M
        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1]));
750
751
927M
        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1);                                                  /* Q11 -> Q12 */
752
927M
        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */
753
927M
        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2);                                                  /* Q12 -> Q14 */
754
755
927M
        tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */
756
927M
        tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16);                  /* Q12 */
757
927M
        n_LF_Q14 = _mm_add_epi32(tmp0, tmp1);                                                /* Q12 */
758
927M
        n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2);                                              /* Q12 -> Q14 */
759
760
        /* Input minus prediction plus noise feedback                       */
761
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
762
927M
        tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14);              /* Q14 */
763
927M
        tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q13 */
764
927M
        tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0);                      /* Q13 */
765
927M
        tmp0 = silk_mm_srai_round_epi32(tmp0, 4);                      /* Q10 */
766
767
927M
        r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */
768
769
        /* Flip sign depending on dither */
770
927M
        r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed);
771
927M
        r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10);
772
773
        /* Find two quantization level candidates and measure their rate-distortion */
774
927M
        q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10));
775
927M
        q1_Q0 = _mm_srai_epi32(q1_Q10, 10);
776
927M
        if (Lambda_Q10 > 2048)
777
4.81M
        {
778
            /* For aggressive RDO, the bias becomes more than one pulse. */
779
4.81M
            tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */
780
4.81M
            q1_Q0 = _mm_srai_epi32(q1_Q10, 31);
781
4.81M
            tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128());
782
4.81M
            tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10);
783
4.81M
            q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1);
784
4.81M
        }
785
786
927M
        tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0);
787
927M
        q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0);
788
927M
        q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10));
789
790
        /* check if q1_Q0 is 0 or -1 */
791
927M
        tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0);
792
927M
        tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128());
793
927M
        tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1);
794
927M
        q2_Q10 = _mm_add_epi32(q1_Q10, tmp0);
        /* Pack both candidates: q1 in the low 128 bits, q2 in the high 128 bits. */
795
927M
        q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10);
796
797
927M
        rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10);
798
927M
        rd_Q10 = _mm256_abs_epi32(q_Q10);
799
927M
        rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10);
800
927M
        rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10));
801
927M
        rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10);
802
927M
        rd_Q10 = _mm256_srai_epi32(rd_Q10, 10);
803
804
        /* mask lanes are all-ones where q1 has the smaller RD value than q2 (same mask in both halves). */
        /* Where the mask is clear the two halves are swapped, so the low half ends up with the cheaper candidate. */
927M
        mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1)));
805
927M
        SS_RD_Q10 = _mm256_add_epi32(
806
927M
            _mm256_broadcastsi128_si256(psDelDec->RD_Q10),
807
927M
            _mm256_blendv_epi8(
808
927M
                _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1),
809
927M
                rd_Q10,
810
927M
                mask));
811
927M
        SS_Q_Q10 = _mm256_blendv_epi8(
812
927M
            _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1),
813
927M
            q_Q10,
814
927M
            mask);
815
816
        /* Update states for best and second best quantization */
817
818
        /* Quantized excitation */
819
927M
        exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed));
820
821
        /* Add predictions */
822
927M
        exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14));
823
927M
        SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1);
824
927M
        SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14));
825
826
        /* Update states */
827
927M
        SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4)));
828
927M
        SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14));
829
927M
        SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14));
830
831
        /* END Updating Delayed Decision States */
832
833
        /* Step the circular sample index back by one (modulo DECISION_DELAY); */
        /* last_smple_idx is the slot decisionDelay positions after it. */
927M
        *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY;
834
927M
        last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY;
835
927M
        psLastSample = &psDelDec->Samples[last_smple_idx];
836
837
        /* Find winner */
838
927M
        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec);
839
927M
        Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10)));
840
841
        /* Increase RD values of expired states */
        /* States whose RandState at the delayed sample equals the winner's keep their RD value; */
        /* all other lanes (in both halves) get silk_int32_MAX >> 4 added. */
842
927M
        Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector);
843
844
927M
        SS_RD_Q10 = _mm256_blendv_epi8(
845
927M
            _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)),
846
927M
            SS_RD_Q10,
847
927M
            _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state)));
848
849
        /* find worst in first set */
850
927M
        RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec);
851
        /* find best in second set */
852
927M
        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec);
853
854
        /* Replace a state if best from second set outperforms worst in first set */
855
927M
        tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10);
856
927M
        if (!_mm_test_all_zeros(tmp0, tmp0))
857
241M
        {
858
241M
            int t;
859
241M
            RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0));
860
241M
            RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1));
            /* tmp1: all-ones in 32-bit lane RDmax_ind, zero in the other lanes. */
861
241M
            tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3)));
            /* tmp0: byte-shuffle control that is the identity everywhere except lane RDmax_ind, */
            /* which takes silk_index_to_selector(RDmin_ind) (presumably the bytes of lane RDmin_ind; helper not shown here). */
862
241M
            tmp0 = _mm_blendv_epi8(
863
241M
                _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0),
864
241M
                silk_index_to_selector(RDmin_ind),
865
241M
                tmp1);
            /* Apply the same lane shuffle to every stored per-state vector (sLPC_Q14 only from index i onwards). */
866
16.9G
            for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++)
867
16.6G
            {
868
16.6G
                psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0);
869
16.6G
            }
870
241M
            psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0);
871
241M
            psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0);
872
6.04G
            for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++)
873
5.79G
            {
874
5.79G
                psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0);
875
5.79G
            }
876
9.90G
            for (t = 0; t < DECISION_DELAY; t++)
877
9.66G
            {
878
9.66G
                psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0);
879
9.66G
                psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0);
880
9.66G
                psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0);
881
9.66G
                psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0);
882
9.66G
                psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0);
883
9.66G
            }
            /* Dword permute control for the low half: identity, except lane RDmax_ind which reads */
            /* element RDmin_ind + 4, i.e. from the high half. Only the low 128 bits of the permuted */
            /* SS_* vectors are read below, so the upper half of this control is never relied on. */
884
241M
            mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1));
885
241M
            SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask);
886
241M
            SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask);
887
241M
            SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask);
888
241M
            SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask);
889
241M
            SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask);
890
241M
            SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask);
891
241M
            SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask);
892
241M
        }
893
894
        /* Write samples from winner to output and long-term filter states */
        /* Output trails the current sample by decisionDelay; unless subfr > 0, nothing is written while i < decisionDelay. */
895
927M
        if (subfr > 0 || i >= decisionDelay)
896
700M
        {
897
700M
            pulses[i - decisionDelay] =
898
700M
                (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10);
899
700M
            xq[i - decisionDelay] =
900
700M
                silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8));
901
700M
            NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] =
902
700M
                silk_select_winner(psLastSample->Shape_Q14, Winner_selector);
903
700M
            sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] =
904
700M
                silk_select_winner(psLastSample->Pred_Q15, Winner_selector);
905
700M
        }
906
927M
        NSQ->sLTP_shp_buf_idx++;
907
927M
        NSQ->sLTP_buf_idx++;
908
909
        /* Update states */
        /* Only the low 128 bits (first set) of each SS_* vector are stored back. */
910
927M
        psSample = &psDelDec->Samples[*smpl_buf_idx];
911
927M
        psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10));
912
927M
        psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14);
913
927M
        psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14);
914
927M
        psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14);
915
927M
        psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10);
916
927M
        psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14);
917
927M
        psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10);
918
927M
        psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14);
919
927M
        psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14);
920
927M
        psSample->RandState = psDelDec->Seed;
921
927M
        delayedGain_Q10[*smpl_buf_idx] = Gain_Q10;
922
927M
    }
923
    /* Update LPC states */
    /* Copy the NSQ_LPC_BUF_LENGTH entries starting at sLPC_Q14[length] to the front of the buffer. */
924
322M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
925
303M
    {
926
303M
        psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i];
927
303M
    }
928
18.9M
}
929
930
static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
931
    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
932
    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
933
    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
934
    const opus_int16 x16[],                    /* I    Input                           */
935
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
936
    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
937
    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
938
    opus_int subfr,                            /* I    Subframe number                 */
939
    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
940
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
941
    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
942
    const opus_int signal_type,                /* I    Signal type                     */
943
    const opus_int decisionDelay               /* I    Decision delay                  */
944
)
945
18.9M
{
946
18.9M
    int i;
947
18.9M
    opus_int lag;
948
18.9M
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
949
18.9M
    NSQ_del_dec_sample_struct *psSample;
950
951
18.9M
    lag = pitchL[subfr];
952
18.9M
    inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);
953
18.9M
    silk_assert(inv_gain_Q31 != 0);
954
955
    /* Scale input */
956
18.9M
    inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);
957
250M
    for (i = 0; i < psEncC->subfr_length; i+=4)
958
231M
    {
959
231M
        __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
960
231M
        x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
961
231M
        _mm_storeu_si128((__m128i*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
962
231M
    }
963
964
    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
965
18.9M
    if (NSQ->rewhite_flag)
966
215k
    {
967
215k
        if (subfr == 0)
968
172k
        {
969
            /* Do LTP downscaling */
970
172k
            inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);
971
172k
        }
972
15.2M
        for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)
973
15.0M
        {
974
15.0M
            silk_assert(i < MAX_FRAME_LENGTH);
975
15.0M
            sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);
976
15.0M
        }
977
215k
    }
978
979
    /* Adjust for changing gain */
980
18.9M
    if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)
981
1.84M
    {
982
1.84M
        gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);
983
984
        /* Scale long-term shaping state */
985
94.6M
        for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
986
92.7M
        {
987
92.7M
      opus_int32 *p = &NSQ->sLTP_shp_Q14[i];
988
92.7M
            _mm_storeu_si128((__m128i*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)p), gain_adj_Q16));
989
92.7M
        }
990
991
        /* Scale long-term prediction state */
992
1.84M
        if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)
993
311k
        {
994
13.4M
            for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)
995
13.1M
            {
996
13.1M
                sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;
997
13.1M
            }
998
311k
        }
999
1000
        /* Scale scalar states */
1001
1.84M
        psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16);
1002
1.84M
        psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16);
1003
1004
        /* Scale short-term prediction and shaping states */
1005
31.3M
        for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
1006
29.4M
        {
1007
29.4M
            psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);
1008
29.4M
        }
1009
75.5M
        for (i = 0; i < DECISION_DELAY; i++)
1010
73.7M
        {
1011
73.7M
            psSample = &psDelDec->Samples[i];
1012
73.7M
            psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);
1013
73.7M
            psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);
1014
73.7M
        }
1015
46.0M
        for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
1016
44.2M
        {
1017
44.2M
            psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);
1018
44.2M
        }
1019
1020
        /* Save inverse gain */
1021
1.84M
        NSQ->prev_gain_Q16 = Gains_Q16[subfr];
1022
1.84M
    }
1023
18.9M
}
1024
1025
static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
1026
    opus_int16                  *out,               /* O    Output signal                           */
1027
    const opus_int16            *in,                /* I    Input signal                            */
1028
    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
1029
    const opus_int32            len,                /* I    Signal length                           */
1030
    const opus_int32            order               /* I    Filter order                            */
1031
)
1032
215k
{
1033
215k
    int i;
1034
215k
    opus_int32       out32_Q12, out32;
1035
215k
    silk_assert(order == 10 || order == 16);
1036
1037
15.2M
    for(i = order; i < len; i++ )
1038
15.0M
    {
1039
15.0M
        const opus_int16 *in_ptr = &in[ i ];
1040
        /* Allowing wrap around so that two wraps can cancel each other. The rare
1041
           cases where the result wraps around can only be triggered by invalid streams*/
1042
1043
15.0M
        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-8]));
1044
15.0M
        __m256i B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&      B[0]));
1045
15.0M
        __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
1046
15.0M
        if (order > 10)
1047
3.14M
        {
1048
3.14M
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-16]));
1049
3.14M
            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&B       [8]));
1050
3.14M
            B_v  = silk_mm256_reverse_epi32(B_v);
1051
3.14M
        }
1052
11.8M
        else
1053
11.8M
        {
1054
11.8M
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10]));
1055
11.8M
            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B       [8]));
1056
11.8M
            B_v  = _mm256_shuffle_epi32(B_v, 0x01);
1057
11.8M
        }
1058
15.0M
        sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v));
1059
1060
15.0M
        out32_Q12 = silk_mm256_hsum_epi32(sum);
1061
1062
        /* Subtract prediction */
1063
15.0M
        out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 );
1064
1065
        /* Scale to Q0 */
1066
15.0M
        out32 = silk_sar_round_32(out32_Q12, 12);
1067
1068
        /* Saturate output */
1069
15.0M
        out[ i ] = silk_sat16(out32);
1070
15.0M
    }
1071
1072
    /* Set first d output samples to zero */
1073
215k
    silk_memset( out, 0, order * sizeof( opus_int16 ) );
1074
215k
}