Coverage Report

Created: 2026-05-16 07:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/opus/silk/x86/NSQ_del_dec_avx2.c
Line
Count
Source
1
/***********************************************************************
2
Copyright (c) 2021 Google Inc.
3
Redistribution and use in source and binary forms, with or without
4
modification, are permitted provided that the following conditions
5
are met:
6
- Redistributions of source code must retain the above copyright notice,
7
this list of conditions and the following disclaimer.
8
- Redistributions in binary form must reproduce the above copyright
9
notice, this list of conditions and the following disclaimer in the
10
documentation and/or other materials provided with the distribution.
11
- Neither the name of Internet Society, IETF or IETF Trust, nor the
12
names of specific contributors, may be used to endorse or promote
13
products derived from this software without specific prior written
14
permission.
15
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
POSSIBILITY OF SUCH DAMAGE.
26
***********************************************************************/
27
28
#ifdef HAVE_CONFIG_H
29
#include "config.h"
30
#endif
31
32
#ifdef OPUS_CHECK_ASM
33
#include <string.h>
34
#endif
35
36
#include "opus_defines.h"
37
#include <immintrin.h>
38
39
#include "main.h"
40
#include "stack_alloc.h"
41
#include "NSQ.h"
42
#include "celt/x86/x86cpu.h"
43
44
/* Returns TRUE if all assumptions met */
45
static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
46
61.5M
{
47
    /* This optimization is based on these assumptions        */
48
    /* These assumptions are fundamental and hence assert are */
49
    /* used. Should any assert triggers, we have to re-visit  */
50
    /* all related code to make sure it still functions the   */
51
    /* same as the C implementation.                          */
52
61.5M
    silk_assert(MAX_DEL_DEC_STATES  <= 4      &&
53
61.5M
                MAX_FRAME_LENGTH     % 4 == 0 &&
54
61.5M
                MAX_SUB_FRAME_LENGTH % 4 == 0 &&
55
61.5M
                LTP_MEM_LENGTH_MS    % 4 == 0 );
56
61.5M
    silk_assert(psEncC->fs_kHz ==  8 ||
57
61.5M
                psEncC->fs_kHz == 12 ||
58
61.5M
                psEncC->fs_kHz == 16 );
59
61.5M
    silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR &&
60
61.5M
                psEncC->nb_subfr > 0             );
61
61.5M
    silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES &&
62
61.5M
                psEncC->nStatesDelayedDecision > 0                   );
63
61.5M
    silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS);
64
65
    /* Regressions were observed on certain AMD Zen CPUs when      */
66
    /* nStatesDelayedDecision is 1 or 2. Ideally we should detect  */
67
    /* these CPUs and enable this optimization on others; however, */
68
    /* there is no good way to do so under current OPUS framework. */
69
61.5M
    return psEncC->nStatesDelayedDecision == 3 ||
70
47.7M
           psEncC->nStatesDelayedDecision == 4;
71
61.5M
}
72
73
/* Intrinsics not defined on MSVC */
74
#ifdef _MSC_VER
75
#include <intsafe.h>
76
/* MSVC stand-in for the GCC/Clang builtin of the same name.             */
/* Stores the wrapped 32-bit sum of a and b in *res and returns 1 if the */
/* signed addition overflowed, 0 otherwise.                              */
static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
{
    /* Add in unsigned arithmetic: wrap-around is well defined there, */
    /* whereas overflowing a signed addition is undefined behavior.   */
    const opus_uint32 ua = (opus_uint32)a;
    const opus_uint32 ub = (opus_uint32)b;
    const opus_uint32 usum = ua + ub;

    *res = (opus_int32)usum;
    /* Overflow happened iff the sum's sign differs from both operands. */
    return (int)(((usum ^ ua) & (usum ^ ub)) >> 31);
}
81
static inline int __builtin_ctz(unsigned int x)
82
{
83
    DWORD res = 0;
84
    return _BitScanForward(&res, x) ? res : 32;
85
}
86
#endif
87
88
/* Gathers the upper 32 bits of each of the four 64-bit lanes of num     */
/* (dwords 1, 3, 5 and 7 of the 8 x 32-bit view) into a 128-bit vector. */
static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num)
{
    const __m256i odd_dwords = _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1);
    const __m256i gathered = _mm256_permutevar8x32_epi32(num, odd_dwords);
    return _mm256_castsi256_si128(gathered);
}
92
93
/* Clamps num to the range [silk_int16_MIN, silk_int16_MAX]. */
static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num)
{
    if (num > silk_int16_MAX)
    {
        num = silk_int16_MAX;
    }
    if (num < silk_int16_MIN)
    {
        num = silk_int16_MIN;
    }
    return num;
}
99
100
/* Arithmetic right shift of a by bits with rounding: half of the   */
/* divisor, 1 << (bits-1), is added before the shift.                */
/* Requires 0 < bits < 31.                                           */
static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits)
{
    opus_int32 biased;
    silk_assert(bits > 0 && bits < 31);
    biased = a + (1 << (bits-1));
    return biased >> bits;
}
106
107
/* Multiplies a by b and shifts the result right with rounding.          */
/* With OPUS_CHECK_ASM defined this expands to                            */
/* silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits), the same macros the C code */
/* uses. Otherwise the full 64-bit product is shifted right by            */
/* (bits + 16) with a single rounding step.                               */
static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits)
{
    silk_assert(bits > 0 && bits < 63);
#ifdef OPUS_CHECK_ASM
    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
#else
    /* This code is more correct, but it won't overflow like the C code in some rare cases. */
    {
        const int shift = bits + 16;
        opus_int64 t = ((opus_int64)a) * ((opus_int64)b);

        /* Guard the shift count that is actually used on the 64-bit value. */
        silk_assert(shift < 64);
        /* Keep the rounding term signed so the addition does not go */
        /* through an unsigned conversion and back.                   */
        t += ((opus_int64)1) << (shift-1);
        return t >> shift;
    }
#endif
}
NSQ_del_dec_avx2.c:silk_sar_round_smulww
Line
Count
Source
108
1.08G
{
109
#ifndef OPUS_CHECK_ASM
110
    opus_int64 t;
111
#endif
112
1.08G
    silk_assert(bits > 0 && bits < 63);
113
1.08G
#ifdef OPUS_CHECK_ASM
114
1.08G
    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
115
#else
116
    /* This code is more correct, but it won't overflow like the C code in some rare cases. */
117
    silk_assert(bits > 0 && bits < 63);
118
    t = ((opus_int64)a) * ((opus_int64)b);
119
    bits += 16;
120
    t += 1ull << (bits-1);
121
    return t >> bits;
122
#endif
123
1.08G
}
NSQ_del_dec_avx2.c:silk_sar_round_smulww
Line
Count
Source
108
3.24G
{
109
3.24G
#ifndef OPUS_CHECK_ASM
110
3.24G
    opus_int64 t;
111
3.24G
#endif
112
3.24G
    silk_assert(bits > 0 && bits < 63);
113
#ifdef OPUS_CHECK_ASM
114
    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
115
#else
116
    /* This code is more correct, but it won't overflow like the C code in some rare cases. */
117
3.24G
    silk_assert(bits > 0 && bits < 63);
118
3.24G
    t = ((opus_int64)a) * ((opus_int64)b);
119
3.24G
    bits += 16;
120
3.24G
    t += 1ull << (bits-1);
121
3.24G
    return t >> bits;
122
3.24G
#endif
123
3.24G
}
124
125
/* Saturating 32-bit addition: returns a + b, or silk_int32_MAX /     */
/* silk_int32_MIN when the exact sum does not fit in 32 bits.          */
static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b)
{
    opus_int32 total;
    if (!__builtin_sadd_overflow(a, b, &total))
    {
        return total;
    }
    /* Overflow needs operands of equal sign, so a's sign picks the bound. */
    if (a >= 0)
    {
        return silk_int32_MAX;
    }
    return silk_int32_MIN;
}
134
135
/* Arithmetic right shift of every 32-bit lane of a by bits with      */
/* rounding: 1 << (bits - 1) is added to each lane before the shift.  */
/* Requires 0 < bits < 31.                                            */
static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits)
{
    __m128i half;
    silk_assert(bits > 0 && bits < 31);
    half = _mm_set1_epi32(1 << (bits - 1));
    return _mm_srai_epi32(_mm_add_epi32(a, half), bits);
}
140
141
/* add/subtract with output saturated */
142
/* Saturating addition of the 32-bit lanes of a and b. */
static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b)
{
    const __m128i sum = _mm_add_epi32(a, b);
    /* Sign bit set where the sum's sign differs from both inputs: (sum ^ a) & (sum ^ b) */
    const __m128i overflow = _mm_and_si128(_mm_xor_si128(a, sum), _mm_xor_si128(b, sum));
    /* 0x7FFFFFFF for non-negative a, 0x80000000 for negative a: (a >> 31) + 0x7FFFFFFF */
    const __m128i bound = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF));
    /* Spread the overflow sign bit over the lane and take the bound there. */
    const __m128i take_bound = _mm_srai_epi32(overflow, 31);
    return _mm_blendv_epi8(sum, bound, take_bound);
}
149
/* Saturating subtraction (a - b) of the 32-bit lanes of a and b. */
static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b)
{
    const __m128i diff = _mm_sub_epi32(a, b);
    /* Sign bit set on overflow: (diff ^ a) & ~(diff ^ b) */
    const __m128i overflow = _mm_andnot_si128(_mm_xor_si128(b, diff), _mm_xor_si128(a, diff));
    /* 0x7FFFFFFF for non-negative a, 0x80000000 for negative a: (a >> 31) + 0x7FFFFFFF */
    const __m128i bound = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF));
    /* Spread the overflow sign bit over the lane and take the bound there. */
    const __m128i take_bound = _mm_srai_epi32(overflow, 31);
    return _mm_blendv_epi8(diff, bound, take_bound);
}
156
/* 256-bit variant: saturating subtraction (a - b) of the 32-bit lanes. */
static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b)
{
    const __m256i diff = _mm256_sub_epi32(a, b);
    /* Sign bit set on overflow: (diff ^ a) & ~(diff ^ b) */
    const __m256i overflow = _mm256_andnot_si256(_mm256_xor_si256(b, diff), _mm256_xor_si256(a, diff));
    /* 0x7FFFFFFF for non-negative a, 0x80000000 for negative a: (a >> 31) + 0x7FFFFFFF */
    const __m256i bound = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF));
    /* Spread the overflow sign bit over the lane and take the bound there. */
    const __m256i take_bound = _mm256_srai_epi32(overflow, 31);
    return _mm256_blendv_epi8(diff, bound, take_bound);
}
163
164
/* Clamps every 32-bit lane of num to the closed range spanned by     */
/* limit1 and limit2; the two limits may be passed in either order.   */
static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2)
{
    opus_int32 lower = limit1;
    opus_int32 upper = limit2;

    if (limit1 > limit2)
    {
        lower = limit2;
        upper = limit1;
    }

    return _mm_max_epi32(_mm_min_epi32(num, _mm_set1_epi32(upper)), _mm_set1_epi32(lower));
}
173
174
/* cond < 0 ? -num : num */
175
static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond)
176
4.34G
{
177
4.34G
    return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1)));
178
4.34G
}
179
/* 256-bit variant, per 32-bit lane: cond < 0 ? -num : num.             */
/* OR-ing 1 into cond keeps every lane nonzero, so the sign intrinsic   */
/* never zeroes a lane.                                                 */
static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond)
{
    const __m256i nonzero_cond = _mm256_or_si256(cond, _mm256_set1_epi32(1));
    return _mm256_sign_epi32(num, nonzero_cond);
}
183
184
/* (a32 * b32) >> 16 */
185
static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b)
186
1.04G
{
187
1.04G
    return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16));
188
1.04G
}
189
190
/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */
191
static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b)
192
201G
{
193
201G
    return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32((opus_uint32)b<<16)));
194
201G
}
195
196
/* (opus_int32)((opus_int16)(a3))) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */
197
static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b)
198
8.65G
{
199
8.65G
    const char FF = (char)0xFF;
200
8.65G
    __m256i msk = _mm256_set_epi8(
201
8.65G
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0,
202
8.65G
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0);
203
8.65G
    __m256i lo = _mm256_mullo_epi16(a, b);
204
8.65G
    __m256i hi = _mm256_mulhi_epi16(a, b);
205
8.65G
    lo = _mm256_shuffle_epi8(lo, msk);
206
8.65G
    hi = _mm256_shuffle_epi8(hi, msk);
207
8.65G
    return _mm256_unpacklo_epi16(lo, hi);
208
8.65G
}
209
210
/* Reverses the order of the eight 32-bit elements of v. */
static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v)
{
    /* 0x1B reverses the dwords inside each 128-bit half; 0x4E then */
    /* swaps the two halves.                                         */
    return _mm256_permute4x64_epi64(_mm256_shuffle_epi32(v, 0x1B), 0x4E);
}
216
217
/* Returns the (wrapping) sum of the eight 32-bit elements of v. */
static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v)
{
    const __m128i upper = _mm256_extracti128_si256(v, 1);
    const __m128i lower = _mm256_extracti128_si256(v, 0);
    __m128i acc = _mm_add_epi32(upper, lower);

    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0x4E)); /* add lanes two apart */
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0xB1)); /* add neighbouring lanes */
    return _mm_cvtsi128_si32(acc);
}
224
225
/* Horizontal minimum: every lane of the result holds the smallest */
/* signed 32-bit element of num.                                    */
static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num)
{
    const __m128i swapped_pairs = _mm_shuffle_epi32(num, 0x4E); /* 0123 -> 2301 */
    const __m128i pair_min = _mm_min_epi32(num, swapped_pairs);
    const __m128i swapped_lanes = _mm_shuffle_epi32(pair_min, 0xB1); /* 0123 -> 1032 */
    return _mm_min_epi32(pair_min, swapped_lanes);
}
231
232
/* Horizontal maximum: every lane of the result holds the largest */
/* signed 32-bit element of num.                                   */
static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num)
{
    const __m128i swapped_pairs = _mm_shuffle_epi32(num, 0x4E); /* 0123 -> 2301 */
    const __m128i pair_max = _mm_max_epi32(num, swapped_pairs);
    const __m128i swapped_lanes = _mm_shuffle_epi32(pair_max, 0xB1); /* 0123 -> 1032 */
    return _mm_max_epi32(pair_max, swapped_lanes);
}
238
239
/* Horizontal minimum of num that ignores the lanes selected by mask: */
/* those lanes are replaced by silk_int32_MAX before the reduction.   */
static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask)
{
    const __m128i neutral = _mm_set1_epi32(silk_int32_MAX);
    return silk_mm_hmin_epi32(_mm_blendv_epi8(num, neutral, mask));
}
244
245
/* Horizontal maximum of num that ignores the lanes selected by mask: */
/* those lanes are replaced by silk_int32_MIN before the reduction.   */
static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask)
{
    const __m128i neutral = _mm_set1_epi32(silk_int32_MIN);
    return silk_mm_hmax_epi32(_mm_blendv_epi8(num, neutral, mask));
}
250
251
/* Advances four 32-bit random states by one step each:               */
/* seed * RAND_MULTIPLIER + RAND_INCREMENT (wrapping) per lane.       */
/* Despite the mm256 in its name this operates on a 128-bit vector.   */
static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed)
{
    const __m128i scaled = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER));
    return _mm_add_epi32(scaled, _mm_set1_epi32(RAND_INCREMENT));
}
257
258
/* Returns the index of the lowest 32-bit lane in which a and b are  */
/* equal. At least one lane is expected to match (asserted).         */
static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b)
{
    const __m128i equal_lanes = _mm_cmpeq_epi32(a, b);
    /* movemask yields one bit per byte; keep the lowest bit of each 4-byte lane. */
    unsigned int mask = _mm_movemask_epi8(equal_lanes) & 0x1111;
    silk_assert(mask != 0);
    /* Bit position / 4 is the lane index. */
    return __builtin_ctz(mask) >> 2;
}
264
265
/* Builds a byte-shuffle control that copies the four bytes of 32-bit */
/* lane `index` into every 32-bit lane of the destination.            */
/* Requires index < 4.                                                */
static __m128i silk_index_to_selector(opus_int32 index)
{
    opus_int32 byte0, byte1, byte2, byte3;

    silk_assert(index < 4);

    /* Byte offsets of the selected lane inside the 16-byte vector. */
    byte0 = index << 2;
    byte1 = byte0 + 1;
    byte2 = byte0 + 2;
    byte3 = byte0 + 3;

    return _mm_set_epi8(
        byte3, byte2, byte1, byte0,
        byte3, byte2, byte1, byte0,
        byte3, byte2, byte1, byte0,
        byte3, byte2, byte1, byte0);
}
275
276
/* Returns the 32-bit element of num picked by selector, a byte-shuffle */
/* control such as the one built by silk_index_to_selector().           */
static opus_int32 silk_select_winner(__m128i num, __m128i selector)
{
    const __m128i picked = _mm_shuffle_epi8(num, selector);
    return _mm_cvtsi128_si32(picked);
}
280
281
/* One entry of the delayed-decision sample buffer. Each member is a   */
/* 128-bit vector from which silk_NSQ_del_dec_avx2() picks one 32-bit   */
/* lane with the winner selector, i.e. one lane per candidate state.    */
typedef struct
282
{
283
    __m128i RandState;
284
    __m128i Q_Q10;     /* Winner lane is rounded by 10 bits and written to pulses[] */
285
    __m128i Xq_Q14;    /* Winner lane is scaled by the gain and written to xq */
286
    __m128i Pred_Q15;
287
    __m128i Shape_Q14; /* Winner lane is written to NSQ->sLTP_shp_Q14 */
288
} NSQ_del_dec_sample_struct;
289
290
/* Delayed-decision state held as 128-bit vectors, one 32-bit lane per */
/* candidate state. silk_NSQ_del_dec_avx2() fills the lanes from the   */
/* scalar silk_nsq_state and writes the winner lane back at the end.   */
typedef struct
291
{
292
    __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH]; /* First NSQ_LPC_BUF_LENGTH entries mirror NSQ->sLPC_Q14 */
293
    __m128i LF_AR_Q14;                                           /* Mirrors NSQ->sLF_AR_shp_Q14 */
294
    __m128i Seed;
295
    __m128i SeedInit;                                            /* Winner lane becomes psIndices->Seed */
296
    __m128i RD_Q10;                                              /* Value minimised when the winner is chosen */
297
    __m128i Diff_Q14;                                            /* Mirrors NSQ->sDiff_shp_Q14 */
298
    __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER];                       /* Mirrors NSQ->sAR2_Q14 */
299
    NSQ_del_dec_sample_struct Samples[DECISION_DELAY];           /* Indexed modulo DECISION_DELAY */
300
} NSQ_del_dec_struct;
301
302
static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
303
    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
304
    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
305
    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
306
    const opus_int16 x16[],                    /* I    Input                           */
307
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
308
    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
309
    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
310
    opus_int subfr,                            /* I    Subframe number                 */
311
    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
312
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
313
    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
314
    const opus_int signal_type,                /* I    Signal type                     */
315
    const opus_int decisionDelay               /* I    Decision delay                  */
316
);
317
318
/*******************************************/
319
/* LPC analysis filter                     */
320
/* NB! State is kept internally and the    */
321
/* filter always starts with zero state    */
322
/* first d output samples are set to zero  */
323
/*******************************************/
324
static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
325
    opus_int16                  *out,               /* O    Output signal                           */
326
    const opus_int16            *in,                /* I    Input signal                            */
327
    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
328
    const opus_int32            len,                /* I    Signal length                           */
329
    const opus_int32            order               /* I    Filter order                            */
330
);
331
332
/******************************************/
333
/* Noise shape quantizer for one subframe */
334
/******************************************/
335
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
336
    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
337
    NSQ_del_dec_struct psDelDec[],              /* I/O  Delayed decision states            */
338
    opus_int signalType,                        /* I    Signal type                        */
339
    const opus_int32 x_Q10[],                   /* I                                       */
340
    opus_int8 pulses[],                         /* O                                       */
341
    opus_int16 xq[],                            /* O                                       */
342
    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
343
    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
344
    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
345
    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
346
    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
347
    opus_int lag,                               /* I    Pitch lag                          */
348
    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
349
    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
350
    opus_int32 LF_shp_Q14,                      /* I                                       */
351
    opus_int32 Gain_Q16,                        /* I                                       */
352
    opus_int Lambda_Q10,                        /* I                                       */
353
    opus_int offset_Q10,                        /* I                                       */
354
    opus_int length,                            /* I    Input length                       */
355
    opus_int subfr,                             /* I    Subframe number                    */
356
    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
357
    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
358
    opus_int warping_Q16,                       /* I                                       */
359
    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
360
    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
361
    opus_int decisionDelay                      /* I                                       */
362
);
363
364
/* AVX2 entry point of the delayed-decision noise shaping quantizer.      */
/* Falls back to silk_NSQ_del_dec_c() when verify_assumptions() rejects   */
/* the encoder state. Otherwise it keeps the candidate states in the      */
/* 32-bit lanes of 128-bit vectors, quantizes the frame one subframe at a */
/* time and finally writes the winning state back to NSQ, psIndices,      */
/* pulses and NSQ->xq.                                                     */
/* With OPUS_CHECK_ASM defined the C implementation is also run on copies */
/* of the state and the results of the AVX2 path are compared with it.    */
void silk_NSQ_del_dec_avx2(
365
    const silk_encoder_state *psEncC,                            /* I    Encoder State               */
366
    silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
367
    SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
368
    const opus_int16 x16[],                                      /* I    Input                       */
369
    opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
370
    const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */
371
    const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
372
    const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
373
    const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
374
    const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
375
    const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
376
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
377
    const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
378
    const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
379
    const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
380
)
381
61.5M
{
382
#ifdef OPUS_CHECK_ASM
383
    silk_nsq_state NSQ_c;
384
    SideInfoIndices psIndices_c;
385
    opus_int8 pulses_c[MAX_FRAME_LENGTH];
386
    const opus_int8 *const pulses_a = pulses;
387
388
18.6M
    /* Run the C implementation on copies so the outputs can be compared at the end */
    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
389
18.6M
    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
390
18.6M
    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
391
    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
392
                       pitchL, Lambda_Q10, LTP_scale_Q14);
393
#endif
394
395
61.5M
    /* Use the C implementation when the AVX2 path is not applicable */
    if (!verify_assumptions(psEncC))
396
36.4M
    {
397
36.4M
        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
398
36.4M
        return;
399
36.4M
    }
400
401
25.1M
    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
402
25.1M
    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
403
25.1M
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
404
25.1M
    opus_int16 *pxq;
405
25.1M
    VARDECL(opus_int32, sLTP_Q15);
406
25.1M
    VARDECL(opus_int16, sLTP);
407
25.1M
    opus_int32 HarmShapeFIRPacked_Q14;
408
25.1M
    opus_int offset_Q10;
409
25.1M
    opus_int32 Gain_Q10;
410
25.1M
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
411
25.1M
    opus_int32 delayedGain_Q10[DECISION_DELAY];
412
25.1M
    NSQ_del_dec_struct psDelDec = {0};
413
25.1M
    NSQ_del_dec_sample_struct *psSample;
414
25.1M
    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
415
25.1M
    SAVE_STACK;
416
417
25.1M
    /* Lane mask: each byte of the shifted constant is sign-extended to a   */
    /* 32-bit lane, so lanes at index >= nStatesDelayedDecision are all     */
    /* ones. These lanes are skipped by silk_mm_mask_hmin_epi32() below.    */
    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));
418
419
    /* Set unvoiced lag to the previous one, overwrite later for voiced */
420
25.1M
    lag = NSQ->lagPrev;
421
422
25.1M
    silk_assert(NSQ->prev_gain_Q16 != 0);
423
25.1M
    /* Lane n starts with seed (psIndices->Seed + n) & 3; the other state is broadcast from NSQ */
    psDelDec.Seed = _mm_and_si128(
424
25.1M
        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
425
25.1M
        _mm_set1_epi32(3));
426
25.1M
    psDelDec.SeedInit = psDelDec.Seed;
427
25.1M
    psDelDec.RD_Q10 = _mm_setzero_si128();
428
25.1M
    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
429
25.1M
    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
430
25.1M
    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
431
427M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
432
402M
    {
433
402M
        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
434
402M
    }
435
628M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
436
603M
    {
437
603M
        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
438
603M
    }
439
440
25.1M
    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
441
25.1M
    smpl_buf_idx = 0; /* index of oldest samples */
442
443
25.1M
    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);
444
445
    /* For voiced frames limit the decision delay to lower than the pitch lag */
446
25.1M
    if (psIndices->signalType == TYPE_VOICED)
447
687k
    {
448
3.14M
        for (k = 0; k < psEncC->nb_subfr; k++)
449
2.46M
        {
450
2.46M
            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
451
2.46M
        }
452
687k
    }
453
24.4M
    else
454
24.4M
    {
455
24.4M
        if (lag > 0)
456
431k
        {
457
431k
            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
458
431k
        }
459
24.4M
    }
460
461
25.1M
    /* LSF_interpolation_flag is 0 when NLSFInterpCoef_Q2 == 4 and 1 otherwise */
    if (psIndices->NLSFInterpCoef_Q2 == 4)
462
24.7M
    {
463
24.7M
        LSF_interpolation_flag = 0;
464
24.7M
    }
465
430k
    else
466
430k
    {
467
430k
        LSF_interpolation_flag = 1;
468
430k
    }
469
470
25.1M
    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
471
25.1M
    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
472
    /* Set up pointers to start of sub frame */
473
25.1M
    pxq = &NSQ->xq[psEncC->ltp_mem_length];
474
25.1M
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
475
25.1M
    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
476
25.1M
    subfr = 0;
477
112M
    for (k = 0; k < psEncC->nb_subfr; k++)
478
87.6M
    {
479
87.6M
        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
480
87.6M
        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
481
87.6M
        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];
482
483
        /* Noise shape parameters */
484
87.6M
        silk_assert(HarmShapeGain_Q14[k] >= 0);
485
87.6M
        /* Pack gain >> 2 into the low 16 bits and gain >> 1 into the high 16 bits */
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
486
87.6M
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
487
488
87.6M
        NSQ->rewhite_flag = 0;
489
87.6M
        if (psIndices->signalType == TYPE_VOICED)
490
2.46M
        {
491
            /* Voiced */
492
2.46M
            lag = pitchL[k];
493
494
            /* Re-whitening */
495
2.46M
            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
496
823k
            {
497
823k
                if (k == 2)
498
136k
                {
499
                    /* RESET DELAYED DECISIONS */
500
                    /* Find winner */
501
136k
                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
502
136k
                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
503
136k
                    Winner_selector = silk_index_to_selector(Winner_ind);
504
136k
                    /* Add silk_int32_MAX >> 4 to RD_Q10 of every lane except the winner */
                    psDelDec.RD_Q10 = _mm_add_epi32(
505
136k
                        psDelDec.RD_Q10,
506
136k
                        _mm_blendv_epi8(
507
136k
                            _mm_set1_epi32(silk_int32_MAX >> 4),
508
136k
                            _mm_setzero_si128(),
509
136k
                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));
510
511
                    /* Copy final part of signals from winner state to output and long-term filter states */
512
136k
                    last_smple_idx = smpl_buf_idx + decisionDelay;
513
4.13M
                    for (i = 0; i < decisionDelay; i++)
514
4.00M
                    {
515
4.00M
                        /* Step backwards through the circular sample buffer */
                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
516
4.00M
                        psSample = &psDelDec.Samples[last_smple_idx];
517
4.00M
                        pulses[i - decisionDelay] =
518
4.00M
                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
519
4.00M
                        pxq[i - decisionDelay] =
520
4.00M
                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
521
4.00M
                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
522
4.00M
                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
523
4.00M
                    }
524
525
136k
                    subfr = 0;
526
136k
                }
527
528
                /* Rewhiten with new A coefs */
529
823k
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
530
823k
                silk_assert(start_idx > 0);
531
532
823k
                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
533
823k
                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);
534
535
823k
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
536
823k
                NSQ->rewhite_flag = 1;
537
823k
            }
538
2.46M
        }
539
540
87.6M
        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
541
87.6M
                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);
542
543
87.6M
        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
544
87.6M
                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
545
87.6M
                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
546
87.6M
                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);
547
548
87.6M
        /* Advance input and output pointers to the next subframe */
        x16 += psEncC->subfr_length;
549
87.6M
        pulses += psEncC->subfr_length;
550
87.6M
        pxq += psEncC->subfr_length;
551
87.6M
    }
552
553
    /* Find winner */
554
25.1M
    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
555
25.1M
    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));
556
557
    /* Copy final part of signals from winner state to output and long-term filter states */
558
25.1M
    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
559
25.1M
    last_smple_idx = smpl_buf_idx + decisionDelay;
560
25.1M
    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
561
1.02G
    for (i = 0; i < decisionDelay; i++)
562
1.00G
    {
563
1.00G
        /* Step backwards through the circular sample buffer */
        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
564
1.00G
        psSample = &psDelDec.Samples[last_smple_idx];
565
566
1.00G
        pulses[i - decisionDelay] =
567
1.00G
            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
568
1.00G
        pxq[i - decisionDelay] =
569
1.00G
            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
570
1.00G
        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
571
1.00G
            silk_select_winner(psSample->Shape_Q14, Winner_selector);
572
1.00G
    }
573
427M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
574
402M
    {
575
402M
        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
576
402M
    }
577
628M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
578
603M
    {
579
603M
        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
580
603M
    }
581
582
    /* Update states */
583
25.1M
    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
584
25.1M
    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
585
25.1M
    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];
586
587
    /* Save quantized speech signal */
588
25.1M
    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
589
25.1M
    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));
590
591
#ifdef OPUS_CHECK_ASM
592
6.78M
    /* The AVX2 results must match the C implementation byte for byte */
    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
593
6.78M
    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
594
6.78M
    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
595
6.78M
#endif
596
597
6.78M
    RESTORE_STACK;
598
6.78M
}
silk_NSQ_del_dec_avx2
Line
Count
Source
381
18.6M
{
382
18.6M
#ifdef OPUS_CHECK_ASM
383
18.6M
    silk_nsq_state NSQ_c;
384
18.6M
    SideInfoIndices psIndices_c;
385
18.6M
    opus_int8 pulses_c[MAX_FRAME_LENGTH];
386
18.6M
    const opus_int8 *const pulses_a = pulses;
387
388
18.6M
    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
389
18.6M
    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
390
18.6M
    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
391
18.6M
    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
392
18.6M
                       pitchL, Lambda_Q10, LTP_scale_Q14);
393
18.6M
#endif
394
395
18.6M
    if (!verify_assumptions(psEncC))
396
11.8M
    {
397
11.8M
        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
398
11.8M
        return;
399
11.8M
    }
400
401
6.78M
    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
402
6.78M
    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
403
6.78M
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
404
6.78M
    opus_int16 *pxq;
405
6.78M
    VARDECL(opus_int32, sLTP_Q15);
406
6.78M
    VARDECL(opus_int16, sLTP);
407
6.78M
    opus_int32 HarmShapeFIRPacked_Q14;
408
6.78M
    opus_int offset_Q10;
409
6.78M
    opus_int32 Gain_Q10;
410
6.78M
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
411
6.78M
    opus_int32 delayedGain_Q10[DECISION_DELAY];
412
6.78M
    NSQ_del_dec_struct psDelDec = {0};
413
6.78M
    NSQ_del_dec_sample_struct *psSample;
414
6.78M
    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
415
6.78M
    SAVE_STACK;
416
417
6.78M
    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));
418
419
    /* Set unvoiced lag to the previous one, overwrite later for voiced */
420
6.78M
    lag = NSQ->lagPrev;
421
422
6.78M
    silk_assert(NSQ->prev_gain_Q16 != 0);
423
6.78M
    psDelDec.Seed = _mm_and_si128(
424
6.78M
        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
425
6.78M
        _mm_set1_epi32(3));
426
6.78M
    psDelDec.SeedInit = psDelDec.Seed;
427
6.78M
    psDelDec.RD_Q10 = _mm_setzero_si128();
428
6.78M
    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
429
6.78M
    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
430
6.78M
    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
431
115M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
432
108M
    {
433
108M
        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
434
108M
    }
435
169M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
436
162M
    {
437
162M
        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
438
162M
    }
439
440
6.78M
    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
441
6.78M
    smpl_buf_idx = 0; /* index of oldest samples */
442
443
6.78M
    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);
444
445
    /* For voiced frames limit the decision delay to lower than the pitch lag */
446
6.78M
    if (psIndices->signalType == TYPE_VOICED)
447
180k
    {
448
829k
        for (k = 0; k < psEncC->nb_subfr; k++)
449
648k
        {
450
648k
            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
451
648k
        }
452
180k
    }
453
6.60M
    else
454
6.60M
    {
455
6.60M
        if (lag > 0)
456
198k
        {
457
198k
            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
458
198k
        }
459
6.60M
    }
460
461
6.78M
    if (psIndices->NLSFInterpCoef_Q2 == 4)
462
6.66M
    {
463
6.66M
        LSF_interpolation_flag = 0;
464
6.66M
    }
465
123k
    else
466
123k
    {
467
123k
        LSF_interpolation_flag = 1;
468
123k
    }
469
470
6.78M
    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
471
6.78M
    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
472
    /* Set up pointers to start of sub frame */
473
6.78M
    pxq = &NSQ->xq[psEncC->ltp_mem_length];
474
6.78M
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
475
6.78M
    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
476
6.78M
    subfr = 0;
477
31.1M
    for (k = 0; k < psEncC->nb_subfr; k++)
478
24.3M
    {
479
24.3M
        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
480
24.3M
        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
481
24.3M
        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];
482
483
        /* Noise shape parameters */
484
24.3M
        silk_assert(HarmShapeGain_Q14[k] >= 0);
485
24.3M
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
486
24.3M
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
487
488
24.3M
        NSQ->rewhite_flag = 0;
489
24.3M
        if (psIndices->signalType == TYPE_VOICED)
490
648k
        {
491
            /* Voiced */
492
648k
            lag = pitchL[k];
493
494
            /* Re-whitening */
495
648k
            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
496
225k
            {
497
225k
                if (k == 2)
498
45.3k
                {
499
                    /* RESET DELAYED DECISIONS */
500
                    /* Find winner */
501
45.3k
                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
502
45.3k
                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
503
45.3k
                    Winner_selector = silk_index_to_selector(Winner_ind);
504
45.3k
                    psDelDec.RD_Q10 = _mm_add_epi32(
505
45.3k
                        psDelDec.RD_Q10,
506
45.3k
                        _mm_blendv_epi8(
507
45.3k
                            _mm_set1_epi32(silk_int32_MAX >> 4),
508
45.3k
                            _mm_setzero_si128(),
509
45.3k
                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));
510
511
                    /* Copy final part of signals from winner state to output and long-term filter states */
512
45.3k
                    last_smple_idx = smpl_buf_idx + decisionDelay;
513
1.26M
                    for (i = 0; i < decisionDelay; i++)
514
1.22M
                    {
515
1.22M
                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
516
1.22M
                        psSample = &psDelDec.Samples[last_smple_idx];
517
1.22M
                        pulses[i - decisionDelay] =
518
1.22M
                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
519
1.22M
                        pxq[i - decisionDelay] =
520
1.22M
                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
521
1.22M
                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
522
1.22M
                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
523
1.22M
                    }
524
525
45.3k
                    subfr = 0;
526
45.3k
                }
527
528
                /* Rewhiten with new A coefs */
529
225k
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
530
225k
                silk_assert(start_idx > 0);
531
532
225k
                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
533
225k
                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);
534
535
225k
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
536
225k
                NSQ->rewhite_flag = 1;
537
225k
            }
538
648k
        }
539
540
24.3M
        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
541
24.3M
                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);
542
543
24.3M
        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
544
24.3M
                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
545
24.3M
                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
546
24.3M
                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);
547
548
24.3M
        x16 += psEncC->subfr_length;
549
24.3M
        pulses += psEncC->subfr_length;
550
24.3M
        pxq += psEncC->subfr_length;
551
24.3M
    }
552
553
    /* Find winner */
554
6.78M
    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
555
6.78M
    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));
556
557
    /* Copy final part of signals from winner state to output and long-term filter states */
558
6.78M
    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
559
6.78M
    last_smple_idx = smpl_buf_idx + decisionDelay;
560
6.78M
    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
561
276M
    for (i = 0; i < decisionDelay; i++)
562
269M
    {
563
269M
        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
564
269M
        psSample = &psDelDec.Samples[last_smple_idx];
565
566
269M
        pulses[i - decisionDelay] =
567
269M
            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
568
269M
        pxq[i - decisionDelay] =
569
269M
            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
570
269M
        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
571
269M
            silk_select_winner(psSample->Shape_Q14, Winner_selector);
572
269M
    }
573
115M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
574
108M
    {
575
108M
        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
576
108M
    }
577
169M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
578
162M
    {
579
162M
        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
580
162M
    }
581
582
    /* Update states */
583
6.78M
    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
584
6.78M
    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
585
6.78M
    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];
586
587
    /* Save quantized speech signal */
588
6.78M
    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
589
6.78M
    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));
590
591
6.78M
#ifdef OPUS_CHECK_ASM
592
6.78M
    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
593
6.78M
    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
594
6.78M
    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
595
6.78M
#endif
596
597
6.78M
    RESTORE_STACK;
598
6.78M
}
silk_NSQ_del_dec_avx2
Line
Count
Source
381
42.9M
{
382
#ifdef OPUS_CHECK_ASM
383
    silk_nsq_state NSQ_c;
384
    SideInfoIndices psIndices_c;
385
    opus_int8 pulses_c[MAX_FRAME_LENGTH];
386
    const opus_int8 *const pulses_a = pulses;
387
388
    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
389
    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
390
    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
391
    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
392
                       pitchL, Lambda_Q10, LTP_scale_Q14);
393
#endif
394
395
42.9M
    if (!verify_assumptions(psEncC))
396
24.5M
    {
397
24.5M
        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
398
24.5M
        return;
399
24.5M
    }
400
401
18.3M
    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
402
18.3M
    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
403
18.3M
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
404
18.3M
    opus_int16 *pxq;
405
18.3M
    VARDECL(opus_int32, sLTP_Q15);
406
18.3M
    VARDECL(opus_int16, sLTP);
407
18.3M
    opus_int32 HarmShapeFIRPacked_Q14;
408
18.3M
    opus_int offset_Q10;
409
18.3M
    opus_int32 Gain_Q10;
410
18.3M
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
411
18.3M
    opus_int32 delayedGain_Q10[DECISION_DELAY];
412
18.3M
    NSQ_del_dec_struct psDelDec = {0};
413
18.3M
    NSQ_del_dec_sample_struct *psSample;
414
18.3M
    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
415
18.3M
    SAVE_STACK;
416
417
18.3M
    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));
418
419
    /* Set unvoiced lag to the previous one, overwrite later for voiced */
420
18.3M
    lag = NSQ->lagPrev;
421
422
18.3M
    silk_assert(NSQ->prev_gain_Q16 != 0);
423
18.3M
    psDelDec.Seed = _mm_and_si128(
424
18.3M
        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
425
18.3M
        _mm_set1_epi32(3));
426
18.3M
    psDelDec.SeedInit = psDelDec.Seed;
427
18.3M
    psDelDec.RD_Q10 = _mm_setzero_si128();
428
18.3M
    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
429
18.3M
    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
430
18.3M
    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
431
312M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
432
293M
    {
433
293M
        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
434
293M
    }
435
458M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
436
440M
    {
437
440M
        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
438
440M
    }
439
440
18.3M
    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
441
18.3M
    smpl_buf_idx = 0; /* index of oldest samples */
442
443
18.3M
    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);
444
445
    /* For voiced frames limit the decision delay to lower than the pitch lag */
446
18.3M
    if (psIndices->signalType == TYPE_VOICED)
447
506k
    {
448
2.31M
        for (k = 0; k < psEncC->nb_subfr; k++)
449
1.81M
        {
450
1.81M
            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
451
1.81M
        }
452
506k
    }
453
17.8M
    else
454
17.8M
    {
455
17.8M
        if (lag > 0)
456
233k
        {
457
233k
            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
458
233k
        }
459
17.8M
    }
460
461
18.3M
    if (psIndices->NLSFInterpCoef_Q2 == 4)
462
18.0M
    {
463
18.0M
        LSF_interpolation_flag = 0;
464
18.0M
    }
465
306k
    else
466
306k
    {
467
306k
        LSF_interpolation_flag = 1;
468
306k
    }
469
470
18.3M
    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
471
18.3M
    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
472
    /* Set up pointers to start of sub frame */
473
18.3M
    pxq = &NSQ->xq[psEncC->ltp_mem_length];
474
18.3M
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
475
18.3M
    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
476
18.3M
    subfr = 0;
477
81.6M
    for (k = 0; k < psEncC->nb_subfr; k++)
478
63.2M
    {
479
63.2M
        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
480
63.2M
        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
481
63.2M
        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];
482
483
        /* Noise shape parameters */
484
63.2M
        silk_assert(HarmShapeGain_Q14[k] >= 0);
485
63.2M
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
486
63.2M
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
487
488
63.2M
        NSQ->rewhite_flag = 0;
489
63.2M
        if (psIndices->signalType == TYPE_VOICED)
490
1.81M
        {
491
            /* Voiced */
492
1.81M
            lag = pitchL[k];
493
494
            /* Re-whitening */
495
1.81M
            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
496
598k
            {
497
598k
                if (k == 2)
498
91.4k
                {
499
                    /* RESET DELAYED DECISIONS */
500
                    /* Find winner */
501
91.4k
                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
502
91.4k
                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
503
91.4k
                    Winner_selector = silk_index_to_selector(Winner_ind);
504
91.4k
                    psDelDec.RD_Q10 = _mm_add_epi32(
505
91.4k
                        psDelDec.RD_Q10,
506
91.4k
                        _mm_blendv_epi8(
507
91.4k
                            _mm_set1_epi32(silk_int32_MAX >> 4),
508
91.4k
                            _mm_setzero_si128(),
509
91.4k
                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));
510
511
                    /* Copy final part of signals from winner state to output and long-term filter states */
512
91.4k
                    last_smple_idx = smpl_buf_idx + decisionDelay;
513
2.87M
                    for (i = 0; i < decisionDelay; i++)
514
2.78M
                    {
515
2.78M
                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
516
2.78M
                        psSample = &psDelDec.Samples[last_smple_idx];
517
2.78M
                        pulses[i - decisionDelay] =
518
2.78M
                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
519
2.78M
                        pxq[i - decisionDelay] =
520
2.78M
                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
521
2.78M
                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
522
2.78M
                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
523
2.78M
                    }
524
525
91.4k
                    subfr = 0;
526
91.4k
                }
527
528
                /* Rewhiten with new A coefs */
529
598k
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
530
598k
                silk_assert(start_idx > 0);
531
532
598k
                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
533
598k
                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);
534
535
598k
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
536
598k
                NSQ->rewhite_flag = 1;
537
598k
            }
538
1.81M
        }
539
540
63.2M
        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
541
63.2M
                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);
542
543
63.2M
        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
544
63.2M
                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
545
63.2M
                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
546
63.2M
                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);
547
548
63.2M
        x16 += psEncC->subfr_length;
549
63.2M
        pulses += psEncC->subfr_length;
550
63.2M
        pxq += psEncC->subfr_length;
551
63.2M
    }
552
553
    /* Find winner */
554
18.3M
    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
555
18.3M
    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));
556
557
    /* Copy final part of signals from winner state to output and long-term filter states */
558
18.3M
    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
559
18.3M
    last_smple_idx = smpl_buf_idx + decisionDelay;
560
18.3M
    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
561
749M
    for (i = 0; i < decisionDelay; i++)
562
730M
    {
563
730M
        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
564
730M
        psSample = &psDelDec.Samples[last_smple_idx];
565
566
730M
        pulses[i - decisionDelay] =
567
730M
            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
568
730M
        pxq[i - decisionDelay] =
569
730M
            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
570
730M
        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
571
730M
            silk_select_winner(psSample->Shape_Q14, Winner_selector);
572
730M
    }
573
312M
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
574
293M
    {
575
293M
        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
576
293M
    }
577
458M
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
578
440M
    {
579
440M
        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
580
440M
    }
581
582
    /* Update states */
583
18.3M
    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
584
18.3M
    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
585
18.3M
    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];
586
587
    /* Save quantized speech signal */
588
18.3M
    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
589
18.3M
    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));
590
591
#ifdef OPUS_CHECK_ASM
592
    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
593
    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
594
    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
595
#endif
596
597
18.3M
    RESTORE_STACK;
598
18.3M
}
599
600
static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order)
601
4.32G
{
602
4.32G
    __m256i out;
603
4.32G
    silk_assert(order == 10 || order == 16);
604
605
    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
606
4.32G
    out = _mm256_set1_epi32(order >> 1);
607
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */
608
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */
609
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */
610
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */
611
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */
612
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */
613
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */
614
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */
615
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */
616
4.32G
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */
617
618
4.32G
    if (order == 16)
619
951M
    {
620
951M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */
621
951M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */
622
951M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */
623
951M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */
624
951M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */
625
951M
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */
626
951M
    }
627
4.32G
    return silk_cvtepi64_epi32_high(out);
628
4.32G
}
629
630
/******************************************/
631
/* Noise shape quantizer for one subframe */
632
/******************************************/
633
/*
 * Delayed-decision noise-shaping quantizer for `length` samples of one
 * subframe (AVX2 variant).
 *
 * State layout as used below: every __m128i carries one 32-bit value per
 * decision state (four lanes); the 256-bit SS_* vectors carry two candidate
 * sets, the low 128 bits being the first set and the high 128 bits the
 * second set.
 *
 * Per sample the loop:
 *   1. computes the scalar long-term prediction (voiced only) and the
 *      long-term shaping term (lag > 0), both shared by all states;
 *   2. advances the per-state dither seed and computes the short-term
 *      prediction and the noise-shaping feedback per state;
 *   3. forms the residual, derives two quantization candidates per state
 *      with their rate-distortion cost, and orders them best / second best;
 *   4. picks the winning state, penalizes states whose delayed RandState
 *      differs from the winner's, and may overwrite the worst first-set
 *      state with the best second-set state;
 *   5. writes the delayed winner sample (pulses, xq, both LTP histories)
 *      once `subfr > 0 || i >= decisionDelay`;
 *   6. stores the first set back into psDelDec and the sample ring buffer.
 *
 * Side effects visible here: NSQ->sLTP_shp_buf_idx and NSQ->sLTP_buf_idx are
 * incremented once per sample, *smpl_buf_idx is stepped once per sample, and
 * delayedGain_Q10 receives the current gain at the new ring position.
 * After the loop, sLPC_Q14[length .. length + NSQ_LPC_BUF_LENGTH - 1] is
 * copied to the front of sLPC_Q14.
 */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
634
    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
635
    NSQ_del_dec_struct *psDelDec,               /* I/O  Delayed decision states            */
636
    opus_int signalType,                        /* I    Signal type                        */
637
    const opus_int32 x_Q10[],                   /* I                                       */
638
    opus_int8 pulses[],                         /* O                                       */
639
    opus_int16 xq[],                            /* O                                       */
640
    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
641
    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
642
    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
643
    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
644
    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
645
    opus_int lag,                               /* I    Pitch lag                          */
646
    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
647
    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
648
    opus_int32 LF_shp_Q14,                      /* I                                       */
649
    opus_int32 Gain_Q16,                        /* I                                       */
650
    opus_int Lambda_Q10,                        /* I                                       */
651
    opus_int offset_Q10,                        /* I                                       */
652
    opus_int length,                            /* I    Input length                       */
653
    opus_int subfr,                             /* I    Subframe number                    */
654
    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
655
    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
656
    opus_int warping_Q16,                       /* I                                       */
657
    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
658
    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
659
    opus_int decisionDelay                      /* I                                       */
660
)
661
87.6M
{
662
87.6M
    int i;
663
87.6M
    /* Read cursors into the shaping and LTP histories: `lag` samples behind */
    /* the current write index, advanced by half the respective tap count.   */
    opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2];
664
87.6M
    opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2];
665
87.6M
    /* Q16 -> Q10; written to delayedGain_Q10 at the end of every iteration */
    opus_int32 Gain_Q10 = Gain_Q16 >> 6;
666
667
4.41G
    for (i = 0; i < length; i++)
668
4.32G
    {
669
        /* Perform common calculations used in all states */
670
        /* NSQ_sample_struct */
671
        /* Low  128 bits => 1st set */
672
        /* High 128 bits => 2nd set */
673
4.32G
        int j;
674
4.32G
        __m256i SS_Q_Q10;
675
4.32G
        __m256i SS_RD_Q10;
676
4.32G
        __m256i SS_xq_Q14;
677
4.32G
        __m256i SS_LF_AR_Q14;
678
4.32G
        __m256i SS_Diff_Q14;
679
4.32G
        __m256i SS_sLTP_shp_Q14;
680
4.32G
        __m256i SS_LPC_exc_Q14;
681
4.32G
        __m256i exc_Q14;
682
4.32G
        __m256i q_Q10, rr_Q10, rd_Q10;
683
4.32G
        __m256i mask;
684
4.32G
        __m128i LPC_pred_Q14, n_AR_Q14;
685
4.32G
        __m128i RDmin_Q10, RDmax_Q10;
686
4.32G
        __m128i n_LF_Q14;
687
4.32G
        __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10;
688
4.32G
        __m128i Winner_rand_state, Winner_selector;
689
4.32G
        __m128i tmp0, tmp1;
690
4.32G
        NSQ_del_dec_sample_struct *psLastSample, *psSample;
691
4.32G
        opus_int32 RDmin_ind, RDmax_ind, last_smple_idx;
692
4.32G
        opus_int32 LTP_pred_Q14, n_LTP_Q14;
693
694
        /* Long-term prediction */
695
4.32G
        if (signalType == TYPE_VOICED)
696
115M
        {
697
            /* Unrolled loop */
698
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
699
115M
            LTP_pred_Q14 = 2;
700
115M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]);
701
115M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]);
702
115M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]);
703
115M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]);
704
115M
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]);
705
115M
            LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */
706
115M
            pred_lag_ptr++;
707
115M
        }
708
4.21G
        else
709
4.21G
        {
710
4.21G
            LTP_pred_Q14 = 0;
711
4.21G
        }
712
713
        /* Long-term shaping */
714
4.32G
        if (lag > 0)
715
183M
        {
716
            /* Symmetric, packed FIR coefficients */
717
183M
            n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]);
718
183M
            n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14);
719
183M
            n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14);
720
183M
            n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */
721
183M
            shp_lag_ptr++;
722
183M
        }
723
4.14G
        else
724
4.14G
        {
725
4.14G
            n_LTP_Q14 = 0;
726
4.14G
        }
727
728
        /* BEGIN Updating Delayed Decision States */
729
730
        /* Generate dither */
731
4.32G
        psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed);
732
733
        /* Short-term prediction */
734
4.32G
        /* The history for sample i ends at index NSQ_LPC_BUF_LENGTH - 1 + i; */
        /* the value for sample i itself is stored at i + NSQ_LPC_BUF_LENGTH   */
        /* further down in this iteration.                                     */
        LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder);
735
4.32G
        LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */
736
737
        /* Noise shape feedback */
738
4.32G
        silk_assert(shapingLPCOrder > 0);
739
4.32G
        silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */
740
        /* Output of lowpass section */
741
4.32G
        tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16));
742
4.32G
        /* Accumulator starts at shapingLPCOrder / 2 - presumably a rounding-bias */
        /* offset like the 2 used for LTP_pred_Q14 above; TODO confirm.           */
        n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1);
743
94.4G
        /* Each iteration stores the previous section output in sAR2_Q14[j],  */
        /* accumulates its weighted contribution, then forms the next section */
        /* output from the old sAR2_Q14[j] and sAR2_Q14[j + 1].               */
        for (j = 0; j < shapingLPCOrder - 1; j++)
744
90.0G
        {
745
            /* Output of allpass section */
746
90.0G
            tmp1 = psDelDec->sAR2_Q14[j];
747
90.0G
            psDelDec->sAR2_Q14[j] = tmp0;
748
90.0G
            n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j]));
749
90.0G
            tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16));
750
90.0G
        }
751
4.32G
        psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0;
752
4.32G
        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1]));
753
754
4.32G
        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1);                                                  /* Q11 -> Q12 */
755
4.32G
        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */
756
4.32G
        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2);                                                  /* Q12 -> Q14 */
757
758
4.32G
        tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */
759
4.32G
        tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16);                  /* Q12 */
760
4.32G
        n_LF_Q14 = _mm_add_epi32(tmp0, tmp1);                                                /* Q12 */
761
4.32G
        n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2);                                              /* Q12 -> Q14 */
762
763
        /* Input minus prediction plus noise feedback                       */
764
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
765
4.32G
        /* NOTE(review): the "Q13" tags on the next lines look stale - both   */
        /* operands are named _Q14 and the shift by 4 is annotated as giving  */
        /* Q10. Confirm before relying on them.                               */
        tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14);              /* Q14 */
766
4.32G
        tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q13 */
767
4.32G
        tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0);                      /* Q13 */
768
4.32G
        tmp0 = silk_mm_srai_round_epi32(tmp0, 4);                      /* Q10 */
769
770
4.32G
        r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */
771
772
        /* Flip sign depending on dither */
773
4.32G
        r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed);
774
4.32G
        r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10);
775
776
        /* Find two quantization level candidates and measure their rate-distortion */
777
4.32G
        q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10));
778
4.32G
        /* Default first candidate: arithmetic shift, i.e. floor(q1_Q10 / 1024). */
        /* The branch below replaces it when Lambda_Q10 exceeds 2048.            */
        q1_Q0 = _mm_srai_epi32(q1_Q10, 10);
779
4.32G
        if (Lambda_Q10 > 2048)
780
16.4M
        {
781
            /* For aggressive RDO, the bias becomes more than one pulse. */
782
16.4M
            tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */
783
16.4M
            /* Fallback is the sign mask of q1_Q10 (0 or -1); lanes where the      */
            /* offset-reduced magnitude is positive take that magnitude, re-signed */
            /* via silk_mm_sign_epi32 and shifted down to Q0, instead.             */
            q1_Q0 = _mm_srai_epi32(q1_Q10, 31);
784
16.4M
            tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128());
785
16.4M
            tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10);
786
16.4M
            q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1);
787
16.4M
        }
788
789
4.32G
        /* _mm_sign_epi32 negates, keeps or zeroes the adjustment according to */
        /* whether q1_Q0 is negative, positive or zero.                        */
        tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0);
790
4.32G
        q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0);
791
4.32G
        q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10));
792
793
        /* check if q1_Q0 is 0 or -1 */
794
4.32G
        /* (sign bit + value) is zero only for 0 and -1; those lanes step by */
        /* 1024 - QUANT_LEVEL_ADJUST_Q10 to the second candidate, all others */
        /* step by 1024.                                                     */
        tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0);
795
4.32G
        tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128());
796
4.32G
        tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1);
797
4.32G
        q2_Q10 = _mm_add_epi32(q1_Q10, tmp0);
798
4.32G
        /* Low half = first candidate, high half = second candidate */
        q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10);
799
800
4.32G
        rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10);
801
4.32G
        rd_Q10 = _mm256_abs_epi32(q_Q10);
802
4.32G
        rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10);
803
4.32G
        rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10));
804
4.32G
        rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10);
805
4.32G
        rd_Q10 = _mm256_srai_epi32(rd_Q10, 10);
806
807
4.32G
        /* mask is all-ones in lanes where the first candidate is strictly      */
        /* cheaper than the second. The blends below then put the cheaper      */
        /* candidate in the low half and the other one in the high half, and   */
        /* add the state's accumulated RD_Q10 to both.                         */
        mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1)));
808
4.32G
        SS_RD_Q10 = _mm256_add_epi32(
809
4.32G
            _mm256_broadcastsi128_si256(psDelDec->RD_Q10),
810
4.32G
            _mm256_blendv_epi8(
811
4.32G
                _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1),
812
4.32G
                rd_Q10,
813
4.32G
                mask));
814
4.32G
        SS_Q_Q10 = _mm256_blendv_epi8(
815
4.32G
            _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1),
816
4.32G
            q_Q10,
817
4.32G
            mask);
818
819
        /* Update states for best and second best quantization */
820
821
        /* Quantized excitation */
822
4.32G
        /* Q10 -> Q14, then the same seed-dependent sign flip as applied to r_Q10 */
        exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed));
823
824
        /* Add predictions */
825
4.32G
        exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14));
826
4.32G
        /* Doubled copy; stored as Pred_Q15 in the sample ring buffer below */
        SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1);
827
4.32G
        SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14));
828
829
        /* Update states */
830
4.32G
        SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4)));
831
4.32G
        SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14));
832
4.32G
        SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14));
833
834
        /* END Updating Delayed Decision States */
835
836
4.32G
        /* Step the circular sample index back by one; last_smple_idx is the */
        /* slot decisionDelay positions ahead of it in the ring.             */
        *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY;
837
4.32G
        last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY;
838
4.32G
        psLastSample = &psDelDec->Samples[last_smple_idx];
839
840
        /* Find winner */
841
4.32G
        /* Minimum over the first set (low 128 bits); MaskDelDec presumably   */
        /* excludes inactive states - the helpers are defined outside this    */
        /* block, verify there. Winner_selector is a byte-shuffle control     */
        /* built from the index of the first lane equal to that minimum.      */
        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec);
842
4.32G
        Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10)));
843
844
        /* Increase RD values of expired states */
845
4.32G
        /* Lanes whose RandState at the delayed sample equals the winner's keep */
        /* their RD; every other lane gets silk_int32_MAX >> 4 added, in both   */
        /* the first and the second set.                                        */
        Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector);
846
847
4.32G
        SS_RD_Q10 = _mm256_blendv_epi8(
848
4.32G
            _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)),
849
4.32G
            SS_RD_Q10,
850
4.32G
            _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state)));
851
852
        /* find worst in first set */
853
4.32G
        RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec);
854
        /* find best in second set */
855
4.32G
        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec);
856
857
        /* Replace a state if best from second set outperforms worst in first set */
858
4.32G
        tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10);
859
4.32G
        if (!_mm_test_all_zeros(tmp0, tmp0))
860
996M
        {
861
996M
            int t;
862
996M
            RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0));
863
996M
            RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1));
864
996M
            /* tmp1: all-ones in 32-bit lane RDmax_ind, zero elsewhere (byte */
            /* RDmax_ind is set to 0xFF and each byte is sign-extended).     */
            tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3)));
865
996M
            /* tmp0: byte-shuffle control that is the identity everywhere except */
            /* in lane RDmax_ind, where it takes silk_index_to_selector(RDmin_ind). */
            tmp0 = _mm_blendv_epi8(
866
996M
                _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0),
867
996M
                silk_index_to_selector(RDmin_ind),
868
996M
                tmp1);
869
69.6G
            /* Apply that lane shuffle to the stored per-state history;       */
            /* sLPC_Q14 entries below index i are left untouched.             */
            for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++)
870
68.6G
            {
871
68.6G
                psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0);
872
68.6G
            }
873
996M
            psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0);
874
996M
            psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0);
875
24.9G
            for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++)
876
23.9G
            {
877
23.9G
                psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0);
878
23.9G
            }
879
40.8G
            for (t = 0; t < DECISION_DELAY; t++)
880
39.8G
            {
881
39.8G
                psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0);
882
39.8G
                psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0);
883
39.8G
                psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0);
884
39.8G
                psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0);
885
39.8G
                psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0);
886
39.8G
            }
887
996M
            /* 32-bit permute indices for the low half: identity, except lane   */
            /* RDmax_ind reads element RDmin_ind + 4, i.e. lane RDmin_ind of    */
            /* the second set. Only the low 128 bits of the permuted SS_*       */
            /* vectors are consumed further down.                               */
            mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1));
888
996M
            SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask);
889
996M
            SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask);
890
996M
            SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask);
891
996M
            SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask);
892
996M
            SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask);
893
996M
            SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask);
894
996M
            SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask);
895
996M
        }
896
897
        /* Write samples from winner to output and long-term filter states */
898
4.32G
        /* Output trails the input by decisionDelay samples, so nothing is */
        /* written for the first decisionDelay samples of subframe 0.      */
        if (subfr > 0 || i >= decisionDelay)
899
3.32G
        {
900
3.32G
            pulses[i - decisionDelay] =
901
3.32G
                (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10);
902
3.32G
            xq[i - decisionDelay] =
903
3.32G
                silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8));
904
3.32G
            NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] =
905
3.32G
                silk_select_winner(psLastSample->Shape_Q14, Winner_selector);
906
3.32G
            sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] =
907
3.32G
                silk_select_winner(psLastSample->Pred_Q15, Winner_selector);
908
3.32G
        }
909
4.32G
        /* Both history indices advance for every sample, written or not */
        NSQ->sLTP_shp_buf_idx++;
910
4.32G
        NSQ->sLTP_buf_idx++;
911
912
        /* Update states */
913
4.32G
        /* Only the low 128 bits (first set) of each SS_* vector are kept */
        psSample = &psDelDec->Samples[*smpl_buf_idx];
914
4.32G
        /* The seed is advanced by the quantized value shifted from Q10 to Q0 */
        psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10));
915
4.32G
        psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14);
916
4.32G
        psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14);
917
4.32G
        psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14);
918
4.32G
        psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10);
919
4.32G
        psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14);
920
4.32G
        psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10);
921
4.32G
        psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14);
922
4.32G
        psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14);
923
4.32G
        psSample->RandState = psDelDec->Seed;
924
4.32G
        delayedGain_Q10[*smpl_buf_idx] = Gain_Q10;
925
4.32G
    }
926
    /* Update LPC states */
927
1.49G
    /* Move the NSQ_LPC_BUF_LENGTH entries starting at index `length` to the */
    /* front of the buffer so they serve as history for the next call.       */
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
928
1.40G
    {
929
1.40G
        psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i];
930
1.40G
    }
931
87.6M
}
932
933
static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
934
    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
935
    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
936
    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
937
    const opus_int16 x16[],                    /* I    Input                           */
938
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
939
    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
940
    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
941
    opus_int subfr,                            /* I    Subframe number                 */
942
    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
943
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
944
    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
945
    const opus_int signal_type,                /* I    Signal type                     */
946
    const opus_int decisionDelay               /* I    Decision delay                  */
947
)
948
87.6M
{
949
87.6M
    int i;
950
87.6M
    opus_int lag;
951
87.6M
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
952
87.6M
    NSQ_del_dec_sample_struct *psSample;
953
954
87.6M
    lag = pitchL[subfr];
955
87.6M
    inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);
956
87.6M
    silk_assert(inv_gain_Q31 != 0);
957
958
    /* Scale input */
959
87.6M
    inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);
960
1.16G
    for (i = 0; i < psEncC->subfr_length; i+=4)
961
1.08G
    {
962
1.08G
        __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
963
1.08G
        x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
964
1.08G
        _mm_storeu_si128((__m128i*)(void*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
965
1.08G
    }
966
967
    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
968
87.6M
    if (NSQ->rewhite_flag)
969
823k
    {
970
823k
        if (subfr == 0)
971
687k
        {
972
            /* Do LTP downscaling */
973
687k
            inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);
974
687k
        }
975
67.5M
        for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)
976
66.7M
        {
977
66.7M
            silk_assert(i < MAX_FRAME_LENGTH);
978
66.7M
            sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);
979
66.7M
        }
980
823k
    }
981
982
    /* Adjust for changing gain */
983
87.6M
    if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)
984
6.05M
    {
985
6.05M
        gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);
986
987
        /* Scale long-term shaping state */
988
308M
        for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
989
302M
        {
990
302M
      opus_int32 *p = &NSQ->sLTP_shp_Q14[i];
991
302M
            _mm_storeu_si128((__m128i*)(void*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)(void*)p), gain_adj_Q16));
992
302M
        }
993
994
        /* Scale long-term prediction state */
995
6.05M
        if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)
996
1.13M
        {
997
59.1M
            for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)
998
58.0M
            {
999
58.0M
                sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;
1000
58.0M
            }
1001
1.13M
        }
1002
1003
        /* Scale scalar states */
1004
6.05M
        psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16);
1005
6.05M
        psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16);
1006
1007
        /* Scale short-term prediction and shaping states */
1008
102M
        for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
1009
96.8M
        {
1010
96.8M
            psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);
1011
96.8M
        }
1012
248M
        for (i = 0; i < DECISION_DELAY; i++)
1013
242M
        {
1014
242M
            psSample = &psDelDec->Samples[i];
1015
242M
            psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);
1016
242M
            psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);
1017
242M
        }
1018
151M
        for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
1019
145M
        {
1020
145M
            psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);
1021
145M
        }
1022
1023
        /* Save inverse gain */
1024
6.05M
        NSQ->prev_gain_Q16 = Gains_Q16[subfr];
1025
6.05M
    }
1026
87.6M
}
1027
1028
static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
1029
    opus_int16                  *out,               /* O    Output signal                           */
1030
    const opus_int16            *in,                /* I    Input signal                            */
1031
    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
1032
    const opus_int32            len,                /* I    Signal length                           */
1033
    const opus_int32            order               /* I    Filter order                            */
1034
)
1035
823k
{
1036
823k
    int i;
1037
823k
    opus_int32       out32_Q12, out32;
1038
823k
    silk_assert(order == 10 || order == 16);
1039
1040
67.5M
    for(i = order; i < len; i++ )
1041
66.7M
    {
1042
66.7M
        const opus_int16 *in_ptr = &in[ i ];
1043
        /* Allowing wrap around so that two wraps can cancel each other. The rare
1044
           cases where the result wraps around can only be triggered by invalid streams*/
1045
1046
66.7M
        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&in_ptr[-8]));
1047
66.7M
        __m256i B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&      B[0]));
1048
66.7M
        __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
1049
66.7M
        if (order > 10)
1050
8.84M
        {
1051
8.84M
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&in_ptr[-16]));
1052
8.84M
            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&B       [8]));
1053
8.84M
            B_v  = silk_mm256_reverse_epi32(B_v);
1054
8.84M
        }
1055
57.8M
        else
1056
57.8M
        {
1057
57.8M
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10]));
1058
57.8M
            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B       [8]));
1059
57.8M
            B_v  = _mm256_shuffle_epi32(B_v, 0x01);
1060
57.8M
        }
1061
66.7M
        sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v));
1062
1063
66.7M
        out32_Q12 = silk_mm256_hsum_epi32(sum);
1064
1065
        /* Subtract prediction */
1066
66.7M
        out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 );
1067
1068
        /* Scale to Q0 */
1069
66.7M
        out32 = silk_sar_round_32(out32_Q12, 12);
1070
1071
        /* Saturate output */
1072
66.7M
        out[ i ] = silk_sat16(out32);
1073
66.7M
    }
1074
1075
    /* Set first d output samples to zero */
1076
823k
    silk_memset( out, 0, order * sizeof( opus_int16 ) );
1077
823k
}