Coverage Report

Created: 2024-09-06 07:53

/src/opus/silk/x86/NSQ_sse4_1.c
Line
Count
Source (jump to first uncovered line)
1
/* Copyright (c) 2014-2020, Cisco Systems, INC
2
   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
3
4
   Redistribution and use in source and binary forms, with or without
5
   modification, are permitted provided that the following conditions
6
   are met:
7
8
   - Redistributions of source code must retain the above copyright
9
   notice, this list of conditions and the following disclaimer.
10
11
   - Redistributions in binary form must reproduce the above copyright
12
   notice, this list of conditions and the following disclaimer in the
13
   documentation and/or other materials provided with the distribution.
14
15
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
#ifdef HAVE_CONFIG_H
29
#include "config.h"
30
#endif
31
32
#include <xmmintrin.h>
33
#include <emmintrin.h>
34
#include <smmintrin.h>
35
#include "main.h"
36
#include "celt/x86/x86cpu.h"
37
#include "stack_alloc.h"
38
39
/* Forward declaration of the SSE4.1 state-scaling helper that silk_NSQ_sse4_1()
   calls once per subframe before quantizing; the definition appears further
   down in this file. */
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
40
    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
41
    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
42
    const opus_int16    x16[],                     /* I    input                           */
43
    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
44
    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
45
    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
46
    opus_int            subfr,                     /* I    subframe number                 */
47
    const opus_int      LTP_scale_Q14,             /* I    LTP state scaling               */
48
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I    Quantization step sizes         */
49
    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
50
    const opus_int      signal_type                /* I    Signal type                     */
51
);
52
53
/* Forward declaration of the SSE4.1 noise-shaping quantizer. silk_NSQ_sse4_1()
   selects it when shapingLPCOrder is 10 and predictLPCOrder is 16; the
   definition appears further down in this file. */
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
54
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
55
    opus_int            signalType,             /* I    Signal type                     */
56
    const opus_int32    x_sc_Q10[],             /* I    Input scaled with 1/Gain        */
57
    opus_int8           pulses[],               /* O    Quantized pulse signal          */
58
    opus_int16          xq[],                   /* O    Quantized signal, Gain applied  */
59
    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
60
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
61
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
62
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
63
    opus_int            lag,                    /* I    Pitch lag                       */
64
    opus_int32          HarmShapeFIRPacked_Q14, /* I    Packed harmonic shaping FIR     */
65
    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
66
    opus_int32          LF_shp_Q14,             /* I    Low frequency shaping coefs     */
67
    opus_int32          Gain_Q16,               /* I    Quantization step size          */
68
    opus_int            Lambda_Q10,             /* I    Rate/distortion tradeoff        */
69
    opus_int            offset_Q10,             /* I    Quantization offset             */
70
    opus_int            length,                 /* I    Input length                    */
71
    opus_int32          table[][4]              /* I    Level table; caller passes &table[32] so negative indices are valid */
72
);
73
74
void silk_NSQ_sse4_1(
75
    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
76
    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
77
    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
78
    const opus_int16            x16[],                                        /* I    Input                           */
79
    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
80
    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
81
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
82
    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
83
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
84
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
85
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
86
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
87
    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
88
    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
89
    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
90
)
91
0
{
92
0
    opus_int            k, lag, start_idx, LSF_interpolation_flag;
93
0
    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
94
0
    opus_int16          *pxq;
95
0
    VARDECL( opus_int32, sLTP_Q15 );
96
0
    VARDECL( opus_int16, sLTP );
97
0
    opus_int32          HarmShapeFIRPacked_Q14;
98
0
    opus_int            offset_Q10;
99
0
    VARDECL( opus_int32, x_sc_Q10 );
100
101
0
    opus_int32   table[ 64 ][ 4 ];
102
0
    opus_int32   tmp1;
103
0
    opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
104
105
#ifdef OPUS_CHECK_ASM
106
    silk_nsq_state NSQ_c;
107
    SideInfoIndices psIndices_c;
108
    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
109
    const opus_int8 *const pulses_a = pulses;
110
#endif
111
112
0
    SAVE_STACK;
113
114
#ifdef OPUS_CHECK_ASM
115
    ( void )pulses_a;
116
    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
117
    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
118
    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
119
    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
120
121
    silk_NSQ_c(
122
        psEncC,
123
        &NSQ_c,
124
        &psIndices_c,
125
        x16,
126
        pulses_c,
127
        PredCoef_Q12,
128
        LTPCoef_Q14,
129
        AR_Q13,
130
        HarmShapeGain_Q14,
131
        Tilt_Q14,
132
        LF_shp_Q14,
133
        Gains_Q16,
134
        pitchL,
135
        Lambda_Q10,
136
        LTP_scale_Q14
137
    );
138
#endif
139
140
0
    NSQ->rand_seed = psIndices->Seed;
141
142
    /* Set unvoiced lag to the previous one, overwrite later for voiced */
143
0
    lag = NSQ->lagPrev;
144
145
0
    silk_assert( NSQ->prev_gain_Q16 != 0 );
146
147
0
    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
148
149
    /* 0 */
150
0
    q1_Q10  = offset_Q10;
151
0
    q2_Q10  = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
152
0
    rd1_Q20 = q1_Q10 * Lambda_Q10;
153
0
    rd2_Q20 = q2_Q10 * Lambda_Q10;
154
155
0
    table[ 32 ][ 0 ] = q1_Q10;
156
0
    table[ 32 ][ 1 ] = q2_Q10;
157
0
    table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
158
0
    table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
159
160
    /* -1 */
161
0
    q1_Q10  = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
162
0
    q2_Q10  = offset_Q10;
163
0
    rd1_Q20 = - q1_Q10 * Lambda_Q10;
164
0
    rd2_Q20 = q2_Q10 * Lambda_Q10;
165
166
0
    table[ 31 ][ 0 ] = q1_Q10;
167
0
    table[ 31 ][ 1 ] = q2_Q10;
168
0
    table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
169
0
    table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
170
171
    /* > 0 */
172
0
    for (k = 1; k <= 31; k++)
173
0
    {
174
0
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
175
176
0
        q1_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10;
177
0
        q2_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
178
0
        rd1_Q20 = q1_Q10 * Lambda_Q10;
179
0
        rd2_Q20 = q2_Q10 * Lambda_Q10;
180
181
0
        table[ 32 + k ][ 0 ] = q1_Q10;
182
0
        table[ 32 + k ][ 1 ] = q2_Q10;
183
0
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
184
0
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
185
0
    }
186
187
    /* < -1 */
188
0
    for (k = -32; k <= -2; k++)
189
0
    {
190
0
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
191
192
0
        q1_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10;
193
0
        q2_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
194
0
        rd1_Q20 = - q1_Q10 * Lambda_Q10;
195
0
        rd2_Q20 = - q2_Q10 * Lambda_Q10;
196
197
0
        table[ 32 + k ][ 0 ] = q1_Q10;
198
0
        table[ 32 + k ][ 1 ] = q2_Q10;
199
0
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
200
0
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
201
0
    }
202
203
0
    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
204
0
        LSF_interpolation_flag = 0;
205
0
    } else {
206
0
        LSF_interpolation_flag = 1;
207
0
    }
208
209
0
    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
210
0
    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
211
0
    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
212
    /* Set up pointers to start of sub frame */
213
0
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
214
0
    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
215
0
    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
216
0
    for( k = 0; k < psEncC->nb_subfr; k++ ) {
217
0
        A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
218
0
        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
219
0
        AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
220
221
        /* Noise shape parameters */
222
0
        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
223
0
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
224
0
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
225
226
0
        NSQ->rewhite_flag = 0;
227
0
        if( psIndices->signalType == TYPE_VOICED ) {
228
            /* Voiced */
229
0
            lag = pitchL[ k ];
230
231
            /* Re-whitening */
232
0
            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
233
                /* Rewhiten with new A coefs */
234
0
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
235
0
                celt_assert( start_idx > 0 );
236
237
0
                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
238
0
                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
239
240
0
                NSQ->rewhite_flag = 1;
241
0
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
242
0
            }
243
0
        }
244
245
0
        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
246
247
0
        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
248
0
        {
249
0
            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
250
0
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
251
0
                offset_Q10, psEncC->subfr_length, &(table[32]) );
252
0
        }
253
0
        else
254
0
        {
255
0
            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
256
0
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
257
0
                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
258
0
        }
259
260
0
        x16    += psEncC->subfr_length;
261
0
        pulses += psEncC->subfr_length;
262
0
        pxq    += psEncC->subfr_length;
263
0
    }
264
265
    /* Update lagPrev for next frame */
266
0
    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
267
268
    /* Save quantized speech and noise shaping signals */
269
0
    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
270
0
    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
271
272
#ifdef OPUS_CHECK_ASM
273
    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
274
    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
275
    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
276
#endif
277
278
0
    RESTORE_STACK;
279
0
}
280
281
/************************************/
282
/* silk_noise_shape_quantizer_10_16 */
283
/************************************/
284
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
285
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
286
    opus_int            signalType,             /* I    Signal type                     */
287
    const opus_int32    x_sc_Q10[],             /* I                                    */
288
    opus_int8           pulses[],               /* O                                    */
289
    opus_int16          xq[],                   /* O                                    */
290
    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
291
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
292
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
293
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
294
    opus_int            lag,                    /* I    Pitch lag                       */
295
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
296
    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
297
    opus_int32          LF_shp_Q14,             /* I                                    */
298
    opus_int32          Gain_Q16,               /* I                                    */
299
    opus_int            Lambda_Q10,             /* I                                    */
300
    opus_int            offset_Q10,             /* I                                    */
301
    opus_int            length,                 /* I    Input length                    */
302
    opus_int32          table[][4]              /* I                                    */
303
)
304
0
{
305
0
    opus_int     i;
306
0
    opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
307
0
    opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
308
0
    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
309
0
    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
310
0
    opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
311
312
0
    __m128i xmm_tempa, xmm_tempb;
313
314
0
    __m128i xmm_one;
315
316
0
    __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
317
0
    __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
318
0
    __m128i a_Q12_01234567,        a_Q12_89ABCDEF;
319
320
0
    __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
321
0
    __m128i AR_shp_Q13_76543210;
322
323
0
    int rdo_offset = (Lambda_Q10 >> 1) - 512;
324
325
0
    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
326
0
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
327
0
    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
328
329
    /* Set up short term AR state */
330
0
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];
331
332
0
    sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
333
0
    xq_Q14         = psLPC_Q14[ 0 ];
334
0
    sDiff_shp_Q14  = NSQ->sDiff_shp_Q14;
335
0
    LTP_pred_Q13   = 0;
336
337
    /* load a_Q12 */
338
0
    xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
339
340
    /* load a_Q12[0] - a_Q12[7] */
341
0
    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 0 ] ) );
342
    /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
343
0
    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 8 ] ) );
344
345
0
    a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
346
0
    a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );
347
348
    /* load AR_shp_Q13 */
349
0
    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(void*)(&AR_shp_Q13[0] ) );
350
351
    /* load psLPC_Q14 */
352
0
    xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
353
354
0
    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-16]) );
355
0
    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-12]) );
356
357
0
    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
358
0
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
359
360
0
    psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
361
0
    psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
362
363
0
    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -8 ]) );
364
0
    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -4 ]) );
365
366
0
    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
367
0
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
368
369
0
    psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
370
0
    psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
371
372
    /* load sAR2_Q14 */
373
0
    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 0 ]) ) );
374
0
    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 4 ]) ) );
375
376
0
    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
377
0
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
378
379
0
    sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
380
0
    sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
381
382
    /* prepare 1 in 8 * 16bit */
383
0
    xmm_one = _mm_set1_epi16(1);
384
385
0
    for( i = 0; i < length; i++ )
386
0
    {
387
        /* Short-term prediction */
388
0
        __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;
389
390
        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
391
0
        LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */
392
393
        /* shift psLPC_Q14 */
394
0
        psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
395
0
        psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );
396
397
0
        psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
398
0
        psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );
399
400
0
        psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
401
0
        psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14),       7 );
402
403
        /* high part, use pmaddwd, results in 4 32-bit */
404
0
        xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
405
0
        xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );
406
407
        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */
408
0
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
409
0
        xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );
410
411
0
        xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
412
0
        xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );
413
414
0
        xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
415
0
        xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );
416
417
0
        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
418
0
        xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );
419
420
0
        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
421
0
        xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );
422
423
        /* accumulate */
424
0
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
425
0
        xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );
426
427
0
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
428
429
0
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
430
0
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
431
432
0
        LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );
433
434
        /* Long-term prediction */
435
0
        if ( opus_likely( signalType == TYPE_VOICED ) ) {
436
            /* Unrolled loop */
437
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
438
0
            LTP_pred_Q13 = 2;
439
0
            {
440
0
                __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;
441
442
0
                b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
443
0
                b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );
444
445
                /* loaded: [0] [-1] [-2] [-3] */
446
0
                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) );
447
                /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
448
0
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
449
                /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
450
0
                xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
451
                /* right shift 2 bytes (16 bits), zero extended */
452
0
                xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );
453
454
                /* a[1] * b[-1], a[3] * b[-3] */
455
0
                pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
456
0
                pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );
457
458
0
                pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
459
                /* equal shift right 8 bytes*/
460
0
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
461
0
                xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );
462
463
0
                LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );
464
465
0
                LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
466
0
                pred_lag_ptr++;
467
0
            }
468
0
        }
469
470
        /* Noise shape feedback */
471
0
        NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
472
0
        NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );
473
474
0
        sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
475
0
        sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
476
477
0
        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
478
0
        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14),       0 );
479
480
        /* high part, use pmaddwd, results in 4 32-bit */
481
0
        xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
482
483
        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */
484
0
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
485
0
        xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );
486
487
0
        xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
488
0
        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
489
490
0
        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
491
492
        /* accumulate */
493
0
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
494
495
0
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
496
0
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
497
498
0
        n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );
499
500
0
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
501
0
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );
502
503
0
        n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                                /* Q11 -> Q12 */
504
0
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );
505
506
0
        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
507
0
        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
508
509
0
        celt_assert( lag > 0 || signalType != TYPE_VOICED );
510
511
        /* Combine prediction and noise shaping signals */
512
0
        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
513
0
        tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
514
0
        if( lag > 0 ) {
515
            /* Symmetric, packed FIR coefficients */
516
0
            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
517
0
            n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
518
0
            n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
519
0
            shp_lag_ptr++;
520
521
0
            tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 );                       /* Q13 */
522
0
            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 );                          /* Q13 */
523
0
            tmp1 = silk_RSHIFT_ROUND( tmp1, 3 );                                /* Q10 */
524
0
        } else {
525
0
            tmp1 = silk_RSHIFT_ROUND( tmp1, 2 );                                /* Q10 */
526
0
        }
527
528
0
        r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 );                              /* residual error Q10 */
529
530
        /* Generate dither */
531
0
        NSQ->rand_seed = silk_RAND( NSQ->rand_seed );
532
533
        /* Flip sign depending on dither */
534
0
        tmp2 = -r_Q10;
535
0
        if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;
536
537
0
        r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
538
539
        /* Find two quantization level candidates and measure their rate-distortion */
540
0
        q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
541
0
        q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
542
0
        if (Lambda_Q10 > 2048) {
543
            /* For aggressive RDO, the bias becomes more than one pulse. */
544
0
            if (q1_Q10 > rdo_offset) {
545
0
                q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
546
0
            } else if (q1_Q10 < -rdo_offset) {
547
0
                q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
548
0
            } else if (q1_Q10 < 0) {
549
0
                q1_Q0 = -1;
550
0
            } else {
551
0
                q1_Q0 = 0;
552
0
            }
553
0
        }
554
555
0
        q1_Q10 = table[q1_Q0][0];
556
0
        q2_Q10 = table[q1_Q0][1];
557
558
0
        if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
559
0
        {
560
0
            q1_Q10 = q2_Q10;
561
0
        }
562
563
0
        pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );
564
565
        /* Excitation */
566
0
        exc_Q14 = silk_LSHIFT( q1_Q10, 4 );
567
568
0
        tmp2 = -exc_Q14;
569
0
        if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;
570
571
        /* Add predictions */
572
0
        LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
573
0
        xq_Q14      = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );
574
575
        /* Update states */
576
0
        psLPC_Q14++;
577
0
        *psLPC_Q14 = xq_Q14;
578
0
        NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
579
0
        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
580
581
0
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
582
0
        sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
583
0
        NSQ->sLTP_shp_buf_idx++;
584
0
        NSQ->sLTP_buf_idx++;
585
586
        /* Make dither dependent on quantized signal */
587
0
        NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
588
0
    }
589
590
0
    NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;
591
592
    /* Scale XQ back to normal level before saving */
593
0
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];
594
595
    /* write back sAR2_Q14 */
596
0
    xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
597
0
    xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
598
0
    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
599
0
    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
600
601
    /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
602
0
    {
603
0
        __m128i xmm_Gain_Q10;
604
0
        __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;
605
606
        /* prepare (1 << 7) in packed 4 32-bits */
607
0
        xmm_tempa = _mm_set1_epi32( (1 << 7) );
608
609
        /* prepare Gain_Q10 in packed 4 32-bits */
610
0
        xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );
611
612
        /* process xq */
613
0
        for (i = 0; i < length - 7; i += 8)
614
0
        {
615
0
            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 0 ] ) ) );
616
0
            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 4 ] ) ) );
617
618
            /* equal shift right 4 bytes*/
619
0
            xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
620
            /* equal shift right 4 bytes*/
621
0
            xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );
622
623
0
            xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
624
0
            xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
625
0
            xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
626
0
            xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );
627
628
0
            xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
629
0
            xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
630
0
            xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
631
0
            xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );
632
633
0
            xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
634
0
            xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );
635
636
            /* silk_RSHIFT_ROUND(xq, 8) */
637
0
            xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
638
0
            xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );
639
640
0
            xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
641
0
            xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );
642
643
            /* silk_SAT16 */
644
0
            xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );
645
646
            /* save to xq */
647
0
            _mm_storeu_si128( (__m128i *)(void*)(&xq[ i ] ), xmm_xq_Q14_3210 );
648
0
        }
649
0
    }
650
0
    for ( ; i < length; i++)
651
0
    {
652
0
        xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
653
0
    }
654
655
    /* Update LPC synth buffer */
656
0
    silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
657
0
}
658
659
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
660
    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
661
    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
662
    const opus_int16    x16[],                     /* I    input                           */
663
    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
664
    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
665
    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
666
    opus_int            subfr,                     /* I    subframe number                 */
667
    const opus_int      LTP_scale_Q14,             /* I                                    */
668
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */
669
    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
670
    const opus_int      signal_type                /* I    Signal type                     */
671
)
672
0
{
673
0
    opus_int   i, lag;
674
0
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
675
0
    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
676
677
0
    lag          = pitchL[ subfr ];
678
0
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
679
0
    silk_assert( inv_gain_Q31 != 0 );
680
681
    /* Scale input */
682
0
    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
683
684
    /* prepare inv_gain_Q26 in packed 4 32-bits */
685
0
    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
686
687
0
    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
688
0
        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
689
690
        /* equal shift right 4 bytes*/
691
0
        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
692
693
0
        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
694
0
        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
695
696
0
        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
697
0
        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
698
699
0
        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
700
701
0
        _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
702
0
    }
703
704
0
    for( ; i < psEncC->subfr_length; i++ ) {
705
0
        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
706
0
    }
707
708
    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
709
0
    if( NSQ->rewhite_flag ) {
710
0
        if( subfr == 0 ) {
711
            /* Do LTP downscaling */
712
0
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
713
0
        }
714
0
        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
715
0
            silk_assert( i < MAX_FRAME_LENGTH );
716
0
            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
717
0
        }
718
0
    }
719
720
    /* Adjust for changing gain */
721
0
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
722
0
        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
723
0
        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
724
725
        /* Scale long-term shaping state */
726
727
        /* prepare gain_adj_Q16 in packed 4 32-bits */
728
0
        xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);
729
730
0
        for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
731
0
        {
732
0
            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
733
            /* equal shift right 4 bytes*/
734
0
            xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
735
736
0
            xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
737
0
            xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
738
739
0
            xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
740
0
            xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
741
742
0
            xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
743
744
0
            _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
745
0
        }
746
747
0
        for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
748
0
            NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
749
0
        }
750
751
        /* Scale long-term prediction state */
752
0
        if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
753
0
            for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
754
0
                sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
755
0
            }
756
0
        }
757
758
0
        NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
759
0
        NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );
760
761
        /* Scale short-term prediction and shaping states */
762
0
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
763
0
            NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
764
0
        }
765
0
        for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
766
0
            NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
767
0
        }
768
769
        /* Save inverse gain */
770
0
        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
771
0
    }
772
0
}