Coverage Report

Created: 2024-03-26 07:25

/src/opus/silk/x86/VAD_sse4_1.c
Line
Count
Source
1
/* Copyright (c) 2014-2020, Cisco Systems, INC
2
   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
3
4
   Redistribution and use in source and binary forms, with or without
5
   modification, are permitted provided that the following conditions
6
   are met:
7
8
   - Redistributions of source code must retain the above copyright
9
   notice, this list of conditions and the following disclaimer.
10
11
   - Redistributions in binary form must reproduce the above copyright
12
   notice, this list of conditions and the following disclaimer in the
13
   documentation and/or other materials provided with the distribution.
14
15
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
#ifdef HAVE_CONFIG_H
29
#include "config.h"
30
#endif
31
32
#include <xmmintrin.h>
33
#include <emmintrin.h>
34
#include <smmintrin.h>
35
36
#include "main.h"
37
#include "stack_alloc.h"
38
39
/* Weighting factors for tilt measure */
40
static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
41
42
/***************************************/
43
/* Get the speech activity level in Q8 */
44
/***************************************/
45
opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if success                  */
46
    silk_encoder_state          *psEncC,            /* I/O  Encoder state                               */
47
    const opus_int16            pIn[]               /* I    PCM input                                   */
48
)
49
20.0M
{
50
20.0M
    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51
20.0M
    opus_int   decimated_framelength1, decimated_framelength2;
52
20.0M
    opus_int   decimated_framelength;
53
20.0M
    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54
20.0M
    opus_int32 sumSquared, smooth_coef_Q16;
55
20.0M
    opus_int16 HPstateTmp;
56
20.0M
    VARDECL( opus_int16, X );
57
20.0M
    opus_int32 Xnrg[ VAD_N_BANDS ];
58
20.0M
    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59
20.0M
    opus_int32 speech_nrg, x_tmp;
60
20.0M
    opus_int   X_offset[ VAD_N_BANDS ];
61
20.0M
    opus_int   ret = 0;
62
20.0M
    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64
20.0M
    SAVE_STACK;
65
66
#ifdef OPUS_CHECK_ASM
67
    silk_encoder_state psEncC_c;
68
19.9M
    opus_int ret_c;
69
70
19.9M
    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71
    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72
#endif
73
74
    /* Safety checks */
75
20.0M
    silk_assert( VAD_N_BANDS == 4 );
76
20.0M
    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77
20.0M
    celt_assert( psEncC->frame_length <= 512 );
78
20.0M
    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79
80
    /***********************/
81
    /* Filter and Decimate */
82
    /***********************/
83
20.0M
    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84
20.0M
    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85
20.0M
    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86
    /* Decimate into 4 bands:
87
       0       L      3L       L              3L                             5L
88
               -      --       -              --                             --
89
               8       8       2               4                              4
90
91
       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92
93
       They're arranged to allow the minimal ( frame_length / 4 ) extra
94
       scratch space during the downsampling process */
95
20.0M
    X_offset[ 0 ] = 0;
96
20.0M
    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97
20.0M
    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98
20.0M
    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99
20.0M
    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100
101
    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102
20.0M
    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103
20.0M
        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104
105
    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106
20.0M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107
20.0M
        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108
109
    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110
20.0M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111
20.0M
        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112
113
    /*********************************************/
114
    /* HP filter on lowest band (differentiator) */
115
    /*********************************************/
116
20.0M
    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117
20.0M
    HPstateTmp = X[ decimated_framelength - 1 ];
118
438M
    for( i = decimated_framelength - 1; i > 0; i-- ) {
119
418M
        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120
418M
        X[ i ]     -= X[ i - 1 ];
121
418M
    }
122
20.0M
    X[ 0 ] -= psSilk_VAD->HPstate;
123
20.0M
    psSilk_VAD->HPstate = HPstateTmp;
124
125
    /*************************************/
126
    /* Calculate the energy in each band */
127
    /*************************************/
128
100M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
129
        /* Find the decimated framelength in the non-uniformly divided bands */
130
80.0M
        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131
132
        /* Split length into subframe lengths */
133
80.0M
        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134
80.0M
        dec_subframe_offset = 0;
135
136
        /* Compute energy per sub-frame */
137
        /* initialize with summed energy of last subframe */
138
80.0M
        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139
400M
        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140
320M
            __m128i xmm_X, xmm_acc;
141
320M
            sumSquared = 0;
142
143
320M
            xmm_acc = _mm_setzero_si128();
144
145
600M
            for( i = 0; i < dec_subframe_length - 7; i += 8 )
146
280M
            {
147
280M
                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148
280M
                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149
280M
                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150
280M
                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151
280M
            }
152
153
320M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154
320M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155
156
320M
            sumSquared += _mm_cvtsi128_si32( xmm_acc );
157
158
1.54G
            for( ; i < dec_subframe_length; i++ ) {
159
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161
1.22G
                x_tmp = silk_RSHIFT(
162
1.22G
                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163
1.22G
                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164
165
                /* Safety check */
166
1.22G
                silk_assert( sumSquared >= 0 );
167
1.22G
            }
168
169
            /* Add/saturate summed energy of current subframe */
170
320M
            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171
240M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172
240M
            } else {
173
                /* Look-ahead subframe */
174
80.0M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175
80.0M
            }
176
177
320M
            dec_subframe_offset += dec_subframe_length;
178
320M
        }
179
80.0M
        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180
80.0M
    }
181
182
    /********************/
183
    /* Noise estimation */
184
    /********************/
185
20.0M
    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186
187
    /***********************************************/
188
    /* Signal-plus-noise to noise ratio estimation */
189
    /***********************************************/
190
113k
    sumSquared = 0;
191
113k
    input_tilt = 0;
192
100M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
193
80.0M
        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194
80.0M
        if( speech_nrg > 0 ) {
195
            /* Divide, with sufficient resolution */
196
2.52M
            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197
1.60M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198
1.60M
            } else {
199
924k
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200
924k
            }
201
202
            /* Convert to log domain */
203
2.52M
            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204
205
            /* Sum-of-squares */
206
2.52M
            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207
208
            /* Tilt measure */
209
2.52M
            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210
                /* Scale down SNR value for small subband speech energies */
211
1.01M
                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212
1.01M
            }
213
2.52M
            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214
77.5M
        } else {
215
77.5M
            NrgToNoiseRatio_Q8[ b ] = 256;
216
77.5M
        }
217
80.0M
    }
218
219
    /* Mean-of-squares */
220
20.0M
    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221
222
    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223
113k
    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224
225
    /*********************************/
226
    /* Speech Probability Estimation */
227
    /*********************************/
228
20.0M
    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229
230
    /**************************/
231
    /* Frequency Tilt Measure */
232
    /**************************/
233
20.0M
    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234
235
    /**************************************************/
236
    /* Scale the sigmoid output based on power levels */
237
    /**************************************************/
238
113k
    speech_nrg = 0;
239
100M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
240
        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241
80.0M
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242
80.0M
    }
243
244
20.0M
    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245
12.3M
        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246
12.3M
    }
247
    /* Power scaling */
248
20.0M
    if( speech_nrg <= 0 ) {
249
19.3M
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250
19.3M
    } else if( speech_nrg < 16384 ) {
251
62.1k
        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252
253
        /* square-root */
254
62.1k
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
255
62.1k
        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256
62.1k
    }
257
258
    /* Copy the resulting speech activity in Q8 */
259
20.0M
    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260
261
    /***********************************/
262
    /* Energy Level and SNR estimation */
263
    /***********************************/
264
    /* Smoothing coefficient */
265
20.0M
    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266
267
20.0M
    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268
7.70M
        smooth_coef_Q16 >>= 1;
269
7.70M
    }
270
271
100M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
272
        /* compute smoothed energy-to-noise ratio per band */
273
80.0M
        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274
80.0M
            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275
276
        /* signal to noise ratio in dB per band */
277
80.0M
        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278
        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279
80.0M
        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280
80.0M
    }
281
282
#ifdef OPUS_CHECK_ASM
283
19.9M
    silk_assert( ret == ret_c );
284
19.9M
    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285
19.9M
#endif
286
287
19.9M
    RESTORE_STACK;
288
19.9M
    return( ret );
289
19.9M
}
silk_VAD_GetSA_Q8_sse4_1
Line
Count
Source
49
19.9M
{
50
19.9M
    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51
19.9M
    opus_int   decimated_framelength1, decimated_framelength2;
52
19.9M
    opus_int   decimated_framelength;
53
19.9M
    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54
19.9M
    opus_int32 sumSquared, smooth_coef_Q16;
55
19.9M
    opus_int16 HPstateTmp;
56
19.9M
    VARDECL( opus_int16, X );
57
19.9M
    opus_int32 Xnrg[ VAD_N_BANDS ];
58
19.9M
    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59
19.9M
    opus_int32 speech_nrg, x_tmp;
60
19.9M
    opus_int   X_offset[ VAD_N_BANDS ];
61
19.9M
    opus_int   ret = 0;
62
19.9M
    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64
19.9M
    SAVE_STACK;
65
66
19.9M
#ifdef OPUS_CHECK_ASM
67
19.9M
    silk_encoder_state psEncC_c;
68
19.9M
    opus_int ret_c;
69
70
19.9M
    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71
19.9M
    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72
19.9M
#endif
73
74
    /* Safety checks */
75
19.9M
    silk_assert( VAD_N_BANDS == 4 );
76
19.9M
    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77
19.9M
    celt_assert( psEncC->frame_length <= 512 );
78
19.9M
    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79
80
    /***********************/
81
    /* Filter and Decimate */
82
    /***********************/
83
19.9M
    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84
19.9M
    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85
19.9M
    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86
    /* Decimate into 4 bands:
87
       0       L      3L       L              3L                             5L
88
               -      --       -              --                             --
89
               8       8       2               4                              4
90
91
       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92
93
       They're arranged to allow the minimal ( frame_length / 4 ) extra
94
       scratch space during the downsampling process */
95
19.9M
    X_offset[ 0 ] = 0;
96
19.9M
    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97
19.9M
    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98
19.9M
    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99
19.9M
    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100
101
    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102
19.9M
    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103
19.9M
        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104
105
    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106
19.9M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107
19.9M
        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108
109
    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110
19.9M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111
19.9M
        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112
113
    /*********************************************/
114
    /* HP filter on lowest band (differentiator) */
115
    /*********************************************/
116
19.9M
    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117
19.9M
    HPstateTmp = X[ decimated_framelength - 1 ];
118
435M
    for( i = decimated_framelength - 1; i > 0; i-- ) {
119
415M
        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120
415M
        X[ i ]     -= X[ i - 1 ];
121
415M
    }
122
19.9M
    X[ 0 ] -= psSilk_VAD->HPstate;
123
19.9M
    psSilk_VAD->HPstate = HPstateTmp;
124
125
    /*************************************/
126
    /* Calculate the energy in each band */
127
    /*************************************/
128
99.5M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
129
        /* Find the decimated framelength in the non-uniformly divided bands */
130
79.6M
        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131
132
        /* Split length into subframe lengths */
133
79.6M
        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134
79.6M
        dec_subframe_offset = 0;
135
136
        /* Compute energy per sub-frame */
137
        /* initialize with summed energy of last subframe */
138
79.6M
        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139
398M
        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140
318M
            __m128i xmm_X, xmm_acc;
141
318M
            sumSquared = 0;
142
143
318M
            xmm_acc = _mm_setzero_si128();
144
145
597M
            for( i = 0; i < dec_subframe_length - 7; i += 8 )
146
278M
            {
147
278M
                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148
278M
                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149
278M
                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150
278M
                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151
278M
            }
152
153
318M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154
318M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155
156
318M
            sumSquared += _mm_cvtsi128_si32( xmm_acc );
157
158
1.53G
            for( ; i < dec_subframe_length; i++ ) {
159
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161
1.21G
                x_tmp = silk_RSHIFT(
162
1.21G
                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163
1.21G
                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164
165
                /* Safety check */
166
1.21G
                silk_assert( sumSquared >= 0 );
167
1.21G
            }
168
169
            /* Add/saturate summed energy of current subframe */
170
318M
            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171
238M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172
238M
            } else {
173
                /* Look-ahead subframe */
174
79.6M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175
79.6M
            }
176
177
318M
            dec_subframe_offset += dec_subframe_length;
178
318M
        }
179
79.6M
        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180
79.6M
    }
181
182
    /********************/
183
    /* Noise estimation */
184
    /********************/
185
19.9M
    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186
187
    /***********************************************/
188
    /* Signal-plus-noise to noise ratio estimation */
189
    /***********************************************/
190
19.9M
    sumSquared = 0;
191
19.9M
    input_tilt = 0;
192
99.5M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
193
79.6M
        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194
79.6M
        if( speech_nrg > 0 ) {
195
            /* Divide, with sufficient resolution */
196
2.18M
            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197
1.33M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198
1.33M
            } else {
199
850k
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200
850k
            }
201
202
            /* Convert to log domain */
203
2.18M
            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204
205
            /* Sum-of-squares */
206
2.18M
            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207
208
            /* Tilt measure */
209
2.18M
            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210
                /* Scale down SNR value for small subband speech energies */
211
848k
                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212
848k
            }
213
2.18M
            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214
77.4M
        } else {
215
77.4M
            NrgToNoiseRatio_Q8[ b ] = 256;
216
77.4M
        }
217
79.6M
    }
218
219
    /* Mean-of-squares */
220
19.9M
    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221
222
    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223
19.9M
    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224
225
    /*********************************/
226
    /* Speech Probability Estimation */
227
    /*********************************/
228
19.9M
    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229
230
    /**************************/
231
    /* Frequency Tilt Measure */
232
    /**************************/
233
19.9M
    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234
235
    /**************************************************/
236
    /* Scale the sigmoid output based on power levels */
237
    /**************************************************/
238
19.9M
    speech_nrg = 0;
239
99.5M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
240
        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241
79.6M
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242
79.6M
    }
243
244
19.9M
    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245
12.2M
        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246
12.2M
    }
247
    /* Power scaling */
248
19.9M
    if( speech_nrg <= 0 ) {
249
19.2M
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250
19.2M
    } else if( speech_nrg < 16384 ) {
251
50.3k
        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252
253
        /* square-root */
254
50.3k
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
255
50.3k
        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256
50.3k
    }
257
258
    /* Copy the resulting speech activity in Q8 */
259
19.9M
    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260
261
    /***********************************/
262
    /* Energy Level and SNR estimation */
263
    /***********************************/
264
    /* Smoothing coefficient */
265
19.9M
    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266
267
19.9M
    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268
7.67M
        smooth_coef_Q16 >>= 1;
269
7.67M
    }
270
271
99.5M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
272
        /* compute smoothed energy-to-noise ratio per band */
273
79.6M
        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274
79.6M
            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275
276
        /* signal to noise ratio in dB per band */
277
79.6M
        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278
        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279
79.6M
        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280
79.6M
    }
281
282
19.9M
#ifdef OPUS_CHECK_ASM
283
19.9M
    silk_assert( ret == ret_c );
284
19.9M
    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285
19.9M
#endif
286
287
19.9M
    RESTORE_STACK;
288
19.9M
    return( ret );
289
19.9M
}
silk_VAD_GetSA_Q8_sse4_1
Line
Count
Source
49
113k
{
50
113k
    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51
113k
    opus_int   decimated_framelength1, decimated_framelength2;
52
113k
    opus_int   decimated_framelength;
53
113k
    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54
113k
    opus_int32 sumSquared, smooth_coef_Q16;
55
113k
    opus_int16 HPstateTmp;
56
113k
    VARDECL( opus_int16, X );
57
113k
    opus_int32 Xnrg[ VAD_N_BANDS ];
58
113k
    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59
113k
    opus_int32 speech_nrg, x_tmp;
60
113k
    opus_int   X_offset[ VAD_N_BANDS ];
61
113k
    opus_int   ret = 0;
62
113k
    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64
113k
    SAVE_STACK;
65
66
#ifdef OPUS_CHECK_ASM
67
    silk_encoder_state psEncC_c;
68
    opus_int ret_c;
69
70
    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71
    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72
#endif
73
74
    /* Safety checks */
75
113k
    silk_assert( VAD_N_BANDS == 4 );
76
113k
    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77
113k
    celt_assert( psEncC->frame_length <= 512 );
78
113k
    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79
80
    /***********************/
81
    /* Filter and Decimate */
82
    /***********************/
83
113k
    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84
113k
    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85
113k
    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86
    /* Decimate into 4 bands:
87
       0       L      3L       L              3L                             5L
88
               -      --       -              --                             --
89
               8       8       2               4                              4
90
91
       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92
93
       They're arranged to allow the minimal ( frame_length / 4 ) extra
94
       scratch space during the downsampling process */
95
113k
    X_offset[ 0 ] = 0;
96
113k
    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97
113k
    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98
113k
    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99
113k
    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100
101
    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102
113k
    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103
113k
        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104
105
    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106
113k
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107
113k
        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108
109
    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110
113k
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111
113k
        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112
113
    /*********************************************/
114
    /* HP filter on lowest band (differentiator) */
115
    /*********************************************/
116
113k
    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117
113k
    HPstateTmp = X[ decimated_framelength - 1 ];
118
2.49M
    for( i = decimated_framelength - 1; i > 0; i-- ) {
119
2.37M
        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120
2.37M
        X[ i ]     -= X[ i - 1 ];
121
2.37M
    }
122
113k
    X[ 0 ] -= psSilk_VAD->HPstate;
123
113k
    psSilk_VAD->HPstate = HPstateTmp;
124
125
    /*************************************/
126
    /* Calculate the energy in each band */
127
    /*************************************/
128
567k
    for( b = 0; b < VAD_N_BANDS; b++ ) {
129
        /* Find the decimated framelength in the non-uniformly divided bands */
130
453k
        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131
132
        /* Split length into subframe lengths */
133
453k
        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134
453k
        dec_subframe_offset = 0;
135
136
        /* Compute energy per sub-frame */
137
        /* initialize with summed energy of last subframe */
138
453k
        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139
2.26M
        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140
1.81M
            __m128i xmm_X, xmm_acc;
141
1.81M
            sumSquared = 0;
142
143
1.81M
            xmm_acc = _mm_setzero_si128();
144
145
3.45M
            for( i = 0; i < dec_subframe_length - 7; i += 8 )
146
1.63M
            {
147
1.63M
                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148
1.63M
                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149
1.63M
                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150
1.63M
                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151
1.63M
            }
152
153
1.81M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154
1.81M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155
156
1.81M
            sumSquared += _mm_cvtsi128_si32( xmm_acc );
157
158
8.51M
            for( ; i < dec_subframe_length; i++ ) {
159
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161
6.69M
                x_tmp = silk_RSHIFT(
162
6.69M
                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163
6.69M
                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164
165
                /* Safety check */
166
6.69M
                silk_assert( sumSquared >= 0 );
167
6.69M
            }
168
169
            /* Add/saturate summed energy of current subframe */
170
1.81M
            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171
1.36M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172
1.36M
            } else {
173
                /* Look-ahead subframe */
174
453k
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175
453k
            }
176
177
1.81M
            dec_subframe_offset += dec_subframe_length;
178
1.81M
        }
179
453k
        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180
453k
    }
181
182
    /********************/
183
    /* Noise estimation */
184
    /********************/
185
113k
    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186
187
    /***********************************************/
188
    /* Signal-plus-noise to noise ratio estimation */
189
    /***********************************************/
190
113k
    sumSquared = 0;
191
113k
    input_tilt = 0;
192
567k
    for( b = 0; b < VAD_N_BANDS; b++ ) {
193
453k
        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194
453k
        if( speech_nrg > 0 ) {
195
            /* Divide, with sufficient resolution */
196
338k
            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197
264k
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198
264k
            } else {
199
74.0k
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200
74.0k
            }
201
202
            /* Convert to log domain */
203
338k
            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204
205
            /* Sum-of-squares */
206
338k
            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207
208
            /* Tilt measure */
209
338k
            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210
                /* Scale down SNR value for small subband speech energies */
211
168k
                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212
168k
            }
213
338k
            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214
338k
        } else {
215
115k
            NrgToNoiseRatio_Q8[ b ] = 256;
216
115k
        }
217
453k
    }
218
219
    /* Mean-of-squares */
220
113k
    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221
222
    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223
113k
    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224
225
    /*********************************/
226
    /* Speech Probability Estimation */
227
    /*********************************/
228
113k
    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229
230
    /**************************/
231
    /* Frequency Tilt Measure */
232
    /**************************/
233
113k
    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234
235
    /**************************************************/
236
    /* Scale the sigmoid output based on power levels */
237
    /**************************************************/
238
113k
    speech_nrg = 0;
239
567k
    for( b = 0; b < VAD_N_BANDS; b++ ) {
240
        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241
453k
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242
453k
    }
243
244
113k
    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245
86.2k
        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246
86.2k
    }
247
    /* Power scaling */
248
113k
    if( speech_nrg <= 0 ) {
249
21.3k
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250
92.1k
    } else if( speech_nrg < 16384 ) {
251
11.8k
        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252
253
        /* square-root */
254
11.8k
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
255
11.8k
        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256
11.8k
    }
257
258
    /* Copy the resulting speech activity in Q8 */
259
113k
    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260
261
    /***********************************/
262
    /* Energy Level and SNR estimation */
263
    /***********************************/
264
    /* Smoothing coefficient */
265
113k
    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266
267
113k
    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268
27.1k
        smooth_coef_Q16 >>= 1;
269
27.1k
    }
270
271
567k
    for( b = 0; b < VAD_N_BANDS; b++ ) {
272
        /* compute smoothed energy-to-noise ratio per band */
273
453k
        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274
453k
            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275
276
        /* signal to noise ratio in dB per band */
277
453k
        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278
        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279
453k
        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280
453k
    }
281
282
#ifdef OPUS_CHECK_ASM
283
    silk_assert( ret == ret_c );
284
    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285
#endif
286
287
113k
    RESTORE_STACK;
288
113k
    return( ret );
289
113k
}