Coverage Report

Created: 2025-07-12 07:22

/src/opus/silk/x86/VAD_sse4_1.c
Line
Count
Source
1
/* Copyright (c) 2014-2020, Cisco Systems, INC
2
   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
3
4
   Redistribution and use in source and binary forms, with or without
5
   modification, are permitted provided that the following conditions
6
   are met:
7
8
   - Redistributions of source code must retain the above copyright
9
   notice, this list of conditions and the following disclaimer.
10
11
   - Redistributions in binary form must reproduce the above copyright
12
   notice, this list of conditions and the following disclaimer in the
13
   documentation and/or other materials provided with the distribution.
14
15
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
#ifdef HAVE_CONFIG_H
29
#include "config.h"
30
#endif
31
32
#include <xmmintrin.h>
33
#include <emmintrin.h>
34
#include <smmintrin.h>
35
36
#include "main.h"
37
#include "stack_alloc.h"
38
39
/* Weighting factors for tilt measure */
40
static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
41
42
/***************************************/
43
/* Get the speech activity level in Q8 */
44
/***************************************/
45
opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if success                  */
46
    silk_encoder_state          *psEncC,            /* I/O  Encoder state                               */
47
    const opus_int16            pIn[]               /* I    PCM input                                   */
48
)
49
37.2M
{
50
37.2M
    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51
37.2M
    opus_int   decimated_framelength1, decimated_framelength2;
52
37.2M
    opus_int   decimated_framelength;
53
37.2M
    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54
37.2M
    opus_int32 sumSquared, smooth_coef_Q16;
55
37.2M
    opus_int16 HPstateTmp;
56
37.2M
    VARDECL( opus_int16, X );
57
37.2M
    opus_int32 Xnrg[ VAD_N_BANDS ];
58
37.2M
    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59
37.2M
    opus_int32 speech_nrg, x_tmp;
60
37.2M
    opus_int   X_offset[ VAD_N_BANDS ];
61
37.2M
    opus_int   ret = 0;
62
37.2M
    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64
37.2M
    SAVE_STACK;
65
66
#ifdef OPUS_CHECK_ASM
67
    silk_encoder_state psEncC_c;
68
240k
    opus_int ret_c;
69
70
240k
    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71
    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72
#endif
73
74
    /* Safety checks */
75
37.2M
    silk_assert( VAD_N_BANDS == 4 );
76
37.2M
    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77
37.2M
    celt_assert( psEncC->frame_length <= 512 );
78
37.2M
    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79
80
    /***********************/
81
    /* Filter and Decimate */
82
    /***********************/
83
37.2M
    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84
37.2M
    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85
37.2M
    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86
    /* Decimate into 4 bands:
87
       0       L      3L       L              3L                             5L
88
               -      --       -              --                             --
89
               8       8       2               4                              4
90
91
       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92
93
       They're arranged to allow the minimal ( frame_length / 4 ) extra
94
       scratch space during the downsampling process */
95
37.2M
    X_offset[ 0 ] = 0;
96
37.2M
    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97
37.2M
    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98
37.2M
    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99
37.2M
    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100
101
    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102
37.2M
    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103
37.2M
        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104
105
    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106
37.2M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107
37.2M
        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108
109
    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110
37.2M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111
37.2M
        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112
113
    /*********************************************/
114
    /* HP filter on lowest band (differentiator) */
115
    /*********************************************/
116
37.2M
    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117
37.2M
    HPstateTmp = X[ decimated_framelength - 1 ];
118
822M
    for( i = decimated_framelength - 1; i > 0; i-- ) {
119
785M
        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120
785M
        X[ i ]     -= X[ i - 1 ];
121
785M
    }
122
37.2M
    X[ 0 ] -= psSilk_VAD->HPstate;
123
37.2M
    psSilk_VAD->HPstate = HPstateTmp;
124
125
    /*************************************/
126
    /* Calculate the energy in each band */
127
    /*************************************/
128
186M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
129
        /* Find the decimated framelength in the non-uniformly divided bands */
130
148M
        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131
132
        /* Split length into subframe lengths */
133
148M
        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134
148M
        dec_subframe_offset = 0;
135
136
        /* Compute energy per sub-frame */
137
        /* initialize with summed energy of last subframe */
138
148M
        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139
744M
        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140
595M
            __m128i xmm_X, xmm_acc;
141
595M
            sumSquared = 0;
142
143
595M
            xmm_acc = _mm_setzero_si128();
144
145
1.12G
            for( i = 0; i < dec_subframe_length - 7; i += 8 )
146
529M
            {
147
529M
                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148
529M
                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149
529M
                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150
529M
                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151
529M
            }
152
153
595M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154
595M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155
156
595M
            sumSquared += _mm_cvtsi128_si32( xmm_acc );
157
158
2.86G
            for( ; i < dec_subframe_length; i++ ) {
159
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161
2.26G
                x_tmp = silk_RSHIFT(
162
2.26G
                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163
2.26G
                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164
165
                /* Safety check */
166
2.26G
                silk_assert( sumSquared >= 0 );
167
2.26G
            }
168
169
            /* Add/saturate summed energy of current subframe */
170
595M
            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171
446M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172
446M
            } else {
173
                /* Look-ahead subframe */
174
148M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175
148M
            }
176
177
595M
            dec_subframe_offset += dec_subframe_length;
178
595M
        }
179
148M
        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180
148M
    }
181
182
    /********************/
183
    /* Noise estimation */
184
    /********************/
185
37.2M
    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186
187
    /***********************************************/
188
    /* Signal-plus-noise to noise ratio estimation */
189
    /***********************************************/
190
37.0M
    sumSquared = 0;
191
37.0M
    input_tilt = 0;
192
186M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
193
148M
        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194
148M
        if( speech_nrg > 0 ) {
195
            /* Divide, with sufficient resolution */
196
4.90M
            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197
3.30M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198
3.30M
            } else {
199
1.60M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200
1.60M
            }
201
202
            /* Convert to log domain */
203
4.90M
            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204
205
            /* Sum-of-squares */
206
4.90M
            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207
208
            /* Tilt measure */
209
4.90M
            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210
                /* Scale down SNR value for small subband speech energies */
211
2.23M
                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212
2.23M
            }
213
4.90M
            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214
144M
        } else {
215
144M
            NrgToNoiseRatio_Q8[ b ] = 256;
216
144M
        }
217
148M
    }
218
219
    /* Mean-of-squares */
220
37.2M
    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221
222
    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223
37.0M
    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224
225
    /*********************************/
226
    /* Speech Probability Estimation */
227
    /*********************************/
228
37.2M
    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229
230
    /**************************/
231
    /* Frequency Tilt Measure */
232
    /**************************/
233
37.2M
    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234
235
    /**************************************************/
236
    /* Scale the sigmoid output based on power levels */
237
    /**************************************************/
238
37.0M
    speech_nrg = 0;
239
186M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
240
        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241
148M
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242
148M
    }
243
244
37.2M
    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245
22.3M
        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246
22.3M
    }
247
    /* Power scaling */
248
37.2M
    if( speech_nrg <= 0 ) {
249
35.8M
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250
35.8M
    } else if( speech_nrg < 16384 ) {
251
178k
        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252
253
        /* square-root */
254
178k
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
255
178k
        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256
178k
    }
257
258
    /* Copy the resulting speech activity in Q8 */
259
37.2M
    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260
261
    /***********************************/
262
    /* Energy Level and SNR estimation */
263
    /***********************************/
264
    /* Smoothing coefficient */
265
37.2M
    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266
267
37.2M
    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268
14.9M
        smooth_coef_Q16 >>= 1;
269
14.9M
    }
270
271
186M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
272
        /* compute smoothed energy-to-noise ratio per band */
273
148M
        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274
148M
            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275
276
        /* signal to noise ratio in dB per band */
277
148M
        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278
        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279
148M
        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280
148M
    }
281
282
#ifdef OPUS_CHECK_ASM
283
240k
    silk_assert( ret == ret_c );
284
240k
    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285
240k
#endif
286
287
240k
    RESTORE_STACK;
288
240k
    return( ret );
289
240k
}
silk_VAD_GetSA_Q8_sse4_1
Line
Count
Source
49
240k
{
50
240k
    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51
240k
    opus_int   decimated_framelength1, decimated_framelength2;
52
240k
    opus_int   decimated_framelength;
53
240k
    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54
240k
    opus_int32 sumSquared, smooth_coef_Q16;
55
240k
    opus_int16 HPstateTmp;
56
240k
    VARDECL( opus_int16, X );
57
240k
    opus_int32 Xnrg[ VAD_N_BANDS ];
58
240k
    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59
240k
    opus_int32 speech_nrg, x_tmp;
60
240k
    opus_int   X_offset[ VAD_N_BANDS ];
61
240k
    opus_int   ret = 0;
62
240k
    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64
240k
    SAVE_STACK;
65
66
240k
#ifdef OPUS_CHECK_ASM
67
240k
    silk_encoder_state psEncC_c;
68
240k
    opus_int ret_c;
69
70
240k
    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71
240k
    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72
240k
#endif
73
74
    /* Safety checks */
75
240k
    silk_assert( VAD_N_BANDS == 4 );
76
240k
    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77
240k
    celt_assert( psEncC->frame_length <= 512 );
78
240k
    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79
80
    /***********************/
81
    /* Filter and Decimate */
82
    /***********************/
83
240k
    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84
240k
    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85
240k
    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86
    /* Decimate into 4 bands:
87
       0       L      3L       L              3L                             5L
88
               -      --       -              --                             --
89
               8       8       2               4                              4
90
91
       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92
93
       They're arranged to allow the minimal ( frame_length / 4 ) extra
94
       scratch space during the downsampling process */
95
240k
    X_offset[ 0 ] = 0;
96
240k
    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97
240k
    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98
240k
    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99
240k
    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100
101
    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102
240k
    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103
240k
        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104
105
    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106
240k
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107
240k
        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108
109
    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110
240k
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111
240k
        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112
113
    /*********************************************/
114
    /* HP filter on lowest band (differentiator) */
115
    /*********************************************/
116
240k
    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117
240k
    HPstateTmp = X[ decimated_framelength - 1 ];
118
5.19M
    for( i = decimated_framelength - 1; i > 0; i-- ) {
119
4.95M
        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120
4.95M
        X[ i ]     -= X[ i - 1 ];
121
4.95M
    }
122
240k
    X[ 0 ] -= psSilk_VAD->HPstate;
123
240k
    psSilk_VAD->HPstate = HPstateTmp;
124
125
    /*************************************/
126
    /* Calculate the energy in each band */
127
    /*************************************/
128
1.20M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
129
        /* Find the decimated framelength in the non-uniformly divided bands */
130
961k
        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131
132
        /* Split length into subframe lengths */
133
961k
        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134
961k
        dec_subframe_offset = 0;
135
136
        /* Compute energy per sub-frame */
137
        /* initialize with summed energy of last subframe */
138
961k
        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139
4.80M
        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140
3.84M
            __m128i xmm_X, xmm_acc;
141
3.84M
            sumSquared = 0;
142
143
3.84M
            xmm_acc = _mm_setzero_si128();
144
145
7.25M
            for( i = 0; i < dec_subframe_length - 7; i += 8 )
146
3.41M
            {
147
3.41M
                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148
3.41M
                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149
3.41M
                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150
3.41M
                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151
3.41M
            }
152
153
3.84M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154
3.84M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155
156
3.84M
            sumSquared += _mm_cvtsi128_si32( xmm_acc );
157
158
17.8M
            for( ; i < dec_subframe_length; i++ ) {
159
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161
14.0M
                x_tmp = silk_RSHIFT(
162
14.0M
                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163
14.0M
                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164
165
                /* Safety check */
166
14.0M
                silk_assert( sumSquared >= 0 );
167
14.0M
            }
168
169
            /* Add/saturate summed energy of current subframe */
170
3.84M
            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171
2.88M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172
2.88M
            } else {
173
                /* Look-ahead subframe */
174
961k
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175
961k
            }
176
177
3.84M
            dec_subframe_offset += dec_subframe_length;
178
3.84M
        }
179
961k
        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180
961k
    }
181
182
    /********************/
183
    /* Noise estimation */
184
    /********************/
185
240k
    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186
187
    /***********************************************/
188
    /* Signal-plus-noise to noise ratio estimation */
189
    /***********************************************/
190
240k
    sumSquared = 0;
191
240k
    input_tilt = 0;
192
1.20M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
193
961k
        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194
961k
        if( speech_nrg > 0 ) {
195
            /* Divide, with sufficient resolution */
196
711k
            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197
546k
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198
546k
            } else {
199
164k
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200
164k
            }
201
202
            /* Convert to log domain */
203
711k
            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204
205
            /* Sum-of-squares */
206
711k
            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207
208
            /* Tilt measure */
209
711k
            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210
                /* Scale down SNR value for small subband speech energies */
211
335k
                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212
335k
            }
213
711k
            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214
711k
        } else {
215
250k
            NrgToNoiseRatio_Q8[ b ] = 256;
216
250k
        }
217
961k
    }
218
219
    /* Mean-of-squares */
220
240k
    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221
222
    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223
240k
    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224
225
    /*********************************/
226
    /* Speech Probability Estimation */
227
    /*********************************/
228
240k
    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229
230
    /**************************/
231
    /* Frequency Tilt Measure */
232
    /**************************/
233
240k
    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234
235
    /**************************************************/
236
    /* Scale the sigmoid output based on power levels */
237
    /**************************************************/
238
240k
    speech_nrg = 0;
239
1.20M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
240
        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241
961k
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242
961k
    }
243
244
240k
    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245
182k
        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246
182k
    }
247
    /* Power scaling */
248
240k
    if( speech_nrg <= 0 ) {
249
46.7k
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250
193k
    } else if( speech_nrg < 16384 ) {
251
22.0k
        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252
253
        /* square-root */
254
22.0k
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
255
22.0k
        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256
22.0k
    }
257
258
    /* Copy the resulting speech activity in Q8 */
259
240k
    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260
261
    /***********************************/
262
    /* Energy Level and SNR estimation */
263
    /***********************************/
264
    /* Smoothing coefficient */
265
240k
    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266
267
240k
    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268
58.3k
        smooth_coef_Q16 >>= 1;
269
58.3k
    }
270
271
1.20M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
272
        /* compute smoothed energy-to-noise ratio per band */
273
961k
        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274
961k
            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275
276
        /* signal to noise ratio in dB per band */
277
961k
        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278
        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279
961k
        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280
961k
    }
281
282
240k
#ifdef OPUS_CHECK_ASM
283
240k
    silk_assert( ret == ret_c );
284
240k
    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285
240k
#endif
286
287
240k
    RESTORE_STACK;
288
240k
    return( ret );
289
240k
}
silk_VAD_GetSA_Q8_sse4_1
Line
Count
Source
49
37.0M
{
50
37.0M
    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51
37.0M
    opus_int   decimated_framelength1, decimated_framelength2;
52
37.0M
    opus_int   decimated_framelength;
53
37.0M
    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54
37.0M
    opus_int32 sumSquared, smooth_coef_Q16;
55
37.0M
    opus_int16 HPstateTmp;
56
37.0M
    VARDECL( opus_int16, X );
57
37.0M
    opus_int32 Xnrg[ VAD_N_BANDS ];
58
37.0M
    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59
37.0M
    opus_int32 speech_nrg, x_tmp;
60
37.0M
    opus_int   X_offset[ VAD_N_BANDS ];
61
37.0M
    opus_int   ret = 0;
62
37.0M
    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64
37.0M
    SAVE_STACK;
65
66
#ifdef OPUS_CHECK_ASM
67
    silk_encoder_state psEncC_c;
68
    opus_int ret_c;
69
70
    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71
    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72
#endif
73
74
    /* Safety checks */
75
37.0M
    silk_assert( VAD_N_BANDS == 4 );
76
37.0M
    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77
37.0M
    celt_assert( psEncC->frame_length <= 512 );
78
37.0M
    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79
80
    /***********************/
81
    /* Filter and Decimate */
82
    /***********************/
83
37.0M
    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84
37.0M
    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85
37.0M
    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86
    /* Decimate into 4 bands:
87
       0       L      3L       L              3L                             5L
88
               -      --       -              --                             --
89
               8       8       2               4                              4
90
91
       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92
93
       They're arranged to allow the minimal ( frame_length / 4 ) extra
94
       scratch space during the downsampling process */
95
37.0M
    X_offset[ 0 ] = 0;
96
37.0M
    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97
37.0M
    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98
37.0M
    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99
37.0M
    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100
101
    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102
37.0M
    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103
37.0M
        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104
105
    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106
37.0M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107
37.0M
        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108
109
    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110
37.0M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111
37.0M
        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112
113
    /*********************************************/
114
    /* HP filter on lowest band (differentiator) */
115
    /*********************************************/
116
37.0M
    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117
37.0M
    HPstateTmp = X[ decimated_framelength - 1 ];
118
817M
    for( i = decimated_framelength - 1; i > 0; i-- ) {
119
780M
        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120
780M
        X[ i ]     -= X[ i - 1 ];
121
780M
    }
122
37.0M
    X[ 0 ] -= psSilk_VAD->HPstate;
123
37.0M
    psSilk_VAD->HPstate = HPstateTmp;
124
125
    /*************************************/
126
    /* Calculate the energy in each band */
127
    /*************************************/
128
185M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
129
        /* Find the decimated framelength in the non-uniformly divided bands */
130
148M
        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131
132
        /* Split length into subframe lengths */
133
148M
        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134
148M
        dec_subframe_offset = 0;
135
136
        /* Compute energy per sub-frame */
137
        /* initialize with summed energy of last subframe */
138
148M
        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139
740M
        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140
592M
            __m128i xmm_X, xmm_acc;
141
592M
            sumSquared = 0;
142
143
592M
            xmm_acc = _mm_setzero_si128();
144
145
1.11G
            for( i = 0; i < dec_subframe_length - 7; i += 8 )
146
526M
            {
147
526M
                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148
526M
                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149
526M
                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150
526M
                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151
526M
            }
152
153
592M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154
592M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155
156
592M
            sumSquared += _mm_cvtsi128_si32( xmm_acc );
157
158
2.84G
            for( ; i < dec_subframe_length; i++ ) {
159
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161
2.25G
                x_tmp = silk_RSHIFT(
162
2.25G
                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163
2.25G
                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164
165
                /* Safety check */
166
2.25G
                silk_assert( sumSquared >= 0 );
167
2.25G
            }
168
169
            /* Add/saturate summed energy of current subframe */
170
592M
            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171
444M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172
444M
            } else {
173
                /* Look-ahead subframe */
174
148M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175
148M
            }
176
177
592M
            dec_subframe_offset += dec_subframe_length;
178
592M
        }
179
148M
        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180
148M
    }
181
182
    /********************/
183
    /* Noise estimation */
184
    /********************/
185
37.0M
    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186
187
    /***********************************************/
188
    /* Signal-plus-noise to noise ratio estimation */
189
    /***********************************************/
190
37.0M
    sumSquared = 0;
191
37.0M
    input_tilt = 0;
192
185M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
193
148M
        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194
148M
        if( speech_nrg > 0 ) {
195
            /* Divide, with sufficient resolution */
196
4.19M
            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197
2.75M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198
2.75M
            } else {
199
1.44M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200
1.44M
            }
201
202
            /* Convert to log domain */
203
4.19M
            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204
205
            /* Sum-of-squares */
206
4.19M
            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207
208
            /* Tilt measure */
209
4.19M
            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210
                /* Scale down SNR value for small subband speech energies */
211
1.90M
                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212
1.90M
            }
213
4.19M
            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214
143M
        } else {
215
143M
            NrgToNoiseRatio_Q8[ b ] = 256;
216
143M
        }
217
148M
    }
218
219
    /* Mean-of-squares */
220
37.0M
    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221
222
    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223
37.0M
    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224
225
    /*********************************/
226
    /* Speech Probability Estimation */
227
    /*********************************/
228
37.0M
    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229
230
    /**************************/
231
    /* Frequency Tilt Measure */
232
    /**************************/
233
37.0M
    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234
235
    /**************************************************/
236
    /* Scale the sigmoid output based on power levels */
237
    /**************************************************/
238
37.0M
    speech_nrg = 0;
239
185M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
240
        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241
148M
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242
148M
    }
243
244
37.0M
    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245
22.1M
        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246
22.1M
    }
247
    /* Power scaling */
248
37.0M
    if( speech_nrg <= 0 ) {
249
35.8M
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250
35.8M
    } else if( speech_nrg < 16384 ) {
251
156k
        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252
253
        /* square-root */
254
156k
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
255
156k
        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256
156k
    }
257
258
    /* Copy the resulting speech activity in Q8 */
259
37.0M
    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260
261
    /***********************************/
262
    /* Energy Level and SNR estimation */
263
    /***********************************/
264
    /* Smoothing coefficient */
265
37.0M
    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266
267
37.0M
    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268
14.8M
        smooth_coef_Q16 >>= 1;
269
14.8M
    }
270
271
185M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
272
        /* compute smoothed energy-to-noise ratio per band */
273
148M
        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274
148M
            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275
276
        /* signal to noise ratio in dB per band */
277
148M
        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278
        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279
148M
        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280
148M
    }
281
282
#ifdef OPUS_CHECK_ASM
283
    silk_assert( ret == ret_c );
284
    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285
#endif
286
287
37.0M
    RESTORE_STACK;
288
37.0M
    return( ret );
289
37.0M
}