Coverage Report

Created: 2026-03-19 07:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/opus/silk/x86/VAD_sse4_1.c
Line
Count
Source
1
/* Copyright (c) 2014-2020, Cisco Systems, INC
2
   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
3
4
   Redistribution and use in source and binary forms, with or without
5
   modification, are permitted provided that the following conditions
6
   are met:
7
8
   - Redistributions of source code must retain the above copyright
9
   notice, this list of conditions and the following disclaimer.
10
11
   - Redistributions in binary form must reproduce the above copyright
12
   notice, this list of conditions and the following disclaimer in the
13
   documentation and/or other materials provided with the distribution.
14
15
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
#ifdef HAVE_CONFIG_H
29
#include "config.h"
30
#endif
31
32
#include <xmmintrin.h>
33
#include <emmintrin.h>
34
#include <smmintrin.h>
35
36
#include "main.h"
37
#include "stack_alloc.h"
38
39
/* Weighting factors for tilt measure */
40
static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
41
42
/***************************************/
43
/* Get the speech activity level in Q8 */
44
/***************************************/
45
opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if success                  */
46
    silk_encoder_state          *psEncC,            /* I/O  Encoder state                               */
47
    const opus_int16            pIn[]               /* I    PCM input                                   */
48
)
49
42.7M
{
50
42.7M
    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51
42.7M
    opus_int   decimated_framelength1, decimated_framelength2;
52
42.7M
    opus_int   decimated_framelength;
53
42.7M
    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54
42.7M
    opus_int32 sumSquared, smooth_coef_Q16;
55
42.7M
    opus_int16 HPstateTmp;
56
42.7M
    VARDECL( opus_int16, X );
57
42.7M
    opus_int32 Xnrg[ VAD_N_BANDS ];
58
42.7M
    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59
42.7M
    opus_int32 speech_nrg, x_tmp;
60
42.7M
    opus_int   X_offset[ VAD_N_BANDS ];
61
42.7M
    opus_int   ret = 0;
62
42.7M
    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64
42.7M
    SAVE_STACK;
65
66
#ifdef OPUS_CHECK_ASM
67
    silk_encoder_state psEncC_c;
68
42.4M
    opus_int ret_c;
69
70
42.4M
    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71
    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72
#endif
73
74
    /* Safety checks */
75
42.7M
    silk_assert( VAD_N_BANDS == 4 );
76
42.7M
    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77
42.7M
    celt_assert( psEncC->frame_length <= 512 );
78
42.7M
    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79
80
    /***********************/
81
    /* Filter and Decimate */
82
    /***********************/
83
42.7M
    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84
42.7M
    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85
42.7M
    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86
    /* Decimate into 4 bands:
87
       0       L      3L       L              3L                             5L
88
               -      --       -              --                             --
89
               8       8       2               4                              4
90
91
       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92
93
       They're arranged to allow the minimal ( frame_length / 4 ) extra
94
       scratch space during the downsampling process */
95
42.7M
    X_offset[ 0 ] = 0;
96
42.7M
    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97
42.7M
    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98
42.7M
    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99
42.7M
    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100
101
    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102
42.7M
    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103
42.7M
        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104
105
    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106
42.7M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107
42.7M
        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108
109
    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110
42.7M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111
42.7M
        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112
113
    /*********************************************/
114
    /* HP filter on lowest band (differentiator) */
115
    /*********************************************/
116
42.7M
    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117
42.7M
    HPstateTmp = X[ decimated_framelength - 1 ];
118
834M
    for( i = decimated_framelength - 1; i > 0; i-- ) {
119
791M
        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120
791M
        X[ i ]     -= X[ i - 1 ];
121
791M
    }
122
42.7M
    X[ 0 ] -= psSilk_VAD->HPstate;
123
42.7M
    psSilk_VAD->HPstate = HPstateTmp;
124
125
    /*************************************/
126
    /* Calculate the energy in each band */
127
    /*************************************/
128
213M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
129
        /* Find the decimated framelength in the non-uniformly divided bands */
130
170M
        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131
132
        /* Split length into subframe lengths */
133
170M
        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134
170M
        dec_subframe_offset = 0;
135
136
        /* Compute energy per sub-frame */
137
        /* initialize with summed energy of last subframe */
138
170M
        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139
854M
        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140
683M
            __m128i xmm_X, xmm_acc;
141
683M
            sumSquared = 0;
142
143
683M
            xmm_acc = _mm_setzero_si128();
144
145
1.17G
            for( i = 0; i < dec_subframe_length - 7; i += 8 )
146
493M
            {
147
493M
                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148
493M
                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149
493M
                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150
493M
                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151
493M
            }
152
153
683M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154
683M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155
156
683M
            sumSquared += _mm_cvtsi128_si32( xmm_acc );
157
158
3.33G
            for( ; i < dec_subframe_length; i++ ) {
159
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161
2.65G
                x_tmp = silk_RSHIFT(
162
2.65G
                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163
2.65G
                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164
165
                /* Safety check */
166
2.65G
                silk_assert( sumSquared >= 0 );
167
2.65G
            }
168
169
            /* Add/saturate summed energy of current subframe */
170
683M
            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171
512M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172
512M
            } else {
173
                /* Look-ahead subframe */
174
170M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175
170M
            }
176
177
683M
            dec_subframe_offset += dec_subframe_length;
178
683M
        }
179
170M
        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180
170M
    }
181
182
    /********************/
183
    /* Noise estimation */
184
    /********************/
185
42.7M
    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186
187
    /***********************************************/
188
    /* Signal-plus-noise to noise ratio estimation */
189
    /***********************************************/
190
231k
    sumSquared = 0;
191
231k
    input_tilt = 0;
192
213M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
193
170M
        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194
170M
        if( speech_nrg > 0 ) {
195
            /* Divide, with sufficient resolution */
196
4.04M
            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197
2.73M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198
2.73M
            } else {
199
1.30M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200
1.30M
            }
201
202
            /* Convert to log domain */
203
4.04M
            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204
205
            /* Sum-of-squares */
206
4.04M
            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207
208
            /* Tilt measure */
209
4.04M
            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210
                /* Scale down SNR value for small subband speech energies */
211
1.77M
                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212
1.77M
            }
213
4.04M
            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214
166M
        } else {
215
166M
            NrgToNoiseRatio_Q8[ b ] = 256;
216
166M
        }
217
170M
    }
218
219
    /* Mean-of-squares */
220
42.7M
    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221
222
    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223
231k
    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224
225
    /*********************************/
226
    /* Speech Probability Estimation */
227
    /*********************************/
228
42.7M
    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229
230
    /**************************/
231
    /* Frequency Tilt Measure */
232
    /**************************/
233
42.7M
    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234
235
    /**************************************************/
236
    /* Scale the sigmoid output based on power levels */
237
    /**************************************************/
238
231k
    speech_nrg = 0;
239
213M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
240
        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241
170M
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242
170M
    }
243
244
42.7M
    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245
29.2M
        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246
29.2M
    }
247
    /* Power scaling */
248
42.7M
    if( speech_nrg <= 0 ) {
249
41.5M
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250
41.5M
    } else if( speech_nrg < 16384 ) {
251
135k
        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252
253
        /* square-root */
254
135k
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
255
135k
        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256
135k
    }
257
258
    /* Copy the resulting speech activity in Q8 */
259
42.7M
    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260
261
    /***********************************/
262
    /* Energy Level and SNR estimation */
263
    /***********************************/
264
    /* Smoothing coefficient */
265
42.7M
    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266
267
42.7M
    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268
13.5M
        smooth_coef_Q16 >>= 1;
269
13.5M
    }
270
271
213M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
272
        /* compute smoothed energy-to-noise ratio per band */
273
170M
        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274
170M
            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275
276
        /* signal to noise ratio in dB per band */
277
170M
        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278
        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279
170M
        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280
170M
    }
281
282
#ifdef OPUS_CHECK_ASM
283
42.4M
    silk_assert( ret == ret_c );
284
42.4M
    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285
42.4M
#endif
286
287
42.4M
    RESTORE_STACK;
288
42.4M
    return( ret );
289
42.4M
}
silk_VAD_GetSA_Q8_sse4_1
Line
Count
Source
49
42.4M
{
50
42.4M
    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51
42.4M
    opus_int   decimated_framelength1, decimated_framelength2;
52
42.4M
    opus_int   decimated_framelength;
53
42.4M
    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54
42.4M
    opus_int32 sumSquared, smooth_coef_Q16;
55
42.4M
    opus_int16 HPstateTmp;
56
42.4M
    VARDECL( opus_int16, X );
57
42.4M
    opus_int32 Xnrg[ VAD_N_BANDS ];
58
42.4M
    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59
42.4M
    opus_int32 speech_nrg, x_tmp;
60
42.4M
    opus_int   X_offset[ VAD_N_BANDS ];
61
42.4M
    opus_int   ret = 0;
62
42.4M
    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64
42.4M
    SAVE_STACK;
65
66
42.4M
#ifdef OPUS_CHECK_ASM
67
42.4M
    silk_encoder_state psEncC_c;
68
42.4M
    opus_int ret_c;
69
70
42.4M
    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71
42.4M
    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72
42.4M
#endif
73
74
    /* Safety checks */
75
42.4M
    silk_assert( VAD_N_BANDS == 4 );
76
42.4M
    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77
42.4M
    celt_assert( psEncC->frame_length <= 512 );
78
42.4M
    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79
80
    /***********************/
81
    /* Filter and Decimate */
82
    /***********************/
83
42.4M
    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84
42.4M
    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85
42.4M
    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86
    /* Decimate into 4 bands:
87
       0       L      3L       L              3L                             5L
88
               -      --       -              --                             --
89
               8       8       2               4                              4
90
91
       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92
93
       They're arranged to allow the minimal ( frame_length / 4 ) extra
94
       scratch space during the downsampling process */
95
42.4M
    X_offset[ 0 ] = 0;
96
42.4M
    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97
42.4M
    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98
42.4M
    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99
42.4M
    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100
101
    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102
42.4M
    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103
42.4M
        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104
105
    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106
42.4M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107
42.4M
        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108
109
    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110
42.4M
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111
42.4M
        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112
113
    /*********************************************/
114
    /* HP filter on lowest band (differentiator) */
115
    /*********************************************/
116
42.4M
    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117
42.4M
    HPstateTmp = X[ decimated_framelength - 1 ];
118
829M
    for( i = decimated_framelength - 1; i > 0; i-- ) {
119
786M
        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120
786M
        X[ i ]     -= X[ i - 1 ];
121
786M
    }
122
42.4M
    X[ 0 ] -= psSilk_VAD->HPstate;
123
42.4M
    psSilk_VAD->HPstate = HPstateTmp;
124
125
    /*************************************/
126
    /* Calculate the energy in each band */
127
    /*************************************/
128
212M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
129
        /* Find the decimated framelength in the non-uniformly divided bands */
130
169M
        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131
132
        /* Split length into subframe lengths */
133
169M
        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134
169M
        dec_subframe_offset = 0;
135
136
        /* Compute energy per sub-frame */
137
        /* initialize with summed energy of last subframe */
138
169M
        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139
849M
        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140
679M
            __m128i xmm_X, xmm_acc;
141
679M
            sumSquared = 0;
142
143
679M
            xmm_acc = _mm_setzero_si128();
144
145
1.17G
            for( i = 0; i < dec_subframe_length - 7; i += 8 )
146
490M
            {
147
490M
                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148
490M
                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149
490M
                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150
490M
                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151
490M
            }
152
153
679M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154
679M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155
156
679M
            sumSquared += _mm_cvtsi128_si32( xmm_acc );
157
158
3.31G
            for( ; i < dec_subframe_length; i++ ) {
159
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161
2.63G
                x_tmp = silk_RSHIFT(
162
2.63G
                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163
2.63G
                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164
165
                /* Safety check */
166
2.63G
                silk_assert( sumSquared >= 0 );
167
2.63G
            }
168
169
            /* Add/saturate summed energy of current subframe */
170
679M
            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171
509M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172
509M
            } else {
173
                /* Look-ahead subframe */
174
169M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175
169M
            }
176
177
679M
            dec_subframe_offset += dec_subframe_length;
178
679M
        }
179
169M
        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180
169M
    }
181
182
    /********************/
183
    /* Noise estimation */
184
    /********************/
185
42.4M
    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186
187
    /***********************************************/
188
    /* Signal-plus-noise to noise ratio estimation */
189
    /***********************************************/
190
42.4M
    sumSquared = 0;
191
42.4M
    input_tilt = 0;
192
212M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
193
169M
        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194
169M
        if( speech_nrg > 0 ) {
195
            /* Divide, with sufficient resolution */
196
3.35M
            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197
2.20M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198
2.20M
            } else {
199
1.15M
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200
1.15M
            }
201
202
            /* Convert to log domain */
203
3.35M
            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204
205
            /* Sum-of-squares */
206
3.35M
            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207
208
            /* Tilt measure */
209
3.35M
            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210
                /* Scale down SNR value for small subband speech energies */
211
1.44M
                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212
1.44M
            }
213
3.35M
            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214
166M
        } else {
215
166M
            NrgToNoiseRatio_Q8[ b ] = 256;
216
166M
        }
217
169M
    }
218
219
    /* Mean-of-squares */
220
42.4M
    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221
222
    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223
42.4M
    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224
225
    /*********************************/
226
    /* Speech Probability Estimation */
227
    /*********************************/
228
42.4M
    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229
230
    /**************************/
231
    /* Frequency Tilt Measure */
232
    /**************************/
233
42.4M
    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234
235
    /**************************************************/
236
    /* Scale the sigmoid output based on power levels */
237
    /**************************************************/
238
42.4M
    speech_nrg = 0;
239
212M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
240
        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241
169M
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242
169M
    }
243
244
42.4M
    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245
29.0M
        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246
29.0M
    }
247
    /* Power scaling */
248
42.4M
    if( speech_nrg <= 0 ) {
249
41.5M
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250
41.5M
    } else if( speech_nrg < 16384 ) {
251
111k
        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252
253
        /* square-root */
254
111k
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
255
111k
        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256
111k
    }
257
258
    /* Copy the resulting speech activity in Q8 */
259
42.4M
    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260
261
    /***********************************/
262
    /* Energy Level and SNR estimation */
263
    /***********************************/
264
    /* Smoothing coefficient */
265
42.4M
    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266
267
42.4M
    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268
13.4M
        smooth_coef_Q16 >>= 1;
269
13.4M
    }
270
271
212M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
272
        /* compute smoothed energy-to-noise ratio per band */
273
169M
        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274
169M
            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275
276
        /* signal to noise ratio in dB per band */
277
169M
        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278
        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279
169M
        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280
169M
    }
281
282
42.4M
#ifdef OPUS_CHECK_ASM
283
42.4M
    silk_assert( ret == ret_c );
284
42.4M
    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285
42.4M
#endif
286
287
42.4M
    RESTORE_STACK;
288
42.4M
    return( ret );
289
42.4M
}
silk_VAD_GetSA_Q8_sse4_1
Line
Count
Source
49
231k
{
50
231k
    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51
231k
    opus_int   decimated_framelength1, decimated_framelength2;
52
231k
    opus_int   decimated_framelength;
53
231k
    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54
231k
    opus_int32 sumSquared, smooth_coef_Q16;
55
231k
    opus_int16 HPstateTmp;
56
231k
    VARDECL( opus_int16, X );
57
231k
    opus_int32 Xnrg[ VAD_N_BANDS ];
58
231k
    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59
231k
    opus_int32 speech_nrg, x_tmp;
60
231k
    opus_int   X_offset[ VAD_N_BANDS ];
61
231k
    opus_int   ret = 0;
62
231k
    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64
231k
    SAVE_STACK;
65
66
#ifdef OPUS_CHECK_ASM
67
    silk_encoder_state psEncC_c;
68
    opus_int ret_c;
69
70
    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71
    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72
#endif
73
74
    /* Safety checks */
75
231k
    silk_assert( VAD_N_BANDS == 4 );
76
231k
    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77
231k
    celt_assert( psEncC->frame_length <= 512 );
78
231k
    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79
80
    /***********************/
81
    /* Filter and Decimate */
82
    /***********************/
83
231k
    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84
231k
    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85
231k
    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86
    /* Decimate into 4 bands:
87
       0       L      3L       L              3L                             5L
88
               -      --       -              --                             --
89
               8       8       2               4                              4
90
91
       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92
93
       They're arranged to allow the minimal ( frame_length / 4 ) extra
94
       scratch space during the downsampling process */
95
231k
    X_offset[ 0 ] = 0;
96
231k
    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97
231k
    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98
231k
    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99
231k
    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100
101
    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102
231k
    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103
231k
        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104
105
    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106
231k
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107
231k
        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108
109
    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110
231k
    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111
231k
        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112
113
    /*********************************************/
114
    /* HP filter on lowest band (differentiator) */
115
    /*********************************************/
116
231k
    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117
231k
    HPstateTmp = X[ decimated_framelength - 1 ];
118
5.10M
    for( i = decimated_framelength - 1; i > 0; i-- ) {
119
4.86M
        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120
4.86M
        X[ i ]     -= X[ i - 1 ];
121
4.86M
    }
122
231k
    X[ 0 ] -= psSilk_VAD->HPstate;
123
231k
    psSilk_VAD->HPstate = HPstateTmp;
124
125
    /*************************************/
126
    /* Calculate the energy in each band */
127
    /*************************************/
128
1.15M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
129
        /* Find the decimated framelength in the non-uniformly divided bands */
130
926k
        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131
132
        /* Split length into subframe lengths */
133
926k
        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134
926k
        dec_subframe_offset = 0;
135
136
        /* Compute energy per sub-frame */
137
        /* initialize with summed energy of last subframe */
138
926k
        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139
4.63M
        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140
3.70M
            __m128i xmm_X, xmm_acc;
141
3.70M
            sumSquared = 0;
142
143
3.70M
            xmm_acc = _mm_setzero_si128();
144
145
7.05M
            for( i = 0; i < dec_subframe_length - 7; i += 8 )
146
3.34M
            {
147
3.34M
                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148
3.34M
                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149
3.34M
                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150
3.34M
                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151
3.34M
            }
152
153
3.70M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154
3.70M
            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155
156
3.70M
            sumSquared += _mm_cvtsi128_si32( xmm_acc );
157
158
17.4M
            for( ; i < dec_subframe_length; i++ ) {
159
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161
13.7M
                x_tmp = silk_RSHIFT(
162
13.7M
                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163
13.7M
                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164
165
                /* Safety check */
166
13.7M
                silk_assert( sumSquared >= 0 );
167
13.7M
            }
168
169
            /* Add/saturate summed energy of current subframe */
170
3.70M
            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171
2.77M
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172
2.77M
            } else {
173
                /* Look-ahead subframe */
174
926k
                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175
926k
            }
176
177
3.70M
            dec_subframe_offset += dec_subframe_length;
178
3.70M
        }
179
926k
        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180
926k
    }
181
182
    /********************/
183
    /* Noise estimation */
184
    /********************/
185
231k
    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186
187
    /***********************************************/
188
    /* Signal-plus-noise to noise ratio estimation */
189
    /***********************************************/
190
231k
    sumSquared = 0;
191
231k
    input_tilt = 0;
192
1.15M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
193
926k
        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194
926k
        if( speech_nrg > 0 ) {
195
            /* Divide, with sufficient resolution */
196
685k
            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197
531k
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198
531k
            } else {
199
153k
                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200
153k
            }
201
202
            /* Convert to log domain */
203
685k
            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204
205
            /* Sum-of-squares */
206
685k
            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207
208
            /* Tilt measure */
209
685k
            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210
                /* Scale down SNR value for small subband speech energies */
211
336k
                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212
336k
            }
213
685k
            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214
685k
        } else {
215
240k
            NrgToNoiseRatio_Q8[ b ] = 256;
216
240k
        }
217
926k
    }
218
219
    /* Mean-of-squares */
220
231k
    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221
222
    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223
231k
    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224
225
    /*********************************/
226
    /* Speech Probability Estimation */
227
    /*********************************/
228
231k
    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229
230
    /**************************/
231
    /* Frequency Tilt Measure */
232
    /**************************/
233
231k
    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234
235
    /**************************************************/
236
    /* Scale the sigmoid output based on power levels */
237
    /**************************************************/
238
231k
    speech_nrg = 0;
239
1.15M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
240
        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241
926k
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242
926k
    }
243
244
231k
    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245
176k
        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246
176k
    }
247
    /* Power scaling */
248
231k
    if( speech_nrg <= 0 ) {
249
44.5k
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250
187k
    } else if( speech_nrg < 16384 ) {
251
23.7k
        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252
253
        /* square-root */
254
23.7k
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
255
23.7k
        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256
23.7k
    }
257
258
    /* Copy the resulting speech activity in Q8 */
259
231k
    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260
261
    /***********************************/
262
    /* Energy Level and SNR estimation */
263
    /***********************************/
264
    /* Smoothing coefficient */
265
231k
    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266
267
231k
    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268
54.8k
        smooth_coef_Q16 >>= 1;
269
54.8k
    }
270
271
1.15M
    for( b = 0; b < VAD_N_BANDS; b++ ) {
272
        /* compute smoothed energy-to-noise ratio per band */
273
926k
        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274
926k
            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275
276
        /* signal to noise ratio in dB per band */
277
926k
        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278
        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279
926k
        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280
926k
    }
281
282
#ifdef OPUS_CHECK_ASM
283
    silk_assert( ret == ret_c );
284
    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285
#endif
286
287
231k
    RESTORE_STACK;
288
231k
    return( ret );
289
231k
}