Coverage Report

Created: 2026-03-19 07:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/opus/src/analysis.c
Line
Count
Source
1
/* Copyright (c) 2011 Xiph.Org Foundation
2
   Written by Jean-Marc Valin */
3
/*
4
   Redistribution and use in source and binary forms, with or without
5
   modification, are permitted provided that the following conditions
6
   are met:
7
8
   - Redistributions of source code must retain the above copyright
9
   notice, this list of conditions and the following disclaimer.
10
11
   - Redistributions in binary form must reproduce the above copyright
12
   notice, this list of conditions and the following disclaimer in the
13
   documentation and/or other materials provided with the distribution.
14
15
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
#ifdef HAVE_CONFIG_H
29
#include "config.h"
30
#endif
31
32
#define ANALYSIS_C
33
34
#ifdef MLP_TRAINING
35
#include <stdio.h>
36
#endif
37
38
#include "mathops.h"
39
#include "kiss_fft.h"
40
#include "celt.h"
41
#include "modes.h"
42
#include "arch.h"
43
#include "quant_bands.h"
44
#include "analysis.h"
45
#include "mlp.h"
46
#include "stack_alloc.h"
47
#include "float_cast.h"
48
49
#ifndef M_PI
50
#define M_PI 3.141592653
51
#endif
52
53
#ifndef DISABLE_FLOAT_API
54
55
#define TRANSITION_PENALTY 10
56
57
static const float dct_table[128] = {
58
        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
59
        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
60
        0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
61
       -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
62
        0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
63
       -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
64
        0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
65
        0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
66
        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
67
        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
68
        0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
69
       -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
70
        0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
71
       -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
72
        0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
73
        0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
74
};
75
76
static const float analysis_window[240] = {
77
      0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
78
      0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
79
      0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
80
      0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
81
      0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
82
      0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
83
      0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
84
      0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
85
      0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
86
      0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
87
      0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
88
      0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
89
      0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
90
      0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
91
      0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
92
      0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
93
      0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
94
      0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
95
      0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
96
      0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
97
      0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
98
      0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
99
      0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
100
      0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
101
      0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
102
      0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
103
      0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
104
      0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
105
      0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
106
      0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
107
};
108
109
static const int tbands[NB_TBANDS+1] = {
110
      4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
111
};
112
113
5.85M
#define NB_TONAL_SKIP_BANDS 9
114
115
static opus_val32 silk_resampler_down2_hp(
116
    opus_val32                  *S,                 /* I/O  State vector [ 2 ]                                          */
117
    opus_val32                  *out,               /* O    Output signal [ floor(len/2) ]                              */
118
    const opus_val32            *in,                /* I    Input signal [ len ]                                        */
119
    int                         inLen               /* I    Number of input samples                                     */
120
)
121
6.02M
{
122
6.02M
    int k, len2 = inLen/2;
123
6.02M
    opus_val32 in32, out32, out32_hp, Y, X;
124
6.02M
    opus_val64 hp_ener = 0;
125
    /* Internal variables and state are in Q10 format */
126
1.28G
    for( k = 0; k < len2; k++ ) {
127
        /* Convert to Q10 */
128
1.27G
        in32 = in[ 2 * k ];
129
130
        /* All-pass section for even input sample */
131
1.27G
        Y      = SUB32( in32, S[ 0 ] );
132
1.27G
        X      = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
133
1.27G
        out32  = ADD32( S[ 0 ], X );
134
1.27G
        S[ 0 ] = ADD32( in32, X );
135
1.27G
        out32_hp = out32;
136
        /* Convert to Q10 */
137
1.27G
        in32 = in[ 2 * k + 1 ];
138
139
        /* All-pass section for odd input sample, and add to output of previous section */
140
1.27G
        Y      = SUB32( in32, S[ 1 ] );
141
1.27G
        X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
142
1.27G
        out32  = ADD32( out32, S[ 1 ] );
143
1.27G
        out32  = ADD32( out32, X );
144
1.27G
        S[ 1 ] = ADD32( in32, X );
145
146
1.27G
        Y      = SUB32( -in32, S[ 2 ] );
147
1.27G
        X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
148
1.27G
        out32_hp  = ADD32( out32_hp, S[ 2 ] );
149
1.27G
        out32_hp  = ADD32( out32_hp, X );
150
1.27G
        S[ 2 ] = ADD32( -in32, X );
151
152
        /* len2 can be up to 480, so we shift by 8 to make it fit. */
153
1.27G
        hp_ener += SHR64(out32_hp*(opus_val64)out32_hp, 8);
154
        /* Add, convert back to int16 and store to output */
155
1.27G
        out[ k ] = HALF32(out32);
156
1.27G
    }
157
6.02M
#ifdef FIXED_POINT
158
    /* Fitting in 32 bits. */
159
6.02M
    hp_ener = hp_ener >> (2*SIG_SHIFT);
160
6.02M
    if (hp_ener > 2147483647) hp_ener = 2147483647;
161
6.02M
#endif
162
6.02M
    return (opus_val32)hp_ener;
163
6.02M
}
164
165
static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
166
12.9M
{
167
12.9M
   VARDECL(opus_val32, tmp);
168
12.9M
   int j;
169
12.9M
   opus_val32 ret = 0;
170
12.9M
   SAVE_STACK;
171
172
12.9M
   if (subframe==0) return 0;
173
9.02M
   if (Fs == 48000)
174
1.63M
   {
175
1.63M
      subframe *= 2;
176
1.63M
      offset *= 2;
177
7.38M
   } else if (Fs == 16000) {
178
4.38M
      subframe = subframe*2/3;
179
4.38M
      offset = offset*2/3;
180
4.38M
   }
181
9.02M
   else if (Fs != 24000) celt_assert(0);
182
9.02M
   ALLOC(tmp, subframe, opus_val32);
183
184
9.02M
   downmix(_x, tmp, subframe, offset, c1, c2, C);
185
9.02M
   if ((c2==-2 && C==2) || c2>-1) {
186
534M
      for (j=0;j<subframe;j++) {
187
532M
         tmp[j] = HALF32(tmp[j]);
188
532M
      }
189
1.95M
   }
190
9.02M
   if (Fs == 48000)
191
1.63M
   {
192
1.63M
      ret = silk_resampler_down2_hp(S, y, tmp, subframe);
193
7.38M
   } else if (Fs == 24000) {
194
3.00M
      OPUS_COPY(y, tmp, subframe);
195
4.38M
   } else if (Fs == 16000) {
196
4.38M
      VARDECL(opus_val32, tmp3x);
197
4.38M
      ALLOC(tmp3x, 3*subframe, opus_val32);
198
      /* Don't do this at home! This resampler is horrible and it's only (barely)
199
         usable for the purpose of the analysis because we don't care about all
200
         the aliasing between 8 kHz and 12 kHz. */
201
559M
      for (j=0;j<subframe;j++)
202
554M
      {
203
554M
         tmp3x[3*j] = tmp[j];
204
554M
         tmp3x[3*j+1] = tmp[j];
205
554M
         tmp3x[3*j+2] = tmp[j];
206
554M
      }
207
4.38M
      silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
208
4.38M
   }
209
9.02M
   RESTORE_STACK;
210
#ifndef FIXED_POINT
211
   ret *= 1.f/32768/32768;
212
#endif
213
9.02M
   return ret;
214
9.02M
}
215
216
void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
217
4.60k
{
218
  /* Initialize reusable fields. */
219
4.60k
  tonal->arch = opus_select_arch();
220
4.60k
  tonal->Fs = Fs;
221
  /* Clear remaining fields. */
222
4.60k
  tonality_analysis_reset(tonal);
223
4.60k
}
224
225
void tonality_analysis_reset(TonalityAnalysisState *tonal)
226
4.60k
{
227
  /* Clear non-reusable fields. */
228
4.60k
  char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
229
4.60k
  OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
230
4.60k
}
231
232
void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
233
9.53M
{
234
9.53M
   int pos;
235
9.53M
   int curr_lookahead;
236
9.53M
   float tonality_max;
237
9.53M
   float tonality_avg;
238
9.53M
   int tonality_count;
239
9.53M
   int i;
240
9.53M
   int pos0;
241
9.53M
   float prob_avg;
242
9.53M
   float prob_count;
243
9.53M
   float prob_min, prob_max;
244
9.53M
   float vad_prob;
245
9.53M
   int mpos, vpos;
246
9.53M
   int bandwidth_span;
247
248
9.53M
   pos = tonal->read_pos;
249
9.53M
   curr_lookahead = tonal->write_pos-tonal->read_pos;
250
9.53M
   if (curr_lookahead<0)
251
100k
      curr_lookahead += DETECT_SIZE;
252
253
9.53M
   tonal->read_subframe += len/(tonal->Fs/400);
254
15.4M
   while (tonal->read_subframe>=8)
255
5.96M
   {
256
5.96M
      tonal->read_subframe -= 8;
257
5.96M
      tonal->read_pos++;
258
5.96M
   }
259
9.53M
   if (tonal->read_pos>=DETECT_SIZE)
260
59.0k
      tonal->read_pos-=DETECT_SIZE;
261
262
   /* On long frames, look at the second analysis window rather than the first. */
263
9.53M
   if (len > tonal->Fs/50 && pos != tonal->write_pos)
264
682k
   {
265
682k
      pos++;
266
682k
      if (pos==DETECT_SIZE)
267
2.70k
         pos=0;
268
682k
   }
269
9.53M
   if (pos == tonal->write_pos)
270
5.12M
      pos--;
271
9.53M
   if (pos<0)
272
51.6k
      pos = DETECT_SIZE-1;
273
9.53M
   pos0 = pos;
274
9.53M
   OPUS_COPY(info_out, &tonal->info[pos], 1);
275
9.53M
   if (!info_out->valid)
276
419k
      return;
277
9.11M
   tonality_max = tonality_avg = info_out->tonality;
278
9.11M
   tonality_count = 1;
279
   /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */
280
9.11M
   bandwidth_span = 6;
281
   /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
282
12.4M
   for (i=0;i<3;i++)
283
11.9M
   {
284
11.9M
      pos++;
285
11.9M
      if (pos==DETECT_SIZE)
286
125k
         pos = 0;
287
11.9M
      if (pos == tonal->write_pos)
288
8.62M
         break;
289
3.29M
      tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
290
3.29M
      tonality_avg += tonal->info[pos].tonality;
291
3.29M
      tonality_count++;
292
3.29M
      info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
293
3.29M
      bandwidth_span--;
294
3.29M
   }
295
9.11M
   pos = pos0;
296
   /* Look back in time to see if any has a wider bandwidth than the current frame. */
297
60.4M
   for (i=0;i<bandwidth_span;i++)
298
51.3M
   {
299
51.3M
      pos--;
300
51.3M
      if (pos < 0)
301
521k
         pos = DETECT_SIZE-1;
302
51.3M
      if (pos == tonal->write_pos)
303
0
         break;
304
51.3M
      info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
305
51.3M
   }
306
9.11M
   info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
307
308
9.11M
   mpos = vpos = pos0;
309
   /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
310
      ~1 frame delay in the VAD prob. */
311
9.11M
   if (curr_lookahead > 15)
312
0
   {
313
0
      mpos += 5;
314
0
      if (mpos>=DETECT_SIZE)
315
0
         mpos -= DETECT_SIZE;
316
0
      vpos += 1;
317
0
      if (vpos>=DETECT_SIZE)
318
0
         vpos -= DETECT_SIZE;
319
0
   }
320
321
   /* The following calculations attempt to minimize a "badness function"
322
      for the transition. When switching from speech to music, the badness
323
      of switching at frame k is
324
      b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
325
      where
326
      v_i is the activity probability (VAD) at frame i,
327
      p_i is the music probability at frame i
328
      T is the probability threshold for switching
329
      S is the penalty for switching during active audio rather than silence
330
      the current frame has index i=0
331
332
      Rather than apply badness to directly decide when to switch, what we compute
333
      instead is the threshold for which the optimal switching point is now. When
334
      considering whether to switch now (frame 0) or at frame k, we have:
335
      S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
336
      which gives us:
337
      T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
338
      We take the min threshold across all positive values of k (up to the maximum
339
      amount of lookahead we have) to give us the threshold for which the current
340
      frame is the optimal switch point.
341
342
      The last step is that we need to consider whether we want to switch at all.
343
      For that we use the average of the music probability over the entire window.
344
      If the threshold is higher than that average we're not going to
345
      switch, so we compute a min with the average as well. The result of all these
346
      min operations is music_prob_min, which gives the threshold for switching to music
347
      if we're currently encoding for speech.
348
349
      We do the exact opposite to compute music_prob_max which is used for switching
350
      from music to speech.
351
    */
352
9.11M
   prob_min = 1.f;
353
9.11M
   prob_max = 0.f;
354
9.11M
   vad_prob = tonal->info[vpos].activity_probability;
355
9.11M
   prob_count = MAX16(.1f, vad_prob);
356
9.11M
   prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
357
12.7M
   while (1)
358
12.7M
   {
359
12.7M
      float pos_vad;
360
12.7M
      mpos++;
361
12.7M
      if (mpos==DETECT_SIZE)
362
142k
         mpos = 0;
363
12.7M
      if (mpos == tonal->write_pos)
364
9.11M
         break;
365
3.59M
      vpos++;
366
3.59M
      if (vpos==DETECT_SIZE)
367
19.4k
         vpos = 0;
368
3.59M
      if (vpos == tonal->write_pos)
369
0
         break;
370
3.59M
      pos_vad = tonal->info[vpos].activity_probability;
371
3.59M
      prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
372
3.59M
      prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
373
3.59M
      prob_count += MAX16(.1f, pos_vad);
374
3.59M
      prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
375
3.59M
   }
376
9.11M
   info_out->music_prob = prob_avg/prob_count;
377
9.11M
   prob_min = MIN16(prob_avg/prob_count, prob_min);
378
9.11M
   prob_max = MAX16(prob_avg/prob_count, prob_max);
379
9.11M
   prob_min = MAX16(prob_min, 0.f);
380
9.11M
   prob_max = MIN16(prob_max, 1.f);
381
382
   /* If we don't have enough look-ahead, do our best to make a decent decision. */
383
9.11M
   if (curr_lookahead < 10)
384
9.11M
   {
385
9.11M
      float pmin, pmax;
386
9.11M
      pmin = prob_min;
387
9.11M
      pmax = prob_max;
388
9.11M
      pos = pos0;
389
      /* Look for min/max in the past. */
390
86.0M
      for (i=0;i<IMIN(tonal->count-1, 15);i++)
391
76.8M
      {
392
76.8M
         pos--;
393
76.8M
         if (pos < 0)
394
764k
            pos = DETECT_SIZE-1;
395
76.8M
         pmin = MIN16(pmin, tonal->info[pos].music_prob);
396
76.8M
         pmax = MAX16(pmax, tonal->info[pos].music_prob);
397
76.8M
      }
398
      /* Bias against switching on active audio. */
399
9.11M
      pmin = MAX16(0.f, pmin - .1f*vad_prob);
400
9.11M
      pmax = MIN16(1.f, pmax + .1f*vad_prob);
401
9.11M
      prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
402
9.11M
      prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
403
9.11M
   }
404
9.11M
   info_out->music_prob_min = prob_min;
405
9.11M
   info_out->music_prob_max = prob_max;
406
407
   /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
408
9.11M
}
409
410
static const float std_feature_bias[9] = {
411
      5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
412
      2.163313f, 1.260756f, 1.116868f, 1.918795f
413
};
414
415
209k
#define LEAKAGE_OFFSET 2.5f
416
7.32M
#define LEAKAGE_SLOPE 2.f
417
418
#ifdef FIXED_POINT
419
/* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
420
   compensate for that in the energy. */
421
106M
#define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
422
53.3M
#define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
423
#else
424
#define SCALE_ENER(e) ((1.f/32768/32768)*e)
425
#endif
426
427
#ifdef FIXED_POINT
428
static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth)
429
3.89M
{
430
3.89M
   int silence = 0;
431
3.89M
   opus_val32 sample_max = 0;
432
#ifdef MLP_TRAINING
433
   return 0;
434
#endif
435
3.89M
   sample_max = celt_maxabs32(pcm, frame_size*channels);
436
437
3.89M
   silence = (sample_max == 0);
438
3.89M
   (void)lsb_depth;
439
3.89M
   return silence;
440
3.89M
}
441
#else
442
#define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth)
443
#endif
444
445
static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
446
9.02M
{
447
9.02M
    int i, b;
448
9.02M
    const kiss_fft_state *kfft;
449
9.02M
    VARDECL(kiss_fft_cpx, in);
450
9.02M
    VARDECL(kiss_fft_cpx, out);
451
9.02M
    int N = 480, N2=240;
452
9.02M
    float * OPUS_RESTRICT A = tonal->angle;
453
9.02M
    float * OPUS_RESTRICT dA = tonal->d_angle;
454
9.02M
    float * OPUS_RESTRICT d2A = tonal->d2_angle;
455
9.02M
    VARDECL(float, tonality);
456
9.02M
    VARDECL(float, noisiness);
457
9.02M
    float band_tonality[NB_TBANDS];
458
9.02M
    float logE[NB_TBANDS];
459
9.02M
    float BFCC[8];
460
9.02M
    float features[25];
461
9.02M
    float frame_tonality;
462
9.02M
    float max_frame_tonality;
463
    /*float tw_sum=0;*/
464
9.02M
    float frame_noisiness;
465
9.02M
    const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
466
9.02M
    float slope=0;
467
9.02M
    float frame_stationarity;
468
9.02M
    float relativeE;
469
9.02M
    float frame_probs[2];
470
9.02M
    float alpha, alphaE, alphaE2;
471
9.02M
    float frame_loudness;
472
9.02M
    float bandwidth_mask;
473
9.02M
    int is_masked[NB_TBANDS+1];
474
9.02M
    int bandwidth=0;
475
9.02M
    float maxE = 0;
476
9.02M
    float noise_floor;
477
9.02M
    int remaining;
478
9.02M
    AnalysisInfo *info;
479
9.02M
    float hp_ener;
480
9.02M
    float tonality2[240];
481
9.02M
    float midE[8];
482
9.02M
    float spec_variability=0;
483
9.02M
    float band_log2[NB_TBANDS+1];
484
9.02M
    float leakage_from[NB_TBANDS+1];
485
9.02M
    float leakage_to[NB_TBANDS+1];
486
9.02M
    float layer_out[MAX_NEURONS];
487
9.02M
    float below_max_pitch;
488
9.02M
    float above_max_pitch;
489
9.02M
    int is_silence;
490
9.02M
    SAVE_STACK;
491
492
9.02M
    if (!tonal->initialized)
493
665
    {
494
665
       tonal->mem_fill = 240;
495
665
       tonal->initialized = 1;
496
665
    }
497
9.02M
    alpha = 1.f/IMIN(10, 1+tonal->count);
498
9.02M
    alphaE = 1.f/IMIN(25, 1+tonal->count);
499
    /* Noise floor related decay for bandwidth detection: -2.2 dB/second */
500
9.02M
    alphaE2 = 1.f/IMIN(100, 1+tonal->count);
501
9.02M
    if (tonal->count <= 1) alphaE2 = 1;
502
503
9.02M
    if (tonal->Fs == 48000)
504
1.63M
    {
505
       /* len and offset are now at 24 kHz. */
506
1.63M
       len/= 2;
507
1.63M
       offset /= 2;
508
7.38M
    } else if (tonal->Fs == 16000) {
509
4.38M
       len = 3*len/2;
510
4.38M
       offset = 3*offset/2;
511
4.38M
    }
512
513
9.02M
    kfft = celt_mode->mdct.kfft[0];
514
9.02M
    tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
515
9.02M
          &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
516
9.02M
          IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
517
9.02M
    if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
518
5.12M
    {
519
5.12M
       tonal->mem_fill += len;
520
       /* Don't have enough to update the analysis */
521
5.12M
       RESTORE_STACK;
522
5.12M
       return;
523
5.12M
    }
524
3.89M
    hp_ener = tonal->hp_ener_accum;
525
3.89M
    info = &tonal->info[tonal->write_pos++];
526
3.89M
    if (tonal->write_pos>=DETECT_SIZE)
527
38.5k
       tonal->write_pos-=DETECT_SIZE;
528
529
3.89M
    is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth);
530
531
3.89M
    ALLOC(in, 480, kiss_fft_cpx);
532
3.89M
    ALLOC(out, 480, kiss_fft_cpx);
533
3.89M
    ALLOC(tonality, 240, float);
534
3.89M
    ALLOC(noisiness, 240, float);
535
938M
    for (i=0;i<N2;i++)
536
934M
    {
537
934M
       float w = analysis_window[i];
538
934M
       in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
539
934M
       in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
540
934M
       in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
541
934M
       in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
542
934M
    }
543
3.89M
    OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
544
3.89M
    remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
545
3.89M
    tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
546
3.89M
          &tonal->inmem[240], tonal->downmix_state, remaining,
547
3.89M
          offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
548
3.89M
    tonal->mem_fill = 240 + remaining;
549
3.89M
    if (is_silence)
550
3.68M
    {
551
       /* On silence, copy the previous analysis. */
552
3.68M
       int prev_pos = tonal->write_pos-2;
553
3.68M
       if (prev_pos < 0)
554
73.2k
          prev_pos += DETECT_SIZE;
555
3.68M
       OPUS_COPY(info, &tonal->info[prev_pos], 1);
556
3.68M
       RESTORE_STACK;
557
3.68M
       return;
558
3.68M
    }
559
209k
    opus_fft(kfft, in, out, tonal->arch);
560
#ifndef FIXED_POINT
561
    /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
562
    if (celt_isnan(out[0].r))
563
    {
564
       info->valid = 0;
565
       RESTORE_STACK;
566
       return;
567
    }
568
#endif
569
570
50.1M
    for (i=1;i<N2;i++)
571
49.9M
    {
572
49.9M
       float X1r, X2r, X1i, X2i;
573
49.9M
       float angle, d_angle, d2_angle;
574
49.9M
       float angle2, d_angle2, d2_angle2;
575
49.9M
       float mod1, mod2, avg_mod;
576
49.9M
       X1r = (float)out[i].r+out[N-i].r;
577
49.9M
       X1i = (float)out[i].i-out[N-i].i;
578
49.9M
       X2r = (float)out[i].i+out[N-i].i;
579
49.9M
       X2i = (float)out[N-i].r-out[i].r;
580
581
49.9M
       angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
582
49.9M
       d_angle = angle - A[i];
583
49.9M
       d2_angle = d_angle - dA[i];
584
585
49.9M
       angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
586
49.9M
       d_angle2 = angle2 - angle;
587
49.9M
       d2_angle2 = d_angle2 - d_angle;
588
589
49.9M
       mod1 = d2_angle - (float)float2int(d2_angle);
590
49.9M
       noisiness[i] = ABS16(mod1);
591
49.9M
       mod1 *= mod1;
592
49.9M
       mod1 *= mod1;
593
594
49.9M
       mod2 = d2_angle2 - (float)float2int(d2_angle2);
595
49.9M
       noisiness[i] += ABS16(mod2);
596
49.9M
       mod2 *= mod2;
597
49.9M
       mod2 *= mod2;
598
599
49.9M
       avg_mod = .25f*(d2A[i]+mod1+2*mod2);
600
       /* This introduces an extra delay of 2 frames in the detection. */
601
49.9M
       tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
602
       /* No delay on this detection, but it's less reliable. */
603
49.9M
       tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
604
605
49.9M
       A[i] = angle2;
606
49.9M
       dA[i] = d_angle2;
607
49.9M
       d2A[i] = mod2;
608
49.9M
    }
609
49.7M
    for (i=2;i<N2-1;i++)
610
49.5M
    {
611
49.5M
       float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
612
49.5M
       tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
613
49.5M
    }
614
209k
    frame_tonality = 0;
615
209k
    max_frame_tonality = 0;
616
    /*tw_sum = 0;*/
617
209k
    info->activity = 0;
618
209k
    frame_noisiness = 0;
619
209k
    frame_stationarity = 0;
620
209k
    if (!tonal->count)
621
640
    {
622
12.1k
       for (b=0;b<NB_TBANDS;b++)
623
11.5k
       {
624
11.5k
          tonal->lowE[b] = 1e10;
625
11.5k
          tonal->highE[b] = -1e10;
626
11.5k
       }
627
640
    }
628
209k
    relativeE = 0;
629
209k
    frame_loudness = 0;
630
    /* The energy of the very first band is special because of DC. */
631
209k
    {
632
209k
       float E = 0;
633
209k
       float X1r, X2r;
634
209k
       X1r = 2*(float)out[0].r;
635
209k
       X2r = 2*(float)out[0].i;
636
209k
       E = X1r*X1r + X2r*X2r;
637
836k
       for (i=1;i<4;i++)
638
627k
       {
639
627k
          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
640
627k
                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
641
627k
          E += binE;
642
627k
       }
643
209k
       E = SCALE_ENER(E);
644
209k
       band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
645
209k
    }
646
3.97M
    for (b=0;b<NB_TBANDS;b++)
647
3.76M
    {
648
3.76M
       float E=0, tE=0, nE=0;
649
3.76M
       float L1, L2;
650
3.76M
       float stationarity;
651
53.1M
       for (i=tbands[b];i<tbands[b+1];i++)
652
49.3M
       {
653
49.3M
          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
654
49.3M
                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
655
49.3M
          binE = SCALE_ENER(binE);
656
49.3M
          E += binE;
657
49.3M
          tE += binE*MAX32(0, tonality[i]);
658
49.3M
          nE += binE*2.f*(.5f-noisiness[i]);
659
49.3M
       }
660
#ifndef FIXED_POINT
661
       /* Check for extreme band energies that could cause NaNs later. */
662
       if (!(E<1e9f) || celt_isnan(E))
663
       {
664
          info->valid = 0;
665
          RESTORE_STACK;
666
          return;
667
       }
668
#endif
669
670
3.76M
       tonal->E[tonal->E_count][b] = E;
671
3.76M
       frame_noisiness += nE/(1e-15f+E);
672
673
3.76M
       frame_loudness += (float)sqrt(E+1e-10f);
674
3.76M
       logE[b] = (float)log(E+1e-10f);
675
3.76M
       band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
676
3.76M
       tonal->logE[tonal->E_count][b] = logE[b];
677
3.76M
       if (tonal->count==0)
678
11.5k
          tonal->highE[b] = tonal->lowE[b] = logE[b];
679
3.76M
       if (tonal->highE[b] > tonal->lowE[b] + 7.5)
680
1.78M
       {
681
1.78M
          if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
682
611k
             tonal->highE[b] -= .01f;
683
1.16M
          else
684
1.16M
             tonal->lowE[b] += .01f;
685
1.78M
       }
686
3.76M
       if (logE[b] > tonal->highE[b])
687
83.0k
       {
688
83.0k
          tonal->highE[b] = logE[b];
689
83.0k
          tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
690
3.68M
       } else if (logE[b] < tonal->lowE[b])
691
146k
       {
692
146k
          tonal->lowE[b] = logE[b];
693
146k
          tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
694
146k
       }
695
3.76M
       relativeE += (logE[b]-tonal->lowE[b])/(1e-5f + (tonal->highE[b]-tonal->lowE[b]));
696
697
3.76M
       L1=L2=0;
698
33.8M
       for (i=0;i<NB_FRAMES;i++)
699
30.1M
       {
700
30.1M
          L1 += (float)sqrt(tonal->E[i][b]);
701
30.1M
          L2 += tonal->E[i][b];
702
30.1M
       }
703
704
3.76M
       stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
705
3.76M
       stationarity *= stationarity;
706
3.76M
       stationarity *= stationarity;
707
3.76M
       frame_stationarity += stationarity;
708
3.76M
       /*band_tonality[b] = tE/(1e-15+E)*/;
709
3.76M
       band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
710
#if 0
711
       if (b>=NB_TONAL_SKIP_BANDS)
712
       {
713
          frame_tonality += tweight[b]*band_tonality[b];
714
          tw_sum += tweight[b];
715
       }
716
#else
717
3.76M
       frame_tonality += band_tonality[b];
718
3.76M
       if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
719
1.88M
          frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
720
3.76M
#endif
721
3.76M
       max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
722
3.76M
       slope += band_tonality[b]*(b-8);
723
       /*printf("%f %f ", band_tonality[b], stationarity);*/
724
3.76M
       tonal->prev_band_tonality[b] = band_tonality[b];
725
3.76M
    }
726
727
209k
    leakage_from[0] = band_log2[0];
728
209k
    leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
729
3.97M
    for (b=1;b<NB_TBANDS+1;b++)
730
3.76M
    {
731
3.76M
       float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
732
3.76M
       leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
733
3.76M
       leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
734
3.76M
    }
735
3.76M
    for (b=NB_TBANDS-2;b>=0;b--)
736
3.55M
    {
737
3.55M
       float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
738
3.55M
       leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
739
3.55M
       leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
740
3.55M
    }
741
209k
    celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
742
4.18M
    for (b=0;b<NB_TBANDS+1;b++)
743
3.97M
    {
744
       /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
745
          represents the boost needed to overcome the amount of analysis leakage
746
          cause in a weaker band b by louder neighbouring bands.
747
          The second, based on leakage_from[], applies to a loud band b for
748
          which the quantization noise causes synthesis leakage to the weaker
749
          neighbouring bands. */
750
3.97M
       float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
751
3.97M
             MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
752
3.97M
       info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
753
3.97M
    }
754
209k
    for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
755
756
1.88M
    for (i=0;i<NB_FRAMES;i++)
757
1.67M
    {
758
1.67M
       int j;
759
1.67M
       float mindist = 1e15f;
760
15.0M
       for (j=0;j<NB_FRAMES;j++)
761
13.3M
       {
762
13.3M
          int k;
763
13.3M
          float dist=0;
764
254M
          for (k=0;k<NB_TBANDS;k++)
765
240M
          {
766
240M
             float tmp;
767
240M
             tmp = tonal->logE[i][k] - tonal->logE[j][k];
768
240M
             dist += tmp*tmp;
769
240M
          }
770
13.3M
          if (j!=i)
771
11.7M
             mindist = MIN32(mindist, dist);
772
13.3M
       }
773
1.67M
       spec_variability += mindist;
774
1.67M
    }
775
209k
    spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
776
209k
    bandwidth_mask = 0;
777
209k
    bandwidth = 0;
778
209k
    maxE = 0;
779
209k
    noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
780
209k
    noise_floor *= noise_floor;
781
209k
    below_max_pitch=0;
782
209k
    above_max_pitch=0;
783
3.97M
    for (b=0;b<NB_TBANDS;b++)
784
3.76M
    {
785
3.76M
       float E=0;
786
3.76M
       float Em;
787
3.76M
       int band_start, band_end;
788
       /* Keep a margin of 300 Hz for aliasing */
789
3.76M
       band_start = tbands[b];
790
3.76M
       band_end = tbands[b+1];
791
53.1M
       for (i=band_start;i<band_end;i++)
792
49.3M
       {
793
49.3M
          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
794
49.3M
                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
795
49.3M
          E += binE;
796
49.3M
       }
797
3.76M
       E = SCALE_ENER(E);
798
3.76M
       maxE = MAX32(maxE, E);
799
3.76M
       if (band_start < 64)
800
2.30M
       {
801
2.30M
          below_max_pitch += E;
802
2.30M
       } else {
803
1.46M
          above_max_pitch += E;
804
1.46M
       }
805
3.76M
       tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
806
3.76M
       Em = MAX32(E, tonal->meanE[b]);
807
       /* Consider the band "active" only if all these conditions are met:
808
          1) less than 90 dB below the peak band (maximal masking possible considering
809
             both the ATH and the loudness-dependent slope of the spreading function)
810
          2) above the PCM quantization noise floor
811
          We use b+1 because the first CELT band isn't included in tbands[]
812
       */
813
3.76M
       if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start)))
814
3.71M
          bandwidth = b+1;
815
       /* Check if the band is masked (see below). */
816
3.76M
       is_masked[b] = E < (tonal->prev_bandwidth >= b+1  ? .01f : .05f)*bandwidth_mask;
817
       /* Use a simple follower with 13 dB/Bark slope for spreading function. */
818
3.76M
       bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
819
3.76M
    }
820
    /* Special case for the last two bands, for which we don't have spectrum but only
821
       the energy above 12 kHz. The difficulty here is that the high-pass we use
822
       leaks some LF energy, so we need to increase the threshold without accidentally cutting
823
       off the band. */
824
209k
    if (tonal->Fs == 48000) {
825
83.9k
       float noise_ratio;
826
83.9k
       float Em;
827
83.9k
       float E = hp_ener*(1.f/(60*60));
828
83.9k
       noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f;
829
830
83.9k
#ifdef FIXED_POINT
831
       /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
832
83.9k
       E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
833
83.9k
#endif
834
83.9k
       above_max_pitch += E;
835
83.9k
       tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
836
83.9k
       Em = MAX32(E, tonal->meanE[b]);
837
83.9k
       if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160)
838
82.8k
          bandwidth = 20;
839
       /* Check if the band is masked (see below). */
840
83.9k
       is_masked[b] = E < (tonal->prev_bandwidth == 20  ? .01f : .05f)*bandwidth_mask;
841
83.9k
    }
842
209k
    if (above_max_pitch > below_max_pitch)
843
126k
       info->max_pitch_ratio = below_max_pitch/above_max_pitch;
844
82.7k
    else
845
82.7k
       info->max_pitch_ratio = 1;
846
    /* In some cases, resampling aliasing can create a small amount of energy in the first band
847
       being cut. So if the last band is masked, we don't include it.  */
848
209k
    if (bandwidth == 20 && is_masked[NB_TBANDS])
849
9.51k
       bandwidth-=2;
850
199k
    else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1])
851
2.76k
       bandwidth--;
852
209k
    if (tonal->count<=2)
853
1.63k
       bandwidth = 20;
854
209k
    frame_loudness = 20*(float)log10(frame_loudness);
855
209k
    tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
856
209k
    tonal->lowECount *= (1-alphaE);
857
209k
    if (frame_loudness < tonal->Etracker-30)
858
25.6k
       tonal->lowECount += alphaE;
859
860
1.88M
    for (i=0;i<8;i++)
861
1.67M
    {
862
1.67M
       float sum=0;
863
28.4M
       for (b=0;b<16;b++)
864
26.7M
          sum += dct_table[i*16+b]*logE[b];
865
1.67M
       BFCC[i] = sum;
866
1.67M
    }
867
1.88M
    for (i=0;i<8;i++)
868
1.67M
    {
869
1.67M
       float sum=0;
870
28.4M
       for (b=0;b<16;b++)
871
26.7M
          sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
872
1.67M
       midE[i] = sum;
873
1.67M
    }
874
875
209k
    frame_stationarity /= NB_TBANDS;
876
209k
    relativeE /= NB_TBANDS;
877
209k
    if (tonal->count<10)
878
4.52k
       relativeE = .5f;
879
209k
    frame_noisiness /= NB_TBANDS;
880
209k
#if 1
881
209k
    info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
882
#else
883
    info->activity = .5*(1+frame_noisiness-frame_stationarity);
884
#endif
885
209k
    frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
886
209k
    frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
887
209k
    tonal->prev_tonality = frame_tonality;
888
889
209k
    slope /= 8*8;
890
209k
    info->tonality_slope = slope;
891
892
209k
    tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
893
209k
    tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
894
209k
    info->tonality = frame_tonality;
895
896
1.04M
    for (i=0;i<4;i++)
897
836k
       features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
898
899
1.04M
    for (i=0;i<4;i++)
900
836k
       tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
901
902
1.04M
    for (i=0;i<4;i++)
903
836k
        features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
904
836k
    for (i=0;i<3;i++)
905
627k
        features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
906
907
209k
    if (tonal->count > 5)
908
206k
    {
909
2.06M
       for (i=0;i<9;i++)
910
1.85M
          tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
911
206k
    }
912
1.04M
    for (i=0;i<4;i++)
913
836k
       features[i] = BFCC[i]-midE[i];
914
915
1.88M
    for (i=0;i<8;i++)
916
1.67M
    {
917
1.67M
       tonal->mem[i+24] = tonal->mem[i+16];
918
1.67M
       tonal->mem[i+16] = tonal->mem[i+8];
919
1.67M
       tonal->mem[i+8] = tonal->mem[i];
920
1.67M
       tonal->mem[i] = BFCC[i];
921
1.67M
    }
922
2.09M
    for (i=0;i<9;i++)
923
1.88M
       features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
924
209k
    features[18] = spec_variability - 0.78f;
925
209k
    features[20] = info->tonality - 0.154723f;
926
209k
    features[21] = info->activity - 0.724643f;
927
209k
    features[22] = frame_stationarity - 0.743717f;
928
209k
    features[23] = info->tonality_slope + 0.069216f;
929
209k
    features[24] = tonal->lowECount - 0.067930f;
930
931
209k
    analysis_compute_dense(&layer0, layer_out, features);
932
209k
    analysis_compute_gru(&layer1, tonal->rnn_state, layer_out);
933
209k
    analysis_compute_dense(&layer2, frame_probs, tonal->rnn_state);
934
935
    /* Probability of speech or music vs noise */
936
209k
    info->activity_probability = frame_probs[1];
937
209k
    info->music_prob = frame_probs[0];
938
939
    /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
940
#ifdef MLP_TRAINING
941
    for (i=0;i<25;i++)
942
       printf("%f ", features[i]);
943
    printf("\n");
944
#endif
945
946
209k
    info->bandwidth = bandwidth;
947
209k
    tonal->prev_bandwidth = bandwidth;
948
    /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
949
209k
    info->noisiness = frame_noisiness;
950
209k
    info->valid = 1;
951
209k
    RESTORE_STACK;
952
209k
}
953
954
void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
955
                 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
956
                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
957
7.52M
{
958
7.52M
   int offset;
959
7.52M
   int pcm_len;
960
961
7.52M
   analysis_frame_size -= analysis_frame_size&1;
962
7.52M
   if (analysis_pcm != NULL)
963
7.52M
   {
964
      /* Avoid overflow/wrap-around of the analysis buffer */
965
7.52M
      analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
966
967
7.52M
      pcm_len = analysis_frame_size - analysis->analysis_offset;
968
7.52M
      offset = analysis->analysis_offset;
969
16.5M
      while (pcm_len>0) {
970
9.02M
         tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
971
9.02M
         offset += Fs/50;
972
9.02M
         pcm_len -= Fs/50;
973
9.02M
      }
974
7.52M
      analysis->analysis_offset = analysis_frame_size;
975
976
7.52M
      analysis->analysis_offset -= frame_size;
977
7.52M
   }
978
979
7.52M
   tonality_get_info(analysis, analysis_info, frame_size);
980
7.52M
}
981
982
#endif /* DISABLE_FLOAT_API */