Coverage Report

Created: 2025-08-28 07:12

/src/ffmpeg/libavcodec/wmavoice.c
Line | Count | Source
1
/*
2
 * Windows Media Audio Voice decoder.
3
 * Copyright (c) 2009 Ronald S. Bultje
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21
22
/**
23
 * @file
24
 * @brief Windows Media Audio Voice compatible decoder
25
 * @author Ronald S. Bultje <rsbultje@gmail.com>
26
 */
27
28
#include <math.h>
29
30
#include "libavutil/channel_layout.h"
31
#include "libavutil/float_dsp.h"
32
#include "libavutil/mem.h"
33
#include "libavutil/mem_internal.h"
34
#include "libavutil/thread.h"
35
#include "libavutil/tx.h"
36
#include "avcodec.h"
37
#include "codec_internal.h"
38
#include "decode.h"
39
#include "get_bits.h"
40
#include "put_bits.h"
41
#include "wmavoice_data.h"
42
#include "celp_filters.h"
43
#include "acelp_vectors.h"
44
#include "acelp_filters.h"
45
#include "lsp.h"
46
#include "sinewin.h"
47
48
#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
49
435k
#define MAX_LSPS             16  ///< maximum filter order
50
1.61M
#define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
51
                                 ///< of 16 for ASM input buffer alignment
52
2.28M
#define MAX_FRAMES           3   ///< maximum number of frames per superframe
53
19.5M
#define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
54
436k
#define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
55
2.28M
#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
56
                                 ///< maximum number of samples per superframe
57
771k
#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
58
                                 ///< was split over two packets
59
#define VLC_NBITS            6   ///< number of bits to read per VLC iteration
60
61
/**
62
 * Frame type VLC coding.
63
 */
64
static VLCElem frame_type_vlc[132];
65
66
/**
67
 * Adaptive codebook types.
68
 */
69
enum {
70
    ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
71
    ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
72
                             ///< we interpolate to get a per-sample pitch.
73
                             ///< Signal is generated using an asymmetric sinc
74
                             ///< window function
75
                             ///< @note see #wmavoice_ipol1_coeffs
76
    ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
77
                             ///< a Hamming sinc window function
78
                             ///< @note see #wmavoice_ipol2_coeffs
79
};
80
81
/**
82
 * Fixed codebook types.
83
 */
84
enum {
85
    FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
86
                             ///< generated from a hardcoded (fixed) codebook
87
                             ///< with per-frame (low) gain values
88
    FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
89
                             ///< gain values
90
    FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
91
                             ///< used in particular for low-bitrate streams
92
    FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
93
                             ///< combinations of either single pulses or
94
                             ///< pulse pairs
95
};
96
97
/**
98
 * Description of frame types.
99
 */
100
static const struct frame_type_desc {
101
    uint8_t n_blocks;     ///< number of blocks per frame (each block
102
                          ///< contains 160/#n_blocks samples)
103
    uint8_t log_n_blocks; ///< log2(#n_blocks)
104
    uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
105
    uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
106
    uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
107
                          ///< (rather than just one single pulse)
108
                          ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
109
} frame_descs[17] = {
110
    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0 },
111
    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0 },
112
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0 },
113
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
114
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
115
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0 },
116
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
117
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
118
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
119
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
120
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
121
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
122
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
123
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
124
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
125
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
126
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 }
127
};
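Since n_blocks is always a power of two in this table, the per-block sample count follows directly from either field; a trivial illustrative helper (not part of the decoder, leaning on the MAX_FRAMESIZE and frame_type_desc definitions above):

static int block_nsamples(const struct frame_type_desc *d)
{
    /* MAX_FRAMESIZE / d->n_blocks, written as a shift because
     * n_blocks is always a power of two (see log_n_blocks) */
    return MAX_FRAMESIZE >> d->log_n_blocks;
}

For frame_descs[14] (8 blocks) this gives 160 >> 3 = 20 samples per block.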
128
129
/**
130
 * WMA Voice decoding context.
131
 */
132
typedef struct WMAVoiceContext {
133
    /**
134
     * @name Global values specified in the stream header / extradata or used all over.
135
     * @{
136
     */
137
    GetBitContext gb;             ///< packet bitreader. During decoder init,
138
                                  ///< it contains the extradata from the
139
                                  ///< demuxer. During decoding, it contains
140
                                  ///< packet data.
141
    int8_t vbm_tree[25];          ///< converts VLC codes to frame type
142
143
    int spillover_bitsize;        ///< number of bits used to specify
144
                                  ///< #spillover_nbits in the packet header
145
                                  ///< = ceil(log2(ctx->block_align << 3))
146
    int history_nsamples;         ///< number of samples in history for signal
147
                                  ///< prediction (through ACB)
148
149
    /* postfilter specific values */
150
    int do_apf;                   ///< whether to apply the averaged
151
                                  ///< projection filter (APF)
152
    int denoise_strength;         ///< strength of denoising in Wiener filter
153
                                  ///< [0-11]
154
    int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
155
                                  ///< Wiener filter coefficients (postfilter)
156
    int dc_level;                 ///< Predicted amount of DC noise, based
157
                                  ///< on which a DC removal filter is used
158
159
    int lsps;                     ///< number of LSPs per frame [10 or 16]
160
    int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
161
    int lsp_def_mode;             ///< defines different sets of LSP defaults
162
                                  ///< [0, 1]
163
164
    int min_pitch_val;            ///< base value for pitch parsing code
165
    int max_pitch_val;            ///< max value + 1 for pitch parsing
166
    int pitch_nbits;              ///< number of bits used to specify the
167
                                  ///< pitch value in the frame header
168
    int block_pitch_nbits;        ///< number of bits used to specify the
169
                                  ///< first block's pitch value
170
    int block_pitch_range;        ///< range of the block pitch
171
    int block_delta_pitch_nbits;  ///< number of bits used to specify the
172
                                  ///< delta pitch between this and the last
173
                                  ///< block's pitch value, used in all but
174
                                  ///< first block
175
    int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
176
                                  ///< from -this to +this-1)
177
    uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
178
                                  ///< conversion
179
180
    /**
181
     * @}
182
     *
183
     * @name Packet values specified in the packet header or related to a packet.
184
     *
185
     * A packet is considered to be a single unit of data provided to this
186
     * decoder by the demuxer.
187
     * @{
188
     */
189
    int spillover_nbits;          ///< number of bits of the previous packet's
190
                                  ///< last superframe preceding this
191
                                  ///< packet's first full superframe (useful
192
                                  ///< for re-synchronization also)
193
    int has_residual_lsps;        ///< if set, superframes contain one set of
194
                                  ///< LSPs that cover all frames, encoded as
195
                                  ///< independent and residual LSPs; if not
196
                                  ///< set, each frame contains its own, fully
197
                                  ///< independent, LSPs
198
    int skip_bits_next;           ///< number of bits to skip at the next call
199
                                  ///< to #wmavoice_decode_packet() (since
200
                                  ///< they're part of the previous superframe)
201
202
    uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + AV_INPUT_BUFFER_PADDING_SIZE]; ///<
203
                                  ///< cache for superframe data split over
204
                                  ///< multiple packets
205
    int sframe_cache_size;        ///< set to >0 if we have data from an
206
                                  ///< (incomplete) superframe from a previous
207
                                  ///< packet that spilled over in the current
208
                                  ///< packet; specifies the amount of bits in
209
                                  ///< #sframe_cache
210
    PutBitContext pb;             ///< bitstream writer for #sframe_cache
211
212
    /**
213
     * @}
214
     *
215
     * @name Frame and superframe values
216
     * Superframe and frame data - these can change from frame to frame,
217
     * although some of them also serve as a cache / history for
218
     * the next frame or superframe.
219
     * @{
220
     */
221
    double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
222
                                  ///< superframe
223
    int last_pitch_val;           ///< pitch value of the previous frame
224
    int last_acb_type;            ///< frame type [0-2] of the previous frame
225
    int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
226
                                  ///< << 16) / #MAX_FRAMESIZE
227
    float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
228
229
    int aw_idx_is_ext;            ///< whether the AW index was encoded in
230
                                  ///< 8 bits (instead of 6)
231
    int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
232
                                  ///< can apply the pulse, relative to the
233
                                  ///< value in aw_first_pulse_off. The exact
234
                                  ///< position of the first AW-pulse is within
235
                                  ///< [pulse_off, pulse_off + this], and
236
                                  ///< depends on bitstream values; [16 or 24]
237
    int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
238
                                  ///< that this number can be negative (in
239
                                  ///< which case it basically means "zero")
240
    int aw_first_pulse_off[2];    ///< index of first sample to which to
241
                                  ///< apply AW-pulses, or -0xff if unset
242
    int aw_next_pulse_off_cache;  ///< the position (relative to start of the
243
                                  ///< second block) at which pulses should
244
                                  ///< start to be positioned, serves as a
245
                                  ///< cache for pitch-adaptive window pulses
246
                                  ///< between blocks
247
248
    int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
249
                                  ///< only used for comfort noise in #pRNG()
250
    int nb_superframes;           ///< number of superframes in current packet
251
    float gain_pred_err[6];       ///< cache for gain prediction
252
    float excitation_history[MAX_SIGNAL_HISTORY]; ///< cache of the signal of
253
                                  ///< previous superframes, used as a history
254
                                  ///< for signal generation
255
    float synth_history[MAX_LSPS]; ///< see #excitation_history
256
    /**
257
     * @}
258
     *
259
     * @name Postfilter values
260
     *
261
     * Variables used for postfilter implementation, mostly history for
262
     * smoothing and so on, and context variables for FFT/iFFT.
263
     * @{
264
     */
265
    AVTXContext *rdft, *irdft;    ///< contexts for FFT-calculation in the
266
    av_tx_fn rdft_fn, irdft_fn;   ///< postfilter (for denoise filter)
267
    AVTXContext *dct, *dst;       ///< contexts for phase shift (in Hilbert
268
    av_tx_fn dct_fn, dst_fn;      ///< transform, part of postfilter)
269
    float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
270
                                  ///< range
271
    float postfilter_agc;         ///< gain control memory, used in
272
                                  ///< #adaptive_gain_control()
273
    float dcf_mem[2];             ///< DC filter history
274
    /// zero filter output (i.e. excitation) by postfilter
275
    float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
276
    float denoise_filter_cache[MAX_FRAMESIZE];
277
    int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
278
    /// aligned buffer for LPC tilting
279
    DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x82];
280
    /// aligned buffer for denoise coefficients
281
    DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x82];
282
    /// aligned buffer for postfilter speech synthesis
283
    DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
284
    /**
285
     * @}
286
     */
287
} WMAVoiceContext;
288
289
/**
290
 * Set up the variable bit mode (VBM) tree from container extradata.
291
 * @param gb bit I/O context.
292
 *           The bit context (s->gb) should be loaded with bytes 23-46 of the
293
 *           container extradata (i.e. the ones containing the VBM tree).
294
 * @param vbm_tree pointer to array to which the decoded VBM tree will be
295
 *                 written.
296
 * @return 0 on success, <0 on error.
297
 */
298
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
299
1.39k
{
300
1.39k
    int cntr[8] = { 0 }, n, res;
301
302
1.39k
    memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
303
25.0k
    for (n = 0; n < 17; n++) {
304
23.6k
        res = get_bits(gb, 3);
305
23.6k
        if (cntr[res] > 3) // should be >= 3 + (res == 7)
306
2
            return -1;
307
23.6k
        vbm_tree[res * 3 + cntr[res]++] = n;
308
23.6k
    }
309
1.39k
    return 0;
310
1.39k
}
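For readers unfamiliar with the bucket layout built here: each of the 17 frame types is stored under its 3-bit code at vbm_tree[res * 3 + k], with at most a few types sharing one code. Below is a standalone restatement of that logic with a made-up code assignment, purely for illustration (not decoder code, no FFmpeg headers):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* same bucket logic as decode_vbmtree(), fed from a plain array instead of
 * a GetBitContext */
static int build_vbm_tree(const int codes[17], int8_t vbm_tree[25])
{
    int cntr[8] = { 0 };

    memset(vbm_tree, 0xff, 25 * sizeof(*vbm_tree));
    for (int n = 0; n < 17; n++) {
        int res = codes[n];
        if (cntr[res] > 3)  /* overfull bucket -> broken extradata */
            return -1;
        vbm_tree[res * 3 + cntr[res]++] = n;
    }
    return 0;
}

int main(void)
{
    /* hypothetical 3-bit code for each of the 17 frame types */
    static const int codes[17] = { 0,0,0, 1,1,1, 2,2,2, 3,3,3, 4,4,4, 5,5 };
    int8_t tree[25];

    if (!build_vbm_tree(codes, tree))
        printf("frame types mapped to code 1: %d %d %d\n",
               tree[3], tree[4], tree[5]);  /* prints: 3 4 5 */
    return 0;
}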
311
312
static av_cold void wmavoice_init_static_data(void)
313
1
{
314
1
    static const uint8_t bits[] = {
315
1
         2,  2,  2,  4,  4,  4,
316
1
         6,  6,  6,  8,  8,  8,
317
1
        10, 10, 10, 12, 12, 12,
318
1
        14, 14, 14, 14
319
1
    };
320
321
1
    VLC_INIT_STATIC_TABLE_FROM_LENGTHS(frame_type_vlc, VLC_NBITS,
322
1
                                       FF_ARRAY_ELEMS(bits), bits,
323
1
                                       1, NULL, 0, 0, 0, 0);
324
1
}
325
326
static av_cold void wmavoice_flush(AVCodecContext *ctx)
327
435k
{
328
435k
    WMAVoiceContext *s = ctx->priv_data;
329
435k
    int n;
330
331
435k
    s->postfilter_agc    = 0;
332
435k
    s->sframe_cache_size = 0;
333
435k
    s->skip_bits_next    = 0;
334
5.64M
    for (n = 0; n < s->lsps; n++)
335
5.21M
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
336
435k
    memset(s->excitation_history, 0,
337
435k
           sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
338
435k
    memset(s->synth_history,      0,
339
435k
           sizeof(*s->synth_history)      * MAX_LSPS);
340
435k
    memset(s->gain_pred_err,      0,
341
435k
           sizeof(s->gain_pred_err));
342
343
435k
    if (s->do_apf) {
344
303k
        memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
345
303k
               sizeof(*s->synth_filter_out_buf) * s->lsps);
346
303k
        memset(s->dcf_mem,              0,
347
303k
               sizeof(*s->dcf_mem)              * 2);
348
303k
        memset(s->zero_exc_pf,          0,
349
303k
               sizeof(*s->zero_exc_pf)          * s->history_nsamples);
350
303k
        memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
351
303k
    }
352
435k
}
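The reset value used for prev_lsps[] above simply spreads the line spectral pairs evenly over the open interval (0, pi):

    prev_lsps[n] = pi * (n + 1) / (lsps + 1),   n = 0 .. lsps - 1
    e.g. for lsps = 10:  pi/11, 2*pi/11, ..., 10*pi/11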
353
354
/**
355
 * Set up decoder with parameters from demuxer (extradata etc.).
356
 */
357
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
358
1.53k
{
359
1.53k
    static AVOnce init_static_once = AV_ONCE_INIT;
360
1.53k
    int n, flags, pitch_range, lsp16_flag, ret;
361
1.53k
    WMAVoiceContext *s = ctx->priv_data;
362
363
1.53k
    ff_thread_once(&init_static_once, wmavoice_init_static_data);
364
365
    /**
366
     * Extradata layout:
367
     * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
368
     * - byte 19-22: flags field (annoyingly in LE; see below for known
369
     *               values),
370
     * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
371
     *               rest is 0).
372
     */
373
1.53k
    if (ctx->extradata_size != 46) {
374
133
        av_log(ctx, AV_LOG_ERROR,
375
133
               "Invalid extradata size %d (should be 46)\n",
376
133
               ctx->extradata_size);
377
133
        return AVERROR_INVALIDDATA;
378
133
    }
379
1.40k
    if (ctx->block_align <= 0 || ctx->block_align > (1<<22)) {
380
10
        av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
381
10
        return AVERROR_INVALIDDATA;
382
10
    }
383
384
1.39k
    flags                = AV_RL32(ctx->extradata + 18);
385
1.39k
    s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
386
1.39k
    s->do_apf            =    flags & 0x1;
387
1.39k
    if (s->do_apf) {
388
732
        float scale = 1.0f;
389
390
732
        ret = av_tx_init(&s->rdft, &s->rdft_fn, AV_TX_FLOAT_RDFT, 0, 1 << 7, &scale, 0);
391
732
        if (ret < 0)
392
0
            return ret;
393
394
732
        ret = av_tx_init(&s->irdft, &s->irdft_fn, AV_TX_FLOAT_RDFT, 1, 1 << 7, &scale, 0);
395
732
        if (ret < 0)
396
0
            return ret;
397
398
732
        scale = 1.0 / (1 << 6);
399
732
        ret = av_tx_init(&s->dct, &s->dct_fn, AV_TX_FLOAT_DCT_I, 0, 1 << 6, &scale, 0);
400
732
        if (ret < 0)
401
0
            return ret;
402
403
732
        scale = 1.0 / (1 << 6);
404
732
        ret = av_tx_init(&s->dst, &s->dst_fn, AV_TX_FLOAT_DST_I, 0, 1 << 6, &scale, 0);
405
732
        if (ret < 0)
406
0
            return ret;
407
408
732
        ff_sine_window_init(s->cos, 256);
409
732
        memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
410
187k
        for (n = 0; n < 255; n++) {
411
186k
            s->sin[n]       = -s->sin[510 - n];
412
186k
            s->cos[510 - n] =  s->cos[n];
413
186k
        }
414
732
    }
415
1.39k
    s->denoise_strength  =   (flags >> 2) & 0xF;
416
1.39k
    if (s->denoise_strength >= 12) {
417
1
        av_log(ctx, AV_LOG_ERROR,
418
1
               "Invalid denoise filter strength %d (max=11)\n",
419
1
               s->denoise_strength);
420
1
        return AVERROR_INVALIDDATA;
421
1
    }
422
1.39k
    s->denoise_tilt_corr = !!(flags & 0x40);
423
1.39k
    s->dc_level          =   (flags >> 7) & 0xF;
424
1.39k
    s->lsp_q_mode        = !!(flags & 0x2000);
425
1.39k
    s->lsp_def_mode      = !!(flags & 0x4000);
426
1.39k
    lsp16_flag           =    flags & 0x1000;
427
1.39k
    if (lsp16_flag) {
428
538
        s->lsps               = 16;
429
855
    } else {
430
855
        s->lsps               = 10;
431
855
    }
432
18.5k
    for (n = 0; n < s->lsps; n++)
433
17.1k
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
434
435
1.39k
    init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
436
1.39k
    if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
437
2
        av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
438
2
        return AVERROR_INVALIDDATA;
439
2
    }
440
441
1.39k
    if (ctx->sample_rate >= INT_MAX / (256 * 37))
442
9
        return AVERROR_INVALIDDATA;
443
444
1.38k
    s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
445
1.38k
    s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
446
1.38k
    pitch_range         = s->max_pitch_val - s->min_pitch_val;
447
1.38k
    if (pitch_range <= 0) {
448
1
        av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
449
1
        return AVERROR_INVALIDDATA;
450
1
    }
451
1.38k
    s->pitch_nbits      = av_ceil_log2(pitch_range);
452
1.38k
    s->last_pitch_val   = 40;
453
1.38k
    s->last_acb_type    = ACB_TYPE_NONE;
454
1.38k
    s->history_nsamples = s->max_pitch_val + 8;
455
456
1.38k
    if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
457
6
        int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
458
6
            max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
459
460
6
        av_log(ctx, AV_LOG_ERROR,
461
6
               "Unsupported samplerate %d (min=%d, max=%d)\n",
462
6
               ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
463
464
6
        return AVERROR(ENOSYS);
465
6
    }
466
467
1.37k
    s->block_conv_table[0]      = s->min_pitch_val;
468
1.37k
    s->block_conv_table[1]      = (pitch_range * 25) >> 6;
469
1.37k
    s->block_conv_table[2]      = (pitch_range * 44) >> 6;
470
1.37k
    s->block_conv_table[3]      = s->max_pitch_val - 1;
471
1.37k
    s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
472
1.37k
    if (s->block_delta_pitch_hrange <= 0) {
473
2
        av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
474
2
        return AVERROR_INVALIDDATA;
475
2
    }
476
1.37k
    s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
477
1.37k
    s->block_pitch_range        = s->block_conv_table[2] +
478
1.37k
                                  s->block_conv_table[3] + 1 +
479
1.37k
                                  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
480
1.37k
    s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
481
482
1.37k
    av_channel_layout_uninit(&ctx->ch_layout);
483
1.37k
    ctx->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO;
484
1.37k
    ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
485
486
1.37k
    return 0;
487
1.37k
}
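To make the flags parsing above easier to follow, here is the bit layout exactly as tested by wmavoice_decode_init(), wrapped in a minimal standalone dumper (illustrative only; the value passed in main() is an arbitrary example, not a real stream's flags):

#include <stdint.h>
#include <stdio.h>

/* Bits of the extradata flags word, as used by wmavoice_decode_init():
 *   bit  0     -> do_apf (enable the postfilter)
 *   bits 2-5   -> denoise_strength [0..11]
 *   bit  6     -> denoise_tilt_corr
 *   bits 7-10  -> dc_level
 *   bit  12    -> 16 LSPs per frame (else 10)
 *   bit  13    -> lsp_q_mode
 *   bit  14    -> lsp_def_mode */
static void dump_wmavoice_flags(uint32_t flags)
{
    printf("do_apf=%d denoise_strength=%d tilt_corr=%d dc_level=%d "
           "lsps=%d lsp_q_mode=%d lsp_def_mode=%d\n",
           !!(flags & 0x1),
           (int)((flags >> 2) & 0xF),
           !!(flags & 0x40),
           (int)((flags >> 7) & 0xF),
           (flags & 0x1000) ? 16 : 10,
           !!(flags & 0x2000),
           !!(flags & 0x4000));
}

int main(void)
{
    dump_wmavoice_flags(0x1045);  /* arbitrary example value */
    return 0;
}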
488
489
/**
490
 * @name Postfilter functions
491
 * Postfilter functions (gain control, Wiener denoise filter, DC filter,
492
 * Kalman smoothening, plus surrounding code to wrap it)
493
 * @{
494
 */
495
/**
496
 * Adaptive gain control (as used in postfilter).
497
 *
498
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
499
 * that the energy here is calculated using sum(abs(...)), whereas the
500
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
501
 *
502
 * @param out output buffer for filtered samples
503
 * @param in input buffer containing the samples as they are after the
504
 *           postfilter steps so far
505
 * @param speech_synth input buffer containing speech synth before postfilter
506
 * @param size input buffer size
507
 * @param alpha exponential filter factor
508
 * @param gain_mem pointer to filter memory (single float)
509
 */
510
static void adaptive_gain_control(float *out, const float *in,
511
                                  const float *speech_synth,
512
                                  int size, float alpha, float *gain_mem)
513
1.30M
{
514
1.30M
    int i;
515
1.30M
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
516
1.30M
    float mem = *gain_mem;
517
518
106M
    for (i = 0; i < size; i++) {
519
104M
        speech_energy     += fabsf(speech_synth[i]);
520
104M
        postfilter_energy += fabsf(in[i]);
521
104M
    }
522
1.30M
    gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
523
1.30M
                        (1.0 - alpha) * speech_energy / postfilter_energy;
524
525
106M
    for (i = 0; i < size; i++) {
526
104M
        mem = alpha * mem + gain_scale_factor;
527
104M
        out[i] = in[i] * mem;
528
104M
    }
529
530
1.30M
    *gain_mem = mem;
531
1.30M
}
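The distinction called out in the doxygen comment above (sum-of-absolute-values energy here, versus the square root of a dot product in the referenced ff_adaptive_gain_control()) in a minimal standalone form; the sample values are arbitrary:

#include <math.h>
#include <stdio.h>

/* L1-style "energy" as used by adaptive_gain_control() above */
static float energy_sum_abs(const float *x, int n)
{
    float e = 0.0f;
    for (int i = 0; i < n; i++)
        e += fabsf(x[i]);
    return e;
}

/* energy measure used by the other gain-control code, per the comment above */
static float energy_sqrt_dot(const float *x, int n)
{
    float e = 0.0f;
    for (int i = 0; i < n; i++)
        e += x[i] * x[i];
    return sqrtf(e);
}

int main(void)
{
    const float x[4] = { 0.5f, -1.0f, 0.25f, -0.25f };
    printf("sum|x| = %f, sqrt(x.x) = %f\n",
           energy_sum_abs(x, 4), energy_sqrt_dot(x, 4));
    return 0;
}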
532
533
/**
534
 * Kalman smoothing function.
535
 *
536
 * This function looks pitch +/- 3 samples back into the history to find
537
 * the best-fitting curve (the one giving the optimal gain of the two
538
 * signals, i.e. the highest dot product between the two), and then
539
 * uses that signal history to smoothen the output of the speech synthesis
540
 * filter.
541
 *
542
 * @param s WMA Voice decoding context
543
 * @param pitch pitch of the speech signal
544
 * @param in input speech signal
545
 * @param out output pointer for smoothened signal
546
 * @param size input/output buffer size
547
 *
548
 * @returns -1 if no smoothening took place, e.g. because no optimal
549
 *          fit could be found, or 0 on success.
550
 */
551
static int kalman_smoothen(WMAVoiceContext *s, int pitch,
552
                           const float *in, float *out, int size)
553
1.26M
{
554
1.26M
    int n;
555
1.26M
    float optimal_gain = 0, dot;
556
1.26M
    const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
557
1.26M
                *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
558
1.26M
                *best_hist_ptr = NULL;
559
560
    /* find best fitting point in history */
561
8.21M
    do {
562
8.21M
        dot = ff_scalarproduct_float_c(in, ptr, size);
563
8.21M
        if (dot > optimal_gain) {
564
2.58M
            optimal_gain  = dot;
565
2.58M
            best_hist_ptr = ptr;
566
2.58M
        }
567
8.21M
    } while (--ptr >= end);
568
569
1.26M
    if (optimal_gain <= 0)
570
102k
        return -1;
571
1.16M
    dot = ff_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
572
1.16M
    if (dot <= 0) // would be 1.0
573
197
        return -1;
574
575
1.16M
    if (optimal_gain <= dot) {
576
1.03M
        dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
577
1.03M
    } else
578
125k
        dot = 0.625;
579
580
    /* actual smoothing */
581
94.2M
    for (n = 0; n < size; n++)
582
93.0M
        out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
583
584
1.16M
    return 0;
585
1.16M
}
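Stripped of the history search, the smoothing step above is a plain linear blend between the current excitation and the best-matching stretch of history; a minimal restatement (not decoder code):

/* out[n] = best_hist[n] + g * (in[n] - best_hist[n]); in kalman_smoothen(),
 * g = dot / (dot + 0.6 * optimal_gain) when optimal_gain <= dot and 0.625
 * otherwise, so it always lies in [0.625, 1.0] (g == 1 would mean no
 * smoothing at all). */
void smooth_blend(float *out, const float *in, const float *best_hist,
                  float g, int size)
{
    for (int n = 0; n < size; n++)
        out[n] = best_hist[n] + g * (in[n] - best_hist[n]);
}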
586
587
/**
588
 * Get the tilt factor of a formant filter from its transfer function
589
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
590
 *      but somehow (??) it does a speech synthesis filter in the
591
 *      middle, which is missing here
592
 *
593
 * @param lpcs LPC coefficients
594
 * @param n_lpcs Size of LPC buffer
595
 * @returns the tilt factor
596
 */
597
static float tilt_factor(const float *lpcs, int n_lpcs)
598
2.27M
{
599
2.27M
    float rh0, rh1;
600
601
2.27M
    rh0 = 1.0     + ff_scalarproduct_float_c(lpcs,  lpcs,    n_lpcs);
602
2.27M
    rh1 = lpcs[0] + ff_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
603
604
2.27M
    return rh1 / rh0;
605
2.27M
}
606
607
/**
608
 * Derive denoise filter coefficients (in real domain) from the LPCs.
609
 */
610
static void calc_input_response(WMAVoiceContext *s, float *lpcs_src,
611
                                int fcb_type, float *coeffs_dst, int remainder)
612
1.29M
{
613
1.29M
    float last_coeff, min = 15.0, max = -15.0;
614
1.29M
    float irange, angle_mul, gain_mul, range, sq;
615
1.29M
    LOCAL_ALIGNED_32(float, coeffs, [0x82]);
616
1.29M
    LOCAL_ALIGNED_32(float, lpcs, [0x82]);
617
1.29M
    LOCAL_ALIGNED_32(float, lpcs_dct, [0x82]);
618
1.29M
    int n, idx;
619
620
1.29M
    memcpy(coeffs, coeffs_dst, 0x82*sizeof(float));
621
622
    /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
623
1.29M
    s->rdft_fn(s->rdft, lpcs, lpcs_src, sizeof(float));
624
83.8M
#define log_range(var, assign) do { \
625
83.8M
        float tmp = log10f(assign);  var = tmp; \
626
83.8M
        max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
627
83.8M
    } while (0)
628
1.29M
    log_range(last_coeff,  lpcs[64]         * lpcs[64]);
629
82.5M
    for (n = 1; n < 64; n++)
630
81.2M
        log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
631
1.29M
                           lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
632
1.29M
    log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
633
1.29M
#undef log_range
634
1.29M
    range    = max - min;
635
1.29M
    lpcs[64] = last_coeff;
636
637
    /* Now, use this spectrum to pick out the frequencies with higher
638
     * (relative) power/energy (which we then take to be "not noise"),
639
     * and set up a table (still in lpc[]) of (relative) gains per frequency.
640
     * These frequencies will be maintained, while others ("noise") will be
641
     * decreased in the filter output. */
642
1.29M
    irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
643
1.29M
    gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
644
1.29M
                                                          (5.0 / 14.7));
645
1.29M
    angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
646
85.1M
    for (n = 0; n <= 64; n++) {
647
83.8M
        float pwr;
648
649
83.8M
        idx = lrint((max - lpcs[n]) * irange - 1);
650
83.8M
        idx = FFMAX(0, idx);
651
83.8M
        pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
652
83.8M
        lpcs[n] = angle_mul * pwr;
653
654
        /* 70.57 =~ 1/log10(1.0331663) */
655
83.8M
        idx = av_clipd((pwr * gain_mul - 0.0295) * 70.570526123, 0, INT_MAX / 2);
656
657
83.8M
        if (idx > 127) { // fall back if index falls outside table range
658
10.8M
            coeffs[n] = wmavoice_energy_table[127] *
659
10.8M
                        powf(1.0331663, idx - 127);
660
10.8M
        } else
661
73.0M
            coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
662
83.8M
    }
663
664
    /* calculate the Hilbert transform of the gains, which we do (since this
665
     * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
666
     * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
667
     * "moment" of the LPCs in this filter. */
668
1.29M
    s->dct_fn(s->dct, lpcs_dct, lpcs, sizeof(float));
669
1.29M
    s->dst_fn(s->dst, lpcs, lpcs_dct, sizeof(float));
670
671
    /* Split out the coefficient indexes into phase/magnitude pairs */
672
1.29M
    idx = 255 + av_clip(lpcs[64],               -255, 255);
673
1.29M
    coeffs[0]  = coeffs[0]  * s->cos[idx];
674
1.29M
    idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
675
1.29M
    last_coeff = coeffs[64] * s->cos[idx];
676
41.2M
    for (n = 63;; n--) {
677
41.2M
        idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
678
41.2M
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
679
41.2M
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
680
681
41.2M
        if (!--n) break;
682
683
40.0M
        idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
684
40.0M
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
685
40.0M
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
686
40.0M
    }
687
1.29M
    coeffs[64] = last_coeff;
688
689
    /* move into real domain */
690
1.29M
    s->irdft_fn(s->irdft, coeffs_dst, coeffs, sizeof(AVComplexFloat));
691
692
    /* tilt correction and normalize scale */
693
1.29M
    memset(&coeffs_dst[remainder], 0, sizeof(coeffs_dst[0]) * (128 - remainder));
694
1.29M
    if (s->denoise_tilt_corr) {
695
989k
        float tilt_mem = 0;
696
697
989k
        coeffs_dst[remainder - 1] = 0;
698
989k
        ff_tilt_compensation(&tilt_mem,
699
989k
                             -1.8 * tilt_factor(coeffs_dst, remainder - 1),
700
989k
                             coeffs_dst, remainder);
701
989k
    }
702
1.29M
    sq = (1.0 / 64.0) * sqrtf(1 / ff_scalarproduct_float_c(coeffs_dst, coeffs_dst,
703
1.29M
                                                           remainder));
704
61.9M
    for (n = 0; n < remainder; n++)
705
60.6M
        coeffs_dst[n] *= sq;
706
1.29M
}
707
708
/**
709
 * This function applies a Wiener filter on the (noisy) speech signal as
710
 * a means to denoise it.
711
 *
712
 * - take RDFT of LPCs to get the power spectrum of the noise + speech;
713
 * - using this power spectrum, calculate (for each frequency) the Wiener
714
 *    filter gain, which depends on the frequency power and desired level
715
 *    of noise subtraction (when set too high, this leads to artifacts).
716
 *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
717
 *    of 4-8kHz);
718
 * - by doing a phase shift, calculate the Hilbert transform of this array
719
 *    of per-frequency filter-gains to get the filtering coefficients;
720
 * - smoothen/normalize/de-tilt these filter coefficients as desired;
721
 * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
722
 *    to get the denoised speech signal;
723
 * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
724
 *    the frame boundary) is saved and applied to subsequent frames by an
725
 *    overlap-add method (otherwise you get clicking artifacts).
726
 *
727
 * @param s WMA Voice decoding context
728
 * @param fcb_type Frame (codebook) type
729
 * @param synth_pf input: the noisy speech signal, output: denoised speech
730
 *                 data; should be 16-byte aligned (for ASM purposes)
731
 * @param size size of the speech data
732
 * @param lpcs LPCs used to synthesize this frame's speech data
733
 */
734
static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
735
                           float *synth_pf, int size,
736
                           const float *lpcs)
737
1.30M
{
738
1.30M
    int remainder, lim, n;
739
740
1.30M
    if (fcb_type != FCB_TYPE_SILENCE) {
741
1.29M
        LOCAL_ALIGNED_32(float, coeffs_f, [0x82]);
742
1.29M
        LOCAL_ALIGNED_32(float, synth_f, [0x82]);
743
1.29M
        float *tilted_lpcs = s->tilted_lpcs_pf,
744
1.29M
              *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
745
746
1.29M
        tilted_lpcs[0]           = 1.0;
747
1.29M
        memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
748
1.29M
        memset(&tilted_lpcs[s->lsps + 1], 0,
749
1.29M
               sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
750
1.29M
        ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
751
1.29M
                             tilted_lpcs, s->lsps + 2);
752
753
        /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
754
         * size is applied to the next frame. All input beyond this is zero,
755
         * and thus all output beyond this will go towards zero, hence we can
756
         * limit to min(size-1, 127-size) as a performance consideration. */
757
1.29M
        remainder = FFMIN(127 - size, size - 1);
758
1.29M
        calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
759
760
        /* apply coefficients (in frequency spectrum domain), i.e. complex
761
         * number multiplication */
762
1.29M
        memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
763
1.29M
        s->rdft_fn(s->rdft, synth_f, synth_pf, sizeof(float));
764
1.29M
        s->rdft_fn(s->rdft, coeffs_f, coeffs, sizeof(float));
765
1.29M
        synth_f[0] *= coeffs_f[0];
766
1.29M
        synth_f[1] *= coeffs_f[1];
767
83.8M
        for (n = 1; n <= 64; n++) {
768
82.5M
            float v1 = synth_f[n * 2], v2 = synth_f[n * 2 + 1];
769
82.5M
            synth_f[n * 2]     = v1 * coeffs_f[n * 2] - v2 * coeffs_f[n * 2 + 1];
770
82.5M
            synth_f[n * 2 + 1] = v2 * coeffs_f[n * 2] + v1 * coeffs_f[n * 2 + 1];
771
82.5M
        }
772
1.29M
        s->irdft_fn(s->irdft, synth_pf, synth_f, sizeof(AVComplexFloat));
773
1.29M
    }
774
775
    /* merge filter output with the history of previous runs */
776
1.30M
    if (s->denoise_filter_cache_size) {
777
1.28M
        lim = FFMIN(s->denoise_filter_cache_size, size);
778
61.9M
        for (n = 0; n < lim; n++)
779
60.6M
            synth_pf[n] += s->denoise_filter_cache[n];
780
1.28M
        s->denoise_filter_cache_size -= lim;
781
1.28M
        memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
782
1.28M
                sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
783
1.28M
    }
784
785
    /* move remainder of filter output into a cache for future runs */
786
1.30M
    if (fcb_type != FCB_TYPE_SILENCE) {
787
1.29M
        lim = FFMIN(remainder, s->denoise_filter_cache_size);
788
1.29M
        for (n = 0; n < lim; n++)
789
0
            s->denoise_filter_cache[n] += synth_pf[size + n];
790
1.29M
        if (lim < remainder) {
791
1.29M
            memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
792
1.29M
                   sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
793
1.29M
            s->denoise_filter_cache_size = remainder;
794
1.29M
        }
795
1.29M
    }
796
1.30M
}
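The v1/v2 block in the loop above is nothing more than a complex multiplication of interleaved (re, im) frequency bins; written out on its own as a sketch (not decoder code):

typedef struct { float re, im; } cplx;

/* (a.re + i*a.im) * (b.re + i*b.im): the operation applied to each pair
 * synth_f[2n], synth_f[2n+1] with the matching denoise coefficient pair */
cplx cmul(cplx a, cplx b)
{
    return (cplx){ a.re * b.re - a.im * b.im,
                   a.im * b.re + a.re * b.im };
}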
797
798
/**
799
 * Averaging projection filter, the postfilter used in WMAVoice.
800
 *
801
 * This uses the following steps:
802
 * - A zero-synthesis filter (generate excitation from synth signal)
803
 * - Kalman smoothing on excitation, based on pitch
804
 * - Re-synthesized smoothened output
805
 * - Iterative Wiener denoise filter
806
 * - Adaptive gain filter
807
 * - DC filter
808
 *
809
 * @param s WMAVoice decoding context
810
 * @param synth Speech synthesis output (before postfilter)
811
 * @param samples Output buffer for filtered samples
812
 * @param size Buffer size of synth & samples
813
 * @param lpcs Generated LPCs used for speech synthesis
814
 * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
815
 * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
816
 * @param pitch Pitch of the input signal
817
 */
818
static void postfilter(WMAVoiceContext *s, const float *synth,
819
                       float *samples,    int size,
820
                       const float *lpcs, float *zero_exc_pf,
821
                       int fcb_type,      int pitch)
822
1.30M
{
823
1.30M
    float synth_filter_in_buf[MAX_FRAMESIZE / 2],
824
1.30M
          *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
825
1.30M
          *synth_filter_in = zero_exc_pf;
826
827
1.30M
    av_assert0(size <= MAX_FRAMESIZE / 2);
828
829
    /* generate excitation from input signal */
830
1.30M
    ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
831
832
1.30M
    if (fcb_type >= FCB_TYPE_AW_PULSES &&
833
1.30M
        !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
834
1.16M
        synth_filter_in = synth_filter_in_buf;
835
836
    /* re-synthesize speech after smoothening, and keep history */
837
1.30M
    ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
838
1.30M
                                 synth_filter_in, size, s->lsps);
839
1.30M
    memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
840
1.30M
           sizeof(synth_pf[0]) * s->lsps);
841
842
1.30M
    wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
843
844
1.30M
    adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
845
1.30M
                          &s->postfilter_agc);
846
847
1.30M
    if (s->dc_level > 8) {
848
        /* remove ultra-low frequency DC noise / highpass filter;
849
         * coefficients are identical to those used in SIPR decoding,
850
         * and very closely resemble those used in AMR-NB decoding. */
851
11.3k
        ff_acelp_apply_order_2_transfer_function(samples, samples,
852
11.3k
            (const float[2]) { -1.99997,      1.0 },
853
11.3k
            (const float[2]) { -1.9330735188, 0.93589198496 },
854
11.3k
            0.93980580475, s->dcf_mem, size);
855
11.3k
    }
856
1.30M
}
857
/**
858
 * @}
859
 */
860
861
/**
862
 * Dequantize LSPs
863
 * @param lsps output pointer to the array that will hold the LSPs
864
 * @param num number of LSPs to be dequantized
865
 * @param values quantized values, contains n_stages values
866
 * @param sizes range (i.e. max value) of each quantized value
867
 * @param n_stages number of dequantization runs
868
 * @param table dequantization table to be used
869
 * @param mul_q LSF multiplier
870
 * @param base_q base (lowest) LSF values
871
 */
872
static void dequant_lsps(double *lsps, int num,
873
                         const uint16_t *values,
874
                         const uint16_t *sizes,
875
                         int n_stages, const uint8_t *table,
876
                         const double *mul_q,
877
                         const double *base_q)
878
1.91M
{
879
1.91M
    int n, m;
880
881
1.91M
    memset(lsps, 0, num * sizeof(*lsps));
882
7.45M
    for (n = 0; n < n_stages; n++) {
883
5.54M
        const uint8_t *t_off = &table[values[n] * num];
884
5.54M
        double base = base_q[n], mul = mul_q[n];
885
886
67.5M
        for (m = 0; m < num; m++)
887
62.0M
            lsps[m] += base + mul * t_off[m];
888
889
5.54M
        table += sizes[n] * num;
890
5.54M
    }
891
1.91M
}
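An equivalent, loop-interchanged restatement of dequant_lsps() that may be easier to read per output index (illustrative only; the decoder keeps the stage-outer form above):

#include <stdint.h>

void dequant_lsps_scalar(double *lsps, int num,
                         const uint16_t *values, const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q, const double *base_q)
{
    for (int m = 0; m < num; m++) {
        const uint8_t *t = table;
        double v = 0.0;

        for (int n = 0; n < n_stages; n++) {
            /* one scaled codebook entry per stage is accumulated:
             * base_q[n] + mul_q[n] * table_n[values[n] * num + m] */
            v += base_q[n] + mul_q[n] * t[values[n] * num + m];
            t += sizes[n] * num;  /* advance to the next stage's codebook */
        }
        lsps[m] = v;
    }
}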
892
893
/**
894
 * @name LSP dequantization routines
895
 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
896
 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
897
 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
898
 * @{
899
 */
900
/**
901
 * Parse 10 independently-coded LSPs.
902
 */
903
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
904
926k
{
905
926k
    static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
906
926k
    static const double mul_lsf[4] = {
907
926k
        5.2187144800e-3,    1.4626986422e-3,
908
926k
        9.6179549166e-4,    1.1325736225e-3
909
926k
    };
910
926k
    static const double base_lsf[4] = {
911
926k
        M_PI * -2.15522e-1, M_PI * -6.1646e-2,
912
926k
        M_PI * -3.3486e-2,  M_PI * -5.7408e-2
913
926k
    };
914
926k
    uint16_t v[4];
915
916
926k
    v[0] = get_bits(gb, 8);
917
926k
    v[1] = get_bits(gb, 6);
918
926k
    v[2] = get_bits(gb, 5);
919
926k
    v[3] = get_bits(gb, 5);
920
921
926k
    dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
922
926k
                 mul_lsf, base_lsf);
923
926k
}
924
925
/**
926
 * Parse 10 independently-coded LSPs, and then derive the tables to
927
 * generate LSPs for the other frames from them (residual coding).
928
 */
929
static void dequant_lsp10r(GetBitContext *gb,
930
                           double *i_lsps, const double *old,
931
                           double *a1, double *a2, int q_mode)
932
307k
{
933
307k
    static const uint16_t vec_sizes[3] = { 128, 64, 64 };
934
307k
    static const double mul_lsf[3] = {
935
307k
        2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
936
307k
    };
937
307k
    static const double base_lsf[3] = {
938
307k
        M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
939
307k
    };
940
307k
    const float (*ipol_tab)[2][10] = q_mode ?
941
293k
        wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
942
307k
    uint16_t interpol, v[3];
943
307k
    int n;
944
945
307k
    dequant_lsp10i(gb, i_lsps);
946
947
307k
    interpol = get_bits(gb, 5);
948
307k
    v[0]     = get_bits(gb, 7);
949
307k
    v[1]     = get_bits(gb, 6);
950
307k
    v[2]     = get_bits(gb, 6);
951
952
3.38M
    for (n = 0; n < 10; n++) {
953
3.07M
        double delta = old[n] - i_lsps[n];
954
3.07M
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
955
3.07M
        a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
956
3.07M
    }
957
958
307k
    dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
959
307k
                 mul_lsf, base_lsf);
960
307k
}
961
962
/**
963
 * Parse 16 independently-coded LSPs.
964
 */
965
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
966
120k
{
967
120k
    static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
968
120k
    static const double mul_lsf[5] = {
969
120k
        3.3439586280e-3,    6.9908173703e-4,
970
120k
        3.3216608306e-3,    1.0334960326e-3,
971
120k
        3.1899104283e-3
972
120k
    };
973
120k
    static const double base_lsf[5] = {
974
120k
        M_PI * -1.27576e-1, M_PI * -2.4292e-2,
975
120k
        M_PI * -1.28094e-1, M_PI * -3.2128e-2,
976
120k
        M_PI * -1.29816e-1
977
120k
    };
978
120k
    uint16_t v[5];
979
980
120k
    v[0] = get_bits(gb, 8);
981
120k
    v[1] = get_bits(gb, 6);
982
120k
    v[2] = get_bits(gb, 7);
983
120k
    v[3] = get_bits(gb, 6);
984
120k
    v[4] = get_bits(gb, 7);
985
986
120k
    dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
987
120k
                 wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
988
120k
    dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
989
120k
                 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
990
120k
    dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
991
120k
                 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
992
120k
}
993
994
/**
995
 * Parse 16 independently-coded LSPs, and then derive the tables to
996
 * generate LSPs for the other frames from them (residual coding).
997
 */
998
static void dequant_lsp16r(GetBitContext *gb,
999
                           double *i_lsps, const double *old,
1000
                           double *a1, double *a2, int q_mode)
1001
105k
{
1002
105k
    static const uint16_t vec_sizes[3] = { 128, 128, 128 };
1003
105k
    static const double mul_lsf[3] = {
1004
105k
        1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
1005
105k
    };
1006
105k
    static const double base_lsf[3] = {
1007
105k
        M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
1008
105k
    };
1009
105k
    const float (*ipol_tab)[2][16] = q_mode ?
1010
101k
        wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
1011
105k
    uint16_t interpol, v[3];
1012
105k
    int n;
1013
1014
105k
    dequant_lsp16i(gb, i_lsps);
1015
1016
105k
    interpol = get_bits(gb, 5);
1017
105k
    v[0]     = get_bits(gb, 7);
1018
105k
    v[1]     = get_bits(gb, 7);
1019
105k
    v[2]     = get_bits(gb, 7);
1020
1021
1.79M
    for (n = 0; n < 16; n++) {
1022
1.69M
        double delta = old[n] - i_lsps[n];
1023
1.69M
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1024
1.69M
        a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1025
1.69M
    }
1026
1027
105k
    dequant_lsps( a2,     10,  v,     vec_sizes,    1,
1028
105k
                 wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
1029
105k
    dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1030
105k
                 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1031
105k
    dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1032
105k
                 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1033
105k
}
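The bit counts quoted in the group comment above can be checked directly against the get_bits() widths in the four parsers:

    lsp10i: 8 + 6 + 5 + 5         = 24 bits
    lsp10r: + 5 + 7 + 6 + 6       = 24 more bits
    lsp16i: 8 + 6 + 7 + 6 + 7     = 34 bits
    lsp16r: + 5 + 7 + 7 + 7       = 26 more bits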
1034
1035
/**
1036
 * @}
1037
 * @name Pitch-adaptive window coding functions
1038
 * The next few functions are for pitch-adaptive window coding.
1039
 * @{
1040
 */
1041
/**
1042
 * Parse the offset of the first pitch-adaptive window pulses, and
1043
 * the distribution of pulses between the two blocks in this frame.
1044
 * @param s WMA Voice decoding context private data
1045
 * @param gb bit I/O context
1046
 * @param pitch pitch for each block in this frame
1047
 */
1048
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1049
                            const int *pitch)
1050
433k
{
1051
433k
    static const int16_t start_offset[94] = {
1052
433k
        -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
1053
433k
         13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
1054
433k
         27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
1055
433k
         45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
1056
433k
         69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
1057
433k
         93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
1058
433k
        117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1059
433k
        141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1060
433k
    };
1061
433k
    int bits, offset;
1062
1063
    /* position of pulse */
1064
433k
    s->aw_idx_is_ext = 0;
1065
433k
    if ((bits = get_bits(gb, 6)) >= 54) {
1066
1.25k
        s->aw_idx_is_ext = 1;
1067
1.25k
        bits += (bits - 54) * 3 + get_bits(gb, 2);
1068
1.25k
    }
1069
1070
    /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1071
     * the distribution of the pulses in each block contained in this frame. */
1072
433k
    s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1073
453k
    for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1074
433k
    s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1075
433k
    s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1076
433k
    offset                  += s->aw_n_pulses[0] * pitch[0];
1077
433k
    s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1078
433k
    s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1079
1080
    /* if continuing from a position before the block, reset position to
1081
     * start of block (when corrected for the range over which it can be
1082
     * spread in aw_pulse_set1()). */
1083
433k
    if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1084
434k
        while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1085
2.77k
            s->aw_first_pulse_off[1] -= pitch[1];
1086
432k
        if (start_offset[bits] < 0)
1087
29.5k
            while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1088
9.29k
                s->aw_first_pulse_off[0] -= pitch[0];
1089
432k
    }
1090
433k
}
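A worked example of the pulse-count formula above, with values chosen purely for illustration: for pitch[0] = 40 and a first pulse at offset = 13, the pulses in the first half-frame land at samples 13 and 53 (93 already falls outside MAX_FRAMESIZE / 2 = 80), and the ceiling-division form used in the code agrees:

    aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0]
                   = (40 - 1 + 80 - 13) / 40 = 106 / 40 = 2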
1091
1092
/**
1093
 * Apply second set of pitch-adaptive window pulses.
1094
 * @param s WMA Voice decoding context private data
1095
 * @param gb bit I/O context
1096
 * @param block_idx block index in frame [0, 1]
1097
 * @param fcb structure containing fixed codebook vector info
1098
 * @return -1 on error, 0 otherwise
1099
 */
1100
static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1101
                         int block_idx, AMRFixed *fcb)
1102
866k
{
1103
866k
    uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1104
866k
    uint16_t *use_mask = use_mask_mem + 2;
1105
    /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1106
     * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1107
     * of idx are the position of the bit within a particular item in the
1108
     * array (0 being the most significant bit, and 15 being the least
1109
     * significant bit), and the remainder (>> 4) is the index in the
1110
     * use_mask[]-array. This is faster and uses less memory than using a
1111
     * 80-byte/80-int array. */
1112
866k
    int pulse_off = s->aw_first_pulse_off[block_idx],
1113
866k
        pulse_start, n, idx, range, aidx, start_off = 0;
1114
1115
    /* set offset of first pulse to within this block */
1116
866k
    if (s->aw_n_pulses[block_idx] > 0)
1117
459k
        while (pulse_off + s->aw_pulse_range < 1)
1118
0
            pulse_off += fcb->pitch_lag;
1119
1120
    /* find range per pulse */
1121
866k
    if (s->aw_n_pulses[0] > 0) {
1122
863k
        if (block_idx == 0) {
1123
431k
            range = 32;
1124
431k
        } else /* block_idx = 1 */ {
1125
431k
            range = 8;
1126
431k
            if (s->aw_n_pulses[block_idx] > 0)
1127
26.2k
                pulse_off = s->aw_next_pulse_off_cache;
1128
431k
        }
1129
863k
    } else
1130
3.62k
        range = 16;
1131
866k
    pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1132
1133
    /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
1134
     * in the range [pulse_off, pulse_off + s->aw_pulse_range]), and thus
1135
     * we exclude that range from being pulsed again in this function. */
1136
866k
    memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1137
866k
    memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1138
866k
    memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1139
866k
    if (s->aw_n_pulses[block_idx] > 0)
1140
1.01M
        for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1141
556k
            int excl_range         = s->aw_pulse_range; // always 16 or 24
1142
556k
            uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1143
556k
            int first_sh           = 16 - (idx & 15);
1144
556k
            *use_mask_ptr++       &= 0xFFFFu << first_sh;
1145
556k
            excl_range            -= first_sh;
1146
556k
            if (excl_range >= 16) {
1147
436k
                *use_mask_ptr++    = 0;
1148
436k
                *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1149
436k
            } else
1150
119k
                *use_mask_ptr     &= 0xFFFF >> excl_range;
1151
556k
        }
1152
1153
    /* find the 'aidx'th offset that is not excluded */
1154
866k
    aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1155
3.57M
    for (n = 0; n <= aidx; pulse_start++) {
1156
2.95M
        for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1157
2.71M
        if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1158
19.5k
            if (use_mask[0])      idx = 0x0F;
1159
5.40k
            else if (use_mask[1]) idx = 0x1F;
1160
2.49k
            else if (use_mask[2]) idx = 0x2F;
1161
1.72k
            else if (use_mask[3]) idx = 0x3F;
1162
1.72k
            else if (use_mask[4]) idx = 0x4F;
1163
1.72k
            else return -1;
1164
17.8k
            idx -= av_log2_16bit(use_mask[idx >> 4]);
1165
17.8k
        }
1166
2.71M
        if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1167
2.26M
            use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1168
2.26M
            n++;
1169
2.26M
            start_off = idx;
1170
2.26M
        }
1171
2.71M
    }
1172
1173
864k
    fcb->x[fcb->n] = start_off;
1174
864k
    fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1175
864k
    fcb->n++;
1176
1177
    /* set offset for next block, relative to start of that block */
1178
864k
    n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1179
864k
    s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1180
864k
    return 0;
1181
866k
}
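The bit addressing described in the big comment above (an 80-bit mask stored in five 16-bit words, bit 0 of each word being its most significant bit) boils down to two expressions that recur in the function; as standalone helpers (illustrative, not decoder code):

#include <stdint.h>

/* idx >> 4 picks one of the five 16-bit words, idx & 15 picks the bit
 * inside it, counted from the most significant bit downwards */
int use_mask_test(const uint16_t *use_mask, int idx)
{
    return use_mask[idx >> 4] & (0x8000 >> (idx & 15));
}

void use_mask_clear(uint16_t *use_mask, int idx)
{
    use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
}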
1182
1183
/**
1184
 * Apply first set of pitch-adaptive window pulses.
1185
 * @param s WMA Voice decoding context private data
1186
 * @param gb bit I/O context
1187
 * @param block_idx block index in frame [0, 1]
1188
 * @param fcb storage location for fixed codebook pulse info
1189
 */
1190
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1191
                          int block_idx, AMRFixed *fcb)
1192
866k
{
1193
866k
    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1194
866k
    float v;
1195
1196
866k
    if (s->aw_n_pulses[block_idx] > 0) {
1197
459k
        int n, v_mask, i_mask, sh, n_pulses;
1198
1199
459k
        if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1200
432k
            n_pulses = 3;
1201
432k
            v_mask   = 8;
1202
432k
            i_mask   = 7;
1203
432k
            sh       = 4;
1204
432k
        } else { // 4 pulses, 1:sign + 2:index each
1205
27.1k
            n_pulses = 4;
1206
27.1k
            v_mask   = 4;
1207
27.1k
            i_mask   = 3;
1208
27.1k
            sh       = 3;
1209
27.1k
        }
1210
1211
1.86M
        for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1212
1.40M
            fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1213
1.40M
            fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1214
1.40M
                                 s->aw_first_pulse_off[block_idx];
1215
1.48M
            while (fcb->x[fcb->n] < 0)
1216
77.5k
                fcb->x[fcb->n] += fcb->pitch_lag;
1217
1.40M
            if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1218
1.40M
                fcb->n++;
1219
1.40M
        }
1220
459k
    } else {
1221
407k
        int num2 = (val & 0x1FF) >> 1, delta, idx;
1222
1223
407k
        if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1224
57.9k
        else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1225
2.02k
        else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1226
1.05k
        else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1227
407k
        v = (val & 0x200) ? -1.0 : 1.0;
1228
1229
407k
        fcb->no_repeat_mask |= 3 << fcb->n;
1230
407k
        fcb->x[fcb->n]       = idx - delta;
1231
407k
        fcb->y[fcb->n]       = v;
1232
407k
        fcb->x[fcb->n + 1]   = idx;
1233
407k
        fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1234
407k
        fcb->n              += 2;
1235
407k
    }
1236
866k
}
1237
1238
/**
1239
 * @}
1240
 *
1241
 * Generate a random number from frame_cntr and block_num, which will live
1242
 * in the range [0, 1000 - block_size] (so it can be used as an index in a
1243
 * table of size 1000 of which you want to read block_size entries).
1244
 *
1245
 * @param frame_cntr current frame number
1246
 * @param block_num current block index
1247
 * @param block_size amount of entries we want to read from a table
1248
 *                   that has 1000 entries
1249
 * @return a (non-)random number in the [0, 1000 - block_size] range.
1250
 */
1251
static int pRNG(int frame_cntr, int block_num, int block_size)
1252
778k
{
1253
    /* array to simplify the calculation of z:
1254
     * y = (x % 9) * 5 + 6;
1255
     * z = (49995 * x) / y;
1256
     * Since y only has 9 values, we can remove the division by using a
1257
     * LUT and FASTDIV-style divisions. For each of the 9 values
1258
     * of y, we can rewrite z as:
1259
     * z = x * (49995 / y) + x * ((49995 % y) / y)
1260
     * In this table, each col represents one possible value of y, the
1261
     * first number is 49995 / y, and the second is the FASTDIV variant
1262
     * of 49995 % y / y. */
1263
778k
    static const unsigned int div_tbl[9][2] = {
1264
778k
        { 8332,  3 * 715827883U }, // y =  6
1265
778k
        { 4545,  0 * 390451573U }, // y = 11
1266
778k
        { 3124, 11 * 268435456U }, // y = 16
1267
778k
        { 2380, 15 * 204522253U }, // y = 21
1268
778k
        { 1922, 23 * 165191050U }, // y = 26
1269
778k
        { 1612, 23 * 138547333U }, // y = 31
1270
778k
        { 1388, 27 * 119304648U }, // y = 36
1271
778k
        { 1219, 16 * 104755300U }, // y = 41
1272
778k
        { 1086, 39 *  93368855U }  // y = 46
1273
778k
    };
1274
778k
    unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1275
778k
    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
1276
                                    // so this is effectively a modulo (%)
1277
778k
    y = x - 9 * MULH(477218589, x); // x % 9
1278
778k
    z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1279
                                    // z = x * 49995 / (y * 5 + 6)
1280
778k
    return z % (1000 - block_size);
1281
778k
}
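A small self-contained check of the table comment above, under the assumption that UMULH returns the high 32 bits of a 32x32 multiply: for y = 6, the entry { 8332, 3 * 715827883U } reproduces (49995 * x / 6) & 0xFFFF without a division. This is a demonstration only, not decoder code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned int quot = 8332;            /* 49995 / 6                       */
    const unsigned int frac = 3 * 715827883U;  /* FASTDIV form of (49995 % 6) / 6 */
    unsigned int x;

    for (x = 1; x < 0xFFFF; x += 4321) {
        unsigned int umulh = (unsigned int)(((uint64_t)x * frac) >> 32);
        unsigned int z_lut = (uint16_t)(x * quot + umulh);
        unsigned int z_ref = (uint16_t)((uint64_t)x * 49995 / 6);
        if (z_lut != z_ref)
            printf("mismatch at x = %u: %u vs %u\n", x, z_lut, z_ref);
    }
    printf("LUT check done\n");
    return 0;
}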
1282
1283
/**
1284
 * Parse hardcoded signal for a single block.
1285
 * @note see #synth_block().
1286
 */
1287
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1288
                                 int block_idx, int size,
1289
                                 const struct frame_type_desc *frame_desc,
1290
                                 float *excitation)
1291
808k
{
1292
808k
    float gain;
1293
808k
    int n, r_idx;
1294
1295
808k
    av_assert0(size <= MAX_FRAMESIZE);
1296
1297
    /* Set the offset from which we start reading wmavoice_std_codebook */
1298
808k
    if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1299
776k
        r_idx = pRNG(s->frame_cntr, block_idx, size);
1300
776k
        gain  = s->silence_gain;
1301
776k
    } else /* FCB_TYPE_HARDCODED */ {
1302
31.6k
        r_idx = get_bits(gb, 8);
1303
31.6k
        gain  = wmavoice_gain_universal[get_bits(gb, 6)];
1304
31.6k
    }
1305
1306
    /* Clear gain prediction parameters */
1307
808k
    memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1308
1309
    /* Apply gain to hardcoded codebook and use that as excitation signal */
1310
127M
    for (n = 0; n < size; n++)
1311
126M
        excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1312
808k
}
1313
1314
/**
1315
 * Parse FCB/ACB signal for a single block.
1316
 * @note see #synth_block().
1317
 */
1318
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1319
                                int block_idx, int size,
1320
                                int block_pitch_sh2,
1321
                                const struct frame_type_desc *frame_desc,
1322
                                float *excitation)
1323
4.27M
{
1324
4.27M
    static const float gain_coeff[6] = {
1325
4.27M
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1326
4.27M
    };
1327
4.27M
    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1328
4.27M
    int n, idx, gain_weight;
1329
4.27M
    AMRFixed fcb;
1330
1331
4.27M
    av_assert0(size <= MAX_FRAMESIZE / 2);
1332
4.27M
    memset(pulses, 0, sizeof(*pulses) * size);
1333
1334
4.27M
    fcb.pitch_lag      = block_pitch_sh2 >> 2;
1335
4.27M
    fcb.pitch_fac      = 1.0;
1336
4.27M
    fcb.no_repeat_mask = 0;
1337
4.27M
    fcb.n              = 0;
1338
1339
    /* For the other frame types, this is where we apply the innovation
1340
     * (fixed) codebook pulses of the speech signal. */
1341
4.27M
    if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1342
866k
        aw_pulse_set1(s, gb, block_idx, &fcb);
1343
866k
        if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1344
            /* Conceal the block with silence and return.
1345
             * Skip the correct amount of bits to read the next
1346
             * block from the correct offset. */
1347
1.72k
            int r_idx = pRNG(s->frame_cntr, block_idx, size);
1348
1349
139k
            for (n = 0; n < size; n++)
1350
138k
                excitation[n] =
1351
138k
                    wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1352
1.72k
            skip_bits(gb, 7 + 1);
1353
1.72k
            return;
1354
1.72k
        }
1355
3.41M
    } else /* FCB_TYPE_EXC_PULSES */ {
1356
3.41M
        int offset_nbits = 5 - frame_desc->log_n_blocks;
1357
1358
3.41M
        fcb.no_repeat_mask = -1;
1359
        /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1360
         * (instead of double) for a subset of pulses */
1361
20.4M
        for (n = 0; n < 5; n++) {
1362
17.0M
            float sign;
1363
17.0M
            int pos1, pos2;
1364
1365
17.0M
            sign           = get_bits1(gb) ? 1.0 : -1.0;
1366
17.0M
            pos1           = get_bits(gb, offset_nbits);
1367
17.0M
            fcb.x[fcb.n]   = n + 5 * pos1;
1368
17.0M
            fcb.y[fcb.n++] = sign;
1369
17.0M
            if (n < frame_desc->dbl_pulses) {
1370
2.50M
                pos2           = get_bits(gb, offset_nbits);
1371
2.50M
                fcb.x[fcb.n]   = n + 5 * pos2;
1372
2.50M
                fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1373
2.50M
            }
1374
17.0M
        }
1375
3.41M
    }
1376
4.27M
    ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1377
1378
    /* Calculate gain for adaptive & fixed codebook signal.
1379
     * see ff_amr_set_fixed_gain(). */
1380
4.27M
    idx = get_bits(gb, 7);
1381
4.27M
    fcb_gain = expf(ff_scalarproduct_float_c(s->gain_pred_err,
1382
4.27M
                                             gain_coeff, 6) -
1383
4.27M
                    5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1384
4.27M
    acb_gain = wmavoice_gain_codebook_acb[idx];
1385
4.27M
    pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1386
4.27M
                        -2.9957322736 /* log(0.05) */,
1387
4.27M
                         1.6094379124 /* log(5.0)  */);
1388
1389
4.27M
    gain_weight = 8 >> frame_desc->log_n_blocks;
1390
4.27M
    memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1391
4.27M
            sizeof(*s->gain_pred_err) * (6 - gain_weight));
1392
12.8M
    for (n = 0; n < gain_weight; n++)
1393
8.59M
        s->gain_pred_err[n] = pred_err;
1394
1395
    /* Calculation of adaptive codebook */
1396
4.27M
    if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1397
1.96M
        int len;
1398
4.00M
        for (n = 0; n < size; n += len) {
1399
2.03M
            int next_idx_sh16;
1400
2.03M
            int abs_idx    = block_idx * size + n;
1401
2.03M
            int pitch_sh16 = (s->last_pitch_val << 16) +
1402
2.03M
                             s->pitch_diff_sh16 * abs_idx;
1403
2.03M
            int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
1404
2.03M
            int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1405
2.03M
            idx            = idx_sh16 >> 16;
1406
2.03M
            if (s->pitch_diff_sh16) {
1407
75.9k
                if (s->pitch_diff_sh16 > 0) {
1408
34.8k
                    next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1409
34.8k
                } else
1410
41.0k
                    next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1411
75.9k
                len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1412
75.9k
                              1, size - n);
1413
75.9k
            } else
1414
1.96M
                len = size;
1415
1416
2.03M
            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1417
2.03M
                                  wmavoice_ipol1_coeffs, 17,
1418
2.03M
                                  idx, 9, len);
1419
2.03M
        }
1420
2.30M
    } else /* ACB_TYPE_HAMMING */ {
1421
2.30M
        int block_pitch = block_pitch_sh2 >> 2;
1422
2.30M
        idx             = block_pitch_sh2 & 3;
1423
2.30M
        if (idx) {
1424
7.79k
            ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1425
7.79k
                                  wmavoice_ipol2_coeffs, 4,
1426
7.79k
                                  idx, 8, size);
1427
7.79k
        } else
1428
2.30M
            av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1429
2.30M
                              sizeof(float) * size);
1430
2.30M
    }
1431
1432
    /* Interpolate ACB/FCB and use as excitation signal */
1433
4.27M
    ff_weighted_vector_sumf(excitation, excitation, pulses,
1434
4.27M
                            acb_gain, fcb_gain, size);
1435
4.27M
}
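The fixed-codebook gain above follows a log-domain moving-average prediction; a hedged sketch of just that formula follows. predicted_fcb_gain is a hypothetical helper, not part of the decoder, and cb_val stands in for the value looked up in wmavoice_gain_codebook_fcb.

#include <math.h>
#include <stdio.h>

static float predicted_fcb_gain(const float pred_err[6], float cb_val)
{
    /* same coefficients and bias as the decoder's gain_coeff / 5.2409161640 */
    static const float gain_coeff[6] = {
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
    };
    float acc = 0.0f;
    for (int i = 0; i < 6; i++)
        acc += pred_err[i] * gain_coeff[i];
    return expf(acc - 5.2409161640f + cb_val);
}

int main(void)
{
    const float no_history[6] = { 0 };  /* cleared prediction error, as after silence */
    printf("gain for cb_val = 5.0: %f\n", predicted_fcb_gain(no_history, 5.0f));
    printf("gain for cb_val = 1.0: %f\n", predicted_fcb_gain(no_history, 1.0f));
    return 0;
}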
1436
1437
/**
1438
 * Parse data in a single block.
1439
 *
1440
 * @param s WMA Voice decoding context private data
1441
 * @param gb bit I/O context
1442
 * @param block_idx index of the to-be-read block
1443
 * @param size number of samples to be read in this block
1444
 * @param block_pitch_sh2 pitch for this block << 2
1445
 * @param lsps LSPs for (the end of) this frame
1446
 * @param prev_lsps LSPs for the last frame
1447
 * @param frame_desc frame type descriptor
1448
 * @param excitation target memory for the ACB+FCB interpolated signal
1449
 * @param synth target memory for the speech synthesis filter output
1450
 * @return 0 on success, <0 on error.
1451
 */
1452
static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1453
                        int block_idx, int size,
1454
                        int block_pitch_sh2,
1455
                        const double *lsps, const double *prev_lsps,
1456
                        const struct frame_type_desc *frame_desc,
1457
                        float *excitation, float *synth)
1458
5.08M
{
1459
5.08M
    double i_lsps[MAX_LSPS];
1460
5.08M
    float lpcs[MAX_LSPS];
1461
5.08M
    float fac;
1462
5.08M
    int n;
1463
1464
5.08M
    if (frame_desc->acb_type == ACB_TYPE_NONE)
1465
808k
        synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1466
4.27M
    else
1467
4.27M
        synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1468
4.27M
                            frame_desc, excitation);
1469
1470
    /* convert interpolated LSPs to LPCs */
1471
5.08M
    fac = (block_idx + 0.5) / frame_desc->n_blocks;
1472
59.0M
    for (n = 0; n < s->lsps; n++) // LSF -> LSP
1473
53.9M
        i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1474
5.08M
    ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1475
1476
    /* Speech synthesis */
1477
5.08M
    ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1478
5.08M
}
1479
1480
/**
1481
 * Synthesize output samples for a single frame.
1482
 *
1483
 * @param ctx WMA Voice decoder context
1484
 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1485
 * @param frame_idx Frame number within superframe [0-2]
1486
 * @param samples pointer to output sample buffer, has space for at least 160
1487
 *                samples
1488
 * @param lsps LSP array
1489
 * @param prev_lsps array of previous frame's LSPs
1490
 * @param excitation target buffer for excitation signal
1491
 * @param synth target buffer for synthesized speech data
1492
 * @return 0 on success, <0 on error.
1493
 */
1494
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1495
                       float *samples,
1496
                       const double *lsps, const double *prev_lsps,
1497
                       float *excitation, float *synth)
1498
1.87M
{
1499
1.87M
    WMAVoiceContext *s = ctx->priv_data;
1500
1.87M
    int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1501
1.87M
    int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1502
1503
    /* Parse frame type ("frame header"), see frame_descs */
1504
1.87M
    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc, 6, 3)], block_nsamples;
1505
1506
1.87M
    pitch[0] = INT_MAX;
1507
1508
1.87M
    if (bd_idx < 0) {
1509
3.75k
        av_log(ctx, AV_LOG_ERROR,
1510
3.75k
               "Invalid frame type VLC code, skipping\n");
1511
3.75k
        return AVERROR_INVALIDDATA;
1512
3.75k
    }
1513
1514
1.86M
    block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1515
1516
    /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1517
1.86M
    if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1518
        /* Pitch is provided per frame, which is interpreted as the pitch of
1519
         * the last sample of the last block of this frame. We can interpolate
1520
         * the pitch of other blocks (and even pitch-per-sample) by gradually
1521
         * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1522
747k
        n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
1523
747k
        log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
1524
747k
        cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1525
747k
        cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1526
747k
        if (s->last_acb_type == ACB_TYPE_NONE ||
1527
747k
            20 * abs(cur_pitch_val - s->last_pitch_val) >
1528
383k
                (cur_pitch_val + s->last_pitch_val))
1529
626k
            s->last_pitch_val = cur_pitch_val;
1530
1531
        /* pitch per block */
1532
2.71M
        for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1533
1.96M
            int fac = n * 2 + 1;
1534
1535
1.96M
            pitch[n] = (MUL16(fac,                 cur_pitch_val) +
1536
1.96M
                        MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1537
1.96M
                        frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1538
1.96M
        }
1539
1540
        /* "pitch-diff-per-sample" for calculation of pitch per sample */
1541
747k
        s->pitch_diff_sh16 =
1542
747k
            (cur_pitch_val - s->last_pitch_val) * (1 << 16) / MAX_FRAMESIZE;
1543
747k
    }
1544
1545
    /* Global gain (if silence) and pitch-adaptive window coordinates */
1546
1.86M
    switch (frame_descs[bd_idx].fcb_type) {
1547
776k
    case FCB_TYPE_SILENCE:
1548
776k
        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1549
776k
        break;
1550
433k
    case FCB_TYPE_AW_PULSES:
1551
433k
        aw_parse_coords(s, gb, pitch);
1552
433k
        break;
1553
1.86M
    }
1554
1555
6.95M
    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1556
5.08M
        int bl_pitch_sh2;
1557
1558
        /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1559
5.08M
        switch (frame_descs[bd_idx].acb_type) {
1560
2.30M
        case ACB_TYPE_HAMMING: {
1561
            /* Pitch is given per block. Per-block pitches are encoded as an
1562
             * absolute value for the first block, and then delta values
1563
             * relative to this value for all subsequent blocks. The scale of
1564
             * this pitch value is semi-logarithmic compared to its use in the
1565
             * decoder, so we convert it to normal scale also. */
1566
2.30M
            int block_pitch,
1567
2.30M
                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1568
2.30M
                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1569
2.30M
                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
1570
1571
2.30M
            if (n == 0) {
1572
327k
                block_pitch = get_bits(gb, s->block_pitch_nbits);
1573
327k
            } else
1574
1.98M
                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1575
1.98M
                                 get_bits(gb, s->block_delta_pitch_nbits);
1576
            /* Convert last_ so that any next delta is within _range */
1577
2.30M
            last_block_pitch = av_clip(block_pitch,
1578
2.30M
                                       s->block_delta_pitch_hrange,
1579
2.30M
                                       s->block_pitch_range -
1580
2.30M
                                           s->block_delta_pitch_hrange);
1581
1582
            /* Convert semi-log-style scale back to normal scale */
1583
2.30M
            if (block_pitch < t1) {
1584
25.3k
                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1585
2.28M
            } else {
1586
2.28M
                block_pitch -= t1;
1587
2.28M
                if (block_pitch < t2) {
1588
9.63k
                    bl_pitch_sh2 =
1589
9.63k
                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
1590
2.27M
                } else {
1591
2.27M
                    block_pitch -= t2;
1592
2.27M
                    if (block_pitch < t3) {
1593
2.26M
                        bl_pitch_sh2 =
1594
2.26M
                            (s->block_conv_table[2] + block_pitch) << 2;
1595
2.26M
                    } else
1596
3.54k
                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
1597
2.27M
                }
1598
2.28M
            }
1599
2.30M
            pitch[n] = bl_pitch_sh2 >> 2;
1600
2.30M
            break;
1601
0
        }
1602
1603
1.96M
        case ACB_TYPE_ASYMMETRIC: {
1604
1.96M
            bl_pitch_sh2 = pitch[n] << 2;
1605
1.96M
            break;
1606
0
        }
1607
1608
808k
        default: // ACB_TYPE_NONE has no pitch
1609
808k
            bl_pitch_sh2 = 0;
1610
808k
            break;
1611
5.08M
        }
1612
1613
5.08M
        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1614
5.08M
                    lsps, prev_lsps, &frame_descs[bd_idx],
1615
5.08M
                    &excitation[n * block_nsamples],
1616
5.08M
                    &synth[n * block_nsamples]);
1617
5.08M
    }
1618
1619
    /* Averaging projection filter, if applicable. Else, just copy samples
1620
     * from synthesis buffer */
1621
1.86M
    if (s->do_apf) {
1622
654k
        double i_lsps[MAX_LSPS];
1623
654k
        float lpcs[MAX_LSPS];
1624
1625
654k
        if(frame_descs[bd_idx].fcb_type >= FCB_TYPE_AW_PULSES && pitch[0] == INT_MAX)
1626
0
            return AVERROR_INVALIDDATA;
1627
1628
7.30M
        for (n = 0; n < s->lsps; n++) // LSF -> LSP
1629
6.65M
            i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1630
654k
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1631
654k
        postfilter(s, synth, samples, 80, lpcs,
1632
654k
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1633
654k
                   frame_descs[bd_idx].fcb_type, pitch[0]);
1634
1635
7.30M
        for (n = 0; n < s->lsps; n++) // LSF -> LSP
1636
6.65M
            i_lsps[n] = cos(lsps[n]);
1637
654k
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1638
654k
        postfilter(s, &synth[80], &samples[80], 80, lpcs,
1639
654k
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1640
654k
                   frame_descs[bd_idx].fcb_type, pitch[0]);
1641
654k
    } else
1642
1.21M
        memcpy(samples, synth, 160 * sizeof(synth[0]));
1643
1644
    /* Cache values for next frame */
1645
1.86M
    s->frame_cntr++;
1646
1.86M
    if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1647
1.86M
    s->last_acb_type = frame_descs[bd_idx].acb_type;
1648
1.86M
    switch (frame_descs[bd_idx].acb_type) {
1649
792k
    case ACB_TYPE_NONE:
1650
792k
        s->last_pitch_val = 0;
1651
792k
        break;
1652
747k
    case ACB_TYPE_ASYMMETRIC:
1653
747k
        s->last_pitch_val = cur_pitch_val;
1654
747k
        break;
1655
327k
    case ACB_TYPE_HAMMING:
1656
327k
        s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1657
327k
        break;
1658
1.86M
    }
1659
1660
1.86M
    return 0;
1661
1.86M
}
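A standalone sketch of the semi-logarithmic block-pitch mapping used in the ACB_TYPE_HAMMING branch above: three ranges delimited by the conversion table use step sizes of 1, 2 and 4 quarter-samples, then clamp at the maximum. The conversion table values below are hypothetical, chosen only to make the ranges visible.

#include <stdio.h>

static int block_pitch_to_sh2(int block_pitch, const int conv[4])
{
    int t1 = (conv[1] - conv[0]) << 2,
        t2 = (conv[2] - conv[1]) << 1,
        t3 =  conv[3] - conv[2] + 1;

    if (block_pitch < t1)
        return (conv[0] << 2) + block_pitch;         /* 0.25-sample resolution */
    block_pitch -= t1;
    if (block_pitch < t2)
        return (conv[1] << 2) + (block_pitch << 1);  /* 0.5-sample resolution  */
    block_pitch -= t2;
    if (block_pitch < t3)
        return (conv[2] + block_pitch) << 2;         /* 1-sample resolution    */
    return conv[3] << 2;                             /* clamp at the maximum   */
}

int main(void)
{
    const int conv[4] = { 20, 30, 44, 80 };  /* hypothetical block_conv_table */
    int coded;

    for (coded = 0; coded <= 110; coded += 22)
        printf("coded %3d -> pitch %5.2f samples\n",
               coded, block_pitch_to_sh2(coded, conv) / 4.0);
    return 0;
}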
1662
1663
/**
1664
 * Ensure minimum value for first item, maximum value for last value,
1665
 * proper spacing between each value and proper ordering.
1666
 *
1667
 * @param lsps array of LSPs
1668
 * @param num size of LSP array
1669
 *
1670
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1671
 *       useful to put in a generic location later on. Parts are also
1672
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1673
 *       which operate on floats.
1674
 */
1675
static void stabilize_lsps(double *lsps, int num)
1676
1.87M
{
1677
1.87M
    int n, m, l;
1678
1679
    /* set minimum value for first, maximum value for last and minimum
1680
     * spacing between LSF values.
1681
     * Very similar to ff_set_min_dist_lsf(), but in double. */
1682
1.87M
    lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
1683
20.7M
    for (n = 1; n < num; n++)
1684
18.8M
        lsps[n]   = FFMAX(lsps[n],       lsps[n - 1] + 0.0125 * M_PI);
1685
1.87M
    lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1686
1687
    /* reorder (looks like one-time / non-recursed bubblesort).
1688
     * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1689
20.7M
    for (n = 1; n < num; n++) {
1690
18.8M
        if (lsps[n] < lsps[n - 1]) {
1691
28.1k
            for (m = 1; m < num; m++) {
1692
25.3k
                double tmp = lsps[m];
1693
29.4k
                for (l = m - 1; l >= 0; l--) {
1694
29.4k
                    if (lsps[l] <= tmp) break;
1695
4.15k
                    lsps[l + 1] = lsps[l];
1696
4.15k
                }
1697
25.3k
                lsps[l + 1] = tmp;
1698
25.3k
            }
1699
2.81k
            break;
1700
2.81k
        }
1701
18.8M
    }
1702
1.87M
}
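For illustration only, the same three steps (clamp the endpoints, enforce a minimum spacing of 0.0125*PI, reorder if the final clamp broke monotonicity) applied to a toy 4-element LSP vector; this re-implements the function above as a standalone program, not a decoder call.

#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

static void stabilize(double *lsps, int num)
{
    int n, m, l;

    lsps[0] = fmax(lsps[0], 0.0015 * M_PI);
    for (n = 1; n < num; n++)
        lsps[n] = fmax(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
    lsps[num - 1] = fmin(lsps[num - 1], 0.9985 * M_PI);

    for (n = 1; n < num; n++)
        if (lsps[n] < lsps[n - 1]) {  /* only possible here after the final clamp */
            for (m = 1; m < num; m++) {
                double tmp = lsps[m];
                for (l = m - 1; l >= 0 && lsps[l] > tmp; l--)
                    lsps[l + 1] = lsps[l];
                lsps[l + 1] = tmp;
            }
            break;
        }
}

int main(void)
{
    double lsps[4] = { 0.001, 3.10, 3.15, 3.20 };  /* first too low, last beyond 0.9985*PI */
    int n;

    stabilize(lsps, 4);
    for (n = 0; n < 4; n++)
        printf("lsps[%d] = %f\n", n, lsps[n]);
    return 0;
}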
1703
1704
/**
1705
 * Synthesize output samples for a single superframe. If we have any data
1706
 * cached in s->sframe_cache, that will be used instead of whatever is loaded
1707
 * in s->gb.
1708
 *
1709
 * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1710
 * to give a total of 480 samples per superframe. See #synth_frame() for frame
1711
 * parsing. In addition to 3 frames, superframes can also contain the LSPs
1712
 * (if these are globally specified for all frames (residually); they can
1713
 * also be specified individually per-frame. See the s->has_residual_lsps
1714
 * option), and can specify the number of samples encoded in this superframe
1715
 * (if less than 480), usually used to prevent blanks at track boundaries.
1716
 *
1717
 * @param ctx WMA Voice decoder context
1718
 * @return 0 on success, <0 on error or 1 if there was not enough data to
1719
 *         fully parse the superframe
1720
 */
1721
static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
1722
                            int *got_frame_ptr)
1723
777k
{
1724
777k
    WMAVoiceContext *s = ctx->priv_data;
1725
777k
    GetBitContext *gb = &s->gb, s_gb;
1726
777k
    int n, res, n_samples = MAX_SFRAMESIZE;
1727
777k
    double lsps[MAX_FRAMES][MAX_LSPS];
1728
777k
    const double *mean_lsf = s->lsps == 16 ?
1729
531k
        wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1730
777k
    float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1731
777k
    float synth[MAX_LSPS + MAX_SFRAMESIZE];
1732
777k
    float *samples;
1733
1734
777k
    memcpy(synth,      s->synth_history,
1735
777k
           s->lsps             * sizeof(*synth));
1736
777k
    memcpy(excitation, s->excitation_history,
1737
777k
           s->history_nsamples * sizeof(*excitation));
1738
1739
777k
    if (s->sframe_cache_size > 0) {
1740
567k
        gb = &s_gb;
1741
567k
        init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1742
567k
        s->sframe_cache_size = 0;
1743
567k
    }
1744
1745
    /* First bit is speech/music bit, it differentiates between WMAVoice
1746
     * speech samples (the actual codec) and WMAVoice music samples, which
1747
     * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1748
     * the wild yet. */
1749
777k
    if (!get_bits1(gb)) {
1750
77.9k
        avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1751
77.9k
        return AVERROR_PATCHWELCOME;
1752
77.9k
    }
1753
1754
    /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1755
699k
    if (get_bits1(gb)) {
1756
75.6k
        if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1757
73.6k
            av_log(ctx, AV_LOG_ERROR,
1758
73.6k
                   "Superframe encodes > %d samples (%d), not allowed\n",
1759
73.6k
                   MAX_SFRAMESIZE, n_samples);
1760
73.6k
            return AVERROR_INVALIDDATA;
1761
73.6k
        }
1762
75.6k
    }
1763
1764
    /* Parse LSPs, if global for the superframe (can also be per-frame). */
1765
625k
    if (s->has_residual_lsps) {
1766
413k
        double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1767
1768
5.17M
        for (n = 0; n < s->lsps; n++)
1769
4.76M
            prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1770
1771
413k
        if (s->lsps == 10) {
1772
307k
            dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1773
307k
        } else /* s->lsps == 16 */
1774
105k
            dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1775
1776
5.17M
        for (n = 0; n < s->lsps; n++) {
1777
4.76M
            lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1778
4.76M
            lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1779
4.76M
            lsps[2][n] += mean_lsf[n];
1780
4.76M
        }
1781
1.65M
        for (n = 0; n < 3; n++)
1782
1.23M
            stabilize_lsps(lsps[n], s->lsps);
1783
413k
    }
1784
1785
    /* synth_superframe can run multiple times per packet
1786
     * free potential previous frame */
1787
625k
    av_frame_unref(frame);
1788
1789
    /* get output buffer */
1790
625k
    frame->nb_samples = MAX_SFRAMESIZE;
1791
625k
    if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1792
0
        return res;
1793
625k
    frame->nb_samples = n_samples;
1794
625k
    samples = (float *)frame->data[0];
1795
1796
    /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1797
2.49M
    for (n = 0; n < 3; n++) {
1798
1.87M
        if (!s->has_residual_lsps) {
1799
633k
            int m;
1800
1801
633k
            if (s->lsps == 10) {
1802
619k
                dequant_lsp10i(gb, lsps[n]);
1803
619k
            } else /* s->lsps == 16 */
1804
14.4k
                dequant_lsp16i(gb, lsps[n]);
1805
1806
7.05M
            for (m = 0; m < s->lsps; m++)
1807
6.42M
                lsps[n][m] += mean_lsf[m];
1808
633k
            stabilize_lsps(lsps[n], s->lsps);
1809
633k
        }
1810
1811
1.87M
        if ((res = synth_frame(ctx, gb, n,
1812
1.87M
                               &samples[n * MAX_FRAMESIZE],
1813
1.87M
                               lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1814
1.87M
                               &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1815
1.87M
                               &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1816
3.75k
            *got_frame_ptr = 0;
1817
3.75k
            return res;
1818
3.75k
        }
1819
1.87M
    }
1820
1821
    /* Statistics? FIXME - we don't check for length, a slight overrun
1822
     * will be caught by internal buffer padding, and anything else
1823
     * will be skipped, not read. */
1824
621k
    if (get_bits1(gb)) {
1825
452k
        res = get_bits(gb, 4);
1826
452k
        skip_bits(gb, 10 * (res + 1));
1827
452k
    }
1828
1829
621k
    if (get_bits_left(gb) < 0) {
1830
259k
        wmavoice_flush(ctx);
1831
259k
        return AVERROR_INVALIDDATA;
1832
259k
    }
1833
1834
361k
    *got_frame_ptr = 1;
1835
1836
    /* Update history */
1837
361k
    memcpy(s->prev_lsps,           lsps[2],
1838
361k
           s->lsps             * sizeof(*s->prev_lsps));
1839
361k
    memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1840
361k
           s->lsps             * sizeof(*synth));
1841
361k
    memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1842
361k
           s->history_nsamples * sizeof(*excitation));
1843
361k
    if (s->do_apf)
1844
4.87k
        memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
1845
4.87k
                s->history_nsamples * sizeof(*s->zero_exc_pf));
1846
1847
361k
    return 0;
1848
621k
}
1849
1850
/**
1851
 * Parse the packet header at the start of each packet (input data to this
1852
 * decoder).
1853
 *
1854
 * @param s WMA Voice decoding context private data
1855
 * @return <0 on error, nb_superframes on success.
1856
 */
1857
static int parse_packet_header(WMAVoiceContext *s)
1858
576k
{
1859
576k
    GetBitContext *gb = &s->gb;
1860
576k
    unsigned int res, n_superframes = 0;
1861
1862
576k
    skip_bits(gb, 4);          // packet sequence number
1863
576k
    s->has_residual_lsps = get_bits1(gb);
1864
595k
    do {
1865
595k
        if (get_bits_left(gb) < 6 + s->spillover_bitsize)
1866
231
            return AVERROR_INVALIDDATA;
1867
1868
595k
        res = get_bits(gb, 6); // number of superframes per packet
1869
                               // (minus first one if there is spillover)
1870
595k
        n_superframes += res;
1871
595k
    } while (res == 0x3F);
1872
576k
    s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1873
1874
576k
    return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1875
576k
}
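A standalone sketch of the header layout parsed above, using a minimal hypothetical bit reader (not FFmpeg's GetBitContext): 4 bits of sequence number, 1 bit for has_residual_lsps, 6-bit superframe counts accumulated while the field reads 0x3F, then spillover_bitsize bits of spillover. The byte values and the spillover width below are assumptions made for the demo.

#include <stdint.h>
#include <stdio.h>

typedef struct { const uint8_t *buf; int pos; } BitReader;

static unsigned read_bits(BitReader *br, int n)
{
    unsigned v = 0;
    while (n--) {
        v = (v << 1) | ((br->buf[br->pos >> 3] >> (7 - (br->pos & 7))) & 1);
        br->pos++;
    }
    return v;
}

int main(void)
{
    const uint8_t data[] = { 0x28, 0x30, 0x00 };  /* hypothetical packet start */
    BitReader br = { data, 0 };
    const int spillover_bitsize = 3;              /* depends on block_align; assumed */
    unsigned n_superframes = 0, res;

    printf("sequence number:   %u\n", read_bits(&br, 4));
    printf("has_residual_lsps: %u\n", read_bits(&br, 1));
    do {
        res = read_bits(&br, 6);  /* 0x3F means "more counts follow" */
        n_superframes += res;
    } while (res == 0x3F);
    printf("superframes:       %u\n", n_superframes);
    printf("spillover_nbits:   %u\n", read_bits(&br, spillover_bitsize));
    return 0;
}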
1876
1877
/**
1878
 * Copy (unaligned) bits from gb/data/size to pb.
1879
 *
1880
 * @param pb target buffer to copy bits into
1881
 * @param data source buffer to copy bits from
1882
 * @param size size of the source data, in bytes
1883
 * @param gb bit I/O context specifying the current position in the source
1884
 *           data. This function might use this to align the bit position to
1885
 *           a whole-byte boundary before calling #ff_copy_bits() on aligned
1886
 *           source data
1887
 * @param nbits the amount of bits to copy from source to target
1888
 *
1889
 * @note after calling this function, the current position in the input bit
1890
 *       I/O context is undefined.
1891
 */
1892
static void copy_bits(PutBitContext *pb,
1893
                      const uint8_t *data, int size,
1894
                      GetBitContext *gb, int nbits)
1895
1.33M
{
1896
1.33M
    int rmn_bytes, rmn_bits;
1897
1898
1.33M
    rmn_bits = rmn_bytes = get_bits_left(gb);
1899
1.33M
    if (rmn_bits < nbits)
1900
3.04k
        return;
1901
1.33M
    if (nbits > put_bits_left(pb))
1902
1.16k
        return;
1903
1.33M
    rmn_bits &= 7; rmn_bytes >>= 3;
1904
1.33M
    if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1905
917k
        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1906
1.33M
    ff_copy_bits(pb, data + size - rmn_bytes,
1907
1.33M
                 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1908
1.33M
}
1909
1910
/**
1911
 * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1912
 * and we expect that the demuxer / application provides it to us as such
1913
 * (else you'll probably get garbage as output). Every packet has a size of
1914
 * ctx->block_align bytes, starts with a packet header (see
1915
 * #parse_packet_header()), and then a series of superframes. Superframe
1916
 * boundaries may cross packet boundaries, i.e. superframes can split data over
1917
 * multiple (two) packets.
1918
 *
1919
 * For more information about frames, see #synth_superframe().
1920
 */
1921
static int wmavoice_decode_packet(AVCodecContext *ctx, AVFrame *frame,
1922
                                  int *got_frame_ptr, AVPacket *avpkt)
1923
1.34M
{
1924
1.34M
    WMAVoiceContext *s = ctx->priv_data;
1925
1.34M
    GetBitContext *gb = &s->gb;
1926
1.34M
    const uint8_t *buf = avpkt->data;
1927
1.34M
    uint8_t dummy[1];
1928
1.34M
    int size, res, pos;
1929
1930
    /* Packets are sometimes a multiple of ctx->block_align, with a packet
1931
     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1932
     * feeds us ASF packets, which may concatenate multiple "codec" packets
1933
     * in a single "muxer" packet, so we artificially emulate that by
1934
     * capping the packet size at ctx->block_align. */
1935
8.32G
    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1936
1.34M
    buf = size ? buf : dummy;
1937
1.34M
    res = init_get_bits8(&s->gb, buf, size);
1938
1.34M
    if (res < 0)
1939
0
        return res;
1940
1941
    /* size == ctx->block_align is used to indicate whether we are dealing with
1942
     * a new packet or a packet of which we already read the packet header
1943
     * previously. */
1944
1.34M
    if (!(size % ctx->block_align)) { // new packet header
1945
577k
        if (!size) {
1946
1.49k
            s->spillover_nbits = 0;
1947
1.49k
            s->nb_superframes = 0;
1948
576k
        } else {
1949
576k
            if ((res = parse_packet_header(s)) < 0)
1950
231
                return res;
1951
576k
            s->nb_superframes = res;
1952
576k
        }
1953
1954
        /* If the packet header specifies a s->spillover_nbits, then we want
1955
         * to push out all data of the previous packet (+ spillover) before
1956
         * continuing to parse new superframes in the current packet. */
1957
577k
        if (s->sframe_cache_size > 0) {
1958
567k
            int cnt = get_bits_count(gb);
1959
567k
            if (cnt + s->spillover_nbits > avpkt->size * 8) {
1960
493
                s->spillover_nbits = avpkt->size * 8 - cnt;
1961
493
            }
1962
567k
            copy_bits(&s->pb, buf, size, gb, s->spillover_nbits);
1963
567k
            flush_put_bits(&s->pb);
1964
567k
            s->sframe_cache_size += s->spillover_nbits;
1965
567k
            if ((res = synth_superframe(ctx, frame, got_frame_ptr)) == 0 &&
1966
567k
                *got_frame_ptr) {
1967
349k
                cnt += s->spillover_nbits;
1968
349k
                s->skip_bits_next = cnt & 7;
1969
349k
                res = cnt >> 3;
1970
349k
                return res;
1971
349k
            } else
1972
218k
                skip_bits_long (gb, s->spillover_nbits - cnt +
1973
218k
                                get_bits_count(gb)); // resync
1974
567k
        } else if (s->spillover_nbits) {
1975
4.77k
            skip_bits_long(gb, s->spillover_nbits);  // resync
1976
4.77k
        }
1977
763k
    } else if (s->skip_bits_next)
1978
358k
        skip_bits(gb, s->skip_bits_next);
1979
1980
    /* Try parsing superframes in current packet */
1981
992k
    s->sframe_cache_size = 0;
1982
992k
    s->skip_bits_next = 0;
1983
992k
    pos = get_bits_left(gb);
1984
992k
    if (s->nb_superframes-- == 0) {
1985
9.05k
        *got_frame_ptr = 0;
1986
9.05k
        return size;
1987
983k
    } else if (s->nb_superframes > 0) {
1988
209k
        if ((res = synth_superframe(ctx, frame, got_frame_ptr)) < 0) {
1989
196k
            return res;
1990
196k
        } else if (*got_frame_ptr) {
1991
12.7k
            int cnt = get_bits_count(gb);
1992
12.7k
            s->skip_bits_next = cnt & 7;
1993
12.7k
            res = cnt >> 3;
1994
12.7k
            return res;
1995
12.7k
        }
1996
773k
    } else if ((s->sframe_cache_size = pos) > 0) {
1997
        /* ... cache it for spillover in next packet */
1998
771k
        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1999
771k
        copy_bits(&s->pb, buf, size, gb, s->sframe_cache_size);
2000
        // FIXME bad - just copy whole bytes and use the
2001
        // skip_bits_next field
2002
771k
    }
2003
2004
773k
    return size;
2005
992k
}
2006
2007
static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
2008
1.53k
{
2009
1.53k
    WMAVoiceContext *s = ctx->priv_data;
2010
2011
1.53k
    if (s->do_apf) {
2012
732
        av_tx_uninit(&s->rdft);
2013
732
        av_tx_uninit(&s->irdft);
2014
732
        av_tx_uninit(&s->dct);
2015
732
        av_tx_uninit(&s->dst);
2016
732
    }
2017
2018
1.53k
    return 0;
2019
1.53k
}
2020
2021
const FFCodec ff_wmavoice_decoder = {
2022
    .p.name           = "wmavoice",
2023
    CODEC_LONG_NAME("Windows Media Audio Voice"),
2024
    .p.type           = AVMEDIA_TYPE_AUDIO,
2025
    .p.id             = AV_CODEC_ID_WMAVOICE,
2026
    .priv_data_size   = sizeof(WMAVoiceContext),
2027
    .init             = wmavoice_decode_init,
2028
    .close            = wmavoice_decode_end,
2029
    FF_CODEC_DECODE_CB(wmavoice_decode_packet),
2030
    .p.capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
2031
    .caps_internal    = FF_CODEC_CAP_INIT_CLEANUP,
2032
    .flush            = wmavoice_flush,
2033
};