Coverage Report

Created: 2026-04-12 08:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/opus/celt/mdct.c
Line
Count
Source
1
/* Copyright (c) 2007-2008 CSIRO
2
   Copyright (c) 2007-2008 Xiph.Org Foundation
3
   Written by Jean-Marc Valin */
4
/*
5
   Redistribution and use in source and binary forms, with or without
6
   modification, are permitted provided that the following conditions
7
   are met:
8
9
   - Redistributions of source code must retain the above copyright
10
   notice, this list of conditions and the following disclaimer.
11
12
   - Redistributions in binary form must reproduce the above copyright
13
   notice, this list of conditions and the following disclaimer in the
14
   documentation and/or other materials provided with the distribution.
15
16
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
*/
28
29
/* This is a simple MDCT implementation that uses a N/4 complex FFT
30
   to do most of the work. It should be relatively straightforward to
31
   plug in pretty much and FFT here.
32
33
   This replaces the Vorbis FFT (and uses the exact same API), which
34
   was a bit too messy and that was ending up duplicating code
35
   (might as well use the same FFT everywhere).
36
37
   The algorithm is similar to (and inspired from) Fabrice Bellard's
38
   MDCT implementation in FFMPEG, but has differences in signs, ordering
39
   and scaling in many places.
40
*/
41
42
#ifndef SKIP_CONFIG_H
43
#ifdef HAVE_CONFIG_H
44
#include "config.h"
45
#endif
46
#endif
47
48
#include "mdct.h"
49
#include "kiss_fft.h"
50
#include "_kiss_fft_guts.h"
51
#include <math.h>
52
#include "os_support.h"
53
#include "mathops.h"
54
#include "stack_alloc.h"
55
56
#if defined(FIXED_POINT) && defined(__mips) && __mips == 32
57
#include "mips/mdct_mipsr1.h"
58
#endif
59
60
#ifndef M_PI
61
#define M_PI 3.141592653
62
#endif
63
64
#ifdef CUSTOM_MODES
65
66
int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch)
67
{
68
   int i;
69
   kiss_twiddle_scalar *trig;
70
   int shift;
71
   int N2=N>>1;
72
   l->n = N;
73
   l->maxshift = maxshift;
74
   for (i=0;i<=maxshift;i++)
75
   {
76
      if (i==0)
77
         l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch);
78
      else
79
         l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch);
80
#ifndef ENABLE_TI_DSPLIB55
81
      if (l->kfft[i]==NULL)
82
         return 0;
83
#endif
84
   }
85
   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar));
86
   if (l->trig==NULL)
87
     return 0;
88
   for (shift=0;shift<=maxshift;shift++)
89
   {
90
      /* We have enough points that sine isn't necessary */
91
#if defined(FIXED_POINT)
92
#ifndef ENABLE_QEXT
93
      for (i=0;i<N2;i++)
94
         trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
95
#else
96
      for (i=0;i<N2;i++)
97
         trig[i] = (kiss_twiddle_scalar)MAX32(-2147483647,MIN32(2147483647,floor(.5+2147483648*cos(2*M_PI*(i+.125)/N))));
98
#endif
99
#else
100
      for (i=0;i<N2;i++)
101
         trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
102
#endif
103
      trig += N2;
104
      N2 >>= 1;
105
      N >>= 1;
106
   }
107
   return 1;
108
}
109
110
void clt_mdct_clear(mdct_lookup *l, int arch)
111
{
112
   int i;
113
   for (i=0;i<=l->maxshift;i++)
114
      opus_fft_free(l->kfft[i], arch);
115
   opus_free((kiss_twiddle_scalar*)l->trig);
116
}
117
118
#endif /* CUSTOM_MODES */
119
120
/* Forward MDCT trashes the input array */
121
#ifndef OVERRIDE_clt_mdct_forward
122
void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
123
      const celt_coef *window, int overlap, int shift, int stride, int arch)
124
336M
{
125
336M
   int i;
126
336M
   int N, N2, N4;
127
336M
   VARDECL(kiss_fft_scalar, f);
128
336M
   VARDECL(kiss_fft_cpx, f2);
129
336M
   const kiss_fft_state *st = l->kfft[shift];
130
336M
   const kiss_twiddle_scalar *trig;
131
336M
   celt_coef scale;
132
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
   int scale_shift = st->scale_shift-1;
136
   int headroom;
137
#endif
138
336M
   SAVE_STACK;
139
336M
   (void)arch;
140
336M
   scale = st->scale;
141
142
336M
   N = l->n;
143
336M
   trig = l->trig;
144
1.05G
   for (i=0;i<shift;i++)
145
722M
   {
146
722M
      N >>= 1;
147
722M
      trig += N;
148
722M
   }
149
336M
   N2 = N>>1;
150
336M
   N4 = N>>2;
151
152
336M
   ALLOC(f, N2, kiss_fft_scalar);
153
336M
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
336M
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
336M
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
336M
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
336M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
336M
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
336M
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
10.5G
      for(i=0;i<((overlap+3)>>2);i++)
165
10.2G
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
10.2G
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
10.2G
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
10.2G
         xp1+=2;
170
10.2G
         xp2-=2;
171
10.2G
         wp1+=2;
172
10.2G
         wp2-=2;
173
10.2G
      }
174
336M
      wp1 = window;
175
336M
      wp2 = window+overlap-1;
176
34.4G
      for(;i<N4-((overlap+3)>>2);i++)
177
34.1G
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
34.1G
         *yp++ = *xp2;
180
34.1G
         *yp++ = *xp1;
181
34.1G
         xp1+=2;
182
34.1G
         xp2-=2;
183
34.1G
      }
184
10.5G
      for(;i<N4;i++)
185
10.2G
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
10.2G
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
10.2G
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
10.2G
         xp1+=2;
190
10.2G
         xp2-=2;
191
10.2G
         wp1+=2;
192
10.2G
         wp2-=2;
193
10.2G
      }
194
336M
   }
195
   /* Pre-rotation */
196
336M
   {
197
336M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
336M
      const kiss_twiddle_scalar *t = &trig[0];
199
#ifdef FIXED_POINT
200
      opus_val32 maxval=1;
201
#endif
202
54.9G
      for(i=0;i<N4;i++)
203
54.6G
      {
204
54.6G
         kiss_fft_cpx yc;
205
54.6G
         kiss_twiddle_scalar t0, t1;
206
54.6G
         kiss_fft_scalar re, im, yr, yi;
207
54.6G
         t0 = t[i];
208
54.6G
         t1 = t[N4+i];
209
54.6G
         re = *yp++;
210
54.6G
         im = *yp++;
211
54.6G
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
54.6G
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
#ifdef ENABLE_QEXT
216
         yc.r = yr;
217
         yc.i = yi;
218
#else
219
27.3G
         yc.r = S_MUL2(yr, scale);
220
27.3G
         yc.i = S_MUL2(yi, scale);
221
#endif
222
#ifdef FIXED_POINT
223
54.4G
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
#endif
225
54.6G
         f2[st->bitrev[i]] = yc;
226
54.6G
      }
227
#ifdef FIXED_POINT
228
334M
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
#endif
230
336M
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
336M
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
336M
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
336M
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
336M
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
336M
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
336M
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
54.9G
      for(i=0;i<N4;i++)
244
54.6G
      {
245
54.6G
         kiss_fft_scalar yr, yi;
246
54.6G
         kiss_fft_scalar t0, t1;
247
#ifdef ENABLE_QEXT
248
27.3G
         t0 = S_MUL2(t[i], scale);
249
27.3G
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
         t0 = t[i];
252
         t1 = t[N4+i];
253
#endif
254
54.6G
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
54.6G
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
54.6G
         *yp1 = yr;
257
54.6G
         *yp2 = yi;
258
54.6G
         fp++;
259
54.6G
         yp1 += 2*stride;
260
54.6G
         yp2 -= 2*stride;
261
54.6G
      }
262
336M
   }
263
336M
   RESTORE_STACK;
264
336M
}
clt_mdct_forward_c
Line
Count
Source
124
167M
{
125
167M
   int i;
126
167M
   int N, N2, N4;
127
167M
   VARDECL(kiss_fft_scalar, f);
128
167M
   VARDECL(kiss_fft_cpx, f2);
129
167M
   const kiss_fft_state *st = l->kfft[shift];
130
167M
   const kiss_twiddle_scalar *trig;
131
167M
   celt_coef scale;
132
167M
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
167M
   int scale_shift = st->scale_shift-1;
136
167M
   int headroom;
137
167M
#endif
138
167M
   SAVE_STACK;
139
167M
   (void)arch;
140
167M
   scale = st->scale;
141
142
167M
   N = l->n;
143
167M
   trig = l->trig;
144
527M
   for (i=0;i<shift;i++)
145
359M
   {
146
359M
      N >>= 1;
147
359M
      trig += N;
148
359M
   }
149
167M
   N2 = N>>1;
150
167M
   N4 = N>>2;
151
152
167M
   ALLOC(f, N2, kiss_fft_scalar);
153
167M
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
167M
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
167M
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
167M
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
167M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
167M
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
167M
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
5.26G
      for(i=0;i<((overlap+3)>>2);i++)
165
5.10G
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
5.10G
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
5.10G
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
5.10G
         xp1+=2;
170
5.10G
         xp2-=2;
171
5.10G
         wp1+=2;
172
5.10G
         wp2-=2;
173
5.10G
      }
174
167M
      wp1 = window;
175
167M
      wp2 = window+overlap-1;
176
17.1G
      for(;i<N4-((overlap+3)>>2);i++)
177
17.0G
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
17.0G
         *yp++ = *xp2;
180
17.0G
         *yp++ = *xp1;
181
17.0G
         xp1+=2;
182
17.0G
         xp2-=2;
183
17.0G
      }
184
5.26G
      for(;i<N4;i++)
185
5.10G
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
5.10G
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
5.10G
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
5.10G
         xp1+=2;
190
5.10G
         xp2-=2;
191
5.10G
         wp1+=2;
192
5.10G
         wp2-=2;
193
5.10G
      }
194
167M
   }
195
   /* Pre-rotation */
196
167M
   {
197
167M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
167M
      const kiss_twiddle_scalar *t = &trig[0];
199
167M
#ifdef FIXED_POINT
200
167M
      opus_val32 maxval=1;
201
167M
#endif
202
27.3G
      for(i=0;i<N4;i++)
203
27.2G
      {
204
27.2G
         kiss_fft_cpx yc;
205
27.2G
         kiss_twiddle_scalar t0, t1;
206
27.2G
         kiss_fft_scalar re, im, yr, yi;
207
27.2G
         t0 = t[i];
208
27.2G
         t1 = t[N4+i];
209
27.2G
         re = *yp++;
210
27.2G
         im = *yp++;
211
27.2G
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
27.2G
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
#ifdef ENABLE_QEXT
216
         yc.r = yr;
217
         yc.i = yi;
218
#else
219
27.2G
         yc.r = S_MUL2(yr, scale);
220
27.2G
         yc.i = S_MUL2(yi, scale);
221
27.2G
#endif
222
27.2G
#ifdef FIXED_POINT
223
27.2G
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
27.2G
#endif
225
27.2G
         f2[st->bitrev[i]] = yc;
226
27.2G
      }
227
167M
#ifdef FIXED_POINT
228
167M
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
167M
#endif
230
167M
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
167M
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
167M
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
167M
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
167M
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
167M
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
167M
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
27.3G
      for(i=0;i<N4;i++)
244
27.2G
      {
245
27.2G
         kiss_fft_scalar yr, yi;
246
27.2G
         kiss_fft_scalar t0, t1;
247
#ifdef ENABLE_QEXT
248
         t0 = S_MUL2(t[i], scale);
249
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
27.2G
         t0 = t[i];
252
27.2G
         t1 = t[N4+i];
253
27.2G
#endif
254
27.2G
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
27.2G
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
27.2G
         *yp1 = yr;
257
27.2G
         *yp2 = yi;
258
27.2G
         fp++;
259
27.2G
         yp1 += 2*stride;
260
27.2G
         yp2 -= 2*stride;
261
27.2G
      }
262
167M
   }
263
167M
   RESTORE_STACK;
264
167M
}
clt_mdct_forward_c
Line
Count
Source
124
167M
{
125
167M
   int i;
126
167M
   int N, N2, N4;
127
167M
   VARDECL(kiss_fft_scalar, f);
128
167M
   VARDECL(kiss_fft_cpx, f2);
129
167M
   const kiss_fft_state *st = l->kfft[shift];
130
167M
   const kiss_twiddle_scalar *trig;
131
167M
   celt_coef scale;
132
167M
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
167M
   int scale_shift = st->scale_shift-1;
136
167M
   int headroom;
137
167M
#endif
138
167M
   SAVE_STACK;
139
167M
   (void)arch;
140
167M
   scale = st->scale;
141
142
167M
   N = l->n;
143
167M
   trig = l->trig;
144
527M
   for (i=0;i<shift;i++)
145
359M
   {
146
359M
      N >>= 1;
147
359M
      trig += N;
148
359M
   }
149
167M
   N2 = N>>1;
150
167M
   N4 = N>>2;
151
152
167M
   ALLOC(f, N2, kiss_fft_scalar);
153
167M
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
167M
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
167M
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
167M
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
167M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
167M
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
167M
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
5.26G
      for(i=0;i<((overlap+3)>>2);i++)
165
5.10G
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
5.10G
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
5.10G
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
5.10G
         xp1+=2;
170
5.10G
         xp2-=2;
171
5.10G
         wp1+=2;
172
5.10G
         wp2-=2;
173
5.10G
      }
174
167M
      wp1 = window;
175
167M
      wp2 = window+overlap-1;
176
17.1G
      for(;i<N4-((overlap+3)>>2);i++)
177
17.0G
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
17.0G
         *yp++ = *xp2;
180
17.0G
         *yp++ = *xp1;
181
17.0G
         xp1+=2;
182
17.0G
         xp2-=2;
183
17.0G
      }
184
5.26G
      for(;i<N4;i++)
185
5.10G
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
5.10G
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
5.10G
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
5.10G
         xp1+=2;
190
5.10G
         xp2-=2;
191
5.10G
         wp1+=2;
192
5.10G
         wp2-=2;
193
5.10G
      }
194
167M
   }
195
   /* Pre-rotation */
196
167M
   {
197
167M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
167M
      const kiss_twiddle_scalar *t = &trig[0];
199
167M
#ifdef FIXED_POINT
200
167M
      opus_val32 maxval=1;
201
167M
#endif
202
27.3G
      for(i=0;i<N4;i++)
203
27.2G
      {
204
27.2G
         kiss_fft_cpx yc;
205
27.2G
         kiss_twiddle_scalar t0, t1;
206
27.2G
         kiss_fft_scalar re, im, yr, yi;
207
27.2G
         t0 = t[i];
208
27.2G
         t1 = t[N4+i];
209
27.2G
         re = *yp++;
210
27.2G
         im = *yp++;
211
27.2G
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
27.2G
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
27.2G
#ifdef ENABLE_QEXT
216
27.2G
         yc.r = yr;
217
27.2G
         yc.i = yi;
218
#else
219
         yc.r = S_MUL2(yr, scale);
220
         yc.i = S_MUL2(yi, scale);
221
#endif
222
27.2G
#ifdef FIXED_POINT
223
27.2G
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
27.2G
#endif
225
27.2G
         f2[st->bitrev[i]] = yc;
226
27.2G
      }
227
167M
#ifdef FIXED_POINT
228
167M
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
167M
#endif
230
167M
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
167M
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
167M
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
167M
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
167M
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
167M
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
167M
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
27.3G
      for(i=0;i<N4;i++)
244
27.2G
      {
245
27.2G
         kiss_fft_scalar yr, yi;
246
27.2G
         kiss_fft_scalar t0, t1;
247
27.2G
#ifdef ENABLE_QEXT
248
27.2G
         t0 = S_MUL2(t[i], scale);
249
27.2G
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
         t0 = t[i];
252
         t1 = t[N4+i];
253
#endif
254
27.2G
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
27.2G
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
27.2G
         *yp1 = yr;
257
27.2G
         *yp2 = yi;
258
27.2G
         fp++;
259
27.2G
         yp1 += 2*stride;
260
27.2G
         yp2 -= 2*stride;
261
27.2G
      }
262
167M
   }
263
167M
   RESTORE_STACK;
264
167M
}
clt_mdct_forward_c
Line
Count
Source
124
736k
{
125
736k
   int i;
126
736k
   int N, N2, N4;
127
736k
   VARDECL(kiss_fft_scalar, f);
128
736k
   VARDECL(kiss_fft_cpx, f2);
129
736k
   const kiss_fft_state *st = l->kfft[shift];
130
736k
   const kiss_twiddle_scalar *trig;
131
736k
   celt_coef scale;
132
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
   int scale_shift = st->scale_shift-1;
136
   int headroom;
137
#endif
138
736k
   SAVE_STACK;
139
736k
   (void)arch;
140
736k
   scale = st->scale;
141
142
736k
   N = l->n;
143
736k
   trig = l->trig;
144
2.59M
   for (i=0;i<shift;i++)
145
1.86M
   {
146
1.86M
      N >>= 1;
147
1.86M
      trig += N;
148
1.86M
   }
149
736k
   N2 = N>>1;
150
736k
   N4 = N>>2;
151
152
736k
   ALLOC(f, N2, kiss_fft_scalar);
153
736k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
736k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
736k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
736k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
736k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
736k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
736k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
23.1M
      for(i=0;i<((overlap+3)>>2);i++)
165
22.4M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
22.4M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
22.4M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
22.4M
         xp1+=2;
170
22.4M
         xp2-=2;
171
22.4M
         wp1+=2;
172
22.4M
         wp2-=2;
173
22.4M
      }
174
736k
      wp1 = window;
175
736k
      wp2 = window+overlap-1;
176
47.7M
      for(;i<N4-((overlap+3)>>2);i++)
177
46.9M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
46.9M
         *yp++ = *xp2;
180
46.9M
         *yp++ = *xp1;
181
46.9M
         xp1+=2;
182
46.9M
         xp2-=2;
183
46.9M
      }
184
23.1M
      for(;i<N4;i++)
185
22.4M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
22.4M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
22.4M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
22.4M
         xp1+=2;
190
22.4M
         xp2-=2;
191
22.4M
         wp1+=2;
192
22.4M
         wp2-=2;
193
22.4M
      }
194
736k
   }
195
   /* Pre-rotation */
196
736k
   {
197
736k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
736k
      const kiss_twiddle_scalar *t = &trig[0];
199
#ifdef FIXED_POINT
200
      opus_val32 maxval=1;
201
#endif
202
92.5M
      for(i=0;i<N4;i++)
203
91.8M
      {
204
91.8M
         kiss_fft_cpx yc;
205
91.8M
         kiss_twiddle_scalar t0, t1;
206
91.8M
         kiss_fft_scalar re, im, yr, yi;
207
91.8M
         t0 = t[i];
208
91.8M
         t1 = t[N4+i];
209
91.8M
         re = *yp++;
210
91.8M
         im = *yp++;
211
91.8M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
91.8M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
91.8M
#ifdef ENABLE_QEXT
216
91.8M
         yc.r = yr;
217
91.8M
         yc.i = yi;
218
#else
219
         yc.r = S_MUL2(yr, scale);
220
         yc.i = S_MUL2(yi, scale);
221
#endif
222
#ifdef FIXED_POINT
223
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
#endif
225
91.8M
         f2[st->bitrev[i]] = yc;
226
91.8M
      }
227
#ifdef FIXED_POINT
228
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
#endif
230
736k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
736k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
736k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
736k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
736k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
736k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
736k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
92.5M
      for(i=0;i<N4;i++)
244
91.8M
      {
245
91.8M
         kiss_fft_scalar yr, yi;
246
91.8M
         kiss_fft_scalar t0, t1;
247
91.8M
#ifdef ENABLE_QEXT
248
91.8M
         t0 = S_MUL2(t[i], scale);
249
91.8M
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
         t0 = t[i];
252
         t1 = t[N4+i];
253
#endif
254
91.8M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
91.8M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
91.8M
         *yp1 = yr;
257
91.8M
         *yp2 = yi;
258
91.8M
         fp++;
259
91.8M
         yp1 += 2*stride;
260
91.8M
         yp2 -= 2*stride;
261
91.8M
      }
262
736k
   }
263
736k
   RESTORE_STACK;
264
736k
}
clt_mdct_forward_c
Line
Count
Source
124
736k
{
125
736k
   int i;
126
736k
   int N, N2, N4;
127
736k
   VARDECL(kiss_fft_scalar, f);
128
736k
   VARDECL(kiss_fft_cpx, f2);
129
736k
   const kiss_fft_state *st = l->kfft[shift];
130
736k
   const kiss_twiddle_scalar *trig;
131
736k
   celt_coef scale;
132
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
   int scale_shift = st->scale_shift-1;
136
   int headroom;
137
#endif
138
736k
   SAVE_STACK;
139
736k
   (void)arch;
140
736k
   scale = st->scale;
141
142
736k
   N = l->n;
143
736k
   trig = l->trig;
144
2.59M
   for (i=0;i<shift;i++)
145
1.86M
   {
146
1.86M
      N >>= 1;
147
1.86M
      trig += N;
148
1.86M
   }
149
736k
   N2 = N>>1;
150
736k
   N4 = N>>2;
151
152
736k
   ALLOC(f, N2, kiss_fft_scalar);
153
736k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
736k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
736k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
736k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
736k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
736k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
736k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
23.1M
      for(i=0;i<((overlap+3)>>2);i++)
165
22.4M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
22.4M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
22.4M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
22.4M
         xp1+=2;
170
22.4M
         xp2-=2;
171
22.4M
         wp1+=2;
172
22.4M
         wp2-=2;
173
22.4M
      }
174
736k
      wp1 = window;
175
736k
      wp2 = window+overlap-1;
176
47.7M
      for(;i<N4-((overlap+3)>>2);i++)
177
46.9M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
46.9M
         *yp++ = *xp2;
180
46.9M
         *yp++ = *xp1;
181
46.9M
         xp1+=2;
182
46.9M
         xp2-=2;
183
46.9M
      }
184
23.1M
      for(;i<N4;i++)
185
22.4M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
22.4M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
22.4M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
22.4M
         xp1+=2;
190
22.4M
         xp2-=2;
191
22.4M
         wp1+=2;
192
22.4M
         wp2-=2;
193
22.4M
      }
194
736k
   }
195
   /* Pre-rotation */
196
736k
   {
197
736k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
736k
      const kiss_twiddle_scalar *t = &trig[0];
199
#ifdef FIXED_POINT
200
      opus_val32 maxval=1;
201
#endif
202
92.5M
      for(i=0;i<N4;i++)
203
91.8M
      {
204
91.8M
         kiss_fft_cpx yc;
205
91.8M
         kiss_twiddle_scalar t0, t1;
206
91.8M
         kiss_fft_scalar re, im, yr, yi;
207
91.8M
         t0 = t[i];
208
91.8M
         t1 = t[N4+i];
209
91.8M
         re = *yp++;
210
91.8M
         im = *yp++;
211
91.8M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
91.8M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
#ifdef ENABLE_QEXT
216
         yc.r = yr;
217
         yc.i = yi;
218
#else
219
91.8M
         yc.r = S_MUL2(yr, scale);
220
91.8M
         yc.i = S_MUL2(yi, scale);
221
91.8M
#endif
222
#ifdef FIXED_POINT
223
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
#endif
225
91.8M
         f2[st->bitrev[i]] = yc;
226
91.8M
      }
227
#ifdef FIXED_POINT
228
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
#endif
230
736k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
736k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
736k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
736k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
736k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
736k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
736k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
92.5M
      for(i=0;i<N4;i++)
244
91.8M
      {
245
91.8M
         kiss_fft_scalar yr, yi;
246
91.8M
         kiss_fft_scalar t0, t1;
247
#ifdef ENABLE_QEXT
248
         t0 = S_MUL2(t[i], scale);
249
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
91.8M
         t0 = t[i];
252
91.8M
         t1 = t[N4+i];
253
91.8M
#endif
254
91.8M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
91.8M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
91.8M
         *yp1 = yr;
257
91.8M
         *yp2 = yi;
258
91.8M
         fp++;
259
91.8M
         yp1 += 2*stride;
260
91.8M
         yp2 -= 2*stride;
261
91.8M
      }
262
736k
   }
263
736k
   RESTORE_STACK;
264
736k
}
265
#endif /* OVERRIDE_clt_mdct_forward */
266
267
#ifndef OVERRIDE_clt_mdct_backward
268
void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
269
      const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
270
645k
{
271
645k
   int i;
272
645k
   int N, N2, N4;
273
645k
   const kiss_twiddle_scalar *trig;
274
#ifdef FIXED_POINT
275
   int pre_shift, post_shift, fft_shift;
276
#endif
277
645k
   (void) arch;
278
279
645k
   N = l->n;
280
645k
   trig = l->trig;
281
2.12M
   for (i=0;i<shift;i++)
282
1.47M
   {
283
1.47M
      N >>= 1;
284
1.47M
      trig += N;
285
1.47M
   }
286
645k
   N2 = N>>1;
287
645k
   N4 = N>>2;
288
289
#ifdef FIXED_POINT
290
   {
291
      opus_val32 sumval=N2;
292
      opus_val32 maxval=0;
293
92.6M
      for (i=0;i<N2;i++) {
294
92.3M
         maxval = MAX32(maxval, ABS32(in[i*stride]));
295
92.3M
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
296
92.3M
      }
297
336k
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
298
      /* Worst-case where all the energy goes to a single sample. */
299
336k
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
300
336k
      post_shift = IMIN(post_shift, pre_shift);
301
      fft_shift = pre_shift - post_shift;
302
   }
303
#endif
304
   /* Pre-rotate */
305
645k
   {
306
      /* Temp pointers to make it really clear to the compiler what we're doing */
307
645k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
308
645k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
309
645k
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
310
645k
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
311
645k
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
312
93.0M
      for(i=0;i<N4;i++)
313
92.4M
      {
314
92.4M
         int rev;
315
92.4M
         kiss_fft_scalar yr, yi;
316
92.4M
         opus_val32 x1, x2;
317
92.4M
         rev = *bitrev++;
318
92.4M
         x1 = SHL32_ovflw(*xp1, pre_shift);
319
92.4M
         x2 = SHL32_ovflw(*xp2, pre_shift);
320
92.4M
         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
321
92.4M
         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
322
         /* We swap real and imag because we use an FFT instead of an IFFT. */
323
92.4M
         yp[2*rev+1] = yr;
324
92.4M
         yp[2*rev] = yi;
325
         /* Storing the pre-rotation directly in the bitrev order. */
326
92.4M
         xp1+=2*stride;
327
92.4M
         xp2-=2*stride;
328
92.4M
      }
329
645k
   }
330
331
645k
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
332
333
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
334
      it in-place. */
335
645k
   {
336
645k
      kiss_fft_scalar * yp0 = out+(overlap>>1);
337
645k
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
338
645k
      const kiss_twiddle_scalar *t = &trig[0];
339
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
340
         middle pair will be computed twice. */
341
46.8M
      for(i=0;i<(N4+1)>>1;i++)
342
46.2M
      {
343
46.2M
         kiss_fft_scalar re, im, yr, yi;
344
46.2M
         kiss_twiddle_scalar t0, t1;
345
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
346
46.2M
         re = yp0[1];
347
46.2M
         im = yp0[0];
348
46.2M
         t0 = t[i];
349
46.2M
         t1 = t[N4+i];
350
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
351
46.2M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
352
46.2M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
353
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
354
46.2M
         re = yp1[1];
355
46.2M
         im = yp1[0];
356
46.2M
         yp0[0] = yr;
357
46.2M
         yp1[1] = yi;
358
359
46.2M
         t0 = t[(N4-i-1)];
360
46.2M
         t1 = t[(N2-i-1)];
361
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
362
46.2M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
363
46.2M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
364
46.2M
         yp1[0] = yr;
365
46.2M
         yp0[1] = yi;
366
46.2M
         yp0 += 2;
367
46.2M
         yp1 -= 2;
368
46.2M
      }
369
645k
   }
370
371
   /* Mirror on both sides for TDAC */
372
645k
   {
373
645k
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
374
645k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
375
645k
      const celt_coef * OPUS_RESTRICT wp1 = window;
376
645k
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
377
378
44.1M
      for(i = 0; i < overlap/2; i++)
379
43.4M
      {
380
43.4M
         kiss_fft_scalar x1, x2;
381
43.4M
         x1 = *xp1;
382
43.4M
         x2 = *yp1;
383
43.4M
         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
384
43.4M
         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
385
43.4M
         wp1++;
386
43.4M
         wp2--;
387
43.4M
      }
388
645k
   }
389
645k
}
clt_mdct_backward_c
Line
Count
Source
270
336k
{
271
336k
   int i;
272
336k
   int N, N2, N4;
273
336k
   const kiss_twiddle_scalar *trig;
274
336k
#ifdef FIXED_POINT
275
336k
   int pre_shift, post_shift, fft_shift;
276
336k
#endif
277
336k
   (void) arch;
278
279
336k
   N = l->n;
280
336k
   trig = l->trig;
281
1.12M
   for (i=0;i<shift;i++)
282
787k
   {
283
787k
      N >>= 1;
284
787k
      trig += N;
285
787k
   }
286
336k
   N2 = N>>1;
287
336k
   N4 = N>>2;
288
289
336k
#ifdef FIXED_POINT
290
336k
   {
291
336k
      opus_val32 sumval=N2;
292
336k
      opus_val32 maxval=0;
293
92.6M
      for (i=0;i<N2;i++) {
294
92.3M
         maxval = MAX32(maxval, ABS32(in[i*stride]));
295
92.3M
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
296
92.3M
      }
297
336k
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
298
      /* Worst-case where all the energy goes to a single sample. */
299
336k
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
300
336k
      post_shift = IMIN(post_shift, pre_shift);
301
336k
      fft_shift = pre_shift - post_shift;
302
336k
   }
303
336k
#endif
304
   /* Pre-rotate */
305
336k
   {
306
      /* Temp pointers to make it really clear to the compiler what we're doing */
307
336k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
308
336k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
309
336k
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
310
336k
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
311
336k
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
312
46.4M
      for(i=0;i<N4;i++)
313
46.1M
      {
314
46.1M
         int rev;
315
46.1M
         kiss_fft_scalar yr, yi;
316
46.1M
         opus_val32 x1, x2;
317
46.1M
         rev = *bitrev++;
318
46.1M
         x1 = SHL32_ovflw(*xp1, pre_shift);
319
46.1M
         x2 = SHL32_ovflw(*xp2, pre_shift);
320
46.1M
         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
321
46.1M
         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
322
         /* We swap real and imag because we use an FFT instead of an IFFT. */
323
46.1M
         yp[2*rev+1] = yr;
324
46.1M
         yp[2*rev] = yi;
325
         /* Storing the pre-rotation directly in the bitrev order. */
326
46.1M
         xp1+=2*stride;
327
46.1M
         xp2-=2*stride;
328
46.1M
      }
329
336k
   }
330
331
336k
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
332
333
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
334
      it in-place. */
335
336k
   {
336
336k
      kiss_fft_scalar * yp0 = out+(overlap>>1);
337
336k
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
338
336k
      const kiss_twiddle_scalar *t = &trig[0];
339
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
340
         middle pair will be computed twice. */
341
23.4M
      for(i=0;i<(N4+1)>>1;i++)
342
23.0M
      {
343
23.0M
         kiss_fft_scalar re, im, yr, yi;
344
23.0M
         kiss_twiddle_scalar t0, t1;
345
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
346
23.0M
         re = yp0[1];
347
23.0M
         im = yp0[0];
348
23.0M
         t0 = t[i];
349
23.0M
         t1 = t[N4+i];
350
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
351
23.0M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
352
23.0M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
353
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
354
23.0M
         re = yp1[1];
355
23.0M
         im = yp1[0];
356
23.0M
         yp0[0] = yr;
357
23.0M
         yp1[1] = yi;
358
359
23.0M
         t0 = t[(N4-i-1)];
360
23.0M
         t1 = t[(N2-i-1)];
361
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
362
23.0M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
363
23.0M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
364
23.0M
         yp1[0] = yr;
365
23.0M
         yp0[1] = yi;
366
23.0M
         yp0 += 2;
367
23.0M
         yp1 -= 2;
368
23.0M
      }
369
336k
   }
370
371
   /* Mirror on both sides for TDAC */
372
336k
   {
373
336k
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
374
336k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
375
336k
      const celt_coef * OPUS_RESTRICT wp1 = window;
376
336k
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
377
378
23.3M
      for(i = 0; i < overlap/2; i++)
379
23.0M
      {
380
23.0M
         kiss_fft_scalar x1, x2;
381
23.0M
         x1 = *xp1;
382
23.0M
         x2 = *yp1;
383
23.0M
         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
384
23.0M
         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
385
23.0M
         wp1++;
386
23.0M
         wp2--;
387
23.0M
      }
388
336k
   }
389
336k
}
clt_mdct_backward_c
Line
Count
Source
270
309k
{
271
309k
   int i;
272
309k
   int N, N2, N4;
273
309k
   const kiss_twiddle_scalar *trig;
274
#ifdef FIXED_POINT
275
   int pre_shift, post_shift, fft_shift;
276
#endif
277
309k
   (void) arch;
278
279
309k
   N = l->n;
280
309k
   trig = l->trig;
281
996k
   for (i=0;i<shift;i++)
282
687k
   {
283
687k
      N >>= 1;
284
687k
      trig += N;
285
687k
   }
286
309k
   N2 = N>>1;
287
309k
   N4 = N>>2;
288
289
#ifdef FIXED_POINT
290
   {
291
      opus_val32 sumval=N2;
292
      opus_val32 maxval=0;
293
      for (i=0;i<N2;i++) {
294
         maxval = MAX32(maxval, ABS32(in[i*stride]));
295
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
296
      }
297
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
298
      /* Worst-case where all the energy goes to a single sample. */
299
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
300
      post_shift = IMIN(post_shift, pre_shift);
301
      fft_shift = pre_shift - post_shift;
302
   }
303
#endif
304
   /* Pre-rotate */
305
309k
   {
306
      /* Temp pointers to make it really clear to the compiler what we're doing */
307
309k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
308
309k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
309
309k
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
310
309k
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
311
309k
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
312
46.5M
      for(i=0;i<N4;i++)
313
46.2M
      {
314
46.2M
         int rev;
315
46.2M
         kiss_fft_scalar yr, yi;
316
46.2M
         opus_val32 x1, x2;
317
46.2M
         rev = *bitrev++;
318
46.2M
         x1 = SHL32_ovflw(*xp1, pre_shift);
319
46.2M
         x2 = SHL32_ovflw(*xp2, pre_shift);
320
46.2M
         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
321
46.2M
         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
322
         /* We swap real and imag because we use an FFT instead of an IFFT. */
323
46.2M
         yp[2*rev+1] = yr;
324
46.2M
         yp[2*rev] = yi;
325
         /* Storing the pre-rotation directly in the bitrev order. */
326
46.2M
         xp1+=2*stride;
327
46.2M
         xp2-=2*stride;
328
46.2M
      }
329
309k
   }
330
331
309k
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
332
333
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
334
      it in-place. */
335
309k
   {
336
309k
      kiss_fft_scalar * yp0 = out+(overlap>>1);
337
309k
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
338
309k
      const kiss_twiddle_scalar *t = &trig[0];
339
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
340
         middle pair will be computed twice. */
341
23.4M
      for(i=0;i<(N4+1)>>1;i++)
342
23.1M
      {
343
23.1M
         kiss_fft_scalar re, im, yr, yi;
344
23.1M
         kiss_twiddle_scalar t0, t1;
345
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
346
23.1M
         re = yp0[1];
347
23.1M
         im = yp0[0];
348
23.1M
         t0 = t[i];
349
23.1M
         t1 = t[N4+i];
350
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
351
23.1M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
352
23.1M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
353
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
354
23.1M
         re = yp1[1];
355
23.1M
         im = yp1[0];
356
23.1M
         yp0[0] = yr;
357
23.1M
         yp1[1] = yi;
358
359
23.1M
         t0 = t[(N4-i-1)];
360
23.1M
         t1 = t[(N2-i-1)];
361
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
362
23.1M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
363
23.1M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
364
23.1M
         yp1[0] = yr;
365
23.1M
         yp0[1] = yi;
366
23.1M
         yp0 += 2;
367
23.1M
         yp1 -= 2;
368
23.1M
      }
369
309k
   }
370
371
   /* Mirror on both sides for TDAC */
372
309k
   {
373
309k
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
374
309k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
375
309k
      const celt_coef * OPUS_RESTRICT wp1 = window;
376
309k
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
377
378
20.7M
      for(i = 0; i < overlap/2; i++)
379
20.4M
      {
380
20.4M
         kiss_fft_scalar x1, x2;
381
20.4M
         x1 = *xp1;
382
20.4M
         x2 = *yp1;
383
20.4M
         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
384
20.4M
         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
385
20.4M
         wp1++;
386
20.4M
         wp2--;
387
20.4M
      }
388
309k
   }
389
309k
}
390
#endif /* OVERRIDE_clt_mdct_backward */