Coverage Report

Created: 2026-06-07 08:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/opus/celt/mdct.c
Line
Count
Source
1
/* Copyright (c) 2007-2008 CSIRO
2
   Copyright (c) 2007-2008 Xiph.Org Foundation
3
   Written by Jean-Marc Valin */
4
/*
5
   Redistribution and use in source and binary forms, with or without
6
   modification, are permitted provided that the following conditions
7
   are met:
8
9
   - Redistributions of source code must retain the above copyright
10
   notice, this list of conditions and the following disclaimer.
11
12
   - Redistributions in binary form must reproduce the above copyright
13
   notice, this list of conditions and the following disclaimer in the
14
   documentation and/or other materials provided with the distribution.
15
16
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
*/
28
29
/* This is a simple MDCT implementation that uses a N/4 complex FFT
30
   to do most of the work. It should be relatively straightforward to
31
   plug in pretty much and FFT here.
32
33
   This replaces the Vorbis FFT (and uses the exact same API), which
34
   was a bit too messy and that was ending up duplicating code
35
   (might as well use the same FFT everywhere).
36
37
   The algorithm is similar to (and inspired from) Fabrice Bellard's
38
   MDCT implementation in FFMPEG, but has differences in signs, ordering
39
   and scaling in many places.
40
*/
41
42
#ifndef SKIP_CONFIG_H
43
#ifdef HAVE_CONFIG_H
44
#include "config.h"
45
#endif
46
#endif
47
48
#include "mdct.h"
49
#include "kiss_fft.h"
50
#include "_kiss_fft_guts.h"
51
#include <math.h>
52
#include "os_support.h"
53
#include "mathops.h"
54
#include "stack_alloc.h"
55
56
#if defined(FIXED_POINT) && defined(__mips) && __mips == 32
57
#include "mips/mdct_mipsr1.h"
58
#endif
59
60
#ifndef M_PI
61
#define M_PI 3.141592653
62
#endif
63
64
#ifdef CUSTOM_MODES
65
66
int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch)
67
{
68
   int i;
69
   kiss_twiddle_scalar *trig;
70
   int shift;
71
   int N2=N>>1;
72
   l->n = N;
73
   l->maxshift = maxshift;
74
   for (i=0;i<=maxshift;i++)
75
   {
76
      if (i==0)
77
         l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch);
78
      else
79
         l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch);
80
#ifndef ENABLE_TI_DSPLIB55
81
      if (l->kfft[i]==NULL)
82
         return 0;
83
#endif
84
   }
85
   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar));
86
   if (l->trig==NULL)
87
     return 0;
88
   for (shift=0;shift<=maxshift;shift++)
89
   {
90
      /* We have enough points that sine isn't necessary */
91
#if defined(FIXED_POINT)
92
#ifndef ENABLE_QEXT
93
      for (i=0;i<N2;i++)
94
         trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
95
#else
96
      for (i=0;i<N2;i++)
97
         trig[i] = (kiss_twiddle_scalar)MAX32(-2147483647,MIN32(2147483647,floor(.5+2147483648*cos(2*M_PI*(i+.125)/N))));
98
#endif
99
#else
100
      for (i=0;i<N2;i++)
101
         trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
102
#endif
103
      trig += N2;
104
      N2 >>= 1;
105
      N >>= 1;
106
   }
107
   return 1;
108
}
109
110
void clt_mdct_clear(mdct_lookup *l, int arch)
111
{
112
   int i;
113
   for (i=0;i<=l->maxshift;i++)
114
      opus_fft_free(l->kfft[i], arch);
115
   opus_free((kiss_twiddle_scalar*)l->trig);
116
}
117
118
#endif /* CUSTOM_MODES */
119
120
/* Forward MDCT trashes the input array */
121
#ifndef OVERRIDE_clt_mdct_forward
122
void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
123
      const celt_coef *window, int overlap, int shift, int stride, int arch)
124
3.16M
{
125
3.16M
   int i;
126
3.16M
   int N, N2, N4;
127
3.16M
   VARDECL(kiss_fft_scalar, f);
128
3.16M
   VARDECL(kiss_fft_cpx, f2);
129
3.16M
   const kiss_fft_state *st = l->kfft[shift];
130
3.16M
   const kiss_twiddle_scalar *trig;
131
3.16M
   celt_coef scale;
132
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
   int scale_shift = st->scale_shift-1;
136
   int headroom;
137
#endif
138
3.16M
   SAVE_STACK;
139
3.16M
   (void)arch;
140
3.16M
   scale = st->scale;
141
142
3.16M
   N = l->n;
143
3.16M
   trig = l->trig;
144
11.1M
   for (i=0;i<shift;i++)
145
7.98M
   {
146
7.98M
      N >>= 1;
147
7.98M
      trig += N;
148
7.98M
   }
149
3.16M
   N2 = N>>1;
150
3.16M
   N4 = N>>2;
151
152
3.16M
   ALLOC(f, N2, kiss_fft_scalar);
153
3.16M
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
3.16M
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
3.16M
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
3.16M
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
3.16M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
3.16M
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
3.16M
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
99.4M
      for(i=0;i<((overlap+3)>>2);i++)
165
96.2M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
96.2M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
96.2M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
96.2M
         xp1+=2;
170
96.2M
         xp2-=2;
171
96.2M
         wp1+=2;
172
96.2M
         wp2-=2;
173
96.2M
      }
174
3.16M
      wp1 = window;
175
3.16M
      wp2 = window+overlap-1;
176
204M
      for(;i<N4-((overlap+3)>>2);i++)
177
200M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
200M
         *yp++ = *xp2;
180
200M
         *yp++ = *xp1;
181
200M
         xp1+=2;
182
200M
         xp2-=2;
183
200M
      }
184
99.4M
      for(;i<N4;i++)
185
96.2M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
96.2M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
96.2M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
96.2M
         xp1+=2;
190
96.2M
         xp2-=2;
191
96.2M
         wp1+=2;
192
96.2M
         wp2-=2;
193
96.2M
      }
194
3.16M
   }
195
   /* Pre-rotation */
196
3.16M
   {
197
3.16M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
3.16M
      const kiss_twiddle_scalar *t = &trig[0];
199
#ifdef FIXED_POINT
200
      opus_val32 maxval=1;
201
#endif
202
396M
      for(i=0;i<N4;i++)
203
393M
      {
204
393M
         kiss_fft_cpx yc;
205
393M
         kiss_twiddle_scalar t0, t1;
206
393M
         kiss_fft_scalar re, im, yr, yi;
207
393M
         t0 = t[i];
208
393M
         t1 = t[N4+i];
209
393M
         re = *yp++;
210
393M
         im = *yp++;
211
393M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
393M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
#ifdef ENABLE_QEXT
216
         yc.r = yr;
217
         yc.i = yi;
218
#else
219
196M
         yc.r = S_MUL2(yr, scale);
220
196M
         yc.i = S_MUL2(yi, scale);
221
#endif
222
#ifdef FIXED_POINT
223
205M
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
#endif
225
393M
         f2[st->bitrev[i]] = yc;
226
393M
      }
227
#ifdef FIXED_POINT
228
1.66M
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
#endif
230
3.16M
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
3.16M
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
3.16M
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
3.16M
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
3.16M
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
3.16M
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
3.16M
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
396M
      for(i=0;i<N4;i++)
244
393M
      {
245
393M
         kiss_fft_scalar yr, yi;
246
393M
         kiss_fft_scalar t0, t1;
247
#ifdef ENABLE_QEXT
248
196M
         t0 = S_MUL2(t[i], scale);
249
196M
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
         t0 = t[i];
252
         t1 = t[N4+i];
253
#endif
254
393M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
393M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
393M
         *yp1 = yr;
257
393M
         *yp2 = yi;
258
393M
         fp++;
259
393M
         yp1 += 2*stride;
260
393M
         yp2 -= 2*stride;
261
393M
      }
262
3.16M
   }
263
3.16M
   RESTORE_STACK;
264
3.16M
}
clt_mdct_forward_c
Line
Count
Source
124
831k
{
125
831k
   int i;
126
831k
   int N, N2, N4;
127
831k
   VARDECL(kiss_fft_scalar, f);
128
831k
   VARDECL(kiss_fft_cpx, f2);
129
831k
   const kiss_fft_state *st = l->kfft[shift];
130
831k
   const kiss_twiddle_scalar *trig;
131
831k
   celt_coef scale;
132
831k
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
831k
   int scale_shift = st->scale_shift-1;
136
831k
   int headroom;
137
831k
#endif
138
831k
   SAVE_STACK;
139
831k
   (void)arch;
140
831k
   scale = st->scale;
141
142
831k
   N = l->n;
143
831k
   trig = l->trig;
144
2.93M
   for (i=0;i<shift;i++)
145
2.09M
   {
146
2.09M
      N >>= 1;
147
2.09M
      trig += N;
148
2.09M
   }
149
831k
   N2 = N>>1;
150
831k
   N4 = N>>2;
151
152
831k
   ALLOC(f, N2, kiss_fft_scalar);
153
831k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
831k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
831k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
831k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
831k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
831k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
831k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
26.1M
      for(i=0;i<((overlap+3)>>2);i++)
165
25.2M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
25.2M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
25.2M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
25.2M
         xp1+=2;
170
25.2M
         xp2-=2;
171
25.2M
         wp1+=2;
172
25.2M
         wp2-=2;
173
25.2M
      }
174
831k
      wp1 = window;
175
831k
      wp2 = window+overlap-1;
176
53.1M
      for(;i<N4-((overlap+3)>>2);i++)
177
52.3M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
52.3M
         *yp++ = *xp2;
180
52.3M
         *yp++ = *xp1;
181
52.3M
         xp1+=2;
182
52.3M
         xp2-=2;
183
52.3M
      }
184
26.1M
      for(;i<N4;i++)
185
25.2M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
25.2M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
25.2M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
25.2M
         xp1+=2;
190
25.2M
         xp2-=2;
191
25.2M
         wp1+=2;
192
25.2M
         wp2-=2;
193
25.2M
      }
194
831k
   }
195
   /* Pre-rotation */
196
831k
   {
197
831k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
831k
      const kiss_twiddle_scalar *t = &trig[0];
199
831k
#ifdef FIXED_POINT
200
831k
      opus_val32 maxval=1;
201
831k
#endif
202
103M
      for(i=0;i<N4;i++)
203
102M
      {
204
102M
         kiss_fft_cpx yc;
205
102M
         kiss_twiddle_scalar t0, t1;
206
102M
         kiss_fft_scalar re, im, yr, yi;
207
102M
         t0 = t[i];
208
102M
         t1 = t[N4+i];
209
102M
         re = *yp++;
210
102M
         im = *yp++;
211
102M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
102M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
#ifdef ENABLE_QEXT
216
         yc.r = yr;
217
         yc.i = yi;
218
#else
219
102M
         yc.r = S_MUL2(yr, scale);
220
102M
         yc.i = S_MUL2(yi, scale);
221
102M
#endif
222
102M
#ifdef FIXED_POINT
223
102M
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
102M
#endif
225
102M
         f2[st->bitrev[i]] = yc;
226
102M
      }
227
831k
#ifdef FIXED_POINT
228
831k
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
831k
#endif
230
831k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
831k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
831k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
831k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
831k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
831k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
831k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
103M
      for(i=0;i<N4;i++)
244
102M
      {
245
102M
         kiss_fft_scalar yr, yi;
246
102M
         kiss_fft_scalar t0, t1;
247
#ifdef ENABLE_QEXT
248
         t0 = S_MUL2(t[i], scale);
249
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
102M
         t0 = t[i];
252
102M
         t1 = t[N4+i];
253
102M
#endif
254
102M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
102M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
102M
         *yp1 = yr;
257
102M
         *yp2 = yi;
258
102M
         fp++;
259
102M
         yp1 += 2*stride;
260
102M
         yp2 -= 2*stride;
261
102M
      }
262
831k
   }
263
831k
   RESTORE_STACK;
264
831k
}
clt_mdct_forward_c
Line
Count
Source
124
831k
{
125
831k
   int i;
126
831k
   int N, N2, N4;
127
831k
   VARDECL(kiss_fft_scalar, f);
128
831k
   VARDECL(kiss_fft_cpx, f2);
129
831k
   const kiss_fft_state *st = l->kfft[shift];
130
831k
   const kiss_twiddle_scalar *trig;
131
831k
   celt_coef scale;
132
831k
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
831k
   int scale_shift = st->scale_shift-1;
136
831k
   int headroom;
137
831k
#endif
138
831k
   SAVE_STACK;
139
831k
   (void)arch;
140
831k
   scale = st->scale;
141
142
831k
   N = l->n;
143
831k
   trig = l->trig;
144
2.93M
   for (i=0;i<shift;i++)
145
2.09M
   {
146
2.09M
      N >>= 1;
147
2.09M
      trig += N;
148
2.09M
   }
149
831k
   N2 = N>>1;
150
831k
   N4 = N>>2;
151
152
831k
   ALLOC(f, N2, kiss_fft_scalar);
153
831k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
831k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
831k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
831k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
831k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
831k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
831k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
26.1M
      for(i=0;i<((overlap+3)>>2);i++)
165
25.2M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
25.2M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
25.2M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
25.2M
         xp1+=2;
170
25.2M
         xp2-=2;
171
25.2M
         wp1+=2;
172
25.2M
         wp2-=2;
173
25.2M
      }
174
831k
      wp1 = window;
175
831k
      wp2 = window+overlap-1;
176
53.1M
      for(;i<N4-((overlap+3)>>2);i++)
177
52.3M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
52.3M
         *yp++ = *xp2;
180
52.3M
         *yp++ = *xp1;
181
52.3M
         xp1+=2;
182
52.3M
         xp2-=2;
183
52.3M
      }
184
26.1M
      for(;i<N4;i++)
185
25.2M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
25.2M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
25.2M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
25.2M
         xp1+=2;
190
25.2M
         xp2-=2;
191
25.2M
         wp1+=2;
192
25.2M
         wp2-=2;
193
25.2M
      }
194
831k
   }
195
   /* Pre-rotation */
196
831k
   {
197
831k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
831k
      const kiss_twiddle_scalar *t = &trig[0];
199
831k
#ifdef FIXED_POINT
200
831k
      opus_val32 maxval=1;
201
831k
#endif
202
103M
      for(i=0;i<N4;i++)
203
102M
      {
204
102M
         kiss_fft_cpx yc;
205
102M
         kiss_twiddle_scalar t0, t1;
206
102M
         kiss_fft_scalar re, im, yr, yi;
207
102M
         t0 = t[i];
208
102M
         t1 = t[N4+i];
209
102M
         re = *yp++;
210
102M
         im = *yp++;
211
102M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
102M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
102M
#ifdef ENABLE_QEXT
216
102M
         yc.r = yr;
217
102M
         yc.i = yi;
218
#else
219
         yc.r = S_MUL2(yr, scale);
220
         yc.i = S_MUL2(yi, scale);
221
#endif
222
102M
#ifdef FIXED_POINT
223
102M
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
102M
#endif
225
102M
         f2[st->bitrev[i]] = yc;
226
102M
      }
227
831k
#ifdef FIXED_POINT
228
831k
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
831k
#endif
230
831k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
831k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
831k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
831k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
831k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
831k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
831k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
103M
      for(i=0;i<N4;i++)
244
102M
      {
245
102M
         kiss_fft_scalar yr, yi;
246
102M
         kiss_fft_scalar t0, t1;
247
102M
#ifdef ENABLE_QEXT
248
102M
         t0 = S_MUL2(t[i], scale);
249
102M
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
         t0 = t[i];
252
         t1 = t[N4+i];
253
#endif
254
102M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
102M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
102M
         *yp1 = yr;
257
102M
         *yp2 = yi;
258
102M
         fp++;
259
102M
         yp1 += 2*stride;
260
102M
         yp2 -= 2*stride;
261
102M
      }
262
831k
   }
263
831k
   RESTORE_STACK;
264
831k
}
clt_mdct_forward_c
Line
Count
Source
124
750k
{
125
750k
   int i;
126
750k
   int N, N2, N4;
127
750k
   VARDECL(kiss_fft_scalar, f);
128
750k
   VARDECL(kiss_fft_cpx, f2);
129
750k
   const kiss_fft_state *st = l->kfft[shift];
130
750k
   const kiss_twiddle_scalar *trig;
131
750k
   celt_coef scale;
132
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
   int scale_shift = st->scale_shift-1;
136
   int headroom;
137
#endif
138
750k
   SAVE_STACK;
139
750k
   (void)arch;
140
750k
   scale = st->scale;
141
142
750k
   N = l->n;
143
750k
   trig = l->trig;
144
2.64M
   for (i=0;i<shift;i++)
145
1.89M
   {
146
1.89M
      N >>= 1;
147
1.89M
      trig += N;
148
1.89M
   }
149
750k
   N2 = N>>1;
150
750k
   N4 = N>>2;
151
152
750k
   ALLOC(f, N2, kiss_fft_scalar);
153
750k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
750k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
750k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
750k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
750k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
750k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
750k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
23.5M
      for(i=0;i<((overlap+3)>>2);i++)
165
22.8M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
22.8M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
22.8M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
22.8M
         xp1+=2;
170
22.8M
         xp2-=2;
171
22.8M
         wp1+=2;
172
22.8M
         wp2-=2;
173
22.8M
      }
174
750k
      wp1 = window;
175
750k
      wp2 = window+overlap-1;
176
48.8M
      for(;i<N4-((overlap+3)>>2);i++)
177
48.1M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
48.1M
         *yp++ = *xp2;
180
48.1M
         *yp++ = *xp1;
181
48.1M
         xp1+=2;
182
48.1M
         xp2-=2;
183
48.1M
      }
184
23.5M
      for(;i<N4;i++)
185
22.8M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
22.8M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
22.8M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
22.8M
         xp1+=2;
190
22.8M
         xp2-=2;
191
22.8M
         wp1+=2;
192
22.8M
         wp2-=2;
193
22.8M
      }
194
750k
   }
195
   /* Pre-rotation */
196
750k
   {
197
750k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
750k
      const kiss_twiddle_scalar *t = &trig[0];
199
#ifdef FIXED_POINT
200
      opus_val32 maxval=1;
201
#endif
202
94.5M
      for(i=0;i<N4;i++)
203
93.8M
      {
204
93.8M
         kiss_fft_cpx yc;
205
93.8M
         kiss_twiddle_scalar t0, t1;
206
93.8M
         kiss_fft_scalar re, im, yr, yi;
207
93.8M
         t0 = t[i];
208
93.8M
         t1 = t[N4+i];
209
93.8M
         re = *yp++;
210
93.8M
         im = *yp++;
211
93.8M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
93.8M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
93.8M
#ifdef ENABLE_QEXT
216
93.8M
         yc.r = yr;
217
93.8M
         yc.i = yi;
218
#else
219
         yc.r = S_MUL2(yr, scale);
220
         yc.i = S_MUL2(yi, scale);
221
#endif
222
#ifdef FIXED_POINT
223
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
#endif
225
93.8M
         f2[st->bitrev[i]] = yc;
226
93.8M
      }
227
#ifdef FIXED_POINT
228
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
#endif
230
750k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
750k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
750k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
750k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
750k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
750k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
750k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
94.5M
      for(i=0;i<N4;i++)
244
93.8M
      {
245
93.8M
         kiss_fft_scalar yr, yi;
246
93.8M
         kiss_fft_scalar t0, t1;
247
93.8M
#ifdef ENABLE_QEXT
248
93.8M
         t0 = S_MUL2(t[i], scale);
249
93.8M
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
         t0 = t[i];
252
         t1 = t[N4+i];
253
#endif
254
93.8M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
93.8M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
93.8M
         *yp1 = yr;
257
93.8M
         *yp2 = yi;
258
93.8M
         fp++;
259
93.8M
         yp1 += 2*stride;
260
93.8M
         yp2 -= 2*stride;
261
93.8M
      }
262
750k
   }
263
750k
   RESTORE_STACK;
264
750k
}
clt_mdct_forward_c
Line
Count
Source
124
750k
{
125
750k
   int i;
126
750k
   int N, N2, N4;
127
750k
   VARDECL(kiss_fft_scalar, f);
128
750k
   VARDECL(kiss_fft_cpx, f2);
129
750k
   const kiss_fft_state *st = l->kfft[shift];
130
750k
   const kiss_twiddle_scalar *trig;
131
750k
   celt_coef scale;
132
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
   int scale_shift = st->scale_shift-1;
136
   int headroom;
137
#endif
138
750k
   SAVE_STACK;
139
750k
   (void)arch;
140
750k
   scale = st->scale;
141
142
750k
   N = l->n;
143
750k
   trig = l->trig;
144
2.64M
   for (i=0;i<shift;i++)
145
1.89M
   {
146
1.89M
      N >>= 1;
147
1.89M
      trig += N;
148
1.89M
   }
149
750k
   N2 = N>>1;
150
750k
   N4 = N>>2;
151
152
750k
   ALLOC(f, N2, kiss_fft_scalar);
153
750k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
750k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
750k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
750k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
750k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
750k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
750k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
23.5M
      for(i=0;i<((overlap+3)>>2);i++)
165
22.8M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
22.8M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
22.8M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
22.8M
         xp1+=2;
170
22.8M
         xp2-=2;
171
22.8M
         wp1+=2;
172
22.8M
         wp2-=2;
173
22.8M
      }
174
750k
      wp1 = window;
175
750k
      wp2 = window+overlap-1;
176
48.8M
      for(;i<N4-((overlap+3)>>2);i++)
177
48.1M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
48.1M
         *yp++ = *xp2;
180
48.1M
         *yp++ = *xp1;
181
48.1M
         xp1+=2;
182
48.1M
         xp2-=2;
183
48.1M
      }
184
23.5M
      for(;i<N4;i++)
185
22.8M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
22.8M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
22.8M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
22.8M
         xp1+=2;
190
22.8M
         xp2-=2;
191
22.8M
         wp1+=2;
192
22.8M
         wp2-=2;
193
22.8M
      }
194
750k
   }
195
   /* Pre-rotation */
196
750k
   {
197
750k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
750k
      const kiss_twiddle_scalar *t = &trig[0];
199
#ifdef FIXED_POINT
200
      opus_val32 maxval=1;
201
#endif
202
94.5M
      for(i=0;i<N4;i++)
203
93.8M
      {
204
93.8M
         kiss_fft_cpx yc;
205
93.8M
         kiss_twiddle_scalar t0, t1;
206
93.8M
         kiss_fft_scalar re, im, yr, yi;
207
93.8M
         t0 = t[i];
208
93.8M
         t1 = t[N4+i];
209
93.8M
         re = *yp++;
210
93.8M
         im = *yp++;
211
93.8M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
93.8M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
#ifdef ENABLE_QEXT
216
         yc.r = yr;
217
         yc.i = yi;
218
#else
219
93.8M
         yc.r = S_MUL2(yr, scale);
220
93.8M
         yc.i = S_MUL2(yi, scale);
221
93.8M
#endif
222
#ifdef FIXED_POINT
223
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
#endif
225
93.8M
         f2[st->bitrev[i]] = yc;
226
93.8M
      }
227
#ifdef FIXED_POINT
228
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
#endif
230
750k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
750k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
750k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
750k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
750k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
750k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
750k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
94.5M
      for(i=0;i<N4;i++)
244
93.8M
      {
245
93.8M
         kiss_fft_scalar yr, yi;
246
93.8M
         kiss_fft_scalar t0, t1;
247
#ifdef ENABLE_QEXT
248
         t0 = S_MUL2(t[i], scale);
249
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
93.8M
         t0 = t[i];
252
93.8M
         t1 = t[N4+i];
253
93.8M
#endif
254
93.8M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
93.8M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
93.8M
         *yp1 = yr;
257
93.8M
         *yp2 = yi;
258
93.8M
         fp++;
259
93.8M
         yp1 += 2*stride;
260
93.8M
         yp2 -= 2*stride;
261
93.8M
      }
262
750k
   }
263
750k
   RESTORE_STACK;
264
750k
}
265
#endif /* OVERRIDE_clt_mdct_forward */
266
267
#ifndef OVERRIDE_clt_mdct_backward
268
void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
269
      const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
270
649k
{
271
649k
   int i;
272
649k
   int N, N2, N4;
273
649k
   const kiss_twiddle_scalar *trig;
274
#ifdef FIXED_POINT
275
   int pre_shift, post_shift, fft_shift;
276
#endif
277
649k
   (void) arch;
278
279
649k
   N = l->n;
280
649k
   trig = l->trig;
281
2.13M
   for (i=0;i<shift;i++)
282
1.48M
   {
283
1.48M
      N >>= 1;
284
1.48M
      trig += N;
285
1.48M
   }
286
649k
   N2 = N>>1;
287
649k
   N4 = N>>2;
288
289
#ifdef FIXED_POINT
290
   {
291
      opus_val32 sumval=N2;
292
      opus_val32 maxval=0;
293
92.3M
      for (i=0;i<N2;i++) {
294
91.9M
         maxval = MAX32(maxval, ABS32(in[i*stride]));
295
91.9M
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
296
91.9M
      }
297
339k
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
298
      /* Worst-case where all the energy goes to a single sample. */
299
339k
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
300
339k
      post_shift = IMIN(post_shift, pre_shift);
301
      fft_shift = pre_shift - post_shift;
302
   }
303
#endif
304
   /* Pre-rotate */
305
649k
   {
306
      /* Temp pointers to make it really clear to the compiler what we're doing */
307
649k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
308
649k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
309
649k
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
310
649k
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
311
649k
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
312
93.9M
      for(i=0;i<N4;i++)
313
93.2M
      {
314
93.2M
         int rev;
315
93.2M
         kiss_fft_scalar yr, yi;
316
93.2M
         opus_val32 x1, x2;
317
93.2M
         rev = *bitrev++;
318
93.2M
         x1 = SHL32_ovflw(*xp1, pre_shift);
319
93.2M
         x2 = SHL32_ovflw(*xp2, pre_shift);
320
93.2M
         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
321
93.2M
         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
322
         /* We swap real and imag because we use an FFT instead of an IFFT. */
323
93.2M
         yp[2*rev+1] = yr;
324
93.2M
         yp[2*rev] = yi;
325
         /* Storing the pre-rotation directly in the bitrev order. */
326
93.2M
         xp1+=2*stride;
327
93.2M
         xp2-=2*stride;
328
93.2M
      }
329
649k
   }
330
331
649k
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
332
333
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
334
      it in-place. */
335
649k
   {
336
649k
      kiss_fft_scalar * yp0 = out+(overlap>>1);
337
649k
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
338
649k
      const kiss_twiddle_scalar *t = &trig[0];
339
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
340
         middle pair will be computed twice. */
341
47.2M
      for(i=0;i<(N4+1)>>1;i++)
342
46.6M
      {
343
46.6M
         kiss_fft_scalar re, im, yr, yi;
344
46.6M
         kiss_twiddle_scalar t0, t1;
345
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
346
46.6M
         re = yp0[1];
347
46.6M
         im = yp0[0];
348
46.6M
         t0 = t[i];
349
46.6M
         t1 = t[N4+i];
350
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
351
46.6M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
352
46.6M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
353
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
354
46.6M
         re = yp1[1];
355
46.6M
         im = yp1[0];
356
46.6M
         yp0[0] = yr;
357
46.6M
         yp1[1] = yi;
358
359
46.6M
         t0 = t[(N4-i-1)];
360
46.6M
         t1 = t[(N2-i-1)];
361
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
362
46.6M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
363
46.6M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
364
46.6M
         yp1[0] = yr;
365
46.6M
         yp0[1] = yi;
366
46.6M
         yp0 += 2;
367
46.6M
         yp1 -= 2;
368
46.6M
      }
369
649k
   }
370
371
   /* Mirror on both sides for TDAC */
372
649k
   {
373
649k
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
374
649k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
375
649k
      const celt_coef * OPUS_RESTRICT wp1 = window;
376
649k
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
377
378
44.4M
      for(i = 0; i < overlap/2; i++)
379
43.7M
      {
380
43.7M
         kiss_fft_scalar x1, x2;
381
43.7M
         x1 = *xp1;
382
43.7M
         x2 = *yp1;
383
43.7M
         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
384
43.7M
         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
385
43.7M
         wp1++;
386
43.7M
         wp2--;
387
43.7M
      }
388
649k
   }
389
649k
}
clt_mdct_backward_c
Line
Count
Source
270
339k
{
271
339k
   int i;
272
339k
   int N, N2, N4;
273
339k
   const kiss_twiddle_scalar *trig;
274
339k
#ifdef FIXED_POINT
275
339k
   int pre_shift, post_shift, fft_shift;
276
339k
#endif
277
339k
   (void) arch;
278
279
339k
   N = l->n;
280
339k
   trig = l->trig;
281
1.13M
   for (i=0;i<shift;i++)
282
797k
   {
283
797k
      N >>= 1;
284
797k
      trig += N;
285
797k
   }
286
339k
   N2 = N>>1;
287
339k
   N4 = N>>2;
288
289
339k
#ifdef FIXED_POINT
290
339k
   {
291
339k
      opus_val32 sumval=N2;
292
339k
      opus_val32 maxval=0;
293
92.3M
      for (i=0;i<N2;i++) {
294
91.9M
         maxval = MAX32(maxval, ABS32(in[i*stride]));
295
91.9M
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
296
91.9M
      }
297
339k
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
298
      /* Worst-case where all the energy goes to a single sample. */
299
339k
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
300
339k
      post_shift = IMIN(post_shift, pre_shift);
301
339k
      fft_shift = pre_shift - post_shift;
302
339k
   }
303
339k
#endif
304
   /* Pre-rotate */
305
339k
   {
306
      /* Temp pointers to make it really clear to the compiler what we're doing */
307
339k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
308
339k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
309
339k
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
310
339k
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
311
339k
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
312
46.3M
      for(i=0;i<N4;i++)
313
45.9M
      {
314
45.9M
         int rev;
315
45.9M
         kiss_fft_scalar yr, yi;
316
45.9M
         opus_val32 x1, x2;
317
45.9M
         rev = *bitrev++;
318
45.9M
         x1 = SHL32_ovflw(*xp1, pre_shift);
319
45.9M
         x2 = SHL32_ovflw(*xp2, pre_shift);
320
45.9M
         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
321
45.9M
         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
322
         /* We swap real and imag because we use an FFT instead of an IFFT. */
323
45.9M
         yp[2*rev+1] = yr;
324
45.9M
         yp[2*rev] = yi;
325
         /* Storing the pre-rotation directly in the bitrev order. */
326
45.9M
         xp1+=2*stride;
327
45.9M
         xp2-=2*stride;
328
45.9M
      }
329
339k
   }
330
331
339k
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
332
333
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
334
      it in-place. */
335
339k
   {
336
339k
      kiss_fft_scalar * yp0 = out+(overlap>>1);
337
339k
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
338
339k
      const kiss_twiddle_scalar *t = &trig[0];
339
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
340
         middle pair will be computed twice. */
341
23.3M
      for(i=0;i<(N4+1)>>1;i++)
342
22.9M
      {
343
22.9M
         kiss_fft_scalar re, im, yr, yi;
344
22.9M
         kiss_twiddle_scalar t0, t1;
345
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
346
22.9M
         re = yp0[1];
347
22.9M
         im = yp0[0];
348
22.9M
         t0 = t[i];
349
22.9M
         t1 = t[N4+i];
350
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
351
22.9M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
352
22.9M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
353
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
354
22.9M
         re = yp1[1];
355
22.9M
         im = yp1[0];
356
22.9M
         yp0[0] = yr;
357
22.9M
         yp1[1] = yi;
358
359
22.9M
         t0 = t[(N4-i-1)];
360
22.9M
         t1 = t[(N2-i-1)];
361
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
362
22.9M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
363
22.9M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
364
22.9M
         yp1[0] = yr;
365
22.9M
         yp0[1] = yi;
366
22.9M
         yp0 += 2;
367
22.9M
         yp1 -= 2;
368
22.9M
      }
369
339k
   }
370
371
   /* Mirror on both sides for TDAC */
372
339k
   {
373
339k
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
374
339k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
375
339k
      const celt_coef * OPUS_RESTRICT wp1 = window;
376
339k
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
377
378
23.3M
      for(i = 0; i < overlap/2; i++)
379
23.0M
      {
380
23.0M
         kiss_fft_scalar x1, x2;
381
23.0M
         x1 = *xp1;
382
23.0M
         x2 = *yp1;
383
23.0M
         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
384
23.0M
         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
385
23.0M
         wp1++;
386
23.0M
         wp2--;
387
23.0M
      }
388
339k
   }
389
339k
}
clt_mdct_backward_c
Line
Count
Source
270
310k
{
271
310k
   int i;
272
310k
   int N, N2, N4;
273
310k
   const kiss_twiddle_scalar *trig;
274
#ifdef FIXED_POINT
275
   int pre_shift, post_shift, fft_shift;
276
#endif
277
310k
   (void) arch;
278
279
310k
   N = l->n;
280
310k
   trig = l->trig;
281
999k
   for (i=0;i<shift;i++)
282
689k
   {
283
689k
      N >>= 1;
284
689k
      trig += N;
285
689k
   }
286
310k
   N2 = N>>1;
287
310k
   N4 = N>>2;
288
289
#ifdef FIXED_POINT
290
   {
291
      opus_val32 sumval=N2;
292
      opus_val32 maxval=0;
293
      for (i=0;i<N2;i++) {
294
         maxval = MAX32(maxval, ABS32(in[i*stride]));
295
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
296
      }
297
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
298
      /* Worst-case where all the energy goes to a single sample. */
299
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
300
      post_shift = IMIN(post_shift, pre_shift);
301
      fft_shift = pre_shift - post_shift;
302
   }
303
#endif
304
   /* Pre-rotate */
305
310k
   {
306
      /* Temp pointers to make it really clear to the compiler what we're doing */
307
310k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
308
310k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
309
310k
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
310
310k
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
311
310k
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
312
47.5M
      for(i=0;i<N4;i++)
313
47.2M
      {
314
47.2M
         int rev;
315
47.2M
         kiss_fft_scalar yr, yi;
316
47.2M
         opus_val32 x1, x2;
317
47.2M
         rev = *bitrev++;
318
47.2M
         x1 = SHL32_ovflw(*xp1, pre_shift);
319
47.2M
         x2 = SHL32_ovflw(*xp2, pre_shift);
320
47.2M
         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
321
47.2M
         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
322
         /* We swap real and imag because we use an FFT instead of an IFFT. */
323
47.2M
         yp[2*rev+1] = yr;
324
47.2M
         yp[2*rev] = yi;
325
         /* Storing the pre-rotation directly in the bitrev order. */
326
47.2M
         xp1+=2*stride;
327
47.2M
         xp2-=2*stride;
328
47.2M
      }
329
310k
   }
330
331
310k
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
332
333
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
334
      it in-place. */
335
310k
   {
336
310k
      kiss_fft_scalar * yp0 = out+(overlap>>1);
337
310k
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
338
310k
      const kiss_twiddle_scalar *t = &trig[0];
339
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
340
         middle pair will be computed twice. */
341
23.9M
      for(i=0;i<(N4+1)>>1;i++)
342
23.6M
      {
343
23.6M
         kiss_fft_scalar re, im, yr, yi;
344
23.6M
         kiss_twiddle_scalar t0, t1;
345
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
346
23.6M
         re = yp0[1];
347
23.6M
         im = yp0[0];
348
23.6M
         t0 = t[i];
349
23.6M
         t1 = t[N4+i];
350
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
351
23.6M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
352
23.6M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
353
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
354
23.6M
         re = yp1[1];
355
23.6M
         im = yp1[0];
356
23.6M
         yp0[0] = yr;
357
23.6M
         yp1[1] = yi;
358
359
23.6M
         t0 = t[(N4-i-1)];
360
23.6M
         t1 = t[(N2-i-1)];
361
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
362
23.6M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
363
23.6M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
364
23.6M
         yp1[0] = yr;
365
23.6M
         yp0[1] = yi;
366
23.6M
         yp0 += 2;
367
23.6M
         yp1 -= 2;
368
23.6M
      }
369
310k
   }
370
371
   /* Mirror on both sides for TDAC */
372
310k
   {
373
310k
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
374
310k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
375
310k
      const celt_coef * OPUS_RESTRICT wp1 = window;
376
310k
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
377
378
21.0M
      for(i = 0; i < overlap/2; i++)
379
20.7M
      {
380
20.7M
         kiss_fft_scalar x1, x2;
381
20.7M
         x1 = *xp1;
382
20.7M
         x2 = *yp1;
383
20.7M
         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
384
20.7M
         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
385
20.7M
         wp1++;
386
20.7M
         wp2--;
387
20.7M
      }
388
310k
   }
389
310k
}
390
#endif /* OVERRIDE_clt_mdct_backward */