Coverage Report

Created: 2025-10-13 07:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/opus/celt/mdct.c
Line
Count
Source
1
/* Copyright (c) 2007-2008 CSIRO
2
   Copyright (c) 2007-2008 Xiph.Org Foundation
3
   Written by Jean-Marc Valin */
4
/*
5
   Redistribution and use in source and binary forms, with or without
6
   modification, are permitted provided that the following conditions
7
   are met:
8
9
   - Redistributions of source code must retain the above copyright
10
   notice, this list of conditions and the following disclaimer.
11
12
   - Redistributions in binary form must reproduce the above copyright
13
   notice, this list of conditions and the following disclaimer in the
14
   documentation and/or other materials provided with the distribution.
15
16
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
*/
28
29
/* This is a simple MDCT implementation that uses a N/4 complex FFT
30
   to do most of the work. It should be relatively straightforward to
31
   plug in pretty much and FFT here.
32
33
   This replaces the Vorbis FFT (and uses the exact same API), which
34
   was a bit too messy and that was ending up duplicating code
35
   (might as well use the same FFT everywhere).
36
37
   The algorithm is similar to (and inspired from) Fabrice Bellard's
38
   MDCT implementation in FFMPEG, but has differences in signs, ordering
39
   and scaling in many places.
40
*/
41
42
#ifndef SKIP_CONFIG_H
43
#ifdef HAVE_CONFIG_H
44
#include "config.h"
45
#endif
46
#endif
47
48
#include "mdct.h"
49
#include "kiss_fft.h"
50
#include "_kiss_fft_guts.h"
51
#include <math.h>
52
#include "os_support.h"
53
#include "mathops.h"
54
#include "stack_alloc.h"
55
56
#if defined(MIPSr1_ASM)
57
#include "mips/mdct_mipsr1.h"
58
#endif
59
60
#ifndef M_PI
61
#define M_PI 3.141592653
62
#endif
63
64
#ifdef CUSTOM_MODES
65
66
int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch)
67
{
68
   int i;
69
   kiss_twiddle_scalar *trig;
70
   int shift;
71
   int N2=N>>1;
72
   l->n = N;
73
   l->maxshift = maxshift;
74
   for (i=0;i<=maxshift;i++)
75
   {
76
      if (i==0)
77
         l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch);
78
      else
79
         l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch);
80
#ifndef ENABLE_TI_DSPLIB55
81
      if (l->kfft[i]==NULL)
82
         return 0;
83
#endif
84
   }
85
   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar));
86
   if (l->trig==NULL)
87
     return 0;
88
   for (shift=0;shift<=maxshift;shift++)
89
   {
90
      /* We have enough points that sine isn't necessary */
91
#if defined(FIXED_POINT)
92
#ifndef ENABLE_QEXT
93
      for (i=0;i<N2;i++)
94
         trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
95
#else
96
      for (i=0;i<N2;i++)
97
         trig[i] = (kiss_twiddle_scalar)MAX32(-2147483647,MIN32(2147483647,floor(.5+2147483648*cos(2*M_PI*(i+.125)/N))));
98
#endif
99
#else
100
      for (i=0;i<N2;i++)
101
         trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
102
#endif
103
      trig += N2;
104
      N2 >>= 1;
105
      N >>= 1;
106
   }
107
   return 1;
108
}
109
110
void clt_mdct_clear(mdct_lookup *l, int arch)
111
{
112
   int i;
113
   for (i=0;i<=l->maxshift;i++)
114
      opus_fft_free(l->kfft[i], arch);
115
   opus_free((kiss_twiddle_scalar*)l->trig);
116
}
117
118
#endif /* CUSTOM_MODES */
119
120
/* Forward MDCT trashes the input array */
121
#ifndef OVERRIDE_clt_mdct_forward
122
void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
123
      const celt_coef *window, int overlap, int shift, int stride, int arch)
124
3.05M
{
125
3.05M
   int i;
126
3.05M
   int N, N2, N4;
127
3.05M
   VARDECL(kiss_fft_scalar, f);
128
3.05M
   VARDECL(kiss_fft_cpx, f2);
129
3.05M
   const kiss_fft_state *st = l->kfft[shift];
130
3.05M
   const kiss_twiddle_scalar *trig;
131
3.05M
   celt_coef scale;
132
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
   int scale_shift = st->scale_shift-1;
136
   int headroom;
137
#endif
138
3.05M
   SAVE_STACK;
139
3.05M
   (void)arch;
140
3.05M
   scale = st->scale;
141
142
3.05M
   N = l->n;
143
3.05M
   trig = l->trig;
144
10.7M
   for (i=0;i<shift;i++)
145
7.72M
   {
146
7.72M
      N >>= 1;
147
7.72M
      trig += N;
148
7.72M
   }
149
3.05M
   N2 = N>>1;
150
3.05M
   N4 = N>>2;
151
152
3.05M
   ALLOC(f, N2, kiss_fft_scalar);
153
3.05M
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
3.05M
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
3.05M
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
3.05M
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
3.05M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
3.05M
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
3.05M
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
95.9M
      for(i=0;i<((overlap+3)>>2);i++)
165
92.9M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
92.9M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
92.9M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
92.9M
         xp1+=2;
170
92.9M
         xp2-=2;
171
92.9M
         wp1+=2;
172
92.9M
         wp2-=2;
173
92.9M
      }
174
3.05M
      wp1 = window;
175
3.05M
      wp2 = window+overlap-1;
176
194M
      for(;i<N4-((overlap+3)>>2);i++)
177
191M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
191M
         *yp++ = *xp2;
180
191M
         *yp++ = *xp1;
181
191M
         xp1+=2;
182
191M
         xp2-=2;
183
191M
      }
184
95.9M
      for(;i<N4;i++)
185
92.9M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
92.9M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
92.9M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
92.9M
         xp1+=2;
190
92.9M
         xp2-=2;
191
92.9M
         wp1+=2;
192
92.9M
         wp2-=2;
193
92.9M
      }
194
3.05M
   }
195
   /* Pre-rotation */
196
3.05M
   {
197
3.05M
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
3.05M
      const kiss_twiddle_scalar *t = &trig[0];
199
#ifdef FIXED_POINT
200
      opus_val32 maxval=1;
201
#endif
202
380M
      for(i=0;i<N4;i++)
203
377M
      {
204
377M
         kiss_fft_cpx yc;
205
377M
         kiss_twiddle_scalar t0, t1;
206
377M
         kiss_fft_scalar re, im, yr, yi;
207
377M
         t0 = t[i];
208
377M
         t1 = t[N4+i];
209
377M
         re = *yp++;
210
377M
         im = *yp++;
211
377M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
377M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
#ifdef ENABLE_QEXT
216
         yc.r = yr;
217
         yc.i = yi;
218
#else
219
188M
         yc.r = S_MUL2(yr, scale);
220
188M
         yc.i = S_MUL2(yi, scale);
221
#endif
222
#ifdef FIXED_POINT
223
202M
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
#endif
225
377M
         f2[st->bitrev[i]] = yc;
226
377M
      }
227
#ifdef FIXED_POINT
228
1.62M
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
#endif
230
3.05M
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
3.05M
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
3.05M
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
3.05M
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
3.05M
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
3.05M
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
3.05M
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
380M
      for(i=0;i<N4;i++)
244
377M
      {
245
377M
         kiss_fft_scalar yr, yi;
246
377M
         kiss_fft_scalar t0, t1;
247
#ifdef ENABLE_QEXT
248
188M
         t0 = S_MUL2(t[i], scale);
249
188M
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
         t0 = t[i];
252
         t1 = t[N4+i];
253
#endif
254
377M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
377M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
377M
         *yp1 = yr;
257
377M
         *yp2 = yi;
258
377M
         fp++;
259
377M
         yp1 += 2*stride;
260
377M
         yp2 -= 2*stride;
261
377M
      }
262
3.05M
   }
263
3.05M
   RESTORE_STACK;
264
3.05M
}
clt_mdct_forward_c
Line
Count
Source
124
812k
{
125
812k
   int i;
126
812k
   int N, N2, N4;
127
812k
   VARDECL(kiss_fft_scalar, f);
128
812k
   VARDECL(kiss_fft_cpx, f2);
129
812k
   const kiss_fft_state *st = l->kfft[shift];
130
812k
   const kiss_twiddle_scalar *trig;
131
812k
   celt_coef scale;
132
812k
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
812k
   int scale_shift = st->scale_shift-1;
136
812k
   int headroom;
137
812k
#endif
138
812k
   SAVE_STACK;
139
812k
   (void)arch;
140
812k
   scale = st->scale;
141
142
812k
   N = l->n;
143
812k
   trig = l->trig;
144
2.86M
   for (i=0;i<shift;i++)
145
2.05M
   {
146
2.05M
      N >>= 1;
147
2.05M
      trig += N;
148
2.05M
   }
149
812k
   N2 = N>>1;
150
812k
   N4 = N>>2;
151
152
812k
   ALLOC(f, N2, kiss_fft_scalar);
153
812k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
812k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
812k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
812k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
812k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
812k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
812k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
25.5M
      for(i=0;i<((overlap+3)>>2);i++)
165
24.7M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
24.7M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
24.7M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
24.7M
         xp1+=2;
170
24.7M
         xp2-=2;
171
24.7M
         wp1+=2;
172
24.7M
         wp2-=2;
173
24.7M
      }
174
812k
      wp1 = window;
175
812k
      wp2 = window+overlap-1;
176
52.5M
      for(;i<N4-((overlap+3)>>2);i++)
177
51.6M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
51.6M
         *yp++ = *xp2;
180
51.6M
         *yp++ = *xp1;
181
51.6M
         xp1+=2;
182
51.6M
         xp2-=2;
183
51.6M
      }
184
25.5M
      for(;i<N4;i++)
185
24.7M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
24.7M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
24.7M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
24.7M
         xp1+=2;
190
24.7M
         xp2-=2;
191
24.7M
         wp1+=2;
192
24.7M
         wp2-=2;
193
24.7M
      }
194
812k
   }
195
   /* Pre-rotation */
196
812k
   {
197
812k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
812k
      const kiss_twiddle_scalar *t = &trig[0];
199
812k
#ifdef FIXED_POINT
200
812k
      opus_val32 maxval=1;
201
812k
#endif
202
102M
      for(i=0;i<N4;i++)
203
101M
      {
204
101M
         kiss_fft_cpx yc;
205
101M
         kiss_twiddle_scalar t0, t1;
206
101M
         kiss_fft_scalar re, im, yr, yi;
207
101M
         t0 = t[i];
208
101M
         t1 = t[N4+i];
209
101M
         re = *yp++;
210
101M
         im = *yp++;
211
101M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
101M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
#ifdef ENABLE_QEXT
216
         yc.r = yr;
217
         yc.i = yi;
218
#else
219
101M
         yc.r = S_MUL2(yr, scale);
220
101M
         yc.i = S_MUL2(yi, scale);
221
101M
#endif
222
101M
#ifdef FIXED_POINT
223
101M
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
101M
#endif
225
101M
         f2[st->bitrev[i]] = yc;
226
101M
      }
227
812k
#ifdef FIXED_POINT
228
812k
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
812k
#endif
230
812k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
812k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
812k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
812k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
812k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
812k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
812k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
102M
      for(i=0;i<N4;i++)
244
101M
      {
245
101M
         kiss_fft_scalar yr, yi;
246
101M
         kiss_fft_scalar t0, t1;
247
#ifdef ENABLE_QEXT
248
         t0 = S_MUL2(t[i], scale);
249
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
101M
         t0 = t[i];
252
101M
         t1 = t[N4+i];
253
101M
#endif
254
101M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
101M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
101M
         *yp1 = yr;
257
101M
         *yp2 = yi;
258
101M
         fp++;
259
101M
         yp1 += 2*stride;
260
101M
         yp2 -= 2*stride;
261
101M
      }
262
812k
   }
263
812k
   RESTORE_STACK;
264
812k
}
clt_mdct_forward_c
Line
Count
Source
124
812k
{
125
812k
   int i;
126
812k
   int N, N2, N4;
127
812k
   VARDECL(kiss_fft_scalar, f);
128
812k
   VARDECL(kiss_fft_cpx, f2);
129
812k
   const kiss_fft_state *st = l->kfft[shift];
130
812k
   const kiss_twiddle_scalar *trig;
131
812k
   celt_coef scale;
132
812k
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
812k
   int scale_shift = st->scale_shift-1;
136
812k
   int headroom;
137
812k
#endif
138
812k
   SAVE_STACK;
139
812k
   (void)arch;
140
812k
   scale = st->scale;
141
142
812k
   N = l->n;
143
812k
   trig = l->trig;
144
2.86M
   for (i=0;i<shift;i++)
145
2.05M
   {
146
2.05M
      N >>= 1;
147
2.05M
      trig += N;
148
2.05M
   }
149
812k
   N2 = N>>1;
150
812k
   N4 = N>>2;
151
152
812k
   ALLOC(f, N2, kiss_fft_scalar);
153
812k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
812k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
812k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
812k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
812k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
812k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
812k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
25.5M
      for(i=0;i<((overlap+3)>>2);i++)
165
24.7M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
24.7M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
24.7M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
24.7M
         xp1+=2;
170
24.7M
         xp2-=2;
171
24.7M
         wp1+=2;
172
24.7M
         wp2-=2;
173
24.7M
      }
174
812k
      wp1 = window;
175
812k
      wp2 = window+overlap-1;
176
52.5M
      for(;i<N4-((overlap+3)>>2);i++)
177
51.6M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
51.6M
         *yp++ = *xp2;
180
51.6M
         *yp++ = *xp1;
181
51.6M
         xp1+=2;
182
51.6M
         xp2-=2;
183
51.6M
      }
184
25.5M
      for(;i<N4;i++)
185
24.7M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
24.7M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
24.7M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
24.7M
         xp1+=2;
190
24.7M
         xp2-=2;
191
24.7M
         wp1+=2;
192
24.7M
         wp2-=2;
193
24.7M
      }
194
812k
   }
195
   /* Pre-rotation */
196
812k
   {
197
812k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
812k
      const kiss_twiddle_scalar *t = &trig[0];
199
812k
#ifdef FIXED_POINT
200
812k
      opus_val32 maxval=1;
201
812k
#endif
202
102M
      for(i=0;i<N4;i++)
203
101M
      {
204
101M
         kiss_fft_cpx yc;
205
101M
         kiss_twiddle_scalar t0, t1;
206
101M
         kiss_fft_scalar re, im, yr, yi;
207
101M
         t0 = t[i];
208
101M
         t1 = t[N4+i];
209
101M
         re = *yp++;
210
101M
         im = *yp++;
211
101M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
101M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
101M
#ifdef ENABLE_QEXT
216
101M
         yc.r = yr;
217
101M
         yc.i = yi;
218
#else
219
         yc.r = S_MUL2(yr, scale);
220
         yc.i = S_MUL2(yi, scale);
221
#endif
222
101M
#ifdef FIXED_POINT
223
101M
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
101M
#endif
225
101M
         f2[st->bitrev[i]] = yc;
226
101M
      }
227
812k
#ifdef FIXED_POINT
228
812k
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
812k
#endif
230
812k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
812k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
812k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
812k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
812k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
812k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
812k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
102M
      for(i=0;i<N4;i++)
244
101M
      {
245
101M
         kiss_fft_scalar yr, yi;
246
101M
         kiss_fft_scalar t0, t1;
247
101M
#ifdef ENABLE_QEXT
248
101M
         t0 = S_MUL2(t[i], scale);
249
101M
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
         t0 = t[i];
252
         t1 = t[N4+i];
253
#endif
254
101M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
101M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
101M
         *yp1 = yr;
257
101M
         *yp2 = yi;
258
101M
         fp++;
259
101M
         yp1 += 2*stride;
260
101M
         yp2 -= 2*stride;
261
101M
      }
262
812k
   }
263
812k
   RESTORE_STACK;
264
812k
}
clt_mdct_forward_c
Line
Count
Source
124
713k
{
125
713k
   int i;
126
713k
   int N, N2, N4;
127
713k
   VARDECL(kiss_fft_scalar, f);
128
713k
   VARDECL(kiss_fft_cpx, f2);
129
713k
   const kiss_fft_state *st = l->kfft[shift];
130
713k
   const kiss_twiddle_scalar *trig;
131
713k
   celt_coef scale;
132
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
   int scale_shift = st->scale_shift-1;
136
   int headroom;
137
#endif
138
713k
   SAVE_STACK;
139
713k
   (void)arch;
140
713k
   scale = st->scale;
141
142
713k
   N = l->n;
143
713k
   trig = l->trig;
144
2.52M
   for (i=0;i<shift;i++)
145
1.81M
   {
146
1.81M
      N >>= 1;
147
1.81M
      trig += N;
148
1.81M
   }
149
713k
   N2 = N>>1;
150
713k
   N4 = N>>2;
151
152
713k
   ALLOC(f, N2, kiss_fft_scalar);
153
713k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
713k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
713k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
713k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
713k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
713k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
713k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
22.3M
      for(i=0;i<((overlap+3)>>2);i++)
165
21.6M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
21.6M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
21.6M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
21.6M
         xp1+=2;
170
21.6M
         xp2-=2;
171
21.6M
         wp1+=2;
172
21.6M
         wp2-=2;
173
21.6M
      }
174
713k
      wp1 = window;
175
713k
      wp2 = window+overlap-1;
176
44.8M
      for(;i<N4-((overlap+3)>>2);i++)
177
44.1M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
44.1M
         *yp++ = *xp2;
180
44.1M
         *yp++ = *xp1;
181
44.1M
         xp1+=2;
182
44.1M
         xp2-=2;
183
44.1M
      }
184
22.3M
      for(;i<N4;i++)
185
21.6M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
21.6M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
21.6M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
21.6M
         xp1+=2;
190
21.6M
         xp2-=2;
191
21.6M
         wp1+=2;
192
21.6M
         wp2-=2;
193
21.6M
      }
194
713k
   }
195
   /* Pre-rotation */
196
713k
   {
197
713k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
713k
      const kiss_twiddle_scalar *t = &trig[0];
199
#ifdef FIXED_POINT
200
      opus_val32 maxval=1;
201
#endif
202
88.1M
      for(i=0;i<N4;i++)
203
87.4M
      {
204
87.4M
         kiss_fft_cpx yc;
205
87.4M
         kiss_twiddle_scalar t0, t1;
206
87.4M
         kiss_fft_scalar re, im, yr, yi;
207
87.4M
         t0 = t[i];
208
87.4M
         t1 = t[N4+i];
209
87.4M
         re = *yp++;
210
87.4M
         im = *yp++;
211
87.4M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
87.4M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
87.4M
#ifdef ENABLE_QEXT
216
87.4M
         yc.r = yr;
217
87.4M
         yc.i = yi;
218
#else
219
         yc.r = S_MUL2(yr, scale);
220
         yc.i = S_MUL2(yi, scale);
221
#endif
222
#ifdef FIXED_POINT
223
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
#endif
225
87.4M
         f2[st->bitrev[i]] = yc;
226
87.4M
      }
227
#ifdef FIXED_POINT
228
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
#endif
230
713k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
713k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
713k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
713k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
713k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
713k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
713k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
88.1M
      for(i=0;i<N4;i++)
244
87.4M
      {
245
87.4M
         kiss_fft_scalar yr, yi;
246
87.4M
         kiss_fft_scalar t0, t1;
247
87.4M
#ifdef ENABLE_QEXT
248
87.4M
         t0 = S_MUL2(t[i], scale);
249
87.4M
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
         t0 = t[i];
252
         t1 = t[N4+i];
253
#endif
254
87.4M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
87.4M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
87.4M
         *yp1 = yr;
257
87.4M
         *yp2 = yi;
258
87.4M
         fp++;
259
87.4M
         yp1 += 2*stride;
260
87.4M
         yp2 -= 2*stride;
261
87.4M
      }
262
713k
   }
263
713k
   RESTORE_STACK;
264
713k
}
clt_mdct_forward_c
Line
Count
Source
124
713k
{
125
713k
   int i;
126
713k
   int N, N2, N4;
127
713k
   VARDECL(kiss_fft_scalar, f);
128
713k
   VARDECL(kiss_fft_cpx, f2);
129
713k
   const kiss_fft_state *st = l->kfft[shift];
130
713k
   const kiss_twiddle_scalar *trig;
131
713k
   celt_coef scale;
132
#ifdef FIXED_POINT
133
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
134
      MULT16_32_Q15() on ARM. */
135
   int scale_shift = st->scale_shift-1;
136
   int headroom;
137
#endif
138
713k
   SAVE_STACK;
139
713k
   (void)arch;
140
713k
   scale = st->scale;
141
142
713k
   N = l->n;
143
713k
   trig = l->trig;
144
2.52M
   for (i=0;i<shift;i++)
145
1.81M
   {
146
1.81M
      N >>= 1;
147
1.81M
      trig += N;
148
1.81M
   }
149
713k
   N2 = N>>1;
150
713k
   N4 = N>>2;
151
152
713k
   ALLOC(f, N2, kiss_fft_scalar);
153
713k
   ALLOC(f2, N4, kiss_fft_cpx);
154
155
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
156
   /* Window, shuffle, fold */
157
713k
   {
158
      /* Temp pointers to make it really clear to the compiler what we're doing */
159
713k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
160
713k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
161
713k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
162
713k
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
163
713k
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
164
22.3M
      for(i=0;i<((overlap+3)>>2);i++)
165
21.6M
      {
166
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
167
21.6M
         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
168
21.6M
         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
169
21.6M
         xp1+=2;
170
21.6M
         xp2-=2;
171
21.6M
         wp1+=2;
172
21.6M
         wp2-=2;
173
21.6M
      }
174
713k
      wp1 = window;
175
713k
      wp2 = window+overlap-1;
176
44.8M
      for(;i<N4-((overlap+3)>>2);i++)
177
44.1M
      {
178
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
179
44.1M
         *yp++ = *xp2;
180
44.1M
         *yp++ = *xp1;
181
44.1M
         xp1+=2;
182
44.1M
         xp2-=2;
183
44.1M
      }
184
22.3M
      for(;i<N4;i++)
185
21.6M
      {
186
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
187
21.6M
         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
188
21.6M
         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
189
21.6M
         xp1+=2;
190
21.6M
         xp2-=2;
191
21.6M
         wp1+=2;
192
21.6M
         wp2-=2;
193
21.6M
      }
194
713k
   }
195
   /* Pre-rotation */
196
713k
   {
197
713k
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
198
713k
      const kiss_twiddle_scalar *t = &trig[0];
199
#ifdef FIXED_POINT
200
      opus_val32 maxval=1;
201
#endif
202
88.1M
      for(i=0;i<N4;i++)
203
87.4M
      {
204
87.4M
         kiss_fft_cpx yc;
205
87.4M
         kiss_twiddle_scalar t0, t1;
206
87.4M
         kiss_fft_scalar re, im, yr, yi;
207
87.4M
         t0 = t[i];
208
87.4M
         t1 = t[N4+i];
209
87.4M
         re = *yp++;
210
87.4M
         im = *yp++;
211
87.4M
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
212
87.4M
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
213
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
214
            For floating-point it doesn't matter. */
215
#ifdef ENABLE_QEXT
216
         yc.r = yr;
217
         yc.i = yi;
218
#else
219
87.4M
         yc.r = S_MUL2(yr, scale);
220
87.4M
         yc.i = S_MUL2(yi, scale);
221
87.4M
#endif
222
#ifdef FIXED_POINT
223
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
224
#endif
225
87.4M
         f2[st->bitrev[i]] = yc;
226
87.4M
      }
227
#ifdef FIXED_POINT
228
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
229
#endif
230
713k
   }
231
232
   /* N/4 complex FFT, does not downscale anymore */
233
713k
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
234
235
   /* Post-rotate */
236
713k
   {
237
      /* Temp pointers to make it really clear to the compiler what we're doing */
238
713k
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
239
713k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
240
713k
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
241
713k
      const kiss_twiddle_scalar *t = &trig[0];
242
      /* Temp pointers to make it really clear to the compiler what we're doing */
243
88.1M
      for(i=0;i<N4;i++)
244
87.4M
      {
245
87.4M
         kiss_fft_scalar yr, yi;
246
87.4M
         kiss_fft_scalar t0, t1;
247
#ifdef ENABLE_QEXT
248
         t0 = S_MUL2(t[i], scale);
249
         t1 = S_MUL2(t[N4+i], scale);
250
#else
251
87.4M
         t0 = t[i];
252
87.4M
         t1 = t[N4+i];
253
87.4M
#endif
254
87.4M
         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
255
87.4M
         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
256
87.4M
         *yp1 = yr;
257
87.4M
         *yp2 = yi;
258
87.4M
         fp++;
259
87.4M
         yp1 += 2*stride;
260
87.4M
         yp2 -= 2*stride;
261
87.4M
      }
262
713k
   }
263
713k
   RESTORE_STACK;
264
713k
}
265
#endif /* OVERRIDE_clt_mdct_forward */
266
267
#ifndef OVERRIDE_clt_mdct_backward
268
void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
269
      const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
270
609k
{
271
609k
   int i;
272
609k
   int N, N2, N4;
273
609k
   const kiss_twiddle_scalar *trig;
274
#ifdef FIXED_POINT
275
   int pre_shift, post_shift, fft_shift;
276
#endif
277
609k
   (void) arch;
278
279
609k
   N = l->n;
280
609k
   trig = l->trig;
281
2.00M
   for (i=0;i<shift;i++)
282
1.39M
   {
283
1.39M
      N >>= 1;
284
1.39M
      trig += N;
285
1.39M
   }
286
609k
   N2 = N>>1;
287
609k
   N4 = N>>2;
288
289
#ifdef FIXED_POINT
290
   {
291
      opus_val32 sumval=N2;
292
      opus_val32 maxval=0;
293
91.6M
      for (i=0;i<N2;i++) {
294
91.2M
         maxval = MAX32(maxval, ABS32(in[i*stride]));
295
91.2M
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
296
91.2M
      }
297
333k
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
298
      /* Worst-case where all the energy goes to a single sample. */
299
333k
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
300
333k
      post_shift = IMIN(post_shift, pre_shift);
301
      fft_shift = pre_shift - post_shift;
302
   }
303
#endif
304
   /* Pre-rotate */
305
609k
   {
306
      /* Temp pointers to make it really clear to the compiler what we're doing */
307
609k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
308
609k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
309
609k
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
310
609k
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
311
609k
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
312
87.7M
      for(i=0;i<N4;i++)
313
87.1M
      {
314
87.1M
         int rev;
315
87.1M
         kiss_fft_scalar yr, yi;
316
87.1M
         opus_val32 x1, x2;
317
87.1M
         rev = *bitrev++;
318
87.1M
         x1 = SHL32_ovflw(*xp1, pre_shift);
319
87.1M
         x2 = SHL32_ovflw(*xp2, pre_shift);
320
87.1M
         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
321
87.1M
         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
322
         /* We swap real and imag because we use an FFT instead of an IFFT. */
323
87.1M
         yp[2*rev+1] = yr;
324
87.1M
         yp[2*rev] = yi;
325
         /* Storing the pre-rotation directly in the bitrev order. */
326
87.1M
         xp1+=2*stride;
327
87.1M
         xp2-=2*stride;
328
87.1M
      }
329
609k
   }
330
331
609k
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
332
333
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
334
      it in-place. */
335
609k
   {
336
609k
      kiss_fft_scalar * yp0 = out+(overlap>>1);
337
609k
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
338
609k
      const kiss_twiddle_scalar *t = &trig[0];
339
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
340
         middle pair will be computed twice. */
341
44.1M
      for(i=0;i<(N4+1)>>1;i++)
342
43.5M
      {
343
43.5M
         kiss_fft_scalar re, im, yr, yi;
344
43.5M
         kiss_twiddle_scalar t0, t1;
345
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
346
43.5M
         re = yp0[1];
347
43.5M
         im = yp0[0];
348
43.5M
         t0 = t[i];
349
43.5M
         t1 = t[N4+i];
350
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
351
43.5M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
352
43.5M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
353
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
354
43.5M
         re = yp1[1];
355
43.5M
         im = yp1[0];
356
43.5M
         yp0[0] = yr;
357
43.5M
         yp1[1] = yi;
358
359
43.5M
         t0 = t[(N4-i-1)];
360
43.5M
         t1 = t[(N2-i-1)];
361
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
362
43.5M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
363
43.5M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
364
43.5M
         yp1[0] = yr;
365
43.5M
         yp0[1] = yi;
366
43.5M
         yp0 += 2;
367
43.5M
         yp1 -= 2;
368
43.5M
      }
369
609k
   }
370
371
   /* Mirror on both sides for TDAC */
372
609k
   {
373
609k
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
374
609k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
375
609k
      const celt_coef * OPUS_RESTRICT wp1 = window;
376
609k
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
377
378
41.8M
      for(i = 0; i < overlap/2; i++)
379
41.2M
      {
380
41.2M
         kiss_fft_scalar x1, x2;
381
41.2M
         x1 = *xp1;
382
41.2M
         x2 = *yp1;
383
41.2M
         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
384
41.2M
         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
385
41.2M
         wp1++;
386
41.2M
         wp2--;
387
41.2M
      }
388
609k
   }
389
609k
}
clt_mdct_backward_c
Line
Count
Source
270
333k
{
271
333k
   int i;
272
333k
   int N, N2, N4;
273
333k
   const kiss_twiddle_scalar *trig;
274
333k
#ifdef FIXED_POINT
275
333k
   int pre_shift, post_shift, fft_shift;
276
333k
#endif
277
333k
   (void) arch;
278
279
333k
   N = l->n;
280
333k
   trig = l->trig;
281
1.11M
   for (i=0;i<shift;i++)
282
783k
   {
283
783k
      N >>= 1;
284
783k
      trig += N;
285
783k
   }
286
333k
   N2 = N>>1;
287
333k
   N4 = N>>2;
288
289
333k
#ifdef FIXED_POINT
290
333k
   {
291
333k
      opus_val32 sumval=N2;
292
333k
      opus_val32 maxval=0;
293
91.6M
      for (i=0;i<N2;i++) {
294
91.2M
         maxval = MAX32(maxval, ABS32(in[i*stride]));
295
91.2M
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
296
91.2M
      }
297
333k
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
298
      /* Worst-case where all the energy goes to a single sample. */
299
333k
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
300
333k
      post_shift = IMIN(post_shift, pre_shift);
301
333k
      fft_shift = pre_shift - post_shift;
302
333k
   }
303
333k
#endif
304
   /* Pre-rotate */
305
333k
   {
306
      /* Temp pointers to make it really clear to the compiler what we're doing */
307
333k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
308
333k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
309
333k
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
310
333k
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
311
333k
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
312
45.9M
      for(i=0;i<N4;i++)
313
45.6M
      {
314
45.6M
         int rev;
315
45.6M
         kiss_fft_scalar yr, yi;
316
45.6M
         opus_val32 x1, x2;
317
45.6M
         rev = *bitrev++;
318
45.6M
         x1 = SHL32_ovflw(*xp1, pre_shift);
319
45.6M
         x2 = SHL32_ovflw(*xp2, pre_shift);
320
45.6M
         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
321
45.6M
         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
322
         /* We swap real and imag because we use an FFT instead of an IFFT. */
323
45.6M
         yp[2*rev+1] = yr;
324
45.6M
         yp[2*rev] = yi;
325
         /* Storing the pre-rotation directly in the bitrev order. */
326
45.6M
         xp1+=2*stride;
327
45.6M
         xp2-=2*stride;
328
45.6M
      }
329
333k
   }
330
331
333k
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
332
333
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
334
      it in-place. */
335
333k
   {
336
333k
      kiss_fft_scalar * yp0 = out+(overlap>>1);
337
333k
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
338
333k
      const kiss_twiddle_scalar *t = &trig[0];
339
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
340
         middle pair will be computed twice. */
341
23.1M
      for(i=0;i<(N4+1)>>1;i++)
342
22.8M
      {
343
22.8M
         kiss_fft_scalar re, im, yr, yi;
344
22.8M
         kiss_twiddle_scalar t0, t1;
345
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
346
22.8M
         re = yp0[1];
347
22.8M
         im = yp0[0];
348
22.8M
         t0 = t[i];
349
22.8M
         t1 = t[N4+i];
350
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
351
22.8M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
352
22.8M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
353
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
354
22.8M
         re = yp1[1];
355
22.8M
         im = yp1[0];
356
22.8M
         yp0[0] = yr;
357
22.8M
         yp1[1] = yi;
358
359
22.8M
         t0 = t[(N4-i-1)];
360
22.8M
         t1 = t[(N2-i-1)];
361
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
362
22.8M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
363
22.8M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
364
22.8M
         yp1[0] = yr;
365
22.8M
         yp0[1] = yi;
366
22.8M
         yp0 += 2;
367
22.8M
         yp1 -= 2;
368
22.8M
      }
369
333k
   }
370
371
   /* Mirror on both sides for TDAC */
372
333k
   {
373
333k
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
374
333k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
375
333k
      const celt_coef * OPUS_RESTRICT wp1 = window;
376
333k
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
377
378
23.2M
      for(i = 0; i < overlap/2; i++)
379
22.9M
      {
380
22.9M
         kiss_fft_scalar x1, x2;
381
22.9M
         x1 = *xp1;
382
22.9M
         x2 = *yp1;
383
22.9M
         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
384
22.9M
         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
385
22.9M
         wp1++;
386
22.9M
         wp2--;
387
22.9M
      }
388
333k
   }
389
333k
}
clt_mdct_backward_c
Line
Count
Source
270
276k
{
271
276k
   int i;
272
276k
   int N, N2, N4;
273
276k
   const kiss_twiddle_scalar *trig;
274
#ifdef FIXED_POINT
275
   int pre_shift, post_shift, fft_shift;
276
#endif
277
276k
   (void) arch;
278
279
276k
   N = l->n;
280
276k
   trig = l->trig;
281
886k
   for (i=0;i<shift;i++)
282
609k
   {
283
609k
      N >>= 1;
284
609k
      trig += N;
285
609k
   }
286
276k
   N2 = N>>1;
287
276k
   N4 = N>>2;
288
289
#ifdef FIXED_POINT
290
   {
291
      opus_val32 sumval=N2;
292
      opus_val32 maxval=0;
293
      for (i=0;i<N2;i++) {
294
         maxval = MAX32(maxval, ABS32(in[i*stride]));
295
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
296
      }
297
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
298
      /* Worst-case where all the energy goes to a single sample. */
299
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
300
      post_shift = IMIN(post_shift, pre_shift);
301
      fft_shift = pre_shift - post_shift;
302
   }
303
#endif
304
   /* Pre-rotate */
305
276k
   {
306
      /* Temp pointers to make it really clear to the compiler what we're doing */
307
276k
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
308
276k
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
309
276k
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
310
276k
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
311
276k
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
312
41.7M
      for(i=0;i<N4;i++)
313
41.4M
      {
314
41.4M
         int rev;
315
41.4M
         kiss_fft_scalar yr, yi;
316
41.4M
         opus_val32 x1, x2;
317
41.4M
         rev = *bitrev++;
318
41.4M
         x1 = SHL32_ovflw(*xp1, pre_shift);
319
41.4M
         x2 = SHL32_ovflw(*xp2, pre_shift);
320
41.4M
         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
321
41.4M
         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
322
         /* We swap real and imag because we use an FFT instead of an IFFT. */
323
41.4M
         yp[2*rev+1] = yr;
324
41.4M
         yp[2*rev] = yi;
325
         /* Storing the pre-rotation directly in the bitrev order. */
326
41.4M
         xp1+=2*stride;
327
41.4M
         xp2-=2*stride;
328
41.4M
      }
329
276k
   }
330
331
276k
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
332
333
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
334
      it in-place. */
335
276k
   {
336
276k
      kiss_fft_scalar * yp0 = out+(overlap>>1);
337
276k
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
338
276k
      const kiss_twiddle_scalar *t = &trig[0];
339
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
340
         middle pair will be computed twice. */
341
21.0M
      for(i=0;i<(N4+1)>>1;i++)
342
20.7M
      {
343
20.7M
         kiss_fft_scalar re, im, yr, yi;
344
20.7M
         kiss_twiddle_scalar t0, t1;
345
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
346
20.7M
         re = yp0[1];
347
20.7M
         im = yp0[0];
348
20.7M
         t0 = t[i];
349
20.7M
         t1 = t[N4+i];
350
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
351
20.7M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
352
20.7M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
353
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
354
20.7M
         re = yp1[1];
355
20.7M
         im = yp1[0];
356
20.7M
         yp0[0] = yr;
357
20.7M
         yp1[1] = yi;
358
359
20.7M
         t0 = t[(N4-i-1)];
360
20.7M
         t1 = t[(N2-i-1)];
361
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
362
20.7M
         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
363
20.7M
         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
364
20.7M
         yp1[0] = yr;
365
20.7M
         yp0[1] = yi;
366
20.7M
         yp0 += 2;
367
20.7M
         yp1 -= 2;
368
20.7M
      }
369
276k
   }
370
371
   /* Mirror on both sides for TDAC */
372
276k
   {
373
276k
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
374
276k
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
375
276k
      const celt_coef * OPUS_RESTRICT wp1 = window;
376
276k
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
377
378
18.5M
      for(i = 0; i < overlap/2; i++)
379
18.2M
      {
380
18.2M
         kiss_fft_scalar x1, x2;
381
18.2M
         x1 = *xp1;
382
18.2M
         x2 = *yp1;
383
18.2M
         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
384
18.2M
         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
385
18.2M
         wp1++;
386
18.2M
         wp2--;
387
18.2M
      }
388
276k
   }
389
276k
}
390
#endif /* OVERRIDE_clt_mdct_backward */