Line | Count | Source |
1 | | /* Copyright (c) 2007-2008 CSIRO |
2 | | Copyright (c) 2007-2008 Xiph.Org Foundation |
3 | | Written by Jean-Marc Valin */ |
4 | | /* |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions |
7 | | are met: |
8 | | |
9 | | - Redistributions of source code must retain the above copyright |
10 | | notice, this list of conditions and the following disclaimer. |
11 | | |
12 | | - Redistributions in binary form must reproduce the above copyright |
13 | | notice, this list of conditions and the following disclaimer in the |
14 | | documentation and/or other materials provided with the distribution. |
15 | | |
16 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
17 | | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
18 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
19 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
20 | | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
21 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
22 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
23 | | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
24 | | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
25 | | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
26 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | | */ |
28 | | |
29 | | /* This is a simple MDCT implementation that uses a N/4 complex FFT |
30 | | to do most of the work. It should be relatively straightforward to |
31 | | plug in pretty much and FFT here. |
32 | | |
33 | | This replaces the Vorbis FFT (and uses the exact same API), which |
34 | | was a bit too messy and that was ending up duplicating code |
35 | | (might as well use the same FFT everywhere). |
36 | | |
37 | | The algorithm is similar to (and inspired from) Fabrice Bellard's |
38 | | MDCT implementation in FFMPEG, but has differences in signs, ordering |
39 | | and scaling in many places. |
40 | | */ |
41 | | |
42 | | #ifndef SKIP_CONFIG_H |
43 | | #ifdef HAVE_CONFIG_H |
44 | | #include "config.h" |
45 | | #endif |
46 | | #endif |
47 | | |
48 | | #include "mdct.h" |
49 | | #include "kiss_fft.h" |
50 | | #include "_kiss_fft_guts.h" |
51 | | #include <math.h> |
52 | | #include "os_support.h" |
53 | | #include "mathops.h" |
54 | | #include "stack_alloc.h" |
55 | | |
56 | | #if defined(MIPSr1_ASM) |
57 | | #include "mips/mdct_mipsr1.h" |
58 | | #endif |
59 | | |
60 | | #ifndef M_PI |
61 | | #define M_PI 3.141592653 |
62 | | #endif |
63 | | |
64 | | #ifdef CUSTOM_MODES |
65 | | |
66 | | int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch) |
67 | | { |
68 | | int i; |
69 | | kiss_twiddle_scalar *trig; |
70 | | int shift; |
71 | | int N2=N>>1; |
72 | | l->n = N; |
73 | | l->maxshift = maxshift; |
74 | | for (i=0;i<=maxshift;i++) |
75 | | { |
76 | | if (i==0) |
77 | | l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch); |
78 | | else |
79 | | l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch); |
80 | | #ifndef ENABLE_TI_DSPLIB55 |
81 | | if (l->kfft[i]==NULL) |
82 | | return 0; |
83 | | #endif |
84 | | } |
85 | | l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar)); |
86 | | if (l->trig==NULL) |
87 | | return 0; |
88 | | for (shift=0;shift<=maxshift;shift++) |
89 | | { |
90 | | /* We have enough points that sine isn't necessary */ |
91 | | #if defined(FIXED_POINT) |
92 | | #ifndef ENABLE_QEXT |
93 | | for (i=0;i<N2;i++) |
94 | | trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N)); |
95 | | #else |
96 | | for (i=0;i<N2;i++) |
97 | | trig[i] = (kiss_twiddle_scalar)MAX32(-2147483647,MIN32(2147483647,floor(.5+2147483648*cos(2*M_PI*(i+.125)/N)))); |
98 | | #endif |
99 | | #else |
100 | | for (i=0;i<N2;i++) |
101 | | trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N); |
102 | | #endif |
103 | | trig += N2; |
104 | | N2 >>= 1; |
105 | | N >>= 1; |
106 | | } |
107 | | return 1; |
108 | | } |
109 | | |
110 | | void clt_mdct_clear(mdct_lookup *l, int arch) |
111 | | { |
112 | | int i; |
113 | | for (i=0;i<=l->maxshift;i++) |
114 | | opus_fft_free(l->kfft[i], arch); |
115 | | opus_free((kiss_twiddle_scalar*)l->trig); |
116 | | } |
117 | | |
118 | | #endif /* CUSTOM_MODES */ |
119 | | |
120 | | /* Forward MDCT trashes the input array */ |
121 | | #ifndef OVERRIDE_clt_mdct_forward |
122 | | void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, |
123 | | const celt_coef *window, int overlap, int shift, int stride, int arch) |
124 | 3.05M | { |
125 | 3.05M | int i; |
126 | 3.05M | int N, N2, N4; |
127 | 3.05M | VARDECL(kiss_fft_scalar, f); |
128 | 3.05M | VARDECL(kiss_fft_cpx, f2); |
129 | 3.05M | const kiss_fft_state *st = l->kfft[shift]; |
130 | 3.05M | const kiss_twiddle_scalar *trig; |
131 | 3.05M | celt_coef scale; |
132 | | #ifdef FIXED_POINT |
133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than |
134 | | MULT16_32_Q15() on ARM. */ |
135 | | int scale_shift = st->scale_shift-1; |
136 | | int headroom; |
137 | | #endif |
138 | 3.05M | SAVE_STACK; |
139 | 3.05M | (void)arch; |
140 | 3.05M | scale = st->scale; |
141 | | |
142 | 3.05M | N = l->n; |
143 | 3.05M | trig = l->trig; |
144 | 10.7M | for (i=0;i<shift;i++) |
145 | 7.72M | { |
146 | 7.72M | N >>= 1; |
147 | 7.72M | trig += N; |
148 | 7.72M | } |
149 | 3.05M | N2 = N>>1; |
150 | 3.05M | N4 = N>>2; |
151 | | |
152 | 3.05M | ALLOC(f, N2, kiss_fft_scalar); |
153 | 3.05M | ALLOC(f2, N4, kiss_fft_cpx); |
154 | | |
155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ |
156 | | /* Window, shuffle, fold */ |
157 | 3.05M | { |
158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
159 | 3.05M | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); |
160 | 3.05M | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); |
161 | 3.05M | kiss_fft_scalar * OPUS_RESTRICT yp = f; |
162 | 3.05M | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); |
163 | 3.05M | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; |
164 | 95.9M | for(i=0;i<((overlap+3)>>2);i++) |
165 | 92.9M | { |
166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ |
167 | 92.9M | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); |
168 | 92.9M | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); |
169 | 92.9M | xp1+=2; |
170 | 92.9M | xp2-=2; |
171 | 92.9M | wp1+=2; |
172 | 92.9M | wp2-=2; |
173 | 92.9M | } |
174 | 3.05M | wp1 = window; |
175 | 3.05M | wp2 = window+overlap-1; |
176 | 194M | for(;i<N4-((overlap+3)>>2);i++) |
177 | 191M | { |
178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ |
179 | 191M | *yp++ = *xp2; |
180 | 191M | *yp++ = *xp1; |
181 | 191M | xp1+=2; |
182 | 191M | xp2-=2; |
183 | 191M | } |
184 | 95.9M | for(;i<N4;i++) |
185 | 92.9M | { |
186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ |
187 | 92.9M | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); |
188 | 92.9M | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); |
189 | 92.9M | xp1+=2; |
190 | 92.9M | xp2-=2; |
191 | 92.9M | wp1+=2; |
192 | 92.9M | wp2-=2; |
193 | 92.9M | } |
194 | 3.05M | } |
195 | | /* Pre-rotation */ |
196 | 3.05M | { |
197 | 3.05M | kiss_fft_scalar * OPUS_RESTRICT yp = f; |
198 | 3.05M | const kiss_twiddle_scalar *t = &trig[0]; |
199 | | #ifdef FIXED_POINT |
200 | | opus_val32 maxval=1; |
201 | | #endif |
202 | 380M | for(i=0;i<N4;i++) |
203 | 377M | { |
204 | 377M | kiss_fft_cpx yc; |
205 | 377M | kiss_twiddle_scalar t0, t1; |
206 | 377M | kiss_fft_scalar re, im, yr, yi; |
207 | 377M | t0 = t[i]; |
208 | 377M | t1 = t[N4+i]; |
209 | 377M | re = *yp++; |
210 | 377M | im = *yp++; |
211 | 377M | yr = S_MUL(re,t0) - S_MUL(im,t1); |
212 | 377M | yi = S_MUL(im,t0) + S_MUL(re,t1); |
213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. |
214 | | For floating-point it doesn't matter. */ |
215 | | #ifdef ENABLE_QEXT |
216 | | yc.r = yr; |
217 | | yc.i = yi; |
218 | | #else |
219 | 188M | yc.r = S_MUL2(yr, scale); |
220 | 188M | yc.i = S_MUL2(yi, scale); |
221 | | #endif |
222 | | #ifdef FIXED_POINT |
223 | 202M | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); |
224 | | #endif |
225 | 377M | f2[st->bitrev[i]] = yc; |
226 | 377M | } |
227 | | #ifdef FIXED_POINT |
228 | 1.62M | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); |
229 | | #endif |
230 | 3.05M | } |
231 | | |
232 | | /* N/4 complex FFT, does not downscale anymore */ |
233 | 3.05M | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); |
234 | | |
235 | | /* Post-rotate */ |
236 | 3.05M | { |
237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
238 | 3.05M | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; |
239 | 3.05M | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
240 | 3.05M | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); |
241 | 3.05M | const kiss_twiddle_scalar *t = &trig[0]; |
242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
243 | 380M | for(i=0;i<N4;i++) |
244 | 377M | { |
245 | 377M | kiss_fft_scalar yr, yi; |
246 | 377M | kiss_fft_scalar t0, t1; |
247 | | #ifdef ENABLE_QEXT |
248 | 188M | t0 = S_MUL2(t[i], scale); |
249 | 188M | t1 = S_MUL2(t[N4+i], scale); |
250 | | #else |
251 | | t0 = t[i]; |
252 | | t1 = t[N4+i]; |
253 | | #endif |
254 | 377M | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); |
255 | 377M | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); |
256 | 377M | *yp1 = yr; |
257 | 377M | *yp2 = yi; |
258 | 377M | fp++; |
259 | 377M | yp1 += 2*stride; |
260 | 377M | yp2 -= 2*stride; |
261 | 377M | } |
262 | 3.05M | } |
263 | 3.05M | RESTORE_STACK; |
264 | 3.05M | } Line | Count | Source | 124 | 812k | { | 125 | 812k | int i; | 126 | 812k | int N, N2, N4; | 127 | 812k | VARDECL(kiss_fft_scalar, f); | 128 | 812k | VARDECL(kiss_fft_cpx, f2); | 129 | 812k | const kiss_fft_state *st = l->kfft[shift]; | 130 | 812k | const kiss_twiddle_scalar *trig; | 131 | 812k | celt_coef scale; | 132 | 812k | #ifdef FIXED_POINT | 133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than | 134 | | MULT16_32_Q15() on ARM. */ | 135 | 812k | int scale_shift = st->scale_shift-1; | 136 | 812k | int headroom; | 137 | 812k | #endif | 138 | 812k | SAVE_STACK; | 139 | 812k | (void)arch; | 140 | 812k | scale = st->scale; | 141 | | | 142 | 812k | N = l->n; | 143 | 812k | trig = l->trig; | 144 | 2.86M | for (i=0;i<shift;i++) | 145 | 2.05M | { | 146 | 2.05M | N >>= 1; | 147 | 2.05M | trig += N; | 148 | 2.05M | } | 149 | 812k | N2 = N>>1; | 150 | 812k | N4 = N>>2; | 151 | | | 152 | 812k | ALLOC(f, N2, kiss_fft_scalar); | 153 | 812k | ALLOC(f2, N4, kiss_fft_cpx); | 154 | | | 155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | 156 | | /* Window, shuffle, fold */ | 157 | 812k | { | 158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 159 | 812k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); | 160 | 812k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); | 161 | 812k | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 162 | 812k | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); | 163 | 812k | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | 164 | 25.5M | for(i=0;i<((overlap+3)>>2);i++) | 165 | 24.7M | { | 166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | 167 | 24.7M | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); | 168 | 24.7M | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); | 169 | 24.7M | xp1+=2; | 170 | 24.7M | xp2-=2; | 171 | 24.7M | wp1+=2; | 172 | 24.7M | wp2-=2; | 173 | 24.7M | } | 174 | 812k | wp1 = window; | 175 | 812k | wp2 = window+overlap-1; | 176 | 52.5M | for(;i<N4-((overlap+3)>>2);i++) | 177 | 51.6M | { | 178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 179 | 51.6M | *yp++ = *xp2; | 180 | 51.6M | *yp++ = *xp1; | 181 | 51.6M | xp1+=2; | 182 | 51.6M | xp2-=2; | 183 | 51.6M | } | 184 | 25.5M | for(;i<N4;i++) | 185 | 24.7M | { | 186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 187 | 24.7M | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); | 188 | 24.7M | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); | 189 | 24.7M | xp1+=2; | 190 | 24.7M | xp2-=2; | 191 | 24.7M | wp1+=2; | 192 | 24.7M | wp2-=2; | 193 | 24.7M | } | 194 | 812k | } | 195 | | /* Pre-rotation */ | 196 | 812k | { | 197 | 812k | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 198 | 812k | const kiss_twiddle_scalar *t = &trig[0]; | 199 | 812k | #ifdef FIXED_POINT | 200 | 812k | opus_val32 maxval=1; | 201 | 812k | #endif | 202 | 102M | for(i=0;i<N4;i++) | 203 | 101M | { | 204 | 101M | kiss_fft_cpx yc; | 205 | 101M | kiss_twiddle_scalar t0, t1; | 206 | 101M | kiss_fft_scalar re, im, yr, yi; | 207 | 101M | t0 = t[i]; | 208 | 101M | t1 = t[N4+i]; | 209 | 101M | re = *yp++; | 210 | 101M | im = *yp++; | 211 | 101M | yr = S_MUL(re,t0) - S_MUL(im,t1); | 212 | 101M | yi = S_MUL(im,t0) + S_MUL(re,t1); | 213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. | 214 | | For floating-point it doesn't matter. */ | 215 | | #ifdef ENABLE_QEXT | 216 | | yc.r = yr; | 217 | | yc.i = yi; | 218 | | #else | 219 | 101M | yc.r = S_MUL2(yr, scale); | 220 | 101M | yc.i = S_MUL2(yi, scale); | 221 | 101M | #endif | 222 | 101M | #ifdef FIXED_POINT | 223 | 101M | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); | 224 | 101M | #endif | 225 | 101M | f2[st->bitrev[i]] = yc; | 226 | 101M | } | 227 | 812k | #ifdef FIXED_POINT | 228 | 812k | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); | 229 | 812k | #endif | 230 | 812k | } | 231 | | | 232 | | /* N/4 complex FFT, does not downscale anymore */ | 233 | 812k | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); | 234 | | | 235 | | /* Post-rotate */ | 236 | 812k | { | 237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 238 | 812k | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; | 239 | 812k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 240 | 812k | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 241 | 812k | const kiss_twiddle_scalar *t = &trig[0]; | 242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 243 | 102M | for(i=0;i<N4;i++) | 244 | 101M | { | 245 | 101M | kiss_fft_scalar yr, yi; | 246 | 101M | kiss_fft_scalar t0, t1; | 247 | | #ifdef ENABLE_QEXT | 248 | | t0 = S_MUL2(t[i], scale); | 249 | | t1 = S_MUL2(t[N4+i], scale); | 250 | | #else | 251 | 101M | t0 = t[i]; | 252 | 101M | t1 = t[N4+i]; | 253 | 101M | #endif | 254 | 101M | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); | 255 | 101M | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); | 256 | 101M | *yp1 = yr; | 257 | 101M | *yp2 = yi; | 258 | 101M | fp++; | 259 | 101M | yp1 += 2*stride; | 260 | 101M | yp2 -= 2*stride; | 261 | 101M | } | 262 | 812k | } | 263 | 812k | RESTORE_STACK; | 264 | 812k | } |
Line | Count | Source | 124 | 812k | { | 125 | 812k | int i; | 126 | 812k | int N, N2, N4; | 127 | 812k | VARDECL(kiss_fft_scalar, f); | 128 | 812k | VARDECL(kiss_fft_cpx, f2); | 129 | 812k | const kiss_fft_state *st = l->kfft[shift]; | 130 | 812k | const kiss_twiddle_scalar *trig; | 131 | 812k | celt_coef scale; | 132 | 812k | #ifdef FIXED_POINT | 133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than | 134 | | MULT16_32_Q15() on ARM. */ | 135 | 812k | int scale_shift = st->scale_shift-1; | 136 | 812k | int headroom; | 137 | 812k | #endif | 138 | 812k | SAVE_STACK; | 139 | 812k | (void)arch; | 140 | 812k | scale = st->scale; | 141 | | | 142 | 812k | N = l->n; | 143 | 812k | trig = l->trig; | 144 | 2.86M | for (i=0;i<shift;i++) | 145 | 2.05M | { | 146 | 2.05M | N >>= 1; | 147 | 2.05M | trig += N; | 148 | 2.05M | } | 149 | 812k | N2 = N>>1; | 150 | 812k | N4 = N>>2; | 151 | | | 152 | 812k | ALLOC(f, N2, kiss_fft_scalar); | 153 | 812k | ALLOC(f2, N4, kiss_fft_cpx); | 154 | | | 155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | 156 | | /* Window, shuffle, fold */ | 157 | 812k | { | 158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 159 | 812k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); | 160 | 812k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); | 161 | 812k | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 162 | 812k | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); | 163 | 812k | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | 164 | 25.5M | for(i=0;i<((overlap+3)>>2);i++) | 165 | 24.7M | { | 166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | 167 | 24.7M | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); | 168 | 24.7M | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); | 169 | 24.7M | xp1+=2; | 170 | 24.7M | xp2-=2; | 171 | 24.7M | wp1+=2; | 172 | 24.7M | wp2-=2; | 173 | 24.7M | } | 174 | 812k | wp1 = window; | 175 | 812k | wp2 = window+overlap-1; | 176 | 52.5M | for(;i<N4-((overlap+3)>>2);i++) | 177 | 51.6M | { | 178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 179 | 51.6M | *yp++ = *xp2; | 180 | 51.6M | *yp++ = *xp1; | 181 | 51.6M | xp1+=2; | 182 | 51.6M | xp2-=2; | 183 | 51.6M | } | 184 | 25.5M | for(;i<N4;i++) | 185 | 24.7M | { | 186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 187 | 24.7M | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); | 188 | 24.7M | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); | 189 | 24.7M | xp1+=2; | 190 | 24.7M | xp2-=2; | 191 | 24.7M | wp1+=2; | 192 | 24.7M | wp2-=2; | 193 | 24.7M | } | 194 | 812k | } | 195 | | /* Pre-rotation */ | 196 | 812k | { | 197 | 812k | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 198 | 812k | const kiss_twiddle_scalar *t = &trig[0]; | 199 | 812k | #ifdef FIXED_POINT | 200 | 812k | opus_val32 maxval=1; | 201 | 812k | #endif | 202 | 102M | for(i=0;i<N4;i++) | 203 | 101M | { | 204 | 101M | kiss_fft_cpx yc; | 205 | 101M | kiss_twiddle_scalar t0, t1; | 206 | 101M | kiss_fft_scalar re, im, yr, yi; | 207 | 101M | t0 = t[i]; | 208 | 101M | t1 = t[N4+i]; | 209 | 101M | re = *yp++; | 210 | 101M | im = *yp++; | 211 | 101M | yr = S_MUL(re,t0) - S_MUL(im,t1); | 212 | 101M | yi = S_MUL(im,t0) + S_MUL(re,t1); | 213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. | 214 | | For floating-point it doesn't matter. */ | 215 | 101M | #ifdef ENABLE_QEXT | 216 | 101M | yc.r = yr; | 217 | 101M | yc.i = yi; | 218 | | #else | 219 | | yc.r = S_MUL2(yr, scale); | 220 | | yc.i = S_MUL2(yi, scale); | 221 | | #endif | 222 | 101M | #ifdef FIXED_POINT | 223 | 101M | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); | 224 | 101M | #endif | 225 | 101M | f2[st->bitrev[i]] = yc; | 226 | 101M | } | 227 | 812k | #ifdef FIXED_POINT | 228 | 812k | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); | 229 | 812k | #endif | 230 | 812k | } | 231 | | | 232 | | /* N/4 complex FFT, does not downscale anymore */ | 233 | 812k | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); | 234 | | | 235 | | /* Post-rotate */ | 236 | 812k | { | 237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 238 | 812k | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; | 239 | 812k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 240 | 812k | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 241 | 812k | const kiss_twiddle_scalar *t = &trig[0]; | 242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 243 | 102M | for(i=0;i<N4;i++) | 244 | 101M | { | 245 | 101M | kiss_fft_scalar yr, yi; | 246 | 101M | kiss_fft_scalar t0, t1; | 247 | 101M | #ifdef ENABLE_QEXT | 248 | 101M | t0 = S_MUL2(t[i], scale); | 249 | 101M | t1 = S_MUL2(t[N4+i], scale); | 250 | | #else | 251 | | t0 = t[i]; | 252 | | t1 = t[N4+i]; | 253 | | #endif | 254 | 101M | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); | 255 | 101M | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); | 256 | 101M | *yp1 = yr; | 257 | 101M | *yp2 = yi; | 258 | 101M | fp++; | 259 | 101M | yp1 += 2*stride; | 260 | 101M | yp2 -= 2*stride; | 261 | 101M | } | 262 | 812k | } | 263 | 812k | RESTORE_STACK; | 264 | 812k | } |
Line | Count | Source | 124 | 713k | { | 125 | 713k | int i; | 126 | 713k | int N, N2, N4; | 127 | 713k | VARDECL(kiss_fft_scalar, f); | 128 | 713k | VARDECL(kiss_fft_cpx, f2); | 129 | 713k | const kiss_fft_state *st = l->kfft[shift]; | 130 | 713k | const kiss_twiddle_scalar *trig; | 131 | 713k | celt_coef scale; | 132 | | #ifdef FIXED_POINT | 133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than | 134 | | MULT16_32_Q15() on ARM. */ | 135 | | int scale_shift = st->scale_shift-1; | 136 | | int headroom; | 137 | | #endif | 138 | 713k | SAVE_STACK; | 139 | 713k | (void)arch; | 140 | 713k | scale = st->scale; | 141 | | | 142 | 713k | N = l->n; | 143 | 713k | trig = l->trig; | 144 | 2.52M | for (i=0;i<shift;i++) | 145 | 1.81M | { | 146 | 1.81M | N >>= 1; | 147 | 1.81M | trig += N; | 148 | 1.81M | } | 149 | 713k | N2 = N>>1; | 150 | 713k | N4 = N>>2; | 151 | | | 152 | 713k | ALLOC(f, N2, kiss_fft_scalar); | 153 | 713k | ALLOC(f2, N4, kiss_fft_cpx); | 154 | | | 155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | 156 | | /* Window, shuffle, fold */ | 157 | 713k | { | 158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 159 | 713k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); | 160 | 713k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); | 161 | 713k | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 162 | 713k | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); | 163 | 713k | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | 164 | 22.3M | for(i=0;i<((overlap+3)>>2);i++) | 165 | 21.6M | { | 166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | 167 | 21.6M | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); | 168 | 21.6M | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); | 169 | 21.6M | xp1+=2; | 170 | 21.6M | xp2-=2; | 171 | 21.6M | wp1+=2; | 172 | 21.6M | wp2-=2; | 173 | 21.6M | } | 174 | 713k | wp1 = window; | 175 | 713k | wp2 = window+overlap-1; | 176 | 44.8M | for(;i<N4-((overlap+3)>>2);i++) | 177 | 44.1M | { | 178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 179 | 44.1M | *yp++ = *xp2; | 180 | 44.1M | *yp++ = *xp1; | 181 | 44.1M | xp1+=2; | 182 | 44.1M | xp2-=2; | 183 | 44.1M | } | 184 | 22.3M | for(;i<N4;i++) | 185 | 21.6M | { | 186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 187 | 21.6M | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); | 188 | 21.6M | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); | 189 | 21.6M | xp1+=2; | 190 | 21.6M | xp2-=2; | 191 | 21.6M | wp1+=2; | 192 | 21.6M | wp2-=2; | 193 | 21.6M | } | 194 | 713k | } | 195 | | /* Pre-rotation */ | 196 | 713k | { | 197 | 713k | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 198 | 713k | const kiss_twiddle_scalar *t = &trig[0]; | 199 | | #ifdef FIXED_POINT | 200 | | opus_val32 maxval=1; | 201 | | #endif | 202 | 88.1M | for(i=0;i<N4;i++) | 203 | 87.4M | { | 204 | 87.4M | kiss_fft_cpx yc; | 205 | 87.4M | kiss_twiddle_scalar t0, t1; | 206 | 87.4M | kiss_fft_scalar re, im, yr, yi; | 207 | 87.4M | t0 = t[i]; | 208 | 87.4M | t1 = t[N4+i]; | 209 | 87.4M | re = *yp++; | 210 | 87.4M | im = *yp++; | 211 | 87.4M | yr = S_MUL(re,t0) - S_MUL(im,t1); | 212 | 87.4M | yi = S_MUL(im,t0) + S_MUL(re,t1); | 213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. | 214 | | For floating-point it doesn't matter. */ | 215 | 87.4M | #ifdef ENABLE_QEXT | 216 | 87.4M | yc.r = yr; | 217 | 87.4M | yc.i = yi; | 218 | | #else | 219 | | yc.r = S_MUL2(yr, scale); | 220 | | yc.i = S_MUL2(yi, scale); | 221 | | #endif | 222 | | #ifdef FIXED_POINT | 223 | | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); | 224 | | #endif | 225 | 87.4M | f2[st->bitrev[i]] = yc; | 226 | 87.4M | } | 227 | | #ifdef FIXED_POINT | 228 | | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); | 229 | | #endif | 230 | 713k | } | 231 | | | 232 | | /* N/4 complex FFT, does not downscale anymore */ | 233 | 713k | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); | 234 | | | 235 | | /* Post-rotate */ | 236 | 713k | { | 237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 238 | 713k | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; | 239 | 713k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 240 | 713k | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 241 | 713k | const kiss_twiddle_scalar *t = &trig[0]; | 242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 243 | 88.1M | for(i=0;i<N4;i++) | 244 | 87.4M | { | 245 | 87.4M | kiss_fft_scalar yr, yi; | 246 | 87.4M | kiss_fft_scalar t0, t1; | 247 | 87.4M | #ifdef ENABLE_QEXT | 248 | 87.4M | t0 = S_MUL2(t[i], scale); | 249 | 87.4M | t1 = S_MUL2(t[N4+i], scale); | 250 | | #else | 251 | | t0 = t[i]; | 252 | | t1 = t[N4+i]; | 253 | | #endif | 254 | 87.4M | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); | 255 | 87.4M | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); | 256 | 87.4M | *yp1 = yr; | 257 | 87.4M | *yp2 = yi; | 258 | 87.4M | fp++; | 259 | 87.4M | yp1 += 2*stride; | 260 | 87.4M | yp2 -= 2*stride; | 261 | 87.4M | } | 262 | 713k | } | 263 | 713k | RESTORE_STACK; | 264 | 713k | } |
Line | Count | Source | 124 | 713k | { | 125 | 713k | int i; | 126 | 713k | int N, N2, N4; | 127 | 713k | VARDECL(kiss_fft_scalar, f); | 128 | 713k | VARDECL(kiss_fft_cpx, f2); | 129 | 713k | const kiss_fft_state *st = l->kfft[shift]; | 130 | 713k | const kiss_twiddle_scalar *trig; | 131 | 713k | celt_coef scale; | 132 | | #ifdef FIXED_POINT | 133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than | 134 | | MULT16_32_Q15() on ARM. */ | 135 | | int scale_shift = st->scale_shift-1; | 136 | | int headroom; | 137 | | #endif | 138 | 713k | SAVE_STACK; | 139 | 713k | (void)arch; | 140 | 713k | scale = st->scale; | 141 | | | 142 | 713k | N = l->n; | 143 | 713k | trig = l->trig; | 144 | 2.52M | for (i=0;i<shift;i++) | 145 | 1.81M | { | 146 | 1.81M | N >>= 1; | 147 | 1.81M | trig += N; | 148 | 1.81M | } | 149 | 713k | N2 = N>>1; | 150 | 713k | N4 = N>>2; | 151 | | | 152 | 713k | ALLOC(f, N2, kiss_fft_scalar); | 153 | 713k | ALLOC(f2, N4, kiss_fft_cpx); | 154 | | | 155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | 156 | | /* Window, shuffle, fold */ | 157 | 713k | { | 158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 159 | 713k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); | 160 | 713k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); | 161 | 713k | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 162 | 713k | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); | 163 | 713k | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | 164 | 22.3M | for(i=0;i<((overlap+3)>>2);i++) | 165 | 21.6M | { | 166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | 167 | 21.6M | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); | 168 | 21.6M | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); | 169 | 21.6M | xp1+=2; | 170 | 21.6M | xp2-=2; | 171 | 21.6M | wp1+=2; | 172 | 21.6M | wp2-=2; | 173 | 21.6M | } | 174 | 713k | wp1 = window; | 175 | 713k | wp2 = window+overlap-1; | 176 | 44.8M | for(;i<N4-((overlap+3)>>2);i++) | 177 | 44.1M | { | 178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 179 | 44.1M | *yp++ = *xp2; | 180 | 44.1M | *yp++ = *xp1; | 181 | 44.1M | xp1+=2; | 182 | 44.1M | xp2-=2; | 183 | 44.1M | } | 184 | 22.3M | for(;i<N4;i++) | 185 | 21.6M | { | 186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 187 | 21.6M | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); | 188 | 21.6M | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); | 189 | 21.6M | xp1+=2; | 190 | 21.6M | xp2-=2; | 191 | 21.6M | wp1+=2; | 192 | 21.6M | wp2-=2; | 193 | 21.6M | } | 194 | 713k | } | 195 | | /* Pre-rotation */ | 196 | 713k | { | 197 | 713k | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 198 | 713k | const kiss_twiddle_scalar *t = &trig[0]; | 199 | | #ifdef FIXED_POINT | 200 | | opus_val32 maxval=1; | 201 | | #endif | 202 | 88.1M | for(i=0;i<N4;i++) | 203 | 87.4M | { | 204 | 87.4M | kiss_fft_cpx yc; | 205 | 87.4M | kiss_twiddle_scalar t0, t1; | 206 | 87.4M | kiss_fft_scalar re, im, yr, yi; | 207 | 87.4M | t0 = t[i]; | 208 | 87.4M | t1 = t[N4+i]; | 209 | 87.4M | re = *yp++; | 210 | 87.4M | im = *yp++; | 211 | 87.4M | yr = S_MUL(re,t0) - S_MUL(im,t1); | 212 | 87.4M | yi = S_MUL(im,t0) + S_MUL(re,t1); | 213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. | 214 | | For floating-point it doesn't matter. */ | 215 | | #ifdef ENABLE_QEXT | 216 | | yc.r = yr; | 217 | | yc.i = yi; | 218 | | #else | 219 | 87.4M | yc.r = S_MUL2(yr, scale); | 220 | 87.4M | yc.i = S_MUL2(yi, scale); | 221 | 87.4M | #endif | 222 | | #ifdef FIXED_POINT | 223 | | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); | 224 | | #endif | 225 | 87.4M | f2[st->bitrev[i]] = yc; | 226 | 87.4M | } | 227 | | #ifdef FIXED_POINT | 228 | | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); | 229 | | #endif | 230 | 713k | } | 231 | | | 232 | | /* N/4 complex FFT, does not downscale anymore */ | 233 | 713k | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); | 234 | | | 235 | | /* Post-rotate */ | 236 | 713k | { | 237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 238 | 713k | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; | 239 | 713k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 240 | 713k | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 241 | 713k | const kiss_twiddle_scalar *t = &trig[0]; | 242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 243 | 88.1M | for(i=0;i<N4;i++) | 244 | 87.4M | { | 245 | 87.4M | kiss_fft_scalar yr, yi; | 246 | 87.4M | kiss_fft_scalar t0, t1; | 247 | | #ifdef ENABLE_QEXT | 248 | | t0 = S_MUL2(t[i], scale); | 249 | | t1 = S_MUL2(t[N4+i], scale); | 250 | | #else | 251 | 87.4M | t0 = t[i]; | 252 | 87.4M | t1 = t[N4+i]; | 253 | 87.4M | #endif | 254 | 87.4M | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); | 255 | 87.4M | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); | 256 | 87.4M | *yp1 = yr; | 257 | 87.4M | *yp2 = yi; | 258 | 87.4M | fp++; | 259 | 87.4M | yp1 += 2*stride; | 260 | 87.4M | yp2 -= 2*stride; | 261 | 87.4M | } | 262 | 713k | } | 263 | 713k | RESTORE_STACK; | 264 | 713k | } |
|
265 | | #endif /* OVERRIDE_clt_mdct_forward */ |
266 | | |
267 | | #ifndef OVERRIDE_clt_mdct_backward |
268 | | void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, |
269 | | const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch) |
270 | 609k | { |
271 | 609k | int i; |
272 | 609k | int N, N2, N4; |
273 | 609k | const kiss_twiddle_scalar *trig; |
274 | | #ifdef FIXED_POINT |
275 | | int pre_shift, post_shift, fft_shift; |
276 | | #endif |
277 | 609k | (void) arch; |
278 | | |
279 | 609k | N = l->n; |
280 | 609k | trig = l->trig; |
281 | 2.00M | for (i=0;i<shift;i++) |
282 | 1.39M | { |
283 | 1.39M | N >>= 1; |
284 | 1.39M | trig += N; |
285 | 1.39M | } |
286 | 609k | N2 = N>>1; |
287 | 609k | N4 = N>>2; |
288 | | |
289 | | #ifdef FIXED_POINT |
290 | | { |
291 | | opus_val32 sumval=N2; |
292 | | opus_val32 maxval=0; |
293 | 91.6M | for (i=0;i<N2;i++) { |
294 | 91.2M | maxval = MAX32(maxval, ABS32(in[i*stride])); |
295 | 91.2M | sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11))); |
296 | 91.2M | } |
297 | 333k | pre_shift = IMAX(0, 29-celt_zlog2(1+maxval)); |
298 | | /* Worst-case where all the energy goes to a single sample. */ |
299 | 333k | post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval))); |
300 | 333k | post_shift = IMIN(post_shift, pre_shift); |
301 | | fft_shift = pre_shift - post_shift; |
302 | | } |
303 | | #endif |
304 | | /* Pre-rotate */ |
305 | 609k | { |
306 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
307 | 609k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; |
308 | 609k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); |
309 | 609k | kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); |
310 | 609k | const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; |
311 | 609k | const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; |
312 | 87.7M | for(i=0;i<N4;i++) |
313 | 87.1M | { |
314 | 87.1M | int rev; |
315 | 87.1M | kiss_fft_scalar yr, yi; |
316 | 87.1M | opus_val32 x1, x2; |
317 | 87.1M | rev = *bitrev++; |
318 | 87.1M | x1 = SHL32_ovflw(*xp1, pre_shift); |
319 | 87.1M | x2 = SHL32_ovflw(*xp2, pre_shift); |
320 | 87.1M | yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i])); |
321 | 87.1M | yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i])); |
322 | | /* We swap real and imag because we use an FFT instead of an IFFT. */ |
323 | 87.1M | yp[2*rev+1] = yr; |
324 | 87.1M | yp[2*rev] = yi; |
325 | | /* Storing the pre-rotation directly in the bitrev order. */ |
326 | 87.1M | xp1+=2*stride; |
327 | 87.1M | xp2-=2*stride; |
328 | 87.1M | } |
329 | 609k | } |
330 | | |
331 | 609k | opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); |
332 | | |
333 | | /* Post-rotate and de-shuffle from both ends of the buffer at once to make |
334 | | it in-place. */ |
335 | 609k | { |
336 | 609k | kiss_fft_scalar * yp0 = out+(overlap>>1); |
337 | 609k | kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; |
338 | 609k | const kiss_twiddle_scalar *t = &trig[0]; |
339 | | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the |
340 | | middle pair will be computed twice. */ |
341 | 44.1M | for(i=0;i<(N4+1)>>1;i++) |
342 | 43.5M | { |
343 | 43.5M | kiss_fft_scalar re, im, yr, yi; |
344 | 43.5M | kiss_twiddle_scalar t0, t1; |
345 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ |
346 | 43.5M | re = yp0[1]; |
347 | 43.5M | im = yp0[0]; |
348 | 43.5M | t0 = t[i]; |
349 | 43.5M | t1 = t[N4+i]; |
350 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
351 | 43.5M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); |
352 | 43.5M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); |
353 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ |
354 | 43.5M | re = yp1[1]; |
355 | 43.5M | im = yp1[0]; |
356 | 43.5M | yp0[0] = yr; |
357 | 43.5M | yp1[1] = yi; |
358 | | |
359 | 43.5M | t0 = t[(N4-i-1)]; |
360 | 43.5M | t1 = t[(N2-i-1)]; |
361 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
362 | 43.5M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); |
363 | 43.5M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); |
364 | 43.5M | yp1[0] = yr; |
365 | 43.5M | yp0[1] = yi; |
366 | 43.5M | yp0 += 2; |
367 | 43.5M | yp1 -= 2; |
368 | 43.5M | } |
369 | 609k | } |
370 | | |
371 | | /* Mirror on both sides for TDAC */ |
372 | 609k | { |
373 | 609k | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; |
374 | 609k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
375 | 609k | const celt_coef * OPUS_RESTRICT wp1 = window; |
376 | 609k | const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; |
377 | | |
378 | 41.8M | for(i = 0; i < overlap/2; i++) |
379 | 41.2M | { |
380 | 41.2M | kiss_fft_scalar x1, x2; |
381 | 41.2M | x1 = *xp1; |
382 | 41.2M | x2 = *yp1; |
383 | 41.2M | *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1)); |
384 | 41.2M | *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2)); |
385 | 41.2M | wp1++; |
386 | 41.2M | wp2--; |
387 | 41.2M | } |
388 | 609k | } |
389 | 609k | } Line | Count | Source | 270 | 333k | { | 271 | 333k | int i; | 272 | 333k | int N, N2, N4; | 273 | 333k | const kiss_twiddle_scalar *trig; | 274 | 333k | #ifdef FIXED_POINT | 275 | 333k | int pre_shift, post_shift, fft_shift; | 276 | 333k | #endif | 277 | 333k | (void) arch; | 278 | | | 279 | 333k | N = l->n; | 280 | 333k | trig = l->trig; | 281 | 1.11M | for (i=0;i<shift;i++) | 282 | 783k | { | 283 | 783k | N >>= 1; | 284 | 783k | trig += N; | 285 | 783k | } | 286 | 333k | N2 = N>>1; | 287 | 333k | N4 = N>>2; | 288 | | | 289 | 333k | #ifdef FIXED_POINT | 290 | 333k | { | 291 | 333k | opus_val32 sumval=N2; | 292 | 333k | opus_val32 maxval=0; | 293 | 91.6M | for (i=0;i<N2;i++) { | 294 | 91.2M | maxval = MAX32(maxval, ABS32(in[i*stride])); | 295 | 91.2M | sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11))); | 296 | 91.2M | } | 297 | 333k | pre_shift = IMAX(0, 29-celt_zlog2(1+maxval)); | 298 | | /* Worst-case where all the energy goes to a single sample. */ | 299 | 333k | post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval))); | 300 | 333k | post_shift = IMIN(post_shift, pre_shift); | 301 | 333k | fft_shift = pre_shift - post_shift; | 302 | 333k | } | 303 | 333k | #endif | 304 | | /* Pre-rotate */ | 305 | 333k | { | 306 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 307 | 333k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; | 308 | 333k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); | 309 | 333k | kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); | 310 | 333k | const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; | 311 | 333k | const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; | 312 | 45.9M | for(i=0;i<N4;i++) | 313 | 45.6M | { | 314 | 45.6M | int rev; | 315 | 45.6M | kiss_fft_scalar yr, yi; | 316 | 45.6M | opus_val32 x1, x2; | 317 | 45.6M | rev = *bitrev++; | 318 | 45.6M | x1 = SHL32_ovflw(*xp1, pre_shift); | 319 | 45.6M | x2 = SHL32_ovflw(*xp2, pre_shift); | 320 | 45.6M | yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i])); | 321 | 45.6M | yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i])); | 322 | | /* We swap real and imag because we use an FFT instead of an IFFT. */ | 323 | 45.6M | yp[2*rev+1] = yr; | 324 | 45.6M | yp[2*rev] = yi; | 325 | | /* Storing the pre-rotation directly in the bitrev order. */ | 326 | 45.6M | xp1+=2*stride; | 327 | 45.6M | xp2-=2*stride; | 328 | 45.6M | } | 329 | 333k | } | 330 | | | 331 | 333k | opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); | 332 | | | 333 | | /* Post-rotate and de-shuffle from both ends of the buffer at once to make | 334 | | it in-place. */ | 335 | 333k | { | 336 | 333k | kiss_fft_scalar * yp0 = out+(overlap>>1); | 337 | 333k | kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; | 338 | 333k | const kiss_twiddle_scalar *t = &trig[0]; | 339 | | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the | 340 | | middle pair will be computed twice. */ | 341 | 23.1M | for(i=0;i<(N4+1)>>1;i++) | 342 | 22.8M | { | 343 | 22.8M | kiss_fft_scalar re, im, yr, yi; | 344 | 22.8M | kiss_twiddle_scalar t0, t1; | 345 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ | 346 | 22.8M | re = yp0[1]; | 347 | 22.8M | im = yp0[0]; | 348 | 22.8M | t0 = t[i]; | 349 | 22.8M | t1 = t[N4+i]; | 350 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 351 | 22.8M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); | 352 | 22.8M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); | 353 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ | 354 | 22.8M | re = yp1[1]; | 355 | 22.8M | im = yp1[0]; | 356 | 22.8M | yp0[0] = yr; | 357 | 22.8M | yp1[1] = yi; | 358 | | | 359 | 22.8M | t0 = t[(N4-i-1)]; | 360 | 22.8M | t1 = t[(N2-i-1)]; | 361 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 362 | 22.8M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); | 363 | 22.8M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); | 364 | 22.8M | yp1[0] = yr; | 365 | 22.8M | yp0[1] = yi; | 366 | 22.8M | yp0 += 2; | 367 | 22.8M | yp1 -= 2; | 368 | 22.8M | } | 369 | 333k | } | 370 | | | 371 | | /* Mirror on both sides for TDAC */ | 372 | 333k | { | 373 | 333k | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; | 374 | 333k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 375 | 333k | const celt_coef * OPUS_RESTRICT wp1 = window; | 376 | 333k | const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; | 377 | | | 378 | 23.2M | for(i = 0; i < overlap/2; i++) | 379 | 22.9M | { | 380 | 22.9M | kiss_fft_scalar x1, x2; | 381 | 22.9M | x1 = *xp1; | 382 | 22.9M | x2 = *yp1; | 383 | 22.9M | *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1)); | 384 | 22.9M | *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2)); | 385 | 22.9M | wp1++; | 386 | 22.9M | wp2--; | 387 | 22.9M | } | 388 | 333k | } | 389 | 333k | } |
Line | Count | Source | 270 | 276k | { | 271 | 276k | int i; | 272 | 276k | int N, N2, N4; | 273 | 276k | const kiss_twiddle_scalar *trig; | 274 | | #ifdef FIXED_POINT | 275 | | int pre_shift, post_shift, fft_shift; | 276 | | #endif | 277 | 276k | (void) arch; | 278 | | | 279 | 276k | N = l->n; | 280 | 276k | trig = l->trig; | 281 | 886k | for (i=0;i<shift;i++) | 282 | 609k | { | 283 | 609k | N >>= 1; | 284 | 609k | trig += N; | 285 | 609k | } | 286 | 276k | N2 = N>>1; | 287 | 276k | N4 = N>>2; | 288 | | | 289 | | #ifdef FIXED_POINT | 290 | | { | 291 | | opus_val32 sumval=N2; | 292 | | opus_val32 maxval=0; | 293 | | for (i=0;i<N2;i++) { | 294 | | maxval = MAX32(maxval, ABS32(in[i*stride])); | 295 | | sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11))); | 296 | | } | 297 | | pre_shift = IMAX(0, 29-celt_zlog2(1+maxval)); | 298 | | /* Worst-case where all the energy goes to a single sample. */ | 299 | | post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval))); | 300 | | post_shift = IMIN(post_shift, pre_shift); | 301 | | fft_shift = pre_shift - post_shift; | 302 | | } | 303 | | #endif | 304 | | /* Pre-rotate */ | 305 | 276k | { | 306 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 307 | 276k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; | 308 | 276k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); | 309 | 276k | kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); | 310 | 276k | const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; | 311 | 276k | const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; | 312 | 41.7M | for(i=0;i<N4;i++) | 313 | 41.4M | { | 314 | 41.4M | int rev; | 315 | 41.4M | kiss_fft_scalar yr, yi; | 316 | 41.4M | opus_val32 x1, x2; | 317 | 41.4M | rev = *bitrev++; | 318 | 41.4M | x1 = SHL32_ovflw(*xp1, pre_shift); | 319 | 41.4M | x2 = SHL32_ovflw(*xp2, pre_shift); | 320 | 41.4M | yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i])); | 321 | 41.4M | yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i])); | 322 | | /* We swap real and imag because we use an FFT instead of an IFFT. */ | 323 | 41.4M | yp[2*rev+1] = yr; | 324 | 41.4M | yp[2*rev] = yi; | 325 | | /* Storing the pre-rotation directly in the bitrev order. */ | 326 | 41.4M | xp1+=2*stride; | 327 | 41.4M | xp2-=2*stride; | 328 | 41.4M | } | 329 | 276k | } | 330 | | | 331 | 276k | opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); | 332 | | | 333 | | /* Post-rotate and de-shuffle from both ends of the buffer at once to make | 334 | | it in-place. */ | 335 | 276k | { | 336 | 276k | kiss_fft_scalar * yp0 = out+(overlap>>1); | 337 | 276k | kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; | 338 | 276k | const kiss_twiddle_scalar *t = &trig[0]; | 339 | | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the | 340 | | middle pair will be computed twice. */ | 341 | 21.0M | for(i=0;i<(N4+1)>>1;i++) | 342 | 20.7M | { | 343 | 20.7M | kiss_fft_scalar re, im, yr, yi; | 344 | 20.7M | kiss_twiddle_scalar t0, t1; | 345 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ | 346 | 20.7M | re = yp0[1]; | 347 | 20.7M | im = yp0[0]; | 348 | 20.7M | t0 = t[i]; | 349 | 20.7M | t1 = t[N4+i]; | 350 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 351 | 20.7M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); | 352 | 20.7M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); | 353 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ | 354 | 20.7M | re = yp1[1]; | 355 | 20.7M | im = yp1[0]; | 356 | 20.7M | yp0[0] = yr; | 357 | 20.7M | yp1[1] = yi; | 358 | | | 359 | 20.7M | t0 = t[(N4-i-1)]; | 360 | 20.7M | t1 = t[(N2-i-1)]; | 361 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 362 | 20.7M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); | 363 | 20.7M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); | 364 | 20.7M | yp1[0] = yr; | 365 | 20.7M | yp0[1] = yi; | 366 | 20.7M | yp0 += 2; | 367 | 20.7M | yp1 -= 2; | 368 | 20.7M | } | 369 | 276k | } | 370 | | | 371 | | /* Mirror on both sides for TDAC */ | 372 | 276k | { | 373 | 276k | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; | 374 | 276k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 375 | 276k | const celt_coef * OPUS_RESTRICT wp1 = window; | 376 | 276k | const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; | 377 | | | 378 | 18.5M | for(i = 0; i < overlap/2; i++) | 379 | 18.2M | { | 380 | 18.2M | kiss_fft_scalar x1, x2; | 381 | 18.2M | x1 = *xp1; | 382 | 18.2M | x2 = *yp1; | 383 | 18.2M | *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1)); | 384 | 18.2M | *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2)); | 385 | 18.2M | wp1++; | 386 | 18.2M | wp2--; | 387 | 18.2M | } | 388 | 276k | } | 389 | 276k | } |
|
390 | | #endif /* OVERRIDE_clt_mdct_backward */ |