Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright (c) 2007-2008 CSIRO |
2 | | Copyright (c) 2007-2008 Xiph.Org Foundation |
3 | | Written by Jean-Marc Valin */ |
4 | | /* |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions |
7 | | are met: |
8 | | |
9 | | - Redistributions of source code must retain the above copyright |
10 | | notice, this list of conditions and the following disclaimer. |
11 | | |
12 | | - Redistributions in binary form must reproduce the above copyright |
13 | | notice, this list of conditions and the following disclaimer in the |
14 | | documentation and/or other materials provided with the distribution. |
15 | | |
16 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
17 | | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
18 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
19 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
20 | | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
21 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
22 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
23 | | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
24 | | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
25 | | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
26 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | | */ |
28 | | |
29 | | /* This is a simple MDCT implementation that uses a N/4 complex FFT |
30 | | to do most of the work. It should be relatively straightforward to |
31 | | plug in pretty much and FFT here. |
32 | | |
33 | | This replaces the Vorbis FFT (and uses the exact same API), which |
34 | | was a bit too messy and that was ending up duplicating code |
35 | | (might as well use the same FFT everywhere). |
36 | | |
37 | | The algorithm is similar to (and inspired from) Fabrice Bellard's |
38 | | MDCT implementation in FFMPEG, but has differences in signs, ordering |
39 | | and scaling in many places. |
40 | | */ |
41 | | |
42 | | #ifndef SKIP_CONFIG_H |
43 | | #ifdef HAVE_CONFIG_H |
44 | | #include "config.h" |
45 | | #endif |
46 | | #endif |
47 | | |
48 | | #include "mdct.h" |
49 | | #include "kiss_fft.h" |
50 | | #include "_kiss_fft_guts.h" |
51 | | #include <math.h> |
52 | | #include "os_support.h" |
53 | | #include "mathops.h" |
54 | | #include "stack_alloc.h" |
55 | | |
56 | | #if defined(FIXED_POINT) && defined(__mips_dsp) && __mips == 32 |
57 | | #include "mips/mdct_mipsr1.h" |
58 | | #endif |
59 | | |
60 | | #ifndef M_PI |
61 | | #define M_PI 3.141592653 |
62 | | #endif |
63 | | |
64 | | #ifdef CUSTOM_MODES |
65 | | |
66 | | int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch) |
67 | | { |
68 | | int i; |
69 | | kiss_twiddle_scalar *trig; |
70 | | int shift; |
71 | | int N2=N>>1; |
72 | | l->n = N; |
73 | | l->maxshift = maxshift; |
74 | | for (i=0;i<=maxshift;i++) |
75 | | { |
76 | | if (i==0) |
77 | | l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch); |
78 | | else |
79 | | l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch); |
80 | | #ifndef ENABLE_TI_DSPLIB55 |
81 | | if (l->kfft[i]==NULL) |
82 | | return 0; |
83 | | #endif |
84 | | } |
85 | | l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar)); |
86 | | if (l->trig==NULL) |
87 | | return 0; |
88 | | for (shift=0;shift<=maxshift;shift++) |
89 | | { |
90 | | /* We have enough points that sine isn't necessary */ |
91 | | #if defined(FIXED_POINT) |
92 | | #ifndef ENABLE_QEXT |
93 | | for (i=0;i<N2;i++) |
94 | | trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N)); |
95 | | #else |
96 | | for (i=0;i<N2;i++) |
97 | | trig[i] = (kiss_twiddle_scalar)MAX32(-2147483647,MIN32(2147483647,floor(.5+2147483648*cos(2*M_PI*(i+.125)/N)))); |
98 | | #endif |
99 | | #else |
100 | | for (i=0;i<N2;i++) |
101 | | trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N); |
102 | | #endif |
103 | | trig += N2; |
104 | | N2 >>= 1; |
105 | | N >>= 1; |
106 | | } |
107 | | return 1; |
108 | | } |
109 | | |
110 | | void clt_mdct_clear(mdct_lookup *l, int arch) |
111 | | { |
112 | | int i; |
113 | | for (i=0;i<=l->maxshift;i++) |
114 | | opus_fft_free(l->kfft[i], arch); |
115 | | opus_free((kiss_twiddle_scalar*)l->trig); |
116 | | } |
117 | | |
118 | | #endif /* CUSTOM_MODES */ |
119 | | |
120 | | /* Forward MDCT trashes the input array */ |
121 | | #ifndef OVERRIDE_clt_mdct_forward |
122 | | void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, |
123 | | const celt_coef *window, int overlap, int shift, int stride, int arch) |
124 | 0 | { |
125 | 0 | int i; |
126 | 0 | int N, N2, N4; |
127 | 0 | VARDECL(kiss_fft_scalar, f); |
128 | 0 | VARDECL(kiss_fft_cpx, f2); |
129 | 0 | const kiss_fft_state *st = l->kfft[shift]; |
130 | 0 | const kiss_twiddle_scalar *trig; |
131 | 0 | celt_coef scale; |
132 | | #ifdef FIXED_POINT |
133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than |
134 | | MULT16_32_Q15() on ARM. */ |
135 | | int scale_shift = st->scale_shift-1; |
136 | | int headroom; |
137 | | #endif |
138 | 0 | SAVE_STACK; |
139 | 0 | (void)arch; |
140 | 0 | scale = st->scale; |
141 | |
|
142 | 0 | N = l->n; |
143 | 0 | trig = l->trig; |
144 | 0 | for (i=0;i<shift;i++) |
145 | 0 | { |
146 | 0 | N >>= 1; |
147 | 0 | trig += N; |
148 | 0 | } |
149 | 0 | N2 = N>>1; |
150 | 0 | N4 = N>>2; |
151 | |
|
152 | 0 | ALLOC(f, N2, kiss_fft_scalar); |
153 | 0 | ALLOC(f2, N4, kiss_fft_cpx); |
154 | | |
155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ |
156 | | /* Window, shuffle, fold */ |
157 | 0 | { |
158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
159 | 0 | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); |
160 | 0 | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); |
161 | 0 | kiss_fft_scalar * OPUS_RESTRICT yp = f; |
162 | 0 | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); |
163 | 0 | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; |
164 | 0 | for(i=0;i<((overlap+3)>>2);i++) |
165 | 0 | { |
166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ |
167 | 0 | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); |
168 | 0 | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); |
169 | 0 | xp1+=2; |
170 | 0 | xp2-=2; |
171 | 0 | wp1+=2; |
172 | 0 | wp2-=2; |
173 | 0 | } |
174 | 0 | wp1 = window; |
175 | 0 | wp2 = window+overlap-1; |
176 | 0 | for(;i<N4-((overlap+3)>>2);i++) |
177 | 0 | { |
178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ |
179 | 0 | *yp++ = *xp2; |
180 | 0 | *yp++ = *xp1; |
181 | 0 | xp1+=2; |
182 | 0 | xp2-=2; |
183 | 0 | } |
184 | 0 | for(;i<N4;i++) |
185 | 0 | { |
186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ |
187 | 0 | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); |
188 | 0 | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); |
189 | 0 | xp1+=2; |
190 | 0 | xp2-=2; |
191 | 0 | wp1+=2; |
192 | 0 | wp2-=2; |
193 | 0 | } |
194 | 0 | } |
195 | | /* Pre-rotation */ |
196 | 0 | { |
197 | 0 | kiss_fft_scalar * OPUS_RESTRICT yp = f; |
198 | 0 | const kiss_twiddle_scalar *t = &trig[0]; |
199 | | #ifdef FIXED_POINT |
200 | | opus_val32 maxval=1; |
201 | | #endif |
202 | 0 | for(i=0;i<N4;i++) |
203 | 0 | { |
204 | 0 | kiss_fft_cpx yc; |
205 | 0 | kiss_twiddle_scalar t0, t1; |
206 | 0 | kiss_fft_scalar re, im, yr, yi; |
207 | 0 | t0 = t[i]; |
208 | 0 | t1 = t[N4+i]; |
209 | 0 | re = *yp++; |
210 | 0 | im = *yp++; |
211 | 0 | yr = S_MUL(re,t0) - S_MUL(im,t1); |
212 | 0 | yi = S_MUL(im,t0) + S_MUL(re,t1); |
213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. |
214 | | For floating-point it doesn't matter. */ |
215 | | #ifdef ENABLE_QEXT |
216 | | yc.r = yr; |
217 | | yc.i = yi; |
218 | | #else |
219 | 0 | yc.r = S_MUL2(yr, scale); |
220 | 0 | yc.i = S_MUL2(yi, scale); |
221 | 0 | #endif |
222 | | #ifdef FIXED_POINT |
223 | | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); |
224 | | #endif |
225 | 0 | f2[st->bitrev[i]] = yc; |
226 | 0 | } |
227 | | #ifdef FIXED_POINT |
228 | | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); |
229 | | #endif |
230 | 0 | } |
231 | | |
232 | | /* N/4 complex FFT, does not downscale anymore */ |
233 | 0 | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); |
234 | | |
235 | | /* Post-rotate */ |
236 | 0 | { |
237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
238 | 0 | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; |
239 | 0 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
240 | 0 | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); |
241 | 0 | const kiss_twiddle_scalar *t = &trig[0]; |
242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
243 | 0 | for(i=0;i<N4;i++) |
244 | 0 | { |
245 | 0 | kiss_fft_scalar yr, yi; |
246 | 0 | kiss_fft_scalar t0, t1; |
247 | | #ifdef ENABLE_QEXT |
248 | | t0 = S_MUL2(t[i], scale); |
249 | | t1 = S_MUL2(t[N4+i], scale); |
250 | | #else |
251 | 0 | t0 = t[i]; |
252 | 0 | t1 = t[N4+i]; |
253 | 0 | #endif |
254 | 0 | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); |
255 | 0 | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); |
256 | 0 | *yp1 = yr; |
257 | 0 | *yp2 = yi; |
258 | 0 | fp++; |
259 | 0 | yp1 += 2*stride; |
260 | 0 | yp2 -= 2*stride; |
261 | 0 | } |
262 | 0 | } |
263 | 0 | RESTORE_STACK; |
264 | 0 | } |
265 | | #endif /* OVERRIDE_clt_mdct_forward */ |
266 | | |
267 | | #ifndef OVERRIDE_clt_mdct_backward |
268 | | void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, |
269 | | const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch) |
270 | 424k | { |
271 | 424k | int i; |
272 | 424k | int N, N2, N4; |
273 | 424k | const kiss_twiddle_scalar *trig; |
274 | | #ifdef FIXED_POINT |
275 | | int pre_shift, post_shift, fft_shift; |
276 | | #endif |
277 | 424k | (void) arch; |
278 | | |
279 | 424k | N = l->n; |
280 | 424k | trig = l->trig; |
281 | 1.04M | for (i=0;i<shift;i++) |
282 | 616k | { |
283 | 616k | N >>= 1; |
284 | 616k | trig += N; |
285 | 616k | } |
286 | 424k | N2 = N>>1; |
287 | 424k | N4 = N>>2; |
288 | | |
289 | | #ifdef FIXED_POINT |
290 | | { |
291 | | opus_val32 sumval=N2; |
292 | | opus_val32 maxval=0; |
293 | | for (i=0;i<N2;i++) { |
294 | | maxval = MAX32(maxval, ABS32(in[i*stride])); |
295 | | sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11))); |
296 | | } |
297 | | pre_shift = IMAX(0, 29-celt_zlog2(1+maxval)); |
298 | | /* Worst-case where all the energy goes to a single sample. */ |
299 | | post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval))); |
300 | | post_shift = IMIN(post_shift, pre_shift); |
301 | | fft_shift = pre_shift - post_shift; |
302 | | } |
303 | | #endif |
304 | | /* Pre-rotate */ |
305 | 424k | { |
306 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
307 | 424k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; |
308 | 424k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); |
309 | 424k | kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); |
310 | 424k | const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; |
311 | 424k | const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; |
312 | 92.8M | for(i=0;i<N4;i++) |
313 | 92.4M | { |
314 | 92.4M | int rev; |
315 | 92.4M | kiss_fft_scalar yr, yi; |
316 | 92.4M | opus_val32 x1, x2; |
317 | 92.4M | rev = *bitrev++; |
318 | 92.4M | x1 = SHL32_ovflw(*xp1, pre_shift); |
319 | 92.4M | x2 = SHL32_ovflw(*xp2, pre_shift); |
320 | 92.4M | yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i])); |
321 | 92.4M | yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i])); |
322 | | /* We swap real and imag because we use an FFT instead of an IFFT. */ |
323 | 92.4M | yp[2*rev+1] = yr; |
324 | 92.4M | yp[2*rev] = yi; |
325 | | /* Storing the pre-rotation directly in the bitrev order. */ |
326 | 92.4M | xp1+=2*stride; |
327 | 92.4M | xp2-=2*stride; |
328 | 92.4M | } |
329 | 424k | } |
330 | | |
331 | 424k | opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); |
332 | | |
333 | | /* Post-rotate and de-shuffle from both ends of the buffer at once to make |
334 | | it in-place. */ |
335 | 424k | { |
336 | 424k | kiss_fft_scalar * yp0 = out+(overlap>>1); |
337 | 424k | kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; |
338 | 424k | const kiss_twiddle_scalar *t = &trig[0]; |
339 | | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the |
340 | | middle pair will be computed twice. */ |
341 | 46.6M | for(i=0;i<(N4+1)>>1;i++) |
342 | 46.2M | { |
343 | 46.2M | kiss_fft_scalar re, im, yr, yi; |
344 | 46.2M | kiss_twiddle_scalar t0, t1; |
345 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ |
346 | 46.2M | re = yp0[1]; |
347 | 46.2M | im = yp0[0]; |
348 | 46.2M | t0 = t[i]; |
349 | 46.2M | t1 = t[N4+i]; |
350 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
351 | 46.2M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); |
352 | 46.2M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); |
353 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ |
354 | 46.2M | re = yp1[1]; |
355 | 46.2M | im = yp1[0]; |
356 | 46.2M | yp0[0] = yr; |
357 | 46.2M | yp1[1] = yi; |
358 | | |
359 | 46.2M | t0 = t[(N4-i-1)]; |
360 | 46.2M | t1 = t[(N2-i-1)]; |
361 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
362 | 46.2M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); |
363 | 46.2M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); |
364 | 46.2M | yp1[0] = yr; |
365 | 46.2M | yp0[1] = yi; |
366 | 46.2M | yp0 += 2; |
367 | 46.2M | yp1 -= 2; |
368 | 46.2M | } |
369 | 424k | } |
370 | | |
371 | | /* Mirror on both sides for TDAC */ |
372 | 424k | { |
373 | 424k | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; |
374 | 424k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
375 | 424k | const celt_coef * OPUS_RESTRICT wp1 = window; |
376 | 424k | const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; |
377 | | |
378 | 25.9M | for(i = 0; i < overlap/2; i++) |
379 | 25.4M | { |
380 | 25.4M | kiss_fft_scalar x1, x2; |
381 | 25.4M | x1 = *xp1; |
382 | 25.4M | x2 = *yp1; |
383 | 25.4M | *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1)); |
384 | 25.4M | *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2)); |
385 | 25.4M | wp1++; |
386 | 25.4M | wp2--; |
387 | 25.4M | } |
388 | 424k | } |
389 | 424k | } |
390 | | #endif /* OVERRIDE_clt_mdct_backward */ |