Line | Count | Source |
1 | | /* Copyright (c) 2007-2008 CSIRO |
2 | | Copyright (c) 2007-2008 Xiph.Org Foundation |
3 | | Written by Jean-Marc Valin */ |
4 | | /* |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions |
7 | | are met: |
8 | | |
9 | | - Redistributions of source code must retain the above copyright |
10 | | notice, this list of conditions and the following disclaimer. |
11 | | |
12 | | - Redistributions in binary form must reproduce the above copyright |
13 | | notice, this list of conditions and the following disclaimer in the |
14 | | documentation and/or other materials provided with the distribution. |
15 | | |
16 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
17 | | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
18 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
19 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
20 | | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
21 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
22 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
23 | | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
24 | | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
25 | | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
26 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | | */ |
28 | | |
29 | | /* This is a simple MDCT implementation that uses a N/4 complex FFT |
30 | | to do most of the work. It should be relatively straightforward to |
31 | | plug in pretty much and FFT here. |
32 | | |
33 | | This replaces the Vorbis FFT (and uses the exact same API), which |
34 | | was a bit too messy and that was ending up duplicating code |
35 | | (might as well use the same FFT everywhere). |
36 | | |
37 | | The algorithm is similar to (and inspired from) Fabrice Bellard's |
38 | | MDCT implementation in FFMPEG, but has differences in signs, ordering |
39 | | and scaling in many places. |
40 | | */ |
41 | | |
42 | | #ifndef SKIP_CONFIG_H |
43 | | #ifdef HAVE_CONFIG_H |
44 | | #include "config.h" |
45 | | #endif |
46 | | #endif |
47 | | |
48 | | #include "mdct.h" |
49 | | #include "kiss_fft.h" |
50 | | #include "_kiss_fft_guts.h" |
51 | | #include <math.h> |
52 | | #include "os_support.h" |
53 | | #include "mathops.h" |
54 | | #include "stack_alloc.h" |
55 | | |
56 | | #if defined(FIXED_POINT) && defined(__mips) && __mips == 32 |
57 | | #include "mips/mdct_mipsr1.h" |
58 | | #endif |
59 | | |
60 | | #ifndef M_PI |
61 | | #define M_PI 3.141592653 |
62 | | #endif |
63 | | |
64 | | #ifdef CUSTOM_MODES |
65 | | |
66 | | int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch) |
67 | | { |
68 | | int i; |
69 | | kiss_twiddle_scalar *trig; |
70 | | int shift; |
71 | | int N2=N>>1; |
72 | | l->n = N; |
73 | | l->maxshift = maxshift; |
74 | | for (i=0;i<=maxshift;i++) |
75 | | { |
76 | | if (i==0) |
77 | | l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch); |
78 | | else |
79 | | l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch); |
80 | | #ifndef ENABLE_TI_DSPLIB55 |
81 | | if (l->kfft[i]==NULL) |
82 | | return 0; |
83 | | #endif |
84 | | } |
85 | | l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar)); |
86 | | if (l->trig==NULL) |
87 | | return 0; |
88 | | for (shift=0;shift<=maxshift;shift++) |
89 | | { |
90 | | /* We have enough points that sine isn't necessary */ |
91 | | #if defined(FIXED_POINT) |
92 | | #ifndef ENABLE_QEXT |
93 | | for (i=0;i<N2;i++) |
94 | | trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N)); |
95 | | #else |
96 | | for (i=0;i<N2;i++) |
97 | | trig[i] = (kiss_twiddle_scalar)MAX32(-2147483647,MIN32(2147483647,floor(.5+2147483648*cos(2*M_PI*(i+.125)/N)))); |
98 | | #endif |
99 | | #else |
100 | | for (i=0;i<N2;i++) |
101 | | trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N); |
102 | | #endif |
103 | | trig += N2; |
104 | | N2 >>= 1; |
105 | | N >>= 1; |
106 | | } |
107 | | return 1; |
108 | | } |
109 | | |
110 | | void clt_mdct_clear(mdct_lookup *l, int arch) |
111 | | { |
112 | | int i; |
113 | | for (i=0;i<=l->maxshift;i++) |
114 | | opus_fft_free(l->kfft[i], arch); |
115 | | opus_free((kiss_twiddle_scalar*)l->trig); |
116 | | } |
117 | | |
118 | | #endif /* CUSTOM_MODES */ |
119 | | |
120 | | /* Forward MDCT trashes the input array */ |
121 | | #ifndef OVERRIDE_clt_mdct_forward |
122 | | void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, |
123 | | const celt_coef *window, int overlap, int shift, int stride, int arch) |
124 | 331M | { |
125 | 331M | int i; |
126 | 331M | int N, N2, N4; |
127 | 331M | VARDECL(kiss_fft_scalar, f); |
128 | 331M | VARDECL(kiss_fft_cpx, f2); |
129 | 331M | const kiss_fft_state *st = l->kfft[shift]; |
130 | 331M | const kiss_twiddle_scalar *trig; |
131 | 331M | celt_coef scale; |
132 | | #ifdef FIXED_POINT |
133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than |
134 | | MULT16_32_Q15() on ARM. */ |
135 | | int scale_shift = st->scale_shift-1; |
136 | | int headroom; |
137 | | #endif |
138 | 331M | SAVE_STACK; |
139 | 331M | (void)arch; |
140 | 331M | scale = st->scale; |
141 | | |
142 | 331M | N = l->n; |
143 | 331M | trig = l->trig; |
144 | 1.10G | for (i=0;i<shift;i++) |
145 | 769M | { |
146 | 769M | N >>= 1; |
147 | 769M | trig += N; |
148 | 769M | } |
149 | 331M | N2 = N>>1; |
150 | 331M | N4 = N>>2; |
151 | | |
152 | 331M | ALLOC(f, N2, kiss_fft_scalar); |
153 | 331M | ALLOC(f2, N4, kiss_fft_cpx); |
154 | | |
155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ |
156 | | /* Window, shuffle, fold */ |
157 | 331M | { |
158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
159 | 331M | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); |
160 | 331M | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); |
161 | 331M | kiss_fft_scalar * OPUS_RESTRICT yp = f; |
162 | 331M | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); |
163 | 331M | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; |
164 | 10.2G | for(i=0;i<((overlap+3)>>2);i++) |
165 | 9.95G | { |
166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ |
167 | 9.95G | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); |
168 | 9.95G | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); |
169 | 9.95G | xp1+=2; |
170 | 9.95G | xp2-=2; |
171 | 9.95G | wp1+=2; |
172 | 9.95G | wp2-=2; |
173 | 9.95G | } |
174 | 331M | wp1 = window; |
175 | 331M | wp2 = window+overlap-1; |
176 | 25.1G | for(;i<N4-((overlap+3)>>2);i++) |
177 | 24.8G | { |
178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ |
179 | 24.8G | *yp++ = *xp2; |
180 | 24.8G | *yp++ = *xp1; |
181 | 24.8G | xp1+=2; |
182 | 24.8G | xp2-=2; |
183 | 24.8G | } |
184 | 10.2G | for(;i<N4;i++) |
185 | 9.95G | { |
186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ |
187 | 9.95G | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); |
188 | 9.95G | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); |
189 | 9.95G | xp1+=2; |
190 | 9.95G | xp2-=2; |
191 | 9.95G | wp1+=2; |
192 | 9.95G | wp2-=2; |
193 | 9.95G | } |
194 | 331M | } |
195 | | /* Pre-rotation */ |
196 | 331M | { |
197 | 331M | kiss_fft_scalar * OPUS_RESTRICT yp = f; |
198 | 331M | const kiss_twiddle_scalar *t = &trig[0]; |
199 | | #ifdef FIXED_POINT |
200 | | opus_val32 maxval=1; |
201 | | #endif |
202 | 45.0G | for(i=0;i<N4;i++) |
203 | 44.7G | { |
204 | 44.7G | kiss_fft_cpx yc; |
205 | 44.7G | kiss_twiddle_scalar t0, t1; |
206 | 44.7G | kiss_fft_scalar re, im, yr, yi; |
207 | 44.7G | t0 = t[i]; |
208 | 44.7G | t1 = t[N4+i]; |
209 | 44.7G | re = *yp++; |
210 | 44.7G | im = *yp++; |
211 | 44.7G | yr = S_MUL(re,t0) - S_MUL(im,t1); |
212 | 44.7G | yi = S_MUL(im,t0) + S_MUL(re,t1); |
213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. |
214 | | For floating-point it doesn't matter. */ |
215 | | #ifdef ENABLE_QEXT |
216 | | yc.r = yr; |
217 | | yc.i = yi; |
218 | | #else |
219 | 22.3G | yc.r = S_MUL2(yr, scale); |
220 | 22.3G | yc.i = S_MUL2(yi, scale); |
221 | | #endif |
222 | | #ifdef FIXED_POINT |
223 | 22.0G | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); |
224 | | #endif |
225 | 44.7G | f2[st->bitrev[i]] = yc; |
226 | 44.7G | } |
227 | | #ifdef FIXED_POINT |
228 | 172M | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); |
229 | | #endif |
230 | 331M | } |
231 | | |
232 | | /* N/4 complex FFT, does not downscale anymore */ |
233 | 331M | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); |
234 | | |
235 | | /* Post-rotate */ |
236 | 331M | { |
237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
238 | 331M | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; |
239 | 331M | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
240 | 331M | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); |
241 | 331M | const kiss_twiddle_scalar *t = &trig[0]; |
242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
243 | 45.0G | for(i=0;i<N4;i++) |
244 | 44.7G | { |
245 | 44.7G | kiss_fft_scalar yr, yi; |
246 | 44.7G | kiss_fft_scalar t0, t1; |
247 | | #ifdef ENABLE_QEXT |
248 | 22.3G | t0 = S_MUL2(t[i], scale); |
249 | 22.3G | t1 = S_MUL2(t[N4+i], scale); |
250 | | #else |
251 | | t0 = t[i]; |
252 | | t1 = t[N4+i]; |
253 | | #endif |
254 | 44.7G | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); |
255 | 44.7G | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); |
256 | 44.7G | *yp1 = yr; |
257 | 44.7G | *yp2 = yi; |
258 | 44.7G | fp++; |
259 | 44.7G | yp1 += 2*stride; |
260 | 44.7G | yp2 -= 2*stride; |
261 | 44.7G | } |
262 | 331M | } |
263 | 331M | RESTORE_STACK; |
264 | 331M | } Line | Count | Source | 124 | 86.1M | { | 125 | 86.1M | int i; | 126 | 86.1M | int N, N2, N4; | 127 | 86.1M | VARDECL(kiss_fft_scalar, f); | 128 | 86.1M | VARDECL(kiss_fft_cpx, f2); | 129 | 86.1M | const kiss_fft_state *st = l->kfft[shift]; | 130 | 86.1M | const kiss_twiddle_scalar *trig; | 131 | 86.1M | celt_coef scale; | 132 | 86.1M | #ifdef FIXED_POINT | 133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than | 134 | | MULT16_32_Q15() on ARM. */ | 135 | 86.1M | int scale_shift = st->scale_shift-1; | 136 | 86.1M | int headroom; | 137 | 86.1M | #endif | 138 | 86.1M | SAVE_STACK; | 139 | 86.1M | (void)arch; | 140 | 86.1M | scale = st->scale; | 141 | | | 142 | 86.1M | N = l->n; | 143 | 86.1M | trig = l->trig; | 144 | 291M | for (i=0;i<shift;i++) | 145 | 205M | { | 146 | 205M | N >>= 1; | 147 | 205M | trig += N; | 148 | 205M | } | 149 | 86.1M | N2 = N>>1; | 150 | 86.1M | N4 = N>>2; | 151 | | | 152 | 86.1M | ALLOC(f, N2, kiss_fft_scalar); | 153 | 86.1M | ALLOC(f2, N4, kiss_fft_cpx); | 154 | | | 155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | 156 | | /* Window, shuffle, fold */ | 157 | 86.1M | { | 158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 159 | 86.1M | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); | 160 | 86.1M | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); | 161 | 86.1M | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 162 | 86.1M | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); | 163 | 86.1M | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | 164 | 2.67G | for(i=0;i<((overlap+3)>>2);i++) | 165 | 2.58G | { | 166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | 167 | 2.58G | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); | 168 | 2.58G | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); | 169 | 2.58G | xp1+=2; | 170 | 2.58G | xp2-=2; | 171 | 2.58G | wp1+=2; | 172 | 2.58G | wp2-=2; | 173 | 2.58G | } | 174 | 86.1M | wp1 = window; | 175 | 86.1M | wp2 = window+overlap-1; | 176 | 5.96G | for(;i<N4-((overlap+3)>>2);i++) | 177 | 5.87G | { | 178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 179 | 5.87G | *yp++ = *xp2; | 180 | 5.87G | *yp++ = *xp1; | 181 | 5.87G | xp1+=2; | 182 | 5.87G | xp2-=2; | 183 | 5.87G | } | 184 | 2.67G | for(;i<N4;i++) | 185 | 2.58G | { | 186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 187 | 2.58G | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); | 188 | 2.58G | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); | 189 | 2.58G | xp1+=2; | 190 | 2.58G | xp2-=2; | 191 | 2.58G | wp1+=2; | 192 | 2.58G | wp2-=2; | 193 | 2.58G | } | 194 | 86.1M | } | 195 | | /* Pre-rotation */ | 196 | 86.1M | { | 197 | 86.1M | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 198 | 86.1M | const kiss_twiddle_scalar *t = &trig[0]; | 199 | 86.1M | #ifdef FIXED_POINT | 200 | 86.1M | opus_val32 maxval=1; | 201 | 86.1M | #endif | 202 | 11.1G | for(i=0;i<N4;i++) | 203 | 11.0G | { | 204 | 11.0G | kiss_fft_cpx yc; | 205 | 11.0G | kiss_twiddle_scalar t0, t1; | 206 | 11.0G | kiss_fft_scalar re, im, yr, yi; | 207 | 11.0G | t0 = t[i]; | 208 | 11.0G | t1 = t[N4+i]; | 209 | 11.0G | re = *yp++; | 210 | 11.0G | im = *yp++; | 211 | 11.0G | yr = S_MUL(re,t0) - S_MUL(im,t1); | 212 | 11.0G | yi = S_MUL(im,t0) + S_MUL(re,t1); | 213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. | 214 | | For floating-point it doesn't matter. */ | 215 | | #ifdef ENABLE_QEXT | 216 | | yc.r = yr; | 217 | | yc.i = yi; | 218 | | #else | 219 | 11.0G | yc.r = S_MUL2(yr, scale); | 220 | 11.0G | yc.i = S_MUL2(yi, scale); | 221 | 11.0G | #endif | 222 | 11.0G | #ifdef FIXED_POINT | 223 | 11.0G | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); | 224 | 11.0G | #endif | 225 | 11.0G | f2[st->bitrev[i]] = yc; | 226 | 11.0G | } | 227 | 86.1M | #ifdef FIXED_POINT | 228 | 86.1M | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); | 229 | 86.1M | #endif | 230 | 86.1M | } | 231 | | | 232 | | /* N/4 complex FFT, does not downscale anymore */ | 233 | 86.1M | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); | 234 | | | 235 | | /* Post-rotate */ | 236 | 86.1M | { | 237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 238 | 86.1M | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; | 239 | 86.1M | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 240 | 86.1M | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 241 | 86.1M | const kiss_twiddle_scalar *t = &trig[0]; | 242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 243 | 11.1G | for(i=0;i<N4;i++) | 244 | 11.0G | { | 245 | 11.0G | kiss_fft_scalar yr, yi; | 246 | 11.0G | kiss_fft_scalar t0, t1; | 247 | | #ifdef ENABLE_QEXT | 248 | | t0 = S_MUL2(t[i], scale); | 249 | | t1 = S_MUL2(t[N4+i], scale); | 250 | | #else | 251 | 11.0G | t0 = t[i]; | 252 | 11.0G | t1 = t[N4+i]; | 253 | 11.0G | #endif | 254 | 11.0G | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); | 255 | 11.0G | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); | 256 | 11.0G | *yp1 = yr; | 257 | 11.0G | *yp2 = yi; | 258 | 11.0G | fp++; | 259 | 11.0G | yp1 += 2*stride; | 260 | 11.0G | yp2 -= 2*stride; | 261 | 11.0G | } | 262 | 86.1M | } | 263 | 86.1M | RESTORE_STACK; | 264 | 86.1M | } |
Line | Count | Source | 124 | 86.1M | { | 125 | 86.1M | int i; | 126 | 86.1M | int N, N2, N4; | 127 | 86.1M | VARDECL(kiss_fft_scalar, f); | 128 | 86.1M | VARDECL(kiss_fft_cpx, f2); | 129 | 86.1M | const kiss_fft_state *st = l->kfft[shift]; | 130 | 86.1M | const kiss_twiddle_scalar *trig; | 131 | 86.1M | celt_coef scale; | 132 | 86.1M | #ifdef FIXED_POINT | 133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than | 134 | | MULT16_32_Q15() on ARM. */ | 135 | 86.1M | int scale_shift = st->scale_shift-1; | 136 | 86.1M | int headroom; | 137 | 86.1M | #endif | 138 | 86.1M | SAVE_STACK; | 139 | 86.1M | (void)arch; | 140 | 86.1M | scale = st->scale; | 141 | | | 142 | 86.1M | N = l->n; | 143 | 86.1M | trig = l->trig; | 144 | 291M | for (i=0;i<shift;i++) | 145 | 205M | { | 146 | 205M | N >>= 1; | 147 | 205M | trig += N; | 148 | 205M | } | 149 | 86.1M | N2 = N>>1; | 150 | 86.1M | N4 = N>>2; | 151 | | | 152 | 86.1M | ALLOC(f, N2, kiss_fft_scalar); | 153 | 86.1M | ALLOC(f2, N4, kiss_fft_cpx); | 154 | | | 155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | 156 | | /* Window, shuffle, fold */ | 157 | 86.1M | { | 158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 159 | 86.1M | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); | 160 | 86.1M | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); | 161 | 86.1M | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 162 | 86.1M | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); | 163 | 86.1M | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | 164 | 2.67G | for(i=0;i<((overlap+3)>>2);i++) | 165 | 2.58G | { | 166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | 167 | 2.58G | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); | 168 | 2.58G | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); | 169 | 2.58G | xp1+=2; | 170 | 2.58G | xp2-=2; | 171 | 2.58G | wp1+=2; | 172 | 2.58G | wp2-=2; | 173 | 2.58G | } | 174 | 86.1M | wp1 = window; | 175 | 86.1M | wp2 = window+overlap-1; | 176 | 5.96G | for(;i<N4-((overlap+3)>>2);i++) | 177 | 5.87G | { | 178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 179 | 5.87G | *yp++ = *xp2; | 180 | 5.87G | *yp++ = *xp1; | 181 | 5.87G | xp1+=2; | 182 | 5.87G | xp2-=2; | 183 | 5.87G | } | 184 | 2.67G | for(;i<N4;i++) | 185 | 2.58G | { | 186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 187 | 2.58G | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); | 188 | 2.58G | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); | 189 | 2.58G | xp1+=2; | 190 | 2.58G | xp2-=2; | 191 | 2.58G | wp1+=2; | 192 | 2.58G | wp2-=2; | 193 | 2.58G | } | 194 | 86.1M | } | 195 | | /* Pre-rotation */ | 196 | 86.1M | { | 197 | 86.1M | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 198 | 86.1M | const kiss_twiddle_scalar *t = &trig[0]; | 199 | 86.1M | #ifdef FIXED_POINT | 200 | 86.1M | opus_val32 maxval=1; | 201 | 86.1M | #endif | 202 | 11.1G | for(i=0;i<N4;i++) | 203 | 11.0G | { | 204 | 11.0G | kiss_fft_cpx yc; | 205 | 11.0G | kiss_twiddle_scalar t0, t1; | 206 | 11.0G | kiss_fft_scalar re, im, yr, yi; | 207 | 11.0G | t0 = t[i]; | 208 | 11.0G | t1 = t[N4+i]; | 209 | 11.0G | re = *yp++; | 210 | 11.0G | im = *yp++; | 211 | 11.0G | yr = S_MUL(re,t0) - S_MUL(im,t1); | 212 | 11.0G | yi = S_MUL(im,t0) + S_MUL(re,t1); | 213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. | 214 | | For floating-point it doesn't matter. */ | 215 | 11.0G | #ifdef ENABLE_QEXT | 216 | 11.0G | yc.r = yr; | 217 | 11.0G | yc.i = yi; | 218 | | #else | 219 | | yc.r = S_MUL2(yr, scale); | 220 | | yc.i = S_MUL2(yi, scale); | 221 | | #endif | 222 | 11.0G | #ifdef FIXED_POINT | 223 | 11.0G | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); | 224 | 11.0G | #endif | 225 | 11.0G | f2[st->bitrev[i]] = yc; | 226 | 11.0G | } | 227 | 86.1M | #ifdef FIXED_POINT | 228 | 86.1M | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); | 229 | 86.1M | #endif | 230 | 86.1M | } | 231 | | | 232 | | /* N/4 complex FFT, does not downscale anymore */ | 233 | 86.1M | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); | 234 | | | 235 | | /* Post-rotate */ | 236 | 86.1M | { | 237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 238 | 86.1M | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; | 239 | 86.1M | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 240 | 86.1M | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 241 | 86.1M | const kiss_twiddle_scalar *t = &trig[0]; | 242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 243 | 11.1G | for(i=0;i<N4;i++) | 244 | 11.0G | { | 245 | 11.0G | kiss_fft_scalar yr, yi; | 246 | 11.0G | kiss_fft_scalar t0, t1; | 247 | 11.0G | #ifdef ENABLE_QEXT | 248 | 11.0G | t0 = S_MUL2(t[i], scale); | 249 | 11.0G | t1 = S_MUL2(t[N4+i], scale); | 250 | | #else | 251 | | t0 = t[i]; | 252 | | t1 = t[N4+i]; | 253 | | #endif | 254 | 11.0G | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); | 255 | 11.0G | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); | 256 | 11.0G | *yp1 = yr; | 257 | 11.0G | *yp2 = yi; | 258 | 11.0G | fp++; | 259 | 11.0G | yp1 += 2*stride; | 260 | 11.0G | yp2 -= 2*stride; | 261 | 11.0G | } | 262 | 86.1M | } | 263 | 86.1M | RESTORE_STACK; | 264 | 86.1M | } |
Line | Count | Source | 124 | 79.7M | { | 125 | 79.7M | int i; | 126 | 79.7M | int N, N2, N4; | 127 | 79.7M | VARDECL(kiss_fft_scalar, f); | 128 | 79.7M | VARDECL(kiss_fft_cpx, f2); | 129 | 79.7M | const kiss_fft_state *st = l->kfft[shift]; | 130 | 79.7M | const kiss_twiddle_scalar *trig; | 131 | 79.7M | celt_coef scale; | 132 | | #ifdef FIXED_POINT | 133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than | 134 | | MULT16_32_Q15() on ARM. */ | 135 | | int scale_shift = st->scale_shift-1; | 136 | | int headroom; | 137 | | #endif | 138 | 79.7M | SAVE_STACK; | 139 | 79.7M | (void)arch; | 140 | 79.7M | scale = st->scale; | 141 | | | 142 | 79.7M | N = l->n; | 143 | 79.7M | trig = l->trig; | 144 | 259M | for (i=0;i<shift;i++) | 145 | 179M | { | 146 | 179M | N >>= 1; | 147 | 179M | trig += N; | 148 | 179M | } | 149 | 79.7M | N2 = N>>1; | 150 | 79.7M | N4 = N>>2; | 151 | | | 152 | 79.7M | ALLOC(f, N2, kiss_fft_scalar); | 153 | 79.7M | ALLOC(f2, N4, kiss_fft_cpx); | 154 | | | 155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | 156 | | /* Window, shuffle, fold */ | 157 | 79.7M | { | 158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 159 | 79.7M | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); | 160 | 79.7M | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); | 161 | 79.7M | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 162 | 79.7M | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); | 163 | 79.7M | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | 164 | 2.47G | for(i=0;i<((overlap+3)>>2);i++) | 165 | 2.39G | { | 166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | 167 | 2.39G | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); | 168 | 2.39G | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); | 169 | 2.39G | xp1+=2; | 170 | 2.39G | xp2-=2; | 171 | 2.39G | wp1+=2; | 172 | 2.39G | wp2-=2; | 173 | 2.39G | } | 174 | 79.7M | wp1 = window; | 175 | 79.7M | wp2 = window+overlap-1; | 176 | 6.61G | for(;i<N4-((overlap+3)>>2);i++) | 177 | 6.53G | { | 178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 179 | 6.53G | *yp++ = *xp2; | 180 | 6.53G | *yp++ = *xp1; | 181 | 6.53G | xp1+=2; | 182 | 6.53G | xp2-=2; | 183 | 6.53G | } | 184 | 2.47G | for(;i<N4;i++) | 185 | 2.39G | { | 186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 187 | 2.39G | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); | 188 | 2.39G | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); | 189 | 2.39G | xp1+=2; | 190 | 2.39G | xp2-=2; | 191 | 2.39G | wp1+=2; | 192 | 2.39G | wp2-=2; | 193 | 2.39G | } | 194 | 79.7M | } | 195 | | /* Pre-rotation */ | 196 | 79.7M | { | 197 | 79.7M | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 198 | 79.7M | const kiss_twiddle_scalar *t = &trig[0]; | 199 | | #ifdef FIXED_POINT | 200 | | opus_val32 maxval=1; | 201 | | #endif | 202 | 11.4G | for(i=0;i<N4;i++) | 203 | 11.3G | { | 204 | 11.3G | kiss_fft_cpx yc; | 205 | 11.3G | kiss_twiddle_scalar t0, t1; | 206 | 11.3G | kiss_fft_scalar re, im, yr, yi; | 207 | 11.3G | t0 = t[i]; | 208 | 11.3G | t1 = t[N4+i]; | 209 | 11.3G | re = *yp++; | 210 | 11.3G | im = *yp++; | 211 | 11.3G | yr = S_MUL(re,t0) - S_MUL(im,t1); | 212 | 11.3G | yi = S_MUL(im,t0) + S_MUL(re,t1); | 213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. | 214 | | For floating-point it doesn't matter. */ | 215 | 11.3G | #ifdef ENABLE_QEXT | 216 | 11.3G | yc.r = yr; | 217 | 11.3G | yc.i = yi; | 218 | | #else | 219 | | yc.r = S_MUL2(yr, scale); | 220 | | yc.i = S_MUL2(yi, scale); | 221 | | #endif | 222 | | #ifdef FIXED_POINT | 223 | | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); | 224 | | #endif | 225 | 11.3G | f2[st->bitrev[i]] = yc; | 226 | 11.3G | } | 227 | | #ifdef FIXED_POINT | 228 | | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); | 229 | | #endif | 230 | 79.7M | } | 231 | | | 232 | | /* N/4 complex FFT, does not downscale anymore */ | 233 | 79.7M | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); | 234 | | | 235 | | /* Post-rotate */ | 236 | 79.7M | { | 237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 238 | 79.7M | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; | 239 | 79.7M | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 240 | 79.7M | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 241 | 79.7M | const kiss_twiddle_scalar *t = &trig[0]; | 242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 243 | 11.4G | for(i=0;i<N4;i++) | 244 | 11.3G | { | 245 | 11.3G | kiss_fft_scalar yr, yi; | 246 | 11.3G | kiss_fft_scalar t0, t1; | 247 | 11.3G | #ifdef ENABLE_QEXT | 248 | 11.3G | t0 = S_MUL2(t[i], scale); | 249 | 11.3G | t1 = S_MUL2(t[N4+i], scale); | 250 | | #else | 251 | | t0 = t[i]; | 252 | | t1 = t[N4+i]; | 253 | | #endif | 254 | 11.3G | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); | 255 | 11.3G | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); | 256 | 11.3G | *yp1 = yr; | 257 | 11.3G | *yp2 = yi; | 258 | 11.3G | fp++; | 259 | 11.3G | yp1 += 2*stride; | 260 | 11.3G | yp2 -= 2*stride; | 261 | 11.3G | } | 262 | 79.7M | } | 263 | 79.7M | RESTORE_STACK; | 264 | 79.7M | } |
Line | Count | Source | 124 | 79.7M | { | 125 | 79.7M | int i; | 126 | 79.7M | int N, N2, N4; | 127 | 79.7M | VARDECL(kiss_fft_scalar, f); | 128 | 79.7M | VARDECL(kiss_fft_cpx, f2); | 129 | 79.7M | const kiss_fft_state *st = l->kfft[shift]; | 130 | 79.7M | const kiss_twiddle_scalar *trig; | 131 | 79.7M | celt_coef scale; | 132 | | #ifdef FIXED_POINT | 133 | | /* Allows us to scale with MULT16_32_Q16(), which is faster than | 134 | | MULT16_32_Q15() on ARM. */ | 135 | | int scale_shift = st->scale_shift-1; | 136 | | int headroom; | 137 | | #endif | 138 | 79.7M | SAVE_STACK; | 139 | 79.7M | (void)arch; | 140 | 79.7M | scale = st->scale; | 141 | | | 142 | 79.7M | N = l->n; | 143 | 79.7M | trig = l->trig; | 144 | 259M | for (i=0;i<shift;i++) | 145 | 179M | { | 146 | 179M | N >>= 1; | 147 | 179M | trig += N; | 148 | 179M | } | 149 | 79.7M | N2 = N>>1; | 150 | 79.7M | N4 = N>>2; | 151 | | | 152 | 79.7M | ALLOC(f, N2, kiss_fft_scalar); | 153 | 79.7M | ALLOC(f2, N4, kiss_fft_cpx); | 154 | | | 155 | | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | 156 | | /* Window, shuffle, fold */ | 157 | 79.7M | { | 158 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 159 | 79.7M | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); | 160 | 79.7M | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); | 161 | 79.7M | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 162 | 79.7M | const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); | 163 | 79.7M | const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | 164 | 2.47G | for(i=0;i<((overlap+3)>>2);i++) | 165 | 2.39G | { | 166 | | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | 167 | 2.39G | *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); | 168 | 2.39G | *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); | 169 | 2.39G | xp1+=2; | 170 | 2.39G | xp2-=2; | 171 | 2.39G | wp1+=2; | 172 | 2.39G | wp2-=2; | 173 | 2.39G | } | 174 | 79.7M | wp1 = window; | 175 | 79.7M | wp2 = window+overlap-1; | 176 | 6.61G | for(;i<N4-((overlap+3)>>2);i++) | 177 | 6.53G | { | 178 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 179 | 6.53G | *yp++ = *xp2; | 180 | 6.53G | *yp++ = *xp1; | 181 | 6.53G | xp1+=2; | 182 | 6.53G | xp2-=2; | 183 | 6.53G | } | 184 | 2.47G | for(;i<N4;i++) | 185 | 2.39G | { | 186 | | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 187 | 2.39G | *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); | 188 | 2.39G | *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); | 189 | 2.39G | xp1+=2; | 190 | 2.39G | xp2-=2; | 191 | 2.39G | wp1+=2; | 192 | 2.39G | wp2-=2; | 193 | 2.39G | } | 194 | 79.7M | } | 195 | | /* Pre-rotation */ | 196 | 79.7M | { | 197 | 79.7M | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 198 | 79.7M | const kiss_twiddle_scalar *t = &trig[0]; | 199 | | #ifdef FIXED_POINT | 200 | | opus_val32 maxval=1; | 201 | | #endif | 202 | 11.4G | for(i=0;i<N4;i++) | 203 | 11.3G | { | 204 | 11.3G | kiss_fft_cpx yc; | 205 | 11.3G | kiss_twiddle_scalar t0, t1; | 206 | 11.3G | kiss_fft_scalar re, im, yr, yi; | 207 | 11.3G | t0 = t[i]; | 208 | 11.3G | t1 = t[N4+i]; | 209 | 11.3G | re = *yp++; | 210 | 11.3G | im = *yp++; | 211 | 11.3G | yr = S_MUL(re,t0) - S_MUL(im,t1); | 212 | 11.3G | yi = S_MUL(im,t0) + S_MUL(re,t1); | 213 | | /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. | 214 | | For floating-point it doesn't matter. */ | 215 | | #ifdef ENABLE_QEXT | 216 | | yc.r = yr; | 217 | | yc.i = yi; | 218 | | #else | 219 | 11.3G | yc.r = S_MUL2(yr, scale); | 220 | 11.3G | yc.i = S_MUL2(yi, scale); | 221 | 11.3G | #endif | 222 | | #ifdef FIXED_POINT | 223 | | maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); | 224 | | #endif | 225 | 11.3G | f2[st->bitrev[i]] = yc; | 226 | 11.3G | } | 227 | | #ifdef FIXED_POINT | 228 | | headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); | 229 | | #endif | 230 | 79.7M | } | 231 | | | 232 | | /* N/4 complex FFT, does not downscale anymore */ | 233 | 79.7M | opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); | 234 | | | 235 | | /* Post-rotate */ | 236 | 79.7M | { | 237 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 238 | 79.7M | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; | 239 | 79.7M | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 240 | 79.7M | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 241 | 79.7M | const kiss_twiddle_scalar *t = &trig[0]; | 242 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 243 | 11.4G | for(i=0;i<N4;i++) | 244 | 11.3G | { | 245 | 11.3G | kiss_fft_scalar yr, yi; | 246 | 11.3G | kiss_fft_scalar t0, t1; | 247 | | #ifdef ENABLE_QEXT | 248 | | t0 = S_MUL2(t[i], scale); | 249 | | t1 = S_MUL2(t[N4+i], scale); | 250 | | #else | 251 | 11.3G | t0 = t[i]; | 252 | 11.3G | t1 = t[N4+i]; | 253 | 11.3G | #endif | 254 | 11.3G | yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); | 255 | 11.3G | yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); | 256 | 11.3G | *yp1 = yr; | 257 | 11.3G | *yp2 = yi; | 258 | 11.3G | fp++; | 259 | 11.3G | yp1 += 2*stride; | 260 | 11.3G | yp2 -= 2*stride; | 261 | 11.3G | } | 262 | 79.7M | } | 263 | 79.7M | RESTORE_STACK; | 264 | 79.7M | } |
|
265 | | #endif /* OVERRIDE_clt_mdct_forward */ |
266 | | |
267 | | #ifndef OVERRIDE_clt_mdct_backward |
268 | | void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, |
269 | | const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch) |
270 | 627k | { |
271 | 627k | int i; |
272 | 627k | int N, N2, N4; |
273 | 627k | const kiss_twiddle_scalar *trig; |
274 | | #ifdef FIXED_POINT |
275 | | int pre_shift, post_shift, fft_shift; |
276 | | #endif |
277 | 627k | (void) arch; |
278 | | |
279 | 627k | N = l->n; |
280 | 627k | trig = l->trig; |
281 | 2.06M | for (i=0;i<shift;i++) |
282 | 1.43M | { |
283 | 1.43M | N >>= 1; |
284 | 1.43M | trig += N; |
285 | 1.43M | } |
286 | 627k | N2 = N>>1; |
287 | 627k | N4 = N>>2; |
288 | | |
289 | | #ifdef FIXED_POINT |
290 | | { |
291 | | opus_val32 sumval=N2; |
292 | | opus_val32 maxval=0; |
293 | 91.9M | for (i=0;i<N2;i++) { |
294 | 91.6M | maxval = MAX32(maxval, ABS32(in[i*stride])); |
295 | 91.6M | sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11))); |
296 | 91.6M | } |
297 | 336k | pre_shift = IMAX(0, 29-celt_zlog2(1+maxval)); |
298 | | /* Worst-case where all the energy goes to a single sample. */ |
299 | 336k | post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval))); |
300 | 336k | post_shift = IMIN(post_shift, pre_shift); |
301 | | fft_shift = pre_shift - post_shift; |
302 | | } |
303 | | #endif |
304 | | /* Pre-rotate */ |
305 | 627k | { |
306 | | /* Temp pointers to make it really clear to the compiler what we're doing */ |
307 | 627k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; |
308 | 627k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); |
309 | 627k | kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); |
310 | 627k | const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; |
311 | 627k | const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; |
312 | 89.9M | for(i=0;i<N4;i++) |
313 | 89.3M | { |
314 | 89.3M | int rev; |
315 | 89.3M | kiss_fft_scalar yr, yi; |
316 | 89.3M | opus_val32 x1, x2; |
317 | 89.3M | rev = *bitrev++; |
318 | 89.3M | x1 = SHL32_ovflw(*xp1, pre_shift); |
319 | 89.3M | x2 = SHL32_ovflw(*xp2, pre_shift); |
320 | 89.3M | yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i])); |
321 | 89.3M | yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i])); |
322 | | /* We swap real and imag because we use an FFT instead of an IFFT. */ |
323 | 89.3M | yp[2*rev+1] = yr; |
324 | 89.3M | yp[2*rev] = yi; |
325 | | /* Storing the pre-rotation directly in the bitrev order. */ |
326 | 89.3M | xp1+=2*stride; |
327 | 89.3M | xp2-=2*stride; |
328 | 89.3M | } |
329 | 627k | } |
330 | | |
331 | 627k | opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); |
332 | | |
333 | | /* Post-rotate and de-shuffle from both ends of the buffer at once to make |
334 | | it in-place. */ |
335 | 627k | { |
336 | 627k | kiss_fft_scalar * yp0 = out+(overlap>>1); |
337 | 627k | kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; |
338 | 627k | const kiss_twiddle_scalar *t = &trig[0]; |
339 | | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the |
340 | | middle pair will be computed twice. */ |
341 | 45.3M | for(i=0;i<(N4+1)>>1;i++) |
342 | 44.6M | { |
343 | 44.6M | kiss_fft_scalar re, im, yr, yi; |
344 | 44.6M | kiss_twiddle_scalar t0, t1; |
345 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ |
346 | 44.6M | re = yp0[1]; |
347 | 44.6M | im = yp0[0]; |
348 | 44.6M | t0 = t[i]; |
349 | 44.6M | t1 = t[N4+i]; |
350 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
351 | 44.6M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); |
352 | 44.6M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); |
353 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ |
354 | 44.6M | re = yp1[1]; |
355 | 44.6M | im = yp1[0]; |
356 | 44.6M | yp0[0] = yr; |
357 | 44.6M | yp1[1] = yi; |
358 | | |
359 | 44.6M | t0 = t[(N4-i-1)]; |
360 | 44.6M | t1 = t[(N2-i-1)]; |
361 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
362 | 44.6M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); |
363 | 44.6M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); |
364 | 44.6M | yp1[0] = yr; |
365 | 44.6M | yp0[1] = yi; |
366 | 44.6M | yp0 += 2; |
367 | 44.6M | yp1 -= 2; |
368 | 44.6M | } |
369 | 627k | } |
370 | | |
371 | | /* Mirror on both sides for TDAC */ |
372 | 627k | { |
373 | 627k | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; |
374 | 627k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
375 | 627k | const celt_coef * OPUS_RESTRICT wp1 = window; |
376 | 627k | const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; |
377 | | |
378 | 43.0M | for(i = 0; i < overlap/2; i++) |
379 | 42.4M | { |
380 | 42.4M | kiss_fft_scalar x1, x2; |
381 | 42.4M | x1 = *xp1; |
382 | 42.4M | x2 = *yp1; |
383 | 42.4M | *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1)); |
384 | 42.4M | *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2)); |
385 | 42.4M | wp1++; |
386 | 42.4M | wp2--; |
387 | 42.4M | } |
388 | 627k | } |
389 | 627k | } Line | Count | Source | 270 | 336k | { | 271 | 336k | int i; | 272 | 336k | int N, N2, N4; | 273 | 336k | const kiss_twiddle_scalar *trig; | 274 | 336k | #ifdef FIXED_POINT | 275 | 336k | int pre_shift, post_shift, fft_shift; | 276 | 336k | #endif | 277 | 336k | (void) arch; | 278 | | | 279 | 336k | N = l->n; | 280 | 336k | trig = l->trig; | 281 | 1.12M | for (i=0;i<shift;i++) | 282 | 792k | { | 283 | 792k | N >>= 1; | 284 | 792k | trig += N; | 285 | 792k | } | 286 | 336k | N2 = N>>1; | 287 | 336k | N4 = N>>2; | 288 | | | 289 | 336k | #ifdef FIXED_POINT | 290 | 336k | { | 291 | 336k | opus_val32 sumval=N2; | 292 | 336k | opus_val32 maxval=0; | 293 | 91.9M | for (i=0;i<N2;i++) { | 294 | 91.6M | maxval = MAX32(maxval, ABS32(in[i*stride])); | 295 | 91.6M | sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11))); | 296 | 91.6M | } | 297 | 336k | pre_shift = IMAX(0, 29-celt_zlog2(1+maxval)); | 298 | | /* Worst-case where all the energy goes to a single sample. */ | 299 | 336k | post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval))); | 300 | 336k | post_shift = IMIN(post_shift, pre_shift); | 301 | 336k | fft_shift = pre_shift - post_shift; | 302 | 336k | } | 303 | 336k | #endif | 304 | | /* Pre-rotate */ | 305 | 336k | { | 306 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 307 | 336k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; | 308 | 336k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); | 309 | 336k | kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); | 310 | 336k | const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; | 311 | 336k | const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; | 312 | 46.1M | for(i=0;i<N4;i++) | 313 | 45.8M | { | 314 | 45.8M | int rev; | 315 | 45.8M | kiss_fft_scalar yr, yi; | 316 | 45.8M | opus_val32 x1, x2; | 317 | 45.8M | rev = *bitrev++; | 318 | 45.8M | x1 = SHL32_ovflw(*xp1, pre_shift); | 319 | 45.8M | x2 = SHL32_ovflw(*xp2, pre_shift); | 320 | 45.8M | yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i])); | 321 | 45.8M | yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i])); | 322 | | /* We swap real and imag because we use an FFT instead of an IFFT. */ | 323 | 45.8M | yp[2*rev+1] = yr; | 324 | 45.8M | yp[2*rev] = yi; | 325 | | /* Storing the pre-rotation directly in the bitrev order. */ | 326 | 45.8M | xp1+=2*stride; | 327 | 45.8M | xp2-=2*stride; | 328 | 45.8M | } | 329 | 336k | } | 330 | | | 331 | 336k | opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); | 332 | | | 333 | | /* Post-rotate and de-shuffle from both ends of the buffer at once to make | 334 | | it in-place. */ | 335 | 336k | { | 336 | 336k | kiss_fft_scalar * yp0 = out+(overlap>>1); | 337 | 336k | kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; | 338 | 336k | const kiss_twiddle_scalar *t = &trig[0]; | 339 | | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the | 340 | | middle pair will be computed twice. */ | 341 | 23.2M | for(i=0;i<(N4+1)>>1;i++) | 342 | 22.9M | { | 343 | 22.9M | kiss_fft_scalar re, im, yr, yi; | 344 | 22.9M | kiss_twiddle_scalar t0, t1; | 345 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ | 346 | 22.9M | re = yp0[1]; | 347 | 22.9M | im = yp0[0]; | 348 | 22.9M | t0 = t[i]; | 349 | 22.9M | t1 = t[N4+i]; | 350 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 351 | 22.9M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); | 352 | 22.9M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); | 353 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ | 354 | 22.9M | re = yp1[1]; | 355 | 22.9M | im = yp1[0]; | 356 | 22.9M | yp0[0] = yr; | 357 | 22.9M | yp1[1] = yi; | 358 | | | 359 | 22.9M | t0 = t[(N4-i-1)]; | 360 | 22.9M | t1 = t[(N2-i-1)]; | 361 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 362 | 22.9M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); | 363 | 22.9M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); | 364 | 22.9M | yp1[0] = yr; | 365 | 22.9M | yp0[1] = yi; | 366 | 22.9M | yp0 += 2; | 367 | 22.9M | yp1 -= 2; | 368 | 22.9M | } | 369 | 336k | } | 370 | | | 371 | | /* Mirror on both sides for TDAC */ | 372 | 336k | { | 373 | 336k | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; | 374 | 336k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 375 | 336k | const celt_coef * OPUS_RESTRICT wp1 = window; | 376 | 336k | const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; | 377 | | | 378 | 23.4M | for(i = 0; i < overlap/2; i++) | 379 | 23.0M | { | 380 | 23.0M | kiss_fft_scalar x1, x2; | 381 | 23.0M | x1 = *xp1; | 382 | 23.0M | x2 = *yp1; | 383 | 23.0M | *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1)); | 384 | 23.0M | *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2)); | 385 | 23.0M | wp1++; | 386 | 23.0M | wp2--; | 387 | 23.0M | } | 388 | 336k | } | 389 | 336k | } |
Line | Count | Source | 270 | 290k | { | 271 | 290k | int i; | 272 | 290k | int N, N2, N4; | 273 | 290k | const kiss_twiddle_scalar *trig; | 274 | | #ifdef FIXED_POINT | 275 | | int pre_shift, post_shift, fft_shift; | 276 | | #endif | 277 | 290k | (void) arch; | 278 | | | 279 | 290k | N = l->n; | 280 | 290k | trig = l->trig; | 281 | 937k | for (i=0;i<shift;i++) | 282 | 646k | { | 283 | 646k | N >>= 1; | 284 | 646k | trig += N; | 285 | 646k | } | 286 | 290k | N2 = N>>1; | 287 | 290k | N4 = N>>2; | 288 | | | 289 | | #ifdef FIXED_POINT | 290 | | { | 291 | | opus_val32 sumval=N2; | 292 | | opus_val32 maxval=0; | 293 | | for (i=0;i<N2;i++) { | 294 | | maxval = MAX32(maxval, ABS32(in[i*stride])); | 295 | | sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11))); | 296 | | } | 297 | | pre_shift = IMAX(0, 29-celt_zlog2(1+maxval)); | 298 | | /* Worst-case where all the energy goes to a single sample. */ | 299 | | post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval))); | 300 | | post_shift = IMIN(post_shift, pre_shift); | 301 | | fft_shift = pre_shift - post_shift; | 302 | | } | 303 | | #endif | 304 | | /* Pre-rotate */ | 305 | 290k | { | 306 | | /* Temp pointers to make it really clear to the compiler what we're doing */ | 307 | 290k | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; | 308 | 290k | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); | 309 | 290k | kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); | 310 | 290k | const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; | 311 | 290k | const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; | 312 | 43.8M | for(i=0;i<N4;i++) | 313 | 43.5M | { | 314 | 43.5M | int rev; | 315 | 43.5M | kiss_fft_scalar yr, yi; | 316 | 43.5M | opus_val32 x1, x2; | 317 | 43.5M | rev = *bitrev++; | 318 | 43.5M | x1 = SHL32_ovflw(*xp1, pre_shift); | 319 | 43.5M | x2 = SHL32_ovflw(*xp2, pre_shift); | 320 | 43.5M | yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i])); | 321 | 43.5M | yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i])); | 322 | | /* We swap real and imag because we use an FFT instead of an IFFT. */ | 323 | 43.5M | yp[2*rev+1] = yr; | 324 | 43.5M | yp[2*rev] = yi; | 325 | | /* Storing the pre-rotation directly in the bitrev order. */ | 326 | 43.5M | xp1+=2*stride; | 327 | 43.5M | xp2-=2*stride; | 328 | 43.5M | } | 329 | 290k | } | 330 | | | 331 | 290k | opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); | 332 | | | 333 | | /* Post-rotate and de-shuffle from both ends of the buffer at once to make | 334 | | it in-place. */ | 335 | 290k | { | 336 | 290k | kiss_fft_scalar * yp0 = out+(overlap>>1); | 337 | 290k | kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; | 338 | 290k | const kiss_twiddle_scalar *t = &trig[0]; | 339 | | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the | 340 | | middle pair will be computed twice. */ | 341 | 22.0M | for(i=0;i<(N4+1)>>1;i++) | 342 | 21.7M | { | 343 | 21.7M | kiss_fft_scalar re, im, yr, yi; | 344 | 21.7M | kiss_twiddle_scalar t0, t1; | 345 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ | 346 | 21.7M | re = yp0[1]; | 347 | 21.7M | im = yp0[0]; | 348 | 21.7M | t0 = t[i]; | 349 | 21.7M | t1 = t[N4+i]; | 350 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 351 | 21.7M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); | 352 | 21.7M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); | 353 | | /* We swap real and imag because we're using an FFT instead of an IFFT. */ | 354 | 21.7M | re = yp1[1]; | 355 | 21.7M | im = yp1[0]; | 356 | 21.7M | yp0[0] = yr; | 357 | 21.7M | yp1[1] = yi; | 358 | | | 359 | 21.7M | t0 = t[(N4-i-1)]; | 360 | 21.7M | t1 = t[(N2-i-1)]; | 361 | | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 362 | 21.7M | yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); | 363 | 21.7M | yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); | 364 | 21.7M | yp1[0] = yr; | 365 | 21.7M | yp0[1] = yi; | 366 | 21.7M | yp0 += 2; | 367 | 21.7M | yp1 -= 2; | 368 | 21.7M | } | 369 | 290k | } | 370 | | | 371 | | /* Mirror on both sides for TDAC */ | 372 | 290k | { | 373 | 290k | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; | 374 | 290k | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 375 | 290k | const celt_coef * OPUS_RESTRICT wp1 = window; | 376 | 290k | const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; | 377 | | | 378 | 19.6M | for(i = 0; i < overlap/2; i++) | 379 | 19.3M | { | 380 | 19.3M | kiss_fft_scalar x1, x2; | 381 | 19.3M | x1 = *xp1; | 382 | 19.3M | x2 = *yp1; | 383 | 19.3M | *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1)); | 384 | 19.3M | *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2)); | 385 | 19.3M | wp1++; | 386 | 19.3M | wp2--; | 387 | 19.3M | } | 388 | 290k | } | 389 | 290k | } |
|
390 | | #endif /* OVERRIDE_clt_mdct_backward */ |