/src/libxaac/encoder/iusace_fft.c
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * * |
3 | | * Copyright (C) 2023 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | |
21 | | #include <string.h> |
22 | | #include "ixheaac_type_def.h" |
23 | | #include "ixheaace_adjust_threshold_data.h" |
24 | | #include "iusace_cnst.h" |
25 | | #include "iusace_block_switch_const.h" |
26 | | #include "iusace_rom.h" |
27 | | #include "iusace_bitbuffer.h" |
28 | | |
29 | | /* DRC */ |
30 | | #include "impd_drc_common_enc.h" |
31 | | #include "impd_drc_uni_drc.h" |
32 | | #include "impd_drc_tables.h" |
33 | | #include "impd_drc_api.h" |
34 | | #include "impd_drc_uni_drc_eq.h" |
35 | | #include "impd_drc_uni_drc_filter_bank.h" |
36 | | #include "impd_drc_gain_enc.h" |
37 | | #include "impd_drc_struct_def.h" |
38 | | |
39 | | #include "iusace_tns_usac.h" |
40 | | #include "iusace_psy_mod.h" |
41 | | #include "iusace_config.h" |
42 | | #include "iusace_fft.h" |
43 | | #include "iusace_basic_ops_flt.h" |
44 | | #include "ixheaac_constants.h" |
45 | | #include "ixheaace_aac_constants.h" |
46 | | #include "ixheaac_basic_ops32.h" |
47 | | #include "ixheaace_common_utils.h" |
48 | | #include "ixheaac_error_standards.h" |
49 | | #include "ixheaace_error_codes.h" |
50 | | |
/*
 * DIG_REV(i, m, j)
 *
 * Takes the unsigned value of (i), reverses the order of its 2-bit groups
 * inside each 16-bit half (the two halves themselves are not exchanged),
 * shifts the result right by (m) bits and stores it in (j).
 * (i) is evaluated once; (j) must be an assignable expression.
 * The masks assume a 32-bit unsigned int.
 */
#define DIG_REV(i, m, j)                                            \
  do {                                                              \
    unsigned dig_rev_val_ = (i);                                    \
    /* exchange the two 2-bit groups of every nibble */             \
    dig_rev_val_ = ((dig_rev_val_ & 0x33333333u) << 2) |            \
                   ((dig_rev_val_ & 0xCCCCCCCCu) >> 2);             \
    /* exchange the two nibbles of every byte */                    \
    dig_rev_val_ = ((dig_rev_val_ & 0x0F0F0F0Fu) << 4) |            \
                   ((dig_rev_val_ & 0xF0F0F0F0u) >> 4);             \
    /* exchange the two bytes of every 16-bit half */               \
    dig_rev_val_ = ((dig_rev_val_ & 0x00FF00FFu) << 8) |            \
                   ((dig_rev_val_ & 0xFF00FF00u) >> 8);             \
    (j) = dig_rev_val_ >> (m);                                      \
  } while (0)
59 | | |
60 | 56.4M | static PLATFORM_INLINE WORD8 iusace_calc_norm(WORD32 a) { |
61 | 56.4M | WORD8 norm_val; |
62 | | |
63 | 56.4M | if (a == 0) { |
64 | 0 | norm_val = 31; |
65 | 56.4M | } else { |
66 | 56.4M | if (a == (WORD32)0xffffffffL) { |
67 | 0 | norm_val = 31; |
68 | 56.4M | } else { |
69 | 56.4M | if (a < 0) { |
70 | 0 | a = ~a; |
71 | 0 | } |
72 | 1.46G | for (norm_val = 0; a < (WORD32)0x40000000L; norm_val++) { |
73 | 1.41G | a <<= 1; |
74 | 1.41G | } |
75 | 56.4M | } |
76 | 56.4M | } |
77 | | |
78 | 56.4M | return norm_val; |
79 | 56.4M | } |
80 | | |
81 | 167M | static PLATFORM_INLINE VOID iusace_complex_3point_fft(FLOAT32 *ptr_in, FLOAT32 *ptr_out) { |
82 | 167M | FLOAT32 add_r, sub_r; |
83 | 167M | FLOAT32 add_i, sub_i; |
84 | 167M | FLOAT32 x01r, x01i, temp; |
85 | 167M | FLOAT32 p1, p2, p3, p4; |
86 | 167M | FLOAT64 sinmu; |
87 | | |
88 | 167M | sinmu = 0.866025403784439; |
89 | | |
90 | 167M | x01r = ptr_in[0] + ptr_in[2]; |
91 | 167M | x01i = ptr_in[1] + ptr_in[3]; |
92 | | |
93 | 167M | add_r = ptr_in[2] + ptr_in[4]; |
94 | 167M | add_i = ptr_in[3] + ptr_in[5]; |
95 | | |
96 | 167M | sub_r = ptr_in[2] - ptr_in[4]; |
97 | 167M | sub_i = ptr_in[3] - ptr_in[5]; |
98 | | |
99 | 167M | p1 = add_r / (FLOAT32)2.0; |
100 | 167M | p4 = add_i / (FLOAT32)2.0; |
101 | 167M | p2 = (FLOAT32)((FLOAT64)sub_i * sinmu); |
102 | 167M | p3 = (FLOAT32)((FLOAT64)sub_r * sinmu); |
103 | | |
104 | 167M | temp = ptr_in[0] - p1; |
105 | | |
106 | 167M | ptr_out[0] = x01r + ptr_in[4]; |
107 | 167M | ptr_out[1] = x01i + ptr_in[5]; |
108 | 167M | ptr_out[2] = temp + p2; |
109 | 167M | ptr_out[3] = (ptr_in[1] - p3) - p4; |
110 | 167M | ptr_out[4] = temp - p2; |
111 | 167M | ptr_out[5] = (ptr_in[1] + p3) - p4; |
112 | | |
113 | 167M | return; |
114 | 167M | } |
115 | | |
116 | 28.2M | VOID iusace_complex_fft_p2(FLOAT32 *ptr_x, WORD32 nlength, FLOAT32 *scratch_fft_p2_y) { |
117 | 28.2M | WORD32 i, j, k, n_stages, h2; |
118 | 28.2M | FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
119 | 28.2M | FLOAT32 tmp; |
120 | 28.2M | WORD32 del, nodespacing, in_loop_cnt; |
121 | 28.2M | WORD32 not_power_4; |
122 | 28.2M | WORD32 dig_rev_shift; |
123 | 28.2M | FLOAT32 *y = scratch_fft_p2_y; |
124 | 28.2M | WORD32 mpass = nlength; |
125 | 28.2M | WORD32 npoints = nlength; |
126 | 28.2M | FLOAT32 *ptr_y = y; |
127 | 28.2M | const FLOAT64 *ptr_w; |
128 | | |
129 | 28.2M | dig_rev_shift = iusace_calc_norm(mpass) + 1 - 16; |
130 | 28.2M | n_stages = 30 - iusace_calc_norm(mpass); |
131 | 28.2M | not_power_4 = n_stages & 1; |
132 | | |
133 | 28.2M | n_stages = n_stages >> 1; |
134 | | |
135 | 28.2M | ptr_w = iusace_twiddle_table_fft_32x32; |
136 | | |
137 | 28.2M | if (dig_rev_shift < 0) { |
138 | 0 | dig_rev_shift = 0; |
139 | 0 | } |
140 | | |
141 | 556M | for (i = 0; i < npoints; i += 4) { |
142 | 527M | FLOAT32 *inp = ptr_x; |
143 | 527M | FLOAT32 tmk; |
144 | | |
145 | 527M | DIG_REV(i, dig_rev_shift, h2); |
146 | 527M | if (not_power_4) { |
147 | 240M | h2 += 1; |
148 | 240M | h2 &= ~1; |
149 | 240M | } |
150 | 527M | inp += (h2); |
151 | | |
152 | 527M | x0r = *inp; |
153 | 527M | x0i = *(inp + 1); |
154 | 527M | inp += (npoints >> 1); |
155 | | |
156 | 527M | x1r = *inp; |
157 | 527M | x1i = *(inp + 1); |
158 | 527M | inp += (npoints >> 1); |
159 | | |
160 | 527M | x2r = *inp; |
161 | 527M | x2i = *(inp + 1); |
162 | 527M | inp += (npoints >> 1); |
163 | | |
164 | 527M | x3r = *inp; |
165 | 527M | x3i = *(inp + 1); |
166 | | |
167 | 527M | x0r = x0r + x2r; |
168 | 527M | x0i = x0i + x2i; |
169 | | |
170 | 527M | tmk = x0r - x2r; |
171 | 527M | x2r = tmk - x2r; |
172 | 527M | tmk = x0i - x2i; |
173 | 527M | x2i = tmk - x2i; |
174 | | |
175 | 527M | x1r = x1r + x3r; |
176 | 527M | x1i = x1i + x3i; |
177 | | |
178 | 527M | tmk = x1r - x3r; |
179 | 527M | x3r = tmk - x3r; |
180 | 527M | tmk = x1i - x3i; |
181 | 527M | x3i = tmk - x3i; |
182 | | |
183 | 527M | x0r = x0r + x1r; |
184 | 527M | x0i = x0i + x1i; |
185 | | |
186 | 527M | tmk = x0r - x1r; |
187 | 527M | x1r = tmk - x1r; |
188 | 527M | tmk = x0i - x1i; |
189 | 527M | x1i = tmk - x1i; |
190 | | |
191 | 527M | x2r = x2r + x3i; |
192 | 527M | x2i = x2i - x3r; |
193 | | |
194 | 527M | tmk = x2r - x3i; |
195 | 527M | x3i = tmk - x3i; |
196 | 527M | tmk = x2i + x3r; |
197 | 527M | x3r = tmk + x3r; |
198 | | |
199 | 527M | *ptr_y++ = x0r; |
200 | 527M | *ptr_y++ = x0i; |
201 | 527M | *ptr_y++ = x2r; |
202 | 527M | *ptr_y++ = x2i; |
203 | 527M | *ptr_y++ = x1r; |
204 | 527M | *ptr_y++ = x1i; |
205 | 527M | *ptr_y++ = x3i; |
206 | 527M | *ptr_y++ = x3r; |
207 | 527M | } |
208 | 28.2M | ptr_y -= 2 * npoints; |
209 | 28.2M | del = 4; |
210 | 28.2M | nodespacing = 64; |
211 | 28.2M | in_loop_cnt = npoints >> 4; |
212 | 64.2M | for (i = n_stages - 1; i > 0; i--) { |
213 | 35.9M | const FLOAT64 *twiddles = ptr_w; |
214 | 35.9M | FLOAT32 *data = ptr_y; |
215 | 35.9M | FLOAT64 w_1, w_2, w_3, w_4, w_5, w_6; |
216 | 35.9M | WORD32 sec_loop_cnt; |
217 | | |
218 | 198M | for (k = in_loop_cnt; k != 0; k--) { |
219 | 162M | x0r = (*data); |
220 | 162M | x0i = (*(data + 1)); |
221 | 162M | data += ((SIZE_T)del << 1); |
222 | | |
223 | 162M | x1r = (*data); |
224 | 162M | x1i = (*(data + 1)); |
225 | 162M | data += ((SIZE_T)del << 1); |
226 | | |
227 | 162M | x2r = (*data); |
228 | 162M | x2i = (*(data + 1)); |
229 | 162M | data += ((SIZE_T)del << 1); |
230 | | |
231 | 162M | x3r = (*data); |
232 | 162M | x3i = (*(data + 1)); |
233 | 162M | data -= 3 * (del << 1); |
234 | | |
235 | 162M | x0r = x0r + x2r; |
236 | 162M | x0i = x0i + x2i; |
237 | 162M | x2r = x0r - (x2r * 2); |
238 | 162M | x2i = x0i - (x2i * 2); |
239 | 162M | x1r = x1r + x3r; |
240 | 162M | x1i = x1i + x3i; |
241 | 162M | x3r = x1r - (x3r * 2); |
242 | 162M | x3i = x1i - (x3i * 2); |
243 | | |
244 | 162M | x0r = x0r + x1r; |
245 | 162M | x0i = x0i + x1i; |
246 | 162M | x1r = x0r - (x1r * 2); |
247 | 162M | x1i = x0i - (x1i * 2); |
248 | 162M | x2r = x2r + x3i; |
249 | 162M | x2i = x2i - x3r; |
250 | 162M | x3i = x2r - (x3i * 2); |
251 | 162M | x3r = x2i + (x3r * 2); |
252 | | |
253 | 162M | *data = x0r; |
254 | 162M | *(data + 1) = x0i; |
255 | 162M | data += ((SIZE_T)del << 1); |
256 | | |
257 | 162M | *data = x2r; |
258 | 162M | *(data + 1) = x2i; |
259 | 162M | data += ((SIZE_T)del << 1); |
260 | | |
261 | 162M | *data = x1r; |
262 | 162M | *(data + 1) = x1i; |
263 | 162M | data += ((SIZE_T)del << 1); |
264 | | |
265 | 162M | *data = x3i; |
266 | 162M | *(data + 1) = x3r; |
267 | 162M | data += ((SIZE_T)del << 1); |
268 | 162M | } |
269 | 35.9M | data = ptr_y + 2; |
270 | | |
271 | 35.9M | sec_loop_cnt = (nodespacing * del); |
272 | 35.9M | sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) + |
273 | 35.9M | (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) - |
274 | 35.9M | (sec_loop_cnt / 256); |
275 | | |
276 | 192M | for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) { |
277 | 156M | w_1 = *(twiddles + j); |
278 | 156M | w_4 = *(twiddles + j + 257); |
279 | 156M | w_2 = *(twiddles + ((SIZE_T)j << 1)); |
280 | 156M | w_5 = *(twiddles + ((SIZE_T)j << 1) + 257); |
281 | 156M | w_3 = *(twiddles + j + ((SIZE_T)j << 1)); |
282 | 156M | w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257); |
283 | | |
284 | 555M | for (k = in_loop_cnt; k != 0; k--) { |
285 | 398M | data += ((SIZE_T)del << 1); |
286 | | |
287 | 398M | x1r = *data; |
288 | 398M | x1i = *(data + 1); |
289 | 398M | data += ((SIZE_T)del << 1); |
290 | | |
291 | 398M | x2r = *data; |
292 | 398M | x2i = *(data + 1); |
293 | 398M | data += ((SIZE_T)del << 1); |
294 | | |
295 | 398M | x3r = *data; |
296 | 398M | x3i = *(data + 1); |
297 | 398M | data -= 3 * (del << 1); |
298 | | |
299 | 398M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4)); |
300 | 398M | x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1); |
301 | 398M | x1r = tmp; |
302 | | |
303 | 398M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5)); |
304 | 398M | x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2); |
305 | 398M | x2r = tmp; |
306 | | |
307 | 398M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_3) - ixheaace_dmult((FLOAT64)x3i, w_6)); |
308 | 398M | x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3); |
309 | 398M | x3r = tmp; |
310 | | |
311 | 398M | x0r = (*data); |
312 | 398M | x0i = (*(data + 1)); |
313 | | |
314 | 398M | x0r = x0r + (x2r); |
315 | 398M | x0i = x0i + (x2i); |
316 | 398M | x2r = x0r - (x2r * 2); |
317 | 398M | x2i = x0i - (x2i * 2); |
318 | 398M | x1r = x1r + x3r; |
319 | 398M | x1i = x1i + x3i; |
320 | 398M | x3r = x1r - (x3r * 2); |
321 | 398M | x3i = x1i - (x3i * 2); |
322 | | |
323 | 398M | x0r = x0r + (x1r); |
324 | 398M | x0i = x0i + (x1i); |
325 | 398M | x1r = x0r - (x1r * 2); |
326 | 398M | x1i = x0i - (x1i * 2); |
327 | 398M | x2r = x2r + (x3i); |
328 | 398M | x2i = x2i - (x3r); |
329 | 398M | x3i = x2r - (x3i * 2); |
330 | 398M | x3r = x2i + (x3r * 2); |
331 | | |
332 | 398M | *data = x0r; |
333 | 398M | *(data + 1) = x0i; |
334 | 398M | data += ((SIZE_T)del << 1); |
335 | | |
336 | 398M | *data = x2r; |
337 | 398M | *(data + 1) = x2i; |
338 | 398M | data += ((SIZE_T)del << 1); |
339 | | |
340 | 398M | *data = x1r; |
341 | 398M | *(data + 1) = x1i; |
342 | 398M | data += ((SIZE_T)del << 1); |
343 | | |
344 | 398M | *data = x3i; |
345 | 398M | *(data + 1) = x3r; |
346 | 398M | data += ((SIZE_T)del << 1); |
347 | 398M | } |
348 | 156M | data -= 2 * npoints; |
349 | 156M | data += 2; |
350 | 156M | } |
351 | 132M | for (; j <= (nodespacing * del) >> 1; j += nodespacing) { |
352 | 96.3M | w_1 = *(twiddles + j); |
353 | 96.3M | w_4 = *(twiddles + j + 257); |
354 | 96.3M | w_2 = *(twiddles + ((SIZE_T)j << 1)); |
355 | 96.3M | w_5 = *(twiddles + ((SIZE_T)j << 1) + 257); |
356 | 96.3M | w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256); |
357 | 96.3M | w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1); |
358 | | |
359 | 376M | for (k = in_loop_cnt; k != 0; k--) { |
360 | 280M | data += ((SIZE_T)del << 1); |
361 | | |
362 | 280M | x1r = *data; |
363 | 280M | x1i = *(data + 1); |
364 | 280M | data += ((SIZE_T)del << 1); |
365 | | |
366 | 280M | x2r = *data; |
367 | 280M | x2i = *(data + 1); |
368 | 280M | data += ((SIZE_T)del << 1); |
369 | | |
370 | 280M | x3r = *data; |
371 | 280M | x3i = *(data + 1); |
372 | 280M | data -= 3 * (del << 1); |
373 | | |
374 | 280M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4)); |
375 | 280M | x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1); |
376 | 280M | x1r = tmp; |
377 | | |
378 | 280M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5)); |
379 | 280M | x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2); |
380 | 280M | x2r = tmp; |
381 | | |
382 | 280M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3)); |
383 | 280M | x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6)); |
384 | 280M | x3r = tmp; |
385 | | |
386 | 280M | x0r = (*data); |
387 | 280M | x0i = (*(data + 1)); |
388 | | |
389 | 280M | x0r = x0r + (x2r); |
390 | 280M | x0i = x0i + (x2i); |
391 | 280M | x2r = x0r - (x2r * 2); |
392 | 280M | x2i = x0i - (x2i * 2); |
393 | 280M | x1r = x1r + x3r; |
394 | 280M | x1i = x1i + x3i; |
395 | 280M | x3r = x1r - (x3r * 2); |
396 | 280M | x3i = x1i - (x3i * 2); |
397 | | |
398 | 280M | x0r = x0r + (x1r); |
399 | 280M | x0i = x0i + (x1i); |
400 | 280M | x1r = x0r - (x1r * 2); |
401 | 280M | x1i = x0i - (x1i * 2); |
402 | 280M | x2r = x2r + (x3i); |
403 | 280M | x2i = x2i - (x3r); |
404 | 280M | x3i = x2r - (x3i * 2); |
405 | 280M | x3r = x2i + (x3r * 2); |
406 | | |
407 | 280M | *data = x0r; |
408 | 280M | *(data + 1) = x0i; |
409 | 280M | data += ((SIZE_T)del << 1); |
410 | | |
411 | 280M | *data = x2r; |
412 | 280M | *(data + 1) = x2i; |
413 | 280M | data += ((SIZE_T)del << 1); |
414 | | |
415 | 280M | *data = x1r; |
416 | 280M | *(data + 1) = x1i; |
417 | 280M | data += ((SIZE_T)del << 1); |
418 | | |
419 | 280M | *data = x3i; |
420 | 280M | *(data + 1) = x3r; |
421 | 280M | data += ((SIZE_T)del << 1); |
422 | 280M | } |
423 | 96.3M | data -= 2 * npoints; |
424 | 96.3M | data += 2; |
425 | 96.3M | } |
426 | 96.3M | for (; j <= sec_loop_cnt * 2; j += nodespacing) { |
427 | 60.3M | w_1 = *(twiddles + j); |
428 | 60.3M | w_4 = *(twiddles + j + 257); |
429 | 60.3M | w_2 = *(twiddles + ((SIZE_T)j << 1) - 256); |
430 | 60.3M | w_5 = *(twiddles + ((SIZE_T)j << 1) + 1); |
431 | 60.3M | w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256); |
432 | 60.3M | w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1); |
433 | | |
434 | 178M | for (k = in_loop_cnt; k != 0; k--) { |
435 | 118M | data += ((SIZE_T)del << 1); |
436 | | |
437 | 118M | x1r = *data; |
438 | 118M | x1i = *(data + 1); |
439 | 118M | data += ((SIZE_T)del << 1); |
440 | | |
441 | 118M | x2r = *data; |
442 | 118M | x2i = *(data + 1); |
443 | 118M | data += ((SIZE_T)del << 1); |
444 | | |
445 | 118M | x3r = *data; |
446 | 118M | x3i = *(data + 1); |
447 | 118M | data -= 3 * (del << 1); |
448 | | |
449 | 118M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4)); |
450 | 118M | x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult(x1r, w_4), x1i, w_1); |
451 | 118M | x1r = tmp; |
452 | | |
453 | 118M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2)); |
454 | 118M | x2i = (FLOAT32)(-ixheaace_dmult(x2r, w_2) + ixheaace_dmult(x2i, w_5)); |
455 | 118M | x2r = tmp; |
456 | | |
457 | 118M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3)); |
458 | 118M | x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6)); |
459 | 118M | x3r = tmp; |
460 | | |
461 | 118M | x0r = (*data); |
462 | 118M | x0i = (*(data + 1)); |
463 | | |
464 | 118M | x0r = x0r + (x2r); |
465 | 118M | x0i = x0i + (x2i); |
466 | 118M | x2r = x0r - (x2r * 2); |
467 | 118M | x2i = x0i - (x2i * 2); |
468 | 118M | x1r = x1r + x3r; |
469 | 118M | x1i = x1i + x3i; |
470 | 118M | x3r = x1r - (x3r * 2); |
471 | 118M | x3i = x1i - (x3i * 2); |
472 | | |
473 | 118M | x0r = x0r + (x1r); |
474 | 118M | x0i = x0i + (x1i); |
475 | 118M | x1r = x0r - (x1r * 2); |
476 | 118M | x1i = x0i - (x1i * 2); |
477 | 118M | x2r = x2r + (x3i); |
478 | 118M | x2i = x2i - (x3r); |
479 | 118M | x3i = x2r - (x3i * 2); |
480 | 118M | x3r = x2i + (x3r * 2); |
481 | | |
482 | 118M | *data = x0r; |
483 | 118M | *(data + 1) = x0i; |
484 | 118M | data += ((SIZE_T)del << 1); |
485 | | |
486 | 118M | *data = x2r; |
487 | 118M | *(data + 1) = x2i; |
488 | 118M | data += ((SIZE_T)del << 1); |
489 | | |
490 | 118M | *data = x1r; |
491 | 118M | *(data + 1) = x1i; |
492 | 118M | data += ((SIZE_T)del << 1); |
493 | | |
494 | 118M | *data = x3i; |
495 | 118M | *(data + 1) = x3r; |
496 | 118M | data += ((SIZE_T)del << 1); |
497 | 118M | } |
498 | 60.3M | data -= 2 * npoints; |
499 | 60.3M | data += 2; |
500 | 60.3M | } |
501 | 192M | for (; j < nodespacing * del; j += nodespacing) { |
502 | 156M | w_1 = *(twiddles + j); |
503 | 156M | w_4 = *(twiddles + j + 257); |
504 | 156M | w_2 = *(twiddles + ((SIZE_T)j << 1) - 256); |
505 | 156M | w_5 = *(twiddles + ((SIZE_T)j << 1) + 1); |
506 | 156M | w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512); |
507 | 156M | w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257); |
508 | | |
509 | 555M | for (k = in_loop_cnt; k != 0; k--) { |
510 | 398M | data += ((SIZE_T)del << 1); |
511 | | |
512 | 398M | x1r = *data; |
513 | 398M | x1i = *(data + 1); |
514 | 398M | data += ((SIZE_T)del << 1); |
515 | | |
516 | 398M | x2r = *data; |
517 | 398M | x2i = *(data + 1); |
518 | 398M | data += ((SIZE_T)del << 1); |
519 | | |
520 | 398M | x3r = *data; |
521 | 398M | x3i = *(data + 1); |
522 | 398M | data -= 3 * ((SIZE_T)del << 1); |
523 | | |
524 | 398M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4)); |
525 | 398M | x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1); |
526 | 398M | x1r = tmp; |
527 | | |
528 | 398M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2)); |
529 | 398M | x2i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x2r, w_2) + ixheaace_dmult((FLOAT64)x2i, w_5)); |
530 | 398M | x2r = tmp; |
531 | | |
532 | 398M | tmp = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6)); |
533 | 398M | x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3); |
534 | 398M | x3r = tmp; |
535 | | |
536 | 398M | x0r = (*data); |
537 | 398M | x0i = (*(data + 1)); |
538 | | |
539 | 398M | x0r = x0r + (x2r); |
540 | 398M | x0i = x0i + (x2i); |
541 | 398M | x2r = x0r - (x2r * 2); |
542 | 398M | x2i = x0i - (x2i * 2); |
543 | 398M | x1r = x1r + x3r; |
544 | 398M | x1i = x1i - x3i; |
545 | 398M | x3r = x1r - (x3r * 2); |
546 | 398M | x3i = x1i + (x3i * 2); |
547 | | |
548 | 398M | x0r = x0r + (x1r); |
549 | 398M | x0i = x0i + (x1i); |
550 | 398M | x1r = x0r - (x1r * 2); |
551 | 398M | x1i = x0i - (x1i * 2); |
552 | 398M | x2r = x2r + (x3i); |
553 | 398M | x2i = x2i - (x3r); |
554 | 398M | x3i = x2r - (x3i * 2); |
555 | 398M | x3r = x2i + (x3r * 2); |
556 | | |
557 | 398M | *data = x0r; |
558 | 398M | *(data + 1) = x0i; |
559 | 398M | data += ((SIZE_T)del << 1); |
560 | | |
561 | 398M | *data = x2r; |
562 | 398M | *(data + 1) = x2i; |
563 | 398M | data += ((SIZE_T)del << 1); |
564 | | |
565 | 398M | *data = x1r; |
566 | 398M | *(data + 1) = x1i; |
567 | 398M | data += ((SIZE_T)del << 1); |
568 | | |
569 | 398M | *data = x3i; |
570 | 398M | *(data + 1) = x3r; |
571 | 398M | data += ((SIZE_T)del << 1); |
572 | 398M | } |
573 | 156M | data -= 2 * npoints; |
574 | 156M | data += 2; |
575 | 156M | } |
576 | 35.9M | nodespacing >>= 2; |
577 | 35.9M | del <<= 2; |
578 | 35.9M | in_loop_cnt >>= 2; |
579 | 35.9M | } |
580 | 28.2M | if (not_power_4) { |
581 | 13.0M | const FLOAT64 *twiddles = ptr_w; |
582 | 13.0M | nodespacing <<= 1; |
583 | | |
584 | 253M | for (j = del / 2; j != 0; j--) { |
585 | 240M | FLOAT64 w_1 = *twiddles; |
586 | 240M | FLOAT64 w_4 = *(twiddles + 257); |
587 | 240M | twiddles += nodespacing; |
588 | | |
589 | 240M | x0r = *ptr_y; |
590 | 240M | x0i = *(ptr_y + 1); |
591 | 240M | ptr_y += ((SIZE_T)del << 1); |
592 | | |
593 | 240M | x1r = *ptr_y; |
594 | 240M | x1i = *(ptr_y + 1); |
595 | | |
596 | 240M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4)); |
597 | 240M | x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1); |
598 | 240M | x1r = tmp; |
599 | | |
600 | 240M | *ptr_y = (x0r) - (x1r); |
601 | 240M | *(ptr_y + 1) = (x0i) - (x1i); |
602 | 240M | ptr_y -= ((SIZE_T)del << 1); |
603 | | |
604 | 240M | *ptr_y = (x0r) + (x1r); |
605 | 240M | *(ptr_y + 1) = (x0i) + (x1i); |
606 | 240M | ptr_y += 2; |
607 | 240M | } |
608 | 13.0M | twiddles = ptr_w; |
609 | 253M | for (j = del / 2; j != 0; j--) { |
610 | 240M | FLOAT64 w_1 = *twiddles; |
611 | 240M | FLOAT64 w_4 = *(twiddles + 257); |
612 | 240M | twiddles += nodespacing; |
613 | | |
614 | 240M | x0r = *ptr_y; |
615 | 240M | x0i = *(ptr_y + 1); |
616 | 240M | ptr_y += ((SIZE_T)del << 1); |
617 | | |
618 | 240M | x1r = *ptr_y; |
619 | 240M | x1i = *(ptr_y + 1); |
620 | | |
621 | 240M | tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_4) + ixheaace_dmult((FLOAT64)x1i, w_1)); |
622 | 240M | x1i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x1r, w_1) + ixheaace_dmult((FLOAT64)x1i, w_4)); |
623 | 240M | x1r = tmp; |
624 | | |
625 | 240M | *ptr_y = (x0r) - (x1r); |
626 | 240M | *(ptr_y + 1) = (x0i) - (x1i); |
627 | 240M | ptr_y -= ((SIZE_T)del << 1); |
628 | | |
629 | 240M | *ptr_y = (x0r) + (x1r); |
630 | 240M | *(ptr_y + 1) = (x0i) + (x1i); |
631 | 240M | ptr_y += 2; |
632 | 240M | } |
633 | 13.0M | } |
634 | | |
635 | 2.13G | for (i = 0; i < nlength; i++) { |
636 | 2.11G | *(ptr_x + 2 * i) = y[2 * i]; |
637 | 2.11G | *(ptr_x + 2 * i + 1) = y[2 * i + 1]; |
638 | 2.11G | } |
639 | 28.2M | } |
640 | | |
641 | | static VOID iusace_complex_fft_p3(FLOAT32 *data, WORD32 nlength, |
642 | 4.96M | iusace_scratch_mem *pstr_scratch) { |
643 | 4.96M | WORD32 i, j; |
644 | 4.96M | FLOAT32 *data_3 = pstr_scratch->p_fft_p3_data_3; |
645 | 4.96M | FLOAT32 *y = pstr_scratch->p_fft_p3_y; |
646 | 4.96M | WORD32 cnfac; |
647 | 4.96M | WORD32 mpass = nlength; |
648 | 4.96M | FLOAT32 *ptr_x = data; |
649 | 4.96M | FLOAT32 *ptr_y = y; |
650 | | |
651 | 4.96M | cnfac = 0; |
652 | 9.93M | while (mpass % 3 == 0) { |
653 | 4.96M | mpass /= 3; |
654 | 4.96M | cnfac++; |
655 | 4.96M | } |
656 | | |
657 | 19.8M | for (i = 0; i < 3 * cnfac; i++) { |
658 | 518M | for (j = 0; j < mpass; j++) { |
659 | 503M | data_3[2 * j] = data[3 * (2 * j) + (2 * i)]; |
660 | 503M | data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)]; |
661 | 503M | } |
662 | 14.8M | iusace_complex_fft_p2(data_3, mpass, pstr_scratch->p_fft_p2_y); |
663 | | |
664 | 518M | for (j = 0; j < mpass; j++) { |
665 | 503M | data[3 * (2 * j) + (2 * i)] = data_3[2 * j]; |
666 | 503M | data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1]; |
667 | 503M | } |
668 | 14.8M | } |
669 | | |
670 | 4.96M | { |
671 | 4.96M | const FLOAT64 *w1r, *w1i; |
672 | 4.96M | FLOAT32 tmp; |
673 | 4.96M | w1r = iusace_twiddle_table_3pr; |
674 | 4.96M | w1i = iusace_twiddle_table_3pi; |
675 | | |
676 | 172M | for (i = 0; i < nlength; i += 3) { |
677 | 167M | tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i)); |
678 | 167M | data[2 * i + 1] = |
679 | 167M | (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r)); |
680 | 167M | data[2 * i] = tmp; |
681 | | |
682 | 167M | w1r++; |
683 | 167M | w1i++; |
684 | | |
685 | 167M | tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) - |
686 | 167M | (FLOAT64)data[2 * (i + 1) + 1] * (*w1i)); |
687 | 167M | data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) + |
688 | 167M | (FLOAT64)data[2 * (i + 1) + 1] * (*w1r)); |
689 | 167M | data[2 * (i + 1)] = tmp; |
690 | | |
691 | 167M | w1r++; |
692 | 167M | w1i++; |
693 | | |
694 | 167M | tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) - |
695 | 167M | (FLOAT64)data[2 * (i + 2) + 1] * (*w1i)); |
696 | 167M | data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) + |
697 | 167M | (FLOAT64)data[2 * (i + 2) + 1] * (*w1r)); |
698 | 167M | data[2 * (i + 2)] = tmp; |
699 | | |
700 | 167M | w1r += 3 * (128 / mpass - 1) + 1; |
701 | 167M | w1i += 3 * (128 / mpass - 1) + 1; |
702 | 167M | } |
703 | 4.96M | } |
704 | | |
705 | 172M | for (i = 0; i < mpass; i++) { |
706 | 167M | iusace_complex_3point_fft(ptr_x, ptr_y); |
707 | | |
708 | 167M | ptr_x = ptr_x + 6; |
709 | 167M | ptr_y = ptr_y + 6; |
710 | 167M | } |
711 | | |
712 | 172M | for (i = 0; i < mpass; i++) { |
713 | 167M | data[2 * i] = y[6 * i]; |
714 | 167M | data[2 * i + 1] = y[6 * i + 1]; |
715 | 167M | } |
716 | | |
717 | 172M | for (i = 0; i < mpass; i++) { |
718 | 167M | data[2 * (i + mpass)] = y[6 * i + 2]; |
719 | 167M | data[2 * (i + mpass) + 1] = y[6 * i + 3]; |
720 | 167M | } |
721 | | |
722 | 172M | for (i = 0; i < mpass; i++) { |
723 | 167M | data[2 * (i + 2 * mpass)] = y[6 * i + 4]; |
724 | 167M | data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5]; |
725 | 167M | } |
726 | 4.96M | } |
727 | | |
728 | 0 | VOID iusace_complex_fft_p3_no_scratch(FLOAT32 *data, WORD32 nlength) { |
729 | 0 | WORD32 i, j; |
730 | |
|
731 | 0 | FLOAT32 data_3[800]; |
732 | 0 | FLOAT32 y[1024]; |
733 | 0 | FLOAT32 p_fft_p2_y[2048]; |
734 | 0 | WORD32 cnfac; |
735 | 0 | WORD32 mpass = nlength; |
736 | 0 | FLOAT32 *ptr_x = data; |
737 | 0 | FLOAT32 *ptr_y = y; |
738 | |
|
739 | 0 | cnfac = 0; |
740 | 0 | while (mpass % 3 == 0) { |
741 | 0 | mpass /= 3; |
742 | 0 | cnfac++; |
743 | 0 | } |
744 | |
|
745 | 0 | for (i = 0; i < 3 * cnfac; i++) { |
746 | 0 | for (j = 0; j < mpass; j++) { |
747 | 0 | data_3[2 * j] = data[3 * (2 * j) + (2 * i)]; |
748 | 0 | data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)]; |
749 | 0 | } |
750 | 0 | iusace_complex_fft_p2(data_3, mpass, p_fft_p2_y); |
751 | |
|
752 | 0 | for (j = 0; j < mpass; j++) { |
753 | 0 | data[3 * (2 * j) + (2 * i)] = data_3[2 * j]; |
754 | 0 | data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1]; |
755 | 0 | } |
756 | 0 | } |
757 | |
|
758 | 0 | { |
759 | 0 | const FLOAT64 *w1r, *w1i; |
760 | 0 | FLOAT32 tmp; |
761 | 0 | w1r = iusace_twiddle_table_3pr; |
762 | 0 | w1i = iusace_twiddle_table_3pi; |
763 | |
|
764 | 0 | for (i = 0; i < nlength; i += 3) { |
765 | 0 | tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i)); |
766 | 0 | data[2 * i + 1] = |
767 | 0 | (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r)); |
768 | 0 | data[2 * i] = tmp; |
769 | |
|
770 | 0 | w1r++; |
771 | 0 | w1i++; |
772 | |
|
773 | 0 | tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) - |
774 | 0 | (FLOAT64)data[2 * (i + 1) + 1] * (*w1i)); |
775 | 0 | data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) + |
776 | 0 | (FLOAT64)data[2 * (i + 1) + 1] * (*w1r)); |
777 | 0 | data[2 * (i + 1)] = tmp; |
778 | |
|
779 | 0 | w1r++; |
780 | 0 | w1i++; |
781 | |
|
782 | 0 | tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) - |
783 | 0 | (FLOAT64)data[2 * (i + 2) + 1] * (*w1i)); |
784 | 0 | data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) + |
785 | 0 | (FLOAT64)data[2 * (i + 2) + 1] * (*w1r)); |
786 | 0 | data[2 * (i + 2)] = tmp; |
787 | |
|
788 | 0 | w1r += 3 * (128 / mpass - 1) + 1; |
789 | 0 | w1i += 3 * (128 / mpass - 1) + 1; |
790 | 0 | } |
791 | 0 | } |
792 | |
|
793 | 0 | for (i = 0; i < mpass; i++) { |
794 | 0 | iusace_complex_3point_fft(ptr_x, ptr_y); |
795 | |
|
796 | 0 | ptr_x = ptr_x + 6; |
797 | 0 | ptr_y = ptr_y + 6; |
798 | 0 | } |
799 | |
|
800 | 0 | for (i = 0; i < mpass; i++) { |
801 | 0 | data[2 * i] = y[6 * i]; |
802 | 0 | data[2 * i + 1] = y[6 * i + 1]; |
803 | 0 | } |
804 | |
|
805 | 0 | for (i = 0; i < mpass; i++) { |
806 | 0 | data[2 * (i + mpass)] = y[6 * i + 2]; |
807 | 0 | data[2 * (i + mpass) + 1] = y[6 * i + 3]; |
808 | 0 | } |
809 | |
|
810 | 0 | for (i = 0; i < mpass; i++) { |
811 | 0 | data[2 * (i + 2 * mpass)] = y[6 * i + 4]; |
812 | 0 | data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5]; |
813 | 0 | } |
814 | 0 | } |
815 | | |
816 | | static VOID iusace_calc_pre_twid_enc(FLOAT64 *ptr_in, FLOAT32 *fft_ptr, WORD32 npoints, |
817 | | const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr, |
818 | 3.30M | const WORD32 tx_flag) { |
819 | 3.30M | WORD32 i, n; |
820 | 3.30M | WORD32 b = npoints >> 1; |
821 | 3.30M | WORD32 a = npoints - b; |
822 | 3.30M | WORD32 nlength = npoints >> 2; |
823 | 3.30M | FLOAT64 tempr, tempi; |
824 | | |
825 | 3.30M | if (tx_flag == 0) { |
826 | 1.65M | FLOAT64 norm; |
827 | 457M | for (i = 0; i < b; i++) { |
828 | 456M | norm = ptr_in[i]; /* reuse MDCT: spectrally reverse all bins */ |
829 | 456M | ptr_in[i] = ptr_in[npoints - 1 - i]; |
830 | 456M | ptr_in[npoints - 1 - i] = norm; |
831 | 456M | } |
832 | 1.65M | } |
833 | 459M | for (i = 0; i < nlength; i++) { |
834 | 456M | n = npoints / 2 - 1 - 2 * i; |
835 | 456M | if (i < b / 4) { |
836 | 228M | tempr = ptr_in[a / 2 + n] + ptr_in[npoints + a / 2 - 1 - n]; |
837 | 228M | } else { |
838 | 228M | tempr = ptr_in[a / 2 + n] - ptr_in[a / 2 - 1 - n]; |
839 | 228M | } |
840 | 456M | n = 2 * i; |
841 | 456M | if (i < a / 4) { |
842 | 228M | tempi = ptr_in[a / 2 + n] - ptr_in[a / 2 - 1 - n]; |
843 | 228M | } else { |
844 | 228M | tempi = ptr_in[a / 2 + n] + ptr_in[npoints + a / 2 - 1 - n]; |
845 | 228M | } |
846 | | |
847 | 456M | fft_ptr[2 * i] = (FLOAT32)(tempr * (*cos_ptr) + tempi * (*sin_ptr)); |
848 | 456M | fft_ptr[2 * i + 1] = (FLOAT32)(tempi * (*cos_ptr++) - tempr * (*sin_ptr++)); |
849 | 456M | } |
850 | 3.30M | } |
851 | | |
852 | 12.9M | VOID iusace_complex_fft(FLOAT32 *data, WORD32 nlength, iusace_scratch_mem *pstr_scratch) { |
853 | 12.9M | if (nlength & (nlength - 1)) { |
854 | 4.96M | iusace_complex_fft_p3(data, nlength, pstr_scratch); |
855 | 7.96M | } else { |
856 | 7.96M | iusace_complex_fft_p2(data, nlength, pstr_scratch->p_fft_p2_y); |
857 | 7.96M | } |
858 | 12.9M | } |
859 | | |
860 | | static VOID iusace_calc_post_twid_enc(FLOAT64 *ptr_out, FLOAT32 *fft_ptr, WORD32 npoints, |
861 | | const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr, |
862 | 3.30M | const WORD32 tx_flag) { |
863 | 3.30M | WORD32 i; |
864 | 3.30M | WORD32 nlength = npoints >> 2; |
865 | 3.30M | FLOAT64 tempr, tempi; |
866 | | |
867 | | /* post-twiddle FFT output and then get output data */ |
868 | 459M | for (i = 0; i < nlength; i++) { |
869 | 456M | tempr = |
870 | 456M | 2 * ((FLOAT64)(fft_ptr[2 * i]) * (*cos_ptr) + (FLOAT64)(fft_ptr[2 * i + 1]) * (*sin_ptr)); |
871 | 456M | tempi = 2 * ((FLOAT64)(fft_ptr[2 * i + 1]) * (*cos_ptr++) - |
872 | 456M | (FLOAT64)(fft_ptr[2 * i]) * (*sin_ptr++)); |
873 | | |
874 | 456M | ptr_out[2 * i] = -tempr; |
875 | 456M | ptr_out[npoints / 2 - 1 - 2 * i] = tempi; |
876 | 456M | ptr_out[npoints / 2 + 2 * i] = -tempi; |
877 | 456M | ptr_out[npoints - 1 - 2 * i] = tempr; |
878 | 456M | } |
879 | 3.30M | if (tx_flag == 0) { |
880 | 457M | for (i = 0; i < npoints; i += 2) { |
881 | 456M | ptr_out[i] *= -1; /* reuse MDCT: flip signs at odd indices */ |
882 | 456M | } |
883 | 1.65M | } |
884 | 3.30M | } |
885 | | |
886 | | IA_ERRORCODE iusace_fft_based_mdct(FLOAT64 *ptr_in, FLOAT64 *ptr_out, WORD32 npoints, |
887 | 3.30M | const WORD32 tx_flag, iusace_scratch_mem *pstr_scratch) { |
888 | 3.30M | FLOAT32 *ptr_scratch1 = pstr_scratch->p_fft_mdct_buf; |
889 | 3.30M | const FLOAT64 *cos_ptr = NULL; |
890 | 3.30M | const FLOAT64 *sin_ptr = NULL; |
891 | 3.30M | WORD32 nlength = npoints >> 1; |
892 | 3.30M | WORD32 n_total = npoints << 1; |
893 | | |
894 | 3.30M | memset(ptr_scratch1, 0, ((SIZE_T)n_total << 1) * sizeof(*ptr_scratch1)); |
895 | | |
896 | 3.30M | switch (npoints) { |
897 | 1.27M | case (96): |
898 | 1.27M | cos_ptr = iexheaac_pre_post_twid_cos_192; |
899 | 1.27M | sin_ptr = iexheaac_pre_post_twid_sin_192; |
900 | 1.27M | break; |
901 | 1.36M | case (128): |
902 | 1.36M | cos_ptr = iusace_pre_post_twid_cos_256; |
903 | 1.36M | sin_ptr = iusace_pre_post_twid_sin_256; |
904 | 1.36M | break; |
905 | 238k | case (768): |
906 | 238k | cos_ptr = iexheaac_pre_post_twid_cos_1536; |
907 | 238k | sin_ptr = iexheaac_pre_post_twid_sin_1536; |
908 | 238k | break; |
909 | 422k | case (1024): |
910 | 422k | cos_ptr = iusace_pre_post_twid_cos_2048; |
911 | 422k | sin_ptr = iusace_pre_post_twid_sin_2048; |
912 | 422k | break; |
913 | 0 | default: |
914 | 0 | return IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH; |
915 | 3.30M | } |
916 | | |
917 | | /* pre-twiddle */ |
918 | 3.30M | iusace_calc_pre_twid_enc(ptr_in, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag); |
919 | | |
920 | | /* complex FFT */ |
921 | 3.30M | iusace_complex_fft(ptr_scratch1, nlength, pstr_scratch); |
922 | | |
923 | | /* post-twiddle */ |
924 | 3.30M | iusace_calc_post_twid_enc(ptr_out, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag); |
925 | | |
926 | 3.30M | return IA_NO_ERROR; |
927 | 3.30M | } |
928 | | |
929 | 208k | VOID iusace_complex_fft_2048(FLOAT32 *ptr_x, FLOAT32 *scratch_fft) { |
930 | 208k | WORD32 i; |
931 | 208k | FLOAT32 re, im, c_v, s_v, tmp_re, tmp_im; |
932 | 208k | FLOAT32 *ptr_re, *ptr_im, *ptr_re_h, *ptr_im_h; |
933 | 208k | FLOAT32 *ptr_cos_val, *ptr_sin_val; |
934 | 208k | iusace_complex_fft_p2(ptr_x, 1024, scratch_fft); |
935 | 208k | iusace_complex_fft_p2(ptr_x + 2048, 1024, scratch_fft); |
936 | | |
937 | 208k | ptr_re = ptr_x; |
938 | 208k | ptr_im = ptr_x + 1; |
939 | 208k | ptr_re_h = ptr_x + 2048; |
940 | 208k | ptr_im_h = ptr_x + 2048 + 1; |
941 | 208k | ptr_cos_val = (FLOAT32 *)&iusace_twiddle_cos_2048[0]; |
942 | 208k | ptr_sin_val = (FLOAT32 *)&iusace_twiddle_sin_2048[0]; |
943 | 213M | for (i = 0; i < 1024; i++) { |
944 | 213M | re = *ptr_re_h; |
945 | 213M | im = *ptr_im_h; |
946 | 213M | c_v = ptr_cos_val[i]; |
947 | 213M | s_v = ptr_sin_val[i]; |
948 | 213M | tmp_re = (re * c_v) + (im * s_v); |
949 | 213M | tmp_im = -(re * s_v) + (im * c_v); |
950 | 213M | re = *ptr_re; |
951 | 213M | im = *ptr_im; |
952 | | |
953 | 213M | *ptr_re = re + tmp_re; |
954 | 213M | *ptr_im = im + tmp_im; |
955 | 213M | *ptr_re_h = re - tmp_re; |
956 | 213M | *ptr_im_h = im - tmp_im; |
957 | | |
958 | 213M | ptr_re += 2; |
959 | 213M | ptr_im += 2; |
960 | 213M | ptr_re_h += 2; |
961 | 213M | ptr_im_h += 2; |
962 | 213M | } |
963 | 208k | } |
/*
 * ixheaace_rad2_cplx_fft
 *
 * Complex FFT of n_points samples held as separate real and imaginary arrays;
 * the result overwrites ptr_real / ptr_imag.
 *
 * ptr_real, ptr_imag : n_points input values each, replaced by the output.
 * n_points           : transform length. The only caller visible in this file
 *                      passes 1024. NOTE(review): the code appears to assume a
 *                      power of two with 2 * n_points <= 2048 so the two scratch
 *                      regions below do not overlap - confirm.
 * ptr_scratch        : work area. Floats [0, 2 * n_points) receive the
 *                      interleaved copy of the input; floats
 *                      [2048, 2048 + 2 * n_points) hold the working/output data.
 *
 * Structure: a first radix-4 pass without twiddles whose input index comes from
 * DIG_REV, then (n_stages - 1) radix-4 passes with twiddles read from
 * ia_fft_twiddle_table_float, and finally one radix-2 pass when not_power_4 is
 * set.
 *
 * NOTE(review): ia_add_flt / ia_sub_flt / ia_mul_flt / ia_mac_flt / ia_msu_flt /
 * ia_negate_flt, ixheaac_norm32 and DIG_REV are defined outside this file. The
 * comments below assume ia_mac_flt(a, b, c) = a + b * c and
 * ia_msu_flt(a, b, c) = a - b * c - confirm against their definitions.
 */
964 | | static VOID ixheaace_rad2_cplx_fft(FLOAT32 *ptr_real, FLOAT32 *ptr_imag, WORD32 n_points, |
965 | 179k | FLOAT32 *ptr_scratch) { |
966 | 179k | WORD32 i, j, k, n_stages, h2; |
967 | 179k | FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |
968 | 179k | WORD32 del, nodespacing, in_loop_cnt; |
969 | 179k | WORD32 not_power_4; |
970 | 179k | WORD32 dig_rev_shift; |
971 | 179k | WORD32 m_points = n_points; |
972 | 179k | FLOAT32 *ptr_x = ptr_scratch; |
973 | 179k | FLOAT32 *y = ptr_scratch + 2048; |
974 | 179k | FLOAT32 *ptr_y = y; |
975 | 179k | const FLOAT32 *ptr_w; |
976 | |
/* NOTE(review): presumably ixheaac_norm32 returns the number of redundant
 * leading sign bits, making (30 - norm) equal to log2(n_points) for a power of
 * two - confirm. The low bit of that count selects the extra radix-2 pass; the
 * count halved is the number of radix-4 passes. */
977 | 179k | dig_rev_shift = ixheaac_norm32(m_points) + 1 - 16; |
978 | 179k | n_stages = 30 - ixheaac_norm32(m_points); |
979 | 179k | not_power_4 = n_stages & 1; |
980 | |
981 | 179k | n_stages = n_stages >> 1; |
982 | |
983 | 179k | ptr_w = ia_fft_twiddle_table_float; |
984 | |
/* Interleave the split input as (re, im) pairs at the start of the scratch. */
985 | 183M | for (i = 0; i < n_points; i++) { |
986 | 183M | ptr_x[2 * i] = ptr_real[i]; |
987 | 183M | ptr_x[2 * i + 1] = ptr_imag[i]; |
988 | 183M | } |
989 | 179k | dig_rev_shift = max(dig_rev_shift, 0); |
/* First pass: one 4-point butterfly per iteration, no twiddle multiplies.
 * The four inputs are read n_points >> 1 floats (n_points / 4 complex values)
 * apart starting at float offset h2; the outputs are written sequentially to y. */
990 | 46.0M | for (i = 0; i < n_points; i += 4) { |
991 | 45.8M | FLOAT32 *inp = ptr_x; |
992 | 45.8M | FLOAT32 tmk; |
993 | |
994 | 45.8M | DIG_REV(i, dig_rev_shift, h2); |
/* When an extra radix-2 pass follows, round h2 up to the next even value. */
995 | 45.8M | if (not_power_4) { |
996 | 0 | h2 += 1; |
997 | 0 | h2 &= ~1; |
998 | 0 | } |
999 | 45.8M | inp += (h2); |
1000 | |
1001 | 45.8M | x0r = *inp; |
1002 | 45.8M | x0i = *(inp + 1); |
1003 | 45.8M | inp += (n_points >> 1); |
1004 | |
1005 | 45.8M | x1r = *inp; |
1006 | 45.8M | x1i = *(inp + 1); |
1007 | 45.8M | inp += (n_points >> 1); |
1008 | |
1009 | 45.8M | x2r = *inp; |
1010 | 45.8M | x2i = *(inp + 1); |
1011 | 45.8M | inp += (n_points >> 1); |
1012 | |
1013 | 45.8M | x3r = *inp; |
1014 | 45.8M | x3i = *(inp + 1); |
1015 | |
/* Each sum is formed first; the matching difference is then derived from the
 * sum by subtracting the second operand twice (via tmk). */
1016 | 45.8M | x0r = ia_add_flt(x0r, x2r); |
1017 | 45.8M | x0i = ia_add_flt(x0i, x2i); |
1018 | |
1019 | 45.8M | tmk = ia_sub_flt(x0r, x2r); |
1020 | 45.8M | x2r = ia_sub_flt(tmk, x2r); |
1021 | 45.8M | tmk = ia_sub_flt(x0i, x2i); |
1022 | 45.8M | x2i = ia_sub_flt(tmk, x2i); |
1023 | |
1024 | 45.8M | x1r = ia_add_flt(x1r, x3r); |
1025 | 45.8M | x1i = ia_add_flt(x1i, x3i); |
1026 | |
1027 | 45.8M | tmk = ia_sub_flt(x1r, x3r); |
1028 | 45.8M | x3r = ia_sub_flt(tmk, x3r); |
1029 | 45.8M | tmk = ia_sub_flt(x1i, x3i); |
1030 | 45.8M | x3i = ia_sub_flt(tmk, x3i); |
1031 | |
1032 | 45.8M | x0r = ia_add_flt(x0r, x1r); |
1033 | 45.8M | x0i = ia_add_flt(x0i, x1i); |
1034 | |
1035 | 45.8M | tmk = ia_sub_flt(x0r, x1r); |
1036 | 45.8M | x1r = ia_sub_flt(tmk, x1r); |
1037 | 45.8M | tmk = ia_sub_flt(x0i, x1i); |
1038 | 45.8M | x1i = ia_sub_flt(tmk, x1i); |
1039 | |
1040 | 45.8M | x2r = ia_add_flt(x2r, x3i); |
1041 | 45.8M | x2i = ia_sub_flt(x2i, x3r); |
1042 | |
1043 | 45.8M | tmk = ia_sub_flt(x2r, x3i); |
1044 | 45.8M | x3i = ia_sub_flt(tmk, x3i); |
1045 | 45.8M | tmk = ia_add_flt(x2i, x3r); |
1046 | 45.8M | x3r = ia_add_flt(tmk, x3r); |
1047 | |
/* Store order is x0, x2, x1, x3. For the last pair x3i is written to the real
 * slot and x3r to the imaginary slot: after the updates above x3i carries the
 * real part and x3r the imaginary part of that output. */
1048 | 45.8M | *ptr_y++ = x0r; |
1049 | 45.8M | *ptr_y++ = x0i; |
1050 | 45.8M | *ptr_y++ = x2r; |
1051 | 45.8M | *ptr_y++ = x2i; |
1052 | 45.8M | *ptr_y++ = x1r; |
1053 | 45.8M | *ptr_y++ = x1i; |
1054 | 45.8M | *ptr_y++ = x3i; |
1055 | 45.8M | *ptr_y++ = x3r; |
1056 | 45.8M | } |
/* Rewind to the start of y and set up the in-place passes. nodespacing * del
 * starts at 64 * 4 = 256; nodespacing is divided and del multiplied by 4 after
 * every pass. */
1057 | 179k | ptr_y -= 2 * n_points; |
1058 | 179k | del = 4; |
1059 | 179k | nodespacing = 64; |
1060 | 179k | in_loop_cnt = n_points >> 4; |
/* Remaining radix-4 passes, performed in place on y. */
1061 | 896k | for (i = n_stages - 1; i > 0; i--) { |
1062 | 716k | const FLOAT32 *twiddles = ptr_w; |
1063 | 716k | FLOAT32 *data = ptr_y; |
1064 | 716k | FLOAT32 w_1, w_2, w_3, w_4, w_5, w_6; |
1065 | 716k | WORD32 sec_loop_cnt; |
1066 | |
/* Butterflies for the first element of every group: no twiddle multiply. */
1067 | 15.9M | for (k = in_loop_cnt; k != 0; k--) { |
1068 | 15.2M | x0r = (*data); |
1069 | 15.2M | x0i = (*(data + 1)); |
1070 | 15.2M | data += ((SIZE_T)del << 1); |
1071 | |
1072 | 15.2M | x1r = (*data); |
1073 | 15.2M | x1i = (*(data + 1)); |
1074 | 15.2M | data += ((SIZE_T)del << 1); |
1075 | |
1076 | 15.2M | x2r = (*data); |
1077 | 15.2M | x2i = (*(data + 1)); |
1078 | 15.2M | data += ((SIZE_T)del << 1); |
1079 | |
1080 | 15.2M | x3r = (*data); |
1081 | 15.2M | x3i = (*(data + 1)); |
1082 | 15.2M | data -= 3 * (del << 1); |
1083 | |
1084 | 15.2M | x0r = ia_add_flt(x0r, x2r); |
1085 | 15.2M | x0i = ia_add_flt(x0i, x2i); |
1086 | 15.2M | x2r = ia_msu_flt(x0r, x2r, 2); |
1087 | 15.2M | x2i = ia_msu_flt(x0i, x2i, 2); |
1088 | 15.2M | x1r = ia_add_flt(x1r, x3r); |
1089 | 15.2M | x1i = ia_add_flt(x1i, x3i); |
1090 | 15.2M | x3r = ia_msu_flt(x1r, x3r, 2); |
1091 | 15.2M | x3i = ia_msu_flt(x1i, x3i, 2); |
1092 | |
1093 | 15.2M | x0r = ia_add_flt(x0r, x1r); |
1094 | 15.2M | x0i = ia_add_flt(x0i, x1i); |
1095 | 15.2M | x1r = ia_msu_flt(x0r, x1r, 2); |
1096 | 15.2M | x1i = ia_msu_flt(x0i, x1i, 2); |
1097 | 15.2M | x2r = ia_add_flt(x2r, x3i); |
1098 | 15.2M | x2i = ia_sub_flt(x2i, x3r); |
1099 | 15.2M | x3i = ia_msu_flt(x2r, x3i, 2); |
1100 | 15.2M | x3r = ia_mac_flt(x2i, x3r, 2); |
1101 | |
1102 | 15.2M | *data = x0r; |
1103 | 15.2M | *(data + 1) = x0i; |
1104 | 15.2M | data += ((SIZE_T)del << 1); |
1105 | |
1106 | 15.2M | *data = x2r; |
1107 | 15.2M | *(data + 1) = x2i; |
1108 | 15.2M | data += ((SIZE_T)del << 1); |
1109 | |
1110 | 15.2M | *data = x1r; |
1111 | 15.2M | *(data + 1) = x1i; |
1112 | 15.2M | data += ((SIZE_T)del << 1); |
1113 | |
1114 | 15.2M | *data = x3i; |
1115 | 15.2M | *(data + 1) = x3r; |
1116 | 15.2M | data += ((SIZE_T)del << 1); |
1117 | 15.2M | } |
/* Continue with the second complex element of every group. */
1118 | 716k | data = ptr_y + 2; |
1119 | |
/* The alternating series 1/4 + 1/8 - 1/16 + 1/32 - 1/64 + 1/128 - 1/256 is
 * about one third; with nodespacing * del == 256 it evaluates to 85. It bounds
 * the twiddle indices j for which 3 * j is still used without folding. */
1120 | 716k | sec_loop_cnt = (nodespacing * del); |
1121 | 716k | sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) + |
1122 | 716k | (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) - |
1123 | 716k | (sec_loop_cnt / 256); |
1124 | |
/* Twiddle range 1 (j <= sec_loop_cnt): table indices j, 2j and 3j are used
 * directly. w_1/w_2/w_3 come from index t and w_4/w_5/w_6 from index t + 257.
 * NOTE(review): presumably the table stores two 257-entry components
 * (cos-like, then sin-like) back to back - confirm. */
1125 | 20.7M | for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) { |
1126 | 20.0M | w_1 = *(twiddles + j); |
1127 | 20.0M | w_4 = *(twiddles + j + 257); |
1128 | 20.0M | w_2 = *(twiddles + ((SIZE_T)j << 1)); |
1129 | 20.0M | w_5 = *(twiddles + ((SIZE_T)j << 1) + 257); |
1130 | 20.0M | w_3 = *(twiddles + j + ((SIZE_T)j << 1)); |
1131 | 20.0M | w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257); |
1132 | |
1133 | 76.1M | for (k = in_loop_cnt; k != 0; k--) { |
1134 | 56.0M | FLOAT32 tmp; |
1135 | | /*x0 is loaded later to avoid register crunch*/ |
1136 | |
1137 | 56.0M | data += ((SIZE_T)del << 1); |
1138 | |
1139 | 56.0M | x1r = *data; |
1140 | 56.0M | x1i = *(data + 1); |
1141 | 56.0M | data += ((SIZE_T)del << 1); |
1142 | |
1143 | 56.0M | x2r = *data; |
1144 | 56.0M | x2i = *(data + 1); |
1145 | 56.0M | data += ((SIZE_T)del << 1); |
1146 | |
1147 | 56.0M | x3r = *data; |
1148 | 56.0M | x3i = *(data + 1); |
1149 | 56.0M | data -= 3 * (del << 1); |
1150 | |
1151 | 56.0M | tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4)); |
1152 | 56.0M | x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1); |
1153 | 56.0M | x1r = tmp; |
1154 | |
1155 | 56.0M | tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5)); |
1156 | 56.0M | x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2); |
1157 | 56.0M | x2r = tmp; |
1158 | |
1159 | 56.0M | tmp = ia_sub_flt(ia_mul_flt(x3r, w_3), ia_mul_flt(x3i, w_6)); |
1160 | 56.0M | x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3); |
1161 | 56.0M | x3r = tmp; |
1162 | |
1163 | 56.0M | x0r = (*data); |
1164 | 56.0M | x0i = (*(data + 1)); |
1165 | |
1166 | 56.0M | x0r = ia_add_flt(x0r, (x2r)); |
1167 | 56.0M | x0i = ia_add_flt(x0i, (x2i)); |
1168 | 56.0M | x2r = ia_msu_flt(x0r, x2r, 2); |
1169 | 56.0M | x2i = ia_msu_flt(x0i, x2i, 2); |
1170 | 56.0M | x1r = ia_add_flt(x1r, x3r); |
1171 | 56.0M | x1i = ia_add_flt(x1i, x3i); |
1172 | 56.0M | x3r = ia_msu_flt(x1r, x3r, 2); |
1173 | 56.0M | x3i = ia_msu_flt(x1i, x3i, 2); |
1174 | |
1175 | 56.0M | x0r = ia_add_flt(x0r, (x1r)); |
1176 | 56.0M | x0i = ia_add_flt(x0i, (x1i)); |
1177 | 56.0M | x1r = ia_msu_flt(x0r, x1r, 2); |
1178 | 56.0M | x1i = ia_msu_flt(x0i, x1i, 2); |
1179 | 56.0M | x2r = ia_add_flt(x2r, (x3i)); |
1180 | 56.0M | x2i = ia_sub_flt(x2i, (x3r)); |
1181 | 56.0M | x3i = ia_msu_flt(x2r, x3i, 2); |
1182 | 56.0M | x3r = ia_mac_flt(x2i, x3r, 2); |
1183 | |
1184 | 56.0M | *data = x0r; |
1185 | 56.0M | *(data + 1) = x0i; |
1186 | 56.0M | data += ((SIZE_T)del << 1); |
1187 | |
1188 | 56.0M | *data = x2r; |
1189 | 56.0M | *(data + 1) = x2i; |
1190 | 56.0M | data += ((SIZE_T)del << 1); |
1191 | |
1192 | 56.0M | *data = x1r; |
1193 | 56.0M | *(data + 1) = x1i; |
1194 | 56.0M | data += ((SIZE_T)del << 1); |
1195 | |
1196 | 56.0M | *data = x3i; |
1197 | 56.0M | *(data + 1) = x3r; |
1198 | 56.0M | data += ((SIZE_T)del << 1); |
1199 | 56.0M | } |
/* The k loop advanced data by 2 * n_points floats in total; step back and move
 * on to the next complex element of the group. */
1200 | 20.0M | data -= 2 * n_points; |
1201 | 20.0M | data += 2; |
1202 | 20.0M | } |
/* Twiddle range 2 (j <= nodespacing * del / 2): the 3j index is folded by -256
 * (w_3 from 3j - 256, w_6 from 3j + 1 = (3j - 256) + 257) and the x3 rotation
 * uses the alternate multiply form below. */
1203 | 11.1M | for (; j <= (nodespacing * del) >> 1; j += nodespacing) { |
1204 | 10.3M | w_1 = *(twiddles + j); |
1205 | 10.3M | w_4 = *(twiddles + j + 257); |
1206 | 10.3M | w_2 = *(twiddles + ((SIZE_T)j << 1)); |
1207 | 10.3M | w_5 = *(twiddles + ((SIZE_T)j << 1) + 257); |
1208 | 10.3M | w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256); |
1209 | 10.3M | w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1); |
1210 | |
1211 | 46.0M | for (k = in_loop_cnt; k != 0; k--) { |
1212 | 35.6M | FLOAT32 tmp; |
1213 | | /*x0 is loaded later to avoid register crunch*/ |
1214 | |
1215 | 35.6M | data += ((SIZE_T)del << 1); |
1216 | |
1217 | 35.6M | x1r = *data; |
1218 | 35.6M | x1i = *(data + 1); |
1219 | 35.6M | data += ((SIZE_T)del << 1); |
1220 | |
1221 | 35.6M | x2r = *data; |
1222 | 35.6M | x2i = *(data + 1); |
1223 | 35.6M | data += ((SIZE_T)del << 1); |
1224 | |
1225 | 35.6M | x3r = *data; |
1226 | 35.6M | x3i = *(data + 1); |
1227 | 35.6M | data -= 3 * (del << 1); |
1228 | |
1229 | 35.6M | tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4)); |
1230 | 35.6M | x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1); |
1231 | 35.6M | x1r = tmp; |
1232 | |
1233 | 35.6M | tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5)); |
1234 | 35.6M | x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2); |
1235 | 35.6M | x2r = tmp; |
1236 | |
1237 | 35.6M | tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3)); |
1238 | 35.6M | x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6)); |
1239 | 35.6M | x3r = tmp; |
1240 | |
1241 | 35.6M | x0r = (*data); |
1242 | 35.6M | x0i = (*(data + 1)); |
1243 | |
1244 | 35.6M | x0r = ia_add_flt(x0r, (x2r)); |
1245 | 35.6M | x0i = ia_add_flt(x0i, (x2i)); |
1246 | 35.6M | x2r = ia_msu_flt(x0r, x2r, 2); |
1247 | 35.6M | x2i = ia_msu_flt(x0i, x2i, 2); |
1248 | 35.6M | x1r = ia_add_flt(x1r, x3r); |
1249 | 35.6M | x1i = ia_add_flt(x1i, x3i); |
1250 | 35.6M | x3r = ia_msu_flt(x1r, x3r, 2); |
1251 | 35.6M | x3i = ia_msu_flt(x1i, x3i, 2); |
1252 | |
1253 | 35.6M | x0r = ia_add_flt(x0r, (x1r)); |
1254 | 35.6M | x0i = ia_add_flt(x0i, (x1i)); |
1255 | 35.6M | x1r = ia_msu_flt(x0r, x1r, 2); |
1256 | 35.6M | x1i = ia_msu_flt(x0i, x1i, 2); |
1257 | 35.6M | x2r = ia_add_flt(x2r, (x3i)); |
1258 | 35.6M | x2i = ia_sub_flt(x2i, (x3r)); |
1259 | 35.6M | x3i = ia_msu_flt(x2r, x3i, 2); |
1260 | 35.6M | x3r = ia_mac_flt(x2i, x3r, 2); |
1261 | |
1262 | 35.6M | *data = x0r; |
1263 | 35.6M | *(data + 1) = x0i; |
1264 | 35.6M | data += ((SIZE_T)del << 1); |
1265 | |
1266 | 35.6M | *data = x2r; |
1267 | 35.6M | *(data + 1) = x2i; |
1268 | 35.6M | data += ((SIZE_T)del << 1); |
1269 | |
1270 | 35.6M | *data = x1r; |
1271 | 35.6M | *(data + 1) = x1i; |
1272 | 35.6M | data += ((SIZE_T)del << 1); |
1273 | |
1274 | 35.6M | *data = x3i; |
1275 | 35.6M | *(data + 1) = x3r; |
1276 | 35.6M | data += ((SIZE_T)del << 1); |
1277 | 35.6M | } |
1278 | 10.3M | data -= 2 * n_points; |
1279 | 10.3M | data += 2; |
1280 | 10.3M | } |
/* Twiddle range 3 (j <= 2 * sec_loop_cnt): both the 2j and the 3j index are
 * folded by -256; x2 and x3 both use the alternate multiply form. */
1281 | 10.3M | for (; j <= sec_loop_cnt * 2; j += nodespacing) { |
1282 | 9.67M | w_1 = *(twiddles + j); |
1283 | 9.67M | w_4 = *(twiddles + j + 257); |
1284 | 9.67M | w_2 = *(twiddles + ((SIZE_T)j << 1) - 256); |
1285 | 9.67M | w_5 = *(twiddles + ((SIZE_T)j << 1) + 1); |
1286 | 9.67M | w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256); |
1287 | 9.67M | w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1); |
1288 | |
1289 | 30.1M | for (k = in_loop_cnt; k != 0; k--) { |
1290 | 20.4M | FLOAT32 tmp; |
1291 | | /*x0 is loaded later to avoid register crunch*/ |
1292 | |
1293 | 20.4M | data += ((SIZE_T)del << 1); |
1294 | |
1295 | 20.4M | x1r = *data; |
1296 | 20.4M | x1i = *(data + 1); |
1297 | 20.4M | data += ((SIZE_T)del << 1); |
1298 | |
1299 | 20.4M | x2r = *data; |
1300 | 20.4M | x2i = *(data + 1); |
1301 | 20.4M | data += ((SIZE_T)del << 1); |
1302 | |
1303 | 20.4M | x3r = *data; |
1304 | 20.4M | x3i = *(data + 1); |
1305 | 20.4M | data -= 3 * (del << 1); |
1306 | |
1307 | 20.4M | tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4)); |
1308 | 20.4M | x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1); |
1309 | 20.4M | x1r = tmp; |
1310 | |
1311 | 20.4M | tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2)); |
1312 | 20.4M | x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5)); |
1313 | 20.4M | x2r = tmp; |
1314 | |
1315 | 20.4M | tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3)); |
1316 | 20.4M | x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6)); |
1317 | 20.4M | x3r = tmp; |
1318 | |
1319 | 20.4M | x0r = (*data); |
1320 | 20.4M | x0i = (*(data + 1)); |
1321 | |
1322 | 20.4M | x0r = ia_add_flt(x0r, (x2r)); |
1323 | 20.4M | x0i = ia_add_flt(x0i, (x2i)); |
1324 | 20.4M | x2r = ia_msu_flt(x0r, x2r, 2); |
1325 | 20.4M | x2i = ia_msu_flt(x0i, x2i, 2); |
1326 | 20.4M | x1r = ia_add_flt(x1r, x3r); |
1327 | 20.4M | x1i = ia_add_flt(x1i, x3i); |
1328 | 20.4M | x3r = ia_msu_flt(x1r, x3r, 2); |
1329 | 20.4M | x3i = ia_msu_flt(x1i, x3i, 2); |
1330 | |
1331 | 20.4M | x0r = ia_add_flt(x0r, (x1r)); |
1332 | 20.4M | x0i = ia_add_flt(x0i, (x1i)); |
1333 | 20.4M | x1r = ia_msu_flt(x0r, x1r, 2); |
1334 | 20.4M | x1i = ia_msu_flt(x0i, x1i, 2); |
1335 | 20.4M | x2r = ia_add_flt(x2r, (x3i)); |
1336 | 20.4M | x2i = ia_sub_flt(x2i, (x3r)); |
1337 | 20.4M | x3i = ia_msu_flt(x2r, x3i, 2); |
1338 | 20.4M | x3r = ia_mac_flt(x2i, x3r, 2); |
1339 | |
1340 | 20.4M | *data = x0r; |
1341 | 20.4M | *(data + 1) = x0i; |
1342 | 20.4M | data += ((SIZE_T)del << 1); |
1343 | |
1344 | 20.4M | *data = x2r; |
1345 | 20.4M | *(data + 1) = x2i; |
1346 | 20.4M | data += ((SIZE_T)del << 1); |
1347 | |
1348 | 20.4M | *data = x1r; |
1349 | 20.4M | *(data + 1) = x1i; |
1350 | 20.4M | data += ((SIZE_T)del << 1); |
1351 | |
1352 | 20.4M | *data = x3i; |
1353 | 20.4M | *(data + 1) = x3r; |
1354 | 20.4M | data += ((SIZE_T)del << 1); |
1355 | 20.4M | } |
1356 | 9.67M | data -= 2 * n_points; |
1357 | 9.67M | data += 2; |
1358 | 9.67M | } |
/* Twiddle range 4 (j < nodespacing * del): the 2j index is folded by -256 and
 * the 3j index by -512. The x3 rotation uses a third multiply form, and the
 * butterfly combines x1i/x3i with sub/mac where the other ranges use add/msu. */
1359 | 20.7M | for (; j < nodespacing * del; j += nodespacing) { |
1360 | 20.0M | w_1 = *(twiddles + j); |
1361 | 20.0M | w_4 = *(twiddles + j + 257); |
1362 | 20.0M | w_2 = *(twiddles + ((SIZE_T)j << 1) - 256); |
1363 | 20.0M | w_5 = *(twiddles + ((SIZE_T)j << 1) + 1); |
1364 | 20.0M | w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512); |
1365 | 20.0M | w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257); |
1366 | |
1367 | 76.1M | for (k = in_loop_cnt; k != 0; k--) { |
1368 | 56.0M | FLOAT32 tmp; |
1369 | | /*x0 is loaded later to avoid register crunch*/ |
1370 | |
1371 | 56.0M | data += ((SIZE_T)del << 1); |
1372 | |
1373 | 56.0M | x1r = *data; |
1374 | 56.0M | x1i = *(data + 1); |
1375 | 56.0M | data += ((SIZE_T)del << 1); |
1376 | |
1377 | 56.0M | x2r = *data; |
1378 | 56.0M | x2i = *(data + 1); |
1379 | 56.0M | data += ((SIZE_T)del << 1); |
1380 | |
1381 | 56.0M | x3r = *data; |
1382 | 56.0M | x3i = *(data + 1); |
1383 | 56.0M | data -= 3 * (del << 1); |
1384 | |
1385 | 56.0M | tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4)); |
1386 | 56.0M | x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1); |
1387 | 56.0M | x1r = tmp; |
1388 | |
1389 | 56.0M | tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2)); |
1390 | 56.0M | x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5)); |
1391 | 56.0M | x2r = tmp; |
1392 | |
1393 | 56.0M | tmp = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6)); |
1394 | 56.0M | x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3); |
1395 | 56.0M | x3r = tmp; |
1396 | |
1397 | 56.0M | x0r = (*data); |
1398 | 56.0M | x0i = (*(data + 1)); |
1399 | |
1400 | 56.0M | x0r = ia_add_flt(x0r, (x2r)); |
1401 | 56.0M | x0i = ia_add_flt(x0i, (x2i)); |
1402 | 56.0M | x2r = ia_msu_flt(x0r, x2r, 2); |
1403 | 56.0M | x2i = ia_msu_flt(x0i, x2i, 2); |
1404 | 56.0M | x1r = ia_add_flt(x1r, x3r); |
1405 | 56.0M | x1i = ia_sub_flt(x1i, x3i); |
1406 | 56.0M | x3r = ia_msu_flt(x1r, x3r, 2); |
1407 | 56.0M | x3i = ia_mac_flt(x1i, x3i, 2); |
1408 | |
1409 | 56.0M | x0r = ia_add_flt(x0r, (x1r)); |
1410 | 56.0M | x0i = ia_add_flt(x0i, (x1i)); |
1411 | 56.0M | x1r = ia_msu_flt(x0r, x1r, 2); |
1412 | 56.0M | x1i = ia_msu_flt(x0i, x1i, 2); |
1413 | 56.0M | x2r = ia_add_flt(x2r, (x3i)); |
1414 | 56.0M | x2i = ia_sub_flt(x2i, (x3r)); |
1415 | 56.0M | x3i = ia_msu_flt(x2r, x3i, 2); |
1416 | 56.0M | x3r = ia_mac_flt(x2i, x3r, 2); |
1417 | |
1418 | 56.0M | *data = x0r; |
1419 | 56.0M | *(data + 1) = x0i; |
1420 | 56.0M | data += ((SIZE_T)del << 1); |
1421 | |
1422 | 56.0M | *data = x2r; |
1423 | 56.0M | *(data + 1) = x2i; |
1424 | 56.0M | data += ((SIZE_T)del << 1); |
1425 | |
1426 | 56.0M | *data = x1r; |
1427 | 56.0M | *(data + 1) = x1i; |
1428 | 56.0M | data += ((SIZE_T)del << 1); |
1429 | |
1430 | 56.0M | *data = x3i; |
1431 | 56.0M | *(data + 1) = x3r; |
1432 | 56.0M | data += ((SIZE_T)del << 1); |
1433 | 56.0M | } |
1434 | 20.0M | data -= 2 * n_points; |
1435 | 20.0M | data += 2; |
1436 | 20.0M | } |
/* Next pass: groups four times larger, four times fewer of them, and a twiddle
 * step four times finer. */
1437 | 716k | nodespacing >>= 2; |
1438 | 716k | del <<= 2; |
1439 | 716k | in_loop_cnt >>= 2; |
1440 | 716k | } |
/* Final radix-2 pass, only when the stage count was odd. Two loops of del / 2
 * butterflies pair elements del complex values apart; the second loop restarts
 * the twiddle walk and uses the alternate multiply form. This path has zero
 * hits in the coverage data of this listing. */
1441 | 179k | if (not_power_4) { |
1442 | 0 | const FLOAT32 *twiddles = ptr_w; |
1443 | 0 | nodespacing <<= 1; |
1444 |

1445 | 0 | for (j = del / 2; j != 0; j--) { |
1446 | 0 | FLOAT32 w_1 = *twiddles; |
1447 | 0 | FLOAT32 w_4 = *(twiddles + 257); |
1448 | 0 | FLOAT32 tmp; |
1449 | 0 | twiddles += nodespacing; |
1450 |

1451 | 0 | x0r = *ptr_y; |
1452 | 0 | x0i = *(ptr_y + 1); |
1453 | 0 | ptr_y += ((SIZE_T)del << 1); |
1454 |

1455 | 0 | x1r = *ptr_y; |
1456 | 0 | x1i = *(ptr_y + 1); |
1457 |

1458 | 0 | tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4)); |
1459 | 0 | x1i = (FLOAT32)ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1); |
1460 | 0 | x1r = tmp; |
1461 |

1462 | 0 | *ptr_y = ia_sub_flt((x0r), (x1r)); |
1463 | 0 | *(ptr_y + 1) = ia_sub_flt((x0i), (x1i)); |
1464 | 0 | ptr_y -= ((SIZE_T)del << 1); |
1465 |

1466 | 0 | *ptr_y = ia_add_flt((x0r), (x1r)); |
1467 | 0 | *(ptr_y + 1) = ia_add_flt((x0i), (x1i)); |
1468 | 0 | ptr_y += 2; |
1469 | 0 | } |
1470 | 0 | twiddles = ptr_w; |
1471 | 0 | for (j = del / 2; j != 0; j--) { |
1472 | 0 | FLOAT32 w_1 = *twiddles; |
1473 | 0 | FLOAT32 w_4 = *(twiddles + 257); |
1474 | 0 | FLOAT32 tmp; |
1475 | 0 | twiddles += nodespacing; |
1476 |

1477 | 0 | x0r = *ptr_y; |
1478 | 0 | x0i = *(ptr_y + 1); |
1479 | 0 | ptr_y += ((SIZE_T)del << 1); |
1480 |

1481 | 0 | x1r = *ptr_y; |
1482 | 0 | x1i = *(ptr_y + 1); |
1483 |

1484 | 0 | tmp = ia_add_flt(ia_mul_flt(x1r, w_4), ia_mul_flt(x1i, w_1)); |
1485 | 0 | x1i = ia_add_flt(ia_negate_flt(ia_mul_flt(x1r, w_1)), ia_mul_flt(x1i, w_4)); |
1486 | 0 | x1r = tmp; |
1487 |

1488 | 0 | *ptr_y = ia_sub_flt((x0r), (x1r)); |
1489 | 0 | *(ptr_y + 1) = ia_sub_flt((x0i), (x1i)); |
1490 | 0 | ptr_y -= ((SIZE_T)del << 1); |
1491 |

1492 | 0 | *ptr_y = ia_add_flt((x0r), (x1r)); |
1493 | 0 | *(ptr_y + 1) = ia_add_flt((x0i), (x1i)); |
1494 | 0 | ptr_y += 2; |
1495 | 0 | } |
1496 | 0 | } |
1497 | | |
/* De-interleave the result from y back into the caller's split arrays. */
1498 | 183M | for (i = 0; i < n_points; i++) { |
1499 | 183M | ptr_real[i] = y[2 * i]; |
1500 | 183M | ptr_imag[i] = y[2 * i + 1]; |
1501 | 183M | } |
1502 | 179k | } |
1503 | 45.8M | static VOID ixheaace_cplx_fft_4(FLOAT32 *x_r, FLOAT32 *x_i) { |
1504 | 45.8M | FLOAT32 x_0, x_1, x_2, x_3; |
1505 | 45.8M | FLOAT32 x_4, x_5, x_6, x_7; |
1506 | 45.8M | FLOAT32 x0r, x1r, x2r, x3r; |
1507 | 45.8M | FLOAT32 x0i, x1i, x2i, x3i; |
1508 | | |
1509 | | // 4 Point FFT |
1510 | 45.8M | x_0 = x_r[0]; |
1511 | 45.8M | x_1 = x_i[0]; |
1512 | 45.8M | x_2 = x_r[1]; |
1513 | 45.8M | x_3 = x_i[1]; |
1514 | 45.8M | x_4 = x_r[2]; |
1515 | 45.8M | x_5 = x_i[2]; |
1516 | 45.8M | x_6 = x_r[3]; |
1517 | 45.8M | x_7 = x_i[3]; |
1518 | | |
1519 | 45.8M | x0r = ia_add_flt(x_0, x_4); |
1520 | 45.8M | x0i = ia_add_flt(x_1, x_5); |
1521 | 45.8M | x2r = ia_sub_flt(x_0, x_4); |
1522 | 45.8M | x2i = ia_sub_flt(x_1, x_5); |
1523 | 45.8M | x1r = ia_add_flt(x_2, x_6); |
1524 | 45.8M | x1i = ia_add_flt(x_3, x_7); |
1525 | 45.8M | x3r = ia_sub_flt(x_2, x_6); |
1526 | 45.8M | x3i = ia_sub_flt(x_3, x_7); |
1527 | | |
1528 | 45.8M | x_r[0] = ia_add_flt(x0r, x1r); |
1529 | 45.8M | x_i[0] = ia_add_flt(x0i, x1i); |
1530 | 45.8M | x_r[2] = ia_sub_flt(x0r, x1r); |
1531 | 45.8M | x_i[2] = ia_sub_flt(x0i, x1i); |
1532 | 45.8M | x_r[1] = ia_add_flt(x2r, x3i); |
1533 | 45.8M | x_i[1] = ia_sub_flt(x2i, x3r); |
1534 | 45.8M | x_r[3] = ia_sub_flt(x2r, x3i); |
1535 | 45.8M | x_i[3] = ia_add_flt(x2i, x3r); |
1536 | 45.8M | return; |
1537 | 45.8M | } |
1538 | 44.8k | VOID iusace_complex_fft_4096(FLOAT32 *ptr_x_r, FLOAT32 *ptr_x_i, FLOAT32 *ptr_scratch_buf) { |
1539 | 44.8k | FLOAT32 *ptr_data_r; |
1540 | 44.8k | FLOAT32 *ptr_data_i; |
1541 | 44.8k | WORD32 fft_len = 4096; |
1542 | 44.8k | FLOAT32 *ptr_fft_interim_buf = &ptr_scratch_buf[2 * fft_len]; |
1543 | 44.8k | WORD32 i, j; |
1544 | 44.8k | WORD32 dim2 = fft_len >> 10; |
1545 | 44.8k | WORD32 dim1 = fft_len / dim2; |
1546 | 44.8k | WORD32 fac = 4; |
1547 | | |
1548 | 224k | for (i = 0; i < dim2; i++) { |
1549 | 179k | ptr_data_r = &ptr_scratch_buf[(2 * i + 0) * dim1]; |
1550 | 179k | ptr_data_i = &ptr_scratch_buf[(2 * i + 1) * dim1]; |
1551 | 183M | for (j = 0; j < dim1; j++) { |
1552 | 183M | ptr_data_r[j] = ptr_x_r[(dim2 * j + i)]; |
1553 | 183M | ptr_data_i[j] = 0; |
1554 | 183M | } |
1555 | 179k | ixheaace_rad2_cplx_fft(ptr_data_r, ptr_data_i, dim1, ptr_fft_interim_buf); |
1556 | 179k | } |
1557 | 44.8k | ptr_data_r = &ptr_scratch_buf[0]; |
1558 | 44.8k | ptr_data_i = &ptr_scratch_buf[0]; |
1559 | 45.9M | for (i = 0; i < dim1; i++) { |
1560 | 45.8M | FLOAT32 *ptr_cos_val = (FLOAT32 *)&ia_mixed_rad_twiddle_cos[i * dim2 * fac]; |
1561 | 45.8M | FLOAT32 *ptr_sin_val = (FLOAT32 *)&ia_mixed_rad_twiddle_sin[i * dim2 * fac]; |
1562 | 229M | for (j = 0; j < dim2; j++) { |
1563 | 183M | FLOAT32 real = ptr_data_r[(2 * j + 0) * dim1 + i]; |
1564 | 183M | FLOAT32 imag = ptr_data_i[(2 * j + 1) * dim1 + i]; |
1565 | 183M | FLOAT32 cos_val = ptr_cos_val[j * fac]; |
1566 | 183M | FLOAT32 sin_val = ptr_sin_val[j * fac]; |
1567 | 183M | FLOAT32 temp_real = (FLOAT32)(real * cos_val + imag * sin_val); |
1568 | 183M | FLOAT32 temp_imag = (FLOAT32)(imag * cos_val - real * sin_val); |
1569 | 183M | ptr_fft_interim_buf[(2 * i + 0) * dim2 + j] = temp_real; |
1570 | 183M | ptr_fft_interim_buf[(2 * i + 1) * dim2 + j] = temp_imag; |
1571 | 183M | } |
1572 | 45.8M | } |
1573 | 45.9M | for (i = 0; i < dim1; i++) { |
1574 | 45.8M | ptr_data_r = &ptr_fft_interim_buf[(2 * i + 0) * dim2]; |
1575 | 45.8M | ptr_data_i = &ptr_fft_interim_buf[(2 * i + 1) * dim2]; |
1576 | 45.8M | ixheaace_cplx_fft_4(ptr_data_r, ptr_data_i); |
1577 | 45.8M | } |
1578 | 44.8k | ptr_data_r = &ptr_fft_interim_buf[0]; |
1579 | 44.8k | ptr_data_i = &ptr_fft_interim_buf[0]; |
1580 | 45.9M | for (i = 0; i < dim1; i++) { |
1581 | 229M | for (j = 0; j < dim2; j++) { |
1582 | 183M | ptr_x_r[(j * dim1 + i)] = ptr_data_r[(2 * i + 0) * dim2 + j]; |
1583 | 183M | ptr_x_i[(j * dim1 + i)] = ptr_data_i[(2 * i + 1) * dim2 + j]; |
1584 | 183M | } |
1585 | 45.8M | } |
1586 | 44.8k | } |