Coverage Report

Created: 2026-01-17 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libxaac/encoder/iusace_fft.c
Line
Count
Source
1
/******************************************************************************
2
 *                                                                            *
3
 * Copyright (C) 2023 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
 */
20
21
#include <string.h>
22
#include "ixheaac_type_def.h"
23
#include "ixheaace_adjust_threshold_data.h"
24
#include "iusace_cnst.h"
25
#include "iusace_block_switch_const.h"
26
#include "iusace_rom.h"
27
#include "iusace_bitbuffer.h"
28
29
/* DRC */
30
#include "impd_drc_common_enc.h"
31
#include "impd_drc_uni_drc.h"
32
#include "impd_drc_tables.h"
33
#include "impd_drc_api.h"
34
#include "impd_drc_uni_drc_eq.h"
35
#include "impd_drc_uni_drc_filter_bank.h"
36
#include "impd_drc_gain_enc.h"
37
#include "impd_drc_struct_def.h"
38
39
#include "iusace_tns_usac.h"
40
#include "iusace_psy_mod.h"
41
#include "iusace_config.h"
42
#include "iusace_fft.h"
43
#include "iusace_basic_ops_flt.h"
44
#include "ixheaac_constants.h"
45
#include "ixheaace_aac_constants.h"
46
#include "ixheaac_basic_ops32.h"
47
#include "ixheaace_common_utils.h"
48
#include "ixheaac_error_standards.h"
49
#include "ixheaace_error_codes.h"
50
51
#define DIG_REV(i, m, j)                                    \
52
543M
  do {                                                      \
53
543M
    unsigned _ = (i);                                       \
54
543M
    _ = ((_ & 0x33333333) << 2) | ((_ & ~0x33333333) >> 2); \
55
543M
    _ = ((_ & 0x0F0F0F0F) << 4) | ((_ & ~0x0F0F0F0F) >> 4); \
56
543M
    _ = ((_ & 0x00FF00FF) << 8) | ((_ & ~0x00FF00FF) >> 8); \
57
543M
    (j) = _ >> (m);                                         \
58
543M
  } while (0)
59
60
50.8M
static PLATFORM_INLINE WORD8 iusace_calc_norm(WORD32 a) {
61
50.8M
  WORD8 norm_val;
62
63
50.8M
  if (a == 0) {
64
0
    norm_val = 31;
65
50.8M
  } else {
66
50.8M
    if (a == (WORD32)0xffffffffL) {
67
0
      norm_val = 31;
68
50.8M
    } else {
69
50.8M
      if (a < 0) {
70
0
        a = ~a;
71
0
      }
72
1.32G
      for (norm_val = 0; a < (WORD32)0x40000000L; norm_val++) {
73
1.27G
        a <<= 1;
74
1.27G
      }
75
50.8M
    }
76
50.8M
  }
77
78
50.8M
  return norm_val;
79
50.8M
}
80
81
123M
static PLATFORM_INLINE VOID iusace_complex_3point_fft(FLOAT32 *ptr_in, FLOAT32 *ptr_out) {
82
123M
  FLOAT32 add_r, sub_r;
83
123M
  FLOAT32 add_i, sub_i;
84
123M
  FLOAT32 x01r, x01i, temp;
85
123M
  FLOAT32 p1, p2, p3, p4;
86
123M
  FLOAT64 sinmu;
87
88
123M
  sinmu = 0.866025403784439;
89
90
123M
  x01r = ptr_in[0] + ptr_in[2];
91
123M
  x01i = ptr_in[1] + ptr_in[3];
92
93
123M
  add_r = ptr_in[2] + ptr_in[4];
94
123M
  add_i = ptr_in[3] + ptr_in[5];
95
96
123M
  sub_r = ptr_in[2] - ptr_in[4];
97
123M
  sub_i = ptr_in[3] - ptr_in[5];
98
99
123M
  p1 = add_r / (FLOAT32)2.0;
100
123M
  p4 = add_i / (FLOAT32)2.0;
101
123M
  p2 = (FLOAT32)((FLOAT64)sub_i * sinmu);
102
123M
  p3 = (FLOAT32)((FLOAT64)sub_r * sinmu);
103
104
123M
  temp = ptr_in[0] - p1;
105
106
123M
  ptr_out[0] = x01r + ptr_in[4];
107
123M
  ptr_out[1] = x01i + ptr_in[5];
108
123M
  ptr_out[2] = temp + p2;
109
123M
  ptr_out[3] = (ptr_in[1] - p3) - p4;
110
123M
  ptr_out[4] = temp - p2;
111
123M
  ptr_out[5] = (ptr_in[1] + p3) - p4;
112
113
123M
  return;
114
123M
}
115
116
25.4M
VOID iusace_complex_fft_p2(FLOAT32 *ptr_x, WORD32 nlength, FLOAT32 *scratch_fft_p2_y) {
117
25.4M
  WORD32 i, j, k, n_stages, h2;
118
25.4M
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
119
25.4M
  FLOAT32 tmp;
120
25.4M
  WORD32 del, nodespacing, in_loop_cnt;
121
25.4M
  WORD32 not_power_4;
122
25.4M
  WORD32 dig_rev_shift;
123
25.4M
  FLOAT32 *y = scratch_fft_p2_y;
124
25.4M
  WORD32 mpass = nlength;
125
25.4M
  WORD32 npoints = nlength;
126
25.4M
  FLOAT32 *ptr_y = y;
127
25.4M
  const FLOAT64 *ptr_w;
128
129
25.4M
  dig_rev_shift = iusace_calc_norm(mpass) + 1 - 16;
130
25.4M
  n_stages = 30 - iusace_calc_norm(mpass);
131
25.4M
  not_power_4 = n_stages & 1;
132
133
25.4M
  n_stages = n_stages >> 1;
134
135
25.4M
  ptr_w = iusace_twiddle_table_fft_32x32;
136
137
25.4M
  if (dig_rev_shift < 0) {
138
0
    dig_rev_shift = 0;
139
0
  }
140
141
514M
  for (i = 0; i < npoints; i += 4) {
142
489M
    FLOAT32 *inp = ptr_x;
143
489M
    FLOAT32 tmk;
144
145
489M
    DIG_REV(i, dig_rev_shift, h2);
146
489M
    if (not_power_4) {
147
224M
      h2 += 1;
148
224M
      h2 &= ~1;
149
224M
    }
150
489M
    inp += (h2);
151
152
489M
    x0r = *inp;
153
489M
    x0i = *(inp + 1);
154
489M
    inp += (npoints >> 1);
155
156
489M
    x1r = *inp;
157
489M
    x1i = *(inp + 1);
158
489M
    inp += (npoints >> 1);
159
160
489M
    x2r = *inp;
161
489M
    x2i = *(inp + 1);
162
489M
    inp += (npoints >> 1);
163
164
489M
    x3r = *inp;
165
489M
    x3i = *(inp + 1);
166
167
489M
    x0r = x0r + x2r;
168
489M
    x0i = x0i + x2i;
169
170
489M
    tmk = x0r - x2r;
171
489M
    x2r = tmk - x2r;
172
489M
    tmk = x0i - x2i;
173
489M
    x2i = tmk - x2i;
174
175
489M
    x1r = x1r + x3r;
176
489M
    x1i = x1i + x3i;
177
178
489M
    tmk = x1r - x3r;
179
489M
    x3r = tmk - x3r;
180
489M
    tmk = x1i - x3i;
181
489M
    x3i = tmk - x3i;
182
183
489M
    x0r = x0r + x1r;
184
489M
    x0i = x0i + x1i;
185
186
489M
    tmk = x0r - x1r;
187
489M
    x1r = tmk - x1r;
188
489M
    tmk = x0i - x1i;
189
489M
    x1i = tmk - x1i;
190
191
489M
    x2r = x2r + x3i;
192
489M
    x2i = x2i - x3r;
193
194
489M
    tmk = x2r - x3i;
195
489M
    x3i = tmk - x3i;
196
489M
    tmk = x2i + x3r;
197
489M
    x3r = tmk + x3r;
198
199
489M
    *ptr_y++ = x0r;
200
489M
    *ptr_y++ = x0i;
201
489M
    *ptr_y++ = x2r;
202
489M
    *ptr_y++ = x2i;
203
489M
    *ptr_y++ = x1r;
204
489M
    *ptr_y++ = x1i;
205
489M
    *ptr_y++ = x3i;
206
489M
    *ptr_y++ = x3r;
207
489M
  }
208
25.4M
  ptr_y -= 2 * npoints;
209
25.4M
  del = 4;
210
25.4M
  nodespacing = 64;
211
25.4M
  in_loop_cnt = npoints >> 4;
212
56.4M
  for (i = n_stages - 1; i > 0; i--) {
213
30.9M
    const FLOAT64 *twiddles = ptr_w;
214
30.9M
    FLOAT32 *data = ptr_y;
215
30.9M
    FLOAT64 w_1, w_2, w_3, w_4, w_5, w_6;
216
30.9M
    WORD32 sec_loop_cnt;
217
218
181M
    for (k = in_loop_cnt; k != 0; k--) {
219
150M
      x0r = (*data);
220
150M
      x0i = (*(data + 1));
221
150M
      data += ((SIZE_T)del << 1);
222
223
150M
      x1r = (*data);
224
150M
      x1i = (*(data + 1));
225
150M
      data += ((SIZE_T)del << 1);
226
227
150M
      x2r = (*data);
228
150M
      x2i = (*(data + 1));
229
150M
      data += ((SIZE_T)del << 1);
230
231
150M
      x3r = (*data);
232
150M
      x3i = (*(data + 1));
233
150M
      data -= 3 * (del << 1);
234
235
150M
      x0r = x0r + x2r;
236
150M
      x0i = x0i + x2i;
237
150M
      x2r = x0r - (x2r * 2);
238
150M
      x2i = x0i - (x2i * 2);
239
150M
      x1r = x1r + x3r;
240
150M
      x1i = x1i + x3i;
241
150M
      x3r = x1r - (x3r * 2);
242
150M
      x3i = x1i - (x3i * 2);
243
244
150M
      x0r = x0r + x1r;
245
150M
      x0i = x0i + x1i;
246
150M
      x1r = x0r - (x1r * 2);
247
150M
      x1i = x0i - (x1i * 2);
248
150M
      x2r = x2r + x3i;
249
150M
      x2i = x2i - x3r;
250
150M
      x3i = x2r - (x3i * 2);
251
150M
      x3r = x2i + (x3r * 2);
252
253
150M
      *data = x0r;
254
150M
      *(data + 1) = x0i;
255
150M
      data += ((SIZE_T)del << 1);
256
257
150M
      *data = x2r;
258
150M
      *(data + 1) = x2i;
259
150M
      data += ((SIZE_T)del << 1);
260
261
150M
      *data = x1r;
262
150M
      *(data + 1) = x1i;
263
150M
      data += ((SIZE_T)del << 1);
264
265
150M
      *data = x3i;
266
150M
      *(data + 1) = x3r;
267
150M
      data += ((SIZE_T)del << 1);
268
150M
    }
269
30.9M
    data = ptr_y + 2;
270
271
30.9M
    sec_loop_cnt = (nodespacing * del);
272
30.9M
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
273
30.9M
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
274
30.9M
                   (sec_loop_cnt / 256);
275
276
176M
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
277
145M
      w_1 = *(twiddles + j);
278
145M
      w_4 = *(twiddles + j + 257);
279
145M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
280
145M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
281
145M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
282
145M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);
283
284
524M
      for (k = in_loop_cnt; k != 0; k--) {
285
378M
        data += ((SIZE_T)del << 1);
286
287
378M
        x1r = *data;
288
378M
        x1i = *(data + 1);
289
378M
        data += ((SIZE_T)del << 1);
290
291
378M
        x2r = *data;
292
378M
        x2i = *(data + 1);
293
378M
        data += ((SIZE_T)del << 1);
294
295
378M
        x3r = *data;
296
378M
        x3i = *(data + 1);
297
378M
        data -= 3 * (del << 1);
298
299
378M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
300
378M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
301
378M
        x1r = tmp;
302
303
378M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
304
378M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
305
378M
        x2r = tmp;
306
307
378M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_3) - ixheaace_dmult((FLOAT64)x3i, w_6));
308
378M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
309
378M
        x3r = tmp;
310
311
378M
        x0r = (*data);
312
378M
        x0i = (*(data + 1));
313
314
378M
        x0r = x0r + (x2r);
315
378M
        x0i = x0i + (x2i);
316
378M
        x2r = x0r - (x2r * 2);
317
378M
        x2i = x0i - (x2i * 2);
318
378M
        x1r = x1r + x3r;
319
378M
        x1i = x1i + x3i;
320
378M
        x3r = x1r - (x3r * 2);
321
378M
        x3i = x1i - (x3i * 2);
322
323
378M
        x0r = x0r + (x1r);
324
378M
        x0i = x0i + (x1i);
325
378M
        x1r = x0r - (x1r * 2);
326
378M
        x1i = x0i - (x1i * 2);
327
378M
        x2r = x2r + (x3i);
328
378M
        x2i = x2i - (x3r);
329
378M
        x3i = x2r - (x3i * 2);
330
378M
        x3r = x2i + (x3r * 2);
331
332
378M
        *data = x0r;
333
378M
        *(data + 1) = x0i;
334
378M
        data += ((SIZE_T)del << 1);
335
336
378M
        *data = x2r;
337
378M
        *(data + 1) = x2i;
338
378M
        data += ((SIZE_T)del << 1);
339
340
378M
        *data = x1r;
341
378M
        *(data + 1) = x1i;
342
378M
        data += ((SIZE_T)del << 1);
343
344
378M
        *data = x3i;
345
378M
        *(data + 1) = x3r;
346
378M
        data += ((SIZE_T)del << 1);
347
378M
      }
348
145M
      data -= 2 * npoints;
349
145M
      data += 2;
350
145M
    }
351
119M
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
352
88.4M
      w_1 = *(twiddles + j);
353
88.4M
      w_4 = *(twiddles + j + 257);
354
88.4M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
355
88.4M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
356
88.4M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
357
88.4M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
358
359
352M
      for (k = in_loop_cnt; k != 0; k--) {
360
264M
        data += ((SIZE_T)del << 1);
361
362
264M
        x1r = *data;
363
264M
        x1i = *(data + 1);
364
264M
        data += ((SIZE_T)del << 1);
365
366
264M
        x2r = *data;
367
264M
        x2i = *(data + 1);
368
264M
        data += ((SIZE_T)del << 1);
369
370
264M
        x3r = *data;
371
264M
        x3i = *(data + 1);
372
264M
        data -= 3 * (del << 1);
373
374
264M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
375
264M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
376
264M
        x1r = tmp;
377
378
264M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
379
264M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
380
264M
        x2r = tmp;
381
382
264M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
383
264M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
384
264M
        x3r = tmp;
385
386
264M
        x0r = (*data);
387
264M
        x0i = (*(data + 1));
388
389
264M
        x0r = x0r + (x2r);
390
264M
        x0i = x0i + (x2i);
391
264M
        x2r = x0r - (x2r * 2);
392
264M
        x2i = x0i - (x2i * 2);
393
264M
        x1r = x1r + x3r;
394
264M
        x1i = x1i + x3i;
395
264M
        x3r = x1r - (x3r * 2);
396
264M
        x3i = x1i - (x3i * 2);
397
398
264M
        x0r = x0r + (x1r);
399
264M
        x0i = x0i + (x1i);
400
264M
        x1r = x0r - (x1r * 2);
401
264M
        x1i = x0i - (x1i * 2);
402
264M
        x2r = x2r + (x3i);
403
264M
        x2i = x2i - (x3r);
404
264M
        x3i = x2r - (x3i * 2);
405
264M
        x3r = x2i + (x3r * 2);
406
407
264M
        *data = x0r;
408
264M
        *(data + 1) = x0i;
409
264M
        data += ((SIZE_T)del << 1);
410
411
264M
        *data = x2r;
412
264M
        *(data + 1) = x2i;
413
264M
        data += ((SIZE_T)del << 1);
414
415
264M
        *data = x1r;
416
264M
        *(data + 1) = x1i;
417
264M
        data += ((SIZE_T)del << 1);
418
419
264M
        *data = x3i;
420
264M
        *(data + 1) = x3r;
421
264M
        data += ((SIZE_T)del << 1);
422
264M
      }
423
88.4M
      data -= 2 * npoints;
424
88.4M
      data += 2;
425
88.4M
    }
426
88.4M
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
427
57.4M
      w_1 = *(twiddles + j);
428
57.4M
      w_4 = *(twiddles + j + 257);
429
57.4M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
430
57.4M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
431
57.4M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
432
57.4M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
433
434
171M
      for (k = in_loop_cnt; k != 0; k--) {
435
114M
        data += ((SIZE_T)del << 1);
436
437
114M
        x1r = *data;
438
114M
        x1i = *(data + 1);
439
114M
        data += ((SIZE_T)del << 1);
440
441
114M
        x2r = *data;
442
114M
        x2i = *(data + 1);
443
114M
        data += ((SIZE_T)del << 1);
444
445
114M
        x3r = *data;
446
114M
        x3i = *(data + 1);
447
114M
        data -= 3 * (del << 1);
448
449
114M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
450
114M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult(x1r, w_4), x1i, w_1);
451
114M
        x1r = tmp;
452
453
114M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
454
114M
        x2i = (FLOAT32)(-ixheaace_dmult(x2r, w_2) + ixheaace_dmult(x2i, w_5));
455
114M
        x2r = tmp;
456
457
114M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
458
114M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
459
114M
        x3r = tmp;
460
461
114M
        x0r = (*data);
462
114M
        x0i = (*(data + 1));
463
464
114M
        x0r = x0r + (x2r);
465
114M
        x0i = x0i + (x2i);
466
114M
        x2r = x0r - (x2r * 2);
467
114M
        x2i = x0i - (x2i * 2);
468
114M
        x1r = x1r + x3r;
469
114M
        x1i = x1i + x3i;
470
114M
        x3r = x1r - (x3r * 2);
471
114M
        x3i = x1i - (x3i * 2);
472
473
114M
        x0r = x0r + (x1r);
474
114M
        x0i = x0i + (x1i);
475
114M
        x1r = x0r - (x1r * 2);
476
114M
        x1i = x0i - (x1i * 2);
477
114M
        x2r = x2r + (x3i);
478
114M
        x2i = x2i - (x3r);
479
114M
        x3i = x2r - (x3i * 2);
480
114M
        x3r = x2i + (x3r * 2);
481
482
114M
        *data = x0r;
483
114M
        *(data + 1) = x0i;
484
114M
        data += ((SIZE_T)del << 1);
485
486
114M
        *data = x2r;
487
114M
        *(data + 1) = x2i;
488
114M
        data += ((SIZE_T)del << 1);
489
490
114M
        *data = x1r;
491
114M
        *(data + 1) = x1i;
492
114M
        data += ((SIZE_T)del << 1);
493
494
114M
        *data = x3i;
495
114M
        *(data + 1) = x3r;
496
114M
        data += ((SIZE_T)del << 1);
497
114M
      }
498
57.4M
      data -= 2 * npoints;
499
57.4M
      data += 2;
500
57.4M
    }
501
176M
    for (; j < nodespacing * del; j += nodespacing) {
502
145M
      w_1 = *(twiddles + j);
503
145M
      w_4 = *(twiddles + j + 257);
504
145M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
505
145M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
506
145M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
507
145M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);
508
509
524M
      for (k = in_loop_cnt; k != 0; k--) {
510
378M
        data += ((SIZE_T)del << 1);
511
512
378M
        x1r = *data;
513
378M
        x1i = *(data + 1);
514
378M
        data += ((SIZE_T)del << 1);
515
516
378M
        x2r = *data;
517
378M
        x2i = *(data + 1);
518
378M
        data += ((SIZE_T)del << 1);
519
520
378M
        x3r = *data;
521
378M
        x3i = *(data + 1);
522
378M
        data -= 3 * ((SIZE_T)del << 1);
523
524
378M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
525
378M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
526
378M
        x1r = tmp;
527
528
378M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
529
378M
        x2i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x2r, w_2) + ixheaace_dmult((FLOAT64)x2i, w_5));
530
378M
        x2r = tmp;
531
532
378M
        tmp = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
533
378M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
534
378M
        x3r = tmp;
535
536
378M
        x0r = (*data);
537
378M
        x0i = (*(data + 1));
538
539
378M
        x0r = x0r + (x2r);
540
378M
        x0i = x0i + (x2i);
541
378M
        x2r = x0r - (x2r * 2);
542
378M
        x2i = x0i - (x2i * 2);
543
378M
        x1r = x1r + x3r;
544
378M
        x1i = x1i - x3i;
545
378M
        x3r = x1r - (x3r * 2);
546
378M
        x3i = x1i + (x3i * 2);
547
548
378M
        x0r = x0r + (x1r);
549
378M
        x0i = x0i + (x1i);
550
378M
        x1r = x0r - (x1r * 2);
551
378M
        x1i = x0i - (x1i * 2);
552
378M
        x2r = x2r + (x3i);
553
378M
        x2i = x2i - (x3r);
554
378M
        x3i = x2r - (x3i * 2);
555
378M
        x3r = x2i + (x3r * 2);
556
557
378M
        *data = x0r;
558
378M
        *(data + 1) = x0i;
559
378M
        data += ((SIZE_T)del << 1);
560
561
378M
        *data = x2r;
562
378M
        *(data + 1) = x2i;
563
378M
        data += ((SIZE_T)del << 1);
564
565
378M
        *data = x1r;
566
378M
        *(data + 1) = x1i;
567
378M
        data += ((SIZE_T)del << 1);
568
569
378M
        *data = x3i;
570
378M
        *(data + 1) = x3r;
571
378M
        data += ((SIZE_T)del << 1);
572
378M
      }
573
145M
      data -= 2 * npoints;
574
145M
      data += 2;
575
145M
    }
576
30.9M
    nodespacing >>= 2;
577
30.9M
    del <<= 2;
578
30.9M
    in_loop_cnt >>= 2;
579
30.9M
  }
580
25.4M
  if (not_power_4) {
581
12.7M
    const FLOAT64 *twiddles = ptr_w;
582
12.7M
    nodespacing <<= 1;
583
584
237M
    for (j = del / 2; j != 0; j--) {
585
224M
      FLOAT64 w_1 = *twiddles;
586
224M
      FLOAT64 w_4 = *(twiddles + 257);
587
224M
      twiddles += nodespacing;
588
589
224M
      x0r = *ptr_y;
590
224M
      x0i = *(ptr_y + 1);
591
224M
      ptr_y += ((SIZE_T)del << 1);
592
593
224M
      x1r = *ptr_y;
594
224M
      x1i = *(ptr_y + 1);
595
596
224M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
597
224M
      x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
598
224M
      x1r = tmp;
599
600
224M
      *ptr_y = (x0r) - (x1r);
601
224M
      *(ptr_y + 1) = (x0i) - (x1i);
602
224M
      ptr_y -= ((SIZE_T)del << 1);
603
604
224M
      *ptr_y = (x0r) + (x1r);
605
224M
      *(ptr_y + 1) = (x0i) + (x1i);
606
224M
      ptr_y += 2;
607
224M
    }
608
12.7M
    twiddles = ptr_w;
609
237M
    for (j = del / 2; j != 0; j--) {
610
224M
      FLOAT64 w_1 = *twiddles;
611
224M
      FLOAT64 w_4 = *(twiddles + 257);
612
224M
      twiddles += nodespacing;
613
614
224M
      x0r = *ptr_y;
615
224M
      x0i = *(ptr_y + 1);
616
224M
      ptr_y += ((SIZE_T)del << 1);
617
618
224M
      x1r = *ptr_y;
619
224M
      x1i = *(ptr_y + 1);
620
621
224M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_4) + ixheaace_dmult((FLOAT64)x1i, w_1));
622
224M
      x1i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x1r, w_1) + ixheaace_dmult((FLOAT64)x1i, w_4));
623
224M
      x1r = tmp;
624
625
224M
      *ptr_y = (x0r) - (x1r);
626
224M
      *(ptr_y + 1) = (x0i) - (x1i);
627
224M
      ptr_y -= ((SIZE_T)del << 1);
628
629
224M
      *ptr_y = (x0r) + (x1r);
630
224M
      *(ptr_y + 1) = (x0i) + (x1i);
631
224M
      ptr_y += 2;
632
224M
    }
633
12.7M
  }
634
635
1.98G
  for (i = 0; i < nlength; i++) {
636
1.95G
    *(ptr_x + 2 * i) = y[2 * i];
637
1.95G
    *(ptr_x + 2 * i + 1) = y[2 * i + 1];
638
1.95G
  }
639
25.4M
}
640
641
static VOID iusace_complex_fft_p3(FLOAT32 *data, WORD32 nlength,
642
3.75M
                                  iusace_scratch_mem *pstr_scratch) {
643
3.75M
  WORD32 i, j;
644
3.75M
  FLOAT32 *data_3 = pstr_scratch->p_fft_p3_data_3;
645
3.75M
  FLOAT32 *y = pstr_scratch->p_fft_p3_y;
646
3.75M
  WORD32 cnfac;
647
3.75M
  WORD32 mpass = nlength;
648
3.75M
  FLOAT32 *ptr_x = data;
649
3.75M
  FLOAT32 *ptr_y = y;
650
651
3.75M
  cnfac = 0;
652
7.51M
  while (mpass % 3 == 0) {
653
3.75M
    mpass /= 3;
654
3.75M
    cnfac++;
655
3.75M
  }
656
657
15.0M
  for (i = 0; i < 3 * cnfac; i++) {
658
382M
    for (j = 0; j < mpass; j++) {
659
371M
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
660
371M
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
661
371M
    }
662
11.2M
    iusace_complex_fft_p2(data_3, mpass, pstr_scratch->p_fft_p2_y);
663
664
382M
    for (j = 0; j < mpass; j++) {
665
371M
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
666
371M
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
667
371M
    }
668
11.2M
  }
669
670
3.75M
  {
671
3.75M
    const FLOAT64 *w1r, *w1i;
672
3.75M
    FLOAT32 tmp;
673
3.75M
    w1r = iusace_twiddle_table_3pr;
674
3.75M
    w1i = iusace_twiddle_table_3pi;
675
676
127M
    for (i = 0; i < nlength; i += 3) {
677
123M
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
678
123M
      data[2 * i + 1] =
679
123M
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
680
123M
      data[2 * i] = tmp;
681
682
123M
      w1r++;
683
123M
      w1i++;
684
685
123M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
686
123M
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
687
123M
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
688
123M
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
689
123M
      data[2 * (i + 1)] = tmp;
690
691
123M
      w1r++;
692
123M
      w1i++;
693
694
123M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
695
123M
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
696
123M
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
697
123M
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
698
123M
      data[2 * (i + 2)] = tmp;
699
700
123M
      w1r += 3 * (128 / mpass - 1) + 1;
701
123M
      w1i += 3 * (128 / mpass - 1) + 1;
702
123M
    }
703
3.75M
  }
704
705
127M
  for (i = 0; i < mpass; i++) {
706
123M
    iusace_complex_3point_fft(ptr_x, ptr_y);
707
708
123M
    ptr_x = ptr_x + 6;
709
123M
    ptr_y = ptr_y + 6;
710
123M
  }
711
712
127M
  for (i = 0; i < mpass; i++) {
713
123M
    data[2 * i] = y[6 * i];
714
123M
    data[2 * i + 1] = y[6 * i + 1];
715
123M
  }
716
717
127M
  for (i = 0; i < mpass; i++) {
718
123M
    data[2 * (i + mpass)] = y[6 * i + 2];
719
123M
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
720
123M
  }
721
722
127M
  for (i = 0; i < mpass; i++) {
723
123M
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
724
123M
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
725
123M
  }
726
3.75M
}
727
728
0
VOID iusace_complex_fft_p3_no_scratch(FLOAT32 *data, WORD32 nlength) {
729
0
  WORD32 i, j;
730
731
0
  FLOAT32 data_3[800];
732
0
  FLOAT32 y[1024];
733
0
  FLOAT32 p_fft_p2_y[2048];
734
0
  WORD32 cnfac;
735
0
  WORD32 mpass = nlength;
736
0
  FLOAT32 *ptr_x = data;
737
0
  FLOAT32 *ptr_y = y;
738
739
0
  cnfac = 0;
740
0
  while (mpass % 3 == 0) {
741
0
    mpass /= 3;
742
0
    cnfac++;
743
0
  }
744
745
0
  for (i = 0; i < 3 * cnfac; i++) {
746
0
    for (j = 0; j < mpass; j++) {
747
0
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
748
0
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
749
0
    }
750
0
    iusace_complex_fft_p2(data_3, mpass, p_fft_p2_y);
751
752
0
    for (j = 0; j < mpass; j++) {
753
0
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
754
0
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
755
0
    }
756
0
  }
757
758
0
  {
759
0
    const FLOAT64 *w1r, *w1i;
760
0
    FLOAT32 tmp;
761
0
    w1r = iusace_twiddle_table_3pr;
762
0
    w1i = iusace_twiddle_table_3pi;
763
764
0
    for (i = 0; i < nlength; i += 3) {
765
0
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
766
0
      data[2 * i + 1] =
767
0
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
768
0
      data[2 * i] = tmp;
769
770
0
      w1r++;
771
0
      w1i++;
772
773
0
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
774
0
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
775
0
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
776
0
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
777
0
      data[2 * (i + 1)] = tmp;
778
779
0
      w1r++;
780
0
      w1i++;
781
782
0
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
783
0
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
784
0
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
785
0
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
786
0
      data[2 * (i + 2)] = tmp;
787
788
0
      w1r += 3 * (128 / mpass - 1) + 1;
789
0
      w1i += 3 * (128 / mpass - 1) + 1;
790
0
    }
791
0
  }
792
793
0
  for (i = 0; i < mpass; i++) {
794
0
    iusace_complex_3point_fft(ptr_x, ptr_y);
795
796
0
    ptr_x = ptr_x + 6;
797
0
    ptr_y = ptr_y + 6;
798
0
  }
799
800
0
  for (i = 0; i < mpass; i++) {
801
0
    data[2 * i] = y[6 * i];
802
0
    data[2 * i + 1] = y[6 * i + 1];
803
0
  }
804
805
0
  for (i = 0; i < mpass; i++) {
806
0
    data[2 * (i + mpass)] = y[6 * i + 2];
807
0
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
808
0
  }
809
810
0
  for (i = 0; i < mpass; i++) {
811
0
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
812
0
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
813
0
  }
814
0
}
815
816
static VOID iusace_calc_pre_twid_enc(FLOAT64 *ptr_in, FLOAT32 *fft_ptr, WORD32 npoints,
817
                                     const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
818
3.23M
                                     const WORD32 tx_flag) {
819
3.23M
  WORD32 i, n;
820
3.23M
  WORD32 b = npoints >> 1;
821
3.23M
  WORD32 a = npoints - b;
822
3.23M
  WORD32 nlength = npoints >> 2;
823
3.23M
  FLOAT64 tempr, tempi;
824
825
3.23M
  if (tx_flag == 0) {
826
1.61M
    FLOAT64 norm;
827
448M
    for (i = 0; i < b; i++) {
828
447M
      norm = ptr_in[i]; /* reuse MDCT: spectrally reverse all bins */
829
447M
      ptr_in[i] = ptr_in[npoints - 1 - i];
830
447M
      ptr_in[npoints - 1 - i] = norm;
831
447M
    }
832
1.61M
  }
833
450M
  for (i = 0; i < nlength; i++) {
834
447M
    n = npoints / 2 - 1 - 2 * i;
835
447M
    if (i < b / 4) {
836
223M
      tempr = ptr_in[a / 2 + n] + ptr_in[npoints + a / 2 - 1 - n];
837
223M
    } else {
838
223M
      tempr = ptr_in[a / 2 + n] - ptr_in[a / 2 - 1 - n];
839
223M
    }
840
447M
    n = 2 * i;
841
447M
    if (i < a / 4) {
842
223M
      tempi = ptr_in[a / 2 + n] - ptr_in[a / 2 - 1 - n];
843
223M
    } else {
844
223M
      tempi = ptr_in[a / 2 + n] + ptr_in[npoints + a / 2 - 1 - n];
845
223M
    }
846
847
447M
    fft_ptr[2 * i] = (FLOAT32)(tempr * (*cos_ptr) + tempi * (*sin_ptr));
848
447M
    fft_ptr[2 * i + 1] = (FLOAT32)(tempi * (*cos_ptr++) - tempr * (*sin_ptr++));
849
447M
  }
850
3.23M
}
851
852
11.3M
VOID iusace_complex_fft(FLOAT32 *data, WORD32 nlength, iusace_scratch_mem *pstr_scratch) {
853
11.3M
  if (nlength & (nlength - 1)) {
854
3.75M
    iusace_complex_fft_p3(data, nlength, pstr_scratch);
855
7.58M
  } else {
856
7.58M
    iusace_complex_fft_p2(data, nlength, pstr_scratch->p_fft_p2_y);
857
7.58M
  }
858
11.3M
}
859
860
static VOID iusace_calc_post_twid_enc(FLOAT64 *ptr_out, FLOAT32 *fft_ptr, WORD32 npoints,
861
                                      const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
862
3.23M
                                      const WORD32 tx_flag) {
863
3.23M
  WORD32 i;
864
3.23M
  WORD32 nlength = npoints >> 2;
865
3.23M
  FLOAT64 tempr, tempi;
866
867
  /* post-twiddle FFT output and then get output data */
868
450M
  for (i = 0; i < nlength; i++) {
869
447M
    tempr =
870
447M
        2 * ((FLOAT64)(fft_ptr[2 * i]) * (*cos_ptr) + (FLOAT64)(fft_ptr[2 * i + 1]) * (*sin_ptr));
871
447M
    tempi = 2 * ((FLOAT64)(fft_ptr[2 * i + 1]) * (*cos_ptr++) -
872
447M
                 (FLOAT64)(fft_ptr[2 * i]) * (*sin_ptr++));
873
874
447M
    ptr_out[2 * i] = -tempr;
875
447M
    ptr_out[npoints / 2 - 1 - 2 * i] = tempi;
876
447M
    ptr_out[npoints / 2 + 2 * i] = -tempi;
877
447M
    ptr_out[npoints - 1 - 2 * i] = tempr;
878
447M
  }
879
3.23M
  if (tx_flag == 0) {
880
448M
    for (i = 0; i < npoints; i += 2) {
881
447M
      ptr_out[i] *= -1; /* reuse MDCT: flip signs at odd indices */
882
447M
    }
883
1.61M
  }
884
3.23M
}
885
886
IA_ERRORCODE iusace_fft_based_mdct(FLOAT64 *ptr_in, FLOAT64 *ptr_out, WORD32 npoints,
887
3.23M
                                   const WORD32 tx_flag, iusace_scratch_mem *pstr_scratch) {
888
3.23M
  FLOAT32 *ptr_scratch1 = pstr_scratch->p_fft_mdct_buf;
889
3.23M
  const FLOAT64 *cos_ptr = NULL;
890
3.23M
  const FLOAT64 *sin_ptr = NULL;
891
3.23M
  WORD32 nlength = npoints >> 1;
892
3.23M
  WORD32 n_total = npoints << 1;
893
894
3.23M
  memset(ptr_scratch1, 0, ((SIZE_T)n_total << 1) * sizeof(*ptr_scratch1));
895
896
3.23M
  switch (npoints) {
897
1.09M
    case (96):
898
1.09M
      cos_ptr = iexheaac_pre_post_twid_cos_192;
899
1.09M
      sin_ptr = iexheaac_pre_post_twid_sin_192;
900
1.09M
      break;
901
1.51M
    case (128):
902
1.51M
      cos_ptr = iusace_pre_post_twid_cos_256;
903
1.51M
      sin_ptr = iusace_pre_post_twid_sin_256;
904
1.51M
      break;
905
181k
    case (768):
906
181k
      cos_ptr = iexheaac_pre_post_twid_cos_1536;
907
181k
      sin_ptr = iexheaac_pre_post_twid_sin_1536;
908
181k
      break;
909
444k
    case (1024):
910
444k
      cos_ptr = iusace_pre_post_twid_cos_2048;
911
444k
      sin_ptr = iusace_pre_post_twid_sin_2048;
912
444k
      break;
913
0
    default:
914
0
      return IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH;
915
3.23M
  }
916
917
  /* pre-twiddle */
918
3.23M
  iusace_calc_pre_twid_enc(ptr_in, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);
919
920
  /* complex FFT */
921
3.23M
  iusace_complex_fft(ptr_scratch1, nlength, pstr_scratch);
922
923
  /* post-twiddle */
924
3.23M
  iusace_calc_post_twid_enc(ptr_out, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);
925
926
3.23M
  return IA_NO_ERROR;
927
3.23M
}
928
929
208k
VOID iusace_complex_fft_2048(FLOAT32 *ptr_x, FLOAT32 *scratch_fft) {
930
208k
  WORD32 i;
931
208k
  FLOAT32 re, im, c_v, s_v, tmp_re, tmp_im;
932
208k
  FLOAT32 *ptr_re, *ptr_im, *ptr_re_h, *ptr_im_h;
933
208k
  FLOAT32 *ptr_cos_val, *ptr_sin_val;
934
208k
  iusace_complex_fft_p2(ptr_x, 1024, scratch_fft);
935
208k
  iusace_complex_fft_p2(ptr_x + 2048, 1024, scratch_fft);
936
937
208k
  ptr_re = ptr_x;
938
208k
  ptr_im = ptr_x + 1;
939
208k
  ptr_re_h = ptr_x + 2048;
940
208k
  ptr_im_h = ptr_x + 2048 + 1;
941
208k
  ptr_cos_val = (FLOAT32 *)&iusace_twiddle_cos_2048[0];
942
208k
  ptr_sin_val = (FLOAT32 *)&iusace_twiddle_sin_2048[0];
943
213M
  for (i = 0; i < 1024; i++) {
944
212M
    re = *ptr_re_h;
945
212M
    im = *ptr_im_h;
946
212M
    c_v = ptr_cos_val[i];
947
212M
    s_v = ptr_sin_val[i];
948
212M
    tmp_re = (re * c_v) + (im * s_v);
949
212M
    tmp_im = -(re * s_v) + (im * c_v);
950
212M
    re = *ptr_re;
951
212M
    im = *ptr_im;
952
953
212M
    *ptr_re = re + tmp_re;
954
212M
    *ptr_im = im + tmp_im;
955
212M
    *ptr_re_h = re - tmp_re;
956
212M
    *ptr_im_h = im - tmp_im;
957
958
212M
    ptr_re += 2;
959
212M
    ptr_im += 2;
960
212M
    ptr_re_h += 2;
961
212M
    ptr_im_h += 2;
962
212M
  }
963
208k
}
964
static VOID ixheaace_rad2_cplx_fft(FLOAT32 *ptr_real, FLOAT32 *ptr_imag, WORD32 n_points,
965
212k
                                   FLOAT32 *ptr_scratch) {
966
212k
  WORD32 i, j, k, n_stages, h2;
967
212k
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
968
212k
  WORD32 del, nodespacing, in_loop_cnt;
969
212k
  WORD32 not_power_4;
970
212k
  WORD32 dig_rev_shift;
971
212k
  WORD32 m_points = n_points;
972
212k
  FLOAT32 *ptr_x = ptr_scratch;
973
212k
  FLOAT32 *y = ptr_scratch + 2048;
974
212k
  FLOAT32 *ptr_y = y;
975
212k
  const FLOAT32 *ptr_w;
976
977
212k
  dig_rev_shift = ixheaac_norm32(m_points) + 1 - 16;
978
212k
  n_stages = 30 - ixheaac_norm32(m_points);
979
212k
  not_power_4 = n_stages & 1;
980
981
212k
  n_stages = n_stages >> 1;
982
983
212k
  ptr_w = ia_fft_twiddle_table_float;
984
985
217M
  for (i = 0; i < n_points; i++) {
986
217M
    ptr_x[2 * i] = ptr_real[i];
987
217M
    ptr_x[2 * i + 1] = ptr_imag[i];
988
217M
  }
989
212k
  dig_rev_shift = max(dig_rev_shift, 0);
990
54.5M
  for (i = 0; i < n_points; i += 4) {
991
54.2M
    FLOAT32 *inp = ptr_x;
992
54.2M
    FLOAT32 tmk;
993
994
54.2M
    DIG_REV(i, dig_rev_shift, h2);
995
54.2M
    if (not_power_4) {
996
0
      h2 += 1;
997
0
      h2 &= ~1;
998
0
    }
999
54.2M
    inp += (h2);
1000
1001
54.2M
    x0r = *inp;
1002
54.2M
    x0i = *(inp + 1);
1003
54.2M
    inp += (n_points >> 1);
1004
1005
54.2M
    x1r = *inp;
1006
54.2M
    x1i = *(inp + 1);
1007
54.2M
    inp += (n_points >> 1);
1008
1009
54.2M
    x2r = *inp;
1010
54.2M
    x2i = *(inp + 1);
1011
54.2M
    inp += (n_points >> 1);
1012
1013
54.2M
    x3r = *inp;
1014
54.2M
    x3i = *(inp + 1);
1015
1016
54.2M
    x0r = ia_add_flt(x0r, x2r);
1017
54.2M
    x0i = ia_add_flt(x0i, x2i);
1018
1019
54.2M
    tmk = ia_sub_flt(x0r, x2r);
1020
54.2M
    x2r = ia_sub_flt(tmk, x2r);
1021
54.2M
    tmk = ia_sub_flt(x0i, x2i);
1022
54.2M
    x2i = ia_sub_flt(tmk, x2i);
1023
1024
54.2M
    x1r = ia_add_flt(x1r, x3r);
1025
54.2M
    x1i = ia_add_flt(x1i, x3i);
1026
1027
54.2M
    tmk = ia_sub_flt(x1r, x3r);
1028
54.2M
    x3r = ia_sub_flt(tmk, x3r);
1029
54.2M
    tmk = ia_sub_flt(x1i, x3i);
1030
54.2M
    x3i = ia_sub_flt(tmk, x3i);
1031
1032
54.2M
    x0r = ia_add_flt(x0r, x1r);
1033
54.2M
    x0i = ia_add_flt(x0i, x1i);
1034
1035
54.2M
    tmk = ia_sub_flt(x0r, x1r);
1036
54.2M
    x1r = ia_sub_flt(tmk, x1r);
1037
54.2M
    tmk = ia_sub_flt(x0i, x1i);
1038
54.2M
    x1i = ia_sub_flt(tmk, x1i);
1039
1040
54.2M
    x2r = ia_add_flt(x2r, x3i);
1041
54.2M
    x2i = ia_sub_flt(x2i, x3r);
1042
1043
54.2M
    tmk = ia_sub_flt(x2r, x3i);
1044
54.2M
    x3i = ia_sub_flt(tmk, x3i);
1045
54.2M
    tmk = ia_add_flt(x2i, x3r);
1046
54.2M
    x3r = ia_add_flt(tmk, x3r);
1047
1048
54.2M
    *ptr_y++ = x0r;
1049
54.2M
    *ptr_y++ = x0i;
1050
54.2M
    *ptr_y++ = x2r;
1051
54.2M
    *ptr_y++ = x2i;
1052
54.2M
    *ptr_y++ = x1r;
1053
54.2M
    *ptr_y++ = x1i;
1054
54.2M
    *ptr_y++ = x3i;
1055
54.2M
    *ptr_y++ = x3r;
1056
54.2M
  }
1057
212k
  ptr_y -= 2 * n_points;
1058
212k
  del = 4;
1059
212k
  nodespacing = 64;
1060
212k
  in_loop_cnt = n_points >> 4;
1061
1.06M
  for (i = n_stages - 1; i > 0; i--) {
1062
848k
    const FLOAT32 *twiddles = ptr_w;
1063
848k
    FLOAT32 *data = ptr_y;
1064
848k
    FLOAT32 w_1, w_2, w_3, w_4, w_5, w_6;
1065
848k
    WORD32 sec_loop_cnt;
1066
1067
18.8M
    for (k = in_loop_cnt; k != 0; k--) {
1068
18.0M
      x0r = (*data);
1069
18.0M
      x0i = (*(data + 1));
1070
18.0M
      data += ((SIZE_T)del << 1);
1071
1072
18.0M
      x1r = (*data);
1073
18.0M
      x1i = (*(data + 1));
1074
18.0M
      data += ((SIZE_T)del << 1);
1075
1076
18.0M
      x2r = (*data);
1077
18.0M
      x2i = (*(data + 1));
1078
18.0M
      data += ((SIZE_T)del << 1);
1079
1080
18.0M
      x3r = (*data);
1081
18.0M
      x3i = (*(data + 1));
1082
18.0M
      data -= 3 * (del << 1);
1083
1084
18.0M
      x0r = ia_add_flt(x0r, x2r);
1085
18.0M
      x0i = ia_add_flt(x0i, x2i);
1086
18.0M
      x2r = ia_msu_flt(x0r, x2r, 2);
1087
18.0M
      x2i = ia_msu_flt(x0i, x2i, 2);
1088
18.0M
      x1r = ia_add_flt(x1r, x3r);
1089
18.0M
      x1i = ia_add_flt(x1i, x3i);
1090
18.0M
      x3r = ia_msu_flt(x1r, x3r, 2);
1091
18.0M
      x3i = ia_msu_flt(x1i, x3i, 2);
1092
1093
18.0M
      x0r = ia_add_flt(x0r, x1r);
1094
18.0M
      x0i = ia_add_flt(x0i, x1i);
1095
18.0M
      x1r = ia_msu_flt(x0r, x1r, 2);
1096
18.0M
      x1i = ia_msu_flt(x0i, x1i, 2);
1097
18.0M
      x2r = ia_add_flt(x2r, x3i);
1098
18.0M
      x2i = ia_sub_flt(x2i, x3r);
1099
18.0M
      x3i = ia_msu_flt(x2r, x3i, 2);
1100
18.0M
      x3r = ia_mac_flt(x2i, x3r, 2);
1101
1102
18.0M
      *data = x0r;
1103
18.0M
      *(data + 1) = x0i;
1104
18.0M
      data += ((SIZE_T)del << 1);
1105
1106
18.0M
      *data = x2r;
1107
18.0M
      *(data + 1) = x2i;
1108
18.0M
      data += ((SIZE_T)del << 1);
1109
1110
18.0M
      *data = x1r;
1111
18.0M
      *(data + 1) = x1i;
1112
18.0M
      data += ((SIZE_T)del << 1);
1113
1114
18.0M
      *data = x3i;
1115
18.0M
      *(data + 1) = x3r;
1116
18.0M
      data += ((SIZE_T)del << 1);
1117
18.0M
    }
1118
848k
    data = ptr_y + 2;
1119
1120
848k
    sec_loop_cnt = (nodespacing * del);
1121
848k
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
1122
848k
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
1123
848k
                   (sec_loop_cnt / 256);
1124
1125
24.6M
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
1126
23.7M
      w_1 = *(twiddles + j);
1127
23.7M
      w_4 = *(twiddles + j + 257);
1128
23.7M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
1129
23.7M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
1130
23.7M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
1131
23.7M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);
1132
1133
90.1M
      for (k = in_loop_cnt; k != 0; k--) {
1134
66.3M
        FLOAT32 tmp;
1135
        /*x0 is loaded later to avoid register crunch*/
1136
1137
66.3M
        data += ((SIZE_T)del << 1);
1138
1139
66.3M
        x1r = *data;
1140
66.3M
        x1i = *(data + 1);
1141
66.3M
        data += ((SIZE_T)del << 1);
1142
1143
66.3M
        x2r = *data;
1144
66.3M
        x2i = *(data + 1);
1145
66.3M
        data += ((SIZE_T)del << 1);
1146
1147
66.3M
        x3r = *data;
1148
66.3M
        x3i = *(data + 1);
1149
66.3M
        data -= 3 * (del << 1);
1150
1151
66.3M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1152
66.3M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1153
66.3M
        x1r = tmp;
1154
1155
66.3M
        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
1156
66.3M
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
1157
66.3M
        x2r = tmp;
1158
1159
66.3M
        tmp = ia_sub_flt(ia_mul_flt(x3r, w_3), ia_mul_flt(x3i, w_6));
1160
66.3M
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
1161
66.3M
        x3r = tmp;
1162
1163
66.3M
        x0r = (*data);
1164
66.3M
        x0i = (*(data + 1));
1165
1166
66.3M
        x0r = ia_add_flt(x0r, (x2r));
1167
66.3M
        x0i = ia_add_flt(x0i, (x2i));
1168
66.3M
        x2r = ia_msu_flt(x0r, x2r, 2);
1169
66.3M
        x2i = ia_msu_flt(x0i, x2i, 2);
1170
66.3M
        x1r = ia_add_flt(x1r, x3r);
1171
66.3M
        x1i = ia_add_flt(x1i, x3i);
1172
66.3M
        x3r = ia_msu_flt(x1r, x3r, 2);
1173
66.3M
        x3i = ia_msu_flt(x1i, x3i, 2);
1174
1175
66.3M
        x0r = ia_add_flt(x0r, (x1r));
1176
66.3M
        x0i = ia_add_flt(x0i, (x1i));
1177
66.3M
        x1r = ia_msu_flt(x0r, x1r, 2);
1178
66.3M
        x1i = ia_msu_flt(x0i, x1i, 2);
1179
66.3M
        x2r = ia_add_flt(x2r, (x3i));
1180
66.3M
        x2i = ia_sub_flt(x2i, (x3r));
1181
66.3M
        x3i = ia_msu_flt(x2r, x3i, 2);
1182
66.3M
        x3r = ia_mac_flt(x2i, x3r, 2);
1183
1184
66.3M
        *data = x0r;
1185
66.3M
        *(data + 1) = x0i;
1186
66.3M
        data += ((SIZE_T)del << 1);
1187
1188
66.3M
        *data = x2r;
1189
66.3M
        *(data + 1) = x2i;
1190
66.3M
        data += ((SIZE_T)del << 1);
1191
1192
66.3M
        *data = x1r;
1193
66.3M
        *(data + 1) = x1i;
1194
66.3M
        data += ((SIZE_T)del << 1);
1195
1196
66.3M
        *data = x3i;
1197
66.3M
        *(data + 1) = x3r;
1198
66.3M
        data += ((SIZE_T)del << 1);
1199
66.3M
      }
1200
23.7M
      data -= 2 * n_points;
1201
23.7M
      data += 2;
1202
23.7M
    }
1203
13.1M
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
1204
12.3M
      w_1 = *(twiddles + j);
1205
12.3M
      w_4 = *(twiddles + j + 257);
1206
12.3M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
1207
12.3M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
1208
12.3M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
1209
12.3M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
1210
1211
54.5M
      for (k = in_loop_cnt; k != 0; k--) {
1212
42.2M
        FLOAT32 tmp;
1213
        /*x0 is loaded later to avoid register crunch*/
1214
1215
42.2M
        data += ((SIZE_T)del << 1);
1216
1217
42.2M
        x1r = *data;
1218
42.2M
        x1i = *(data + 1);
1219
42.2M
        data += ((SIZE_T)del << 1);
1220
1221
42.2M
        x2r = *data;
1222
42.2M
        x2i = *(data + 1);
1223
42.2M
        data += ((SIZE_T)del << 1);
1224
1225
42.2M
        x3r = *data;
1226
42.2M
        x3i = *(data + 1);
1227
42.2M
        data -= 3 * (del << 1);
1228
1229
42.2M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1230
42.2M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1231
42.2M
        x1r = tmp;
1232
1233
42.2M
        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
1234
42.2M
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
1235
42.2M
        x2r = tmp;
1236
1237
42.2M
        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
1238
42.2M
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1239
42.2M
        x3r = tmp;
1240
1241
42.2M
        x0r = (*data);
1242
42.2M
        x0i = (*(data + 1));
1243
1244
42.2M
        x0r = ia_add_flt(x0r, (x2r));
1245
42.2M
        x0i = ia_add_flt(x0i, (x2i));
1246
42.2M
        x2r = ia_msu_flt(x0r, x2r, 2);
1247
42.2M
        x2i = ia_msu_flt(x0i, x2i, 2);
1248
42.2M
        x1r = ia_add_flt(x1r, x3r);
1249
42.2M
        x1i = ia_add_flt(x1i, x3i);
1250
42.2M
        x3r = ia_msu_flt(x1r, x3r, 2);
1251
42.2M
        x3i = ia_msu_flt(x1i, x3i, 2);
1252
1253
42.2M
        x0r = ia_add_flt(x0r, (x1r));
1254
42.2M
        x0i = ia_add_flt(x0i, (x1i));
1255
42.2M
        x1r = ia_msu_flt(x0r, x1r, 2);
1256
42.2M
        x1i = ia_msu_flt(x0i, x1i, 2);
1257
42.2M
        x2r = ia_add_flt(x2r, (x3i));
1258
42.2M
        x2i = ia_sub_flt(x2i, (x3r));
1259
42.2M
        x3i = ia_msu_flt(x2r, x3i, 2);
1260
42.2M
        x3r = ia_mac_flt(x2i, x3r, 2);
1261
1262
42.2M
        *data = x0r;
1263
42.2M
        *(data + 1) = x0i;
1264
42.2M
        data += ((SIZE_T)del << 1);
1265
1266
42.2M
        *data = x2r;
1267
42.2M
        *(data + 1) = x2i;
1268
42.2M
        data += ((SIZE_T)del << 1);
1269
1270
42.2M
        *data = x1r;
1271
42.2M
        *(data + 1) = x1i;
1272
42.2M
        data += ((SIZE_T)del << 1);
1273
1274
42.2M
        *data = x3i;
1275
42.2M
        *(data + 1) = x3r;
1276
42.2M
        data += ((SIZE_T)del << 1);
1277
42.2M
      }
1278
12.3M
      data -= 2 * n_points;
1279
12.3M
      data += 2;
1280
12.3M
    }
1281
12.3M
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
1282
11.4M
      w_1 = *(twiddles + j);
1283
11.4M
      w_4 = *(twiddles + j + 257);
1284
11.4M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
1285
11.4M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
1286
11.4M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
1287
11.4M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
1288
1289
35.6M
      for (k = in_loop_cnt; k != 0; k--) {
1290
24.1M
        FLOAT32 tmp;
1291
        /*x0 is loaded later to avoid register crunch*/
1292
1293
24.1M
        data += ((SIZE_T)del << 1);
1294
1295
24.1M
        x1r = *data;
1296
24.1M
        x1i = *(data + 1);
1297
24.1M
        data += ((SIZE_T)del << 1);
1298
1299
24.1M
        x2r = *data;
1300
24.1M
        x2i = *(data + 1);
1301
24.1M
        data += ((SIZE_T)del << 1);
1302
1303
24.1M
        x3r = *data;
1304
24.1M
        x3i = *(data + 1);
1305
24.1M
        data -= 3 * (del << 1);
1306
1307
24.1M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1308
24.1M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1309
24.1M
        x1r = tmp;
1310
1311
24.1M
        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
1312
24.1M
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
1313
24.1M
        x2r = tmp;
1314
1315
24.1M
        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
1316
24.1M
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1317
24.1M
        x3r = tmp;
1318
1319
24.1M
        x0r = (*data);
1320
24.1M
        x0i = (*(data + 1));
1321
1322
24.1M
        x0r = ia_add_flt(x0r, (x2r));
1323
24.1M
        x0i = ia_add_flt(x0i, (x2i));
1324
24.1M
        x2r = ia_msu_flt(x0r, x2r, 2);
1325
24.1M
        x2i = ia_msu_flt(x0i, x2i, 2);
1326
24.1M
        x1r = ia_add_flt(x1r, x3r);
1327
24.1M
        x1i = ia_add_flt(x1i, x3i);
1328
24.1M
        x3r = ia_msu_flt(x1r, x3r, 2);
1329
24.1M
        x3i = ia_msu_flt(x1i, x3i, 2);
1330
1331
24.1M
        x0r = ia_add_flt(x0r, (x1r));
1332
24.1M
        x0i = ia_add_flt(x0i, (x1i));
1333
24.1M
        x1r = ia_msu_flt(x0r, x1r, 2);
1334
24.1M
        x1i = ia_msu_flt(x0i, x1i, 2);
1335
24.1M
        x2r = ia_add_flt(x2r, (x3i));
1336
24.1M
        x2i = ia_sub_flt(x2i, (x3r));
1337
24.1M
        x3i = ia_msu_flt(x2r, x3i, 2);
1338
24.1M
        x3r = ia_mac_flt(x2i, x3r, 2);
1339
1340
24.1M
        *data = x0r;
1341
24.1M
        *(data + 1) = x0i;
1342
24.1M
        data += ((SIZE_T)del << 1);
1343
1344
24.1M
        *data = x2r;
1345
24.1M
        *(data + 1) = x2i;
1346
24.1M
        data += ((SIZE_T)del << 1);
1347
1348
24.1M
        *data = x1r;
1349
24.1M
        *(data + 1) = x1i;
1350
24.1M
        data += ((SIZE_T)del << 1);
1351
1352
24.1M
        *data = x3i;
1353
24.1M
        *(data + 1) = x3r;
1354
24.1M
        data += ((SIZE_T)del << 1);
1355
24.1M
      }
1356
11.4M
      data -= 2 * n_points;
1357
11.4M
      data += 2;
1358
11.4M
    }
1359
24.6M
    for (; j < nodespacing * del; j += nodespacing) {
1360
23.7M
      w_1 = *(twiddles + j);
1361
23.7M
      w_4 = *(twiddles + j + 257);
1362
23.7M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
1363
23.7M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
1364
23.7M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
1365
23.7M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);
1366
1367
90.1M
      for (k = in_loop_cnt; k != 0; k--) {
1368
66.3M
        FLOAT32 tmp;
1369
        /*x0 is loaded later to avoid register crunch*/
1370
1371
66.3M
        data += ((SIZE_T)del << 1);
1372
1373
66.3M
        x1r = *data;
1374
66.3M
        x1i = *(data + 1);
1375
66.3M
        data += ((SIZE_T)del << 1);
1376
1377
66.3M
        x2r = *data;
1378
66.3M
        x2i = *(data + 1);
1379
66.3M
        data += ((SIZE_T)del << 1);
1380
1381
66.3M
        x3r = *data;
1382
66.3M
        x3i = *(data + 1);
1383
66.3M
        data -= 3 * (del << 1);
1384
1385
66.3M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1386
66.3M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1387
66.3M
        x1r = tmp;
1388
1389
66.3M
        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
1390
66.3M
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
1391
66.3M
        x2r = tmp;
1392
1393
66.3M
        tmp = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1394
66.3M
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
1395
66.3M
        x3r = tmp;
1396
1397
66.3M
        x0r = (*data);
1398
66.3M
        x0i = (*(data + 1));
1399
1400
66.3M
        x0r = ia_add_flt(x0r, (x2r));
1401
66.3M
        x0i = ia_add_flt(x0i, (x2i));
1402
66.3M
        x2r = ia_msu_flt(x0r, x2r, 2);
1403
66.3M
        x2i = ia_msu_flt(x0i, x2i, 2);
1404
66.3M
        x1r = ia_add_flt(x1r, x3r);
1405
66.3M
        x1i = ia_sub_flt(x1i, x3i);
1406
66.3M
        x3r = ia_msu_flt(x1r, x3r, 2);
1407
66.3M
        x3i = ia_mac_flt(x1i, x3i, 2);
1408
1409
66.3M
        x0r = ia_add_flt(x0r, (x1r));
1410
66.3M
        x0i = ia_add_flt(x0i, (x1i));
1411
66.3M
        x1r = ia_msu_flt(x0r, x1r, 2);
1412
66.3M
        x1i = ia_msu_flt(x0i, x1i, 2);
1413
66.3M
        x2r = ia_add_flt(x2r, (x3i));
1414
66.3M
        x2i = ia_sub_flt(x2i, (x3r));
1415
66.3M
        x3i = ia_msu_flt(x2r, x3i, 2);
1416
66.3M
        x3r = ia_mac_flt(x2i, x3r, 2);
1417
1418
66.3M
        *data = x0r;
1419
66.3M
        *(data + 1) = x0i;
1420
66.3M
        data += ((SIZE_T)del << 1);
1421
1422
66.3M
        *data = x2r;
1423
66.3M
        *(data + 1) = x2i;
1424
66.3M
        data += ((SIZE_T)del << 1);
1425
1426
66.3M
        *data = x1r;
1427
66.3M
        *(data + 1) = x1i;
1428
66.3M
        data += ((SIZE_T)del << 1);
1429
1430
66.3M
        *data = x3i;
1431
66.3M
        *(data + 1) = x3r;
1432
66.3M
        data += ((SIZE_T)del << 1);
1433
66.3M
      }
1434
23.7M
      data -= 2 * n_points;
1435
23.7M
      data += 2;
1436
23.7M
    }
1437
848k
    nodespacing >>= 2;
1438
848k
    del <<= 2;
1439
848k
    in_loop_cnt >>= 2;
1440
848k
  }
1441
212k
  if (not_power_4) {
1442
0
    const FLOAT32 *twiddles = ptr_w;
1443
0
    nodespacing <<= 1;
1444
1445
0
    for (j = del / 2; j != 0; j--) {
1446
0
      FLOAT32 w_1 = *twiddles;
1447
0
      FLOAT32 w_4 = *(twiddles + 257);
1448
0
      FLOAT32 tmp;
1449
0
      twiddles += nodespacing;
1450
1451
0
      x0r = *ptr_y;
1452
0
      x0i = *(ptr_y + 1);
1453
0
      ptr_y += ((SIZE_T)del << 1);
1454
1455
0
      x1r = *ptr_y;
1456
0
      x1i = *(ptr_y + 1);
1457
1458
0
      tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1459
0
      x1i = (FLOAT32)ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1460
0
      x1r = tmp;
1461
1462
0
      *ptr_y = ia_sub_flt((x0r), (x1r));
1463
0
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
1464
0
      ptr_y -= ((SIZE_T)del << 1);
1465
1466
0
      *ptr_y = ia_add_flt((x0r), (x1r));
1467
0
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
1468
0
      ptr_y += 2;
1469
0
    }
1470
0
    twiddles = ptr_w;
1471
0
    for (j = del / 2; j != 0; j--) {
1472
0
      FLOAT32 w_1 = *twiddles;
1473
0
      FLOAT32 w_4 = *(twiddles + 257);
1474
0
      FLOAT32 tmp;
1475
0
      twiddles += nodespacing;
1476
1477
0
      x0r = *ptr_y;
1478
0
      x0i = *(ptr_y + 1);
1479
0
      ptr_y += ((SIZE_T)del << 1);
1480
1481
0
      x1r = *ptr_y;
1482
0
      x1i = *(ptr_y + 1);
1483
1484
0
      tmp = ia_add_flt(ia_mul_flt(x1r, w_4), ia_mul_flt(x1i, w_1));
1485
0
      x1i = ia_add_flt(ia_negate_flt(ia_mul_flt(x1r, w_1)), ia_mul_flt(x1i, w_4));
1486
0
      x1r = tmp;
1487
1488
0
      *ptr_y = ia_sub_flt((x0r), (x1r));
1489
0
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
1490
0
      ptr_y -= ((SIZE_T)del << 1);
1491
1492
0
      *ptr_y = ia_add_flt((x0r), (x1r));
1493
0
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
1494
0
      ptr_y += 2;
1495
0
    }
1496
0
  }
1497
1498
217M
  for (i = 0; i < n_points; i++) {
1499
217M
    ptr_real[i] = y[2 * i];
1500
217M
    ptr_imag[i] = y[2 * i + 1];
1501
217M
  }
1502
212k
}
1503
54.2M
static VOID ixheaace_cplx_fft_4(FLOAT32 *x_r, FLOAT32 *x_i) {
1504
54.2M
  FLOAT32 x_0, x_1, x_2, x_3;
1505
54.2M
  FLOAT32 x_4, x_5, x_6, x_7;
1506
54.2M
  FLOAT32 x0r, x1r, x2r, x3r;
1507
54.2M
  FLOAT32 x0i, x1i, x2i, x3i;
1508
1509
  // 4 Point FFT
1510
54.2M
  x_0 = x_r[0];
1511
54.2M
  x_1 = x_i[0];
1512
54.2M
  x_2 = x_r[1];
1513
54.2M
  x_3 = x_i[1];
1514
54.2M
  x_4 = x_r[2];
1515
54.2M
  x_5 = x_i[2];
1516
54.2M
  x_6 = x_r[3];
1517
54.2M
  x_7 = x_i[3];
1518
1519
54.2M
  x0r = ia_add_flt(x_0, x_4);
1520
54.2M
  x0i = ia_add_flt(x_1, x_5);
1521
54.2M
  x2r = ia_sub_flt(x_0, x_4);
1522
54.2M
  x2i = ia_sub_flt(x_1, x_5);
1523
54.2M
  x1r = ia_add_flt(x_2, x_6);
1524
54.2M
  x1i = ia_add_flt(x_3, x_7);
1525
54.2M
  x3r = ia_sub_flt(x_2, x_6);
1526
54.2M
  x3i = ia_sub_flt(x_3, x_7);
1527
1528
54.2M
  x_r[0] = ia_add_flt(x0r, x1r);
1529
54.2M
  x_i[0] = ia_add_flt(x0i, x1i);
1530
54.2M
  x_r[2] = ia_sub_flt(x0r, x1r);
1531
54.2M
  x_i[2] = ia_sub_flt(x0i, x1i);
1532
54.2M
  x_r[1] = ia_add_flt(x2r, x3i);
1533
54.2M
  x_i[1] = ia_sub_flt(x2i, x3r);
1534
54.2M
  x_r[3] = ia_sub_flt(x2r, x3i);
1535
54.2M
  x_i[3] = ia_add_flt(x2i, x3r);
1536
54.2M
  return;
1537
54.2M
}
1538
53.0k
VOID iusace_complex_fft_4096(FLOAT32 *ptr_x_r, FLOAT32 *ptr_x_i, FLOAT32 *ptr_scratch_buf) {
1539
53.0k
  FLOAT32 *ptr_data_r;
1540
53.0k
  FLOAT32 *ptr_data_i;
1541
53.0k
  WORD32 fft_len = 4096;
1542
53.0k
  FLOAT32 *ptr_fft_interim_buf = &ptr_scratch_buf[2 * fft_len];
1543
53.0k
  WORD32 i, j;
1544
53.0k
  WORD32 dim2 = fft_len >> 10;
1545
53.0k
  WORD32 dim1 = fft_len / dim2;
1546
53.0k
  WORD32 fac = 4;
1547
1548
265k
  for (i = 0; i < dim2; i++) {
1549
212k
    ptr_data_r = &ptr_scratch_buf[(2 * i + 0) * dim1];
1550
212k
    ptr_data_i = &ptr_scratch_buf[(2 * i + 1) * dim1];
1551
217M
    for (j = 0; j < dim1; j++) {
1552
217M
      ptr_data_r[j] = ptr_x_r[(dim2 * j + i)];
1553
217M
      ptr_data_i[j] = 0;
1554
217M
    }
1555
212k
    ixheaace_rad2_cplx_fft(ptr_data_r, ptr_data_i, dim1, ptr_fft_interim_buf);
1556
212k
  }
1557
53.0k
  ptr_data_r = &ptr_scratch_buf[0];
1558
53.0k
  ptr_data_i = &ptr_scratch_buf[0];
1559
54.3M
  for (i = 0; i < dim1; i++) {
1560
54.2M
    FLOAT32 *ptr_cos_val = (FLOAT32 *)&ia_mixed_rad_twiddle_cos[i * dim2 * fac];
1561
54.2M
    FLOAT32 *ptr_sin_val = (FLOAT32 *)&ia_mixed_rad_twiddle_sin[i * dim2 * fac];
1562
271M
    for (j = 0; j < dim2; j++) {
1563
217M
      FLOAT32 real = ptr_data_r[(2 * j + 0) * dim1 + i];
1564
217M
      FLOAT32 imag = ptr_data_i[(2 * j + 1) * dim1 + i];
1565
217M
      FLOAT32 cos_val = ptr_cos_val[j * fac];
1566
217M
      FLOAT32 sin_val = ptr_sin_val[j * fac];
1567
217M
      FLOAT32 temp_real = (FLOAT32)(real * cos_val + imag * sin_val);
1568
217M
      FLOAT32 temp_imag = (FLOAT32)(imag * cos_val - real * sin_val);
1569
217M
      ptr_fft_interim_buf[(2 * i + 0) * dim2 + j] = temp_real;
1570
217M
      ptr_fft_interim_buf[(2 * i + 1) * dim2 + j] = temp_imag;
1571
217M
    }
1572
54.2M
  }
1573
54.3M
  for (i = 0; i < dim1; i++) {
1574
54.2M
    ptr_data_r = &ptr_fft_interim_buf[(2 * i + 0) * dim2];
1575
54.2M
    ptr_data_i = &ptr_fft_interim_buf[(2 * i + 1) * dim2];
1576
54.2M
    ixheaace_cplx_fft_4(ptr_data_r, ptr_data_i);
1577
54.2M
  }
1578
53.0k
  ptr_data_r = &ptr_fft_interim_buf[0];
1579
53.0k
  ptr_data_i = &ptr_fft_interim_buf[0];
1580
54.3M
  for (i = 0; i < dim1; i++) {
1581
271M
    for (j = 0; j < dim2; j++) {
1582
217M
      ptr_x_r[(j * dim1 + i)] = ptr_data_r[(2 * i + 0) * dim2 + j];
1583
217M
      ptr_x_i[(j * dim1 + i)] = ptr_data_i[(2 * i + 1) * dim2 + j];
1584
217M
    }
1585
54.2M
  }
1586
53.0k
}