Coverage Report

Created: 2025-08-03 06:57

/src/libxaac/encoder/iusace_fft.c
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *                                                                            *
3
 * Copyright (C) 2023 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
 */
20
21
#include <string.h>
22
#include "ixheaac_type_def.h"
23
#include "ixheaace_adjust_threshold_data.h"
24
#include "iusace_cnst.h"
25
#include "iusace_block_switch_const.h"
26
#include "iusace_rom.h"
27
#include "iusace_bitbuffer.h"
28
29
/* DRC */
30
#include "impd_drc_common_enc.h"
31
#include "impd_drc_uni_drc.h"
32
#include "impd_drc_tables.h"
33
#include "impd_drc_api.h"
34
#include "impd_drc_uni_drc_eq.h"
35
#include "impd_drc_uni_drc_filter_bank.h"
36
#include "impd_drc_gain_enc.h"
37
#include "impd_drc_struct_def.h"
38
39
#include "iusace_tns_usac.h"
40
#include "iusace_psy_mod.h"
41
#include "iusace_config.h"
42
#include "iusace_fft.h"
43
#include "iusace_basic_ops_flt.h"
44
#include "ixheaac_constants.h"
45
#include "ixheaace_aac_constants.h"
46
#include "ixheaac_basic_ops32.h"
47
#include "ixheaace_common_utils.h"
48
#include "ixheaac_error_standards.h"
49
#include "ixheaace_error_codes.h"
50
51
/*
 * DIG_REV(i, m, j)
 *
 * Reverses the order of the 2-bit groups inside each 16-bit half of (i):
 * 2-bit groups are swapped inside every nibble, nibbles inside every byte,
 * and bytes inside every 16-bit half. The result is shifted right by (m)
 * and stored in (j).
 *
 * All masks are unsigned literals so that no bitwise operator is applied to a
 * signed operand. The masks assume a 32-bit unsigned int.
 * (i) and (m) are each evaluated exactly once.
 */
#define DIG_REV(i, m, j)                                                  \
  do {                                                                    \
    unsigned int dig_rev_val_ = (unsigned int)(i);                        \
    dig_rev_val_ = ((dig_rev_val_ & 0x33333333u) << 2) |                  \
                   ((dig_rev_val_ & 0xCCCCCCCCu) >> 2);                   \
    dig_rev_val_ = ((dig_rev_val_ & 0x0F0F0F0Fu) << 4) |                  \
                   ((dig_rev_val_ & 0xF0F0F0F0u) >> 4);                   \
    dig_rev_val_ = ((dig_rev_val_ & 0x00FF00FFu) << 8) |                  \
                   ((dig_rev_val_ & 0xFF00FF00u) >> 8);                   \
    (j) = dig_rev_val_ >> (m);                                            \
  } while (0)
59
60
41.1M
/*
 * Returns the number of left shifts needed to bring `a` up to at least
 * 0x40000000. Negative inputs are bit-inverted first. The two inputs whose
 * bits are all equal (0 and 0xffffffff) report 31.
 */
static PLATFORM_INLINE WORD8 iusace_calc_norm(WORD32 a) {
  WORD8 shift_cnt = 0;

  /* All-zero and all-one patterns never reach the threshold by shifting. */
  if (a == 0 || a == (WORD32)0xffffffffL) {
    return 31;
  }

  if (a < 0) {
    a = ~a;
  }

  while (a < (WORD32)0x40000000L) {
    a <<= 1;
    shift_cnt++;
  }

  return shift_cnt;
}
80
81
90.2M
/*
 * 3-point complex DFT.
 *
 * ptr_in  : three interleaved (re, im) FLOAT32 pairs, read only.
 * ptr_out : receives the three transformed (re, im) pairs.
 *
 * The sin(60 deg) products are formed in FLOAT64 and rounded back to FLOAT32.
 */
static PLATFORM_INLINE VOID iusace_complex_3point_fft(FLOAT32 *ptr_in, FLOAT32 *ptr_out) {
  FLOAT64 sin_60;
  FLOAT32 sum01_re, sum01_im;
  FLOAT32 sum12_re, diff12_re;
  FLOAT32 sum12_im, diff12_im;
  FLOAT32 half12_re, half12_im;
  FLOAT32 rot_from_im, rot_from_re;
  FLOAT32 base_re;

  sin_60 = 0.866025403784439;

  /* x0 + x1 */
  sum01_re = ptr_in[0] + ptr_in[2];
  sum01_im = ptr_in[1] + ptr_in[3];

  /* x1 + x2 and x1 - x2 */
  sum12_re = ptr_in[2] + ptr_in[4];
  sum12_im = ptr_in[3] + ptr_in[5];
  diff12_re = ptr_in[2] - ptr_in[4];
  diff12_im = ptr_in[3] - ptr_in[5];

  half12_re = sum12_re / (FLOAT32)2.0;
  half12_im = sum12_im / (FLOAT32)2.0;
  rot_from_im = (FLOAT32)((FLOAT64)diff12_im * sin_60);
  rot_from_re = (FLOAT32)((FLOAT64)diff12_re * sin_60);

  base_re = ptr_in[0] - half12_re;

  /* Bin 0 is the plain sum; bins 1 and 2 differ only in the rotation sign. */
  ptr_out[0] = sum01_re + ptr_in[4];
  ptr_out[1] = sum01_im + ptr_in[5];
  ptr_out[2] = base_re + rot_from_im;
  ptr_out[3] = (ptr_in[1] - rot_from_re) - half12_im;
  ptr_out[4] = base_re - rot_from_im;
  ptr_out[5] = (ptr_in[1] + rot_from_re) - half12_im;
}
115
116
20.5M
/*
 * Complex FFT kernel used for the power-of-two path.
 *
 * ptr_x            : interleaved (re, im) FLOAT32 data. It is read through
 *                    digit-reversed offsets in the first pass and overwritten
 *                    with the result at the end.
 * nlength          : number of complex points.
 *                    NOTE(review): the stage bookkeeping looks like it assumes
 *                    a power of two >= 4 - confirm against callers.
 * scratch_fft_p2_y : work buffer. The first pass writes 2 * nlength floats to
 *                    it, all later passes run in place on it, and the last
 *                    loop copies 2 * nlength floats back to ptr_x.
 *
 * Structure: one twiddle-free radix-4 pass that gathers its inputs with
 * DIG_REV, then (n_stages - 1) radix-4 passes that read
 * iusace_twiddle_table_fft_32x32, then one radix-2 pass when not_power_4 is
 * set, then the copy back.
 */
VOID iusace_complex_fft_p2(FLOAT32 *ptr_x, WORD32 nlength, FLOAT32 *scratch_fft_p2_y) {
117
20.5M
  WORD32 i, j, k, n_stages, h2;
118
20.5M
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
119
20.5M
  FLOAT32 tmp;
120
20.5M
  WORD32 del, nodespacing, in_loop_cnt;
121
20.5M
  WORD32 not_power_4;
122
20.5M
  WORD32 dig_rev_shift;
123
20.5M
  FLOAT32 *y = scratch_fft_p2_y;
124
20.5M
  WORD32 mpass = nlength;
125
20.5M
  WORD32 npoints = nlength;
126
20.5M
  FLOAT32 *ptr_y = y;
127
20.5M
  const FLOAT64 *ptr_w;
128
129
20.5M
  /* iusace_calc_norm() counts the left shifts needed to reach bit 30, so
   * 30 - norm is the index of the highest set bit of a positive nlength. */
  dig_rev_shift = iusace_calc_norm(mpass) + 1 - 16;
130
20.5M
  n_stages = 30 - iusace_calc_norm(mpass);
131
20.5M
  /* An odd bit index leaves one radix-2 stage over after pairing into radix-4. */
  not_power_4 = n_stages & 1;
132
133
20.5M
  n_stages = n_stages >> 1;
134
135
20.5M
  ptr_w = iusace_twiddle_table_fft_32x32;
136
137
20.5M
  if (dig_rev_shift < 0) {
138
0
    dig_rev_shift = 0;
139
0
  }
140
141
372M
  /* First pass: one twiddle-free radix-4 butterfly per four points. Inputs are
   * fetched (npoints >> 1) floats apart starting at offset h2; outputs are
   * written sequentially to the scratch buffer. */
  for (i = 0; i < npoints; i += 4) {
142
351M
    FLOAT32 *inp = ptr_x;
143
351M
    FLOAT32 tmk;
144
145
351M
    DIG_REV(i, dig_rev_shift, h2);
146
351M
    if (not_power_4) {
147
137M
      /* Round h2 up to the next even float offset. */
      h2 += 1;
148
137M
      h2 &= ~1;
149
137M
    }
150
351M
    inp += (h2);
151
152
351M
    x0r = *inp;
153
351M
    x0i = *(inp + 1);
154
351M
    inp += (npoints >> 1);
155
156
351M
    x1r = *inp;
157
351M
    x1i = *(inp + 1);
158
351M
    inp += (npoints >> 1);
159
160
351M
    x2r = *inp;
161
351M
    x2i = *(inp + 1);
162
351M
    inp += (npoints >> 1);
163
164
351M
    x3r = *inp;
165
351M
    x3i = *(inp + 1);
166
167
351M
    /* Each sum/difference pair is formed as s = a + b, then d = (s - b) - b. */
    x0r = x0r + x2r;
168
351M
    x0i = x0i + x2i;
169
170
351M
    tmk = x0r - x2r;
171
351M
    x2r = tmk - x2r;
172
351M
    tmk = x0i - x2i;
173
351M
    x2i = tmk - x2i;
174
175
351M
    x1r = x1r + x3r;
176
351M
    x1i = x1i + x3i;
177
178
351M
    tmk = x1r - x3r;
179
351M
    x3r = tmk - x3r;
180
351M
    tmk = x1i - x3i;
181
351M
    x3i = tmk - x3i;
182
183
351M
    x0r = x0r + x1r;
184
351M
    x0i = x0i + x1i;
185
186
351M
    tmk = x0r - x1r;
187
351M
    x1r = tmk - x1r;
188
351M
    tmk = x0i - x1i;
189
351M
    x1i = tmk - x1i;
190
191
351M
    x2r = x2r + x3i;
192
351M
    x2i = x2i - x3r;
193
194
351M
    tmk = x2r - x3i;
195
351M
    x3i = tmk - x3i;
196
351M
    tmk = x2i + x3r;
197
351M
    x3r = tmk + x3r;
198
199
351M
    /* Stored order is x0, x2, x1, then x3 with x3i in the real slot. */
    *ptr_y++ = x0r;
200
351M
    *ptr_y++ = x0i;
201
351M
    *ptr_y++ = x2r;
202
351M
    *ptr_y++ = x2i;
203
351M
    *ptr_y++ = x1r;
204
351M
    *ptr_y++ = x1i;
205
351M
    *ptr_y++ = x3i;
206
351M
    *ptr_y++ = x3r;
207
351M
  }
208
20.5M
  /* Rewind to the start of the scratch buffer. */
  ptr_y -= 2 * npoints;
209
20.5M
  del = 4;
210
20.5M
  nodespacing = 64;
211
20.5M
  in_loop_cnt = npoints >> 4;
212
44.9M
  /* Remaining radix-4 passes, run in place on the scratch buffer.
   * del         : distance between the four butterfly legs, in complex points
   *               (pointer steps use del << 1 floats).
   * nodespacing : stride of j through the twiddle table.
   * in_loop_cnt : number of butterflies that share one set of twiddles.
   * After each pass del grows by 4x and the other two shrink by 4x. */
  for (i = n_stages - 1; i > 0; i--) {
213
24.4M
    const FLOAT64 *twiddles = ptr_w;
214
24.4M
    FLOAT32 *data = ptr_y;
215
24.4M
    FLOAT64 w_1, w_2, w_3, w_4, w_5, w_6;
216
24.4M
    WORD32 sec_loop_cnt;
217
218
131M
    /* First column of every group: butterflies with no twiddle multiply. */
    for (k = in_loop_cnt; k != 0; k--) {
219
107M
      x0r = (*data);
220
107M
      x0i = (*(data + 1));
221
107M
      data += ((SIZE_T)del << 1);
222
223
107M
      x1r = (*data);
224
107M
      x1i = (*(data + 1));
225
107M
      data += ((SIZE_T)del << 1);
226
227
107M
      x2r = (*data);
228
107M
      x2i = (*(data + 1));
229
107M
      data += ((SIZE_T)del << 1);
230
231
107M
      x3r = (*data);
232
107M
      x3i = (*(data + 1));
233
107M
      data -= 3 * (del << 1);
234
235
107M
      x0r = x0r + x2r;
236
107M
      x0i = x0i + x2i;
237
107M
      x2r = x0r - (x2r * 2);
238
107M
      x2i = x0i - (x2i * 2);
239
107M
      x1r = x1r + x3r;
240
107M
      x1i = x1i + x3i;
241
107M
      x3r = x1r - (x3r * 2);
242
107M
      x3i = x1i - (x3i * 2);
243
244
107M
      x0r = x0r + x1r;
245
107M
      x0i = x0i + x1i;
246
107M
      x1r = x0r - (x1r * 2);
247
107M
      x1i = x0i - (x1i * 2);
248
107M
      x2r = x2r + x3i;
249
107M
      x2i = x2i - x3r;
250
107M
      x3i = x2r - (x3i * 2);
251
107M
      x3r = x2i + (x3r * 2);
252
253
107M
      *data = x0r;
254
107M
      *(data + 1) = x0i;
255
107M
      data += ((SIZE_T)del << 1);
256
257
107M
      *data = x2r;
258
107M
      *(data + 1) = x2i;
259
107M
      data += ((SIZE_T)del << 1);
260
261
107M
      *data = x1r;
262
107M
      *(data + 1) = x1i;
263
107M
      data += ((SIZE_T)del << 1);
264
265
107M
      *data = x3i;
266
107M
      *(data + 1) = x3r;
267
107M
      data += ((SIZE_T)del << 1);
268
107M
    }
269
24.4M
    /* Continue with the second complex column. */
    data = ptr_y + 2;
270
271
24.4M
    /* The alternating series of integer divisions sums to roughly one third
     * of (nodespacing * del). It is the upper j bound of the first twiddle
     * segment below. */
    sec_loop_cnt = (nodespacing * del);
272
24.4M
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
273
24.4M
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
274
24.4M
                   (sec_loop_cnt / 256);
275
276
133M
    /* The j range [nodespacing, nodespacing * del) is walked in four
     * consecutive segments. Each segment reads w_2/w_5 and w_3/w_6 from
     * different table offsets and applies the rotation form that matches.
     * NOTE(review): the +257 offset presumably selects the matching entry of
     * the second half of iusace_twiddle_table_fft_32x32 - confirm against the
     * table layout.
     *
     * Segment 1: all three twiddles are read directly at j, 2j and 3j. */
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
277
108M
      w_1 = *(twiddles + j);
278
108M
      w_4 = *(twiddles + j + 257);
279
108M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
280
108M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
281
108M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
282
108M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);
283
284
375M
      for (k = in_loop_cnt; k != 0; k--) {
285
266M
        data += ((SIZE_T)del << 1);
286
287
266M
        x1r = *data;
288
266M
        x1i = *(data + 1);
289
266M
        data += ((SIZE_T)del << 1);
290
291
266M
        x2r = *data;
292
266M
        x2i = *(data + 1);
293
266M
        data += ((SIZE_T)del << 1);
294
295
266M
        x3r = *data;
296
266M
        x3i = *(data + 1);
297
266M
        data -= 3 * (del << 1);
298
299
266M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
300
266M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
301
266M
        x1r = tmp;
302
303
266M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
304
266M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
305
266M
        x2r = tmp;
306
307
266M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_3) - ixheaace_dmult((FLOAT64)x3i, w_6));
308
266M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
309
266M
        x3r = tmp;
310
311
266M
        x0r = (*data);
312
266M
        x0i = (*(data + 1));
313
314
266M
        x0r = x0r + (x2r);
315
266M
        x0i = x0i + (x2i);
316
266M
        x2r = x0r - (x2r * 2);
317
266M
        x2i = x0i - (x2i * 2);
318
266M
        x1r = x1r + x3r;
319
266M
        x1i = x1i + x3i;
320
266M
        x3r = x1r - (x3r * 2);
321
266M
        x3i = x1i - (x3i * 2);
322
323
266M
        x0r = x0r + (x1r);
324
266M
        x0i = x0i + (x1i);
325
266M
        x1r = x0r - (x1r * 2);
326
266M
        x1i = x0i - (x1i * 2);
327
266M
        x2r = x2r + (x3i);
328
266M
        x2i = x2i - (x3r);
329
266M
        x3i = x2r - (x3i * 2);
330
266M
        x3r = x2i + (x3r * 2);
331
332
266M
        *data = x0r;
333
266M
        *(data + 1) = x0i;
334
266M
        data += ((SIZE_T)del << 1);
335
336
266M
        *data = x2r;
337
266M
        *(data + 1) = x2i;
338
266M
        data += ((SIZE_T)del << 1);
339
340
266M
        *data = x1r;
341
266M
        *(data + 1) = x1i;
342
266M
        data += ((SIZE_T)del << 1);
343
344
266M
        *data = x3i;
345
266M
        *(data + 1) = x3r;
346
266M
        data += ((SIZE_T)del << 1);
347
266M
      }
348
108M
      /* Rewind to the buffer start and step to the next complex column. */
      data -= 2 * npoints;
349
108M
      data += 2;
350
108M
    }
351
90.9M
    /* Segment 2: w_3/w_6 come from 3j - 256 and 3j + 1, and x3 uses the
     * (w_6, w_3) rotation form. */
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
352
66.5M
      w_1 = *(twiddles + j);
353
66.5M
      w_4 = *(twiddles + j + 257);
354
66.5M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
355
66.5M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
356
66.5M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
357
66.5M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
358
359
253M
      for (k = in_loop_cnt; k != 0; k--) {
360
186M
        data += ((SIZE_T)del << 1);
361
362
186M
        x1r = *data;
363
186M
        x1i = *(data + 1);
364
186M
        data += ((SIZE_T)del << 1);
365
366
186M
        x2r = *data;
367
186M
        x2i = *(data + 1);
368
186M
        data += ((SIZE_T)del << 1);
369
370
186M
        x3r = *data;
371
186M
        x3i = *(data + 1);
372
186M
        data -= 3 * (del << 1);
373
374
186M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
375
186M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
376
186M
        x1r = tmp;
377
378
186M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
379
186M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
380
186M
        x2r = tmp;
381
382
186M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
383
186M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
384
186M
        x3r = tmp;
385
386
186M
        x0r = (*data);
387
186M
        x0i = (*(data + 1));
388
389
186M
        x0r = x0r + (x2r);
390
186M
        x0i = x0i + (x2i);
391
186M
        x2r = x0r - (x2r * 2);
392
186M
        x2i = x0i - (x2i * 2);
393
186M
        x1r = x1r + x3r;
394
186M
        x1i = x1i + x3i;
395
186M
        x3r = x1r - (x3r * 2);
396
186M
        x3i = x1i - (x3i * 2);
397
398
186M
        x0r = x0r + (x1r);
399
186M
        x0i = x0i + (x1i);
400
186M
        x1r = x0r - (x1r * 2);
401
186M
        x1i = x0i - (x1i * 2);
402
186M
        x2r = x2r + (x3i);
403
186M
        x2i = x2i - (x3r);
404
186M
        x3i = x2r - (x3i * 2);
405
186M
        x3r = x2i + (x3r * 2);
406
407
186M
        *data = x0r;
408
186M
        *(data + 1) = x0i;
409
186M
        data += ((SIZE_T)del << 1);
410
411
186M
        *data = x2r;
412
186M
        *(data + 1) = x2i;
413
186M
        data += ((SIZE_T)del << 1);
414
415
186M
        *data = x1r;
416
186M
        *(data + 1) = x1i;
417
186M
        data += ((SIZE_T)del << 1);
418
419
186M
        *data = x3i;
420
186M
        *(data + 1) = x3r;
421
186M
        data += ((SIZE_T)del << 1);
422
186M
      }
423
66.5M
      data -= 2 * npoints;
424
66.5M
      data += 2;
425
66.5M
    }
426
66.5M
    /* Segment 3: w_2/w_5 also move to 2j - 256 and 2j + 1, and x2 switches to
     * the (w_5, w_2) rotation form as well. */
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
427
42.1M
      w_1 = *(twiddles + j);
428
42.1M
      w_4 = *(twiddles + j + 257);
429
42.1M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
430
42.1M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
431
42.1M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
432
42.1M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
433
434
121M
      for (k = in_loop_cnt; k != 0; k--) {
435
79.5M
        data += ((SIZE_T)del << 1);
436
437
79.5M
        x1r = *data;
438
79.5M
        x1i = *(data + 1);
439
79.5M
        data += ((SIZE_T)del << 1);
440
441
79.5M
        x2r = *data;
442
79.5M
        x2i = *(data + 1);
443
79.5M
        data += ((SIZE_T)del << 1);
444
445
79.5M
        x3r = *data;
446
79.5M
        x3i = *(data + 1);
447
79.5M
        data -= 3 * (del << 1);
448
449
79.5M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
450
79.5M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult(x1r, w_4), x1i, w_1);
451
79.5M
        x1r = tmp;
452
453
79.5M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
454
79.5M
        x2i = (FLOAT32)(-ixheaace_dmult(x2r, w_2) + ixheaace_dmult(x2i, w_5));
455
79.5M
        x2r = tmp;
456
457
79.5M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
458
79.5M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
459
79.5M
        x3r = tmp;
460
461
79.5M
        x0r = (*data);
462
79.5M
        x0i = (*(data + 1));
463
464
79.5M
        x0r = x0r + (x2r);
465
79.5M
        x0i = x0i + (x2i);
466
79.5M
        x2r = x0r - (x2r * 2);
467
79.5M
        x2i = x0i - (x2i * 2);
468
79.5M
        x1r = x1r + x3r;
469
79.5M
        x1i = x1i + x3i;
470
79.5M
        x3r = x1r - (x3r * 2);
471
79.5M
        x3i = x1i - (x3i * 2);
472
473
79.5M
        x0r = x0r + (x1r);
474
79.5M
        x0i = x0i + (x1i);
475
79.5M
        x1r = x0r - (x1r * 2);
476
79.5M
        x1i = x0i - (x1i * 2);
477
79.5M
        x2r = x2r + (x3i);
478
79.5M
        x2i = x2i - (x3r);
479
79.5M
        x3i = x2r - (x3i * 2);
480
79.5M
        x3r = x2i + (x3r * 2);
481
482
79.5M
        *data = x0r;
483
79.5M
        *(data + 1) = x0i;
484
79.5M
        data += ((SIZE_T)del << 1);
485
486
79.5M
        *data = x2r;
487
79.5M
        *(data + 1) = x2i;
488
79.5M
        data += ((SIZE_T)del << 1);
489
490
79.5M
        *data = x1r;
491
79.5M
        *(data + 1) = x1i;
492
79.5M
        data += ((SIZE_T)del << 1);
493
494
79.5M
        *data = x3i;
495
79.5M
        *(data + 1) = x3r;
496
79.5M
        data += ((SIZE_T)del << 1);
497
79.5M
      }
498
42.1M
      data -= 2 * npoints;
499
42.1M
      data += 2;
500
42.1M
    }
501
133M
    /* Segment 4: w_3/w_6 come from 3j - 512 and 3j - 512 + 257. The x3
     * rotation and the x1/x3 imaginary combination below use different signs
     * from the three segments above. */
    for (; j < nodespacing * del; j += nodespacing) {
502
108M
      w_1 = *(twiddles + j);
503
108M
      w_4 = *(twiddles + j + 257);
504
108M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
505
108M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
506
108M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
507
108M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);
508
509
375M
      for (k = in_loop_cnt; k != 0; k--) {
510
266M
        data += ((SIZE_T)del << 1);
511
512
266M
        x1r = *data;
513
266M
        x1i = *(data + 1);
514
266M
        data += ((SIZE_T)del << 1);
515
516
266M
        x2r = *data;
517
266M
        x2i = *(data + 1);
518
266M
        data += ((SIZE_T)del << 1);
519
520
266M
        x3r = *data;
521
266M
        x3i = *(data + 1);
522
266M
        data -= 3 * ((SIZE_T)del << 1);
523
524
266M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
525
266M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
526
266M
        x1r = tmp;
527
528
266M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
529
266M
        x2i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x2r, w_2) + ixheaace_dmult((FLOAT64)x2i, w_5));
530
266M
        x2r = tmp;
531
532
266M
        tmp = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
533
266M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
534
266M
        x3r = tmp;
535
536
266M
        x0r = (*data);
537
266M
        x0i = (*(data + 1));
538
539
266M
        x0r = x0r + (x2r);
540
266M
        x0i = x0i + (x2i);
541
266M
        x2r = x0r - (x2r * 2);
542
266M
        x2i = x0i - (x2i * 2);
543
266M
        x1r = x1r + x3r;
544
266M
        /* Note the sign: x3i is subtracted here and added back doubled below. */
        x1i = x1i - x3i;
545
266M
        x3r = x1r - (x3r * 2);
546
266M
        x3i = x1i + (x3i * 2);
547
548
266M
        x0r = x0r + (x1r);
549
266M
        x0i = x0i + (x1i);
550
266M
        x1r = x0r - (x1r * 2);
551
266M
        x1i = x0i - (x1i * 2);
552
266M
        x2r = x2r + (x3i);
553
266M
        x2i = x2i - (x3r);
554
266M
        x3i = x2r - (x3i * 2);
555
266M
        x3r = x2i + (x3r * 2);
556
557
266M
        *data = x0r;
558
266M
        *(data + 1) = x0i;
559
266M
        data += ((SIZE_T)del << 1);
560
561
266M
        *data = x2r;
562
266M
        *(data + 1) = x2i;
563
266M
        data += ((SIZE_T)del << 1);
564
565
266M
        *data = x1r;
566
266M
        *(data + 1) = x1i;
567
266M
        data += ((SIZE_T)del << 1);
568
569
266M
        *data = x3i;
570
266M
        *(data + 1) = x3r;
571
266M
        data += ((SIZE_T)del << 1);
572
266M
      }
573
108M
      data -= 2 * npoints;
574
108M
      data += 2;
575
108M
    }
576
24.4M
    nodespacing >>= 2;
577
24.4M
    del <<= 2;
578
24.4M
    in_loop_cnt >>= 2;
579
24.4M
  }
580
20.5M
  /* Left-over radix-2 pass. It runs as two loops of del / 2 butterflies each.
   * The second loop restarts the twiddle pointer and uses the
   * (w_4, w_1) / (-w_1, w_4) rotation form instead of (w_1, -w_4) / (w_4, w_1). */
  if (not_power_4) {
581
9.60M
    const FLOAT64 *twiddles = ptr_w;
582
9.60M
    nodespacing <<= 1;
583
584
146M
    for (j = del / 2; j != 0; j--) {
585
137M
      FLOAT64 w_1 = *twiddles;
586
137M
      FLOAT64 w_4 = *(twiddles + 257);
587
137M
      twiddles += nodespacing;
588
589
137M
      x0r = *ptr_y;
590
137M
      x0i = *(ptr_y + 1);
591
137M
      ptr_y += ((SIZE_T)del << 1);
592
593
137M
      x1r = *ptr_y;
594
137M
      x1i = *(ptr_y + 1);
595
596
137M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
597
137M
      x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
598
137M
      x1r = tmp;
599
600
137M
      *ptr_y = (x0r) - (x1r);
601
137M
      *(ptr_y + 1) = (x0i) - (x1i);
602
137M
      ptr_y -= ((SIZE_T)del << 1);
603
604
137M
      *ptr_y = (x0r) + (x1r);
605
137M
      *(ptr_y + 1) = (x0i) + (x1i);
606
137M
      ptr_y += 2;
607
137M
    }
608
9.60M
    twiddles = ptr_w;
609
146M
    for (j = del / 2; j != 0; j--) {
610
137M
      FLOAT64 w_1 = *twiddles;
611
137M
      FLOAT64 w_4 = *(twiddles + 257);
612
137M
      twiddles += nodespacing;
613
614
137M
      x0r = *ptr_y;
615
137M
      x0i = *(ptr_y + 1);
616
137M
      ptr_y += ((SIZE_T)del << 1);
617
618
137M
      x1r = *ptr_y;
619
137M
      x1i = *(ptr_y + 1);
620
621
137M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_4) + ixheaace_dmult((FLOAT64)x1i, w_1));
622
137M
      x1i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x1r, w_1) + ixheaace_dmult((FLOAT64)x1i, w_4));
623
137M
      x1r = tmp;
624
625
137M
      *ptr_y = (x0r) - (x1r);
626
137M
      *(ptr_y + 1) = (x0i) - (x1i);
627
137M
      ptr_y -= ((SIZE_T)del << 1);
628
629
137M
      *ptr_y = (x0r) + (x1r);
630
137M
      *(ptr_y + 1) = (x0i) + (x1i);
631
137M
      ptr_y += 2;
632
137M
    }
633
9.60M
  }
634
635
1.42G
  /* Copy the finished spectrum from the scratch buffer back to the caller. */
  for (i = 0; i < nlength; i++) {
636
1.40G
    *(ptr_x + 2 * i) = y[2 * i];
637
1.40G
    *(ptr_x + 2 * i + 1) = y[2 * i + 1];
638
1.40G
  }
639
20.5M
}
640
641
static VOID iusace_complex_fft_p3(FLOAT32 *data, WORD32 nlength,
642
2.94M
                                  iusace_scratch_mem *pstr_scratch) {
643
2.94M
  WORD32 i, j;
644
2.94M
  FLOAT32 *data_3 = pstr_scratch->p_fft_p3_data_3;
645
2.94M
  FLOAT32 *y = pstr_scratch->p_fft_p3_y;
646
2.94M
  WORD32 cnfac;
647
2.94M
  WORD32 mpass = nlength;
648
2.94M
  FLOAT32 *ptr_x = data;
649
2.94M
  FLOAT32 *ptr_y = y;
650
651
2.94M
  cnfac = 0;
652
5.89M
  while (mpass % 3 == 0) {
653
2.94M
    mpass /= 3;
654
2.94M
    cnfac++;
655
2.94M
  }
656
657
11.7M
  for (i = 0; i < 3 * cnfac; i++) {
658
279M
    for (j = 0; j < mpass; j++) {
659
270M
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
660
270M
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
661
270M
    }
662
8.83M
    iusace_complex_fft_p2(data_3, mpass, pstr_scratch->p_fft_p2_y);
663
664
279M
    for (j = 0; j < mpass; j++) {
665
270M
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
666
270M
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
667
270M
    }
668
8.83M
  }
669
670
2.94M
  {
671
2.94M
    const FLOAT64 *w1r, *w1i;
672
2.94M
    FLOAT32 tmp;
673
2.94M
    w1r = iusace_twiddle_table_3pr;
674
2.94M
    w1i = iusace_twiddle_table_3pi;
675
676
93.1M
    for (i = 0; i < nlength; i += 3) {
677
90.2M
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
678
90.2M
      data[2 * i + 1] =
679
90.2M
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
680
90.2M
      data[2 * i] = tmp;
681
682
90.2M
      w1r++;
683
90.2M
      w1i++;
684
685
90.2M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
686
90.2M
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
687
90.2M
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
688
90.2M
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
689
90.2M
      data[2 * (i + 1)] = tmp;
690
691
90.2M
      w1r++;
692
90.2M
      w1i++;
693
694
90.2M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
695
90.2M
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
696
90.2M
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
697
90.2M
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
698
90.2M
      data[2 * (i + 2)] = tmp;
699
700
90.2M
      w1r += 3 * (128 / mpass - 1) + 1;
701
90.2M
      w1i += 3 * (128 / mpass - 1) + 1;
702
90.2M
    }
703
2.94M
  }
704
705
93.1M
  for (i = 0; i < mpass; i++) {
706
90.2M
    iusace_complex_3point_fft(ptr_x, ptr_y);
707
708
90.2M
    ptr_x = ptr_x + 6;
709
90.2M
    ptr_y = ptr_y + 6;
710
90.2M
  }
711
712
93.1M
  for (i = 0; i < mpass; i++) {
713
90.2M
    data[2 * i] = y[6 * i];
714
90.2M
    data[2 * i + 1] = y[6 * i + 1];
715
90.2M
  }
716
717
93.1M
  for (i = 0; i < mpass; i++) {
718
90.2M
    data[2 * (i + mpass)] = y[6 * i + 2];
719
90.2M
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
720
90.2M
  }
721
722
93.1M
  for (i = 0; i < mpass; i++) {
723
90.2M
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
724
90.2M
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
725
90.2M
  }
726
2.94M
}
727
728
0
/*
 * Complex FFT for lengths that contain a factor of three, using fixed-size
 * stack buffers instead of caller-provided scratch memory.
 *
 * data    : interleaved (re, im) FLOAT32 buffer, transformed in place.
 * nlength : number of complex points.
 *
 * The function returns without touching data when
 *   - nlength <= 0 (a zero length would otherwise never leave the
 *     factor-stripping loop, because 0 % 3 == 0 and 0 / 3 == 0), or
 *   - the length left after removing the factors of three would overrun the
 *     local work buffers (data_3 holds 2 * mpass floats, y holds 6 * mpass).
 *
 * Steps: strip the factors of three from nlength, run iusace_complex_fft_p2
 * on each stride-3 column, rotate every point by the 3-point twiddle tables,
 * run a 3-point DFT on each consecutive triple, then regroup the outputs.
 */
VOID iusace_complex_fft_p3_no_scratch(FLOAT32 *data, WORD32 nlength) {
  WORD32 i, j;

  FLOAT32 data_3[800];
  FLOAT32 y[1024];
  FLOAT32 p_fft_p2_y[2048];
  WORD32 cnfac;
  WORD32 mpass = nlength;
  FLOAT32 *ptr_x = data;
  FLOAT32 *ptr_y = y;

  if (nlength <= 0) {
    return;
  }

  cnfac = 0;
  while (mpass % 3 == 0) {
    mpass /= 3;
    cnfac++;
  }

  /* Refuse lengths whose intermediate results do not fit the stack buffers. */
  if ((SIZE_T)2 * (SIZE_T)mpass > sizeof(data_3) / sizeof(data_3[0]) ||
      (SIZE_T)6 * (SIZE_T)mpass > sizeof(y) / sizeof(y[0])) {
    return;
  }

  /* Gather each stride-3 column, transform it, and scatter it back. */
  for (i = 0; i < 3 * cnfac; i++) {
    for (j = 0; j < mpass; j++) {
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
    }
    iusace_complex_fft_p2(data_3, mpass, p_fft_p2_y);

    for (j = 0; j < mpass; j++) {
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
    }
  }

  {
    const FLOAT64 *w1r, *w1i;
    FLOAT32 tmp;
    w1r = iusace_twiddle_table_3pr;
    w1i = iusace_twiddle_table_3pi;

    /* Rotate the three points of every triple by consecutive table entries,
     * then skip ahead in the tables by a stride derived from mpass. */
    for (i = 0; i < nlength; i += 3) {
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
      data[2 * i + 1] =
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
      data[2 * i] = tmp;

      w1r++;
      w1i++;

      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
      data[2 * (i + 1)] = tmp;

      w1r++;
      w1i++;

      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
      data[2 * (i + 2)] = tmp;

      w1r += 3 * (128 / mpass - 1) + 1;
      w1i += 3 * (128 / mpass - 1) + 1;
    }
  }

  /* One 3-point DFT per consecutive triple, written to the local y buffer. */
  for (i = 0; i < mpass; i++) {
    iusace_complex_3point_fft(ptr_x, ptr_y);

    ptr_x = ptr_x + 6;
    ptr_y = ptr_y + 6;
  }

  /* Regroup: output leg k of every triple goes to block k of data. */
  for (i = 0; i < mpass; i++) {
    data[2 * i] = y[6 * i];
    data[2 * i + 1] = y[6 * i + 1];
  }

  for (i = 0; i < mpass; i++) {
    data[2 * (i + mpass)] = y[6 * i + 2];
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
  }

  for (i = 0; i < mpass; i++) {
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
  }
}
815
816
/*
 * Folds the input block and applies the pre-twiddle rotation, producing the
 * interleaved complex input for the FFT.
 *
 * ptr_in  : FLOAT64 input samples. When tx_flag is 0 the first npoints
 *           entries are reversed in place before folding.
 * fft_ptr : receives npoints / 4 interleaved (re, im) FLOAT32 pairs.
 * npoints : block size that all fold indices are derived from.
 * cos_ptr : cosine twiddles, one entry per output pair.
 * sin_ptr : sine twiddles, one entry per output pair.
 * tx_flag : 0 selects the reversed-input variant.
 */
static VOID iusace_calc_pre_twid_enc(FLOAT64 *ptr_in, FLOAT32 *fft_ptr, WORD32 npoints,
                                     const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
                                     const WORD32 tx_flag) {
  const WORD32 lower_len = npoints >> 1;
  const WORD32 upper_len = npoints - lower_len;
  const WORD32 num_pairs = npoints >> 2;
  const WORD32 fold_mid = upper_len / 2;
  WORD32 idx;

  if (tx_flag == 0) {
    /* Reverse the first npoints samples in place. */
    WORD32 lo, hi;
    for (lo = 0, hi = npoints - 1; lo < lower_len; lo++, hi--) {
      const FLOAT64 saved = ptr_in[lo];
      ptr_in[lo] = ptr_in[hi];
      ptr_in[hi] = saved;
    }
  }

  for (idx = 0; idx < num_pairs; idx++) {
    const WORD32 rev_off = npoints / 2 - 1 - 2 * idx;
    const WORD32 fwd_off = 2 * idx;
    const FLOAT64 tw_cos = cos_ptr[idx];
    const FLOAT64 tw_sin = sin_ptr[idx];
    FLOAT64 fold_re, fold_im;

    /* Real part: walks backwards from the middle of the block. */
    fold_re = ptr_in[fold_mid + rev_off];
    if (idx < lower_len / 4) {
      fold_re = fold_re + ptr_in[npoints + fold_mid - 1 - rev_off];
    } else {
      fold_re = fold_re - ptr_in[fold_mid - 1 - rev_off];
    }

    /* Imaginary part: walks forwards, with the two cases swapped. */
    fold_im = ptr_in[fold_mid + fwd_off];
    if (idx < upper_len / 4) {
      fold_im = fold_im - ptr_in[fold_mid - 1 - fwd_off];
    } else {
      fold_im = fold_im + ptr_in[npoints + fold_mid - 1 - fwd_off];
    }

    fft_ptr[2 * idx] = (FLOAT32)(fold_re * tw_cos + fold_im * tw_sin);
    fft_ptr[2 * idx + 1] = (FLOAT32)(fold_im * tw_cos - fold_re * tw_sin);
  }
}
851
852
9.18M
/*
 * Dispatches a complex FFT of nlength points on data.
 *
 * Lengths with at most one bit set go to iusace_complex_fft_p2. Every other
 * length goes to iusace_complex_fft_p3. pstr_scratch supplies the work
 * buffers for both paths.
 */
VOID iusace_complex_fft(FLOAT32 *data, WORD32 nlength, iusace_scratch_mem *pstr_scratch) {
  const WORD32 extra_bits = nlength & (nlength - 1);

  if (extra_bits == 0) {
    iusace_complex_fft_p2(data, nlength, pstr_scratch->p_fft_p2_y);
    return;
  }

  iusace_complex_fft_p3(data, nlength, pstr_scratch);
}
859
860
/* Post-twiddle stage executed after the complex FFT.
 *
 * ptr_out  : receives npoints FLOAT64 output values
 * fft_ptr  : npoints / 4 interleaved (re, im) FLOAT32 pairs produced by the FFT
 * npoints  : number of values written to ptr_out
 * cos_ptr,
 * sin_ptr  : twiddle tables, one entry consumed per complex input
 * tx_flag  : 0 requests an extra sign flip of the even-indexed outputs
 */
static VOID iusace_calc_post_twid_enc(FLOAT64 *ptr_out, FLOAT32 *fft_ptr, WORD32 npoints,
                                      const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
                                      const WORD32 tx_flag) {
  const WORD32 num_cplx = npoints >> 2;
  WORD32 idx;

  for (idx = 0; idx < num_cplx; idx++) {
    const FLOAT64 in_re = (FLOAT64)(fft_ptr[2 * idx]);
    const FLOAT64 in_im = (FLOAT64)(fft_ptr[2 * idx + 1]);
    const FLOAT64 cos_val = cos_ptr[idx];
    const FLOAT64 sin_val = sin_ptr[idx];

    /* Twiddle multiplication with a gain of two. */
    const FLOAT64 out_re = 2 * (in_re * cos_val + in_im * sin_val);
    const FLOAT64 out_im = 2 * (in_im * cos_val - in_re * sin_val);

    /* Each twiddled value is written to four output positions with alternating sign. */
    ptr_out[2 * idx] = -out_re;
    ptr_out[npoints / 2 - 1 - 2 * idx] = out_im;
    ptr_out[npoints / 2 + 2 * idx] = -out_im;
    ptr_out[npoints - 1 - 2 * idx] = out_re;
  }

  if (tx_flag == 0) {
    /* Negate every second output, starting at index 0 (indices 0, 2, 4, ...). */
    for (idx = 0; idx < npoints; idx += 2) {
      ptr_out[idx] *= -1;
    }
  }
}
885
886
/* FFT-based transform: pre-twiddle, complex FFT of length npoints / 2, post-twiddle.
 *
 * ptr_in       : 2 * npoints input samples (may be reversed in place by the
 *                pre-twiddle stage when tx_flag == 0)
 * ptr_out      : receives 2 * npoints output values
 * npoints      : transform size; only 96, 128, 768 and 1024 are supported
 * tx_flag      : forwarded unchanged to the pre- and post-twiddle stages
 * pstr_scratch : scratch memory; p_fft_mdct_buf is used as the FFT work buffer
 *
 * Returns IA_NO_ERROR on success, or
 * IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH for an unsupported npoints.
 * In the error case no buffer is touched.
 */
IA_ERRORCODE iusace_fft_based_mdct(FLOAT64 *ptr_in, FLOAT64 *ptr_out, WORD32 npoints,
                                   const WORD32 tx_flag, iusace_scratch_mem *pstr_scratch) {
  FLOAT32 *ptr_scratch1;
  const FLOAT64 *cos_ptr = NULL;
  const FLOAT64 *sin_ptr = NULL;
  WORD32 nlength;
  WORD32 n_total;

  /* Validate the length before any size is derived from it: clearing the
   * scratch buffer with a size computed from an unsupported (possibly huge or
   * negative) npoints could write outside the buffer. */
  switch (npoints) {
    case (96):
      cos_ptr = iexheaac_pre_post_twid_cos_192;
      sin_ptr = iexheaac_pre_post_twid_sin_192;
      break;
    case (128):
      cos_ptr = iusace_pre_post_twid_cos_256;
      sin_ptr = iusace_pre_post_twid_sin_256;
      break;
    case (768):
      cos_ptr = iexheaac_pre_post_twid_cos_1536;
      sin_ptr = iexheaac_pre_post_twid_sin_1536;
      break;
    case (1024):
      cos_ptr = iusace_pre_post_twid_cos_2048;
      sin_ptr = iusace_pre_post_twid_sin_2048;
      break;
    default:
      return IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH;
  }

  ptr_scratch1 = pstr_scratch->p_fft_mdct_buf;
  nlength = npoints >> 1;
  n_total = npoints << 1;

  /* Clear 2 * n_total FLOAT32 entries of the work buffer. */
  memset(ptr_scratch1, 0, ((SIZE_T)n_total << 1) * sizeof(*ptr_scratch1));

  /* pre-twiddle */
  iusace_calc_pre_twid_enc(ptr_in, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);

  /* complex FFT */
  iusace_complex_fft(ptr_scratch1, nlength, pstr_scratch);

  /* post-twiddle */
  iusace_calc_post_twid_enc(ptr_out, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);

  return IA_NO_ERROR;
}
928
929
156k
/* 2048-point complex FFT assembled from two 1024-point transforms.
 *
 * ptr_x       : 2048 interleaved complex values (4096 FLOAT32), transformed in place
 * scratch_fft : work buffer handed to iusace_complex_fft_p2
 *
 * Both halves of ptr_x are transformed independently; afterwards element k of
 * the upper half is multiplied by the k-th entry of the 2048-point twiddle
 * tables and combined with element k of the lower half (sum to the lower half,
 * difference to the upper half).
 */
VOID iusace_complex_fft_2048(FLOAT32 *ptr_x, FLOAT32 *scratch_fft) {
  const FLOAT32 *ptr_cos_tbl = (const FLOAT32 *)&iusace_twiddle_cos_2048[0];
  const FLOAT32 *ptr_sin_tbl = (const FLOAT32 *)&iusace_twiddle_sin_2048[0];
  FLOAT32 *ptr_lo = ptr_x;
  FLOAT32 *ptr_hi = ptr_x + 2048;
  WORD32 k;

  iusace_complex_fft_p2(ptr_x, 1024, scratch_fft);
  iusace_complex_fft_p2(ptr_x + 2048, 1024, scratch_fft);

  for (k = 0; k < 1024; k++, ptr_lo += 2, ptr_hi += 2) {
    const FLOAT32 c_v = ptr_cos_tbl[k];
    const FLOAT32 s_v = ptr_sin_tbl[k];
    const FLOAT32 hi_re = ptr_hi[0];
    const FLOAT32 hi_im = ptr_hi[1];

    /* Upper-half element multiplied by the twiddle pair. */
    const FLOAT32 rot_re = (hi_re * c_v) + (hi_im * s_v);
    const FLOAT32 rot_im = -(hi_re * s_v) + (hi_im * c_v);

    const FLOAT32 lo_re = ptr_lo[0];
    const FLOAT32 lo_im = ptr_lo[1];

    ptr_lo[0] = lo_re + rot_re;
    ptr_lo[1] = lo_im + rot_im;
    ptr_hi[0] = lo_re - rot_re;
    ptr_hi[1] = lo_im - rot_im;
  }
}
964
static VOID ixheaace_rad2_cplx_fft(FLOAT32 *ptr_real, FLOAT32 *ptr_imag, WORD32 n_points,
965
191k
                                   FLOAT32 *ptr_scratch) {
966
191k
  WORD32 i, j, k, n_stages, h2;
967
191k
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
968
191k
  WORD32 del, nodespacing, in_loop_cnt;
969
191k
  WORD32 not_power_4;
970
191k
  WORD32 dig_rev_shift;
971
191k
  WORD32 m_points = n_points;
972
191k
  FLOAT32 *ptr_x = ptr_scratch;
973
191k
  FLOAT32 *y = ptr_scratch + 2048;
974
191k
  FLOAT32 *ptr_y = y;
975
191k
  const FLOAT32 *ptr_w;
976
977
191k
  dig_rev_shift = ixheaac_norm32(m_points) + 1 - 16;
978
191k
  n_stages = 30 - ixheaac_norm32(m_points);
979
191k
  not_power_4 = n_stages & 1;
980
981
191k
  n_stages = n_stages >> 1;
982
983
191k
  ptr_w = ia_fft_twiddle_table_float;
984
985
196M
  for (i = 0; i < n_points; i++) {
986
196M
    ptr_x[2 * i] = ptr_real[i];
987
196M
    ptr_x[2 * i + 1] = ptr_imag[i];
988
196M
  }
989
191k
  dig_rev_shift = max(dig_rev_shift, 0);
990
49.2M
  for (i = 0; i < n_points; i += 4) {
991
49.0M
    FLOAT32 *inp = ptr_x;
992
49.0M
    FLOAT32 tmk;
993
994
49.0M
    DIG_REV(i, dig_rev_shift, h2);
995
49.0M
    if (not_power_4) {
996
0
      h2 += 1;
997
0
      h2 &= ~1;
998
0
    }
999
49.0M
    inp += (h2);
1000
1001
49.0M
    x0r = *inp;
1002
49.0M
    x0i = *(inp + 1);
1003
49.0M
    inp += (n_points >> 1);
1004
1005
49.0M
    x1r = *inp;
1006
49.0M
    x1i = *(inp + 1);
1007
49.0M
    inp += (n_points >> 1);
1008
1009
49.0M
    x2r = *inp;
1010
49.0M
    x2i = *(inp + 1);
1011
49.0M
    inp += (n_points >> 1);
1012
1013
49.0M
    x3r = *inp;
1014
49.0M
    x3i = *(inp + 1);
1015
1016
49.0M
    x0r = ia_add_flt(x0r, x2r);
1017
49.0M
    x0i = ia_add_flt(x0i, x2i);
1018
1019
49.0M
    tmk = ia_sub_flt(x0r, x2r);
1020
49.0M
    x2r = ia_sub_flt(tmk, x2r);
1021
49.0M
    tmk = ia_sub_flt(x0i, x2i);
1022
49.0M
    x2i = ia_sub_flt(tmk, x2i);
1023
1024
49.0M
    x1r = ia_add_flt(x1r, x3r);
1025
49.0M
    x1i = ia_add_flt(x1i, x3i);
1026
1027
49.0M
    tmk = ia_sub_flt(x1r, x3r);
1028
49.0M
    x3r = ia_sub_flt(tmk, x3r);
1029
49.0M
    tmk = ia_sub_flt(x1i, x3i);
1030
49.0M
    x3i = ia_sub_flt(tmk, x3i);
1031
1032
49.0M
    x0r = ia_add_flt(x0r, x1r);
1033
49.0M
    x0i = ia_add_flt(x0i, x1i);
1034
1035
49.0M
    tmk = ia_sub_flt(x0r, x1r);
1036
49.0M
    x1r = ia_sub_flt(tmk, x1r);
1037
49.0M
    tmk = ia_sub_flt(x0i, x1i);
1038
49.0M
    x1i = ia_sub_flt(tmk, x1i);
1039
1040
49.0M
    x2r = ia_add_flt(x2r, x3i);
1041
49.0M
    x2i = ia_sub_flt(x2i, x3r);
1042
1043
49.0M
    tmk = ia_sub_flt(x2r, x3i);
1044
49.0M
    x3i = ia_sub_flt(tmk, x3i);
1045
49.0M
    tmk = ia_add_flt(x2i, x3r);
1046
49.0M
    x3r = ia_add_flt(tmk, x3r);
1047
1048
49.0M
    *ptr_y++ = x0r;
1049
49.0M
    *ptr_y++ = x0i;
1050
49.0M
    *ptr_y++ = x2r;
1051
49.0M
    *ptr_y++ = x2i;
1052
49.0M
    *ptr_y++ = x1r;
1053
49.0M
    *ptr_y++ = x1i;
1054
49.0M
    *ptr_y++ = x3i;
1055
49.0M
    *ptr_y++ = x3r;
1056
49.0M
  }
1057
191k
  ptr_y -= 2 * n_points;
1058
191k
  del = 4;
1059
191k
  nodespacing = 64;
1060
191k
  in_loop_cnt = n_points >> 4;
1061
958k
  for (i = n_stages - 1; i > 0; i--) {
1062
766k
    const FLOAT32 *twiddles = ptr_w;
1063
766k
    FLOAT32 *data = ptr_y;
1064
766k
    FLOAT32 w_1, w_2, w_3, w_4, w_5, w_6;
1065
766k
    WORD32 sec_loop_cnt;
1066
1067
17.0M
    for (k = in_loop_cnt; k != 0; k--) {
1068
16.2M
      x0r = (*data);
1069
16.2M
      x0i = (*(data + 1));
1070
16.2M
      data += ((SIZE_T)del << 1);
1071
1072
16.2M
      x1r = (*data);
1073
16.2M
      x1i = (*(data + 1));
1074
16.2M
      data += ((SIZE_T)del << 1);
1075
1076
16.2M
      x2r = (*data);
1077
16.2M
      x2i = (*(data + 1));
1078
16.2M
      data += ((SIZE_T)del << 1);
1079
1080
16.2M
      x3r = (*data);
1081
16.2M
      x3i = (*(data + 1));
1082
16.2M
      data -= 3 * (del << 1);
1083
1084
16.2M
      x0r = ia_add_flt(x0r, x2r);
1085
16.2M
      x0i = ia_add_flt(x0i, x2i);
1086
16.2M
      x2r = ia_msu_flt(x0r, x2r, 2);
1087
16.2M
      x2i = ia_msu_flt(x0i, x2i, 2);
1088
16.2M
      x1r = ia_add_flt(x1r, x3r);
1089
16.2M
      x1i = ia_add_flt(x1i, x3i);
1090
16.2M
      x3r = ia_msu_flt(x1r, x3r, 2);
1091
16.2M
      x3i = ia_msu_flt(x1i, x3i, 2);
1092
1093
16.2M
      x0r = ia_add_flt(x0r, x1r);
1094
16.2M
      x0i = ia_add_flt(x0i, x1i);
1095
16.2M
      x1r = ia_msu_flt(x0r, x1r, 2);
1096
16.2M
      x1i = ia_msu_flt(x0i, x1i, 2);
1097
16.2M
      x2r = ia_add_flt(x2r, x3i);
1098
16.2M
      x2i = ia_sub_flt(x2i, x3r);
1099
16.2M
      x3i = ia_msu_flt(x2r, x3i, 2);
1100
16.2M
      x3r = ia_mac_flt(x2i, x3r, 2);
1101
1102
16.2M
      *data = x0r;
1103
16.2M
      *(data + 1) = x0i;
1104
16.2M
      data += ((SIZE_T)del << 1);
1105
1106
16.2M
      *data = x2r;
1107
16.2M
      *(data + 1) = x2i;
1108
16.2M
      data += ((SIZE_T)del << 1);
1109
1110
16.2M
      *data = x1r;
1111
16.2M
      *(data + 1) = x1i;
1112
16.2M
      data += ((SIZE_T)del << 1);
1113
1114
16.2M
      *data = x3i;
1115
16.2M
      *(data + 1) = x3r;
1116
16.2M
      data += ((SIZE_T)del << 1);
1117
16.2M
    }
1118
766k
    data = ptr_y + 2;
1119
1120
766k
    sec_loop_cnt = (nodespacing * del);
1121
766k
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
1122
766k
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
1123
766k
                   (sec_loop_cnt / 256);
1124
1125
22.2M
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
1126
21.4M
      w_1 = *(twiddles + j);
1127
21.4M
      w_4 = *(twiddles + j + 257);
1128
21.4M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
1129
21.4M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
1130
21.4M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
1131
21.4M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);
1132
1133
81.4M
      for (k = in_loop_cnt; k != 0; k--) {
1134
60.0M
        FLOAT32 tmp;
1135
        /*x0 is loaded later to avoid register crunch*/
1136
1137
60.0M
        data += ((SIZE_T)del << 1);
1138
1139
60.0M
        x1r = *data;
1140
60.0M
        x1i = *(data + 1);
1141
60.0M
        data += ((SIZE_T)del << 1);
1142
1143
60.0M
        x2r = *data;
1144
60.0M
        x2i = *(data + 1);
1145
60.0M
        data += ((SIZE_T)del << 1);
1146
1147
60.0M
        x3r = *data;
1148
60.0M
        x3i = *(data + 1);
1149
60.0M
        data -= 3 * (del << 1);
1150
1151
60.0M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1152
60.0M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1153
60.0M
        x1r = tmp;
1154
1155
60.0M
        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
1156
60.0M
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
1157
60.0M
        x2r = tmp;
1158
1159
60.0M
        tmp = ia_sub_flt(ia_mul_flt(x3r, w_3), ia_mul_flt(x3i, w_6));
1160
60.0M
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
1161
60.0M
        x3r = tmp;
1162
1163
60.0M
        x0r = (*data);
1164
60.0M
        x0i = (*(data + 1));
1165
1166
60.0M
        x0r = ia_add_flt(x0r, (x2r));
1167
60.0M
        x0i = ia_add_flt(x0i, (x2i));
1168
60.0M
        x2r = ia_msu_flt(x0r, x2r, 2);
1169
60.0M
        x2i = ia_msu_flt(x0i, x2i, 2);
1170
60.0M
        x1r = ia_add_flt(x1r, x3r);
1171
60.0M
        x1i = ia_add_flt(x1i, x3i);
1172
60.0M
        x3r = ia_msu_flt(x1r, x3r, 2);
1173
60.0M
        x3i = ia_msu_flt(x1i, x3i, 2);
1174
1175
60.0M
        x0r = ia_add_flt(x0r, (x1r));
1176
60.0M
        x0i = ia_add_flt(x0i, (x1i));
1177
60.0M
        x1r = ia_msu_flt(x0r, x1r, 2);
1178
60.0M
        x1i = ia_msu_flt(x0i, x1i, 2);
1179
60.0M
        x2r = ia_add_flt(x2r, (x3i));
1180
60.0M
        x2i = ia_sub_flt(x2i, (x3r));
1181
60.0M
        x3i = ia_msu_flt(x2r, x3i, 2);
1182
60.0M
        x3r = ia_mac_flt(x2i, x3r, 2);
1183
1184
60.0M
        *data = x0r;
1185
60.0M
        *(data + 1) = x0i;
1186
60.0M
        data += ((SIZE_T)del << 1);
1187
1188
60.0M
        *data = x2r;
1189
60.0M
        *(data + 1) = x2i;
1190
60.0M
        data += ((SIZE_T)del << 1);
1191
1192
60.0M
        *data = x1r;
1193
60.0M
        *(data + 1) = x1i;
1194
60.0M
        data += ((SIZE_T)del << 1);
1195
1196
60.0M
        *data = x3i;
1197
60.0M
        *(data + 1) = x3r;
1198
60.0M
        data += ((SIZE_T)del << 1);
1199
60.0M
      }
1200
21.4M
      data -= 2 * n_points;
1201
21.4M
      data += 2;
1202
21.4M
    }
1203
11.8M
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
1204
11.1M
      w_1 = *(twiddles + j);
1205
11.1M
      w_4 = *(twiddles + j + 257);
1206
11.1M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
1207
11.1M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
1208
11.1M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
1209
11.1M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
1210
1211
49.2M
      for (k = in_loop_cnt; k != 0; k--) {
1212
38.1M
        FLOAT32 tmp;
1213
        /*x0 is loaded later to avoid register crunch*/
1214
1215
38.1M
        data += ((SIZE_T)del << 1);
1216
1217
38.1M
        x1r = *data;
1218
38.1M
        x1i = *(data + 1);
1219
38.1M
        data += ((SIZE_T)del << 1);
1220
1221
38.1M
        x2r = *data;
1222
38.1M
        x2i = *(data + 1);
1223
38.1M
        data += ((SIZE_T)del << 1);
1224
1225
38.1M
        x3r = *data;
1226
38.1M
        x3i = *(data + 1);
1227
38.1M
        data -= 3 * (del << 1);
1228
1229
38.1M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1230
38.1M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1231
38.1M
        x1r = tmp;
1232
1233
38.1M
        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
1234
38.1M
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
1235
38.1M
        x2r = tmp;
1236
1237
38.1M
        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
1238
38.1M
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1239
38.1M
        x3r = tmp;
1240
1241
38.1M
        x0r = (*data);
1242
38.1M
        x0i = (*(data + 1));
1243
1244
38.1M
        x0r = ia_add_flt(x0r, (x2r));
1245
38.1M
        x0i = ia_add_flt(x0i, (x2i));
1246
38.1M
        x2r = ia_msu_flt(x0r, x2r, 2);
1247
38.1M
        x2i = ia_msu_flt(x0i, x2i, 2);
1248
38.1M
        x1r = ia_add_flt(x1r, x3r);
1249
38.1M
        x1i = ia_add_flt(x1i, x3i);
1250
38.1M
        x3r = ia_msu_flt(x1r, x3r, 2);
1251
38.1M
        x3i = ia_msu_flt(x1i, x3i, 2);
1252
1253
38.1M
        x0r = ia_add_flt(x0r, (x1r));
1254
38.1M
        x0i = ia_add_flt(x0i, (x1i));
1255
38.1M
        x1r = ia_msu_flt(x0r, x1r, 2);
1256
38.1M
        x1i = ia_msu_flt(x0i, x1i, 2);
1257
38.1M
        x2r = ia_add_flt(x2r, (x3i));
1258
38.1M
        x2i = ia_sub_flt(x2i, (x3r));
1259
38.1M
        x3i = ia_msu_flt(x2r, x3i, 2);
1260
38.1M
        x3r = ia_mac_flt(x2i, x3r, 2);
1261
1262
38.1M
        *data = x0r;
1263
38.1M
        *(data + 1) = x0i;
1264
38.1M
        data += ((SIZE_T)del << 1);
1265
1266
38.1M
        *data = x2r;
1267
38.1M
        *(data + 1) = x2i;
1268
38.1M
        data += ((SIZE_T)del << 1);
1269
1270
38.1M
        *data = x1r;
1271
38.1M
        *(data + 1) = x1i;
1272
38.1M
        data += ((SIZE_T)del << 1);
1273
1274
38.1M
        *data = x3i;
1275
38.1M
        *(data + 1) = x3r;
1276
38.1M
        data += ((SIZE_T)del << 1);
1277
38.1M
      }
1278
11.1M
      data -= 2 * n_points;
1279
11.1M
      data += 2;
1280
11.1M
    }
1281
11.1M
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
1282
10.3M
      w_1 = *(twiddles + j);
1283
10.3M
      w_4 = *(twiddles + j + 257);
1284
10.3M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
1285
10.3M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
1286
10.3M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
1287
10.3M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
1288
1289
32.2M
      for (k = in_loop_cnt; k != 0; k--) {
1290
21.8M
        FLOAT32 tmp;
1291
        /*x0 is loaded later to avoid register crunch*/
1292
1293
21.8M
        data += ((SIZE_T)del << 1);
1294
1295
21.8M
        x1r = *data;
1296
21.8M
        x1i = *(data + 1);
1297
21.8M
        data += ((SIZE_T)del << 1);
1298
1299
21.8M
        x2r = *data;
1300
21.8M
        x2i = *(data + 1);
1301
21.8M
        data += ((SIZE_T)del << 1);
1302
1303
21.8M
        x3r = *data;
1304
21.8M
        x3i = *(data + 1);
1305
21.8M
        data -= 3 * (del << 1);
1306
1307
21.8M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1308
21.8M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1309
21.8M
        x1r = tmp;
1310
1311
21.8M
        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
1312
21.8M
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
1313
21.8M
        x2r = tmp;
1314
1315
21.8M
        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
1316
21.8M
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1317
21.8M
        x3r = tmp;
1318
1319
21.8M
        x0r = (*data);
1320
21.8M
        x0i = (*(data + 1));
1321
1322
21.8M
        x0r = ia_add_flt(x0r, (x2r));
1323
21.8M
        x0i = ia_add_flt(x0i, (x2i));
1324
21.8M
        x2r = ia_msu_flt(x0r, x2r, 2);
1325
21.8M
        x2i = ia_msu_flt(x0i, x2i, 2);
1326
21.8M
        x1r = ia_add_flt(x1r, x3r);
1327
21.8M
        x1i = ia_add_flt(x1i, x3i);
1328
21.8M
        x3r = ia_msu_flt(x1r, x3r, 2);
1329
21.8M
        x3i = ia_msu_flt(x1i, x3i, 2);
1330
1331
21.8M
        x0r = ia_add_flt(x0r, (x1r));
1332
21.8M
        x0i = ia_add_flt(x0i, (x1i));
1333
21.8M
        x1r = ia_msu_flt(x0r, x1r, 2);
1334
21.8M
        x1i = ia_msu_flt(x0i, x1i, 2);
1335
21.8M
        x2r = ia_add_flt(x2r, (x3i));
1336
21.8M
        x2i = ia_sub_flt(x2i, (x3r));
1337
21.8M
        x3i = ia_msu_flt(x2r, x3i, 2);
1338
21.8M
        x3r = ia_mac_flt(x2i, x3r, 2);
1339
1340
21.8M
        *data = x0r;
1341
21.8M
        *(data + 1) = x0i;
1342
21.8M
        data += ((SIZE_T)del << 1);
1343
1344
21.8M
        *data = x2r;
1345
21.8M
        *(data + 1) = x2i;
1346
21.8M
        data += ((SIZE_T)del << 1);
1347
1348
21.8M
        *data = x1r;
1349
21.8M
        *(data + 1) = x1i;
1350
21.8M
        data += ((SIZE_T)del << 1);
1351
1352
21.8M
        *data = x3i;
1353
21.8M
        *(data + 1) = x3r;
1354
21.8M
        data += ((SIZE_T)del << 1);
1355
21.8M
      }
1356
10.3M
      data -= 2 * n_points;
1357
10.3M
      data += 2;
1358
10.3M
    }
1359
22.2M
    for (; j < nodespacing * del; j += nodespacing) {
1360
21.4M
      w_1 = *(twiddles + j);
1361
21.4M
      w_4 = *(twiddles + j + 257);
1362
21.4M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
1363
21.4M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
1364
21.4M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
1365
21.4M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);
1366
1367
81.4M
      for (k = in_loop_cnt; k != 0; k--) {
1368
60.0M
        FLOAT32 tmp;
1369
        /*x0 is loaded later to avoid register crunch*/
1370
1371
60.0M
        data += ((SIZE_T)del << 1);
1372
1373
60.0M
        x1r = *data;
1374
60.0M
        x1i = *(data + 1);
1375
60.0M
        data += ((SIZE_T)del << 1);
1376
1377
60.0M
        x2r = *data;
1378
60.0M
        x2i = *(data + 1);
1379
60.0M
        data += ((SIZE_T)del << 1);
1380
1381
60.0M
        x3r = *data;
1382
60.0M
        x3i = *(data + 1);
1383
60.0M
        data -= 3 * (del << 1);
1384
1385
60.0M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1386
60.0M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1387
60.0M
        x1r = tmp;
1388
1389
60.0M
        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
1390
60.0M
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
1391
60.0M
        x2r = tmp;
1392
1393
60.0M
        tmp = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1394
60.0M
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
1395
60.0M
        x3r = tmp;
1396
1397
60.0M
        x0r = (*data);
1398
60.0M
        x0i = (*(data + 1));
1399
1400
60.0M
        x0r = ia_add_flt(x0r, (x2r));
1401
60.0M
        x0i = ia_add_flt(x0i, (x2i));
1402
60.0M
        x2r = ia_msu_flt(x0r, x2r, 2);
1403
60.0M
        x2i = ia_msu_flt(x0i, x2i, 2);
1404
60.0M
        x1r = ia_add_flt(x1r, x3r);
1405
60.0M
        x1i = ia_sub_flt(x1i, x3i);
1406
60.0M
        x3r = ia_msu_flt(x1r, x3r, 2);
1407
60.0M
        x3i = ia_mac_flt(x1i, x3i, 2);
1408
1409
60.0M
        x0r = ia_add_flt(x0r, (x1r));
1410
60.0M
        x0i = ia_add_flt(x0i, (x1i));
1411
60.0M
        x1r = ia_msu_flt(x0r, x1r, 2);
1412
60.0M
        x1i = ia_msu_flt(x0i, x1i, 2);
1413
60.0M
        x2r = ia_add_flt(x2r, (x3i));
1414
60.0M
        x2i = ia_sub_flt(x2i, (x3r));
1415
60.0M
        x3i = ia_msu_flt(x2r, x3i, 2);
1416
60.0M
        x3r = ia_mac_flt(x2i, x3r, 2);
1417
1418
60.0M
        *data = x0r;
1419
60.0M
        *(data + 1) = x0i;
1420
60.0M
        data += ((SIZE_T)del << 1);
1421
1422
60.0M
        *data = x2r;
1423
60.0M
        *(data + 1) = x2i;
1424
60.0M
        data += ((SIZE_T)del << 1);
1425
1426
60.0M
        *data = x1r;
1427
60.0M
        *(data + 1) = x1i;
1428
60.0M
        data += ((SIZE_T)del << 1);
1429
1430
60.0M
        *data = x3i;
1431
60.0M
        *(data + 1) = x3r;
1432
60.0M
        data += ((SIZE_T)del << 1);
1433
60.0M
      }
1434
21.4M
      data -= 2 * n_points;
1435
21.4M
      data += 2;
1436
21.4M
    }
1437
766k
    nodespacing >>= 2;
1438
766k
    del <<= 2;
1439
766k
    in_loop_cnt >>= 2;
1440
766k
  }
1441
191k
  if (not_power_4) {
1442
0
    const FLOAT32 *twiddles = ptr_w;
1443
0
    nodespacing <<= 1;
1444
1445
0
    for (j = del / 2; j != 0; j--) {
1446
0
      FLOAT32 w_1 = *twiddles;
1447
0
      FLOAT32 w_4 = *(twiddles + 257);
1448
0
      FLOAT32 tmp;
1449
0
      twiddles += nodespacing;
1450
1451
0
      x0r = *ptr_y;
1452
0
      x0i = *(ptr_y + 1);
1453
0
      ptr_y += ((SIZE_T)del << 1);
1454
1455
0
      x1r = *ptr_y;
1456
0
      x1i = *(ptr_y + 1);
1457
1458
0
      tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1459
0
      x1i = (FLOAT32)ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1460
0
      x1r = tmp;
1461
1462
0
      *ptr_y = ia_sub_flt((x0r), (x1r));
1463
0
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
1464
0
      ptr_y -= ((SIZE_T)del << 1);
1465
1466
0
      *ptr_y = ia_add_flt((x0r), (x1r));
1467
0
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
1468
0
      ptr_y += 2;
1469
0
    }
1470
0
    twiddles = ptr_w;
1471
0
    for (j = del / 2; j != 0; j--) {
1472
0
      FLOAT32 w_1 = *twiddles;
1473
0
      FLOAT32 w_4 = *(twiddles + 257);
1474
0
      FLOAT32 tmp;
1475
0
      twiddles += nodespacing;
1476
1477
0
      x0r = *ptr_y;
1478
0
      x0i = *(ptr_y + 1);
1479
0
      ptr_y += ((SIZE_T)del << 1);
1480
1481
0
      x1r = *ptr_y;
1482
0
      x1i = *(ptr_y + 1);
1483
1484
0
      tmp = ia_add_flt(ia_mul_flt(x1r, w_4), ia_mul_flt(x1i, w_1));
1485
0
      x1i = ia_add_flt(ia_negate_flt(ia_mul_flt(x1r, w_1)), ia_mul_flt(x1i, w_4));
1486
0
      x1r = tmp;
1487
1488
0
      *ptr_y = ia_sub_flt((x0r), (x1r));
1489
0
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
1490
0
      ptr_y -= ((SIZE_T)del << 1);
1491
1492
0
      *ptr_y = ia_add_flt((x0r), (x1r));
1493
0
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
1494
0
      ptr_y += 2;
1495
0
    }
1496
0
  }
1497
1498
196M
  for (i = 0; i < n_points; i++) {
1499
196M
    ptr_real[i] = y[2 * i];
1500
196M
    ptr_imag[i] = y[2 * i + 1];
1501
196M
  }
1502
191k
}
1503
49.0M
/* In-place 4-point complex FFT on split arrays.
 *
 * x_r : four real parts, overwritten with the result
 * x_i : four imaginary parts, overwritten with the result
 *
 * All eight inputs are combined into intermediate sums and differences before
 * anything is written back.
 */
static VOID ixheaace_cplx_fft_4(FLOAT32 *x_r, FLOAT32 *x_i) {
  /* Sums and differences of elements 0 and 2. */
  const FLOAT32 sum02_r = ia_add_flt(x_r[0], x_r[2]);
  const FLOAT32 sum02_i = ia_add_flt(x_i[0], x_i[2]);
  const FLOAT32 dif02_r = ia_sub_flt(x_r[0], x_r[2]);
  const FLOAT32 dif02_i = ia_sub_flt(x_i[0], x_i[2]);

  /* Sums and differences of elements 1 and 3. */
  const FLOAT32 sum13_r = ia_add_flt(x_r[1], x_r[3]);
  const FLOAT32 sum13_i = ia_add_flt(x_i[1], x_i[3]);
  const FLOAT32 dif13_r = ia_sub_flt(x_r[1], x_r[3]);
  const FLOAT32 dif13_i = ia_sub_flt(x_i[1], x_i[3]);

  x_r[0] = ia_add_flt(sum02_r, sum13_r);
  x_i[0] = ia_add_flt(sum02_i, sum13_i);
  x_r[2] = ia_sub_flt(sum02_r, sum13_r);
  x_i[2] = ia_sub_flt(sum02_i, sum13_i);

  /* Outputs 1 and 3 cross the real and imaginary parts of the 1/3 difference. */
  x_r[1] = ia_add_flt(dif02_r, dif13_i);
  x_i[1] = ia_sub_flt(dif02_i, dif13_r);
  x_r[3] = ia_sub_flt(dif02_r, dif13_i);
  x_i[3] = ia_add_flt(dif02_i, dif13_r);
}
1538
47.9k
/* 4096-point FFT computed as four 1024-point transforms followed by 1024
 * four-point transforms.
 *
 * ptr_x_r         : 4096 real input values; overwritten with the real output
 * ptr_x_i         : overwritten with the imaginary output. Its input contents
 *                   are not read; the imaginary input is taken as zero.
 * ptr_scratch_buf : work memory. The first 2 * 4096 entries hold the four
 *                   column transforms; the area starting at
 *                   ptr_scratch_buf + 2 * 4096 is used both as scratch for the
 *                   1024-point routine and for the twiddled rows.
 */
VOID iusace_complex_fft_4096(FLOAT32 *ptr_x_r, FLOAT32 *ptr_x_i, FLOAT32 *ptr_scratch_buf) {
  const WORD32 fft_len = 4096;
  const WORD32 num_cols = fft_len >> 10;
  const WORD32 col_len = fft_len / num_cols;
  const WORD32 tw_step = 4;
  FLOAT32 *ptr_interim = ptr_scratch_buf + 2 * fft_len;
  WORD32 col, row;

  /* Step 1: de-interleave the input by column and transform each column. */
  for (col = 0; col < num_cols; col++) {
    FLOAT32 *ptr_col_re = ptr_scratch_buf + (2 * col + 0) * col_len;
    FLOAT32 *ptr_col_im = ptr_scratch_buf + (2 * col + 1) * col_len;

    for (row = 0; row < col_len; row++) {
      ptr_col_re[row] = ptr_x_r[(num_cols * row + col)];
      ptr_col_im[row] = 0;
    }
    ixheaace_rad2_cplx_fft(ptr_col_re, ptr_col_im, col_len, ptr_interim);
  }

  /* Step 2: twiddle each row into the interim area and run the 4-point
   * transform on it. Reads come from the column area and writes go to the
   * interim area, so each row can be finished before the next one starts. */
  for (row = 0; row < col_len; row++) {
    const FLOAT32 *ptr_cos_val =
        (const FLOAT32 *)&ia_mixed_rad_twiddle_cos[row * num_cols * tw_step];
    const FLOAT32 *ptr_sin_val =
        (const FLOAT32 *)&ia_mixed_rad_twiddle_sin[row * num_cols * tw_step];
    FLOAT32 *ptr_row_re = ptr_interim + (2 * row + 0) * num_cols;
    FLOAT32 *ptr_row_im = ptr_interim + (2 * row + 1) * num_cols;

    for (col = 0; col < num_cols; col++) {
      const FLOAT32 real = ptr_scratch_buf[(2 * col + 0) * col_len + row];
      const FLOAT32 imag = ptr_scratch_buf[(2 * col + 1) * col_len + row];
      const FLOAT32 cos_val = ptr_cos_val[col * tw_step];
      const FLOAT32 sin_val = ptr_sin_val[col * tw_step];

      ptr_row_re[col] = (FLOAT32)(real * cos_val + imag * sin_val);
      ptr_row_im[col] = (FLOAT32)(imag * cos_val - real * sin_val);
    }
    ixheaace_cplx_fft_4(ptr_row_re, ptr_row_im);
  }

  /* Step 3: transpose the rows back into the caller's split output arrays. */
  for (row = 0; row < col_len; row++) {
    for (col = 0; col < num_cols; col++) {
      ptr_x_r[(col * col_len + row)] = ptr_interim[(2 * row + 0) * num_cols + col];
      ptr_x_i[(col * col_len + row)] = ptr_interim[(2 * row + 1) * num_cols + col];
    }
  }
}