Coverage Report

Created: 2025-08-26 06:53

/src/libxaac/encoder/iusace_fft.c
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *                                                                            *
3
 * Copyright (C) 2023 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
 */
20
21
#include <string.h>
22
#include "ixheaac_type_def.h"
23
#include "ixheaace_adjust_threshold_data.h"
24
#include "iusace_cnst.h"
25
#include "iusace_block_switch_const.h"
26
#include "iusace_rom.h"
27
#include "iusace_bitbuffer.h"
28
29
/* DRC */
30
#include "impd_drc_common_enc.h"
31
#include "impd_drc_uni_drc.h"
32
#include "impd_drc_tables.h"
33
#include "impd_drc_api.h"
34
#include "impd_drc_uni_drc_eq.h"
35
#include "impd_drc_uni_drc_filter_bank.h"
36
#include "impd_drc_gain_enc.h"
37
#include "impd_drc_struct_def.h"
38
39
#include "iusace_tns_usac.h"
40
#include "iusace_psy_mod.h"
41
#include "iusace_config.h"
42
#include "iusace_fft.h"
43
#include "iusace_basic_ops_flt.h"
44
#include "ixheaac_constants.h"
45
#include "ixheaace_aac_constants.h"
46
#include "ixheaac_basic_ops32.h"
47
#include "ixheaace_common_utils.h"
48
#include "ixheaac_error_standards.h"
49
#include "ixheaace_error_codes.h"
50
51
/*
 * Digit reversal helper for the radix-4 FFT.
 *
 * Within each 16-bit half of (i) the eight 2-bit digits are put in reverse
 * order (2-bit pairs are swapped inside every nibble, then nibbles inside
 * every byte, then bytes inside every half-word).  The result is shifted
 * right by (m) and stored in (j).
 */
#define DIG_REV(i, m, j)                                                      \
  do {                                                                        \
    unsigned dig_rev_val_ = (i);                                              \
    unsigned dig_rev_lo_, dig_rev_hi_;                                        \
                                                                              \
    dig_rev_lo_ = dig_rev_val_ & 0x33333333u;                                 \
    dig_rev_hi_ = dig_rev_val_ & ~0x33333333u;                                \
    dig_rev_val_ = (dig_rev_lo_ << 2) | (dig_rev_hi_ >> 2);                   \
                                                                              \
    dig_rev_lo_ = dig_rev_val_ & 0x0F0F0F0Fu;                                 \
    dig_rev_hi_ = dig_rev_val_ & ~0x0F0F0F0Fu;                                \
    dig_rev_val_ = (dig_rev_lo_ << 4) | (dig_rev_hi_ >> 4);                   \
                                                                              \
    dig_rev_lo_ = dig_rev_val_ & 0x00FF00FFu;                                 \
    dig_rev_hi_ = dig_rev_val_ & ~0x00FF00FFu;                                \
    dig_rev_val_ = (dig_rev_lo_ << 8) | (dig_rev_hi_ >> 8);                   \
                                                                              \
    (j) = dig_rev_val_ >> (m);                                                \
  } while (0)
59
60
56.4M
/*
 * Returns the number of left shifts needed to bring the magnitude of `a`
 * up to bit 30 (i.e. until the value is >= 0x40000000).
 *
 * Negative inputs are bit-inverted first.  The two inputs that have no
 * significant bits at all, 0 and -1 (0xffffffff), yield 31.
 */
static PLATFORM_INLINE WORD8 iusace_calc_norm(WORD32 a) {
  WORD8 shift_cnt = 0;

  /* Nothing to normalise: report the maximum headroom. */
  if (a == 0 || a == (WORD32)0xffffffffL) {
    return 31;
  }

  /* Work on the one's-complement magnitude of negative values. */
  if (a < 0) {
    a = ~a;
  }

  while (a < (WORD32)0x40000000L) {
    a <<= 1;
    shift_cnt++;
  }

  return shift_cnt;
}
80
81
167M
/*
 * 3-point complex DFT.
 *
 * ptr_in  : three complex values, interleaved as re0, im0, re1, im1, re2, im2.
 * ptr_out : receives the three complex outputs in the same interleaved layout.
 *
 * The inputs are read again while the outputs are being stored, exactly in
 * the order used below.
 */
static PLATFORM_INLINE VOID iusace_complex_3point_fft(FLOAT32 *ptr_in, FLOAT32 *ptr_out) {
  /* sqrt(3) / 2 */
  const FLOAT64 sin_60_deg = 0.866025403784439;

  /* Sum of the first two inputs (the third is added when storing bin 0). */
  const FLOAT32 sum01_re = ptr_in[0] + ptr_in[2];
  const FLOAT32 sum01_im = ptr_in[1] + ptr_in[3];

  /* Sum and difference of the second and third inputs. */
  const FLOAT32 sum12_re = ptr_in[2] + ptr_in[4];
  const FLOAT32 sum12_im = ptr_in[3] + ptr_in[5];
  const FLOAT32 diff12_re = ptr_in[2] - ptr_in[4];
  const FLOAT32 diff12_im = ptr_in[3] - ptr_in[5];

  const FLOAT32 half_sum_re = sum12_re / (FLOAT32)2.0;
  const FLOAT32 half_sum_im = sum12_im / (FLOAT32)2.0;
  const FLOAT32 scaled_diff_im = (FLOAT32)((FLOAT64)diff12_im * sin_60_deg);
  const FLOAT32 scaled_diff_re = (FLOAT32)((FLOAT64)diff12_re * sin_60_deg);

  const FLOAT32 base_re = ptr_in[0] - half_sum_re;

  ptr_out[0] = sum01_re + ptr_in[4];
  ptr_out[1] = sum01_im + ptr_in[5];
  ptr_out[2] = base_re + scaled_diff_im;
  ptr_out[3] = (ptr_in[1] - scaled_diff_re) - half_sum_im;
  ptr_out[4] = base_re - scaled_diff_im;
  ptr_out[5] = (ptr_in[1] + scaled_diff_re) - half_sum_im;
}
115
116
28.2M
/*
 * Complex FFT of nlength points built from radix-4 butterfly stages, plus one
 * trailing radix-2 stage when floor(log2(nlength)) is odd.
 *
 * ptr_x            : interleaved real/imaginary data (2 * nlength floats are
 *                    overwritten with the result by the copy loop at the end).
 * nlength          : number of complex points.
 *                    NOTE(review): the stage arithmetic looks like it expects a
 *                    power of two - confirm against the callers.
 * scratch_fft_p2_y : work buffer; the first stage writes 8 floats per group of
 *                    four points, i.e. 2 * nlength floats in total.
 */
VOID iusace_complex_fft_p2(FLOAT32 *ptr_x, WORD32 nlength, FLOAT32 *scratch_fft_p2_y) {
117
28.2M
  WORD32 i, j, k, n_stages, h2;
118
28.2M
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
119
28.2M
  FLOAT32 tmp;
120
28.2M
  WORD32 del, nodespacing, in_loop_cnt;
121
28.2M
  WORD32 not_power_4;
122
28.2M
  WORD32 dig_rev_shift;
123
28.2M
  FLOAT32 *y = scratch_fft_p2_y;
124
28.2M
  WORD32 mpass = nlength;
125
28.2M
  WORD32 npoints = nlength;
126
28.2M
  FLOAT32 *ptr_y = y;
127
28.2M
  const FLOAT64 *ptr_w;
128
129
28.2M
  /* For positive mpass, iusace_calc_norm() yields 30 - floor(log2(mpass)),
   * so n_stages starts out as floor(log2(nlength)). */
  dig_rev_shift = iusace_calc_norm(mpass) + 1 - 16;
130
28.2M
  n_stages = 30 - iusace_calc_norm(mpass);
131
28.2M
  /* Odd bit count: a radix-2 stage is needed after the radix-4 stages. */
  not_power_4 = n_stages & 1;
132
133
28.2M
  /* Number of radix-4 stages (the first one is the gather loop below). */
  n_stages = n_stages >> 1;
134
135
28.2M
  ptr_w = iusace_twiddle_table_fft_32x32;
136
137
28.2M
  if (dig_rev_shift < 0) {
138
0
    dig_rev_shift = 0;
139
0
  }
140
141
556M
  /* Stage 1: per iteration, fetch four complex inputs starting at a
   * digit-reversed offset and spaced (npoints >> 1) floats apart, apply a
   * 4-point butterfly without twiddles and append the result to y. */
  for (i = 0; i < npoints; i += 4) {
142
527M
    FLOAT32 *inp = ptr_x;
143
527M
    FLOAT32 tmk;
144
145
527M
    DIG_REV(i, dig_rev_shift, h2);
146
527M
    if (not_power_4) {
147
240M
      /* Round the offset up to an even value so it addresses a re/im pair. */
      h2 += 1;
148
240M
      h2 &= ~1;
149
240M
    }
150
527M
    inp += (h2);
151
152
527M
    x0r = *inp;
153
527M
    x0i = *(inp + 1);
154
527M
    inp += (npoints >> 1);
155
156
527M
    x1r = *inp;
157
527M
    x1i = *(inp + 1);
158
527M
    inp += (npoints >> 1);
159
160
527M
    x2r = *inp;
161
527M
    x2i = *(inp + 1);
162
527M
    inp += (npoints >> 1);
163
164
527M
    x3r = *inp;
165
527M
    x3i = *(inp + 1);
166
167
527M
    x0r = x0r + x2r;
168
527M
    x0i = x0i + x2i;
169
170
527M
    tmk = x0r - x2r;
171
527M
    x2r = tmk - x2r;
172
527M
    tmk = x0i - x2i;
173
527M
    x2i = tmk - x2i;
174
175
527M
    x1r = x1r + x3r;
176
527M
    x1i = x1i + x3i;
177
178
527M
    tmk = x1r - x3r;
179
527M
    x3r = tmk - x3r;
180
527M
    tmk = x1i - x3i;
181
527M
    x3i = tmk - x3i;
182
183
527M
    x0r = x0r + x1r;
184
527M
    x0i = x0i + x1i;
185
186
527M
    tmk = x0r - x1r;
187
527M
    x1r = tmk - x1r;
188
527M
    tmk = x0i - x1i;
189
527M
    x1i = tmk - x1i;
190
191
527M
    x2r = x2r + x3i;
192
527M
    x2i = x2i - x3r;
193
194
527M
    tmk = x2r - x3i;
195
527M
    x3i = tmk - x3i;
196
527M
    tmk = x2i + x3r;
197
527M
    x3r = tmk + x3r;
198
199
527M
    *ptr_y++ = x0r;
200
527M
    *ptr_y++ = x0i;
201
527M
    *ptr_y++ = x2r;
202
527M
    *ptr_y++ = x2i;
203
527M
    *ptr_y++ = x1r;
204
527M
    *ptr_y++ = x1i;
205
527M
    *ptr_y++ = x3i;
206
527M
    *ptr_y++ = x3r;
207
527M
  }
208
28.2M
  /* Move the write pointer back by 2 * npoints floats. */
  ptr_y -= 2 * npoints;
209
28.2M
  del = 4;
210
28.2M
  nodespacing = 64;
211
28.2M
  in_loop_cnt = npoints >> 4;
212
64.2M
  /* Remaining radix-4 stages, performed in place on the scratch buffer.
   * After each stage del grows by 4x while nodespacing and in_loop_cnt
   * shrink by 4x. */
  for (i = n_stages - 1; i > 0; i--) {
213
35.9M
    const FLOAT64 *twiddles = ptr_w;
214
35.9M
    FLOAT32 *data = ptr_y;
215
35.9M
    FLOAT64 w_1, w_2, w_3, w_4, w_5, w_6;
216
35.9M
    WORD32 sec_loop_cnt;
217
218
198M
    /* First butterfly of every group: no twiddle multiplication. */
    for (k = in_loop_cnt; k != 0; k--) {
219
162M
      x0r = (*data);
220
162M
      x0i = (*(data + 1));
221
162M
      data += ((SIZE_T)del << 1);
222
223
162M
      x1r = (*data);
224
162M
      x1i = (*(data + 1));
225
162M
      data += ((SIZE_T)del << 1);
226
227
162M
      x2r = (*data);
228
162M
      x2i = (*(data + 1));
229
162M
      data += ((SIZE_T)del << 1);
230
231
162M
      x3r = (*data);
232
162M
      x3i = (*(data + 1));
233
162M
      data -= 3 * (del << 1);
234
235
162M
      x0r = x0r + x2r;
236
162M
      x0i = x0i + x2i;
237
162M
      x2r = x0r - (x2r * 2);
238
162M
      x2i = x0i - (x2i * 2);
239
162M
      x1r = x1r + x3r;
240
162M
      x1i = x1i + x3i;
241
162M
      x3r = x1r - (x3r * 2);
242
162M
      x3i = x1i - (x3i * 2);
243
244
162M
      x0r = x0r + x1r;
245
162M
      x0i = x0i + x1i;
246
162M
      x1r = x0r - (x1r * 2);
247
162M
      x1i = x0i - (x1i * 2);
248
162M
      x2r = x2r + x3i;
249
162M
      x2i = x2i - x3r;
250
162M
      x3i = x2r - (x3i * 2);
251
162M
      x3r = x2i + (x3r * 2);
252
253
162M
      *data = x0r;
254
162M
      *(data + 1) = x0i;
255
162M
      data += ((SIZE_T)del << 1);
256
257
162M
      *data = x2r;
258
162M
      *(data + 1) = x2i;
259
162M
      data += ((SIZE_T)del << 1);
260
261
162M
      *data = x1r;
262
162M
      *(data + 1) = x1i;
263
162M
      data += ((SIZE_T)del << 1);
264
265
162M
      *data = x3i;
266
162M
      *(data + 1) = x3r;
267
162M
      data += ((SIZE_T)del << 1);
268
162M
    }
269
35.9M
    data = ptr_y + 2;
270
271
35.9M
    /* The series 1/4 + 1/8 - 1/16 + 1/32 - 1/64 + 1/128 - 1/256 is about
     * 0.332, so sec_loop_cnt ends up near one third of nodespacing * del.
     * It bounds the first twiddle-index range below. */
    sec_loop_cnt = (nodespacing * del);
272
35.9M
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
273
35.9M
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
274
35.9M
                   (sec_loop_cnt / 256);
275
276
192M
    /* Range 1 (j <= sec_loop_cnt): twiddles at j, 2 * j and 3 * j are read
     * straight from the table; the paired value sits 257 entries further on.
     * NOTE(review): presumably the two table halves hold the cosine and sine
     * parts - confirm against the table definition. */
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
277
156M
      w_1 = *(twiddles + j);
278
156M
      w_4 = *(twiddles + j + 257);
279
156M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
280
156M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
281
156M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
282
156M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);
283
284
555M
      for (k = in_loop_cnt; k != 0; k--) {
285
398M
        data += ((SIZE_T)del << 1);
286
287
398M
        x1r = *data;
288
398M
        x1i = *(data + 1);
289
398M
        data += ((SIZE_T)del << 1);
290
291
398M
        x2r = *data;
292
398M
        x2i = *(data + 1);
293
398M
        data += ((SIZE_T)del << 1);
294
295
398M
        x3r = *data;
296
398M
        x3i = *(data + 1);
297
398M
        data -= 3 * (del << 1);
298
299
398M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
300
398M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
301
398M
        x1r = tmp;
302
303
398M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
304
398M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
305
398M
        x2r = tmp;
306
307
398M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_3) - ixheaace_dmult((FLOAT64)x3i, w_6));
308
398M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
309
398M
        x3r = tmp;
310
311
398M
        x0r = (*data);
312
398M
        x0i = (*(data + 1));
313
314
398M
        x0r = x0r + (x2r);
315
398M
        x0i = x0i + (x2i);
316
398M
        x2r = x0r - (x2r * 2);
317
398M
        x2i = x0i - (x2i * 2);
318
398M
        x1r = x1r + x3r;
319
398M
        x1i = x1i + x3i;
320
398M
        x3r = x1r - (x3r * 2);
321
398M
        x3i = x1i - (x3i * 2);
322
323
398M
        x0r = x0r + (x1r);
324
398M
        x0i = x0i + (x1i);
325
398M
        x1r = x0r - (x1r * 2);
326
398M
        x1i = x0i - (x1i * 2);
327
398M
        x2r = x2r + (x3i);
328
398M
        x2i = x2i - (x3r);
329
398M
        x3i = x2r - (x3i * 2);
330
398M
        x3r = x2i + (x3r * 2);
331
332
398M
        *data = x0r;
333
398M
        *(data + 1) = x0i;
334
398M
        data += ((SIZE_T)del << 1);
335
336
398M
        *data = x2r;
337
398M
        *(data + 1) = x2i;
338
398M
        data += ((SIZE_T)del << 1);
339
340
398M
        *data = x1r;
341
398M
        *(data + 1) = x1i;
342
398M
        data += ((SIZE_T)del << 1);
343
344
398M
        *data = x3i;
345
398M
        *(data + 1) = x3r;
346
398M
        data += ((SIZE_T)del << 1);
347
398M
      }
348
156M
      /* Back to the start of the buffer, one complex element further on. */
      data -= 2 * npoints;
349
156M
      data += 2;
350
156M
    }
351
132M
    /* Range 2 (up to half of nodespacing * del): the third twiddle is read
     * with the shifted offsets (-256 / +1) and applied with the alternate
     * rotation formula on x3. */
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
352
96.3M
      w_1 = *(twiddles + j);
353
96.3M
      w_4 = *(twiddles + j + 257);
354
96.3M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
355
96.3M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
356
96.3M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
357
96.3M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
358
359
376M
      for (k = in_loop_cnt; k != 0; k--) {
360
280M
        data += ((SIZE_T)del << 1);
361
362
280M
        x1r = *data;
363
280M
        x1i = *(data + 1);
364
280M
        data += ((SIZE_T)del << 1);
365
366
280M
        x2r = *data;
367
280M
        x2i = *(data + 1);
368
280M
        data += ((SIZE_T)del << 1);
369
370
280M
        x3r = *data;
371
280M
        x3i = *(data + 1);
372
280M
        data -= 3 * (del << 1);
373
374
280M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
375
280M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
376
280M
        x1r = tmp;
377
378
280M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
379
280M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
380
280M
        x2r = tmp;
381
382
280M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
383
280M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
384
280M
        x3r = tmp;
385
386
280M
        x0r = (*data);
387
280M
        x0i = (*(data + 1));
388
389
280M
        x0r = x0r + (x2r);
390
280M
        x0i = x0i + (x2i);
391
280M
        x2r = x0r - (x2r * 2);
392
280M
        x2i = x0i - (x2i * 2);
393
280M
        x1r = x1r + x3r;
394
280M
        x1i = x1i + x3i;
395
280M
        x3r = x1r - (x3r * 2);
396
280M
        x3i = x1i - (x3i * 2);
397
398
280M
        x0r = x0r + (x1r);
399
280M
        x0i = x0i + (x1i);
400
280M
        x1r = x0r - (x1r * 2);
401
280M
        x1i = x0i - (x1i * 2);
402
280M
        x2r = x2r + (x3i);
403
280M
        x2i = x2i - (x3r);
404
280M
        x3i = x2r - (x3i * 2);
405
280M
        x3r = x2i + (x3r * 2);
406
407
280M
        *data = x0r;
408
280M
        *(data + 1) = x0i;
409
280M
        data += ((SIZE_T)del << 1);
410
411
280M
        *data = x2r;
412
280M
        *(data + 1) = x2i;
413
280M
        data += ((SIZE_T)del << 1);
414
415
280M
        *data = x1r;
416
280M
        *(data + 1) = x1i;
417
280M
        data += ((SIZE_T)del << 1);
418
419
280M
        *data = x3i;
420
280M
        *(data + 1) = x3r;
421
280M
        data += ((SIZE_T)del << 1);
422
280M
      }
423
96.3M
      data -= 2 * npoints;
424
96.3M
      data += 2;
425
96.3M
    }
426
96.3M
    /* Range 3 (up to 2 * sec_loop_cnt): both the second and the third
     * twiddle use the shifted offsets and the alternate rotation formula. */
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
427
60.3M
      w_1 = *(twiddles + j);
428
60.3M
      w_4 = *(twiddles + j + 257);
429
60.3M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
430
60.3M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
431
60.3M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
432
60.3M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
433
434
178M
      for (k = in_loop_cnt; k != 0; k--) {
435
118M
        data += ((SIZE_T)del << 1);
436
437
118M
        x1r = *data;
438
118M
        x1i = *(data + 1);
439
118M
        data += ((SIZE_T)del << 1);
440
441
118M
        x2r = *data;
442
118M
        x2i = *(data + 1);
443
118M
        data += ((SIZE_T)del << 1);
444
445
118M
        x3r = *data;
446
118M
        x3i = *(data + 1);
447
118M
        data -= 3 * (del << 1);
448
449
118M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
450
118M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult(x1r, w_4), x1i, w_1);
451
118M
        x1r = tmp;
452
453
118M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
454
118M
        x2i = (FLOAT32)(-ixheaace_dmult(x2r, w_2) + ixheaace_dmult(x2i, w_5));
455
118M
        x2r = tmp;
456
457
118M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
458
118M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
459
118M
        x3r = tmp;
460
461
118M
        x0r = (*data);
462
118M
        x0i = (*(data + 1));
463
464
118M
        x0r = x0r + (x2r);
465
118M
        x0i = x0i + (x2i);
466
118M
        x2r = x0r - (x2r * 2);
467
118M
        x2i = x0i - (x2i * 2);
468
118M
        x1r = x1r + x3r;
469
118M
        x1i = x1i + x3i;
470
118M
        x3r = x1r - (x3r * 2);
471
118M
        x3i = x1i - (x3i * 2);
472
473
118M
        x0r = x0r + (x1r);
474
118M
        x0i = x0i + (x1i);
475
118M
        x1r = x0r - (x1r * 2);
476
118M
        x1i = x0i - (x1i * 2);
477
118M
        x2r = x2r + (x3i);
478
118M
        x2i = x2i - (x3r);
479
118M
        x3i = x2r - (x3i * 2);
480
118M
        x3r = x2i + (x3r * 2);
481
482
118M
        *data = x0r;
483
118M
        *(data + 1) = x0i;
484
118M
        data += ((SIZE_T)del << 1);
485
486
118M
        *data = x2r;
487
118M
        *(data + 1) = x2i;
488
118M
        data += ((SIZE_T)del << 1);
489
490
118M
        *data = x1r;
491
118M
        *(data + 1) = x1i;
492
118M
        data += ((SIZE_T)del << 1);
493
494
118M
        *data = x3i;
495
118M
        *(data + 1) = x3r;
496
118M
        data += ((SIZE_T)del << 1);
497
118M
      }
498
60.3M
      data -= 2 * npoints;
499
60.3M
      data += 2;
500
60.3M
    }
501
192M
    /* Range 4 (rest, j < nodespacing * del): the third twiddle index is
     * shifted by -512, and the x1/x3 combination below uses a subtraction
     * on the imaginary parts instead of an addition. */
    for (; j < nodespacing * del; j += nodespacing) {
502
156M
      w_1 = *(twiddles + j);
503
156M
      w_4 = *(twiddles + j + 257);
504
156M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
505
156M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
506
156M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
507
156M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);
508
509
555M
      for (k = in_loop_cnt; k != 0; k--) {
510
398M
        data += ((SIZE_T)del << 1);
511
512
398M
        x1r = *data;
513
398M
        x1i = *(data + 1);
514
398M
        data += ((SIZE_T)del << 1);
515
516
398M
        x2r = *data;
517
398M
        x2i = *(data + 1);
518
398M
        data += ((SIZE_T)del << 1);
519
520
398M
        x3r = *data;
521
398M
        x3i = *(data + 1);
522
398M
        data -= 3 * ((SIZE_T)del << 1);
523
524
398M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
525
398M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
526
398M
        x1r = tmp;
527
528
398M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
529
398M
        x2i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x2r, w_2) + ixheaace_dmult((FLOAT64)x2i, w_5));
530
398M
        x2r = tmp;
531
532
398M
        tmp = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
533
398M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
534
398M
        x3r = tmp;
535
536
398M
        x0r = (*data);
537
398M
        x0i = (*(data + 1));
538
539
398M
        x0r = x0r + (x2r);
540
398M
        x0i = x0i + (x2i);
541
398M
        x2r = x0r - (x2r * 2);
542
398M
        x2i = x0i - (x2i * 2);
543
398M
        x1r = x1r + x3r;
544
398M
        x1i = x1i - x3i;
545
398M
        x3r = x1r - (x3r * 2);
546
398M
        x3i = x1i + (x3i * 2);
547
548
398M
        x0r = x0r + (x1r);
549
398M
        x0i = x0i + (x1i);
550
398M
        x1r = x0r - (x1r * 2);
551
398M
        x1i = x0i - (x1i * 2);
552
398M
        x2r = x2r + (x3i);
553
398M
        x2i = x2i - (x3r);
554
398M
        x3i = x2r - (x3i * 2);
555
398M
        x3r = x2i + (x3r * 2);
556
557
398M
        *data = x0r;
558
398M
        *(data + 1) = x0i;
559
398M
        data += ((SIZE_T)del << 1);
560
561
398M
        *data = x2r;
562
398M
        *(data + 1) = x2i;
563
398M
        data += ((SIZE_T)del << 1);
564
565
398M
        *data = x1r;
566
398M
        *(data + 1) = x1i;
567
398M
        data += ((SIZE_T)del << 1);
568
569
398M
        *data = x3i;
570
398M
        *(data + 1) = x3r;
571
398M
        data += ((SIZE_T)del << 1);
572
398M
      }
573
156M
      data -= 2 * npoints;
574
156M
      data += 2;
575
156M
    }
576
35.9M
    nodespacing >>= 2;
577
35.9M
    del <<= 2;
578
35.9M
    in_loop_cnt >>= 2;
579
35.9M
  }
580
28.2M
  /* Trailing radix-2 stage, only when floor(log2(nlength)) is odd. */
  if (not_power_4) {
581
13.0M
    const FLOAT64 *twiddles = ptr_w;
582
13.0M
    nodespacing <<= 1;
583
584
253M
    /* First del / 2 butterflies: each pairs an element with the one
     * (del << 1) floats further on, using the twiddle as read. */
    for (j = del / 2; j != 0; j--) {
585
240M
      FLOAT64 w_1 = *twiddles;
586
240M
      FLOAT64 w_4 = *(twiddles + 257);
587
240M
      twiddles += nodespacing;
588
589
240M
      x0r = *ptr_y;
590
240M
      x0i = *(ptr_y + 1);
591
240M
      ptr_y += ((SIZE_T)del << 1);
592
593
240M
      x1r = *ptr_y;
594
240M
      x1i = *(ptr_y + 1);
595
596
240M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
597
240M
      x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
598
240M
      x1r = tmp;
599
600
240M
      *ptr_y = (x0r) - (x1r);
601
240M
      *(ptr_y + 1) = (x0i) - (x1i);
602
240M
      ptr_y -= ((SIZE_T)del << 1);
603
604
240M
      *ptr_y = (x0r) + (x1r);
605
240M
      *(ptr_y + 1) = (x0i) + (x1i);
606
240M
      ptr_y += 2;
607
240M
    }
608
13.0M
    /* Second del / 2 butterflies: the table is walked again from its start,
     * with the roles of the two twiddle components exchanged. */
    twiddles = ptr_w;
609
253M
    for (j = del / 2; j != 0; j--) {
610
240M
      FLOAT64 w_1 = *twiddles;
611
240M
      FLOAT64 w_4 = *(twiddles + 257);
612
240M
      twiddles += nodespacing;
613
614
240M
      x0r = *ptr_y;
615
240M
      x0i = *(ptr_y + 1);
616
240M
      ptr_y += ((SIZE_T)del << 1);
617
618
240M
      x1r = *ptr_y;
619
240M
      x1i = *(ptr_y + 1);
620
621
240M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_4) + ixheaace_dmult((FLOAT64)x1i, w_1));
622
240M
      x1i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x1r, w_1) + ixheaace_dmult((FLOAT64)x1i, w_4));
623
240M
      x1r = tmp;
624
625
240M
      *ptr_y = (x0r) - (x1r);
626
240M
      *(ptr_y + 1) = (x0i) - (x1i);
627
240M
      ptr_y -= ((SIZE_T)del << 1);
628
629
240M
      *ptr_y = (x0r) + (x1r);
630
240M
      *(ptr_y + 1) = (x0i) + (x1i);
631
240M
      ptr_y += 2;
632
240M
    }
633
13.0M
  }
634
635
2.13G
  /* Copy the nlength complex results from the scratch buffer back to ptr_x. */
  for (i = 0; i < nlength; i++) {
636
2.11G
    *(ptr_x + 2 * i) = y[2 * i];
637
2.11G
    *(ptr_x + 2 * i + 1) = y[2 * i + 1];
638
2.11G
  }
639
28.2M
}
640
641
static VOID iusace_complex_fft_p3(FLOAT32 *data, WORD32 nlength,
642
4.96M
                                  iusace_scratch_mem *pstr_scratch) {
643
4.96M
  WORD32 i, j;
644
4.96M
  FLOAT32 *data_3 = pstr_scratch->p_fft_p3_data_3;
645
4.96M
  FLOAT32 *y = pstr_scratch->p_fft_p3_y;
646
4.96M
  WORD32 cnfac;
647
4.96M
  WORD32 mpass = nlength;
648
4.96M
  FLOAT32 *ptr_x = data;
649
4.96M
  FLOAT32 *ptr_y = y;
650
651
4.96M
  cnfac = 0;
652
9.93M
  while (mpass % 3 == 0) {
653
4.96M
    mpass /= 3;
654
4.96M
    cnfac++;
655
4.96M
  }
656
657
19.8M
  for (i = 0; i < 3 * cnfac; i++) {
658
518M
    for (j = 0; j < mpass; j++) {
659
503M
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
660
503M
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
661
503M
    }
662
14.8M
    iusace_complex_fft_p2(data_3, mpass, pstr_scratch->p_fft_p2_y);
663
664
518M
    for (j = 0; j < mpass; j++) {
665
503M
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
666
503M
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
667
503M
    }
668
14.8M
  }
669
670
4.96M
  {
671
4.96M
    const FLOAT64 *w1r, *w1i;
672
4.96M
    FLOAT32 tmp;
673
4.96M
    w1r = iusace_twiddle_table_3pr;
674
4.96M
    w1i = iusace_twiddle_table_3pi;
675
676
172M
    for (i = 0; i < nlength; i += 3) {
677
167M
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
678
167M
      data[2 * i + 1] =
679
167M
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
680
167M
      data[2 * i] = tmp;
681
682
167M
      w1r++;
683
167M
      w1i++;
684
685
167M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
686
167M
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
687
167M
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
688
167M
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
689
167M
      data[2 * (i + 1)] = tmp;
690
691
167M
      w1r++;
692
167M
      w1i++;
693
694
167M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
695
167M
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
696
167M
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
697
167M
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
698
167M
      data[2 * (i + 2)] = tmp;
699
700
167M
      w1r += 3 * (128 / mpass - 1) + 1;
701
167M
      w1i += 3 * (128 / mpass - 1) + 1;
702
167M
    }
703
4.96M
  }
704
705
172M
  for (i = 0; i < mpass; i++) {
706
167M
    iusace_complex_3point_fft(ptr_x, ptr_y);
707
708
167M
    ptr_x = ptr_x + 6;
709
167M
    ptr_y = ptr_y + 6;
710
167M
  }
711
712
172M
  for (i = 0; i < mpass; i++) {
713
167M
    data[2 * i] = y[6 * i];
714
167M
    data[2 * i + 1] = y[6 * i + 1];
715
167M
  }
716
717
172M
  for (i = 0; i < mpass; i++) {
718
167M
    data[2 * (i + mpass)] = y[6 * i + 2];
719
167M
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
720
167M
  }
721
722
172M
  for (i = 0; i < mpass; i++) {
723
167M
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
724
167M
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
725
167M
  }
726
4.96M
}
727
728
0
/*
 * Variant of the factor-of-3 complex FFT that keeps its work buffers on the
 * stack instead of taking them from a scratch structure.
 *
 * data    : interleaved real/imaginary samples, transformed in place.
 * nlength : number of complex points.
 *
 * The call returns without touching `data` when
 *   - nlength is not positive (a length of 0 would otherwise never leave the
 *     factor-stripping loop), or
 *   - the length left after removing all factors of 3 does not fit the
 *     fixed-size local buffers (the 3-point DFT stage writes 6 floats per
 *     remaining point into y[1024]).
 */
VOID iusace_complex_fft_p3_no_scratch(FLOAT32 *data, WORD32 nlength) {
  WORD32 i, j, k;

  FLOAT32 data_3[800];
  FLOAT32 y[1024];
  FLOAT32 p_fft_p2_y[2048];
  const WORD32 data_3_len = (WORD32)(sizeof(data_3) / sizeof(data_3[0]));
  const WORD32 y_len = (WORD32)(sizeof(y) / sizeof(y[0]));
  const WORD32 p2_y_len = (WORD32)(sizeof(p_fft_p2_y) / sizeof(p_fft_p2_y[0]));
  const FLOAT64 *w1r = iusace_twiddle_table_3pr;
  const FLOAT64 *w1i = iusace_twiddle_table_3pi;
  WORD32 cnfac = 0;
  WORD32 mpass = nlength;
  WORD32 tw_stride;

  /* 0 % 3 == 0 and 0 / 3 == 0: without this guard the loop below never ends. */
  if (nlength <= 0) {
    return;
  }

  /* Strip every factor of 3 from the length. */
  while (mpass % 3 == 0) {
    mpass /= 3;
    cnfac++;
  }

  /* Refuse lengths whose working set would overrun the stack buffers.
   * Divisions are used so the comparison itself cannot overflow. */
  if (mpass > data_3_len / 2 || mpass > y_len / 6 || mpass > p2_y_len / 2) {
    return;
  }

  /* Transform each decimated sub-sequence with the power-of-two FFT. */
  for (i = 0; i < 3 * cnfac; i++) {
    for (j = 0; j < mpass; j++) {
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
    }
    iusace_complex_fft_p2(data_3, mpass, p_fft_p2_y);

    for (j = 0; j < mpass; j++) {
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
    }
  }

  /* Twiddle multiplication: three consecutive table entries per group of
   * three bins, the next group starting 3 * (128 / mpass) entries later. */
  tw_stride = 3 * (128 / mpass);
  for (i = 0; i < nlength; i += 3) {
    for (k = 0; k < 3; k++) {
      FLOAT32 *bin = &data[2 * (i + k)];
      const FLOAT64 rotated_re = (FLOAT64)bin[0] * w1r[k] - (FLOAT64)bin[1] * w1i[k];
      const FLOAT64 rotated_im = (FLOAT64)bin[0] * w1i[k] + (FLOAT64)bin[1] * w1r[k];

      bin[1] = (FLOAT32)rotated_im;
      bin[0] = (FLOAT32)rotated_re;
    }
    w1r += tw_stride;
    w1i += tw_stride;
  }

  /* 3-point DFT over every group of three consecutive complex values. */
  for (i = 0; i < mpass; i++) {
    iusace_complex_3point_fft(&data[6 * i], &y[6 * i]);
  }

  /* Output k of every 3-point DFT goes to the k-th block of mpass bins. */
  for (k = 0; k < 3; k++) {
    for (i = 0; i < mpass; i++) {
      data[2 * (i + k * mpass)] = y[6 * i + 2 * k];
      data[2 * (i + k * mpass) + 1] = y[6 * i + 2 * k + 1];
    }
  }
}
815
816
/*
 * Folds the input block and applies the pre-twiddle, producing the complex
 * sequence that is fed to the FFT.
 *
 * ptr_in  : input samples; reversed in place over [0, npoints) first when
 *           tx_flag is 0.
 * fft_ptr : receives npoints / 4 complex values (interleaved re/im).
 * npoints : transform size parameter.
 * cos_ptr : cosine twiddle table, one entry per output value.
 * sin_ptr : sine twiddle table, one entry per output value.
 * tx_flag : 0 selects the variant that reverses the input beforehand.
 */
static VOID iusace_calc_pre_twid_enc(FLOAT64 *ptr_in, FLOAT32 *fft_ptr, WORD32 npoints,
                                     const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
                                     const WORD32 tx_flag) {
  const WORD32 lower_half = npoints >> 1;
  const WORD32 upper_half = npoints - lower_half;
  const WORD32 num_bins = npoints >> 2;
  const WORD32 centre = upper_half / 2;
  WORD32 bin;

  if (tx_flag == 0) {
    /* Reverse the first npoints samples in place. */
    WORD32 head = 0;
    WORD32 tail = npoints - 1;

    while (head < lower_half) {
      const FLOAT64 saved = ptr_in[head];

      ptr_in[head] = ptr_in[tail];
      ptr_in[tail] = saved;
      head++;
      tail--;
    }
  }

  for (bin = 0; bin < num_bins; bin++) {
    const WORD32 down_idx = npoints / 2 - 1 - 2 * bin;
    const WORD32 up_idx = 2 * bin;
    FLOAT64 fold_re, fold_im;

    /* Real part: samples taken walking downwards from the block centre. */
    fold_re = (bin < lower_half / 4)
                  ? ptr_in[centre + down_idx] + ptr_in[npoints + centre - 1 - down_idx]
                  : ptr_in[centre + down_idx] - ptr_in[centre - 1 - down_idx];

    /* Imaginary part: samples taken walking upwards from the block centre. */
    fold_im = (bin < upper_half / 4)
                  ? ptr_in[centre + up_idx] - ptr_in[centre - 1 - up_idx]
                  : ptr_in[centre + up_idx] + ptr_in[npoints + centre - 1 - up_idx];

    fft_ptr[2 * bin] = (FLOAT32)(fold_re * cos_ptr[bin] + fold_im * sin_ptr[bin]);
    fft_ptr[2 * bin + 1] = (FLOAT32)(fold_im * cos_ptr[bin] - fold_re * sin_ptr[bin]);
  }
}
851
852
12.9M
/*
 * Complex FFT dispatcher.
 *
 * Lengths with a single bit set (and 0) are handled by
 * iusace_complex_fft_p2(); every other length goes to
 * iusace_complex_fft_p3().
 *
 * data         : interleaved real/imaginary samples, transformed in place.
 * nlength      : number of complex points.
 * pstr_scratch : work buffers used by the selected transform.
 */
VOID iusace_complex_fft(FLOAT32 *data, WORD32 nlength, iusace_scratch_mem *pstr_scratch) {
  const WORD32 has_single_bit = ((nlength & (nlength - 1)) == 0);

  if (has_single_bit) {
    iusace_complex_fft_p2(data, nlength, pstr_scratch->p_fft_p2_y);
    return;
  }

  iusace_complex_fft_p3(data, nlength, pstr_scratch);
}
859
860
/**
 * Applies the post-twiddle rotation to the FFT output and scatters the result
 * into ptr_out.
 *
 * Each complex FFT value yields four outputs: -re and +re at indices 2*i and
 * npoints-1-2*i, and +im and -im at indices npoints/2-1-2*i and npoints/2+2*i.
 *
 * @param ptr_out  Output buffer; indices 0 .. npoints-1 are written.
 * @param fft_ptr  Interleaved (re, im) FFT output, npoints/4 complex values.
 * @param npoints  Number of output values produced.
 * @param cos_ptr  Twiddle table, one entry consumed per complex input.
 * @param sin_ptr  Twiddle table, one entry consumed per complex input.
 * @param tx_flag  When 0 ("reuse MDCT" path), every output at index
 *                 0, 2, 4, ... is additionally multiplied by -1.
 */
static VOID iusace_calc_post_twid_enc(FLOAT64 *ptr_out, FLOAT32 *fft_ptr, WORD32 npoints,
                                      const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
                                      const WORD32 tx_flag) {
  const WORD32 num_pairs = npoints >> 2;
  WORD32 k;

  /* rotate each FFT bin, scale by two and place the four mirrored outputs */
  for (k = 0; k < num_pairs; k++) {
    const FLOAT64 cos_val = cos_ptr[k];
    const FLOAT64 sin_val = sin_ptr[k];
    const FLOAT64 rot_re =
        2 * ((FLOAT64)(fft_ptr[2 * k]) * cos_val + (FLOAT64)(fft_ptr[2 * k + 1]) * sin_val);
    const FLOAT64 rot_im =
        2 * ((FLOAT64)(fft_ptr[2 * k + 1]) * cos_val - (FLOAT64)(fft_ptr[2 * k]) * sin_val);

    ptr_out[2 * k] = -rot_re;
    ptr_out[npoints / 2 - 1 - 2 * k] = rot_im;
    ptr_out[npoints / 2 + 2 * k] = -rot_im;
    ptr_out[npoints - 1 - 2 * k] = rot_re;
  }

  if (tx_flag != 0) {
    return;
  }

  /* reuse MDCT: negate every second output, starting with index 0 */
  for (k = 0; k < npoints; k += 2) {
    ptr_out[k] *= -1;
  }
}
885
886
/**
 * Computes a transform of ptr_in into ptr_out using pre-twiddle, a complex
 * FFT of length npoints / 2, and post-twiddle.
 *
 * The transform length is validated before any scratch memory is touched, so
 * an unsupported npoints can no longer drive the size of the scratch clear.
 *
 * @param ptr_in        Input samples; 2 * npoints values are read. With
 *                      tx_flag == 0 the pre-twiddle step reverses them in place.
 * @param ptr_out       Output buffer; 2 * npoints values are written.
 * @param npoints       Transform size; must be 96, 128, 768 or 1024.
 * @param tx_flag       Forwarded to the pre-/post-twiddle helpers; 0 selects
 *                      their "reuse MDCT" path.
 * @param pstr_scratch  Scratch memory; p_fft_mdct_buf is used as the FFT
 *                      working buffer and the structure is forwarded to the FFT.
 *
 * @return IA_NO_ERROR on success, or
 *         IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH when npoints is not
 *         one of the supported sizes (no buffer is modified in that case).
 */
IA_ERRORCODE iusace_fft_based_mdct(FLOAT64 *ptr_in, FLOAT64 *ptr_out, WORD32 npoints,
                                   const WORD32 tx_flag, iusace_scratch_mem *pstr_scratch) {
  FLOAT32 *ptr_scratch1 = pstr_scratch->p_fft_mdct_buf;
  const FLOAT64 *cos_ptr = NULL;
  const FLOAT64 *sin_ptr = NULL;
  WORD32 nlength;
  WORD32 n_total;

  /* Select the twiddle tables first: this doubles as validation of npoints,
   * and must happen before npoints is used to size any memory operation. */
  switch (npoints) {
    case (96):
      cos_ptr = iexheaac_pre_post_twid_cos_192;
      sin_ptr = iexheaac_pre_post_twid_sin_192;
      break;
    case (128):
      cos_ptr = iusace_pre_post_twid_cos_256;
      sin_ptr = iusace_pre_post_twid_sin_256;
      break;
    case (768):
      cos_ptr = iexheaac_pre_post_twid_cos_1536;
      sin_ptr = iexheaac_pre_post_twid_sin_1536;
      break;
    case (1024):
      cos_ptr = iusace_pre_post_twid_cos_2048;
      sin_ptr = iusace_pre_post_twid_sin_2048;
      break;
    default:
      return IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH;
  }

  /* npoints is now known to be small and positive, so the shifts are safe. */
  nlength = npoints >> 1;
  n_total = npoints << 1;

  memset(ptr_scratch1, 0, ((SIZE_T)n_total << 1) * sizeof(*ptr_scratch1));

  /* pre-twiddle */
  iusace_calc_pre_twid_enc(ptr_in, ptr_scratch1, n_total, cos_ptr, sin_ptr, tx_flag);

  /* complex FFT */
  iusace_complex_fft(ptr_scratch1, nlength, pstr_scratch);

  /* post-twiddle */
  iusace_calc_post_twid_enc(ptr_out, ptr_scratch1, n_total, cos_ptr, sin_ptr, tx_flag);

  return IA_NO_ERROR;
}
928
929
208k
/**
 * 2048-point complex FFT assembled from two 1024-point transforms.
 *
 * The two halves of ptr_x (1024 interleaved complex values each, the second
 * starting at float offset 2048) are transformed independently and then
 * combined pairwise: the upper value is rotated by the k-th entry of the
 * 2048-point twiddle tables and added to / subtracted from the lower value.
 *
 * @param ptr_x        Interleaved (re, im) data, 4096 floats, updated in place.
 * @param scratch_fft  Scratch buffer forwarded to iusace_complex_fft_p2.
 */
VOID iusace_complex_fft_2048(FLOAT32 *ptr_x, FLOAT32 *scratch_fft) {
  const FLOAT32 *cos_tab = (const FLOAT32 *)&iusace_twiddle_cos_2048[0];
  const FLOAT32 *sin_tab = (const FLOAT32 *)&iusace_twiddle_sin_2048[0];
  FLOAT32 *lower = ptr_x;
  FLOAT32 *upper = ptr_x + 2048;
  WORD32 k;

  iusace_complex_fft_p2(lower, 1024, scratch_fft);
  iusace_complex_fft_p2(upper, 1024, scratch_fft);

  for (k = 0; k < 1024; k++, lower += 2, upper += 2) {
    const FLOAT32 up_re = upper[0];
    const FLOAT32 up_im = upper[1];
    const FLOAT32 cos_val = cos_tab[k];
    const FLOAT32 sin_val = sin_tab[k];
    /* rotate the upper-half value by the twiddle factor */
    const FLOAT32 rot_re = (up_re * cos_val) + (up_im * sin_val);
    const FLOAT32 rot_im = -(up_re * sin_val) + (up_im * cos_val);
    const FLOAT32 low_re = lower[0];
    const FLOAT32 low_im = lower[1];

    lower[0] = low_re + rot_re;
    lower[1] = low_im + rot_im;
    upper[0] = low_re - rot_re;
    upper[1] = low_im - rot_im;
  }
}
964
/*
 * Complex FFT helper working on split real/imaginary arrays.
 *
 * The n_points inputs are interleaved into the start of ptr_scratch, a first
 * pass of four-point butterflies writes into the region starting at
 * ptr_scratch + 2048, further passes run in place on that region, and the
 * result is de-interleaved back into ptr_real / ptr_imag.
 *
 * ptr_real, ptr_imag : n_points values each, overwritten with the result.
 * n_points           : transform length.
 * ptr_scratch        : working memory; 2 * n_points floats are used at the
 *                      start and 2 * n_points floats from offset 2048.
 *
 * NOTE(review): the fixed 2048 offset means the two scratch regions only stay
 * disjoint for n_points <= 1024; iusace_complex_fft_4096 passes 1024. Confirm
 * before calling with any other length.
 */
static VOID ixheaace_rad2_cplx_fft(FLOAT32 *ptr_real, FLOAT32 *ptr_imag, WORD32 n_points,
965
179k
                                   FLOAT32 *ptr_scratch) {
966
179k
  WORD32 i, j, k, n_stages, h2;
967
179k
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
968
179k
  WORD32 del, nodespacing, in_loop_cnt;
969
179k
  WORD32 not_power_4;
970
179k
  WORD32 dig_rev_shift;
971
179k
  WORD32 m_points = n_points;
972
179k
  FLOAT32 *ptr_x = ptr_scratch;
973
179k
  FLOAT32 *y = ptr_scratch + 2048;
974
179k
  FLOAT32 *ptr_y = y;
975
179k
  const FLOAT32 *ptr_w;
976
977
179k
  /* NOTE(review): presumably ixheaac_norm32 returns the count of redundant
   * leading bits, making n_stages log2(n_points) here - confirm. */
  dig_rev_shift = ixheaac_norm32(m_points) + 1 - 16;
978
179k
  n_stages = 30 - ixheaac_norm32(m_points);
979
179k
  /* An odd stage count selects the extra two-point pass at the end. */
  not_power_4 = n_stages & 1;
980
981
179k
  n_stages = n_stages >> 1;
982
983
179k
  ptr_w = ia_fft_twiddle_table_float;
984
985
183M
  /* Interleave the split input as (re, im) pairs at the start of the scratch. */
  for (i = 0; i < n_points; i++) {
986
183M
    ptr_x[2 * i] = ptr_real[i];
987
183M
    ptr_x[2 * i + 1] = ptr_imag[i];
988
183M
  }
989
179k
  dig_rev_shift = max(dig_rev_shift, 0);
990
46.0M
  /* First pass: one four-point butterfly per iteration. The four inputs are
   * read n_points / 2 floats apart, starting at the offset h2 produced by
   * DIG_REV (presumably digit-reversed addressing - confirm), and the eight
   * results are written sequentially to the second scratch region. */
  for (i = 0; i < n_points; i += 4) {
991
45.8M
    FLOAT32 *inp = ptr_x;
992
45.8M
    FLOAT32 tmk;
993
994
45.8M
    DIG_REV(i, dig_rev_shift, h2);
995
45.8M
    if (not_power_4) {
996
0
      /* round h2 up to the next even value */
      h2 += 1;
997
0
      h2 &= ~1;
998
0
    }
999
45.8M
    inp += (h2);
1000
1001
45.8M
    x0r = *inp;
1002
45.8M
    x0i = *(inp + 1);
1003
45.8M
    inp += (n_points >> 1);
1004
1005
45.8M
    x1r = *inp;
1006
45.8M
    x1i = *(inp + 1);
1007
45.8M
    inp += (n_points >> 1);
1008
1009
45.8M
    x2r = *inp;
1010
45.8M
    x2i = *(inp + 1);
1011
45.8M
    inp += (n_points >> 1);
1012
1013
45.8M
    x3r = *inp;
1014
45.8M
    x3i = *(inp + 1);
1015
1016
45.8M
    x0r = ia_add_flt(x0r, x2r);
1017
45.8M
    x0i = ia_add_flt(x0i, x2i);
1018
1019
45.8M
    /* x0 already holds the sum, so subtracting x2 twice leaves x0_old - x2 */
    tmk = ia_sub_flt(x0r, x2r);
1020
45.8M
    x2r = ia_sub_flt(tmk, x2r);
1021
45.8M
    tmk = ia_sub_flt(x0i, x2i);
1022
45.8M
    x2i = ia_sub_flt(tmk, x2i);
1023
1024
45.8M
    x1r = ia_add_flt(x1r, x3r);
1025
45.8M
    x1i = ia_add_flt(x1i, x3i);
1026
1027
45.8M
    tmk = ia_sub_flt(x1r, x3r);
1028
45.8M
    x3r = ia_sub_flt(tmk, x3r);
1029
45.8M
    tmk = ia_sub_flt(x1i, x3i);
1030
45.8M
    x3i = ia_sub_flt(tmk, x3i);
1031
1032
45.8M
    x0r = ia_add_flt(x0r, x1r);
1033
45.8M
    x0i = ia_add_flt(x0i, x1i);
1034
1035
45.8M
    tmk = ia_sub_flt(x0r, x1r);
1036
45.8M
    x1r = ia_sub_flt(tmk, x1r);
1037
45.8M
    tmk = ia_sub_flt(x0i, x1i);
1038
45.8M
    x1i = ia_sub_flt(tmk, x1i);
1039
1040
45.8M
    x2r = ia_add_flt(x2r, x3i);
1041
45.8M
    x2i = ia_sub_flt(x2i, x3r);
1042
1043
45.8M
    tmk = ia_sub_flt(x2r, x3i);
1044
45.8M
    x3i = ia_sub_flt(tmk, x3i);
1045
45.8M
    tmk = ia_add_flt(x2i, x3r);
1046
45.8M
    x3r = ia_add_flt(tmk, x3r);
1047
1048
45.8M
    /* note the output order: x0, x2, x1, then (x3i, x3r) */
    *ptr_y++ = x0r;
1049
45.8M
    *ptr_y++ = x0i;
1050
45.8M
    *ptr_y++ = x2r;
1051
45.8M
    *ptr_y++ = x2i;
1052
45.8M
    *ptr_y++ = x1r;
1053
45.8M
    *ptr_y++ = x1i;
1054
45.8M
    *ptr_y++ = x3i;
1055
45.8M
    *ptr_y++ = x3r;
1056
45.8M
  }
1057
179k
  /* the loop above advanced ptr_y by 2 * n_points floats; rewind to y */
  ptr_y -= 2 * n_points;
1058
179k
  del = 4;
1059
179k
  nodespacing = 64;
1060
179k
  in_loop_cnt = n_points >> 4;
1061
896k
  /* Remaining passes, in place on y. Each pass combines four values spaced
   * 2 * del floats apart; afterwards del is multiplied by four while
   * nodespacing and in_loop_cnt are divided by four. */
  for (i = n_stages - 1; i > 0; i--) {
1062
716k
    const FLOAT32 *twiddles = ptr_w;
1063
716k
    FLOAT32 *data = ptr_y;
1064
716k
    FLOAT32 w_1, w_2, w_3, w_4, w_5, w_6;
1065
716k
    WORD32 sec_loop_cnt;
1066
1067
15.9M
    /* Column j == 0: butterflies without any twiddle multiplication. */
    for (k = in_loop_cnt; k != 0; k--) {
1068
15.2M
      x0r = (*data);
1069
15.2M
      x0i = (*(data + 1));
1070
15.2M
      data += ((SIZE_T)del << 1);
1071
1072
15.2M
      x1r = (*data);
1073
15.2M
      x1i = (*(data + 1));
1074
15.2M
      data += ((SIZE_T)del << 1);
1075
1076
15.2M
      x2r = (*data);
1077
15.2M
      x2i = (*(data + 1));
1078
15.2M
      data += ((SIZE_T)del << 1);
1079
1080
15.2M
      x3r = (*data);
1081
15.2M
      x3i = (*(data + 1));
1082
15.2M
      data -= 3 * (del << 1);
1083
1084
15.2M
      x0r = ia_add_flt(x0r, x2r);
1085
15.2M
      x0i = ia_add_flt(x0i, x2i);
1086
15.2M
      x2r = ia_msu_flt(x0r, x2r, 2);
1087
15.2M
      x2i = ia_msu_flt(x0i, x2i, 2);
1088
15.2M
      x1r = ia_add_flt(x1r, x3r);
1089
15.2M
      x1i = ia_add_flt(x1i, x3i);
1090
15.2M
      x3r = ia_msu_flt(x1r, x3r, 2);
1091
15.2M
      x3i = ia_msu_flt(x1i, x3i, 2);
1092
1093
15.2M
      x0r = ia_add_flt(x0r, x1r);
1094
15.2M
      x0i = ia_add_flt(x0i, x1i);
1095
15.2M
      x1r = ia_msu_flt(x0r, x1r, 2);
1096
15.2M
      x1i = ia_msu_flt(x0i, x1i, 2);
1097
15.2M
      x2r = ia_add_flt(x2r, x3i);
1098
15.2M
      x2i = ia_sub_flt(x2i, x3r);
1099
15.2M
      x3i = ia_msu_flt(x2r, x3i, 2);
1100
15.2M
      x3r = ia_mac_flt(x2i, x3r, 2);
1101
1102
15.2M
      *data = x0r;
1103
15.2M
      *(data + 1) = x0i;
1104
15.2M
      data += ((SIZE_T)del << 1);
1105
1106
15.2M
      *data = x2r;
1107
15.2M
      *(data + 1) = x2i;
1108
15.2M
      data += ((SIZE_T)del << 1);
1109
1110
15.2M
      *data = x1r;
1111
15.2M
      *(data + 1) = x1i;
1112
15.2M
      data += ((SIZE_T)del << 1);
1113
1114
15.2M
      *data = x3i;
1115
15.2M
      *(data + 1) = x3r;
1116
15.2M
      data += ((SIZE_T)del << 1);
1117
15.2M
    }
1118
716k
    data = ptr_y + 2;
1119
1120
716k
    /* sec_loop_cnt ends up at roughly one third of nodespacing * del
     * (1/4 + 1/8 - 1/16 + 1/32 - 1/64 + 1/128 - 1/256, integer division). */
    sec_loop_cnt = (nodespacing * del);
1121
716k
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
1122
716k
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
1123
716k
                   (sec_loop_cnt / 256);
1124
1125
20.7M
    /* The twiddle index j runs from nodespacing up to nodespacing * del and
     * is handled by four consecutive loops. They differ only in the table
     * offsets used for w_2/w_5 and w_3/w_6 and in the matching multiply forms.
     * NOTE(review): presumably this exploits symmetry of a partial-period
     * table whose two halves sit 257 entries apart - confirm against
     * ia_fft_twiddle_table_float.
     *
     * Range 1: j <= sec_loop_cnt; indices j, 2j and 3j are used directly. */
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
1126
20.0M
      w_1 = *(twiddles + j);
1127
20.0M
      w_4 = *(twiddles + j + 257);
1128
20.0M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
1129
20.0M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
1130
20.0M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
1131
20.0M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);
1132
1133
76.1M
      for (k = in_loop_cnt; k != 0; k--) {
1134
56.0M
        FLOAT32 tmp;
1135
        /*x0 is loaded later to avoid register crunch*/
1136
1137
56.0M
        data += ((SIZE_T)del << 1);
1138
1139
56.0M
        x1r = *data;
1140
56.0M
        x1i = *(data + 1);
1141
56.0M
        data += ((SIZE_T)del << 1);
1142
1143
56.0M
        x2r = *data;
1144
56.0M
        x2i = *(data + 1);
1145
56.0M
        data += ((SIZE_T)del << 1);
1146
1147
56.0M
        x3r = *data;
1148
56.0M
        x3i = *(data + 1);
1149
56.0M
        data -= 3 * (del << 1);
1150
1151
56.0M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1152
56.0M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1153
56.0M
        x1r = tmp;
1154
1155
56.0M
        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
1156
56.0M
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
1157
56.0M
        x2r = tmp;
1158
1159
56.0M
        tmp = ia_sub_flt(ia_mul_flt(x3r, w_3), ia_mul_flt(x3i, w_6));
1160
56.0M
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
1161
56.0M
        x3r = tmp;
1162
1163
56.0M
        x0r = (*data);
1164
56.0M
        x0i = (*(data + 1));
1165
1166
56.0M
        x0r = ia_add_flt(x0r, (x2r));
1167
56.0M
        x0i = ia_add_flt(x0i, (x2i));
1168
56.0M
        x2r = ia_msu_flt(x0r, x2r, 2);
1169
56.0M
        x2i = ia_msu_flt(x0i, x2i, 2);
1170
56.0M
        x1r = ia_add_flt(x1r, x3r);
1171
56.0M
        x1i = ia_add_flt(x1i, x3i);
1172
56.0M
        x3r = ia_msu_flt(x1r, x3r, 2);
1173
56.0M
        x3i = ia_msu_flt(x1i, x3i, 2);
1174
1175
56.0M
        x0r = ia_add_flt(x0r, (x1r));
1176
56.0M
        x0i = ia_add_flt(x0i, (x1i));
1177
56.0M
        x1r = ia_msu_flt(x0r, x1r, 2);
1178
56.0M
        x1i = ia_msu_flt(x0i, x1i, 2);
1179
56.0M
        x2r = ia_add_flt(x2r, (x3i));
1180
56.0M
        x2i = ia_sub_flt(x2i, (x3r));
1181
56.0M
        x3i = ia_msu_flt(x2r, x3i, 2);
1182
56.0M
        x3r = ia_mac_flt(x2i, x3r, 2);
1183
1184
56.0M
        *data = x0r;
1185
56.0M
        *(data + 1) = x0i;
1186
56.0M
        data += ((SIZE_T)del << 1);
1187
1188
56.0M
        *data = x2r;
1189
56.0M
        *(data + 1) = x2i;
1190
56.0M
        data += ((SIZE_T)del << 1);
1191
1192
56.0M
        *data = x1r;
1193
56.0M
        *(data + 1) = x1i;
1194
56.0M
        data += ((SIZE_T)del << 1);
1195
1196
56.0M
        *data = x3i;
1197
56.0M
        *(data + 1) = x3r;
1198
56.0M
        data += ((SIZE_T)del << 1);
1199
56.0M
      }
1200
20.0M
      /* back to the start of the buffer, then on to the next column */
      data -= 2 * n_points;
1201
20.0M
      data += 2;
1202
20.0M
    }
1203
11.1M
    /* Range 2: up to half of nodespacing * del; the 3j lookups are shifted
     * (-256 / +1) and x3 uses the alternate multiply form. */
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
1204
10.3M
      w_1 = *(twiddles + j);
1205
10.3M
      w_4 = *(twiddles + j + 257);
1206
10.3M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
1207
10.3M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
1208
10.3M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
1209
10.3M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
1210
1211
46.0M
      for (k = in_loop_cnt; k != 0; k--) {
1212
35.6M
        FLOAT32 tmp;
1213
        /*x0 is loaded later to avoid register crunch*/
1214
1215
35.6M
        data += ((SIZE_T)del << 1);
1216
1217
35.6M
        x1r = *data;
1218
35.6M
        x1i = *(data + 1);
1219
35.6M
        data += ((SIZE_T)del << 1);
1220
1221
35.6M
        x2r = *data;
1222
35.6M
        x2i = *(data + 1);
1223
35.6M
        data += ((SIZE_T)del << 1);
1224
1225
35.6M
        x3r = *data;
1226
35.6M
        x3i = *(data + 1);
1227
35.6M
        data -= 3 * (del << 1);
1228
1229
35.6M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1230
35.6M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1231
35.6M
        x1r = tmp;
1232
1233
35.6M
        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
1234
35.6M
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
1235
35.6M
        x2r = tmp;
1236
1237
35.6M
        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
1238
35.6M
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1239
35.6M
        x3r = tmp;
1240
1241
35.6M
        x0r = (*data);
1242
35.6M
        x0i = (*(data + 1));
1243
1244
35.6M
        x0r = ia_add_flt(x0r, (x2r));
1245
35.6M
        x0i = ia_add_flt(x0i, (x2i));
1246
35.6M
        x2r = ia_msu_flt(x0r, x2r, 2);
1247
35.6M
        x2i = ia_msu_flt(x0i, x2i, 2);
1248
35.6M
        x1r = ia_add_flt(x1r, x3r);
1249
35.6M
        x1i = ia_add_flt(x1i, x3i);
1250
35.6M
        x3r = ia_msu_flt(x1r, x3r, 2);
1251
35.6M
        x3i = ia_msu_flt(x1i, x3i, 2);
1252
1253
35.6M
        x0r = ia_add_flt(x0r, (x1r));
1254
35.6M
        x0i = ia_add_flt(x0i, (x1i));
1255
35.6M
        x1r = ia_msu_flt(x0r, x1r, 2);
1256
35.6M
        x1i = ia_msu_flt(x0i, x1i, 2);
1257
35.6M
        x2r = ia_add_flt(x2r, (x3i));
1258
35.6M
        x2i = ia_sub_flt(x2i, (x3r));
1259
35.6M
        x3i = ia_msu_flt(x2r, x3i, 2);
1260
35.6M
        x3r = ia_mac_flt(x2i, x3r, 2);
1261
1262
35.6M
        *data = x0r;
1263
35.6M
        *(data + 1) = x0i;
1264
35.6M
        data += ((SIZE_T)del << 1);
1265
1266
35.6M
        *data = x2r;
1267
35.6M
        *(data + 1) = x2i;
1268
35.6M
        data += ((SIZE_T)del << 1);
1269
1270
35.6M
        *data = x1r;
1271
35.6M
        *(data + 1) = x1i;
1272
35.6M
        data += ((SIZE_T)del << 1);
1273
1274
35.6M
        *data = x3i;
1275
35.6M
        *(data + 1) = x3r;
1276
35.6M
        data += ((SIZE_T)del << 1);
1277
35.6M
      }
1278
10.3M
      data -= 2 * n_points;
1279
10.3M
      data += 2;
1280
10.3M
    }
1281
10.3M
    /* Range 3: up to 2 * sec_loop_cnt; both the 2j and 3j lookups are
     * shifted and x2 and x3 use the alternate multiply form. */
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
1282
9.67M
      w_1 = *(twiddles + j);
1283
9.67M
      w_4 = *(twiddles + j + 257);
1284
9.67M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
1285
9.67M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
1286
9.67M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
1287
9.67M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
1288
1289
30.1M
      for (k = in_loop_cnt; k != 0; k--) {
1290
20.4M
        FLOAT32 tmp;
1291
        /*x0 is loaded later to avoid register crunch*/
1292
1293
20.4M
        data += ((SIZE_T)del << 1);
1294
1295
20.4M
        x1r = *data;
1296
20.4M
        x1i = *(data + 1);
1297
20.4M
        data += ((SIZE_T)del << 1);
1298
1299
20.4M
        x2r = *data;
1300
20.4M
        x2i = *(data + 1);
1301
20.4M
        data += ((SIZE_T)del << 1);
1302
1303
20.4M
        x3r = *data;
1304
20.4M
        x3i = *(data + 1);
1305
20.4M
        data -= 3 * (del << 1);
1306
1307
20.4M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1308
20.4M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1309
20.4M
        x1r = tmp;
1310
1311
20.4M
        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
1312
20.4M
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
1313
20.4M
        x2r = tmp;
1314
1315
20.4M
        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
1316
20.4M
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1317
20.4M
        x3r = tmp;
1318
1319
20.4M
        x0r = (*data);
1320
20.4M
        x0i = (*(data + 1));
1321
1322
20.4M
        x0r = ia_add_flt(x0r, (x2r));
1323
20.4M
        x0i = ia_add_flt(x0i, (x2i));
1324
20.4M
        x2r = ia_msu_flt(x0r, x2r, 2);
1325
20.4M
        x2i = ia_msu_flt(x0i, x2i, 2);
1326
20.4M
        x1r = ia_add_flt(x1r, x3r);
1327
20.4M
        x1i = ia_add_flt(x1i, x3i);
1328
20.4M
        x3r = ia_msu_flt(x1r, x3r, 2);
1329
20.4M
        x3i = ia_msu_flt(x1i, x3i, 2);
1330
1331
20.4M
        x0r = ia_add_flt(x0r, (x1r));
1332
20.4M
        x0i = ia_add_flt(x0i, (x1i));
1333
20.4M
        x1r = ia_msu_flt(x0r, x1r, 2);
1334
20.4M
        x1i = ia_msu_flt(x0i, x1i, 2);
1335
20.4M
        x2r = ia_add_flt(x2r, (x3i));
1336
20.4M
        x2i = ia_sub_flt(x2i, (x3r));
1337
20.4M
        x3i = ia_msu_flt(x2r, x3i, 2);
1338
20.4M
        x3r = ia_mac_flt(x2i, x3r, 2);
1339
1340
20.4M
        *data = x0r;
1341
20.4M
        *(data + 1) = x0i;
1342
20.4M
        data += ((SIZE_T)del << 1);
1343
1344
20.4M
        *data = x2r;
1345
20.4M
        *(data + 1) = x2i;
1346
20.4M
        data += ((SIZE_T)del << 1);
1347
1348
20.4M
        *data = x1r;
1349
20.4M
        *(data + 1) = x1i;
1350
20.4M
        data += ((SIZE_T)del << 1);
1351
1352
20.4M
        *data = x3i;
1353
20.4M
        *(data + 1) = x3r;
1354
20.4M
        data += ((SIZE_T)del << 1);
1355
20.4M
      }
1356
9.67M
      data -= 2 * n_points;
1357
9.67M
      data += 2;
1358
9.67M
    }
1359
20.7M
    /* Range 4: the rest, below nodespacing * del; the 3j lookups are shifted
     * by -512, x3 uses a third multiply form, and the x1i / x3i combination
     * below uses sub / mac instead of add / msu. */
    for (; j < nodespacing * del; j += nodespacing) {
1360
20.0M
      w_1 = *(twiddles + j);
1361
20.0M
      w_4 = *(twiddles + j + 257);
1362
20.0M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
1363
20.0M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
1364
20.0M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
1365
20.0M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);
1366
1367
76.1M
      for (k = in_loop_cnt; k != 0; k--) {
1368
56.0M
        FLOAT32 tmp;
1369
        /*x0 is loaded later to avoid register crunch*/
1370
1371
56.0M
        data += ((SIZE_T)del << 1);
1372
1373
56.0M
        x1r = *data;
1374
56.0M
        x1i = *(data + 1);
1375
56.0M
        data += ((SIZE_T)del << 1);
1376
1377
56.0M
        x2r = *data;
1378
56.0M
        x2i = *(data + 1);
1379
56.0M
        data += ((SIZE_T)del << 1);
1380
1381
56.0M
        x3r = *data;
1382
56.0M
        x3i = *(data + 1);
1383
56.0M
        data -= 3 * (del << 1);
1384
1385
56.0M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1386
56.0M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1387
56.0M
        x1r = tmp;
1388
1389
56.0M
        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
1390
56.0M
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
1391
56.0M
        x2r = tmp;
1392
1393
56.0M
        tmp = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1394
56.0M
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
1395
56.0M
        x3r = tmp;
1396
1397
56.0M
        x0r = (*data);
1398
56.0M
        x0i = (*(data + 1));
1399
1400
56.0M
        x0r = ia_add_flt(x0r, (x2r));
1401
56.0M
        x0i = ia_add_flt(x0i, (x2i));
1402
56.0M
        x2r = ia_msu_flt(x0r, x2r, 2);
1403
56.0M
        x2i = ia_msu_flt(x0i, x2i, 2);
1404
56.0M
        x1r = ia_add_flt(x1r, x3r);
1405
56.0M
        x1i = ia_sub_flt(x1i, x3i);
1406
56.0M
        x3r = ia_msu_flt(x1r, x3r, 2);
1407
56.0M
        x3i = ia_mac_flt(x1i, x3i, 2);
1408
1409
56.0M
        x0r = ia_add_flt(x0r, (x1r));
1410
56.0M
        x0i = ia_add_flt(x0i, (x1i));
1411
56.0M
        x1r = ia_msu_flt(x0r, x1r, 2);
1412
56.0M
        x1i = ia_msu_flt(x0i, x1i, 2);
1413
56.0M
        x2r = ia_add_flt(x2r, (x3i));
1414
56.0M
        x2i = ia_sub_flt(x2i, (x3r));
1415
56.0M
        x3i = ia_msu_flt(x2r, x3i, 2);
1416
56.0M
        x3r = ia_mac_flt(x2i, x3r, 2);
1417
1418
56.0M
        *data = x0r;
1419
56.0M
        *(data + 1) = x0i;
1420
56.0M
        data += ((SIZE_T)del << 1);
1421
1422
56.0M
        *data = x2r;
1423
56.0M
        *(data + 1) = x2i;
1424
56.0M
        data += ((SIZE_T)del << 1);
1425
1426
56.0M
        *data = x1r;
1427
56.0M
        *(data + 1) = x1i;
1428
56.0M
        data += ((SIZE_T)del << 1);
1429
1430
56.0M
        *data = x3i;
1431
56.0M
        *(data + 1) = x3r;
1432
56.0M
        data += ((SIZE_T)del << 1);
1433
56.0M
      }
1434
20.0M
      data -= 2 * n_points;
1435
20.0M
      data += 2;
1436
20.0M
    }
1437
716k
    nodespacing >>= 2;
1438
716k
    del <<= 2;
1439
716k
    in_loop_cnt >>= 2;
1440
716k
  }
1441
179k
  /* Extra pass of two-point butterflies, run only when the stage count
   * computed at the top was odd. Two loops of del / 2 butterflies each; the
   * second applies w_1 / w_4 in swapped roles. */
  if (not_power_4) {
1442
0
    const FLOAT32 *twiddles = ptr_w;
1443
0
    nodespacing <<= 1;
1444
1445
0
    for (j = del / 2; j != 0; j--) {
1446
0
      FLOAT32 w_1 = *twiddles;
1447
0
      FLOAT32 w_4 = *(twiddles + 257);
1448
0
      FLOAT32 tmp;
1449
0
      twiddles += nodespacing;
1450
1451
0
      x0r = *ptr_y;
1452
0
      x0i = *(ptr_y + 1);
1453
0
      ptr_y += ((SIZE_T)del << 1);
1454
1455
0
      x1r = *ptr_y;
1456
0
      x1i = *(ptr_y + 1);
1457
1458
0
      tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1459
0
      x1i = (FLOAT32)ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1460
0
      x1r = tmp;
1461
1462
0
      *ptr_y = ia_sub_flt((x0r), (x1r));
1463
0
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
1464
0
      ptr_y -= ((SIZE_T)del << 1);
1465
1466
0
      *ptr_y = ia_add_flt((x0r), (x1r));
1467
0
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
1468
0
      ptr_y += 2;
1469
0
    }
1470
0
    twiddles = ptr_w;
1471
0
    for (j = del / 2; j != 0; j--) {
1472
0
      FLOAT32 w_1 = *twiddles;
1473
0
      FLOAT32 w_4 = *(twiddles + 257);
1474
0
      FLOAT32 tmp;
1475
0
      twiddles += nodespacing;
1476
1477
0
      x0r = *ptr_y;
1478
0
      x0i = *(ptr_y + 1);
1479
0
      ptr_y += ((SIZE_T)del << 1);
1480
1481
0
      x1r = *ptr_y;
1482
0
      x1i = *(ptr_y + 1);
1483
1484
0
      tmp = ia_add_flt(ia_mul_flt(x1r, w_4), ia_mul_flt(x1i, w_1));
1485
0
      x1i = ia_add_flt(ia_negate_flt(ia_mul_flt(x1r, w_1)), ia_mul_flt(x1i, w_4));
1486
0
      x1r = tmp;
1487
1488
0
      *ptr_y = ia_sub_flt((x0r), (x1r));
1489
0
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
1490
0
      ptr_y -= ((SIZE_T)del << 1);
1491
1492
0
      *ptr_y = ia_add_flt((x0r), (x1r));
1493
0
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
1494
0
      ptr_y += 2;
1495
0
    }
1496
0
  }
1497
1498
183M
  /* De-interleave the result back into the caller's split arrays. */
  for (i = 0; i < n_points; i++) {
1499
183M
    ptr_real[i] = y[2 * i];
1500
183M
    ptr_imag[i] = y[2 * i + 1];
1501
183M
  }
1502
179k
}
1503
45.8M
/**
 * In-place four-point complex butterfly on split real/imaginary arrays.
 *
 * All eight inputs are combined into intermediate sums and differences before
 * any output is stored.
 *
 * @param x_r  Four real parts, overwritten with the result.
 * @param x_i  Four imaginary parts, overwritten with the result.
 */
static VOID ixheaace_cplx_fft_4(FLOAT32 *x_r, FLOAT32 *x_i) {
  /* first stage: pair element 0 with 2 and element 1 with 3 */
  const FLOAT32 sum02_re = ia_add_flt(x_r[0], x_r[2]);
  const FLOAT32 sum02_im = ia_add_flt(x_i[0], x_i[2]);
  const FLOAT32 dif02_re = ia_sub_flt(x_r[0], x_r[2]);
  const FLOAT32 dif02_im = ia_sub_flt(x_i[0], x_i[2]);
  const FLOAT32 sum13_re = ia_add_flt(x_r[1], x_r[3]);
  const FLOAT32 sum13_im = ia_add_flt(x_i[1], x_i[3]);
  const FLOAT32 dif13_re = ia_sub_flt(x_r[1], x_r[3]);
  const FLOAT32 dif13_im = ia_sub_flt(x_i[1], x_i[3]);

  /* second stage: outputs 0 and 2 come from the sums */
  x_r[0] = ia_add_flt(sum02_re, sum13_re);
  x_i[0] = ia_add_flt(sum02_im, sum13_im);
  x_r[2] = ia_sub_flt(sum02_re, sum13_re);
  x_i[2] = ia_sub_flt(sum02_im, sum13_im);

  /* outputs 1 and 3 come from the differences, with the real and imaginary
   * parts of the 1/3 difference swapped between the two components */
  x_r[1] = ia_add_flt(dif02_re, dif13_im);
  x_i[1] = ia_sub_flt(dif02_im, dif13_re);
  x_r[3] = ia_sub_flt(dif02_re, dif13_im);
  x_i[3] = ia_add_flt(dif02_im, dif13_re);
}
1538
44.8k
/**
 * 4096-point FFT of a real-valued input, built from four 1024-point
 * transforms followed by twiddle multiplication and 1024 four-point
 * butterflies.
 *
 * Only ptr_x_r is read as input; the imaginary parts fed to the 1024-point
 * transforms are set to zero. Both ptr_x_r and ptr_x_i are overwritten with
 * the result.
 *
 * @param ptr_x_r          4096 input values; receives the real output.
 * @param ptr_x_i          Receives the 4096 imaginary output values.
 * @param ptr_scratch_buf  Working memory. The first 2 * 4096 floats hold the
 *                         de-interleaved sub-sequences; the area starting at
 *                         float offset 2 * 4096 serves both as scratch for
 *                         the 1024-point transform and as the staging buffer
 *                         for the final stage.
 */
VOID iusace_complex_fft_4096(FLOAT32 *ptr_x_r, FLOAT32 *ptr_x_i, FLOAT32 *ptr_scratch_buf) {
  const WORD32 fft_len = 4096;
  const WORD32 num_sub = fft_len >> 10;      /* number of sub-transforms */
  const WORD32 sub_len = fft_len / num_sub;  /* length of each sub-transform */
  const WORD32 tw_step = 4;                  /* stride into the twiddle tables */
  FLOAT32 *ptr_stage = &ptr_scratch_buf[2 * fft_len];
  WORD32 s, n;

  /* Split the input into num_sub decimated sequences (every num_sub-th
   * sample), each stored as a block of reals followed by a zeroed block of
   * imaginaries, and transform each one. */
  for (s = 0; s < num_sub; s++) {
    FLOAT32 *ptr_sub_re = &ptr_scratch_buf[(2 * s + 0) * sub_len];
    FLOAT32 *ptr_sub_im = &ptr_scratch_buf[(2 * s + 1) * sub_len];

    for (n = 0; n < sub_len; n++) {
      ptr_sub_re[n] = ptr_x_r[(num_sub * n + s)];
      ptr_sub_im[n] = 0;
    }
    ixheaace_rad2_cplx_fft(ptr_sub_re, ptr_sub_im, sub_len, ptr_stage);
  }

  /* For every bin of the sub-transforms: rotate the num_sub values by their
   * twiddle factors into an 8-float row of the staging buffer (reals first,
   * then imaginaries) and run the four-point butterfly on that row. */
  for (n = 0; n < sub_len; n++) {
    const FLOAT32 *ptr_cos_row = (const FLOAT32 *)&ia_mixed_rad_twiddle_cos[n * num_sub * tw_step];
    const FLOAT32 *ptr_sin_row = (const FLOAT32 *)&ia_mixed_rad_twiddle_sin[n * num_sub * tw_step];
    FLOAT32 *ptr_row_re = &ptr_stage[(2 * n + 0) * num_sub];
    FLOAT32 *ptr_row_im = &ptr_stage[(2 * n + 1) * num_sub];

    for (s = 0; s < num_sub; s++) {
      const FLOAT32 real = ptr_scratch_buf[(2 * s + 0) * sub_len + n];
      const FLOAT32 imag = ptr_scratch_buf[(2 * s + 1) * sub_len + n];
      const FLOAT32 cos_val = ptr_cos_row[s * tw_step];
      const FLOAT32 sin_val = ptr_sin_row[s * tw_step];

      ptr_row_re[s] = (FLOAT32)(real * cos_val + imag * sin_val);
      ptr_row_im[s] = (FLOAT32)(imag * cos_val - real * sin_val);
    }
    ixheaace_cplx_fft_4(ptr_row_re, ptr_row_im);
  }

  /* Scatter the staged rows to the output: element s of row n lands at
   * index s * sub_len + n. */
  for (n = 0; n < sub_len; n++) {
    for (s = 0; s < num_sub; s++) {
      ptr_x_r[(s * sub_len + n)] = ptr_stage[(2 * n + 0) * num_sub + s];
      ptr_x_i[(s * sub_len + n)] = ptr_stage[(2 * n + 1) * num_sub + s];
    }
  }
}