Coverage Report

Created: 2026-05-30 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libxaac/encoder/iusace_fft.c
Line
Count
Source
1
/******************************************************************************
2
 *                                                                            *
3
 * Copyright (C) 2023 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
 */
20
21
#include <string.h>
22
#include "ixheaac_type_def.h"
23
#include "ixheaace_adjust_threshold_data.h"
24
#include "iusace_cnst.h"
25
#include "iusace_block_switch_const.h"
26
#include "iusace_rom.h"
27
#include "iusace_bitbuffer.h"
28
29
/* DRC */
30
#include "impd_drc_common_enc.h"
31
#include "impd_drc_uni_drc.h"
32
#include "impd_drc_tables.h"
33
#include "impd_drc_api.h"
34
#include "impd_drc_uni_drc_eq.h"
35
#include "impd_drc_uni_drc_filter_bank.h"
36
#include "impd_drc_gain_enc.h"
37
#include "impd_drc_struct_def.h"
38
39
#include "iusace_tns_usac.h"
40
#include "iusace_psy_mod.h"
41
#include "iusace_config.h"
42
#include "iusace_fft.h"
43
#include "iusace_basic_ops_flt.h"
44
#include "ixheaac_constants.h"
45
#include "ixheaace_aac_constants.h"
46
#include "ixheaac_basic_ops32.h"
47
#include "ixheaace_common_utils.h"
48
#include "ixheaac_error_standards.h"
49
#include "ixheaace_error_codes.h"
50
51
/*
 * DIG_REV(i, m, j)
 *
 * Takes the value of `i` as an unsigned int, permutes its bits and stores the
 * result, shifted right by `m`, into `j`.  The permutation is done in three
 * steps: the two 2-bit fields of every 4-bit group are exchanged, then the two
 * 4-bit halves of every byte, then the two bytes of every 16-bit half-word.
 * The two 16-bit half-words themselves are not exchanged.
 *
 * `i` and `m` are each evaluated once; `j` must be an assignable lvalue.
 */
#define DIG_REV(i, m, j)                                         \
  do {                                                           \
    unsigned dig_rev_bits_ = (i);                                \
    /* exchange neighbouring 2-bit fields */                     \
    dig_rev_bits_ = ((dig_rev_bits_ & 0x33333333) << 2) |        \
                    ((dig_rev_bits_ & ~0x33333333) >> 2);        \
    /* exchange neighbouring 4-bit fields */                     \
    dig_rev_bits_ = ((dig_rev_bits_ & 0x0F0F0F0F) << 4) |        \
                    ((dig_rev_bits_ & ~0x0F0F0F0F) >> 4);        \
    /* exchange neighbouring bytes */                            \
    dig_rev_bits_ = ((dig_rev_bits_ & 0x00FF00FF) << 8) |        \
                    ((dig_rev_bits_ & ~0x00FF00FF) >> 8);        \
    (j) = dig_rev_bits_ >> (m);                                  \
  } while (0)
59
60
40.5M
/*
 * Counts how many times `a` has to be doubled (shifted left by one) until it
 * is no longer below 0x40000000.  Negative inputs are first replaced by their
 * bitwise complement.  The two inputs for which that loop would never end,
 * 0 and -1 (0xffffffff), return 31.
 */
static PLATFORM_INLINE WORD8 iusace_calc_norm(WORD32 a) {
  WORD8 shift_cnt = 0;

  if ((a == 0) || (a == (WORD32)0xffffffffL)) {
    return 31;
  }

  if (a < 0) {
    a = ~a;
  }

  while (a < (WORD32)0x40000000L) {
    a <<= 1;
    shift_cnt++;
  }

  return shift_cnt;
}
80
81
91.7M
/*
 * 3-point complex transform.
 *
 * ptr_in  : three complex values stored as {re0, im0, re1, im1, re2, im2}
 * ptr_out : receives three complex results in the same interleaved layout
 *
 * Inputs are re-read from ptr_in while the outputs are being written, so the
 * result is only meaningful when the two buffers do not overlap
 * (NOTE(review): callers visible in this file pass distinct buffers).
 */
static PLATFORM_INLINE VOID iusace_complex_3point_fft(FLOAT32 *ptr_in, FLOAT32 *ptr_out) {
  /* 0.866025403784439 ~= sqrt(3) / 2 */
  const FLOAT64 k_sqrt3_by_2 = 0.866025403784439;

  FLOAT32 sum01_re, sum01_im;
  FLOAT32 sum12_re, sum12_im;
  FLOAT32 diff12_re, diff12_im;
  FLOAT32 half_sum_re, half_sum_im;
  FLOAT32 scaled_diff_re, scaled_diff_im;
  FLOAT32 base_re;

  /* sums and differences of the input samples */
  sum01_re = ptr_in[0] + ptr_in[2];
  sum01_im = ptr_in[1] + ptr_in[3];

  sum12_re = ptr_in[2] + ptr_in[4];
  sum12_im = ptr_in[3] + ptr_in[5];

  diff12_re = ptr_in[2] - ptr_in[4];
  diff12_im = ptr_in[3] - ptr_in[5];

  /* half of the sums, and the differences scaled in double precision */
  half_sum_re = sum12_re / (FLOAT32)2.0;
  half_sum_im = sum12_im / (FLOAT32)2.0;
  scaled_diff_im = (FLOAT32)((FLOAT64)diff12_im * k_sqrt3_by_2);
  scaled_diff_re = (FLOAT32)((FLOAT64)diff12_re * k_sqrt3_by_2);

  base_re = ptr_in[0] - half_sum_re;

  /* output 0: plain sum of the three inputs */
  ptr_out[0] = sum01_re + ptr_in[4];
  ptr_out[1] = sum01_im + ptr_in[5];

  /* outputs 1 and 2 */
  ptr_out[2] = base_re + scaled_diff_im;
  ptr_out[3] = (ptr_in[1] - scaled_diff_re) - half_sum_im;
  ptr_out[4] = base_re - scaled_diff_im;
  ptr_out[5] = (ptr_in[1] + scaled_diff_re) - half_sum_im;
}
115
116
20.2M
VOID iusace_complex_fft_p2(FLOAT32 *ptr_x, WORD32 nlength, FLOAT32 *scratch_fft_p2_y) {
117
20.2M
  WORD32 i, j, k, n_stages, h2;
118
20.2M
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
119
20.2M
  FLOAT32 tmp;
120
20.2M
  WORD32 del, nodespacing, in_loop_cnt;
121
20.2M
  WORD32 not_power_4;
122
20.2M
  WORD32 dig_rev_shift;
123
20.2M
  FLOAT32 *y = scratch_fft_p2_y;
124
20.2M
  WORD32 mpass = nlength;
125
20.2M
  WORD32 npoints = nlength;
126
20.2M
  FLOAT32 *ptr_y = y;
127
20.2M
  const FLOAT64 *ptr_w;
128
129
20.2M
  dig_rev_shift = iusace_calc_norm(mpass) + 1 - 16;
130
20.2M
  n_stages = 30 - iusace_calc_norm(mpass);
131
20.2M
  not_power_4 = n_stages & 1;
132
133
20.2M
  n_stages = n_stages >> 1;
134
135
20.2M
  ptr_w = iusace_twiddle_table_fft_32x32;
136
137
20.2M
  if (dig_rev_shift < 0) {
138
0
    dig_rev_shift = 0;
139
0
  }
140
141
433M
  for (i = 0; i < npoints; i += 4) {
142
413M
    FLOAT32 *inp = ptr_x;
143
413M
    FLOAT32 tmk;
144
145
413M
    DIG_REV(i, dig_rev_shift, h2);
146
413M
    if (not_power_4) {
147
192M
      h2 += 1;
148
192M
      h2 &= ~1;
149
192M
    }
150
413M
    inp += (h2);
151
152
413M
    x0r = *inp;
153
413M
    x0i = *(inp + 1);
154
413M
    inp += (npoints >> 1);
155
156
413M
    x1r = *inp;
157
413M
    x1i = *(inp + 1);
158
413M
    inp += (npoints >> 1);
159
160
413M
    x2r = *inp;
161
413M
    x2i = *(inp + 1);
162
413M
    inp += (npoints >> 1);
163
164
413M
    x3r = *inp;
165
413M
    x3i = *(inp + 1);
166
167
413M
    x0r = x0r + x2r;
168
413M
    x0i = x0i + x2i;
169
170
413M
    tmk = x0r - x2r;
171
413M
    x2r = tmk - x2r;
172
413M
    tmk = x0i - x2i;
173
413M
    x2i = tmk - x2i;
174
175
413M
    x1r = x1r + x3r;
176
413M
    x1i = x1i + x3i;
177
178
413M
    tmk = x1r - x3r;
179
413M
    x3r = tmk - x3r;
180
413M
    tmk = x1i - x3i;
181
413M
    x3i = tmk - x3i;
182
183
413M
    x0r = x0r + x1r;
184
413M
    x0i = x0i + x1i;
185
186
413M
    tmk = x0r - x1r;
187
413M
    x1r = tmk - x1r;
188
413M
    tmk = x0i - x1i;
189
413M
    x1i = tmk - x1i;
190
191
413M
    x2r = x2r + x3i;
192
413M
    x2i = x2i - x3r;
193
194
413M
    tmk = x2r - x3i;
195
413M
    x3i = tmk - x3i;
196
413M
    tmk = x2i + x3r;
197
413M
    x3r = tmk + x3r;
198
199
413M
    *ptr_y++ = x0r;
200
413M
    *ptr_y++ = x0i;
201
413M
    *ptr_y++ = x2r;
202
413M
    *ptr_y++ = x2i;
203
413M
    *ptr_y++ = x1r;
204
413M
    *ptr_y++ = x1i;
205
413M
    *ptr_y++ = x3i;
206
413M
    *ptr_y++ = x3r;
207
413M
  }
208
20.2M
  ptr_y -= 2 * npoints;
209
20.2M
  del = 4;
210
20.2M
  nodespacing = 64;
211
20.2M
  in_loop_cnt = npoints >> 4;
212
45.3M
  for (i = n_stages - 1; i > 0; i--) {
213
25.0M
    const FLOAT64 *twiddles = ptr_w;
214
25.0M
    FLOAT32 *data = ptr_y;
215
25.0M
    FLOAT64 w_1, w_2, w_3, w_4, w_5, w_6;
216
25.0M
    WORD32 sec_loop_cnt;
217
218
152M
    for (k = in_loop_cnt; k != 0; k--) {
219
127M
      x0r = (*data);
220
127M
      x0i = (*(data + 1));
221
127M
      data += ((SIZE_T)del << 1);
222
223
127M
      x1r = (*data);
224
127M
      x1i = (*(data + 1));
225
127M
      data += ((SIZE_T)del << 1);
226
227
127M
      x2r = (*data);
228
127M
      x2i = (*(data + 1));
229
127M
      data += ((SIZE_T)del << 1);
230
231
127M
      x3r = (*data);
232
127M
      x3i = (*(data + 1));
233
127M
      data -= 3 * (del << 1);
234
235
127M
      x0r = x0r + x2r;
236
127M
      x0i = x0i + x2i;
237
127M
      x2r = x0r - (x2r * 2);
238
127M
      x2i = x0i - (x2i * 2);
239
127M
      x1r = x1r + x3r;
240
127M
      x1i = x1i + x3i;
241
127M
      x3r = x1r - (x3r * 2);
242
127M
      x3i = x1i - (x3i * 2);
243
244
127M
      x0r = x0r + x1r;
245
127M
      x0i = x0i + x1i;
246
127M
      x1r = x0r - (x1r * 2);
247
127M
      x1i = x0i - (x1i * 2);
248
127M
      x2r = x2r + x3i;
249
127M
      x2i = x2i - x3r;
250
127M
      x3i = x2r - (x3i * 2);
251
127M
      x3r = x2i + (x3r * 2);
252
253
127M
      *data = x0r;
254
127M
      *(data + 1) = x0i;
255
127M
      data += ((SIZE_T)del << 1);
256
257
127M
      *data = x2r;
258
127M
      *(data + 1) = x2i;
259
127M
      data += ((SIZE_T)del << 1);
260
261
127M
      *data = x1r;
262
127M
      *(data + 1) = x1i;
263
127M
      data += ((SIZE_T)del << 1);
264
265
127M
      *data = x3i;
266
127M
      *(data + 1) = x3r;
267
127M
      data += ((SIZE_T)del << 1);
268
127M
    }
269
25.0M
    data = ptr_y + 2;
270
271
25.0M
    sec_loop_cnt = (nodespacing * del);
272
25.0M
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
273
25.0M
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
274
25.0M
                   (sec_loop_cnt / 256);
275
276
148M
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
277
123M
      w_1 = *(twiddles + j);
278
123M
      w_4 = *(twiddles + j + 257);
279
123M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
280
123M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
281
123M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
282
123M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);
283
284
449M
      for (k = in_loop_cnt; k != 0; k--) {
285
325M
        data += ((SIZE_T)del << 1);
286
287
325M
        x1r = *data;
288
325M
        x1i = *(data + 1);
289
325M
        data += ((SIZE_T)del << 1);
290
291
325M
        x2r = *data;
292
325M
        x2i = *(data + 1);
293
325M
        data += ((SIZE_T)del << 1);
294
295
325M
        x3r = *data;
296
325M
        x3i = *(data + 1);
297
325M
        data -= 3 * (del << 1);
298
299
325M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
300
325M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
301
325M
        x1r = tmp;
302
303
325M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
304
325M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
305
325M
        x2r = tmp;
306
307
325M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_3) - ixheaace_dmult((FLOAT64)x3i, w_6));
308
325M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
309
325M
        x3r = tmp;
310
311
325M
        x0r = (*data);
312
325M
        x0i = (*(data + 1));
313
314
325M
        x0r = x0r + (x2r);
315
325M
        x0i = x0i + (x2i);
316
325M
        x2r = x0r - (x2r * 2);
317
325M
        x2i = x0i - (x2i * 2);
318
325M
        x1r = x1r + x3r;
319
325M
        x1i = x1i + x3i;
320
325M
        x3r = x1r - (x3r * 2);
321
325M
        x3i = x1i - (x3i * 2);
322
323
325M
        x0r = x0r + (x1r);
324
325M
        x0i = x0i + (x1i);
325
325M
        x1r = x0r - (x1r * 2);
326
325M
        x1i = x0i - (x1i * 2);
327
325M
        x2r = x2r + (x3i);
328
325M
        x2i = x2i - (x3r);
329
325M
        x3i = x2r - (x3i * 2);
330
325M
        x3r = x2i + (x3r * 2);
331
332
325M
        *data = x0r;
333
325M
        *(data + 1) = x0i;
334
325M
        data += ((SIZE_T)del << 1);
335
336
325M
        *data = x2r;
337
325M
        *(data + 1) = x2i;
338
325M
        data += ((SIZE_T)del << 1);
339
340
325M
        *data = x1r;
341
325M
        *(data + 1) = x1i;
342
325M
        data += ((SIZE_T)del << 1);
343
344
325M
        *data = x3i;
345
325M
        *(data + 1) = x3r;
346
325M
        data += ((SIZE_T)del << 1);
347
325M
      }
348
123M
      data -= 2 * npoints;
349
123M
      data += 2;
350
123M
    }
351
99.4M
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
352
74.3M
      w_1 = *(twiddles + j);
353
74.3M
      w_4 = *(twiddles + j + 257);
354
74.3M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
355
74.3M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
356
74.3M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
357
74.3M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
358
359
301M
      for (k = in_loop_cnt; k != 0; k--) {
360
226M
        data += ((SIZE_T)del << 1);
361
362
226M
        x1r = *data;
363
226M
        x1i = *(data + 1);
364
226M
        data += ((SIZE_T)del << 1);
365
366
226M
        x2r = *data;
367
226M
        x2i = *(data + 1);
368
226M
        data += ((SIZE_T)del << 1);
369
370
226M
        x3r = *data;
371
226M
        x3i = *(data + 1);
372
226M
        data -= 3 * (del << 1);
373
374
226M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
375
226M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
376
226M
        x1r = tmp;
377
378
226M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
379
226M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
380
226M
        x2r = tmp;
381
382
226M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
383
226M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
384
226M
        x3r = tmp;
385
386
226M
        x0r = (*data);
387
226M
        x0i = (*(data + 1));
388
389
226M
        x0r = x0r + (x2r);
390
226M
        x0i = x0i + (x2i);
391
226M
        x2r = x0r - (x2r * 2);
392
226M
        x2i = x0i - (x2i * 2);
393
226M
        x1r = x1r + x3r;
394
226M
        x1i = x1i + x3i;
395
226M
        x3r = x1r - (x3r * 2);
396
226M
        x3i = x1i - (x3i * 2);
397
398
226M
        x0r = x0r + (x1r);
399
226M
        x0i = x0i + (x1i);
400
226M
        x1r = x0r - (x1r * 2);
401
226M
        x1i = x0i - (x1i * 2);
402
226M
        x2r = x2r + (x3i);
403
226M
        x2i = x2i - (x3r);
404
226M
        x3i = x2r - (x3i * 2);
405
226M
        x3r = x2i + (x3r * 2);
406
407
226M
        *data = x0r;
408
226M
        *(data + 1) = x0i;
409
226M
        data += ((SIZE_T)del << 1);
410
411
226M
        *data = x2r;
412
226M
        *(data + 1) = x2i;
413
226M
        data += ((SIZE_T)del << 1);
414
415
226M
        *data = x1r;
416
226M
        *(data + 1) = x1i;
417
226M
        data += ((SIZE_T)del << 1);
418
419
226M
        *data = x3i;
420
226M
        *(data + 1) = x3r;
421
226M
        data += ((SIZE_T)del << 1);
422
226M
      }
423
74.3M
      data -= 2 * npoints;
424
74.3M
      data += 2;
425
74.3M
    }
426
74.3M
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
427
49.2M
      w_1 = *(twiddles + j);
428
49.2M
      w_4 = *(twiddles + j + 257);
429
49.2M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
430
49.2M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
431
49.2M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
432
49.2M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
433
434
148M
      for (k = in_loop_cnt; k != 0; k--) {
435
99.0M
        data += ((SIZE_T)del << 1);
436
437
99.0M
        x1r = *data;
438
99.0M
        x1i = *(data + 1);
439
99.0M
        data += ((SIZE_T)del << 1);
440
441
99.0M
        x2r = *data;
442
99.0M
        x2i = *(data + 1);
443
99.0M
        data += ((SIZE_T)del << 1);
444
445
99.0M
        x3r = *data;
446
99.0M
        x3i = *(data + 1);
447
99.0M
        data -= 3 * (del << 1);
448
449
99.0M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
450
99.0M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult(x1r, w_4), x1i, w_1);
451
99.0M
        x1r = tmp;
452
453
99.0M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
454
99.0M
        x2i = (FLOAT32)(-ixheaace_dmult(x2r, w_2) + ixheaace_dmult(x2i, w_5));
455
99.0M
        x2r = tmp;
456
457
99.0M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
458
99.0M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
459
99.0M
        x3r = tmp;
460
461
99.0M
        x0r = (*data);
462
99.0M
        x0i = (*(data + 1));
463
464
99.0M
        x0r = x0r + (x2r);
465
99.0M
        x0i = x0i + (x2i);
466
99.0M
        x2r = x0r - (x2r * 2);
467
99.0M
        x2i = x0i - (x2i * 2);
468
99.0M
        x1r = x1r + x3r;
469
99.0M
        x1i = x1i + x3i;
470
99.0M
        x3r = x1r - (x3r * 2);
471
99.0M
        x3i = x1i - (x3i * 2);
472
473
99.0M
        x0r = x0r + (x1r);
474
99.0M
        x0i = x0i + (x1i);
475
99.0M
        x1r = x0r - (x1r * 2);
476
99.0M
        x1i = x0i - (x1i * 2);
477
99.0M
        x2r = x2r + (x3i);
478
99.0M
        x2i = x2i - (x3r);
479
99.0M
        x3i = x2r - (x3i * 2);
480
99.0M
        x3r = x2i + (x3r * 2);
481
482
99.0M
        *data = x0r;
483
99.0M
        *(data + 1) = x0i;
484
99.0M
        data += ((SIZE_T)del << 1);
485
486
99.0M
        *data = x2r;
487
99.0M
        *(data + 1) = x2i;
488
99.0M
        data += ((SIZE_T)del << 1);
489
490
99.0M
        *data = x1r;
491
99.0M
        *(data + 1) = x1i;
492
99.0M
        data += ((SIZE_T)del << 1);
493
494
99.0M
        *data = x3i;
495
99.0M
        *(data + 1) = x3r;
496
99.0M
        data += ((SIZE_T)del << 1);
497
99.0M
      }
498
49.2M
      data -= 2 * npoints;
499
49.2M
      data += 2;
500
49.2M
    }
501
148M
    for (; j < nodespacing * del; j += nodespacing) {
502
123M
      w_1 = *(twiddles + j);
503
123M
      w_4 = *(twiddles + j + 257);
504
123M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
505
123M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
506
123M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
507
123M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);
508
509
449M
      for (k = in_loop_cnt; k != 0; k--) {
510
325M
        data += ((SIZE_T)del << 1);
511
512
325M
        x1r = *data;
513
325M
        x1i = *(data + 1);
514
325M
        data += ((SIZE_T)del << 1);
515
516
325M
        x2r = *data;
517
325M
        x2i = *(data + 1);
518
325M
        data += ((SIZE_T)del << 1);
519
520
325M
        x3r = *data;
521
325M
        x3i = *(data + 1);
522
325M
        data -= 3 * ((SIZE_T)del << 1);
523
524
325M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
525
325M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
526
325M
        x1r = tmp;
527
528
325M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
529
325M
        x2i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x2r, w_2) + ixheaace_dmult((FLOAT64)x2i, w_5));
530
325M
        x2r = tmp;
531
532
325M
        tmp = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
533
325M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
534
325M
        x3r = tmp;
535
536
325M
        x0r = (*data);
537
325M
        x0i = (*(data + 1));
538
539
325M
        x0r = x0r + (x2r);
540
325M
        x0i = x0i + (x2i);
541
325M
        x2r = x0r - (x2r * 2);
542
325M
        x2i = x0i - (x2i * 2);
543
325M
        x1r = x1r + x3r;
544
325M
        x1i = x1i - x3i;
545
325M
        x3r = x1r - (x3r * 2);
546
325M
        x3i = x1i + (x3i * 2);
547
548
325M
        x0r = x0r + (x1r);
549
325M
        x0i = x0i + (x1i);
550
325M
        x1r = x0r - (x1r * 2);
551
325M
        x1i = x0i - (x1i * 2);
552
325M
        x2r = x2r + (x3i);
553
325M
        x2i = x2i - (x3r);
554
325M
        x3i = x2r - (x3i * 2);
555
325M
        x3r = x2i + (x3r * 2);
556
557
325M
        *data = x0r;
558
325M
        *(data + 1) = x0i;
559
325M
        data += ((SIZE_T)del << 1);
560
561
325M
        *data = x2r;
562
325M
        *(data + 1) = x2i;
563
325M
        data += ((SIZE_T)del << 1);
564
565
325M
        *data = x1r;
566
325M
        *(data + 1) = x1i;
567
325M
        data += ((SIZE_T)del << 1);
568
569
325M
        *data = x3i;
570
325M
        *(data + 1) = x3r;
571
325M
        data += ((SIZE_T)del << 1);
572
325M
      }
573
123M
      data -= 2 * npoints;
574
123M
      data += 2;
575
123M
    }
576
25.0M
    nodespacing >>= 2;
577
25.0M
    del <<= 2;
578
25.0M
    in_loop_cnt >>= 2;
579
25.0M
  }
580
20.2M
  if (not_power_4) {
581
10.3M
    const FLOAT64 *twiddles = ptr_w;
582
10.3M
    nodespacing <<= 1;
583
584
202M
    for (j = del / 2; j != 0; j--) {
585
192M
      FLOAT64 w_1 = *twiddles;
586
192M
      FLOAT64 w_4 = *(twiddles + 257);
587
192M
      twiddles += nodespacing;
588
589
192M
      x0r = *ptr_y;
590
192M
      x0i = *(ptr_y + 1);
591
192M
      ptr_y += ((SIZE_T)del << 1);
592
593
192M
      x1r = *ptr_y;
594
192M
      x1i = *(ptr_y + 1);
595
596
192M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
597
192M
      x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
598
192M
      x1r = tmp;
599
600
192M
      *ptr_y = (x0r) - (x1r);
601
192M
      *(ptr_y + 1) = (x0i) - (x1i);
602
192M
      ptr_y -= ((SIZE_T)del << 1);
603
604
192M
      *ptr_y = (x0r) + (x1r);
605
192M
      *(ptr_y + 1) = (x0i) + (x1i);
606
192M
      ptr_y += 2;
607
192M
    }
608
10.3M
    twiddles = ptr_w;
609
202M
    for (j = del / 2; j != 0; j--) {
610
192M
      FLOAT64 w_1 = *twiddles;
611
192M
      FLOAT64 w_4 = *(twiddles + 257);
612
192M
      twiddles += nodespacing;
613
614
192M
      x0r = *ptr_y;
615
192M
      x0i = *(ptr_y + 1);
616
192M
      ptr_y += ((SIZE_T)del << 1);
617
618
192M
      x1r = *ptr_y;
619
192M
      x1i = *(ptr_y + 1);
620
621
192M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_4) + ixheaace_dmult((FLOAT64)x1i, w_1));
622
192M
      x1i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x1r, w_1) + ixheaace_dmult((FLOAT64)x1i, w_4));
623
192M
      x1r = tmp;
624
625
192M
      *ptr_y = (x0r) - (x1r);
626
192M
      *(ptr_y + 1) = (x0i) - (x1i);
627
192M
      ptr_y -= ((SIZE_T)del << 1);
628
629
192M
      *ptr_y = (x0r) + (x1r);
630
192M
      *(ptr_y + 1) = (x0i) + (x1i);
631
192M
      ptr_y += 2;
632
192M
    }
633
10.3M
  }
634
635
1.67G
  for (i = 0; i < nlength; i++) {
636
1.65G
    *(ptr_x + 2 * i) = y[2 * i];
637
1.65G
    *(ptr_x + 2 * i + 1) = y[2 * i + 1];
638
1.65G
  }
639
20.2M
}
640
641
static VOID iusace_complex_fft_p3(FLOAT32 *data, WORD32 nlength,
642
2.79M
                                  iusace_scratch_mem *pstr_scratch) {
643
2.79M
  WORD32 i, j;
644
2.79M
  FLOAT32 *data_3 = pstr_scratch->p_fft_p3_data_3;
645
2.79M
  FLOAT32 *y = pstr_scratch->p_fft_p3_y;
646
2.79M
  WORD32 cnfac;
647
2.79M
  WORD32 mpass = nlength;
648
2.79M
  FLOAT32 *ptr_x = data;
649
2.79M
  FLOAT32 *ptr_y = y;
650
651
2.79M
  cnfac = 0;
652
5.59M
  while (mpass % 3 == 0) {
653
2.79M
    mpass /= 3;
654
2.79M
    cnfac++;
655
2.79M
  }
656
657
11.1M
  for (i = 0; i < 3 * cnfac; i++) {
658
283M
    for (j = 0; j < mpass; j++) {
659
275M
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
660
275M
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
661
275M
    }
662
8.39M
    iusace_complex_fft_p2(data_3, mpass, pstr_scratch->p_fft_p2_y);
663
664
283M
    for (j = 0; j < mpass; j++) {
665
275M
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
666
275M
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
667
275M
    }
668
8.39M
  }
669
670
2.79M
  {
671
2.79M
    const FLOAT64 *w1r, *w1i;
672
2.79M
    FLOAT32 tmp;
673
2.79M
    w1r = iusace_twiddle_table_3pr;
674
2.79M
    w1i = iusace_twiddle_table_3pi;
675
676
94.5M
    for (i = 0; i < nlength; i += 3) {
677
91.7M
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
678
91.7M
      data[2 * i + 1] =
679
91.7M
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
680
91.7M
      data[2 * i] = tmp;
681
682
91.7M
      w1r++;
683
91.7M
      w1i++;
684
685
91.7M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
686
91.7M
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
687
91.7M
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
688
91.7M
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
689
91.7M
      data[2 * (i + 1)] = tmp;
690
691
91.7M
      w1r++;
692
91.7M
      w1i++;
693
694
91.7M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
695
91.7M
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
696
91.7M
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
697
91.7M
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
698
91.7M
      data[2 * (i + 2)] = tmp;
699
700
91.7M
      w1r += 3 * (128 / mpass - 1) + 1;
701
91.7M
      w1i += 3 * (128 / mpass - 1) + 1;
702
91.7M
    }
703
2.79M
  }
704
705
94.5M
  for (i = 0; i < mpass; i++) {
706
91.7M
    iusace_complex_3point_fft(ptr_x, ptr_y);
707
708
91.7M
    ptr_x = ptr_x + 6;
709
91.7M
    ptr_y = ptr_y + 6;
710
91.7M
  }
711
712
94.5M
  for (i = 0; i < mpass; i++) {
713
91.7M
    data[2 * i] = y[6 * i];
714
91.7M
    data[2 * i + 1] = y[6 * i + 1];
715
91.7M
  }
716
717
94.5M
  for (i = 0; i < mpass; i++) {
718
91.7M
    data[2 * (i + mpass)] = y[6 * i + 2];
719
91.7M
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
720
91.7M
  }
721
722
94.5M
  for (i = 0; i < mpass; i++) {
723
91.7M
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
724
91.7M
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
725
91.7M
  }
726
2.79M
}
727
728
0
/*
 * Same transform as iusace_complex_fft_p3(), but with the three work buffers
 * taken from the stack instead of a caller-provided scratch structure.
 *
 * data    : nlength complex values, interleaved {re, im}; overwritten
 * nlength : number of complex points
 *
 * The call is a no-op (data is left untouched) when
 *   - nlength <= 0 (for 0 the factor-of-3 stripping loop would never end), or
 *   - the length left after removing all factors of 3 needs more than the
 *     1024 floats of the local 3-point output buffer (6 floats per point).
 *
 * NOTE(review): like iusace_complex_fft_p3(), this presumably expects the
 * remaining length to be a power of two that divides 128 - confirm with the
 * callers and the twiddle table sizes.
 */
VOID iusace_complex_fft_p3_no_scratch(FLOAT32 *data, WORD32 nlength) {
  FLOAT32 data_3[800];
  FLOAT32 y[1024];
  FLOAT32 p_fft_p2_y[2048];
  iusace_scratch_mem str_scratch;
  WORD32 mpass = nlength;

  if (nlength <= 0) {
    return;
  }

  /* length of the sub-transforms: nlength without its factors of 3 */
  while (mpass % 3 == 0) {
    mpass /= 3;
  }

  /* the 3-point stage writes 6 floats into y for each of the mpass points; */
  /* this bound also keeps data_3 (2 floats per point) within its 800 floats */
  if (mpass > (WORD32)((sizeof(y) / sizeof(y[0])) / 6)) {
    return;
  }

  /* only the three buffer pointers below are read by the shared routine */
  memset(&str_scratch, 0, sizeof(str_scratch));
  str_scratch.p_fft_p3_data_3 = data_3;
  str_scratch.p_fft_p3_y = y;
  str_scratch.p_fft_p2_y = p_fft_p2_y;

  iusace_complex_fft_p3(data, nlength, &str_scratch);
}
815
816
/*
 * Folds the real input block into npoints / 4 complex values and multiplies
 * each of them by the matching entry of the cos / sin tables.
 *
 * ptr_in  : real input samples; reversed in place first when tx_flag == 0
 * fft_ptr : receives npoints / 4 complex values, interleaved {re, im}
 * npoints : transform size
 * cos_ptr : cosine twiddle table, one entry per complex output
 * sin_ptr : sine twiddle table, one entry per complex output
 * tx_flag : 0 selects the variant that reuses the MDCT on a reversed input
 */
static VOID iusace_calc_pre_twid_enc(FLOAT64 *ptr_in, FLOAT32 *fft_ptr, WORD32 npoints,
                                     const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
                                     const WORD32 tx_flag) {
  const WORD32 half_lo = npoints >> 1;
  const WORD32 half_hi = npoints - half_lo;
  const WORD32 num_cplx = npoints >> 2;
  const WORD32 centre = half_hi / 2;
  WORD32 idx;

  if (tx_flag == 0) {
    /* reuse MDCT: reverse the order of all input values */
    for (idx = 0; idx < half_lo; idx++) {
      const FLOAT64 saved = ptr_in[idx];
      ptr_in[idx] = ptr_in[npoints - 1 - idx];
      ptr_in[npoints - 1 - idx] = saved;
    }
  }

  for (idx = 0; idx < num_cplx; idx++) {
    const WORD32 n_re = npoints / 2 - 1 - 2 * idx;
    const WORD32 n_im = 2 * idx;
    FLOAT64 fold_re, fold_im;

    /* real part: walks downwards from the middle of the block */
    fold_re = (idx < half_lo / 4)
                  ? (ptr_in[centre + n_re] + ptr_in[npoints + centre - 1 - n_re])
                  : (ptr_in[centre + n_re] - ptr_in[centre - 1 - n_re]);

    /* imaginary part: walks upwards, with the two cases exchanged */
    fold_im = (idx < half_hi / 4)
                  ? (ptr_in[centre + n_im] - ptr_in[centre - 1 - n_im])
                  : (ptr_in[centre + n_im] + ptr_in[npoints + centre - 1 - n_im]);

    fft_ptr[2 * idx] = (FLOAT32)(fold_re * cos_ptr[idx] + fold_im * sin_ptr[idx]);
    fft_ptr[2 * idx + 1] = (FLOAT32)(fold_im * cos_ptr[idx] - fold_re * sin_ptr[idx]);
  }
}
851
852
9.27M
/*
 * In-place complex transform dispatcher.
 *
 * data         : nlength complex values, interleaved {re, im}
 * nlength      : number of complex points
 * pstr_scratch : work buffers used by the selected routine
 *
 * Lengths with exactly one bit set (and 0) go to iusace_complex_fft_p2();
 * every other length goes to iusace_complex_fft_p3().
 */
VOID iusace_complex_fft(FLOAT32 *data, WORD32 nlength, iusace_scratch_mem *pstr_scratch) {
  /* n & (n - 1) clears the lowest set bit; zero means at most one bit set */
  const WORD32 extra_bits = nlength & (nlength - 1);

  if (extra_bits == 0) {
    iusace_complex_fft_p2(data, nlength, pstr_scratch->p_fft_p2_y);
    return;
  }

  iusace_complex_fft_p3(data, nlength, pstr_scratch);
}
859
860
/*
 * Post-twiddle stage of the FFT-based transform: rotates each complex FFT
 * output by one twiddle factor, scales it by 2 and scatters the real and
 * imaginary parts (with sign changes) into four positions of ptr_out.
 *
 * ptr_out  out: npoints values; indices 0 .. npoints - 1 are written.
 * fft_ptr  in: npoints / 4 complex values as interleaved (re, im) pairs.
 * npoints  size of the output array.
 * cos_ptr, sin_ptr  twiddle tables; one entry of each is consumed per
 *          complex input value.
 * tx_flag  0 additionally negates every output at an even index.
 */
static VOID iusace_calc_post_twid_enc(FLOAT64 *ptr_out, FLOAT32 *fft_ptr, WORD32 npoints,
861
                                      const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
862
2.69M
                                      const WORD32 tx_flag) {
863
2.69M
  WORD32 i;
864
2.69M
  WORD32 nlength = npoints >> 2;
865
2.69M
  FLOAT64 tempr, tempi;
866
867
  /* post-twiddle FFT output and then get output data */
868
389M
  for (i = 0; i < nlength; i++) {
869
386M
    /* tempr + j*tempi = 2 * (re + j*im) * (cos - j*sin), evaluated in FLOAT64. */
    tempr =
870
386M
        2 * ((FLOAT64)(fft_ptr[2 * i]) * (*cos_ptr) + (FLOAT64)(fft_ptr[2 * i + 1]) * (*sin_ptr));
871
386M
    tempi = 2 * ((FLOAT64)(fft_ptr[2 * i + 1]) * (*cos_ptr++) -
872
386M
                 (FLOAT64)(fft_ptr[2 * i]) * (*sin_ptr++));
873
874
386M
    /* Each rotated value fills two slots per half of ptr_out, mirrored around the half centre. */
    ptr_out[2 * i] = -tempr;
875
386M
    ptr_out[npoints / 2 - 1 - 2 * i] = tempi;
876
386M
    ptr_out[npoints / 2 + 2 * i] = -tempi;
877
386M
    ptr_out[npoints - 1 - 2 * i] = tempr;
878
386M
  }
879
2.69M
  if (tx_flag == 0) {
880
388M
    for (i = 0; i < npoints; i += 2) {
881
386M
      ptr_out[i] *= -1; /* reuse MDCT: negate entries at even 0-based indices (0, 2, 4, ...) */
882
386M
    }
883
1.34M
  }
884
2.69M
}
885
886
/*
 * FFT-based transform driver: pre-twiddle, complex FFT of npoints / 2
 * points, post-twiddle.
 *
 * ptr_in        in/out: 2 * npoints samples (the pre-twiddle stage is called
 *               with npoints << 1 and reverses this array in place when
 *               tx_flag == 0).
 * ptr_out       out: 2 * npoints values written by the post-twiddle stage.
 * npoints       transform size; only 96, 128, 768 and 1024 are accepted.
 * tx_flag       forwarded unchanged to the pre- and post-twiddle stages.
 * pstr_scratch  scratch descriptor; p_fft_mdct_buf is used as the FFT
 *               buffer and the struct is forwarded to iusace_complex_fft.
 *
 * Returns IA_NO_ERROR on success, or
 * IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH for any other npoints.
 */
IA_ERRORCODE iusace_fft_based_mdct(FLOAT64 *ptr_in, FLOAT64 *ptr_out, WORD32 npoints,
887
2.69M
                                   const WORD32 tx_flag, iusace_scratch_mem *pstr_scratch) {
888
2.69M
  FLOAT32 *ptr_scratch1 = pstr_scratch->p_fft_mdct_buf;
889
2.69M
  const FLOAT64 *cos_ptr = NULL;
890
2.69M
  const FLOAT64 *sin_ptr = NULL;
891
2.69M
  /* Length of the complex FFT. */
  WORD32 nlength = npoints >> 1;
892
2.69M
  WORD32 n_total = npoints << 1;
893
894
2.69M
  /*
   * Clears 4 * npoints FLOAT32 entries of the scratch buffer.
   * NOTE(review): this runs before npoints is validated by the switch below,
   * so an unsupported npoints still drives the size of this memset.
   */
  memset(ptr_scratch1, 0, ((SIZE_T)n_total << 1) * sizeof(*ptr_scratch1));
895
896
2.69M
  /* Select the twiddle tables; the table names carry twice the npoints value. */
  switch (npoints) {
897
734k
    case (96):
898
734k
      cos_ptr = iexheaac_pre_post_twid_cos_192;
899
734k
      sin_ptr = iexheaac_pre_post_twid_sin_192;
900
734k
      break;
901
1.41M
    case (128):
902
1.41M
      cos_ptr = iusace_pre_post_twid_cos_256;
903
1.41M
      sin_ptr = iusace_pre_post_twid_sin_256;
904
1.41M
      break;
905
116k
    case (768):
906
116k
      cos_ptr = iexheaac_pre_post_twid_cos_1536;
907
116k
      sin_ptr = iexheaac_pre_post_twid_sin_1536;
908
116k
      break;
909
421k
    case (1024):
910
421k
      cos_ptr = iusace_pre_post_twid_cos_2048;
911
421k
      sin_ptr = iusace_pre_post_twid_sin_2048;
912
421k
      break;
913
0
    default:
914
0
      return IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH;
915
2.69M
  }
916
917
  /* pre-twiddle */
918
2.69M
  iusace_calc_pre_twid_enc(ptr_in, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);
919
920
  /* complex FFT */
921
2.69M
  iusace_complex_fft(ptr_scratch1, nlength, pstr_scratch);
922
923
  /* post-twiddle */
924
2.69M
  iusace_calc_post_twid_enc(ptr_out, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);
925
926
2.69M
  return IA_NO_ERROR;
927
2.69M
}
928
929
178k
/*
 * 2048-point complex FFT step built from two 1024-point FFTs followed by
 * one pass of 1024 two-input butterflies.
 *
 * ptr_x        in/out: 4096 FLOAT32 values holding interleaved (re, im)
 *              pairs. The first 2048 floats and the second 2048 floats are
 *              each transformed by iusace_complex_fft_p2 and then combined.
 *              NOTE(review): the function does not reorder the input itself;
 *              presumably the caller supplies the two halves already split
 *              as required -- confirm against the caller.
 * scratch_fft  scratch buffer forwarded to iusace_complex_fft_p2.
 */
VOID iusace_complex_fft_2048(FLOAT32 *ptr_x, FLOAT32 *scratch_fft) {
930
178k
  WORD32 i;
931
178k
  FLOAT32 re, im, c_v, s_v, tmp_re, tmp_im;
932
178k
  FLOAT32 *ptr_re, *ptr_im, *ptr_re_h, *ptr_im_h;
933
178k
  FLOAT32 *ptr_cos_val, *ptr_sin_val;
934
178k
  iusace_complex_fft_p2(ptr_x, 1024, scratch_fft);
935
178k
  iusace_complex_fft_p2(ptr_x + 2048, 1024, scratch_fft);
936
937
178k
  /* ptr_re / ptr_im walk the lower half, ptr_re_h / ptr_im_h the upper half. */
  ptr_re = ptr_x;
938
178k
  ptr_im = ptr_x + 1;
939
178k
  ptr_re_h = ptr_x + 2048;
940
178k
  ptr_im_h = ptr_x + 2048 + 1;
941
178k
  ptr_cos_val = (FLOAT32 *)&iusace_twiddle_cos_2048[0];
942
178k
  ptr_sin_val = (FLOAT32 *)&iusace_twiddle_sin_2048[0];
943
182M
  for (i = 0; i < 1024; i++) {
944
182M
    re = *ptr_re_h;
945
182M
    im = *ptr_im_h;
946
182M
    c_v = ptr_cos_val[i];
947
182M
    s_v = ptr_sin_val[i];
948
182M
    /* tmp = upper * (cos - j*sin) */
    tmp_re = (re * c_v) + (im * s_v);
949
182M
    tmp_im = -(re * s_v) + (im * c_v);
950
182M
    re = *ptr_re;
951
182M
    im = *ptr_im;
952
953
182M
    /* Butterfly: lower = lower + tmp, upper = lower - tmp. */
    *ptr_re = re + tmp_re;
954
182M
    *ptr_im = im + tmp_im;
955
182M
    *ptr_re_h = re - tmp_re;
956
182M
    *ptr_im_h = im - tmp_im;
957
958
182M
    /* Step by one complex value (two floats). */
    ptr_re += 2;
959
182M
    ptr_im += 2;
960
182M
    ptr_re_h += 2;
961
182M
    ptr_im_h += 2;
962
182M
  }
963
178k
}
964
/*
 * In-place complex FFT on split real / imaginary arrays, computed with
 * radix-4 butterflies plus one optional final radix-2 stage.
 *
 * ptr_real, ptr_imag  in/out: n_points real parts and n_points imaginary
 *                     parts; overwritten with the result at the end.
 * n_points            transform length.
 * ptr_scratch         work buffer. The first 2 * n_points floats hold an
 *                     interleaved copy of the input; the floats starting at
 *                     offset 2048 hold the working / output data.
 *                     NOTE(review): the 2048 offset is hard-coded, so the two
 *                     regions would overlap for n_points > 1024 -- the only
 *                     caller in this chunk passes 1024.
 *
 * ixheaac_norm32, DIG_REV, max and the ia_*_flt helpers are defined outside
 * this chunk; comments below describe only how they are used here.
 */
static VOID ixheaace_rad2_cplx_fft(FLOAT32 *ptr_real, FLOAT32 *ptr_imag, WORD32 n_points,
965
174k
                                   FLOAT32 *ptr_scratch) {
966
174k
  WORD32 i, j, k, n_stages, h2;
967
174k
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
968
174k
  WORD32 del, nodespacing, in_loop_cnt;
969
174k
  WORD32 not_power_4;
970
174k
  WORD32 dig_rev_shift;
971
174k
  WORD32 m_points = n_points;
972
174k
  FLOAT32 *ptr_x = ptr_scratch;
973
174k
  FLOAT32 *y = ptr_scratch + 2048;
974
174k
  FLOAT32 *ptr_y = y;
975
174k
  const FLOAT32 *ptr_w;
976
977
174k
  dig_rev_shift = ixheaac_norm32(m_points) + 1 - 16;
978
174k
  n_stages = 30 - ixheaac_norm32(m_points);
979
174k
  /* Odd stage count means a trailing radix-2 stage is needed after the radix-4 stages. */
  not_power_4 = n_stages & 1;
980
981
174k
  /* From here on n_stages counts radix-4 stages. */
  n_stages = n_stages >> 1;
982
983
174k
  ptr_w = ia_fft_twiddle_table_float;
984
985
179M
  /* Interleave the split input into ptr_x as (re, im) pairs. */
  for (i = 0; i < n_points; i++) {
986
178M
    ptr_x[2 * i] = ptr_real[i];
987
178M
    ptr_x[2 * i + 1] = ptr_imag[i];
988
178M
  }
989
174k
  dig_rev_shift = max(dig_rev_shift, 0);
990
44.8M
  /*
   * First stage: one twiddle-free radix-4 butterfly per group of four
   * points. Inputs are fetched from ptr_x at the offset h2 produced by
   * DIG_REV and at three further steps of n_points / 2 floats; results are
   * appended sequentially to y.
   */
  for (i = 0; i < n_points; i += 4) {
991
44.7M
    FLOAT32 *inp = ptr_x;
992
44.7M
    FLOAT32 tmk;
993
994
44.7M
    DIG_REV(i, dig_rev_shift, h2);
995
44.7M
    if (not_power_4) {
996
0
      /* Round h2 up to an even float offset. */
      h2 += 1;
997
0
      h2 &= ~1;
998
0
    }
999
44.7M
    inp += (h2);
1000
1001
44.7M
    x0r = *inp;
1002
44.7M
    x0i = *(inp + 1);
1003
44.7M
    inp += (n_points >> 1);
1004
1005
44.7M
    x1r = *inp;
1006
44.7M
    x1i = *(inp + 1);
1007
44.7M
    inp += (n_points >> 1);
1008
1009
44.7M
    x2r = *inp;
1010
44.7M
    x2i = *(inp + 1);
1011
44.7M
    inp += (n_points >> 1);
1012
1013
44.7M
    x3r = *inp;
1014
44.7M
    x3i = *(inp + 1);
1015
1016
44.7M
    /* Sum first, then recover the difference as (sum - b) - b = a - b. */
    x0r = ia_add_flt(x0r, x2r);
1017
44.7M
    x0i = ia_add_flt(x0i, x2i);
1018
1019
44.7M
    tmk = ia_sub_flt(x0r, x2r);
1020
44.7M
    x2r = ia_sub_flt(tmk, x2r);
1021
44.7M
    tmk = ia_sub_flt(x0i, x2i);
1022
44.7M
    x2i = ia_sub_flt(tmk, x2i);
1023
1024
44.7M
    x1r = ia_add_flt(x1r, x3r);
1025
44.7M
    x1i = ia_add_flt(x1i, x3i);
1026
1027
44.7M
    tmk = ia_sub_flt(x1r, x3r);
1028
44.7M
    x3r = ia_sub_flt(tmk, x3r);
1029
44.7M
    tmk = ia_sub_flt(x1i, x3i);
1030
44.7M
    x3i = ia_sub_flt(tmk, x3i);
1031
1032
44.7M
    x0r = ia_add_flt(x0r, x1r);
1033
44.7M
    x0i = ia_add_flt(x0i, x1i);
1034
1035
44.7M
    tmk = ia_sub_flt(x0r, x1r);
1036
44.7M
    x1r = ia_sub_flt(tmk, x1r);
1037
44.7M
    tmk = ia_sub_flt(x0i, x1i);
1038
44.7M
    x1i = ia_sub_flt(tmk, x1i);
1039
1040
44.7M
    x2r = ia_add_flt(x2r, x3i);
1041
44.7M
    x2i = ia_sub_flt(x2i, x3r);
1042
1043
44.7M
    tmk = ia_sub_flt(x2r, x3i);
1044
44.7M
    x3i = ia_sub_flt(tmk, x3i);
1045
44.7M
    tmk = ia_add_flt(x2i, x3r);
1046
44.7M
    x3r = ia_add_flt(tmk, x3r);
1047
1048
44.7M
    /* Note the store order: x0, x2, x1, then x3 with x3i in the real slot and x3r in the imaginary slot. */
    *ptr_y++ = x0r;
1049
44.7M
    *ptr_y++ = x0i;
1050
44.7M
    *ptr_y++ = x2r;
1051
44.7M
    *ptr_y++ = x2i;
1052
44.7M
    *ptr_y++ = x1r;
1053
44.7M
    *ptr_y++ = x1i;
1054
44.7M
    *ptr_y++ = x3i;
1055
44.7M
    *ptr_y++ = x3r;
1056
44.7M
  }
1057
174k
  /* Rewind ptr_y to the start of y (2 * n_points floats were written). */
  ptr_y -= 2 * n_points;
1058
174k
  del = 4;
1059
174k
  nodespacing = 64;
1060
174k
  in_loop_cnt = n_points >> 4;
1061
873k
  /*
   * Remaining radix-4 stages, performed in place on y. Per stage, del
   * (butterfly leg distance in complex values) grows by 4 while nodespacing
   * (twiddle table stride) and in_loop_cnt (butterflies per twiddle) shrink
   * by 4.
   */
  for (i = n_stages - 1; i > 0; i--) {
1062
698k
    const FLOAT32 *twiddles = ptr_w;
1063
698k
    FLOAT32 *data = ptr_y;
1064
698k
    FLOAT32 w_1, w_2, w_3, w_4, w_5, w_6;
1065
698k
    WORD32 sec_loop_cnt;
1066
1067
15.5M
    /* Butterflies at offset 0 of each group: no twiddle multiplication. */
    for (k = in_loop_cnt; k != 0; k--) {
1068
14.8M
      x0r = (*data);
1069
14.8M
      x0i = (*(data + 1));
1070
14.8M
      data += ((SIZE_T)del << 1);
1071
1072
14.8M
      x1r = (*data);
1073
14.8M
      x1i = (*(data + 1));
1074
14.8M
      data += ((SIZE_T)del << 1);
1075
1076
14.8M
      x2r = (*data);
1077
14.8M
      x2i = (*(data + 1));
1078
14.8M
      data += ((SIZE_T)del << 1);
1079
1080
14.8M
      x3r = (*data);
1081
14.8M
      x3i = (*(data + 1));
1082
14.8M
      data -= 3 * (del << 1);
1083
1084
14.8M
      x0r = ia_add_flt(x0r, x2r);
1085
14.8M
      x0i = ia_add_flt(x0i, x2i);
1086
14.8M
      x2r = ia_msu_flt(x0r, x2r, 2);
1087
14.8M
      x2i = ia_msu_flt(x0i, x2i, 2);
1088
14.8M
      x1r = ia_add_flt(x1r, x3r);
1089
14.8M
      x1i = ia_add_flt(x1i, x3i);
1090
14.8M
      x3r = ia_msu_flt(x1r, x3r, 2);
1091
14.8M
      x3i = ia_msu_flt(x1i, x3i, 2);
1092
1093
14.8M
      x0r = ia_add_flt(x0r, x1r);
1094
14.8M
      x0i = ia_add_flt(x0i, x1i);
1095
14.8M
      x1r = ia_msu_flt(x0r, x1r, 2);
1096
14.8M
      x1i = ia_msu_flt(x0i, x1i, 2);
1097
14.8M
      x2r = ia_add_flt(x2r, x3i);
1098
14.8M
      x2i = ia_sub_flt(x2i, x3r);
1099
14.8M
      x3i = ia_msu_flt(x2r, x3i, 2);
1100
14.8M
      x3r = ia_mac_flt(x2i, x3r, 2);
1101
1102
14.8M
      *data = x0r;
1103
14.8M
      *(data + 1) = x0i;
1104
14.8M
      data += ((SIZE_T)del << 1);
1105
1106
14.8M
      *data = x2r;
1107
14.8M
      *(data + 1) = x2i;
1108
14.8M
      data += ((SIZE_T)del << 1);
1109
1110
14.8M
      *data = x1r;
1111
14.8M
      *(data + 1) = x1i;
1112
14.8M
      data += ((SIZE_T)del << 1);
1113
1114
14.8M
      *data = x3i;
1115
14.8M
      *(data + 1) = x3r;
1116
14.8M
      data += ((SIZE_T)del << 1);
1117
14.8M
    }
1118
698k
    /* Continue with the second complex value of each group. */
    data = ptr_y + 2;
1119
1120
698k
    /*
     * The integer series below sums to roughly one third of
     * nodespacing * del (0.332...). It marks the first of the boundaries at
     * which the twiddle indexing scheme changes in the four loops that follow.
     */
    sec_loop_cnt = (nodespacing * del);
1121
698k
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
1122
698k
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
1123
698k
                   (sec_loop_cnt / 256);
1124
1125
20.2M
    /*
     * Range 1 (j <= sec_loop_cnt): all three twiddles (indices j, 2j, 3j)
     * are read directly; the second value of each pair sits 257 entries
     * later in the table.
     */
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
1126
19.5M
      w_1 = *(twiddles + j);
1127
19.5M
      w_4 = *(twiddles + j + 257);
1128
19.5M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
1129
19.5M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
1130
19.5M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
1131
19.5M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);
1132
1133
74.2M
      for (k = in_loop_cnt; k != 0; k--) {
1134
54.6M
        FLOAT32 tmp;
1135
        /*x0 is loaded later to avoid register crunch*/
1136
1137
54.6M
        data += ((SIZE_T)del << 1);
1138
1139
54.6M
        x1r = *data;
1140
54.6M
        x1i = *(data + 1);
1141
54.6M
        data += ((SIZE_T)del << 1);
1142
1143
54.6M
        x2r = *data;
1144
54.6M
        x2i = *(data + 1);
1145
54.6M
        data += ((SIZE_T)del << 1);
1146
1147
54.6M
        x3r = *data;
1148
54.6M
        x3i = *(data + 1);
1149
54.6M
        data -= 3 * (del << 1);
1150
1151
54.6M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1152
54.6M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1153
54.6M
        x1r = tmp;
1154
1155
54.6M
        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
1156
54.6M
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
1157
54.6M
        x2r = tmp;
1158
1159
54.6M
        tmp = ia_sub_flt(ia_mul_flt(x3r, w_3), ia_mul_flt(x3i, w_6));
1160
54.6M
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
1161
54.6M
        x3r = tmp;
1162
1163
54.6M
        x0r = (*data);
1164
54.6M
        x0i = (*(data + 1));
1165
1166
54.6M
        x0r = ia_add_flt(x0r, (x2r));
1167
54.6M
        x0i = ia_add_flt(x0i, (x2i));
1168
54.6M
        x2r = ia_msu_flt(x0r, x2r, 2);
1169
54.6M
        x2i = ia_msu_flt(x0i, x2i, 2);
1170
54.6M
        x1r = ia_add_flt(x1r, x3r);
1171
54.6M
        x1i = ia_add_flt(x1i, x3i);
1172
54.6M
        x3r = ia_msu_flt(x1r, x3r, 2);
1173
54.6M
        x3i = ia_msu_flt(x1i, x3i, 2);
1174
1175
54.6M
        x0r = ia_add_flt(x0r, (x1r));
1176
54.6M
        x0i = ia_add_flt(x0i, (x1i));
1177
54.6M
        x1r = ia_msu_flt(x0r, x1r, 2);
1178
54.6M
        x1i = ia_msu_flt(x0i, x1i, 2);
1179
54.6M
        x2r = ia_add_flt(x2r, (x3i));
1180
54.6M
        x2i = ia_sub_flt(x2i, (x3r));
1181
54.6M
        x3i = ia_msu_flt(x2r, x3i, 2);
1182
54.6M
        x3r = ia_mac_flt(x2i, x3r, 2);
1183
1184
54.6M
        *data = x0r;
1185
54.6M
        *(data + 1) = x0i;
1186
54.6M
        data += ((SIZE_T)del << 1);
1187
1188
54.6M
        *data = x2r;
1189
54.6M
        *(data + 1) = x2i;
1190
54.6M
        data += ((SIZE_T)del << 1);
1191
1192
54.6M
        *data = x1r;
1193
54.6M
        *(data + 1) = x1i;
1194
54.6M
        data += ((SIZE_T)del << 1);
1195
1196
54.6M
        *data = x3i;
1197
54.6M
        *(data + 1) = x3r;
1198
54.6M
        data += ((SIZE_T)del << 1);
1199
54.6M
      }
1200
19.5M
      /* Back to the start of y, then on to the next complex offset. */
      data -= 2 * n_points;
1201
19.5M
      data += 2;
1202
19.5M
    }
1203
10.8M
    /*
     * Range 2 (j up to half of nodespacing * del): the third twiddle index
     * is folded back by 256 and its two table values swap roles in the
     * rotation. Presumably this exploits symmetry of the twiddle table --
     * TODO confirm against the table definition.
     */
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
1204
10.1M
      w_1 = *(twiddles + j);
1205
10.1M
      w_4 = *(twiddles + j + 257);
1206
10.1M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
1207
10.1M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
1208
10.1M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
1209
10.1M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
1210
1211
44.8M
      for (k = in_loop_cnt; k != 0; k--) {
1212
34.7M
        FLOAT32 tmp;
1213
        /*x0 is loaded later to avoid register crunch*/
1214
1215
34.7M
        data += ((SIZE_T)del << 1);
1216
1217
34.7M
        x1r = *data;
1218
34.7M
        x1i = *(data + 1);
1219
34.7M
        data += ((SIZE_T)del << 1);
1220
1221
34.7M
        x2r = *data;
1222
34.7M
        x2i = *(data + 1);
1223
34.7M
        data += ((SIZE_T)del << 1);
1224
1225
34.7M
        x3r = *data;
1226
34.7M
        x3i = *(data + 1);
1227
34.7M
        data -= 3 * (del << 1);
1228
1229
34.7M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1230
34.7M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1231
34.7M
        x1r = tmp;
1232
1233
34.7M
        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
1234
34.7M
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
1235
34.7M
        x2r = tmp;
1236
1237
34.7M
        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
1238
34.7M
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1239
34.7M
        x3r = tmp;
1240
1241
34.7M
        x0r = (*data);
1242
34.7M
        x0i = (*(data + 1));
1243
1244
34.7M
        x0r = ia_add_flt(x0r, (x2r));
1245
34.7M
        x0i = ia_add_flt(x0i, (x2i));
1246
34.7M
        x2r = ia_msu_flt(x0r, x2r, 2);
1247
34.7M
        x2i = ia_msu_flt(x0i, x2i, 2);
1248
34.7M
        x1r = ia_add_flt(x1r, x3r);
1249
34.7M
        x1i = ia_add_flt(x1i, x3i);
1250
34.7M
        x3r = ia_msu_flt(x1r, x3r, 2);
1251
34.7M
        x3i = ia_msu_flt(x1i, x3i, 2);
1252
1253
34.7M
        x0r = ia_add_flt(x0r, (x1r));
1254
34.7M
        x0i = ia_add_flt(x0i, (x1i));
1255
34.7M
        x1r = ia_msu_flt(x0r, x1r, 2);
1256
34.7M
        x1i = ia_msu_flt(x0i, x1i, 2);
1257
34.7M
        x2r = ia_add_flt(x2r, (x3i));
1258
34.7M
        x2i = ia_sub_flt(x2i, (x3r));
1259
34.7M
        x3i = ia_msu_flt(x2r, x3i, 2);
1260
34.7M
        x3r = ia_mac_flt(x2i, x3r, 2);
1261
1262
34.7M
        *data = x0r;
1263
34.7M
        *(data + 1) = x0i;
1264
34.7M
        data += ((SIZE_T)del << 1);
1265
1266
34.7M
        *data = x2r;
1267
34.7M
        *(data + 1) = x2i;
1268
34.7M
        data += ((SIZE_T)del << 1);
1269
1270
34.7M
        *data = x1r;
1271
34.7M
        *(data + 1) = x1i;
1272
34.7M
        data += ((SIZE_T)del << 1);
1273
1274
34.7M
        *data = x3i;
1275
34.7M
        *(data + 1) = x3r;
1276
34.7M
        data += ((SIZE_T)del << 1);
1277
34.7M
      }
1278
10.1M
      data -= 2 * n_points;
1279
10.1M
      data += 2;
1280
10.1M
    }
1281
10.1M
    /* Range 3 (j <= 2 * sec_loop_cnt): the second twiddle index is folded back by 256 as well. */
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
1282
9.43M
      w_1 = *(twiddles + j);
1283
9.43M
      w_4 = *(twiddles + j + 257);
1284
9.43M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
1285
9.43M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
1286
9.43M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
1287
9.43M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
1288
1289
29.3M
      for (k = in_loop_cnt; k != 0; k--) {
1290
19.9M
        FLOAT32 tmp;
1291
        /*x0 is loaded later to avoid register crunch*/
1292
1293
19.9M
        data += ((SIZE_T)del << 1);
1294
1295
19.9M
        x1r = *data;
1296
19.9M
        x1i = *(data + 1);
1297
19.9M
        data += ((SIZE_T)del << 1);
1298
1299
19.9M
        x2r = *data;
1300
19.9M
        x2i = *(data + 1);
1301
19.9M
        data += ((SIZE_T)del << 1);
1302
1303
19.9M
        x3r = *data;
1304
19.9M
        x3i = *(data + 1);
1305
19.9M
        data -= 3 * (del << 1);
1306
1307
19.9M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1308
19.9M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1309
19.9M
        x1r = tmp;
1310
1311
19.9M
        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
1312
19.9M
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
1313
19.9M
        x2r = tmp;
1314
1315
19.9M
        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
1316
19.9M
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1317
19.9M
        x3r = tmp;
1318
1319
19.9M
        x0r = (*data);
1320
19.9M
        x0i = (*(data + 1));
1321
1322
19.9M
        x0r = ia_add_flt(x0r, (x2r));
1323
19.9M
        x0i = ia_add_flt(x0i, (x2i));
1324
19.9M
        x2r = ia_msu_flt(x0r, x2r, 2);
1325
19.9M
        x2i = ia_msu_flt(x0i, x2i, 2);
1326
19.9M
        x1r = ia_add_flt(x1r, x3r);
1327
19.9M
        x1i = ia_add_flt(x1i, x3i);
1328
19.9M
        x3r = ia_msu_flt(x1r, x3r, 2);
1329
19.9M
        x3i = ia_msu_flt(x1i, x3i, 2);
1330
1331
19.9M
        x0r = ia_add_flt(x0r, (x1r));
1332
19.9M
        x0i = ia_add_flt(x0i, (x1i));
1333
19.9M
        x1r = ia_msu_flt(x0r, x1r, 2);
1334
19.9M
        x1i = ia_msu_flt(x0i, x1i, 2);
1335
19.9M
        x2r = ia_add_flt(x2r, (x3i));
1336
19.9M
        x2i = ia_sub_flt(x2i, (x3r));
1337
19.9M
        x3i = ia_msu_flt(x2r, x3i, 2);
1338
19.9M
        x3r = ia_mac_flt(x2i, x3r, 2);
1339
1340
19.9M
        *data = x0r;
1341
19.9M
        *(data + 1) = x0i;
1342
19.9M
        data += ((SIZE_T)del << 1);
1343
1344
19.9M
        *data = x2r;
1345
19.9M
        *(data + 1) = x2i;
1346
19.9M
        data += ((SIZE_T)del << 1);
1347
1348
19.9M
        *data = x1r;
1349
19.9M
        *(data + 1) = x1i;
1350
19.9M
        data += ((SIZE_T)del << 1);
1351
1352
19.9M
        *data = x3i;
1353
19.9M
        *(data + 1) = x3r;
1354
19.9M
        data += ((SIZE_T)del << 1);
1355
19.9M
      }
1356
9.43M
      data -= 2 * n_points;
1357
9.43M
      data += 2;
1358
9.43M
    }
1359
20.2M
    /*
     * Range 4 (remaining j below nodespacing * del): the third twiddle index
     * is folded back by 512, and the x1/x3 combination uses a subtraction
     * for the imaginary part instead of an addition.
     */
    for (; j < nodespacing * del; j += nodespacing) {
1360
19.5M
      w_1 = *(twiddles + j);
1361
19.5M
      w_4 = *(twiddles + j + 257);
1362
19.5M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
1363
19.5M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
1364
19.5M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
1365
19.5M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);
1366
1367
74.2M
      for (k = in_loop_cnt; k != 0; k--) {
1368
54.6M
        FLOAT32 tmp;
1369
        /*x0 is loaded later to avoid register crunch*/
1370
1371
54.6M
        data += ((SIZE_T)del << 1);
1372
1373
54.6M
        x1r = *data;
1374
54.6M
        x1i = *(data + 1);
1375
54.6M
        data += ((SIZE_T)del << 1);
1376
1377
54.6M
        x2r = *data;
1378
54.6M
        x2i = *(data + 1);
1379
54.6M
        data += ((SIZE_T)del << 1);
1380
1381
54.6M
        x3r = *data;
1382
54.6M
        x3i = *(data + 1);
1383
54.6M
        data -= 3 * (del << 1);
1384
1385
54.6M
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1386
54.6M
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1387
54.6M
        x1r = tmp;
1388
1389
54.6M
        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
1390
54.6M
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
1391
54.6M
        x2r = tmp;
1392
1393
54.6M
        tmp = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
1394
54.6M
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
1395
54.6M
        x3r = tmp;
1396
1397
54.6M
        x0r = (*data);
1398
54.6M
        x0i = (*(data + 1));
1399
1400
54.6M
        x0r = ia_add_flt(x0r, (x2r));
1401
54.6M
        x0i = ia_add_flt(x0i, (x2i));
1402
54.6M
        x2r = ia_msu_flt(x0r, x2r, 2);
1403
54.6M
        x2i = ia_msu_flt(x0i, x2i, 2);
1404
54.6M
        x1r = ia_add_flt(x1r, x3r);
1405
54.6M
        x1i = ia_sub_flt(x1i, x3i);
1406
54.6M
        x3r = ia_msu_flt(x1r, x3r, 2);
1407
54.6M
        x3i = ia_mac_flt(x1i, x3i, 2);
1408
1409
54.6M
        x0r = ia_add_flt(x0r, (x1r));
1410
54.6M
        x0i = ia_add_flt(x0i, (x1i));
1411
54.6M
        x1r = ia_msu_flt(x0r, x1r, 2);
1412
54.6M
        x1i = ia_msu_flt(x0i, x1i, 2);
1413
54.6M
        x2r = ia_add_flt(x2r, (x3i));
1414
54.6M
        x2i = ia_sub_flt(x2i, (x3r));
1415
54.6M
        x3i = ia_msu_flt(x2r, x3i, 2);
1416
54.6M
        x3r = ia_mac_flt(x2i, x3r, 2);
1417
1418
54.6M
        *data = x0r;
1419
54.6M
        *(data + 1) = x0i;
1420
54.6M
        data += ((SIZE_T)del << 1);
1421
1422
54.6M
        *data = x2r;
1423
54.6M
        *(data + 1) = x2i;
1424
54.6M
        data += ((SIZE_T)del << 1);
1425
1426
54.6M
        *data = x1r;
1427
54.6M
        *(data + 1) = x1i;
1428
54.6M
        data += ((SIZE_T)del << 1);
1429
1430
54.6M
        *data = x3i;
1431
54.6M
        *(data + 1) = x3r;
1432
54.6M
        data += ((SIZE_T)del << 1);
1433
54.6M
      }
1434
19.5M
      data -= 2 * n_points;
1435
19.5M
      data += 2;
1436
19.5M
    }
1437
698k
    nodespacing >>= 2;
1438
698k
    del <<= 2;
1439
698k
    in_loop_cnt >>= 2;
1440
698k
  }
1441
174k
  /*
   * Final radix-2 stage, only when the stage count computed above was odd.
   * It runs as two loops of del / 2 butterflies each; the second loop
   * restarts at the beginning of the twiddle table and uses the two table
   * values with swapped roles.
   */
  if (not_power_4) {
1442
0
    const FLOAT32 *twiddles = ptr_w;
1443
0
    nodespacing <<= 1;
1444
1445
0
    for (j = del / 2; j != 0; j--) {
1446
0
      FLOAT32 w_1 = *twiddles;
1447
0
      FLOAT32 w_4 = *(twiddles + 257);
1448
0
      FLOAT32 tmp;
1449
0
      twiddles += nodespacing;
1450
1451
0
      x0r = *ptr_y;
1452
0
      x0i = *(ptr_y + 1);
1453
0
      ptr_y += ((SIZE_T)del << 1);
1454
1455
0
      x1r = *ptr_y;
1456
0
      x1i = *(ptr_y + 1);
1457
1458
0
      tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
1459
0
      x1i = (FLOAT32)ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
1460
0
      x1r = tmp;
1461
1462
0
      *ptr_y = ia_sub_flt((x0r), (x1r));
1463
0
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
1464
0
      ptr_y -= ((SIZE_T)del << 1);
1465
1466
0
      *ptr_y = ia_add_flt((x0r), (x1r));
1467
0
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
1468
0
      ptr_y += 2;
1469
0
    }
1470
0
    twiddles = ptr_w;
1471
0
    for (j = del / 2; j != 0; j--) {
1472
0
      FLOAT32 w_1 = *twiddles;
1473
0
      FLOAT32 w_4 = *(twiddles + 257);
1474
0
      FLOAT32 tmp;
1475
0
      twiddles += nodespacing;
1476
1477
0
      x0r = *ptr_y;
1478
0
      x0i = *(ptr_y + 1);
1479
0
      ptr_y += ((SIZE_T)del << 1);
1480
1481
0
      x1r = *ptr_y;
1482
0
      x1i = *(ptr_y + 1);
1483
1484
0
      tmp = ia_add_flt(ia_mul_flt(x1r, w_4), ia_mul_flt(x1i, w_1));
1485
0
      x1i = ia_add_flt(ia_negate_flt(ia_mul_flt(x1r, w_1)), ia_mul_flt(x1i, w_4));
1486
0
      x1r = tmp;
1487
1488
0
      *ptr_y = ia_sub_flt((x0r), (x1r));
1489
0
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
1490
0
      ptr_y -= ((SIZE_T)del << 1);
1491
1492
0
      *ptr_y = ia_add_flt((x0r), (x1r));
1493
0
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
1494
0
      ptr_y += 2;
1495
0
    }
1496
0
  }
1497
1498
179M
  /* De-interleave the result from y back into the caller's split arrays. */
  for (i = 0; i < n_points; i++) {
1499
178M
    ptr_real[i] = y[2 * i];
1500
178M
    ptr_imag[i] = y[2 * i + 1];
1501
178M
  }
1502
174k
}
1503
44.7M
/*
 * In-place 4-point complex FFT on split arrays.
 *
 * x_r  in/out: four real parts.
 * x_i  in/out: four imaginary parts.
 *
 * All eight inputs are loaded before any output is stored, so the in-place
 * update is safe. Outputs are written in natural order (index k of the
 * arrays holds frequency bin k).
 */
static VOID ixheaace_cplx_fft_4(FLOAT32 *x_r, FLOAT32 *x_i) {
1504
44.7M
  FLOAT32 x_0, x_1, x_2, x_3;
1505
44.7M
  FLOAT32 x_4, x_5, x_6, x_7;
1506
44.7M
  FLOAT32 x0r, x1r, x2r, x3r;
1507
44.7M
  FLOAT32 x0i, x1i, x2i, x3i;
1508
1509
  // 4 Point FFT
1510
44.7M
  /* x_0 .. x_7 hold the inputs as (re, im) pairs of points 0 .. 3. */
  x_0 = x_r[0];
1511
44.7M
  x_1 = x_i[0];
1512
44.7M
  x_2 = x_r[1];
1513
44.7M
  x_3 = x_i[1];
1514
44.7M
  x_4 = x_r[2];
1515
44.7M
  x_5 = x_i[2];
1516
44.7M
  x_6 = x_r[3];
1517
44.7M
  x_7 = x_i[3];
1518
1519
44.7M
  /* First stage: sums and differences of points (0, 2) and (1, 3). */
  x0r = ia_add_flt(x_0, x_4);
1520
44.7M
  x0i = ia_add_flt(x_1, x_5);
1521
44.7M
  x2r = ia_sub_flt(x_0, x_4);
1522
44.7M
  x2i = ia_sub_flt(x_1, x_5);
1523
44.7M
  x1r = ia_add_flt(x_2, x_6);
1524
44.7M
  x1i = ia_add_flt(x_3, x_7);
1525
44.7M
  x3r = ia_sub_flt(x_2, x_6);
1526
44.7M
  x3i = ia_sub_flt(x_3, x_7);
1527
1528
44.7M
  /* Second stage: bins 0 and 2 from the sums, bins 1 and 3 from the differences (x3 rotated by -j / +j). */
  x_r[0] = ia_add_flt(x0r, x1r);
1529
44.7M
  x_i[0] = ia_add_flt(x0i, x1i);
1530
44.7M
  x_r[2] = ia_sub_flt(x0r, x1r);
1531
44.7M
  x_i[2] = ia_sub_flt(x0i, x1i);
1532
44.7M
  x_r[1] = ia_add_flt(x2r, x3i);
1533
44.7M
  x_i[1] = ia_sub_flt(x2i, x3r);
1534
44.7M
  x_r[3] = ia_sub_flt(x2r, x3i);
1535
44.7M
  x_i[3] = ia_add_flt(x2i, x3r);
1536
44.7M
  return;
1537
44.7M
}
1538
43.6k
/*
 * 4096-point FFT of a real input, decomposed as 4 FFTs of 1024 points,
 * a twiddle multiplication, and 1024 FFTs of 4 points.
 *
 * ptr_x_r          in/out: 4096 real input samples; overwritten with the
 *                  real parts of the result.
 * ptr_x_i          out: receives the 4096 imaginary parts of the result.
 *                  Its contents on entry are never read -- the imaginary
 *                  input is taken as zero.
 * ptr_scratch_buf  work buffer. Floats [0, 8192) hold the four 1024-point
 *                  sub-transforms as alternating real / imaginary planes;
 *                  floats from 8192 onwards serve first as scratch for
 *                  ixheaace_rad2_cplx_fft and then as an 8192-float interim
 *                  buffer, so at least 16384 floats are accessed.
 */
VOID iusace_complex_fft_4096(FLOAT32 *ptr_x_r, FLOAT32 *ptr_x_i, FLOAT32 *ptr_scratch_buf) {
1539
43.6k
  FLOAT32 *ptr_data_r;
1540
43.6k
  FLOAT32 *ptr_data_i;
1541
43.6k
  WORD32 fft_len = 4096;
1542
43.6k
  FLOAT32 *ptr_fft_interim_buf = &ptr_scratch_buf[2 * fft_len];
1543
43.6k
  WORD32 i, j;
1544
43.6k
  /* dim2 = 4 sub-sequences of dim1 = 1024 points each. */
  WORD32 dim2 = fft_len >> 10;
1545
43.6k
  WORD32 dim1 = fft_len / dim2;
1546
43.6k
  /* Stride applied to the mixed-radix twiddle table indices below. */
  WORD32 fac = 4;
1547
1548
218k
  /* Step 1: gather every dim2-th sample into plane i, zero its imaginary plane, transform it. */
  for (i = 0; i < dim2; i++) {
1549
174k
    ptr_data_r = &ptr_scratch_buf[(2 * i + 0) * dim1];
1550
174k
    ptr_data_i = &ptr_scratch_buf[(2 * i + 1) * dim1];
1551
179M
    for (j = 0; j < dim1; j++) {
1552
178M
      ptr_data_r[j] = ptr_x_r[(dim2 * j + i)];
1553
178M
      ptr_data_i[j] = 0;
1554
178M
    }
1555
174k
    ixheaace_rad2_cplx_fft(ptr_data_r, ptr_data_i, dim1, ptr_fft_interim_buf);
1556
174k
  }
1557
43.6k
  /* Both pointers are rebased to the buffer start; the plane is chosen by the index expressions below. */
  ptr_data_r = &ptr_scratch_buf[0];
1558
43.6k
  ptr_data_i = &ptr_scratch_buf[0];
1559
44.7M
  /*
   * Step 2: multiply bin i of sub-transform j by (cos - j*sin) taken from
   * the mixed-radix tables at index (i * dim2 + j) * fac, and regroup the
   * data so that the dim2 values belonging to bin i are contiguous.
   */
  for (i = 0; i < dim1; i++) {
1560
44.7M
    FLOAT32 *ptr_cos_val = (FLOAT32 *)&ia_mixed_rad_twiddle_cos[i * dim2 * fac];
1561
44.7M
    FLOAT32 *ptr_sin_val = (FLOAT32 *)&ia_mixed_rad_twiddle_sin[i * dim2 * fac];
1562
223M
    for (j = 0; j < dim2; j++) {
1563
178M
      FLOAT32 real = ptr_data_r[(2 * j + 0) * dim1 + i];
1564
178M
      FLOAT32 imag = ptr_data_i[(2 * j + 1) * dim1 + i];
1565
178M
      FLOAT32 cos_val = ptr_cos_val[j * fac];
1566
178M
      FLOAT32 sin_val = ptr_sin_val[j * fac];
1567
178M
      FLOAT32 temp_real = (FLOAT32)(real * cos_val + imag * sin_val);
1568
178M
      FLOAT32 temp_imag = (FLOAT32)(imag * cos_val - real * sin_val);
1569
178M
      ptr_fft_interim_buf[(2 * i + 0) * dim2 + j] = temp_real;
1570
178M
      ptr_fft_interim_buf[(2 * i + 1) * dim2 + j] = temp_imag;
1571
178M
    }
1572
44.7M
  }
1573
44.7M
  /* Step 3: one in-place 4-point FFT per bin i (4 reals followed by 4 imaginaries). */
  for (i = 0; i < dim1; i++) {
1574
44.7M
    ptr_data_r = &ptr_fft_interim_buf[(2 * i + 0) * dim2];
1575
44.7M
    ptr_data_i = &ptr_fft_interim_buf[(2 * i + 1) * dim2];
1576
44.7M
    ixheaace_cplx_fft_4(ptr_data_r, ptr_data_i);
1577
44.7M
  }
1578
43.6k
  ptr_data_r = &ptr_fft_interim_buf[0];
1579
43.6k
  ptr_data_i = &ptr_fft_interim_buf[0];
1580
44.7M
  /* Step 4: scatter the results to the output arrays at index j * dim1 + i. */
  for (i = 0; i < dim1; i++) {
1581
223M
    for (j = 0; j < dim2; j++) {
1582
178M
      ptr_x_r[(j * dim1 + i)] = ptr_data_r[(2 * i + 0) * dim2 + j];
1583
178M
      ptr_x_i[(j * dim1 + i)] = ptr_data_i[(2 * i + 1) * dim2 + j];
1584
178M
    }
1585
44.7M
  }
1586
43.6k
}