Coverage Report

Created: 2025-12-10 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libxaac/encoder/iusace_fft.c
Line
Count
Source
1
/******************************************************************************
2
 *                                                                            *
3
 * Copyright (C) 2023 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
 */
20
21
#include <string.h>
22
#include "ixheaac_type_def.h"
23
#include "ixheaace_adjust_threshold_data.h"
24
#include "iusace_cnst.h"
25
#include "iusace_block_switch_const.h"
26
#include "iusace_rom.h"
27
#include "iusace_bitbuffer.h"
28
29
/* DRC */
30
#include "impd_drc_common_enc.h"
31
#include "impd_drc_uni_drc.h"
32
#include "impd_drc_tables.h"
33
#include "impd_drc_api.h"
34
#include "impd_drc_uni_drc_eq.h"
35
#include "impd_drc_uni_drc_filter_bank.h"
36
#include "impd_drc_gain_enc.h"
37
#include "impd_drc_struct_def.h"
38
39
#include "iusace_tns_usac.h"
40
#include "iusace_psy_mod.h"
41
#include "iusace_config.h"
42
#include "iusace_fft.h"
43
#include "iusace_basic_ops_flt.h"
44
#include "ixheaac_constants.h"
45
#include "ixheaace_aac_constants.h"
46
#include "ixheaac_basic_ops32.h"
47
#include "ixheaace_common_utils.h"
48
#include "ixheaac_error_standards.h"
49
#include "ixheaace_error_codes.h"
50
51
/*
 * DIG_REV(i, m, j)
 *   Takes (i) as an unsigned value, swaps neighbouring 2-bit groups, then
 *   neighbouring nibbles, then neighbouring bytes (i.e. reverses the order of
 *   the 2-bit groups inside each 16-bit half), and stores the result shifted
 *   right by (m) into (j).
 */
#define DIG_REV(i, m, j)                                                  \
  do {                                                                    \
    unsigned dig_rev_bits_ = (i);                                         \
    dig_rev_bits_ = ((dig_rev_bits_ & 0x33333333u) << 2) |                \
                    ((dig_rev_bits_ & ~0x33333333u) >> 2);                \
    dig_rev_bits_ = ((dig_rev_bits_ & 0x0F0F0F0Fu) << 4) |                \
                    ((dig_rev_bits_ & ~0x0F0F0F0Fu) >> 4);                \
    dig_rev_bits_ = ((dig_rev_bits_ & 0x00FF00FFu) << 8) |                \
                    ((dig_rev_bits_ & ~0x00FF00FFu) >> 8);                \
    (j) = dig_rev_bits_ >> (m);                                           \
  } while (0)
59
60
43.4M
/*
 * Returns how many times `a` has to be shifted left by one bit before it is
 * no longer below 0x40000000.  Negative inputs are bit-inverted first.
 * The inputs 0 and 0xffffffff both yield 31.
 */
static PLATFORM_INLINE WORD8 iusace_calc_norm(WORD32 a) {
  WORD8 shift_count = 0;

  /* Both "no bits set" and "all bits set" report the fixed value 31. */
  if (a == 0 || a == (WORD32)0xffffffffL) {
    return 31;
  }

  if (a < 0) {
    a = ~a;
  }

  while (a < (WORD32)0x40000000L) {
    a <<= 1;
    shift_count++;
  }

  return shift_count;
}
80
81
102M
/*
 * 3-point complex butterfly.
 *
 * ptr_in  - three complex inputs as six interleaved (re, im) floats.
 * ptr_out - three complex outputs in the same layout.
 *
 * The products with the constant 0.866025403784439 are formed in FLOAT64 and
 * rounded back to FLOAT32; everything else is FLOAT32 arithmetic.
 */
static PLATFORM_INLINE VOID iusace_complex_3point_fft(FLOAT32 *ptr_in, FLOAT32 *ptr_out) {
  const FLOAT64 sqrt3_by_2 = 0.866025403784439;

  /* Pairwise sums and differences of the inputs. */
  const FLOAT32 sum01_re = ptr_in[0] + ptr_in[2];
  const FLOAT32 sum01_im = ptr_in[1] + ptr_in[3];
  const FLOAT32 sum12_re = ptr_in[2] + ptr_in[4];
  const FLOAT32 sum12_im = ptr_in[3] + ptr_in[5];
  const FLOAT32 diff12_re = ptr_in[2] - ptr_in[4];
  const FLOAT32 diff12_im = ptr_in[3] - ptr_in[5];

  /* Halved sums and scaled differences used by outputs 1 and 2. */
  const FLOAT32 half_sum_re = sum12_re / (FLOAT32)2.0;
  const FLOAT32 half_sum_im = sum12_im / (FLOAT32)2.0;
  const FLOAT32 scaled_diff_im = (FLOAT32)((FLOAT64)diff12_im * sqrt3_by_2);
  const FLOAT32 scaled_diff_re = (FLOAT32)((FLOAT64)diff12_re * sqrt3_by_2);

  const FLOAT32 base_re = ptr_in[0] - half_sum_re;

  /* Output 0: sum of all three inputs. */
  ptr_out[0] = sum01_re + ptr_in[4];
  ptr_out[1] = sum01_im + ptr_in[5];

  /* Outputs 1 and 2 differ only in the sign of the scaled differences. */
  ptr_out[2] = base_re + scaled_diff_im;
  ptr_out[3] = (ptr_in[1] - scaled_diff_re) - half_sum_im;
  ptr_out[4] = base_re - scaled_diff_im;
  ptr_out[5] = (ptr_in[1] + scaled_diff_re) - half_sum_im;
}
115
116
21.7M
/*
 * Complex FFT kernel used for the power-of-two path.
 *
 * ptr_x            - nlength complex values stored as interleaved (re, im)
 *                    FLOAT32 pairs; read by the first pass and overwritten
 *                    with the result by the final copy loop.
 * nlength          - number of complex points.
 * scratch_fft_p2_y - work buffer; the first pass writes 8 floats for every
 *                    4 points into it, all later passes operate on it in
 *                    place, and the last loop reads 2 * nlength floats back.
 *
 * Structure: one twiddle-free 4-input pass that loads from ptr_x at
 * DIG_REV-derived offsets, then (n_stages - 1) 4-input passes using factors
 * from iusace_twiddle_table_fft_32x32, then a 2-input pass that runs only
 * when not_power_4 is set, and finally the copy back into ptr_x.
 */
VOID iusace_complex_fft_p2(FLOAT32 *ptr_x, WORD32 nlength, FLOAT32 *scratch_fft_p2_y) {
117
21.7M
  WORD32 i, j, k, n_stages, h2;
118
21.7M
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
119
21.7M
  FLOAT32 tmp;
120
21.7M
  WORD32 del, nodespacing, in_loop_cnt;
121
21.7M
  WORD32 not_power_4;
122
21.7M
  WORD32 dig_rev_shift;
123
21.7M
  FLOAT32 *y = scratch_fft_p2_y;
124
21.7M
  WORD32 mpass = nlength;
125
21.7M
  WORD32 npoints = nlength;
126
21.7M
  FLOAT32 *ptr_y = y;
127
21.7M
  const FLOAT64 *ptr_w;
128
  /* For nlength == 2^k, iusace_calc_norm() returns 30 - k, so n_stages starts
   * out as k.  Its low bit is kept in not_power_4 and the halved value is the
   * number of 4-input passes. */
129
21.7M
  dig_rev_shift = iusace_calc_norm(mpass) + 1 - 16;
130
21.7M
  n_stages = 30 - iusace_calc_norm(mpass);
131
21.7M
  not_power_4 = n_stages & 1;
132
133
21.7M
  n_stages = n_stages >> 1;
134
135
21.7M
  ptr_w = iusace_twiddle_table_fft_32x32;
136
137
21.7M
  if (dig_rev_shift < 0) {
138
0
    dig_rev_shift = 0;
139
0
  }
140
  /* First pass: for every group of four points, pick four complex inputs from
   * ptr_x starting at offset h2 and spaced (npoints >> 1) floats apart,
   * combine them without twiddle factors and append 8 floats to the scratch
   * buffer.  When not_power_4 is set, h2 is rounded up to an even offset. */
141
400M
  for (i = 0; i < npoints; i += 4) {
142
379M
    FLOAT32 *inp = ptr_x;
143
379M
    FLOAT32 tmk;
144
145
379M
    DIG_REV(i, dig_rev_shift, h2);
146
379M
    if (not_power_4) {
147
178M
      h2 += 1;
148
178M
      h2 &= ~1;
149
178M
    }
150
379M
    inp += (h2);
151
152
379M
    x0r = *inp;
153
379M
    x0i = *(inp + 1);
154
379M
    inp += (npoints >> 1);
155
156
379M
    x1r = *inp;
157
379M
    x1i = *(inp + 1);
158
379M
    inp += (npoints >> 1);
159
160
379M
    x2r = *inp;
161
379M
    x2i = *(inp + 1);
162
379M
    inp += (npoints >> 1);
163
164
379M
    x3r = *inp;
165
379M
    x3i = *(inp + 1);
166
167
379M
    x0r = x0r + x2r;
168
379M
    x0i = x0i + x2i;
169
170
379M
    tmk = x0r - x2r;
171
379M
    x2r = tmk - x2r;
172
379M
    tmk = x0i - x2i;
173
379M
    x2i = tmk - x2i;
174
175
379M
    x1r = x1r + x3r;
176
379M
    x1i = x1i + x3i;
177
178
379M
    tmk = x1r - x3r;
179
379M
    x3r = tmk - x3r;
180
379M
    tmk = x1i - x3i;
181
379M
    x3i = tmk - x3i;
182
183
379M
    x0r = x0r + x1r;
184
379M
    x0i = x0i + x1i;
185
186
379M
    tmk = x0r - x1r;
187
379M
    x1r = tmk - x1r;
188
379M
    tmk = x0i - x1i;
189
379M
    x1i = tmk - x1i;
190
191
379M
    x2r = x2r + x3i;
192
379M
    x2i = x2i - x3r;
193
194
379M
    tmk = x2r - x3i;
195
379M
    x3i = tmk - x3i;
196
379M
    tmk = x2i + x3r;
197
379M
    x3r = tmk + x3r;
198
199
379M
    *ptr_y++ = x0r;
200
379M
    *ptr_y++ = x0i;
201
379M
    *ptr_y++ = x2r;
202
379M
    *ptr_y++ = x2i;
203
379M
    *ptr_y++ = x1r;
204
379M
    *ptr_y++ = x1i;
205
379M
    *ptr_y++ = x3i;
206
379M
    *ptr_y++ = x3r;
207
379M
  }
208
21.7M
  ptr_y -= 2 * npoints;
209
21.7M
  del = 4;
210
21.7M
  nodespacing = 64;
211
21.7M
  in_loop_cnt = npoints >> 4;
  /* Remaining 4-input passes, performed in place on the scratch buffer.
   * After each pass del is multiplied by 4 while nodespacing and in_loop_cnt
   * are divided by 4. */
212
46.5M
  for (i = n_stages - 1; i > 0; i--) {
213
24.8M
    const FLOAT64 *twiddles = ptr_w;
214
24.8M
    FLOAT32 *data = ptr_y;
215
24.8M
    FLOAT64 w_1, w_2, w_3, w_4, w_5, w_6;
216
24.8M
    WORD32 sec_loop_cnt;
217
    /* First column of every group: butterflies without twiddle multiplies. */
218
140M
    for (k = in_loop_cnt; k != 0; k--) {
219
115M
      x0r = (*data);
220
115M
      x0i = (*(data + 1));
221
115M
      data += ((SIZE_T)del << 1);
222
223
115M
      x1r = (*data);
224
115M
      x1i = (*(data + 1));
225
115M
      data += ((SIZE_T)del << 1);
226
227
115M
      x2r = (*data);
228
115M
      x2i = (*(data + 1));
229
115M
      data += ((SIZE_T)del << 1);
230
231
115M
      x3r = (*data);
232
115M
      x3i = (*(data + 1));
233
115M
      data -= 3 * (del << 1);
234
235
115M
      x0r = x0r + x2r;
236
115M
      x0i = x0i + x2i;
237
115M
      x2r = x0r - (x2r * 2);
238
115M
      x2i = x0i - (x2i * 2);
239
115M
      x1r = x1r + x3r;
240
115M
      x1i = x1i + x3i;
241
115M
      x3r = x1r - (x3r * 2);
242
115M
      x3i = x1i - (x3i * 2);
243
244
115M
      x0r = x0r + x1r;
245
115M
      x0i = x0i + x1i;
246
115M
      x1r = x0r - (x1r * 2);
247
115M
      x1i = x0i - (x1i * 2);
248
115M
      x2r = x2r + x3i;
249
115M
      x2i = x2i - x3r;
250
115M
      x3i = x2r - (x3i * 2);
251
115M
      x3r = x2i + (x3r * 2);
252
253
115M
      *data = x0r;
254
115M
      *(data + 1) = x0i;
255
115M
      data += ((SIZE_T)del << 1);
256
257
115M
      *data = x2r;
258
115M
      *(data + 1) = x2i;
259
115M
      data += ((SIZE_T)del << 1);
260
261
115M
      *data = x1r;
262
115M
      *(data + 1) = x1i;
263
115M
      data += ((SIZE_T)del << 1);
264
265
115M
      *data = x3i;
266
115M
      *(data + 1) = x3r;
267
115M
      data += ((SIZE_T)del << 1);
268
115M
    }
269
24.8M
    data = ptr_y + 2;
270
    /* sec_loop_cnt is the upper bound of the first twiddled j range below. */
271
24.8M
    sec_loop_cnt = (nodespacing * del);
272
24.8M
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
273
24.8M
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
274
24.8M
                   (sec_loop_cnt / 256);
275
    /* The twiddled columns are processed in four consecutive j ranges.  The
     * ranges differ in the table offsets used for w_2/w_5 and w_3/w_6 and in
     * the sign pattern of the corresponding multiplies.
     * Range 1: j <= sec_loop_cnt; all three factors read at their plain
     * offsets (j, 2j, 3j) and at those offsets + 257. */
276
135M
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
277
110M
      w_1 = *(twiddles + j);
278
110M
      w_4 = *(twiddles + j + 257);
279
110M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
280
110M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
281
110M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
282
110M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);
283
284
394M
      for (k = in_loop_cnt; k != 0; k--) {
285
283M
        data += ((SIZE_T)del << 1);
286
287
283M
        x1r = *data;
288
283M
        x1i = *(data + 1);
289
283M
        data += ((SIZE_T)del << 1);
290
291
283M
        x2r = *data;
292
283M
        x2i = *(data + 1);
293
283M
        data += ((SIZE_T)del << 1);
294
295
283M
        x3r = *data;
296
283M
        x3i = *(data + 1);
297
283M
        data -= 3 * (del << 1);
298
299
283M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
300
283M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
301
283M
        x1r = tmp;
302
303
283M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
304
283M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
305
283M
        x2r = tmp;
306
307
283M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_3) - ixheaace_dmult((FLOAT64)x3i, w_6));
308
283M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
309
283M
        x3r = tmp;
310
311
283M
        x0r = (*data);
312
283M
        x0i = (*(data + 1));
313
314
283M
        x0r = x0r + (x2r);
315
283M
        x0i = x0i + (x2i);
316
283M
        x2r = x0r - (x2r * 2);
317
283M
        x2i = x0i - (x2i * 2);
318
283M
        x1r = x1r + x3r;
319
283M
        x1i = x1i + x3i;
320
283M
        x3r = x1r - (x3r * 2);
321
283M
        x3i = x1i - (x3i * 2);
322
323
283M
        x0r = x0r + (x1r);
324
283M
        x0i = x0i + (x1i);
325
283M
        x1r = x0r - (x1r * 2);
326
283M
        x1i = x0i - (x1i * 2);
327
283M
        x2r = x2r + (x3i);
328
283M
        x2i = x2i - (x3r);
329
283M
        x3i = x2r - (x3i * 2);
330
283M
        x3r = x2i + (x3r * 2);
331
332
283M
        *data = x0r;
333
283M
        *(data + 1) = x0i;
334
283M
        data += ((SIZE_T)del << 1);
335
336
283M
        *data = x2r;
337
283M
        *(data + 1) = x2i;
338
283M
        data += ((SIZE_T)del << 1);
339
340
283M
        *data = x1r;
341
283M
        *(data + 1) = x1i;
342
283M
        data += ((SIZE_T)del << 1);
343
344
283M
        *data = x3i;
345
283M
        *(data + 1) = x3r;
346
283M
        data += ((SIZE_T)del << 1);
347
283M
      }
348
110M
      data -= 2 * npoints;
349
110M
      data += 2;
350
110M
    }
    /* Range 2: up to (nodespacing * del) >> 1; w_3/w_6 are read at
     * 3j - 256 and 3j + 1 and the x3 multiply uses the alternate sign form. */
351
92.7M
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
352
67.8M
      w_1 = *(twiddles + j);
353
67.8M
      w_4 = *(twiddles + j + 257);
354
67.8M
      w_2 = *(twiddles + ((SIZE_T)j << 1));
355
67.8M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
356
67.8M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
357
67.8M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
358
359
267M
      for (k = in_loop_cnt; k != 0; k--) {
360
199M
        data += ((SIZE_T)del << 1);
361
362
199M
        x1r = *data;
363
199M
        x1i = *(data + 1);
364
199M
        data += ((SIZE_T)del << 1);
365
366
199M
        x2r = *data;
367
199M
        x2i = *(data + 1);
368
199M
        data += ((SIZE_T)del << 1);
369
370
199M
        x3r = *data;
371
199M
        x3i = *(data + 1);
372
199M
        data -= 3 * (del << 1);
373
374
199M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
375
199M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
376
199M
        x1r = tmp;
377
378
199M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
379
199M
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
380
199M
        x2r = tmp;
381
382
199M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
383
199M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
384
199M
        x3r = tmp;
385
386
199M
        x0r = (*data);
387
199M
        x0i = (*(data + 1));
388
389
199M
        x0r = x0r + (x2r);
390
199M
        x0i = x0i + (x2i);
391
199M
        x2r = x0r - (x2r * 2);
392
199M
        x2i = x0i - (x2i * 2);
393
199M
        x1r = x1r + x3r;
394
199M
        x1i = x1i + x3i;
395
199M
        x3r = x1r - (x3r * 2);
396
199M
        x3i = x1i - (x3i * 2);
397
398
199M
        x0r = x0r + (x1r);
399
199M
        x0i = x0i + (x1i);
400
199M
        x1r = x0r - (x1r * 2);
401
199M
        x1i = x0i - (x1i * 2);
402
199M
        x2r = x2r + (x3i);
403
199M
        x2i = x2i - (x3r);
404
199M
        x3i = x2r - (x3i * 2);
405
199M
        x3r = x2i + (x3r * 2);
406
407
199M
        *data = x0r;
408
199M
        *(data + 1) = x0i;
409
199M
        data += ((SIZE_T)del << 1);
410
411
199M
        *data = x2r;
412
199M
        *(data + 1) = x2i;
413
199M
        data += ((SIZE_T)del << 1);
414
415
199M
        *data = x1r;
416
199M
        *(data + 1) = x1i;
417
199M
        data += ((SIZE_T)del << 1);
418
419
199M
        *data = x3i;
420
199M
        *(data + 1) = x3r;
421
199M
        data += ((SIZE_T)del << 1);
422
199M
      }
423
67.8M
      data -= 2 * npoints;
424
67.8M
      data += 2;
425
67.8M
    }
    /* Range 3: up to sec_loop_cnt * 2; w_2/w_5 now also use the
     * "- 256" / "+ 1" offsets and the alternate sign form. */
426
67.8M
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
427
42.9M
      w_1 = *(twiddles + j);
428
42.9M
      w_4 = *(twiddles + j + 257);
429
42.9M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
430
42.9M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
431
42.9M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
432
42.9M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);
433
434
127M
      for (k = in_loop_cnt; k != 0; k--) {
435
84.1M
        data += ((SIZE_T)del << 1);
436
437
84.1M
        x1r = *data;
438
84.1M
        x1i = *(data + 1);
439
84.1M
        data += ((SIZE_T)del << 1);
440
441
84.1M
        x2r = *data;
442
84.1M
        x2i = *(data + 1);
443
84.1M
        data += ((SIZE_T)del << 1);
444
445
84.1M
        x3r = *data;
446
84.1M
        x3i = *(data + 1);
447
84.1M
        data -= 3 * (del << 1);
448
449
84.1M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
450
84.1M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult(x1r, w_4), x1i, w_1);
451
84.1M
        x1r = tmp;
452
453
84.1M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
454
84.1M
        x2i = (FLOAT32)(-ixheaace_dmult(x2r, w_2) + ixheaace_dmult(x2i, w_5));
455
84.1M
        x2r = tmp;
456
457
84.1M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
458
84.1M
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
459
84.1M
        x3r = tmp;
460
461
84.1M
        x0r = (*data);
462
84.1M
        x0i = (*(data + 1));
463
464
84.1M
        x0r = x0r + (x2r);
465
84.1M
        x0i = x0i + (x2i);
466
84.1M
        x2r = x0r - (x2r * 2);
467
84.1M
        x2i = x0i - (x2i * 2);
468
84.1M
        x1r = x1r + x3r;
469
84.1M
        x1i = x1i + x3i;
470
84.1M
        x3r = x1r - (x3r * 2);
471
84.1M
        x3i = x1i - (x3i * 2);
472
473
84.1M
        x0r = x0r + (x1r);
474
84.1M
        x0i = x0i + (x1i);
475
84.1M
        x1r = x0r - (x1r * 2);
476
84.1M
        x1i = x0i - (x1i * 2);
477
84.1M
        x2r = x2r + (x3i);
478
84.1M
        x2i = x2i - (x3r);
479
84.1M
        x3i = x2r - (x3i * 2);
480
84.1M
        x3r = x2i + (x3r * 2);
481
482
84.1M
        *data = x0r;
483
84.1M
        *(data + 1) = x0i;
484
84.1M
        data += ((SIZE_T)del << 1);
485
486
84.1M
        *data = x2r;
487
84.1M
        *(data + 1) = x2i;
488
84.1M
        data += ((SIZE_T)del << 1);
489
490
84.1M
        *data = x1r;
491
84.1M
        *(data + 1) = x1i;
492
84.1M
        data += ((SIZE_T)del << 1);
493
494
84.1M
        *data = x3i;
495
84.1M
        *(data + 1) = x3r;
496
84.1M
        data += ((SIZE_T)del << 1);
497
84.1M
      }
498
42.9M
      data -= 2 * npoints;
499
42.9M
      data += 2;
500
42.9M
    }
    /* Range 4: the remaining columns below nodespacing * del; w_3/w_6 are
     * read at 3j - 512 and 3j - 512 + 257, and the x1i/x3i combination in
     * the butterfly uses the opposite signs from the other ranges. */
501
135M
    for (; j < nodespacing * del; j += nodespacing) {
502
110M
      w_1 = *(twiddles + j);
503
110M
      w_4 = *(twiddles + j + 257);
504
110M
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
505
110M
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
506
110M
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
507
110M
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);
508
509
394M
      for (k = in_loop_cnt; k != 0; k--) {
510
283M
        data += ((SIZE_T)del << 1);
511
512
283M
        x1r = *data;
513
283M
        x1i = *(data + 1);
514
283M
        data += ((SIZE_T)del << 1);
515
516
283M
        x2r = *data;
517
283M
        x2i = *(data + 1);
518
283M
        data += ((SIZE_T)del << 1);
519
520
283M
        x3r = *data;
521
283M
        x3i = *(data + 1);
522
283M
        data -= 3 * ((SIZE_T)del << 1);
523
524
283M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
525
283M
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
526
283M
        x1r = tmp;
527
528
283M
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
529
283M
        x2i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x2r, w_2) + ixheaace_dmult((FLOAT64)x2i, w_5));
530
283M
        x2r = tmp;
531
532
283M
        tmp = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
533
283M
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
534
283M
        x3r = tmp;
535
536
283M
        x0r = (*data);
537
283M
        x0i = (*(data + 1));
538
539
283M
        x0r = x0r + (x2r);
540
283M
        x0i = x0i + (x2i);
541
283M
        x2r = x0r - (x2r * 2);
542
283M
        x2i = x0i - (x2i * 2);
543
283M
        x1r = x1r + x3r;
544
283M
        x1i = x1i - x3i;
545
283M
        x3r = x1r - (x3r * 2);
546
283M
        x3i = x1i + (x3i * 2);
547
548
283M
        x0r = x0r + (x1r);
549
283M
        x0i = x0i + (x1i);
550
283M
        x1r = x0r - (x1r * 2);
551
283M
        x1i = x0i - (x1i * 2);
552
283M
        x2r = x2r + (x3i);
553
283M
        x2i = x2i - (x3r);
554
283M
        x3i = x2r - (x3i * 2);
555
283M
        x3r = x2i + (x3r * 2);
556
557
283M
        *data = x0r;
558
283M
        *(data + 1) = x0i;
559
283M
        data += ((SIZE_T)del << 1);
560
561
283M
        *data = x2r;
562
283M
        *(data + 1) = x2i;
563
283M
        data += ((SIZE_T)del << 1);
564
565
283M
        *data = x1r;
566
283M
        *(data + 1) = x1i;
567
283M
        data += ((SIZE_T)del << 1);
568
569
283M
        *data = x3i;
570
283M
        *(data + 1) = x3r;
571
283M
        data += ((SIZE_T)del << 1);
572
283M
      }
573
110M
      data -= 2 * npoints;
574
110M
      data += 2;
575
110M
    }
576
24.8M
    nodespacing >>= 2;
577
24.8M
    del <<= 2;
578
24.8M
    in_loop_cnt >>= 2;
579
24.8M
  }
  /* Extra 2-input pass, executed only when not_power_4 is set.  It is done in
   * two halves of del / 2 butterflies each; both halves walk the same table
   * entries but the second half applies them with the alternate sign form. */
580
21.7M
  if (not_power_4) {
581
11.5M
    const FLOAT64 *twiddles = ptr_w;
582
11.5M
    nodespacing <<= 1;
583
584
190M
    for (j = del / 2; j != 0; j--) {
585
178M
      FLOAT64 w_1 = *twiddles;
586
178M
      FLOAT64 w_4 = *(twiddles + 257);
587
178M
      twiddles += nodespacing;
588
589
178M
      x0r = *ptr_y;
590
178M
      x0i = *(ptr_y + 1);
591
178M
      ptr_y += ((SIZE_T)del << 1);
592
593
178M
      x1r = *ptr_y;
594
178M
      x1i = *(ptr_y + 1);
595
596
178M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
597
178M
      x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
598
178M
      x1r = tmp;
599
600
178M
      *ptr_y = (x0r) - (x1r);
601
178M
      *(ptr_y + 1) = (x0i) - (x1i);
602
178M
      ptr_y -= ((SIZE_T)del << 1);
603
604
178M
      *ptr_y = (x0r) + (x1r);
605
178M
      *(ptr_y + 1) = (x0i) + (x1i);
606
178M
      ptr_y += 2;
607
178M
    }
608
11.5M
    twiddles = ptr_w;
609
190M
    for (j = del / 2; j != 0; j--) {
610
178M
      FLOAT64 w_1 = *twiddles;
611
178M
      FLOAT64 w_4 = *(twiddles + 257);
612
178M
      twiddles += nodespacing;
613
614
178M
      x0r = *ptr_y;
615
178M
      x0i = *(ptr_y + 1);
616
178M
      ptr_y += ((SIZE_T)del << 1);
617
618
178M
      x1r = *ptr_y;
619
178M
      x1i = *(ptr_y + 1);
620
621
178M
      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_4) + ixheaace_dmult((FLOAT64)x1i, w_1));
622
178M
      x1i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x1r, w_1) + ixheaace_dmult((FLOAT64)x1i, w_4));
623
178M
      x1r = tmp;
624
625
178M
      *ptr_y = (x0r) - (x1r);
626
178M
      *(ptr_y + 1) = (x0i) - (x1i);
627
178M
      ptr_y -= ((SIZE_T)del << 1);
628
629
178M
      *ptr_y = (x0r) + (x1r);
630
178M
      *(ptr_y + 1) = (x0i) + (x1i);
631
178M
      ptr_y += 2;
632
178M
    }
633
11.5M
  }
634
  /* Copy the 2 * nlength result floats from the scratch buffer back to ptr_x. */
635
1.53G
  for (i = 0; i < nlength; i++) {
636
1.51G
    *(ptr_x + 2 * i) = y[2 * i];
637
1.51G
    *(ptr_x + 2 * i + 1) = y[2 * i + 1];
638
1.51G
  }
639
21.7M
}
640
641
static VOID iusace_complex_fft_p3(FLOAT32 *data, WORD32 nlength,
642
3.07M
                                  iusace_scratch_mem *pstr_scratch) {
643
3.07M
  WORD32 i, j;
644
3.07M
  FLOAT32 *data_3 = pstr_scratch->p_fft_p3_data_3;
645
3.07M
  FLOAT32 *y = pstr_scratch->p_fft_p3_y;
646
3.07M
  WORD32 cnfac;
647
3.07M
  WORD32 mpass = nlength;
648
3.07M
  FLOAT32 *ptr_x = data;
649
3.07M
  FLOAT32 *ptr_y = y;
650
651
3.07M
  cnfac = 0;
652
6.15M
  while (mpass % 3 == 0) {
653
3.07M
    mpass /= 3;
654
3.07M
    cnfac++;
655
3.07M
  }
656
657
12.3M
  for (i = 0; i < 3 * cnfac; i++) {
658
318M
    for (j = 0; j < mpass; j++) {
659
308M
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
660
308M
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
661
308M
    }
662
9.23M
    iusace_complex_fft_p2(data_3, mpass, pstr_scratch->p_fft_p2_y);
663
664
318M
    for (j = 0; j < mpass; j++) {
665
308M
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
666
308M
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
667
308M
    }
668
9.23M
  }
669
670
3.07M
  {
671
3.07M
    const FLOAT64 *w1r, *w1i;
672
3.07M
    FLOAT32 tmp;
673
3.07M
    w1r = iusace_twiddle_table_3pr;
674
3.07M
    w1i = iusace_twiddle_table_3pi;
675
676
106M
    for (i = 0; i < nlength; i += 3) {
677
102M
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
678
102M
      data[2 * i + 1] =
679
102M
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
680
102M
      data[2 * i] = tmp;
681
682
102M
      w1r++;
683
102M
      w1i++;
684
685
102M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
686
102M
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
687
102M
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
688
102M
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
689
102M
      data[2 * (i + 1)] = tmp;
690
691
102M
      w1r++;
692
102M
      w1i++;
693
694
102M
      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
695
102M
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
696
102M
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
697
102M
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
698
102M
      data[2 * (i + 2)] = tmp;
699
700
102M
      w1r += 3 * (128 / mpass - 1) + 1;
701
102M
      w1i += 3 * (128 / mpass - 1) + 1;
702
102M
    }
703
3.07M
  }
704
705
106M
  for (i = 0; i < mpass; i++) {
706
102M
    iusace_complex_3point_fft(ptr_x, ptr_y);
707
708
102M
    ptr_x = ptr_x + 6;
709
102M
    ptr_y = ptr_y + 6;
710
102M
  }
711
712
106M
  for (i = 0; i < mpass; i++) {
713
102M
    data[2 * i] = y[6 * i];
714
102M
    data[2 * i + 1] = y[6 * i + 1];
715
102M
  }
716
717
106M
  for (i = 0; i < mpass; i++) {
718
102M
    data[2 * (i + mpass)] = y[6 * i + 2];
719
102M
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
720
102M
  }
721
722
106M
  for (i = 0; i < mpass; i++) {
723
102M
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
724
102M
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
725
102M
  }
726
3.07M
}
727
728
0
/*
 * Variant of the radix-3 complex FFT that uses fixed-size stack buffers
 * instead of a caller-supplied scratch structure.
 *
 * data    - nlength complex points as interleaved (re, im) floats,
 *           transformed in place.
 * nlength - number of complex points.
 *
 * The function returns without touching `data` when
 *   - nlength <= 0 (a value of 0 would never leave the factor-of-three
 *     loop below), or
 *   - the 3-point stage would need more than the 1024 floats available in
 *     the local output buffer `y` (it writes 6 floats per sub-sequence
 *     point).
 * Inputs that fit the local buffers are processed exactly as before.
 */
VOID iusace_complex_fft_p3_no_scratch(FLOAT32 *data, WORD32 nlength) {
  WORD32 i, j;

  FLOAT32 data_3[800];
  FLOAT32 y[1024];
  FLOAT32 p_fft_p2_y[2048];
  WORD32 cnfac;
  WORD32 mpass = nlength;
  FLOAT32 *ptr_x = data;
  FLOAT32 *ptr_y = y;

  /* Nothing to transform; also keeps mpass != 0 for the loop below. */
  if (nlength <= 0) {
    return;
  }

  /* Strip the factors of three: nlength == mpass * 3^cnfac afterwards. */
  cnfac = 0;
  while (mpass % 3 == 0) {
    mpass /= 3;
    cnfac++;
  }

  /* The 3-point stage stores 6 * mpass floats in y; refuse sizes that would
   * run past the end of the local buffer. */
  if (mpass > (WORD32)((sizeof(y) / sizeof(y[0])) / 6)) {
    return;
  }

  /* Sub-transforms: gather every third point, transform, scatter back. */
  for (i = 0; i < 3 * cnfac; i++) {
    for (j = 0; j < mpass; j++) {
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
    }
    iusace_complex_fft_p2(data_3, mpass, p_fft_p2_y);

    for (j = 0; j < mpass; j++) {
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
    }
  }

  /* Twiddle rotation: three consecutive table entries per triple, then a
   * jump of 3 * (128 / mpass - 1) + 1 entries to reach the next triple. */
  {
    const FLOAT64 *w1r, *w1i;
    FLOAT32 tmp;
    w1r = iusace_twiddle_table_3pr;
    w1i = iusace_twiddle_table_3pi;

    for (i = 0; i < nlength; i += 3) {
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
      data[2 * i + 1] =
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
      data[2 * i] = tmp;

      w1r++;
      w1i++;

      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
      data[2 * (i + 1)] = tmp;

      w1r++;
      w1i++;

      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
      data[2 * (i + 2)] = tmp;

      w1r += 3 * (128 / mpass - 1) + 1;
      w1i += 3 * (128 / mpass - 1) + 1;
    }
  }

  /* 3-point butterflies: each input triple yields one output triple in y. */
  for (i = 0; i < mpass; i++) {
    iusace_complex_3point_fft(ptr_x, ptr_y);

    ptr_x = ptr_x + 6;
    ptr_y = ptr_y + 6;
  }

  /* Regroup: the first, second and third butterfly outputs of every triple
   * go to three consecutive sections of mpass complex values in data. */
  for (i = 0; i < mpass; i++) {
    data[2 * i] = y[6 * i];
    data[2 * i + 1] = y[6 * i + 1];
  }

  for (i = 0; i < mpass; i++) {
    data[2 * (i + mpass)] = y[6 * i + 2];
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
  }

  for (i = 0; i < mpass; i++) {
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
  }
}
815
816
/*
 * Pre-twiddle step ahead of the complex FFT.
 *
 * ptr_in   - npoints-based FLOAT64 input; reversed in place first when
 *            tx_flag == 0.
 * fft_ptr  - receives npoints / 4 complex values as interleaved FLOAT32.
 * npoints  - size parameter from which all indices are derived.
 * cos_ptr,
 * sin_ptr  - twiddle tables, one entry consumed per output value.
 * tx_flag  - 0 selects the additional input reversal.
 */
static VOID iusace_calc_pre_twid_enc(FLOAT64 *ptr_in, FLOAT32 *fft_ptr, WORD32 npoints,
                                     const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
                                     const WORD32 tx_flag) {
  const WORD32 lower_half = npoints >> 1;
  const WORD32 upper_half = npoints - lower_half;
  const WORD32 centre = upper_half / 2;
  const WORD32 num_bins = npoints >> 2;
  WORD32 k;

  if (tx_flag == 0) {
    /* reuse MDCT: spectrally reverse all bins */
    for (k = 0; k < lower_half; k++) {
      const FLOAT64 saved = ptr_in[k];
      ptr_in[k] = ptr_in[npoints - 1 - k];
      ptr_in[npoints - 1 - k] = saved;
    }
  }

  for (k = 0; k < num_bins; k++) {
    const WORD32 rev_off = npoints / 2 - 1 - 2 * k;
    const WORD32 fwd_off = 2 * k;
    const FLOAT64 cos_k = cos_ptr[k];
    const FLOAT64 sin_k = sin_ptr[k];

    /* Each part combines two input samples; which partner sample is used,
     * and with which sign, depends on which quarter k falls into. */
    const FLOAT64 folded_re =
        (k < lower_half / 4)
            ? ptr_in[centre + rev_off] + ptr_in[npoints + centre - 1 - rev_off]
            : ptr_in[centre + rev_off] - ptr_in[centre - 1 - rev_off];
    const FLOAT64 folded_im =
        (k < upper_half / 4)
            ? ptr_in[centre + fwd_off] - ptr_in[centre - 1 - fwd_off]
            : ptr_in[centre + fwd_off] + ptr_in[npoints + centre - 1 - fwd_off];

    fft_ptr[2 * k] = (FLOAT32)(folded_re * cos_k + folded_im * sin_k);
    fft_ptr[2 * k + 1] = (FLOAT32)(folded_im * cos_k - folded_re * sin_k);
  }
}
851
852
9.17M
/*
 * Dispatches to the matching complex FFT kernel: lengths for which
 * nlength & (nlength - 1) is zero go to iusace_complex_fft_p2(), all other
 * lengths go to iusace_complex_fft_p3().
 */
VOID iusace_complex_fft(FLOAT32 *data, WORD32 nlength, iusace_scratch_mem *pstr_scratch) {
  const WORD32 stray_bits = nlength & (nlength - 1);

  if (stray_bits == 0) {
    iusace_complex_fft_p2(data, nlength, pstr_scratch->p_fft_p2_y);
    return;
  }

  iusace_complex_fft_p3(data, nlength, pstr_scratch);
}
859
860
/**
 * Post-twiddle step: rotates each complex FFT output by its (cos, sin) pair,
 * scales by 2 and scatters the result into four positions of ptr_out.
 *
 * ptr_out  output, npoints doubles are written
 * fft_ptr  FFT result, interleaved re/im, npoints / 4 complex values are read
 * npoints  number of output samples
 * cos_ptr  twiddle cosines, one entry consumed per complex input
 * sin_ptr  twiddle sines, one entry consumed per complex input
 * tx_flag  when 0, every even-indexed output (0, 2, 4, ...) is additionally
 *          multiplied by -1 after the scatter
 */
static VOID iusace_calc_post_twid_enc(FLOAT64 *ptr_out, FLOAT32 *fft_ptr, WORD32 npoints,
                                      const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
                                      const WORD32 tx_flag) {
  const WORD32 num_cplx = npoints >> 2;
  WORD32 idx;

  for (idx = 0; idx < num_cplx; idx++) {
    const FLOAT64 cos_val = cos_ptr[idx];
    const FLOAT64 sin_val = sin_ptr[idx];
    const FLOAT64 rot_re =
        2 * ((FLOAT64)(fft_ptr[2 * idx]) * cos_val + (FLOAT64)(fft_ptr[2 * idx + 1]) * sin_val);
    const FLOAT64 rot_im =
        2 * ((FLOAT64)(fft_ptr[2 * idx + 1]) * cos_val - (FLOAT64)(fft_ptr[2 * idx]) * sin_val);

    /* Each rotated value lands twice: once in each half, with opposite sign. */
    ptr_out[2 * idx] = -rot_re;
    ptr_out[npoints / 2 - 1 - 2 * idx] = rot_im;
    ptr_out[npoints / 2 + 2 * idx] = -rot_im;
    ptr_out[npoints - 1 - 2 * idx] = rot_re;
  }

  if (tx_flag != 0) {
    return;
  }

  /* tx_flag == 0: sign-flip the entries at indices 0, 2, 4, ... */
  for (idx = 0; idx < npoints; idx += 2) {
    ptr_out[idx] *= -1;
  }
}
885
886
/**
 * FFT-based transform driver: pre-twiddle, complex FFT, post-twiddle.
 *
 * ptr_in        input samples; 2 * npoints values are consumed. The
 *               pre-twiddle step reverses this buffer in place when
 *               tx_flag is 0.
 * ptr_out       output; 2 * npoints values are written
 * npoints       transform size; only 96, 128, 768 and 1024 are supported
 * tx_flag       forwarded to the pre- and post-twiddle steps
 * pstr_scratch  scratch memory; p_fft_mdct_buf is used as the FFT work
 *               buffer and the struct is also passed to the FFT kernel
 *
 * Returns IA_NO_ERROR on success, or
 * IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH for an unsupported
 * npoints. In the error case nothing is written to any buffer.
 */
IA_ERRORCODE iusace_fft_based_mdct(FLOAT64 *ptr_in, FLOAT64 *ptr_out, WORD32 npoints,
                                   const WORD32 tx_flag, iusace_scratch_mem *pstr_scratch) {
  FLOAT32 *ptr_scratch1 = pstr_scratch->p_fft_mdct_buf;
  const FLOAT64 *cos_ptr = NULL;
  const FLOAT64 *sin_ptr = NULL;
  WORD32 nlength;
  WORD32 n_total;

  /* Validate the length BEFORE touching the scratch buffer: the amount of
   * memory cleared below is derived from npoints, so an unsupported value
   * must be rejected first or the memset could run past the buffer. */
  switch (npoints) {
    case (96):
      cos_ptr = iexheaac_pre_post_twid_cos_192;
      sin_ptr = iexheaac_pre_post_twid_sin_192;
      break;
    case (128):
      cos_ptr = iusace_pre_post_twid_cos_256;
      sin_ptr = iusace_pre_post_twid_sin_256;
      break;
    case (768):
      cos_ptr = iexheaac_pre_post_twid_cos_1536;
      sin_ptr = iexheaac_pre_post_twid_sin_1536;
      break;
    case (1024):
      cos_ptr = iusace_pre_post_twid_cos_2048;
      sin_ptr = iusace_pre_post_twid_sin_2048;
      break;
    default:
      return IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH;
  }

  /* npoints is now known to be one of the four small positive constants,
   * so the shifts below cannot overflow. */
  nlength = npoints >> 1;
  n_total = npoints << 1;

  /* Clear 2 * n_total floats of the work buffer. */
  memset(ptr_scratch1, 0, ((SIZE_T)n_total << 1) * sizeof(*ptr_scratch1));

  /* pre-twiddle */
  iusace_calc_pre_twid_enc(ptr_in, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);

  /* complex FFT */
  iusace_complex_fft(ptr_scratch1, nlength, pstr_scratch);

  /* post-twiddle */
  iusace_calc_post_twid_enc(ptr_out, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);

  return IA_NO_ERROR;
}
928
929
140k
/**
 * 2048-point complex FFT built from two 1024-point transforms.
 *
 * ptr_x        interleaved re/im data, 4096 floats, transformed in place
 * scratch_fft  scratch buffer handed to iusace_complex_fft_p2
 *
 * Each half of ptr_x (floats 0..2047 and 2048..4095) is transformed with
 * iusace_complex_fft_p2. The two halves are then merged: every value of the
 * upper half is rotated by the twiddle tables and added to / subtracted
 * from the value at the same position in the lower half.
 */
VOID iusace_complex_fft_2048(FLOAT32 *ptr_x, FLOAT32 *scratch_fft) {
  const FLOAT32 *cos_tab = (const FLOAT32 *)&iusace_twiddle_cos_2048[0];
  const FLOAT32 *sin_tab = (const FLOAT32 *)&iusace_twiddle_sin_2048[0];
  FLOAT32 *lower = ptr_x;
  FLOAT32 *upper = ptr_x + 2048;
  WORD32 bin;

  iusace_complex_fft_p2(lower, 1024, scratch_fft);
  iusace_complex_fft_p2(upper, 1024, scratch_fft);

  for (bin = 0; bin < 1024; bin++) {
    FLOAT32 *lo = lower + 2 * bin;
    FLOAT32 *hi = upper + 2 * bin;
    const FLOAT32 c_v = cos_tab[bin];
    const FLOAT32 s_v = sin_tab[bin];
    const FLOAT32 hi_re = hi[0];
    const FLOAT32 hi_im = hi[1];
    const FLOAT32 lo_re = lo[0];
    const FLOAT32 lo_im = lo[1];

    /* Rotate the upper-half value by the twiddle for this bin. */
    const FLOAT32 rot_re = (hi_re * c_v) + (hi_im * s_v);
    const FLOAT32 rot_im = -(hi_re * s_v) + (hi_im * c_v);

    /* Two-point butterfly between the halves. */
    lo[0] = lo_re + rot_re;
    lo[1] = lo_im + rot_im;
    hi[0] = lo_re - rot_re;
    hi[1] = lo_im - rot_im;
  }
}
964
/* Rotation, form 1: (re, im) <- (re*c - im*s, re*s + im*c). */
static VOID ixheaace_rad2_rot_fwd(FLOAT32 *re, FLOAT32 *im, FLOAT32 c, FLOAT32 s) {
  const FLOAT32 in_re = *re;
  const FLOAT32 in_im = *im;
  *re = ia_sub_flt(ia_mul_flt(in_re, c), ia_mul_flt(in_im, s));
  *im = ia_mac_flt(ia_mul_flt(in_re, s), in_im, c);
}

/* Rotation, form 2: (re, im) <- (re*s + im*c, -(re*c) + im*s). */
static VOID ixheaace_rad2_rot_swp(FLOAT32 *re, FLOAT32 *im, FLOAT32 c, FLOAT32 s) {
  const FLOAT32 in_re = *re;
  const FLOAT32 in_im = *im;
  *re = ia_add_flt(ia_mul_flt(in_re, s), ia_mul_flt(in_im, c));
  *im = ia_add_flt(ia_negate_flt(ia_mul_flt(in_re, c)), ia_mul_flt(in_im, s));
}

/* Rotation, form 3: (re, im) <- (-(re*c) + im*s, re*s + im*c). */
static VOID ixheaace_rad2_rot_neg(FLOAT32 *re, FLOAT32 *im, FLOAT32 c, FLOAT32 s) {
  const FLOAT32 in_re = *re;
  const FLOAT32 in_im = *im;
  *re = ia_add_flt(ia_negate_flt(ia_mul_flt(in_re, c)), ia_mul_flt(in_im, s));
  *im = ia_mac_flt(ia_mul_flt(in_re, s), in_im, c);
}

/*
 * First-stage 4-input butterfly. Reads four complex values located at
 * src, src + step, src + 2*step and src + 3*step (offsets in floats) and
 * writes eight consecutive floats to dst.
 */
static VOID ixheaace_rad2_first_bfly(const FLOAT32 *src, WORD32 step, FLOAT32 *dst) {
  FLOAT32 x0r = src[0];
  FLOAT32 x0i = src[1];
  FLOAT32 x1r = src[step];
  FLOAT32 x1i = src[step + 1];
  FLOAT32 x2r = src[2 * step];
  FLOAT32 x2i = src[2 * step + 1];
  FLOAT32 x3r = src[3 * step];
  FLOAT32 x3i = src[3 * step + 1];
  FLOAT32 tmk;

  x0r = ia_add_flt(x0r, x2r);
  x0i = ia_add_flt(x0i, x2i);

  tmk = ia_sub_flt(x0r, x2r);
  x2r = ia_sub_flt(tmk, x2r);
  tmk = ia_sub_flt(x0i, x2i);
  x2i = ia_sub_flt(tmk, x2i);

  x1r = ia_add_flt(x1r, x3r);
  x1i = ia_add_flt(x1i, x3i);

  tmk = ia_sub_flt(x1r, x3r);
  x3r = ia_sub_flt(tmk, x3r);
  tmk = ia_sub_flt(x1i, x3i);
  x3i = ia_sub_flt(tmk, x3i);

  x0r = ia_add_flt(x0r, x1r);
  x0i = ia_add_flt(x0i, x1i);

  tmk = ia_sub_flt(x0r, x1r);
  x1r = ia_sub_flt(tmk, x1r);
  tmk = ia_sub_flt(x0i, x1i);
  x1i = ia_sub_flt(tmk, x1i);

  x2r = ia_add_flt(x2r, x3i);
  x2i = ia_sub_flt(x2i, x3r);

  tmk = ia_sub_flt(x2r, x3i);
  x3i = ia_sub_flt(tmk, x3i);
  tmk = ia_add_flt(x2i, x3r);
  x3r = ia_add_flt(tmk, x3r);

  dst[0] = x0r;
  dst[1] = x0i;
  dst[2] = x2r;
  dst[3] = x2i;
  dst[4] = x1r;
  dst[5] = x1i;
  dst[6] = x3i;
  dst[7] = x3r;
}

/*
 * Later-stage 4-input butterfly. Combines the four (already rotated)
 * complex values and stores them at data, data + stride, data + 2*stride
 * and data + 3*stride. When x3i_inverted is non-zero the x1i/x3i pair is
 * combined with the opposite sign (used by the last twiddle range).
 */
static VOID ixheaace_rad2_bfly4(FLOAT32 *data, WORD32 stride, FLOAT32 x0r, FLOAT32 x0i,
                                FLOAT32 x1r, FLOAT32 x1i, FLOAT32 x2r, FLOAT32 x2i,
                                FLOAT32 x3r, FLOAT32 x3i, WORD32 x3i_inverted) {
  x0r = ia_add_flt(x0r, x2r);
  x0i = ia_add_flt(x0i, x2i);
  x2r = ia_msu_flt(x0r, x2r, 2);
  x2i = ia_msu_flt(x0i, x2i, 2);
  x1r = ia_add_flt(x1r, x3r);
  if (x3i_inverted) {
    x1i = ia_sub_flt(x1i, x3i);
    x3r = ia_msu_flt(x1r, x3r, 2);
    x3i = ia_mac_flt(x1i, x3i, 2);
  } else {
    x1i = ia_add_flt(x1i, x3i);
    x3r = ia_msu_flt(x1r, x3r, 2);
    x3i = ia_msu_flt(x1i, x3i, 2);
  }

  x0r = ia_add_flt(x0r, x1r);
  x0i = ia_add_flt(x0i, x1i);
  x1r = ia_msu_flt(x0r, x1r, 2);
  x1i = ia_msu_flt(x0i, x1i, 2);
  x2r = ia_add_flt(x2r, x3i);
  x2i = ia_sub_flt(x2i, x3r);
  x3i = ia_msu_flt(x2r, x3i, 2);
  x3r = ia_mac_flt(x2i, x3r, 2);

  data[0] = x0r;
  data[1] = x0i;
  data[stride] = x2r;
  data[stride + 1] = x2i;
  data[2 * stride] = x1r;
  data[2 * stride + 1] = x1i;
  data[3 * stride] = x3i;
  data[3 * stride + 1] = x3r;
}

/**
 * Complex FFT on split real/imaginary arrays.
 *
 * ptr_real     real parts, n_points values, overwritten with the result
 * ptr_imag     imaginary parts, n_points values, overwritten with the result
 * n_points     transform length
 * ptr_scratch  work buffer: floats [0, 2*n_points) hold the interleaved
 *              input, and the output is built starting at float 2048
 *
 * NOTE(review): the only caller visible here passes n_points = 1024; the
 * fixed 2048 offset and the initial nodespacing of 64 look tied to that
 * size - confirm before using other lengths.
 *
 * Steps: interleave the input, run a first pass of 4-input butterflies on
 * digit-reversed input positions, run the remaining 4-input stages with
 * twiddles from ia_fft_twiddle_table_float, optionally run a final 2-input
 * stage when the stage count is odd, then de-interleave into the outputs.
 */
static VOID ixheaace_rad2_cplx_fft(FLOAT32 *ptr_real, FLOAT32 *ptr_imag, WORD32 n_points,
                                   FLOAT32 *ptr_scratch) {
  FLOAT32 *const ptr_x = ptr_scratch;
  FLOAT32 *const y = ptr_scratch + 2048;
  FLOAT32 *ptr_y = y;
  const FLOAT32 *const ptr_w = ia_fft_twiddle_table_float;
  const WORD32 norm = ixheaac_norm32(n_points);
  const WORD32 total_bits = 30 - norm;
  const WORD32 not_power_4 = total_bits & 1;
  const WORD32 n_stages = total_bits >> 1;
  const WORD32 first_step = n_points >> 1;
  WORD32 dig_rev_shift = norm + 1 - 16;
  WORD32 del, nodespacing, in_loop_cnt;
  WORD32 i, j, k, h2, stage;

  if (dig_rev_shift < 0) {
    dig_rev_shift = 0;
  }

  /* Pack the split input into interleaved re/im pairs. */
  for (i = 0; i < n_points; i++) {
    ptr_x[2 * i] = ptr_real[i];
    ptr_x[2 * i + 1] = ptr_imag[i];
  }

  /* First pass: inputs are picked at digit-reversed offsets, outputs are
   * written sequentially. */
  for (i = 0; i < n_points; i += 4) {
    DIG_REV(i, dig_rev_shift, h2);
    if (not_power_4) {
      /* round the offset up to an even float index */
      h2 += 1;
      h2 &= ~1;
    }
    ixheaace_rad2_first_bfly(ptr_x + h2, first_step, ptr_y);
    ptr_y += 8;
  }
  ptr_y -= 2 * n_points;

  del = 4;
  nodespacing = 64;
  in_loop_cnt = n_points >> 4;

  for (stage = 1; stage < n_stages; stage++) {
    const FLOAT32 *tw = ptr_w;
    const WORD32 stride = del << 1;          /* floats between butterfly inputs */
    const WORD32 span = nodespacing * del;   /* end of the twiddle index range */
    /* Boundary of the first twiddle range. */
    const WORD32 sec_loop_cnt = (span / 4) + (span / 8) - (span / 16) + (span / 32) -
                                (span / 64) + (span / 128) - (span / 256);
    FLOAT32 *data = ptr_y;
    FLOAT32 w_1, w_2, w_3, w_4, w_5, w_6;
    FLOAT32 x1r, x1i, x2r, x2i, x3r, x3i;

    /* Column 0 needs no twiddle multiplication. */
    for (k = in_loop_cnt; k != 0; k--) {
      ixheaace_rad2_bfly4(data, stride, data[0], data[1], data[stride], data[stride + 1],
                          data[2 * stride], data[2 * stride + 1], data[3 * stride],
                          data[3 * stride + 1], 0);
      data += 4 * stride;
    }
    data = ptr_y + 2;

    /* Range 1: all three twiddles read directly at j, 2j and 3j. */
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
      w_1 = tw[j];
      w_4 = tw[j + 257];
      w_2 = tw[2 * j];
      w_5 = tw[2 * j + 257];
      w_3 = tw[3 * j];
      w_6 = tw[3 * j + 257];

      for (k = in_loop_cnt; k != 0; k--) {
        x1r = data[stride];
        x1i = data[stride + 1];
        x2r = data[2 * stride];
        x2i = data[2 * stride + 1];
        x3r = data[3 * stride];
        x3i = data[3 * stride + 1];

        ixheaace_rad2_rot_fwd(&x1r, &x1i, w_1, w_4);
        ixheaace_rad2_rot_fwd(&x2r, &x2i, w_2, w_5);
        ixheaace_rad2_rot_fwd(&x3r, &x3i, w_3, w_6);

        ixheaace_rad2_bfly4(data, stride, data[0], data[1], x1r, x1i, x2r, x2i, x3r, x3i, 0);
        data += 4 * stride;
      }
      data -= 2 * n_points;
      data += 2;
    }

    /* Range 2: the third twiddle is read from the shifted table position. */
    for (; j <= span >> 1; j += nodespacing) {
      w_1 = tw[j];
      w_4 = tw[j + 257];
      w_2 = tw[2 * j];
      w_5 = tw[2 * j + 257];
      w_3 = tw[3 * j - 256];
      w_6 = tw[3 * j + 1];

      for (k = in_loop_cnt; k != 0; k--) {
        x1r = data[stride];
        x1i = data[stride + 1];
        x2r = data[2 * stride];
        x2i = data[2 * stride + 1];
        x3r = data[3 * stride];
        x3i = data[3 * stride + 1];

        ixheaace_rad2_rot_fwd(&x1r, &x1i, w_1, w_4);
        ixheaace_rad2_rot_fwd(&x2r, &x2i, w_2, w_5);
        ixheaace_rad2_rot_swp(&x3r, &x3i, w_3, w_6);

        ixheaace_rad2_bfly4(data, stride, data[0], data[1], x1r, x1i, x2r, x2i, x3r, x3i, 0);
        data += 4 * stride;
      }
      data -= 2 * n_points;
      data += 2;
    }

    /* Range 3: second and third twiddles both use the shifted position. */
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
      w_1 = tw[j];
      w_4 = tw[j + 257];
      w_2 = tw[2 * j - 256];
      w_5 = tw[2 * j + 1];
      w_3 = tw[3 * j - 256];
      w_6 = tw[3 * j + 1];

      for (k = in_loop_cnt; k != 0; k--) {
        x1r = data[stride];
        x1i = data[stride + 1];
        x2r = data[2 * stride];
        x2i = data[2 * stride + 1];
        x3r = data[3 * stride];
        x3i = data[3 * stride + 1];

        ixheaace_rad2_rot_fwd(&x1r, &x1i, w_1, w_4);
        ixheaace_rad2_rot_swp(&x2r, &x2i, w_2, w_5);
        ixheaace_rad2_rot_swp(&x3r, &x3i, w_3, w_6);

        ixheaace_rad2_bfly4(data, stride, data[0], data[1], x1r, x1i, x2r, x2i, x3r, x3i, 0);
        data += 4 * stride;
      }
      data -= 2 * n_points;
      data += 2;
    }

    /* Range 4: third twiddle wraps by 512 and the butterfly uses the
     * inverted x3i combination. */
    for (; j < span; j += nodespacing) {
      w_1 = tw[j];
      w_4 = tw[j + 257];
      w_2 = tw[2 * j - 256];
      w_5 = tw[2 * j + 1];
      w_3 = tw[3 * j - 512];
      w_6 = tw[3 * j - 512 + 257];

      for (k = in_loop_cnt; k != 0; k--) {
        x1r = data[stride];
        x1i = data[stride + 1];
        x2r = data[2 * stride];
        x2i = data[2 * stride + 1];
        x3r = data[3 * stride];
        x3i = data[3 * stride + 1];

        ixheaace_rad2_rot_fwd(&x1r, &x1i, w_1, w_4);
        ixheaace_rad2_rot_swp(&x2r, &x2i, w_2, w_5);
        ixheaace_rad2_rot_neg(&x3r, &x3i, w_3, w_6);

        ixheaace_rad2_bfly4(data, stride, data[0], data[1], x1r, x1i, x2r, x2i, x3r, x3i, 1);
        data += 4 * stride;
      }
      data -= 2 * n_points;
      data += 2;
    }

    nodespacing >>= 2;
    del <<= 2;
    in_loop_cnt >>= 2;
  }

  if (not_power_4) {
    /* Odd number of stages: finish with one pass of 2-input butterflies,
     * done in two halves that differ only in the rotation form. ptr_y keeps
     * advancing across both halves. */
    const WORD32 stride = del << 1;
    WORD32 half;

    nodespacing <<= 1;

    for (half = 0; half < 2; half++) {
      const FLOAT32 *tw = ptr_w;

      for (j = del / 2; j != 0; j--) {
        const FLOAT32 w_1 = tw[0];
        const FLOAT32 w_4 = tw[257];
        FLOAT32 *lo = ptr_y;
        FLOAT32 *hi = ptr_y + stride;
        const FLOAT32 x0r = lo[0];
        const FLOAT32 x0i = lo[1];
        FLOAT32 x1r = hi[0];
        FLOAT32 x1i = hi[1];

        tw += nodespacing;

        if (half == 0) {
          ixheaace_rad2_rot_fwd(&x1r, &x1i, w_1, w_4);
        } else {
          ixheaace_rad2_rot_swp(&x1r, &x1i, w_1, w_4);
        }

        hi[0] = ia_sub_flt((x0r), (x1r));
        hi[1] = ia_sub_flt((x0i), (x1i));
        lo[0] = ia_add_flt((x0r), (x1r));
        lo[1] = ia_add_flt((x0i), (x1i));
        ptr_y += 2;
      }
    }
  }

  /* Unpack the interleaved result back into the split arrays. */
  for (i = 0; i < n_points; i++) {
    ptr_real[i] = y[2 * i];
    ptr_imag[i] = y[2 * i + 1];
  }
}
1503
52.8M
/**
 * In-place 4-point complex FFT on split arrays.
 *
 * x_r  four real parts, overwritten with the result
 * x_i  four imaginary parts, overwritten with the result
 *
 * All eight inputs are read before anything is written back.
 */
static VOID ixheaace_cplx_fft_4(FLOAT32 *x_r, FLOAT32 *x_i) {
  const FLOAT32 in0_re = x_r[0];
  const FLOAT32 in0_im = x_i[0];
  const FLOAT32 in1_re = x_r[1];
  const FLOAT32 in1_im = x_i[1];
  const FLOAT32 in2_re = x_r[2];
  const FLOAT32 in2_im = x_i[2];
  const FLOAT32 in3_re = x_r[3];
  const FLOAT32 in3_im = x_i[3];

  /* Stage 1: sums and differences of the (0, 2) and (1, 3) pairs. */
  const FLOAT32 sum02_re = ia_add_flt(in0_re, in2_re);
  const FLOAT32 sum02_im = ia_add_flt(in0_im, in2_im);
  const FLOAT32 dif02_re = ia_sub_flt(in0_re, in2_re);
  const FLOAT32 dif02_im = ia_sub_flt(in0_im, in2_im);
  const FLOAT32 sum13_re = ia_add_flt(in1_re, in3_re);
  const FLOAT32 sum13_im = ia_add_flt(in1_im, in3_im);
  const FLOAT32 dif13_re = ia_sub_flt(in1_re, in3_re);
  const FLOAT32 dif13_im = ia_sub_flt(in1_im, in3_im);

  /* Stage 2: outputs 0 and 2 combine the sums; outputs 1 and 3 combine the
   * differences with real and imaginary parts crossed over. */
  x_r[0] = ia_add_flt(sum02_re, sum13_re);
  x_i[0] = ia_add_flt(sum02_im, sum13_im);
  x_r[2] = ia_sub_flt(sum02_re, sum13_re);
  x_i[2] = ia_sub_flt(sum02_im, sum13_im);
  x_r[1] = ia_add_flt(dif02_re, dif13_im);
  x_i[1] = ia_sub_flt(dif02_im, dif13_re);
  x_r[3] = ia_sub_flt(dif02_re, dif13_im);
  x_i[3] = ia_add_flt(dif02_im, dif13_re);
}
1538
51.6k
/**
 * 4096-point FFT computed as 4 transforms of 1024 points followed by 1024
 * transforms of 4 points.
 *
 * ptr_x_r          real input, 4096 values; overwritten with the real part
 *                  of the result
 * ptr_x_i          receives the imaginary part of the result. Its incoming
 *                  contents are never read: the imaginary input is taken
 *                  as zero.
 * ptr_scratch_buf  work buffer. Floats [0, 8192) hold the de-interleaved
 *                  columns; the region starting at float 8192 is used both
 *                  as scratch for ixheaace_rad2_cplx_fft and as the
 *                  intermediate buffer for the 4-point transforms.
 */
VOID iusace_complex_fft_4096(FLOAT32 *ptr_x_r, FLOAT32 *ptr_x_i, FLOAT32 *ptr_scratch_buf) {
  const WORD32 fft_len = 4096;
  const WORD32 num_cols = fft_len >> 10;     /* 4 */
  const WORD32 col_len = fft_len / num_cols; /* 1024 */
  const WORD32 tw_step = 4;                  /* spacing used in the twiddle tables */
  FLOAT32 *const interim = &ptr_scratch_buf[2 * fft_len];
  WORD32 row, col;

  /* Step 1: split the input into num_cols strided columns (imaginary part
   * set to zero) and transform each column. */
  for (col = 0; col < num_cols; col++) {
    FLOAT32 *col_re = &ptr_scratch_buf[(2 * col + 0) * col_len];
    FLOAT32 *col_im = &ptr_scratch_buf[(2 * col + 1) * col_len];

    for (row = 0; row < col_len; row++) {
      col_re[row] = ptr_x_r[(num_cols * row + col)];
      col_im[row] = 0;
    }
    ixheaace_rad2_cplx_fft(col_re, col_im, col_len, interim);
  }

  /* Step 2: multiply by the inter-stage twiddles and transpose, so that
   * each row becomes a contiguous group of num_cols reals followed by
   * num_cols imaginaries. */
  for (row = 0; row < col_len; row++) {
    const FLOAT32 *cos_row =
        (const FLOAT32 *)&ia_mixed_rad_twiddle_cos[row * num_cols * tw_step];
    const FLOAT32 *sin_row =
        (const FLOAT32 *)&ia_mixed_rad_twiddle_sin[row * num_cols * tw_step];

    for (col = 0; col < num_cols; col++) {
      const FLOAT32 real = ptr_scratch_buf[(2 * col + 0) * col_len + row];
      const FLOAT32 imag = ptr_scratch_buf[(2 * col + 1) * col_len + row];
      const FLOAT32 cos_val = cos_row[col * tw_step];
      const FLOAT32 sin_val = sin_row[col * tw_step];

      interim[(2 * row + 0) * num_cols + col] = (FLOAT32)(real * cos_val + imag * sin_val);
      interim[(2 * row + 1) * num_cols + col] = (FLOAT32)(imag * cos_val - real * sin_val);
    }
  }

  /* Step 3: one 4-point transform per row. */
  for (row = 0; row < col_len; row++) {
    ixheaace_cplx_fft_4(&interim[(2 * row + 0) * num_cols], &interim[(2 * row + 1) * num_cols]);
  }

  /* Step 4: write the result back; output index is col * col_len + row. */
  for (row = 0; row < col_len; row++) {
    for (col = 0; col < num_cols; col++) {
      ptr_x_r[(col * col_len + row)] = interim[(2 * row + 0) * num_cols + col];
      ptr_x_i[(col * col_len + row)] = interim[(2 * row + 1) * num_cols + col];
    }
  }
}