Coverage Report

Created: 2022-08-24 06:11

/src/aom/aom_dsp/fft_common.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_FFT_COMMON_H_
13
#define AOM_AOM_DSP_FFT_COMMON_H_
14
15
#ifdef __cplusplus
16
extern "C" {
17
#endif
18
19
/*!\brief A function pointer for computing 1d fft and ifft.
20
 *
21
 * The function will point to an implementation for a specific transform size,
22
 * and may perform the transforms using vectorized instructions.
23
 *
24
 * For non-vectorized forward transforms of size n, the input and output
25
 * buffers will be size n. The output takes advantage of conjugate symmetry and
26
 * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
27
 * (r_{j}, i_{j}) is the complex output for index j.
28
 *
29
 * An inverse transform will assume that the complex "input" is packed
30
 * similarly. Its output will be real.
31
 *
32
 * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
33
 *
34
 * Vectorized implementations are parallelized along the columns so that the fft
35
 * can be performed on multiple columns at a time. In such cases the data block
36
 * for input and output is typically square (n x n) and the stride will
37
 * correspond to the spacing between rows. At minimum, the input size must be
38
 * n x simd_vector_length.
39
 *
40
 * \param[in]  input   Input buffer. See above for size restrictions.
41
 * \param[out] output  Output buffer. See above for size restrictions.
42
 * \param[in]  stride  The spacing in number of elements between rows
43
 *                     (or elements)
44
 */
45
typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
46
                                  int stride);
47
48
// Declare some of the forward non-vectorized transforms which are used in some
49
// of the vectorized implementations
50
void aom_fft1d_4_float(const float *input, float *output, int stride);
51
void aom_fft1d_8_float(const float *input, float *output, int stride);
52
void aom_fft1d_16_float(const float *input, float *output, int stride);
53
void aom_fft1d_32_float(const float *input, float *output, int stride);
54
55
/*!\brief Function pointer for transposing a matrix of floats.
56
 *
57
 * \param[in]  input  Input buffer (size n x n)
58
 * \param[out] output Output buffer (size n x n)
59
 * \param[in]  n      Extent of one dimension of the square matrix.
60
 */
61
typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
62
                                         int n);
63
64
/*!\brief Function pointer for re-arranging intermediate 2d transform results.
65
 *
66
 * After re-arrangement, the real and imaginary components will be packed
67
 * tightly next to each other.
68
 *
69
 * \param[in]  input  Input buffer (size n x n)
70
 * \param[out] output Output buffer (size 2 x n x n)
71
 * \param[in]  n      Extent of one dimension of the square matrix.
72
 */
73
typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
74
75
/*!\brief Performs a 2d fft with the given functions.
76
 *
77
 * This generator function allows for multiple different implementations of 2d
78
 * fft with different vector operations, without having to redefine the main
79
 * body multiple times.
80
 *
81
 * \param[in]  input     Input buffer to run the transform on (size n x n)
82
 * \param[out] temp      Working buffer for computing the transform (size n x n)
83
 * \param[out] output    Output buffer (size 2 x n x n)
 * \param[in]  n         Extent of one dimension of the square (n x n) input
84
 * \param[in]  tform     Forward transform function
85
 * \param[in]  transpose Transpose function (for n x n matrix)
86
 * \param[in]  unpack    Unpack function used to massage outputs to correct form
87
 * \param[in]  vec_size  Vector size (the transform is done vec_size units at
88
 *                       a time)
89
 */
90
void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
91
                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
92
                    aom_fft_unpack_func_t unpack, int vec_size);
93
94
/*!\brief Perform a 2d inverse fft with the given helper functions
95
 *
96
 * \param[in]  input      Input buffer to run the transform on (size 2 x n x n)
97
 * \param[out] temp       Working buffer for computations (size 2 x n x n)
98
 * \param[out] output     Output buffer (size n x n)
 * \param[in]  n          Extent of one dimension of the square (n x n) output
99
 * \param[in]  fft_single Forward transform function (non vectorized)
100
 * \param[in]  fft_multi  Forward transform function (vectorized)
101
 * \param[in]  ifft_multi Inverse transform function (vectorized)
102
 * \param[in]  transpose  Transpose function (for n x n matrix)
103
 * \param[in]  vec_size   Vector size (the transform is done vec_size
104
 *                        units at a time)
105
 */
106
void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
107
                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
108
                     aom_fft_1d_func_t ifft_multi,
109
                     aom_fft_transpose_func_t transpose, int vec_size);
110
#ifdef __cplusplus
111
}
112
#endif
113
114
// The macros below define 1D fft/ifft for different data types and for
115
// different simd vector intrinsic types.
116
117
// Generates `ret aom_fft1d_2_<suffix>(const T *input, T *output, int stride)`,
// a 2-point transform: the values at input offsets 0 and `stride` are read
// with `load`, and their sum and difference are written with `store` to the
// same two offsets in `output`.
//
//   ret     Tokens placed before the function name (its return type).
//   suffix  Token pasted onto the generated function name.
//   T       Element type of the input and output pointers.
//   T_VEC   Type of the values returned by `load` and passed to `store`.
//   load    load(const T *) yielding a T_VEC.
//   store   store(T *, T_VEC).
//
// NOTE(review): unlike GEN_FFT_4 and larger, this macro applies `+` and `-`
// directly to T_VEC rather than taking add/sub parameters, so it can only be
// instantiated with a T_VEC for which those operators are valid.
#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store)               \
118
0
  ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
119
0
    const T_VEC i0 = load(input + 0 * stride);                      \
120
0
    const T_VEC i1 = load(input + 1 * stride);                      \
121
0
    store(output + 0 * stride, i0 + i1);                            \
122
0
    store(output + 1 * stride, i0 - i1);                            \
123
0
  }
124
125
// Generates `ret aom_fft1d_4_<suffix>(const T *input, T *output, int stride)`,
// a 4-point forward transform over values spaced `stride` elements apart.
//
// With a = i0 + i2, b = i0 - i2, c = i1 + i3 and d = i1 - i3, the stores are:
//   output[0 * stride] = a + c
//   output[1 * stride] = b
//   output[2 * stride] = a - c
//   output[3 * stride] = 0 - d   (negation is spelled as sub(kWeight0, d))
// which lines up with the packed [r_0, r_1, r_2, i_1] layout described for
// aom_fft_1d_func_t.
//
//   ret       Tokens placed before the function name (its return type).
//   suffix    Token pasted onto the generated function name.
//   T         Element type of the input and output pointers.
//   T_VEC     Type of the values produced by load/constant/add/sub.
//   load      load(const T *) yielding a T_VEC.
//   store     store(T *, T_VEC).
//   constant  constant(float literal) yielding a T_VEC.
//   add, sub  Two-operand T_VEC addition and subtraction.
#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
126
0
  ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) {       \
127
0
    const T_VEC kWeight0 = constant(0.0f);                                \
128
0
    const T_VEC i0 = load(input + 0 * stride);                            \
129
0
    const T_VEC i1 = load(input + 1 * stride);                            \
130
0
    const T_VEC i2 = load(input + 2 * stride);                            \
131
0
    const T_VEC i3 = load(input + 3 * stride);                            \
132
0
    const T_VEC w0 = add(i0, i2);                                         \
133
0
    const T_VEC w1 = sub(i0, i2);                                         \
134
0
    const T_VEC w2 = add(i1, i3);                                         \
135
0
    const T_VEC w3 = sub(i1, i3);                                         \
136
0
    store(output + 0 * stride, add(w0, w2));                              \
137
0
    store(output + 1 * stride, w1);                                       \
138
0
    store(output + 2 * stride, sub(w0, w2));                              \
139
0
    store(output + 3 * stride, sub(kWeight0, w3));                        \
140
0
  }
141
142
// Generates `ret aom_fft1d_8_<suffix>(const T *input, T *output, int stride)`,
// an 8-point forward transform over values spaced `stride` elements apart.
//
// The even-indexed inputs (i0, i2, i4, i6) are combined into w0..w5 and the
// odd-indexed inputs (i1, i3, i5, i7) into w7..w12; the eight stores then mix
// the two halves. kWeight2 is 0.707107f (approximately 1/sqrt(2)), and
// kWeight0 is zero, used only to express negation as sub(kWeight0, x).
// The gaps in the w-numbering (e.g. no w6) are present in the original and do
// not indicate a missing term.
//
// NOTE(review): outputs 0..4 presumably hold the real parts and 5..7 the
// imaginary parts, following the packed layout described for
// aom_fft_1d_func_t -- confirm against the reference implementation.
//
//   ret            Tokens placed before the function name (its return type).
//   suffix         Token pasted onto the generated function name.
//   T              Element type of the input and output pointers.
//   T_VEC          Type of the values produced by load/constant/add/sub/mul.
//   load           load(const T *) yielding a T_VEC.
//   store          store(T *, T_VEC).
//   constant       constant(float literal) yielding a T_VEC.
//   add, sub, mul  Two-operand T_VEC addition, subtraction, multiplication.
#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
143
0
  ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) {            \
144
0
    const T_VEC kWeight0 = constant(0.0f);                                     \
145
0
    const T_VEC kWeight2 = constant(0.707107f);                                \
146
0
    const T_VEC i0 = load(input + 0 * stride);                                 \
147
0
    const T_VEC i1 = load(input + 1 * stride);                                 \
148
0
    const T_VEC i2 = load(input + 2 * stride);                                 \
149
0
    const T_VEC i3 = load(input + 3 * stride);                                 \
150
0
    const T_VEC i4 = load(input + 4 * stride);                                 \
151
0
    const T_VEC i5 = load(input + 5 * stride);                                 \
152
0
    const T_VEC i6 = load(input + 6 * stride);                                 \
153
0
    const T_VEC i7 = load(input + 7 * stride);                                 \
154
0
    const T_VEC w0 = add(i0, i4);                                              \
155
0
    const T_VEC w1 = sub(i0, i4);                                              \
156
0
    const T_VEC w2 = add(i2, i6);                                              \
157
0
    const T_VEC w3 = sub(i2, i6);                                              \
158
0
    const T_VEC w4 = add(w0, w2);                                              \
159
0
    const T_VEC w5 = sub(w0, w2);                                              \
160
0
    const T_VEC w7 = add(i1, i5);                                              \
161
0
    const T_VEC w8 = sub(i1, i5);                                              \
162
0
    const T_VEC w9 = add(i3, i7);                                              \
163
0
    const T_VEC w10 = sub(i3, i7);                                             \
164
0
    const T_VEC w11 = add(w7, w9);                                             \
165
0
    const T_VEC w12 = sub(w7, w9);                                             \
166
0
    store(output + 0 * stride, add(w4, w11));                                  \
167
0
    store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10))));          \
168
0
    store(output + 2 * stride, w5);                                            \
169
0
    store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10))));          \
170
0
    store(output + 4 * stride, sub(w4, w11));                                  \
171
0
    store(output + 5 * stride,                                                 \
172
0
          sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8))));                \
173
0
    store(output + 6 * stride, sub(kWeight0, w12));                            \
174
0
    store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8))));          \
175
0
  }
176
177
// Generates `ret aom_fft1d_16_<suffix>(const T *input, T *output, int stride)`,
// a 16-point forward transform over values spaced `stride` elements apart.
//
// The even-indexed inputs (i0, i2, ..., i14) are reduced to w0..w18 and the
// odd-indexed inputs (i1, i3, ..., i15) to w19..w37; the sixteen stores then
// combine the two halves using the weights below:
//   kWeight0 = 0         (only used to express negation as sub(kWeight0, x))
//   kWeight2 = 0.707107f (approximately 1/sqrt(2))
//   kWeight3 = 0.92388f  (approximately cos(pi/8))
//   kWeight4 = 0.382683f (approximately sin(pi/8))
// The gaps in the w-numbering (e.g. no w6, w13, w17) are present in the
// original and do not indicate a missing term.
//
// NOTE(review): the two-element arrays (w16, w18, w35, w37) presumably hold
// the real part in [0] and the imaginary part in [1] of an intermediate
// complex value, and outputs 0..8 / 9..15 presumably hold real / imaginary
// parts per the packed layout described for aom_fft_1d_func_t -- confirm.
//
//   ret            Tokens placed before the function name (its return type).
//   suffix         Token pasted onto the generated function name.
//   T              Element type of the input and output pointers.
//   T_VEC          Type of the values produced by load/constant/add/sub/mul.
//   load           load(const T *) yielding a T_VEC.
//   store          store(T *, T_VEC).
//   constant       constant(float literal) yielding a T_VEC.
//   add, sub, mul  Two-operand T_VEC addition, subtraction, multiplication.
#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
178
                   mul)                                                    \
179
0
  ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) {       \
180
0
    const T_VEC kWeight0 = constant(0.0f);                                 \
181
0
    const T_VEC kWeight2 = constant(0.707107f);                            \
182
0
    const T_VEC kWeight3 = constant(0.92388f);                             \
183
0
    const T_VEC kWeight4 = constant(0.382683f);                            \
184
0
    const T_VEC i0 = load(input + 0 * stride);                             \
185
0
    const T_VEC i1 = load(input + 1 * stride);                             \
186
0
    const T_VEC i2 = load(input + 2 * stride);                             \
187
0
    const T_VEC i3 = load(input + 3 * stride);                             \
188
0
    const T_VEC i4 = load(input + 4 * stride);                             \
189
0
    const T_VEC i5 = load(input + 5 * stride);                             \
190
0
    const T_VEC i6 = load(input + 6 * stride);                             \
191
0
    const T_VEC i7 = load(input + 7 * stride);                             \
192
0
    const T_VEC i8 = load(input + 8 * stride);                             \
193
0
    const T_VEC i9 = load(input + 9 * stride);                             \
194
0
    const T_VEC i10 = load(input + 10 * stride);                           \
195
0
    const T_VEC i11 = load(input + 11 * stride);                           \
196
0
    const T_VEC i12 = load(input + 12 * stride);                           \
197
0
    const T_VEC i13 = load(input + 13 * stride);                           \
198
0
    const T_VEC i14 = load(input + 14 * stride);                           \
199
0
    const T_VEC i15 = load(input + 15 * stride);                           \
200
0
    const T_VEC w0 = add(i0, i8);                                          \
201
0
    const T_VEC w1 = sub(i0, i8);                                          \
202
0
    const T_VEC w2 = add(i4, i12);                                         \
203
0
    const T_VEC w3 = sub(i4, i12);                                         \
204
0
    const T_VEC w4 = add(w0, w2);                                          \
205
0
    const T_VEC w5 = sub(w0, w2);                                          \
206
0
    const T_VEC w7 = add(i2, i10);                                         \
207
0
    const T_VEC w8 = sub(i2, i10);                                         \
208
0
    const T_VEC w9 = add(i6, i14);                                         \
209
0
    const T_VEC w10 = sub(i6, i14);                                        \
210
0
    const T_VEC w11 = add(w7, w9);                                         \
211
0
    const T_VEC w12 = sub(w7, w9);                                         \
212
0
    const T_VEC w14 = add(w4, w11);                                        \
213
0
    const T_VEC w15 = sub(w4, w11);                                        \
214
0
    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),           \
215
0
                           sub(sub(kWeight0, w3),                          \
216
0
                               mul(kWeight2, add(w10, w8))) };             \
217
0
    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),           \
218
0
                           sub(w3, mul(kWeight2, add(w10, w8))) };         \
219
0
    const T_VEC w19 = add(i1, i9);                                         \
220
0
    const T_VEC w20 = sub(i1, i9);                                         \
221
0
    const T_VEC w21 = add(i5, i13);                                        \
222
0
    const T_VEC w22 = sub(i5, i13);                                        \
223
0
    const T_VEC w23 = add(w19, w21);                                       \
224
0
    const T_VEC w24 = sub(w19, w21);                                       \
225
0
    const T_VEC w26 = add(i3, i11);                                        \
226
0
    const T_VEC w27 = sub(i3, i11);                                        \
227
0
    const T_VEC w28 = add(i7, i15);                                        \
228
0
    const T_VEC w29 = sub(i7, i15);                                        \
229
0
    const T_VEC w30 = add(w26, w28);                                       \
230
0
    const T_VEC w31 = sub(w26, w28);                                       \
231
0
    const T_VEC w33 = add(w23, w30);                                       \
232
0
    const T_VEC w34 = sub(w23, w30);                                       \
233
0
    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),         \
234
0
                           sub(sub(kWeight0, w22),                         \
235
0
                               mul(kWeight2, add(w29, w27))) };            \
236
0
    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),         \
237
0
                           sub(w22, mul(kWeight2, add(w29, w27))) };       \
238
0
    store(output + 0 * stride, add(w14, w33));                             \
239
0
    store(output + 1 * stride,                                             \
240
0
          add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
241
0
    store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31))));     \
242
0
    store(output + 3 * stride,                                             \
243
0
          add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
244
0
    store(output + 4 * stride, w15);                                       \
245
0
    store(output + 5 * stride,                                             \
246
0
          add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])),            \
247
0
                          mul(kWeight3, w37[1]))));                        \
248
0
    store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31))));     \
249
0
    store(output + 7 * stride,                                             \
250
0
          add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])),            \
251
0
                          mul(kWeight4, w35[1]))));                        \
252
0
    store(output + 8 * stride, sub(w14, w33));                             \
253
0
    store(output + 9 * stride,                                             \
254
0
          add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
255
0
    store(output + 10 * stride,                                            \
256
0
          sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24))));          \
257
0
    store(output + 11 * stride,                                            \
258
0
          add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
259
0
    store(output + 12 * stride, sub(kWeight0, w34));                       \
260
0
    store(output + 13 * stride,                                            \
261
0
          sub(sub(kWeight0, w18[1]),                                       \
262
0
              sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))));         \
263
0
    store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24))));   \
264
0
    store(output + 15 * stride,                                            \
265
0
          sub(sub(kWeight0, w16[1]),                                       \
266
0
              sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))));         \
267
0
  }
268
269
#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
270
                   mul)                                                      \
271
0
  ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) {         \
272
0
    const T_VEC kWeight0 = constant(0.0f);                                   \
273
0
    const T_VEC kWeight2 = constant(0.707107f);                              \
274
0
    const T_VEC kWeight3 = constant(0.92388f);                               \
275
0
    const T_VEC kWeight4 = constant(0.382683f);                              \
276
0
    const T_VEC kWeight5 = constant(0.980785f);                              \
277
0
    const T_VEC kWeight6 = constant(0.19509f);                               \
278
0
    const T_VEC kWeight7 = constant(0.83147f);                               \
279
0
    const T_VEC kWeight8 = constant(0.55557f);                               \
280
0
    const T_VEC i0 = load(input + 0 * stride);                               \
281
0
    const T_VEC i1 = load(input + 1 * stride);                               \
282
0
    const T_VEC i2 = load(input + 2 * stride);                               \
283
0
    const T_VEC i3 = load(input + 3 * stride);                               \
284
0
    const T_VEC i4 = load(input + 4 * stride);                               \
285
0
    const T_VEC i5 = load(input + 5 * stride);                               \
286
0
    const T_VEC i6 = load(input + 6 * stride);                               \
287
0
    const T_VEC i7 = load(input + 7 * stride);                               \
288
0
    const T_VEC i8 = load(input + 8 * stride);                               \
289
0
    const T_VEC i9 = load(input + 9 * stride);                               \
290
0
    const T_VEC i10 = load(input + 10 * stride);                             \
291
0
    const T_VEC i11 = load(input + 11 * stride);                             \
292
0
    const T_VEC i12 = load(input + 12 * stride);                             \
293
0
    const T_VEC i13 = load(input + 13 * stride);                             \
294
0
    const T_VEC i14 = load(input + 14 * stride);                             \
295
0
    const T_VEC i15 = load(input + 15 * stride);                             \
296
0
    const T_VEC i16 = load(input + 16 * stride);                             \
297
0
    const T_VEC i17 = load(input + 17 * stride);                             \
298
0
    const T_VEC i18 = load(input + 18 * stride);                             \
299
0
    const T_VEC i19 = load(input + 19 * stride);                             \
300
0
    const T_VEC i20 = load(input + 20 * stride);                             \
301
0
    const T_VEC i21 = load(input + 21 * stride);                             \
302
0
    const T_VEC i22 = load(input + 22 * stride);                             \
303
0
    const T_VEC i23 = load(input + 23 * stride);                             \
304
0
    const T_VEC i24 = load(input + 24 * stride);                             \
305
0
    const T_VEC i25 = load(input + 25 * stride);                             \
306
0
    const T_VEC i26 = load(input + 26 * stride);                             \
307
0
    const T_VEC i27 = load(input + 27 * stride);                             \
308
0
    const T_VEC i28 = load(input + 28 * stride);                             \
309
0
    const T_VEC i29 = load(input + 29 * stride);                             \
310
0
    const T_VEC i30 = load(input + 30 * stride);                             \
311
0
    const T_VEC i31 = load(input + 31 * stride);                             \
312
0
    const T_VEC w0 = add(i0, i16);                                           \
313
0
    const T_VEC w1 = sub(i0, i16);                                           \
314
0
    const T_VEC w2 = add(i8, i24);                                           \
315
0
    const T_VEC w3 = sub(i8, i24);                                           \
316
0
    const T_VEC w4 = add(w0, w2);                                            \
317
0
    const T_VEC w5 = sub(w0, w2);                                            \
318
0
    const T_VEC w7 = add(i4, i20);                                           \
319
0
    const T_VEC w8 = sub(i4, i20);                                           \
320
0
    const T_VEC w9 = add(i12, i28);                                          \
321
0
    const T_VEC w10 = sub(i12, i28);                                         \
322
0
    const T_VEC w11 = add(w7, w9);                                           \
323
0
    const T_VEC w12 = sub(w7, w9);                                           \
324
0
    const T_VEC w14 = add(w4, w11);                                          \
325
0
    const T_VEC w15 = sub(w4, w11);                                          \
326
0
    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),             \
327
0
                           sub(sub(kWeight0, w3),                            \
328
0
                               mul(kWeight2, add(w10, w8))) };               \
329
0
    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),             \
330
0
                           sub(w3, mul(kWeight2, add(w10, w8))) };           \
331
0
    const T_VEC w19 = add(i2, i18);                                          \
332
0
    const T_VEC w20 = sub(i2, i18);                                          \
333
0
    const T_VEC w21 = add(i10, i26);                                         \
334
0
    const T_VEC w22 = sub(i10, i26);                                         \
335
0
    const T_VEC w23 = add(w19, w21);                                         \
336
0
    const T_VEC w24 = sub(w19, w21);                                         \
337
0
    const T_VEC w26 = add(i6, i22);                                          \
338
0
    const T_VEC w27 = sub(i6, i22);                                          \
339
0
    const T_VEC w28 = add(i14, i30);                                         \
340
0
    const T_VEC w29 = sub(i14, i30);                                         \
341
0
    const T_VEC w30 = add(w26, w28);                                         \
342
0
    const T_VEC w31 = sub(w26, w28);                                         \
343
0
    const T_VEC w33 = add(w23, w30);                                         \
344
0
    const T_VEC w34 = sub(w23, w30);                                         \
345
0
    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),           \
346
0
                           sub(sub(kWeight0, w22),                           \
347
0
                               mul(kWeight2, add(w29, w27))) };              \
348
0
    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),           \
349
0
                           sub(w22, mul(kWeight2, add(w29, w27))) };         \
350
0
    const T_VEC w38 = add(w14, w33);                                         \
351
0
    const T_VEC w39 = sub(w14, w33);                                         \
352
0
    const T_VEC w40[2] = {                                                   \
353
0
      add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))),        \
354
0
      add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))         \
355
0
    };                                                                       \
356
0
    const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))),            \
357
0
                           sub(sub(kWeight0, w12),                           \
358
0
                               mul(kWeight2, add(w31, w24))) };              \
359
0
    const T_VEC w42[2] = {                                                   \
360
0
      add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))),        \
361
0
      add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))         \
362
0
    };                                                                       \
363
0
    const T_VEC w44[2] = {                                                   \
364
0
      add(w18[0],                                                            \
365
0
          sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
366
0
      sub(sub(kWeight0, w18[1]),                                             \
367
0
          sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))                 \
368
0
    };                                                                       \
369
0
    const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))),            \
370
0
                           sub(w12, mul(kWeight2, add(w31, w24))) };         \
371
0
    const T_VEC w46[2] = {                                                   \
372
0
      add(w16[0],                                                            \
373
0
          sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
374
0
      sub(sub(kWeight0, w16[1]),                                             \
375
0
          sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))                 \
376
0
    };                                                                       \
377
0
    const T_VEC w47 = add(i1, i17);                                          \
378
0
    const T_VEC w48 = sub(i1, i17);                                          \
379
0
    const T_VEC w49 = add(i9, i25);                                          \
380
0
    const T_VEC w50 = sub(i9, i25);                                          \
381
0
    const T_VEC w51 = add(w47, w49);                                         \
382
0
    const T_VEC w52 = sub(w47, w49);                                         \
383
0
    const T_VEC w54 = add(i5, i21);                                          \
384
0
    const T_VEC w55 = sub(i5, i21);                                          \
385
0
    const T_VEC w56 = add(i13, i29);                                         \
386
0
    const T_VEC w57 = sub(i13, i29);                                         \
387
0
    const T_VEC w58 = add(w54, w56);                                         \
388
0
    const T_VEC w59 = sub(w54, w56);                                         \
389
0
    const T_VEC w61 = add(w51, w58);                                         \
390
0
    const T_VEC w62 = sub(w51, w58);                                         \
391
0
    const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))),           \
392
0
                           sub(sub(kWeight0, w50),                           \
393
0
                               mul(kWeight2, add(w57, w55))) };              \
394
0
    const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))),           \
395
0
                           sub(w50, mul(kWeight2, add(w57, w55))) };         \
396
0
    const T_VEC w66 = add(i3, i19);                                          \
397
0
    const T_VEC w67 = sub(i3, i19);                                          \
398
0
    const T_VEC w68 = add(i11, i27);                                         \
399
0
    const T_VEC w69 = sub(i11, i27);                                         \
400
0
    const T_VEC w70 = add(w66, w68);                                         \
401
0
    const T_VEC w71 = sub(w66, w68);                                         \
402
0
    const T_VEC w73 = add(i7, i23);                                          \
403
0
    const T_VEC w74 = sub(i7, i23);                                          \
404
0
    const T_VEC w75 = add(i15, i31);                                         \
405
0
    const T_VEC w76 = sub(i15, i31);                                         \
406
0
    const T_VEC w77 = add(w73, w75);                                         \
407
0
    const T_VEC w78 = sub(w73, w75);                                         \
408
0
    const T_VEC w80 = add(w70, w77);                                         \
409
0
    const T_VEC w81 = sub(w70, w77);                                         \
410
0
    const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))),           \
411
0
                           sub(sub(kWeight0, w69),                           \
412
0
                               mul(kWeight2, add(w76, w74))) };              \
413
0
    const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))),           \
414
0
                           sub(w69, mul(kWeight2, add(w76, w74))) };         \
415
0
    const T_VEC w85 = add(w61, w80);                                         \
416
0
    const T_VEC w86 = sub(w61, w80);                                         \
417
0
    const T_VEC w87[2] = {                                                   \
418
0
      add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))),        \
419
0
      add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0])))         \
420
0
    };                                                                       \
421
0
    const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))),           \
422
0
                           sub(sub(kWeight0, w59),                           \
423
0
                               mul(kWeight2, add(w78, w71))) };              \
424
0
    const T_VEC w89[2] = {                                                   \
425
0
      add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))),        \
426
0
      add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0])))         \
427
0
    };                                                                       \
428
0
    const T_VEC w91[2] = {                                                   \
429
0
      add(w65[0],                                                            \
430
0
          sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
431
0
      sub(sub(kWeight0, w65[1]),                                             \
432
0
          sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1])))                 \
433
0
    };                                                                       \
434
0
    const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))),           \
435
0
                           sub(w59, mul(kWeight2, add(w78, w71))) };         \
436
0
    const T_VEC w93[2] = {                                                   \
437
0
      add(w63[0],                                                            \
438
0
          sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
439
0
      sub(sub(kWeight0, w63[1]),                                             \
440
0
          sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1])))                 \
441
0
    };                                                                       \
442
0
    store(output + 0 * stride, add(w38, w85));                               \
443
0
    store(output + 1 * stride,                                               \
444
0
          add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1]))));   \
445
0
    store(output + 2 * stride,                                               \
446
0
          add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1]))));   \
447
0
    store(output + 3 * stride,                                               \
448
0
          add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1]))));   \
449
0
    store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81))));      \
450
0
    store(output + 5 * stride,                                               \
451
0
          add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1]))));   \
452
0
    store(output + 6 * stride,                                               \
453
0
          add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1]))));   \
454
0
    store(output + 7 * stride,                                               \
455
0
          add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1]))));   \
456
0
    store(output + 8 * stride, w39);                                         \
457
0
    store(output + 9 * stride,                                               \
458
0
          add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])),              \
459
0
                          mul(kWeight5, w93[1]))));                          \
460
0
    store(output + 10 * stride,                                              \
461
0
          add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])),              \
462
0
                          mul(kWeight3, w92[1]))));                          \
463
0
    store(output + 11 * stride,                                              \
464
0
          add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])),              \
465
0
                          mul(kWeight7, w91[1]))));                          \
466
0
    store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81))));     \
467
0
    store(output + 13 * stride,                                              \
468
0
          add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])),              \
469
0
                          mul(kWeight8, w89[1]))));                          \
470
0
    store(output + 14 * stride,                                              \
471
0
          add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])),              \
472
0
                          mul(kWeight4, w88[1]))));                          \
473
0
    store(output + 15 * stride,                                              \
474
0
          add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])),              \
475
0
                          mul(kWeight6, w87[1]))));                          \
476
0
    store(output + 16 * stride, sub(w38, w85));                              \
477
0
    store(output + 17 * stride,                                              \
478
0
          add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0]))));   \
479
0
    store(output + 18 * stride,                                              \
480
0
          add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0]))));   \
481
0
    store(output + 19 * stride,                                              \
482
0
          add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0]))));   \
483
0
    store(output + 20 * stride,                                              \
484
0
          sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62))));            \
485
0
    store(output + 21 * stride,                                              \
486
0
          add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0]))));   \
487
0
    store(output + 22 * stride,                                              \
488
0
          add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0]))));   \
489
0
    store(output + 23 * stride,                                              \
490
0
          add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0]))));   \
491
0
    store(output + 24 * stride, sub(kWeight0, w86));                         \
492
0
    store(output + 25 * stride,                                              \
493
0
          sub(sub(kWeight0, w46[1]),                                         \
494
0
              sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1]))));           \
495
0
    store(output + 26 * stride,                                              \
496
0
          sub(sub(kWeight0, w45[1]),                                         \
497
0
              sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1]))));           \
498
0
    store(output + 27 * stride,                                              \
499
0
          sub(sub(kWeight0, w44[1]),                                         \
500
0
              sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1]))));           \
501
0
    store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62))));     \
502
0
    store(output + 29 * stride,                                              \
503
0
          sub(sub(kWeight0, w42[1]),                                         \
504
0
              sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1]))));           \
505
0
    store(output + 30 * stride,                                              \
506
0
          sub(sub(kWeight0, w41[1]),                                         \
507
0
              sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1]))));           \
508
0
    store(output + 31 * stride,                                              \
509
0
          sub(sub(kWeight0, w40[1]),                                         \
510
0
              sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1]))));           \
511
0
  }
512
513
/*!\brief Generates a 2-point inverse transform named aom_ifft1d_2_<suffix>.
 *
 * The generated function reads two values, input[0] and input[stride]
 * (r_0 and r_1 in the packed layout documented at the top of this header),
 * and writes their sum to output[0] and their difference to output[stride].
 * No 1/2 scaling is applied.
 *
 * Unlike the larger sizes, the body uses the built-in + and - operators
 * directly on T_VEC, so T_VEC must be a type for which those operators are
 * defined.
 *
 * \param ret     Tokens placed before the function name (return type plus any
 *                specifiers). The body has no return statement, so this is
 *                expected to be a void type.
 * \param suffix  Token pasted onto the function name.
 * \param T       Element type of the input and output buffers.
 * \param T_VEC   Type produced by load and consumed by store.
 * \param load    load(const T *) -> T_VEC
 * \param store   store(T *, T_VEC)
 */
#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store)               \
  ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
    const T_VEC i0 = load(input + 0 * stride);                       \
    const T_VEC i1 = load(input + 1 * stride);                       \
    store(output + 0 * stride, i0 + i1);                             \
    store(output + 1 * stride, i0 - i1);                             \
  }
520
521
/*!\brief Generates a 4-point inverse transform named aom_ifft1d_4_<suffix>.
 *
 * The generated function loads input[k * stride] for k = 0..3 (r_0, r_1, r_2
 * and i_1 in the packed layout documented at the top of this header) and
 * stores four real results at output[k * stride]:
 *
 *   out0 = (i0 + i2) + (i1 + i1)
 *   out1 = (i0 - i2) + ((0 - i3) - i3)
 *   out2 = (i0 + i2) - (i1 + i1)
 *   out3 = (i0 - i2) - ((0 - i3) - i3)
 *
 * No 1/4 scaling is applied. kWeight0 is the zero constant used to form
 * negations through sub(). The elements w4[1] and w5[0] are computed but
 * never read; they are kept so the body matches the shape of the larger
 * transform sizes.
 *
 * \param ret       Tokens placed before the function name (return type plus
 *                  any specifiers). The body has no return statement, so this
 *                  is expected to be a void type.
 * \param suffix    Token pasted onto the function name.
 * \param T         Element type of the input and output buffers.
 * \param T_VEC     Type produced by load/constant and consumed by store.
 * \param load      load(const T *) -> T_VEC
 * \param store     store(T *, T_VEC)
 * \param constant  constant(float literal) -> T_VEC
 * \param add       add(T_VEC, T_VEC) -> T_VEC
 * \param sub       sub(T_VEC, T_VEC) -> T_VEC
 */
#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
  ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) {       \
    const T_VEC kWeight0 = constant(0.0f);                                 \
    const T_VEC i0 = load(input + 0 * stride);                             \
    const T_VEC i1 = load(input + 1 * stride);                             \
    const T_VEC i2 = load(input + 2 * stride);                             \
    const T_VEC i3 = load(input + 3 * stride);                             \
    const T_VEC w2 = add(i0, i2);                                          \
    const T_VEC w3 = sub(i0, i2);                                          \
    const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) };                      \
    const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) };       \
    store(output + 0 * stride, add(w2, w4[0]));                            \
    store(output + 1 * stride, add(w3, w5[1]));                            \
    store(output + 2 * stride, sub(w2, w4[0]));                            \
    store(output + 3 * stride, sub(w3, w5[1]));                            \
  }
537
538
/*!\brief Generates an 8-point inverse transform named aom_ifft1d_8_<suffix>.
 *
 * The generated function loads input[k * stride] for k = 0..7, interpreted in
 * the packed layout documented at the top of this header (indices 0..4 hold
 * r_0..r_4, indices 5..7 hold i_1..i_3), and stores eight real results at
 * output[k * stride]. No 1/8 scaling is applied.
 *
 * kWeight0 is the zero constant used to form negations through sub();
 * kWeight2 (0.707107f) approximates sqrt(2)/2, the twiddle magnitude for the
 * odd-indexed outputs. Several w*[1] / w*[0] elements are computed without
 * being read later; the expressions are kept exactly as written so the
 * floating-point evaluation order is unchanged.
 *
 * \param ret       Tokens placed before the function name (return type plus
 *                  any specifiers). The body has no return statement, so this
 *                  is expected to be a void type.
 * \param suffix    Token pasted onto the function name.
 * \param T         Element type of the input and output buffers.
 * \param T_VEC     Type produced by load/constant and consumed by store.
 * \param load      load(const T *) -> T_VEC
 * \param store     store(T *, T_VEC)
 * \param constant  constant(float literal) -> T_VEC
 * \param add       add(T_VEC, T_VEC) -> T_VEC
 * \param sub       sub(T_VEC, T_VEC) -> T_VEC
 * \param mul       mul(T_VEC, T_VEC) -> T_VEC
 */
#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                   mul)                                                    \
  ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) {       \
    const T_VEC kWeight0 = constant(0.0f);                                 \
    const T_VEC kWeight2 = constant(0.707107f);                            \
    const T_VEC i0 = load(input + 0 * stride);                             \
    const T_VEC i1 = load(input + 1 * stride);                             \
    const T_VEC i2 = load(input + 2 * stride);                             \
    const T_VEC i3 = load(input + 3 * stride);                             \
    const T_VEC i4 = load(input + 4 * stride);                             \
    const T_VEC i5 = load(input + 5 * stride);                             \
    const T_VEC i6 = load(input + 6 * stride);                             \
    const T_VEC i7 = load(input + 7 * stride);                             \
    const T_VEC w6 = add(i0, i4);                                          \
    const T_VEC w7 = sub(i0, i4);                                          \
    const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) };                      \
    const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) };       \
    const T_VEC w10[2] = { add(w6, w8[0]), w8[1] };                        \
    const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) };         \
    const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) };         \
    const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] };                        \
    const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) };                     \
    const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) };      \
    const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) };                     \
    const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) };      \
    const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) };     \
    const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) };     \
    const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) };     \
    const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) };     \
    store(output + 0 * stride, add(w10[0], w18[0]));                       \
    store(output + 1 * stride,                                             \
          add(w12[0], mul(kWeight2, add(w20[0], w20[1]))));                \
    store(output + 2 * stride, add(w11[0], w19[1]));                       \
    store(output + 3 * stride,                                             \
          sub(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
    store(output + 4 * stride, sub(w10[0], w18[0]));                       \
    store(output + 5 * stride,                                             \
          add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])),            \
                          mul(kWeight2, w20[1]))));                        \
    store(output + 6 * stride, sub(w11[0], w19[1]));                       \
    store(output + 7 * stride,                                             \
          add(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
  }
581
582
/*!\brief Generates a 16-point inverse transform named aom_ifft1d_16_<suffix>.
 *
 * The generated function loads input[k * stride] for k = 0..15, interpreted
 * in the packed layout documented at the top of this header (indices 0..8
 * hold r_0..r_8, indices 9..15 hold i_1..i_7), and stores sixteen real
 * results at output[k * stride]. The body contains no scaling step, so the
 * result is not divided by 16.
 *
 * Constants: kWeight0 is zero (used to form negations through sub());
 * kWeight2 (0.707107f) approximates sqrt(2)/2; kWeight3 (0.92388f) and
 * kWeight4 (0.382683f) approximate cos(pi/8) and sin(pi/8). Expressions are
 * kept exactly as written so the floating-point evaluation order is
 * unchanged.
 *
 * \param ret       Tokens placed before the function name (return type plus
 *                  any specifiers). The body has no return statement, so this
 *                  is expected to be a void type.
 * \param suffix    Token pasted onto the function name.
 * \param T         Element type of the input and output buffers.
 * \param T_VEC     Type produced by load/constant and consumed by store.
 * \param load      load(const T *) -> T_VEC
 * \param store     store(T *, T_VEC)
 * \param constant  constant(float literal) -> T_VEC
 * \param add       add(T_VEC, T_VEC) -> T_VEC
 * \param sub       sub(T_VEC, T_VEC) -> T_VEC
 * \param mul       mul(T_VEC, T_VEC) -> T_VEC
 */
#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
                    mul)                                                      \
  ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) {         \
    const T_VEC kWeight0 = constant(0.0f);                                    \
    const T_VEC kWeight2 = constant(0.707107f);                               \
    const T_VEC kWeight3 = constant(0.92388f);                                \
    const T_VEC kWeight4 = constant(0.382683f);                               \
    const T_VEC i0 = load(input + 0 * stride);                                \
    const T_VEC i1 = load(input + 1 * stride);                                \
    const T_VEC i2 = load(input + 2 * stride);                                \
    const T_VEC i3 = load(input + 3 * stride);                                \
    const T_VEC i4 = load(input + 4 * stride);                                \
    const T_VEC i5 = load(input + 5 * stride);                                \
    const T_VEC i6 = load(input + 6 * stride);                                \
    const T_VEC i7 = load(input + 7 * stride);                                \
    const T_VEC i8 = load(input + 8 * stride);                                \
    const T_VEC i9 = load(input + 9 * stride);                                \
    const T_VEC i10 = load(input + 10 * stride);                              \
    const T_VEC i11 = load(input + 11 * stride);                              \
    const T_VEC i12 = load(input + 12 * stride);                              \
    const T_VEC i13 = load(input + 13 * stride);                              \
    const T_VEC i14 = load(input + 14 * stride);                              \
    const T_VEC i15 = load(input + 15 * stride);                              \
    const T_VEC w14 = add(i0, i8);                                            \
    const T_VEC w15 = sub(i0, i8);                                            \
    const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) };                      \
    const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) };       \
    const T_VEC w18[2] = { add(w14, w16[0]), w16[1] };                        \
    const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) };         \
    const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) };         \
    const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] };                        \
    const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) };                      \
    const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) };       \
    const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) };                      \
    const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) };       \
    const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) };        \
    const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) };        \
    const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) };        \
    const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) };        \
    const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) };        \
    const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) };        \
    const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))),   \
                           add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
    const T_VEC w33[2] = { add(w20[0],                                        \
                               sub(sub(kWeight0, mul(kWeight2, w28[0])),      \
                                   mul(kWeight2, w28[1]))),                   \
                           add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
    const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) };        \
    const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) };        \
    const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
                           sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
    const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
                           add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
    const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) };                       \
    const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) };        \
    const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) };                      \
    const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) };       \
    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };        \
    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };        \
    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };        \
    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };        \
    const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) };                      \
    const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) };       \
    const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) };                       \
    const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) };        \
    const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) };        \
    const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) };        \
    const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) };        \
    const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) };        \
    const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) };        \
    const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) };        \
    const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))),   \
                           add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
    const T_VEC w57[2] = { add(w44[0],                                        \
                               sub(sub(kWeight0, mul(kWeight2, w52[0])),      \
                                   mul(kWeight2, w52[1]))),                   \
                           add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
    const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) };        \
    const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) };        \
    const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
                           sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
    const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
                           add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
    store(output + 0 * stride, add(w30[0], w54[0]));                          \
    store(output + 1 * stride,                                                \
          add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1]))));    \
    store(output + 2 * stride,                                                \
          add(w34[0], mul(kWeight2, add(w58[0], w58[1]))));                   \
    store(output + 3 * stride,                                                \
          add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1]))));    \
    store(output + 4 * stride, add(w31[0], w55[1]));                          \
    store(output + 5 * stride,                                                \
          sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
    store(output + 6 * stride,                                                \
          sub(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
    store(output + 7 * stride,                                                \
          sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
    store(output + 8 * stride, sub(w30[0], w54[0]));                          \
    store(output + 9 * stride,                                                \
          add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])),               \
                          mul(kWeight4, w56[1]))));                           \
    store(output + 10 * stride,                                               \
          add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])),               \
                          mul(kWeight2, w58[1]))));                           \
    store(output + 11 * stride,                                               \
          add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])),               \
                          mul(kWeight3, w60[1]))));                           \
    store(output + 12 * stride, sub(w31[0], w55[1]));                         \
    store(output + 13 * stride,                                               \
          add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
    store(output + 14 * stride,                                               \
          add(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
    store(output + 15 * stride,                                               \
          add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
  }
697
#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,    \
698
                    mul)                                                       \
699
0
  ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) {          \
700
0
    const T_VEC kWeight0 = constant(0.0f);                                     \
701
0
    const T_VEC kWeight2 = constant(0.707107f);                                \
702
0
    const T_VEC kWeight3 = constant(0.92388f);                                 \
703
0
    const T_VEC kWeight4 = constant(0.382683f);                                \
704
0
    const T_VEC kWeight5 = constant(0.980785f);                                \
705
0
    const T_VEC kWeight6 = constant(0.19509f);                                 \
706
0
    const T_VEC kWeight7 = constant(0.83147f);                                 \
707
0
    const T_VEC kWeight8 = constant(0.55557f);                                 \
708
0
    const T_VEC i0 = load(input + 0 * stride);                                 \
709
0
    const T_VEC i1 = load(input + 1 * stride);                                 \
710
0
    const T_VEC i2 = load(input + 2 * stride);                                 \
711
0
    const T_VEC i3 = load(input + 3 * stride);                                 \
712
0
    const T_VEC i4 = load(input + 4 * stride);                                 \
713
0
    const T_VEC i5 = load(input + 5 * stride);                                 \
714
0
    const T_VEC i6 = load(input + 6 * stride);                                 \
715
0
    const T_VEC i7 = load(input + 7 * stride);                                 \
716
0
    const T_VEC i8 = load(input + 8 * stride);                                 \
717
0
    const T_VEC i9 = load(input + 9 * stride);                                 \
718
0
    const T_VEC i10 = load(input + 10 * stride);                               \
719
0
    const T_VEC i11 = load(input + 11 * stride);                               \
720
0
    const T_VEC i12 = load(input + 12 * stride);                               \
721
0
    const T_VEC i13 = load(input + 13 * stride);                               \
722
0
    const T_VEC i14 = load(input + 14 * stride);                               \
723
0
    const T_VEC i15 = load(input + 15 * stride);                               \
724
0
    const T_VEC i16 = load(input + 16 * stride);                               \
725
0
    const T_VEC i17 = load(input + 17 * stride);                               \
726
0
    const T_VEC i18 = load(input + 18 * stride);                               \
727
0
    const T_VEC i19 = load(input + 19 * stride);                               \
728
0
    const T_VEC i20 = load(input + 20 * stride);                               \
729
0
    const T_VEC i21 = load(input + 21 * stride);                               \
730
0
    const T_VEC i22 = load(input + 22 * stride);                               \
731
0
    const T_VEC i23 = load(input + 23 * stride);                               \
732
0
    const T_VEC i24 = load(input + 24 * stride);                               \
733
0
    const T_VEC i25 = load(input + 25 * stride);                               \
734
0
    const T_VEC i26 = load(input + 26 * stride);                               \
735
0
    const T_VEC i27 = load(input + 27 * stride);                               \
736
0
    const T_VEC i28 = load(input + 28 * stride);                               \
737
0
    const T_VEC i29 = load(input + 29 * stride);                               \
738
0
    const T_VEC i30 = load(input + 30 * stride);                               \
739
0
    const T_VEC i31 = load(input + 31 * stride);                               \
740
0
    const T_VEC w30 = add(i0, i16);                                            \
741
0
    const T_VEC w31 = sub(i0, i16);                                            \
742
0
    const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) };                       \
743
0
    const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) };        \
744
0
    const T_VEC w34[2] = { add(w30, w32[0]), w32[1] };                         \
745
0
    const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) };          \
746
0
    const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) };          \
747
0
    const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] };                         \
748
0
    const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) };                      \
749
0
    const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) };       \
750
0
    const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) };                      \
751
0
    const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) };       \
752
0
    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };         \
753
0
    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };         \
754
0
    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };         \
755
0
    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };         \
756
0
    const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) };         \
757
0
    const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) };         \
758
0
    const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))),    \
759
0
                           add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) };  \
760
0
    const T_VEC w49[2] = { add(w36[0],                                         \
761
0
                               sub(sub(kWeight0, mul(kWeight2, w44[0])),       \
762
0
                                   mul(kWeight2, w44[1]))),                    \
763
0
                           add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) };  \
764
0
    const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) };         \
765
0
    const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) };         \
766
0
    const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
767
0
                           sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
768
0
    const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
769
0
                           add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
770
0
    const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) };                      \
771
0
    const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) };       \
772
0
    const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) };                      \
773
0
    const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) };       \
774
0
    const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) };         \
775
0
    const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) };         \
776
0
    const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) };         \
777
0
    const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) };         \
778
0
    const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) };                      \
779
0
    const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) };       \
780
0
    const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) };                      \
781
0
    const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) };       \
782
0
    const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) };         \
783
0
    const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) };         \
784
0
    const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) };         \
785
0
    const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) };         \
786
0
    const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) };         \
787
0
    const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) };         \
788
0
    const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))),    \
789
0
                           add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) };  \
790
0
    const T_VEC w73[2] = { add(w60[0],                                         \
791
0
                               sub(sub(kWeight0, mul(kWeight2, w68[0])),       \
792
0
                                   mul(kWeight2, w68[1]))),                    \
793
0
                           add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) };  \
794
0
    const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) };         \
795
0
    const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) };         \
796
0
    const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
797
0
                           sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
798
0
    const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
799
0
                           add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
800
0
    const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) };         \
801
0
    const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) };         \
802
0
    const T_VEC w80[2] = {                                                     \
803
0
      add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))),          \
804
0
      add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0])))           \
805
0
    };                                                                         \
806
0
    const T_VEC w81[2] = {                                                     \
807
0
      add(w48[0],                                                              \
808
0
          sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))),   \
809
0
      add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1])))           \
810
0
    };                                                                         \
811
0
    const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))),    \
812
0
                           add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) };  \
813
0
    const T_VEC w83[2] = { add(w50[0],                                         \
814
0
                               sub(sub(kWeight0, mul(kWeight2, w74[0])),       \
815
0
                                   mul(kWeight2, w74[1]))),                    \
816
0
                           add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) };  \
817
0
    const T_VEC w84[2] = {                                                     \
818
0
      add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))),          \
819
0
      add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0])))           \
820
0
    };                                                                         \
821
0
    const T_VEC w85[2] = {                                                     \
822
0
      add(w52[0],                                                              \
823
0
          sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))),   \
824
0
      add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1])))           \
825
0
    };                                                                         \
826
0
    const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) };         \
827
0
    const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) };         \
828
0
    const T_VEC w88[2] = {                                                     \
829
0
      sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
830
0
      add(w49[1],                                                              \
831
0
          sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0])))    \
832
0
    };                                                                         \
833
0
    const T_VEC w89[2] = {                                                     \
834
0
      add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
835
0
      add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0])))           \
836
0
    };                                                                         \
837
0
    const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
838
0
                           sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
839
0
    const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
840
0
                           add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
841
0
    const T_VEC w92[2] = {                                                     \
842
0
      sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
843
0
      add(w53[1],                                                              \
844
0
          sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0])))    \
845
0
    };                                                                         \
846
0
    const T_VEC w93[2] = {                                                     \
847
0
      add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
848
0
      add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0])))           \
849
0
    };                                                                         \
850
0
    const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) };                      \
851
0
    const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) };       \
852
0
    const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) };                       \
853
0
    const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) };        \
854
0
    const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) };         \
855
0
    const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) };         \
856
0
    const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) };        \
857
0
    const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) };        \
858
0
    const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) };                     \
859
0
    const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) };      \
860
0
    const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) };                     \
861
0
    const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) };      \
862
0
    const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) };    \
863
0
    const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) };    \
864
0
    const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) };    \
865
0
    const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) };    \
866
0
    const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) };      \
867
0
    const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) };      \
868
0
    const T_VEC w112[2] = {                                                    \
869
0
      add(w100[0], mul(kWeight2, add(w108[0], w108[1]))),                      \
870
0
      add(w100[1], mul(kWeight2, sub(w108[1], w108[0])))                       \
871
0
    };                                                                         \
872
0
    const T_VEC w113[2] = {                                                    \
873
0
      add(w100[0],                                                             \
874
0
          sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
875
0
      add(w100[1], mul(kWeight2, sub(w108[0], w108[1])))                       \
876
0
    };                                                                         \
877
0
    const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) };      \
878
0
    const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) };      \
879
0
    const T_VEC w116[2] = {                                                    \
880
0
      sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
881
0
      sub(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
882
0
    };                                                                         \
883
0
    const T_VEC w117[2] = {                                                    \
884
0
      add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
885
0
      add(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
886
0
    };                                                                         \
887
0
    const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) };                     \
888
0
    const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) };      \
889
0
    const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) };                     \
890
0
    const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) };      \
891
0
    const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) };    \
892
0
    const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) };    \
893
0
    const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) };    \
894
0
    const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) };    \
895
0
    const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) };                      \
896
0
    const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) };       \
897
0
    const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) };                     \
898
0
    const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) };      \
899
0
    const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) };    \
900
0
    const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) };    \
901
0
    const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) };    \
902
0
    const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) };    \
903
0
    const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) };    \
904
0
    const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) };    \
905
0
    const T_VEC w136[2] = {                                                    \
906
0
      add(w124[0], mul(kWeight2, add(w132[0], w132[1]))),                      \
907
0
      add(w124[1], mul(kWeight2, sub(w132[1], w132[0])))                       \
908
0
    };                                                                         \
909
0
    const T_VEC w137[2] = {                                                    \
910
0
      add(w124[0],                                                             \
911
0
          sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
912
0
      add(w124[1], mul(kWeight2, sub(w132[0], w132[1])))                       \
913
0
    };                                                                         \
914
0
    const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) };    \
915
0
    const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) };    \
916
0
    const T_VEC w140[2] = {                                                    \
917
0
      sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
918
0
      sub(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
919
0
    };                                                                         \
920
0
    const T_VEC w141[2] = {                                                    \
921
0
      add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
922
0
      add(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
923
0
    };                                                                         \
924
0
    const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) };    \
925
0
    const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) };    \
926
0
    const T_VEC w144[2] = {                                                    \
927
0
      add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))),       \
928
0
      add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0])))        \
929
0
    };                                                                         \
930
0
    const T_VEC w145[2] = {                                                    \
931
0
      add(w112[0],                                                             \
932
0
          sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
933
0
      add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1])))        \
934
0
    };                                                                         \
935
0
    const T_VEC w146[2] = {                                                    \
936
0
      add(w114[0], mul(kWeight2, add(w138[0], w138[1]))),                      \
937
0
      add(w114[1], mul(kWeight2, sub(w138[1], w138[0])))                       \
938
0
    };                                                                         \
939
0
    const T_VEC w147[2] = {                                                    \
940
0
      add(w114[0],                                                             \
941
0
          sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
942
0
      add(w114[1], mul(kWeight2, sub(w138[0], w138[1])))                       \
943
0
    };                                                                         \
944
0
    const T_VEC w148[2] = {                                                    \
945
0
      add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))),       \
946
0
      add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0])))        \
947
0
    };                                                                         \
948
0
    const T_VEC w149[2] = {                                                    \
949
0
      add(w116[0],                                                             \
950
0
          sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
951
0
      add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1])))        \
952
0
    };                                                                         \
953
0
    const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) };    \
954
0
    const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) };    \
955
0
    const T_VEC w152[2] = {                                                    \
956
0
      sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
957
0
      add(w113[1],                                                             \
958
0
          sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0])))  \
959
0
    };                                                                         \
960
0
    const T_VEC w153[2] = {                                                    \
961
0
      add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
962
0
      add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0])))        \
963
0
    };                                                                         \
964
0
    const T_VEC w154[2] = {                                                    \
965
0
      sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
966
0
      sub(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
967
0
    };                                                                         \
968
0
    const T_VEC w155[2] = {                                                    \
969
0
      add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
970
0
      add(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
971
0
    };                                                                         \
972
0
    const T_VEC w156[2] = {                                                    \
973
0
      sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
974
0
      add(w117[1],                                                             \
975
0
          sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0])))  \
976
0
    };                                                                         \
977
0
    const T_VEC w157[2] = {                                                    \
978
0
      add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
979
0
      add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0])))        \
980
0
    };                                                                         \
981
0
    store(output + 0 * stride, add(w78[0], w142[0]));                          \
982
0
    store(output + 1 * stride,                                                 \
983
0
          add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1]))));   \
984
0
    store(output + 2 * stride,                                                 \
985
0
          add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1]))));   \
986
0
    store(output + 3 * stride,                                                 \
987
0
          add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1]))));   \
988
0
    store(output + 4 * stride,                                                 \
989
0
          add(w86[0], mul(kWeight2, add(w150[0], w150[1]))));                  \
990
0
    store(output + 5 * stride,                                                 \
991
0
          add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1]))));   \
992
0
    store(output + 6 * stride,                                                 \
993
0
          add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1]))));   \
994
0
    store(output + 7 * stride,                                                 \
995
0
          add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1]))));   \
996
0
    store(output + 8 * stride, add(w79[0], w143[1]));                          \
997
0
    store(output + 9 * stride,                                                 \
998
0
          sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
999
0
    store(output + 10 * stride,                                                \
1000
0
          sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
1001
0
    store(output + 11 * stride,                                                \
1002
0
          sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
1003
0
    store(output + 12 * stride,                                                \
1004
0
          sub(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
1005
0
    store(output + 13 * stride,                                                \
1006
0
          sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
1007
0
    store(output + 14 * stride,                                                \
1008
0
          sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
1009
0
    store(output + 15 * stride,                                                \
1010
0
          sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
1011
0
    store(output + 16 * stride, sub(w78[0], w142[0]));                         \
1012
0
    store(output + 17 * stride,                                                \
1013
0
          add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])),               \
1014
0
                          mul(kWeight6, w144[1]))));                           \
1015
0
    store(output + 18 * stride,                                                \
1016
0
          add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])),               \
1017
0
                          mul(kWeight4, w146[1]))));                           \
1018
0
    store(output + 19 * stride,                                                \
1019
0
          add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])),               \
1020
0
                          mul(kWeight8, w148[1]))));                           \
1021
0
    store(output + 20 * stride,                                                \
1022
0
          add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])),               \
1023
0
                          mul(kWeight2, w150[1]))));                           \
1024
0
    store(output + 21 * stride,                                                \
1025
0
          add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])),               \
1026
0
                          mul(kWeight7, w152[1]))));                           \
1027
0
    store(output + 22 * stride,                                                \
1028
0
          add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])),               \
1029
0
                          mul(kWeight3, w154[1]))));                           \
1030
0
    store(output + 23 * stride,                                                \
1031
0
          add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])),               \
1032
0
                          mul(kWeight5, w156[1]))));                           \
1033
0
    store(output + 24 * stride, sub(w79[0], w143[1]));                         \
1034
0
    store(output + 25 * stride,                                                \
1035
0
          add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
1036
0
    store(output + 26 * stride,                                                \
1037
0
          add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
1038
0
    store(output + 27 * stride,                                                \
1039
0
          add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
1040
0
    store(output + 28 * stride,                                                \
1041
0
          add(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
1042
0
    store(output + 29 * stride,                                                \
1043
0
          add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
1044
0
    store(output + 30 * stride,                                                \
1045
0
          add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
1046
0
    store(output + 31 * stride,                                                \
1047
0
          add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
1048
0
  }
1049
1050
#endif  // AOM_AOM_DSP_FFT_COMMON_H_