Coverage Report

Created: 2026-05-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/svt-av1/Source/Lib/Codec/fft_common.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10
 */
11
12
#ifndef AOM_AOM_DSP_FFT_COMMON_H_
13
#define AOM_AOM_DSP_FFT_COMMON_H_
14
15
#ifdef __cplusplus
16
extern "C" {
17
#endif
18
19
/*!\brief A function pointer for computing 1d fft and ifft.
20
     *
21
     * The function will point to an implementation for a specific transform size,
22
     * and may perform the transforms using vectorized instructions.
23
     *
24
     * For non-vectorized forward transforms of size n, the input and output
25
     * buffers will be size n. The output takes advantage of conjugate symmetry and
26
     * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
27
     * (r_{j}, i_{j}) is the complex output for index j.
28
     *
29
     * An inverse transform will assume that the complex "input" is packed
30
     * similarly. Its output will be real.
31
     *
32
     * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
33
     *
34
     * Vectorized implementations are parallelized along the columns so that the fft
35
     * can be performed on multiple columns at a time. In such cases the data block
36
     * for input and output is typically square (n x n) and the stride will
37
     * correspond to the spacing between rows. At minimum, the input size must be
38
     * n x simd_vector_length.
39
     *
40
     * \param[in]  input   Input buffer. See above for size restrictions.
41
     * \param[out] output  Output buffer. See above for size restrictions.
42
     * \param[in]  stride  The spacing in number of elements between rows
43
     *                     (or elements)
44
     */
45
typedef void (*AomFft1dFunc)(const float* input, float* output, int32_t stride);
46
47
// Declare some of the forward non-vectorized transforms which are used in some
48
// of the vectorized implementations
49
void svt_aom_fft1d_4_float(const float* input, float* output, int32_t stride);
50
void svt_aom_fft1d_8_float(const float* input, float* output, int32_t stride);
51
void svt_aom_fft1d_16_float(const float* input, float* output, int32_t stride);
52
void svt_aom_fft1d_32_float(const float* input, float* output, int32_t stride);
53
54
/*!\brief Function pointer for transposing a matrix of floats.
55
     *
56
     * \param[in]  input  Input buffer (size n x n)
57
     * \param[out] output Output buffer (size n x n)
58
     * \param[in]  n      Extent of one dimension of the square matrix.
59
     */
60
typedef void (*AomFftTransposeFunc)(const float* input, float* output, int32_t n);
61
62
/*!\brief Function pointer for re-arranging intermediate 2d transform results.
63
     *
64
     * After re-arrangement, the real and imaginary components will be packed
65
     * tightly next to each other.
66
     *
67
     * \param[in]  input  Input buffer (size n x n)
68
     * \param[out] output Output buffer (size 2 x n x n)
69
     * \param[in]  n      Extent of one dimension of the square matrix.
70
     */
71
typedef void (*AomFftUnpackFunc)(const float* input, float* output, int32_t n);
72
73
/*!\brief Performs a 2d fft with the given functions.
74
     *
75
     * This generator function allows for multiple different implementations of 2d
76
     * fft with different vector operations, without having to redefine the main
77
     * body multiple times.
78
     *
79
     * \param[in]  input     Input buffer to run the transform on (size n x n)
80
     * \param[out] temp      Working buffer for computing the transform (size n x n)
81
     * \param[out] output    Output buffer (size 2 x n x n)
     * \param[in]  n         Extent of one dimension of the square input (n x n)
82
     * \param[in]  tform     Forward transform function
83
     * \param[in]  transpose Transpose function (for n x n matrix)
84
     * \param[in]  unpack    Unpack function used to massage outputs to correct form
85
     * \param[in]  vec_size  Vector size (the transform is done vec_size units at
86
     *                       a time)
87
     */
88
void svt_aom_fft_2d_gen(const float* input, float* temp, float* output, int32_t n, AomFft1dFunc tform,
89
                        AomFftTransposeFunc transpose, AomFftUnpackFunc unpack, int32_t vec_size);
90
91
/*!\brief Perform a 2d inverse fft with the given helper functions
92
     *
93
     * \param[in]  input      Input buffer to run the transform on (size 2 x n x n)
94
     * \param[out] temp       Working buffer for computations (size 2 x n x n)
95
     * \param[out] output     Output buffer (size n x n)
     * \param[in]  n          Extent of one dimension of the square output (n x n)
96
     * \param[in]  fft_single Forward transform function (non vectorized)
97
     * \param[in]  fft_multi  Forward transform function (vectorized)
98
     * \param[in]  ifft_multi Inverse transform function (vectorized)
99
     * \param[in]  transpose  Transpose function (for n x n matrix)
100
     * \param[in]  vec_size   Vector size (the transform is done vec_size
101
     *                        units at a time)
102
     */
103
void svt_aom_ifft_2d_gen(const float* input, float* temp, float* output, int32_t n, AomFft1dFunc fft_single,
104
                         AomFft1dFunc fft_multi, AomFft1dFunc ifft_multi, AomFftTransposeFunc transpose,
105
                         int32_t vec_size);
106
#ifdef __cplusplus
107
}
108
#endif
109
110
// The macros below define 1D fft/ifft for different data types and for
111
// different simd vector intrinsic types.
112
113
// GEN_FFT_2: expands to the definition of
//   ret svt_aom_fft1d_2_<suffix>(const T* input, T* output, int32_t stride)
// which processes two elements spaced `stride` apart.
//   ret    - return type (plus any specifiers) placed before the function name
//   suffix - token pasted onto svt_aom_fft1d_2_ to form the function name
//   T      - element type of the input and output buffers
//   T_VEC  - type of the value produced by `load` and passed to `store`
//   load   - load(ptr) reads one T_VEC starting at ptr
//   store  - store(ptr, v) writes the T_VEC v starting at ptr
// Result: output[0 * stride] receives i0 + i1 and output[1 * stride] receives
// i0 - i1. The sum and difference are written with the built-in + and -
// operators, so T_VEC must support them directly; the larger generators in
// this header take explicit add/sub parameters instead.
#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store)                         \
114
0
    ret svt_aom_fft1d_2_##suffix(const T* input, T* output, int32_t stride) { \
115
0
        const T_VEC i0 = load(input + 0 * stride);                            \
116
0
        const T_VEC i1 = load(input + 1 * stride);                            \
117
0
        store(output + 0 * stride, i0 + i1);                                  \
118
0
        store(output + 1 * stride, i0 - i1);                                  \
119
0
    }
120
121
// GEN_FFT_4: expands to the definition of
//   ret svt_aom_fft1d_4_<suffix>(const T* input, T* output, int32_t stride)
// which reads four elements at input + k * stride and writes four results at
// output + k * stride (k = 0..3).
//   ret, suffix, T, T_VEC, load, store - same roles as in GEN_FFT_2
//   constant - constant(c) produces a T_VEC from the float literal c
//   add, sub - two-operand T_VEC operations; the formulas below assume they
//              implement element-wise a + b and a - b
// Stored values, in terms of the loaded inputs i0..i3:
//   output[0 * stride] = (i0 + i2) + (i1 + i3)
//   output[1 * stride] = i0 - i2
//   output[2 * stride] = (i0 + i2) - (i1 + i3)
//   output[3 * stride] = 0 - (i1 - i3)
// k_weight0 (zero) is only used as the left operand of sub, i.e. to negate.
// NOTE(review): this looks like the packed [r_0, r_1, r_2, i_1] layout that
// the AomFft1dFunc comment in this header describes -- confirm with callers.
#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub)     \
122
0
    ret svt_aom_fft1d_4_##suffix(const T* input, T* output, int32_t stride) { \
123
0
        const T_VEC k_weight0 = constant(0.0f);                               \
124
0
        const T_VEC i0        = load(input + 0 * stride);                     \
125
0
        const T_VEC i1        = load(input + 1 * stride);                     \
126
0
        const T_VEC i2        = load(input + 2 * stride);                     \
127
0
        const T_VEC i3        = load(input + 3 * stride);                     \
128
0
        const T_VEC w0        = add(i0, i2);                                  \
129
0
        const T_VEC w1        = sub(i0, i2);                                  \
130
0
        const T_VEC w2        = add(i1, i3);                                  \
131
0
        const T_VEC w3        = sub(i1, i3);                                  \
132
0
        store(output + 0 * stride, add(w0, w2));                              \
133
0
        store(output + 1 * stride, w1);                                       \
134
0
        store(output + 2 * stride, sub(w0, w2));                              \
135
0
        store(output + 3 * stride, sub(k_weight0, w3));                       \
136
0
    }
137
138
// GEN_FFT_8: expands to the definition of
//   ret svt_aom_fft1d_8_<suffix>(const T* input, T* output, int32_t stride)
// which reads eight elements at input + k * stride and writes eight results
// at output + k * stride (k = 0..7).
//   ret, suffix, T, T_VEC, load, store, constant, add, sub - as in GEN_FFT_4
//   mul - two-operand T_VEC operation, expected to implement a * b
// Constants:
//   k_weight0 = constant(0.0f); only ever the left operand of sub (negation)
//   k_weight2 = constant(0.707107f), approximately 1/sqrt(2)
// Structure: w0..w5 combine the even-indexed inputs (i0, i2, i4, i6) and
// w7..w12 combine the odd-indexed inputs (i1, i3, i5, i7); the stores then
// mix the two groups. The products mul(k_weight2, sub(w8, w10)) and
// mul(k_weight2, add(w10, w8)) are each spelled out twice rather than being
// held in a local.
// NOTE(review): outputs 0..4 presumably hold r_0..r_4 and outputs 5..7 hold
// i_1..i_3, following the packing described for AomFft1dFunc -- confirm.
#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul)             \
139
0
    ret svt_aom_fft1d_8_##suffix(const T* input, T* output, int32_t stride) {              \
140
0
        const T_VEC k_weight0 = constant(0.0f);                                            \
141
0
        const T_VEC k_weight2 = constant(0.707107f);                                       \
142
0
        const T_VEC i0        = load(input + 0 * stride);                                  \
143
0
        const T_VEC i1        = load(input + 1 * stride);                                  \
144
0
        const T_VEC i2        = load(input + 2 * stride);                                  \
145
0
        const T_VEC i3        = load(input + 3 * stride);                                  \
146
0
        const T_VEC i4        = load(input + 4 * stride);                                  \
147
0
        const T_VEC i5        = load(input + 5 * stride);                                  \
148
0
        const T_VEC i6        = load(input + 6 * stride);                                  \
149
0
        const T_VEC i7        = load(input + 7 * stride);                                  \
150
0
        const T_VEC w0        = add(i0, i4);                                               \
151
0
        const T_VEC w1        = sub(i0, i4);                                               \
152
0
        const T_VEC w2        = add(i2, i6);                                               \
153
0
        const T_VEC w3        = sub(i2, i6);                                               \
154
0
        const T_VEC w4        = add(w0, w2);                                               \
155
0
        const T_VEC w5        = sub(w0, w2);                                               \
156
0
        const T_VEC w7        = add(i1, i5);                                               \
157
0
        const T_VEC w8        = sub(i1, i5);                                               \
158
0
        const T_VEC w9        = add(i3, i7);                                               \
159
0
        const T_VEC w10       = sub(i3, i7);                                               \
160
0
        const T_VEC w11       = add(w7, w9);                                               \
161
0
        const T_VEC w12       = sub(w7, w9);                                               \
162
0
        store(output + 0 * stride, add(w4, w11));                                          \
163
0
        store(output + 1 * stride, add(w1, mul(k_weight2, sub(w8, w10))));                 \
164
0
        store(output + 2 * stride, w5);                                                    \
165
0
        store(output + 3 * stride, sub(w1, mul(k_weight2, sub(w8, w10))));                 \
166
0
        store(output + 4 * stride, sub(w4, w11));                                          \
167
0
        store(output + 5 * stride, sub(sub(k_weight0, w3), mul(k_weight2, add(w10, w8)))); \
168
0
        store(output + 6 * stride, sub(k_weight0, w12));                                   \
169
0
        store(output + 7 * stride, sub(w3, mul(k_weight2, add(w10, w8))));                 \
170
0
    }
171
172
// GEN_FFT_16: expands to the definition of
//   ret svt_aom_fft1d_16_<suffix>(const T* input, T* output, int32_t stride)
// which reads sixteen elements at input + k * stride and writes sixteen
// results at output + k * stride (k = 0..15).
//   ret, suffix, T, T_VEC, load, store, constant, add, sub, mul - same roles
//   as in GEN_FFT_8 (add/sub/mul are expected to be element-wise +, -, *).
// Constants:
//   k_weight0 = constant(0.0f); only ever the left operand of sub (negation)
//   k_weight2 = constant(0.707107f), approximately 1/sqrt(2)
//   k_weight3 = constant(0.92388f),  approximately cos(pi/8)
//   k_weight4 = constant(0.382683f), approximately sin(pi/8)
// Structure: w0..w18 are built only from the even-indexed inputs
// (i0, i2, ..., i14) and w19..w37 only from the odd-indexed inputs
// (i1, i3, ..., i15); the stores combine one value from each group, scaling
// the odd-group terms by k_weight2/k_weight3/k_weight4.
// w16, w18, w35 and w37 are two-element T_VEC arrays.
// NOTE(review): elements [0] and [1] presumably carry the real and imaginary
// parts of an intermediate complex value, and outputs 0..8 / 9..15 presumably
// follow the packed [r_0..r_8, i_1..i_7] layout described for AomFft1dFunc --
// confirm against callers.
#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul)                                        \
173
0
    ret svt_aom_fft1d_16_##suffix(const T* input, T* output, int32_t stride) {                                         \
174
0
        const T_VEC k_weight0 = constant(0.0f);                                                                        \
175
0
        const T_VEC k_weight2 = constant(0.707107f);                                                                   \
176
0
        const T_VEC k_weight3 = constant(0.92388f);                                                                    \
177
0
        const T_VEC k_weight4 = constant(0.382683f);                                                                   \
178
0
        const T_VEC i0        = load(input + 0 * stride);                                                              \
179
0
        const T_VEC i1        = load(input + 1 * stride);                                                              \
180
0
        const T_VEC i2        = load(input + 2 * stride);                                                              \
181
0
        const T_VEC i3        = load(input + 3 * stride);                                                              \
182
0
        const T_VEC i4        = load(input + 4 * stride);                                                              \
183
0
        const T_VEC i5        = load(input + 5 * stride);                                                              \
184
0
        const T_VEC i6        = load(input + 6 * stride);                                                              \
185
0
        const T_VEC i7        = load(input + 7 * stride);                                                              \
186
0
        const T_VEC i8        = load(input + 8 * stride);                                                              \
187
0
        const T_VEC i9        = load(input + 9 * stride);                                                              \
188
0
        const T_VEC i10       = load(input + 10 * stride);                                                             \
189
0
        const T_VEC i11       = load(input + 11 * stride);                                                             \
190
0
        const T_VEC i12       = load(input + 12 * stride);                                                             \
191
0
        const T_VEC i13       = load(input + 13 * stride);                                                             \
192
0
        const T_VEC i14       = load(input + 14 * stride);                                                             \
193
0
        const T_VEC i15       = load(input + 15 * stride);                                                             \
194
0
        const T_VEC w0        = add(i0, i8);                                                                           \
195
0
        const T_VEC w1        = sub(i0, i8);                                                                           \
196
0
        const T_VEC w2        = add(i4, i12);                                                                          \
197
0
        const T_VEC w3        = sub(i4, i12);                                                                          \
198
0
        const T_VEC w4        = add(w0, w2);                                                                           \
199
0
        const T_VEC w5        = sub(w0, w2);                                                                           \
200
0
        const T_VEC w7        = add(i2, i10);                                                                          \
201
0
        const T_VEC w8        = sub(i2, i10);                                                                          \
202
0
        const T_VEC w9        = add(i6, i14);                                                                          \
203
0
        const T_VEC w10       = sub(i6, i14);                                                                          \
204
0
        const T_VEC w11       = add(w7, w9);                                                                           \
205
0
        const T_VEC w12       = sub(w7, w9);                                                                           \
206
0
        const T_VEC w14       = add(w4, w11);                                                                          \
207
0
        const T_VEC w15       = sub(w4, w11);                                                                          \
208
0
        const T_VEC w16[2]    = {add(w1, mul(k_weight2, sub(w8, w10))),                                                \
209
0
                                 sub(sub(k_weight0, w3), mul(k_weight2, add(w10, w8)))};                               \
210
0
        const T_VEC w18[2]    = {sub(w1, mul(k_weight2, sub(w8, w10))), sub(w3, mul(k_weight2, add(w10, w8)))};        \
211
0
        const T_VEC w19       = add(i1, i9);                                                                           \
212
0
        const T_VEC w20       = sub(i1, i9);                                                                           \
213
0
        const T_VEC w21       = add(i5, i13);                                                                          \
214
0
        const T_VEC w22       = sub(i5, i13);                                                                          \
215
0
        const T_VEC w23       = add(w19, w21);                                                                         \
216
0
        const T_VEC w24       = sub(w19, w21);                                                                         \
217
0
        const T_VEC w26       = add(i3, i11);                                                                          \
218
0
        const T_VEC w27       = sub(i3, i11);                                                                          \
219
0
        const T_VEC w28       = add(i7, i15);                                                                          \
220
0
        const T_VEC w29       = sub(i7, i15);                                                                          \
221
0
        const T_VEC w30       = add(w26, w28);                                                                         \
222
0
        const T_VEC w31       = sub(w26, w28);                                                                         \
223
0
        const T_VEC w33       = add(w23, w30);                                                                         \
224
0
        const T_VEC w34       = sub(w23, w30);                                                                         \
225
0
        const T_VEC w35[2]    = {add(w20, mul(k_weight2, sub(w27, w29))),                                              \
226
0
                                 sub(sub(k_weight0, w22), mul(k_weight2, add(w29, w27)))};                             \
227
0
        const T_VEC w37[2]    = {sub(w20, mul(k_weight2, sub(w27, w29))), sub(w22, mul(k_weight2, add(w29, w27)))};    \
228
0
        store(output + 0 * stride, add(w14, w33));                                                                     \
229
0
        store(output + 1 * stride, add(w16[0], add(mul(k_weight3, w35[0]), mul(k_weight4, w35[1]))));                  \
230
0
        store(output + 2 * stride, add(w5, mul(k_weight2, sub(w24, w31))));                                            \
231
0
        store(output + 3 * stride, add(w18[0], add(mul(k_weight4, w37[0]), mul(k_weight3, w37[1]))));                  \
232
0
        store(output + 4 * stride, w15);                                                                               \
233
0
        store(output + 5 * stride, add(w18[0], sub(sub(k_weight0, mul(k_weight4, w37[0])), mul(k_weight3, w37[1]))));  \
234
0
        store(output + 6 * stride, sub(w5, mul(k_weight2, sub(w24, w31))));                                            \
235
0
        store(output + 7 * stride, add(w16[0], sub(sub(k_weight0, mul(k_weight3, w35[0])), mul(k_weight4, w35[1]))));  \
236
0
        store(output + 8 * stride, sub(w14, w33));                                                                     \
237
0
        store(output + 9 * stride, add(w16[1], sub(mul(k_weight3, w35[1]), mul(k_weight4, w35[0]))));                  \
238
0
        store(output + 10 * stride, sub(sub(k_weight0, w12), mul(k_weight2, add(w31, w24))));                          \
239
0
        store(output + 11 * stride, add(w18[1], sub(mul(k_weight4, w37[1]), mul(k_weight3, w37[0]))));                 \
240
0
        store(output + 12 * stride, sub(k_weight0, w34));                                                              \
241
0
        store(output + 13 * stride, sub(sub(k_weight0, w18[1]), sub(mul(k_weight3, w37[0]), mul(k_weight4, w37[1])))); \
242
0
        store(output + 14 * stride, sub(w12, mul(k_weight2, add(w31, w24))));                                          \
243
0
        store(output + 15 * stride, sub(sub(k_weight0, w16[1]), sub(mul(k_weight4, w35[0]), mul(k_weight3, w35[1])))); \
244
0
    }
245
246
#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul)                                        \
247
0
    ret svt_aom_fft1d_32_##suffix(const T* input, T* output, int32_t stride) {                                         \
248
0
        const T_VEC k_weight0 = constant(0.0f);                                                                        \
249
0
        const T_VEC k_weight2 = constant(0.707107f);                                                                   \
250
0
        const T_VEC k_weight3 = constant(0.92388f);                                                                    \
251
0
        const T_VEC k_weight4 = constant(0.382683f);                                                                   \
252
0
        const T_VEC k_weight5 = constant(0.980785f);                                                                   \
253
0
        const T_VEC k_weight6 = constant(0.19509f);                                                                    \
254
0
        const T_VEC k_weight7 = constant(0.83147f);                                                                    \
255
0
        const T_VEC k_weight8 = constant(0.55557f);                                                                    \
256
0
        const T_VEC i0        = load(input + 0 * stride);                                                              \
257
0
        const T_VEC i1        = load(input + 1 * stride);                                                              \
258
0
        const T_VEC i2        = load(input + 2 * stride);                                                              \
259
0
        const T_VEC i3        = load(input + 3 * stride);                                                              \
260
0
        const T_VEC i4        = load(input + 4 * stride);                                                              \
261
0
        const T_VEC i5        = load(input + 5 * stride);                                                              \
262
0
        const T_VEC i6        = load(input + 6 * stride);                                                              \
263
0
        const T_VEC i7        = load(input + 7 * stride);                                                              \
264
0
        const T_VEC i8        = load(input + 8 * stride);                                                              \
265
0
        const T_VEC i9        = load(input + 9 * stride);                                                              \
266
0
        const T_VEC i10       = load(input + 10 * stride);                                                             \
267
0
        const T_VEC i11       = load(input + 11 * stride);                                                             \
268
0
        const T_VEC i12       = load(input + 12 * stride);                                                             \
269
0
        const T_VEC i13       = load(input + 13 * stride);                                                             \
270
0
        const T_VEC i14       = load(input + 14 * stride);                                                             \
271
0
        const T_VEC i15       = load(input + 15 * stride);                                                             \
272
0
        const T_VEC i16       = load(input + 16 * stride);                                                             \
273
0
        const T_VEC i17       = load(input + 17 * stride);                                                             \
274
0
        const T_VEC i18       = load(input + 18 * stride);                                                             \
275
0
        const T_VEC i19       = load(input + 19 * stride);                                                             \
276
0
        const T_VEC i20       = load(input + 20 * stride);                                                             \
277
0
        const T_VEC i21       = load(input + 21 * stride);                                                             \
278
0
        const T_VEC i22       = load(input + 22 * stride);                                                             \
279
0
        const T_VEC i23       = load(input + 23 * stride);                                                             \
280
0
        const T_VEC i24       = load(input + 24 * stride);                                                             \
281
0
        const T_VEC i25       = load(input + 25 * stride);                                                             \
282
0
        const T_VEC i26       = load(input + 26 * stride);                                                             \
283
0
        const T_VEC i27       = load(input + 27 * stride);                                                             \
284
0
        const T_VEC i28       = load(input + 28 * stride);                                                             \
285
0
        const T_VEC i29       = load(input + 29 * stride);                                                             \
286
0
        const T_VEC i30       = load(input + 30 * stride);                                                             \
287
0
        const T_VEC i31       = load(input + 31 * stride);                                                             \
288
0
        const T_VEC w0        = add(i0, i16);                                                                          \
289
0
        const T_VEC w1        = sub(i0, i16);                                                                          \
290
0
        const T_VEC w2        = add(i8, i24);                                                                          \
291
0
        const T_VEC w3        = sub(i8, i24);                                                                          \
292
0
        const T_VEC w4        = add(w0, w2);                                                                           \
293
0
        const T_VEC w5        = sub(w0, w2);                                                                           \
294
0
        const T_VEC w7        = add(i4, i20);                                                                          \
295
0
        const T_VEC w8        = sub(i4, i20);                                                                          \
296
0
        const T_VEC w9        = add(i12, i28);                                                                         \
297
0
        const T_VEC w10       = sub(i12, i28);                                                                         \
298
0
        const T_VEC w11       = add(w7, w9);                                                                           \
299
0
        const T_VEC w12       = sub(w7, w9);                                                                           \
300
0
        const T_VEC w14       = add(w4, w11);                                                                          \
301
0
        const T_VEC w15       = sub(w4, w11);                                                                          \
302
0
        const T_VEC w16[2]    = {add(w1, mul(k_weight2, sub(w8, w10))),                                                \
303
0
                                 sub(sub(k_weight0, w3), mul(k_weight2, add(w10, w8)))};                               \
304
0
        const T_VEC w18[2]    = {sub(w1, mul(k_weight2, sub(w8, w10))), sub(w3, mul(k_weight2, add(w10, w8)))};        \
305
0
        const T_VEC w19       = add(i2, i18);                                                                          \
306
0
        const T_VEC w20       = sub(i2, i18);                                                                          \
307
0
        const T_VEC w21       = add(i10, i26);                                                                         \
308
0
        const T_VEC w22       = sub(i10, i26);                                                                         \
309
0
        const T_VEC w23       = add(w19, w21);                                                                         \
310
0
        const T_VEC w24       = sub(w19, w21);                                                                         \
311
0
        const T_VEC w26       = add(i6, i22);                                                                          \
312
0
        const T_VEC w27       = sub(i6, i22);                                                                          \
313
0
        const T_VEC w28       = add(i14, i30);                                                                         \
314
0
        const T_VEC w29       = sub(i14, i30);                                                                         \
315
0
        const T_VEC w30       = add(w26, w28);                                                                         \
316
0
        const T_VEC w31       = sub(w26, w28);                                                                         \
317
0
        const T_VEC w33       = add(w23, w30);                                                                         \
318
0
        const T_VEC w34       = sub(w23, w30);                                                                         \
319
0
        const T_VEC w35[2]    = {add(w20, mul(k_weight2, sub(w27, w29))),                                              \
320
0
                                 sub(sub(k_weight0, w22), mul(k_weight2, add(w29, w27)))};                             \
321
0
        const T_VEC w37[2]    = {sub(w20, mul(k_weight2, sub(w27, w29))), sub(w22, mul(k_weight2, add(w29, w27)))};    \
322
0
        const T_VEC w38       = add(w14, w33);                                                                         \
323
0
        const T_VEC w39       = sub(w14, w33);                                                                         \
324
0
        const T_VEC w40[2]    = {add(w16[0], add(mul(k_weight3, w35[0]), mul(k_weight4, w35[1]))),                     \
325
0
                                 add(w16[1], sub(mul(k_weight3, w35[1]), mul(k_weight4, w35[0])))};                    \
326
0
        const T_VEC w41[2]    = {add(w5, mul(k_weight2, sub(w24, w31))),                                               \
327
0
                                 sub(sub(k_weight0, w12), mul(k_weight2, add(w31, w24)))};                             \
328
0
        const T_VEC w42[2]    = {add(w18[0], add(mul(k_weight4, w37[0]), mul(k_weight3, w37[1]))),                     \
329
0
                                 add(w18[1], sub(mul(k_weight4, w37[1]), mul(k_weight3, w37[0])))};                    \
330
0
        const T_VEC w44[2]    = {add(w18[0], sub(sub(k_weight0, mul(k_weight4, w37[0])), mul(k_weight3, w37[1]))),     \
331
0
                                 sub(sub(k_weight0, w18[1]), sub(mul(k_weight3, w37[0]), mul(k_weight4, w37[1])))};    \
332
0
        const T_VEC w45[2]    = {sub(w5, mul(k_weight2, sub(w24, w31))), sub(w12, mul(k_weight2, add(w31, w24)))};     \
333
0
        const T_VEC w46[2]    = {add(w16[0], sub(sub(k_weight0, mul(k_weight3, w35[0])), mul(k_weight4, w35[1]))),     \
334
0
                                 sub(sub(k_weight0, w16[1]), sub(mul(k_weight4, w35[0]), mul(k_weight3, w35[1])))};    \
335
0
        const T_VEC w47       = add(i1, i17);                                                                          \
336
0
        const T_VEC w48       = sub(i1, i17);                                                                          \
337
0
        const T_VEC w49       = add(i9, i25);                                                                          \
338
0
        const T_VEC w50       = sub(i9, i25);                                                                          \
339
0
        const T_VEC w51       = add(w47, w49);                                                                         \
340
0
        const T_VEC w52       = sub(w47, w49);                                                                         \
341
0
        const T_VEC w54       = add(i5, i21);                                                                          \
342
0
        const T_VEC w55       = sub(i5, i21);                                                                          \
343
0
        const T_VEC w56       = add(i13, i29);                                                                         \
344
0
        const T_VEC w57       = sub(i13, i29);                                                                         \
345
0
        const T_VEC w58       = add(w54, w56);                                                                         \
346
0
        const T_VEC w59       = sub(w54, w56);                                                                         \
347
0
        const T_VEC w61       = add(w51, w58);                                                                         \
348
0
        const T_VEC w62       = sub(w51, w58);                                                                         \
349
0
        const T_VEC w63[2]    = {add(w48, mul(k_weight2, sub(w55, w57))),                                              \
350
0
                                 sub(sub(k_weight0, w50), mul(k_weight2, add(w57, w55)))};                             \
351
0
        const T_VEC w65[2]    = {sub(w48, mul(k_weight2, sub(w55, w57))), sub(w50, mul(k_weight2, add(w57, w55)))};    \
352
0
        const T_VEC w66       = add(i3, i19);                                                                          \
353
0
        const T_VEC w67       = sub(i3, i19);                                                                          \
354
0
        const T_VEC w68       = add(i11, i27);                                                                         \
355
0
        const T_VEC w69       = sub(i11, i27);                                                                         \
356
0
        const T_VEC w70       = add(w66, w68);                                                                         \
357
0
        const T_VEC w71       = sub(w66, w68);                                                                         \
358
0
        const T_VEC w73       = add(i7, i23);                                                                          \
359
0
        const T_VEC w74       = sub(i7, i23);                                                                          \
360
0
        const T_VEC w75       = add(i15, i31);                                                                         \
361
0
        const T_VEC w76       = sub(i15, i31);                                                                         \
362
0
        const T_VEC w77       = add(w73, w75);                                                                         \
363
0
        const T_VEC w78       = sub(w73, w75);                                                                         \
364
0
        const T_VEC w80       = add(w70, w77);                                                                         \
365
0
        const T_VEC w81       = sub(w70, w77);                                                                         \
366
0
        const T_VEC w82[2]    = {add(w67, mul(k_weight2, sub(w74, w76))),                                              \
367
0
                                 sub(sub(k_weight0, w69), mul(k_weight2, add(w76, w74)))};                             \
368
0
        const T_VEC w84[2]    = {sub(w67, mul(k_weight2, sub(w74, w76))), sub(w69, mul(k_weight2, add(w76, w74)))};    \
369
0
        const T_VEC w85       = add(w61, w80);                                                                         \
370
0
        const T_VEC w86       = sub(w61, w80);                                                                         \
371
0
        const T_VEC w87[2]    = {add(w63[0], add(mul(k_weight3, w82[0]), mul(k_weight4, w82[1]))),                     \
372
0
                                 add(w63[1], sub(mul(k_weight3, w82[1]), mul(k_weight4, w82[0])))};                    \
373
0
        const T_VEC w88[2]    = {add(w52, mul(k_weight2, sub(w71, w78))),                                              \
374
0
                                 sub(sub(k_weight0, w59), mul(k_weight2, add(w78, w71)))};                             \
375
0
        const T_VEC w89[2]    = {add(w65[0], add(mul(k_weight4, w84[0]), mul(k_weight3, w84[1]))),                     \
376
0
                                 add(w65[1], sub(mul(k_weight4, w84[1]), mul(k_weight3, w84[0])))};                    \
377
0
        const T_VEC w91[2]    = {add(w65[0], sub(sub(k_weight0, mul(k_weight4, w84[0])), mul(k_weight3, w84[1]))),     \
378
0
                                 sub(sub(k_weight0, w65[1]), sub(mul(k_weight3, w84[0]), mul(k_weight4, w84[1])))};    \
379
0
        const T_VEC w92[2]    = {sub(w52, mul(k_weight2, sub(w71, w78))), sub(w59, mul(k_weight2, add(w78, w71)))};    \
380
0
        const T_VEC w93[2]    = {add(w63[0], sub(sub(k_weight0, mul(k_weight3, w82[0])), mul(k_weight4, w82[1]))),     \
381
0
                                 sub(sub(k_weight0, w63[1]), sub(mul(k_weight4, w82[0]), mul(k_weight3, w82[1])))};    \
382
0
        store(output + 0 * stride, add(w38, w85));                                                                     \
383
0
        store(output + 1 * stride, add(w40[0], add(mul(k_weight5, w87[0]), mul(k_weight6, w87[1]))));                  \
384
0
        store(output + 2 * stride, add(w41[0], add(mul(k_weight3, w88[0]), mul(k_weight4, w88[1]))));                  \
385
0
        store(output + 3 * stride, add(w42[0], add(mul(k_weight7, w89[0]), mul(k_weight8, w89[1]))));                  \
386
0
        store(output + 4 * stride, add(w15, mul(k_weight2, sub(w62, w81))));                                           \
387
0
        store(output + 5 * stride, add(w44[0], add(mul(k_weight8, w91[0]), mul(k_weight7, w91[1]))));                  \
388
0
        store(output + 6 * stride, add(w45[0], add(mul(k_weight4, w92[0]), mul(k_weight3, w92[1]))));                  \
389
0
        store(output + 7 * stride, add(w46[0], add(mul(k_weight6, w93[0]), mul(k_weight5, w93[1]))));                  \
390
0
        store(output + 8 * stride, w39);                                                                               \
391
0
        store(output + 9 * stride, add(w46[0], sub(sub(k_weight0, mul(k_weight6, w93[0])), mul(k_weight5, w93[1]))));  \
392
0
        store(output + 10 * stride, add(w45[0], sub(sub(k_weight0, mul(k_weight4, w92[0])), mul(k_weight3, w92[1])))); \
393
0
        store(output + 11 * stride, add(w44[0], sub(sub(k_weight0, mul(k_weight8, w91[0])), mul(k_weight7, w91[1])))); \
394
0
        store(output + 12 * stride, sub(w15, mul(k_weight2, sub(w62, w81))));                                          \
395
0
        store(output + 13 * stride, add(w42[0], sub(sub(k_weight0, mul(k_weight7, w89[0])), mul(k_weight8, w89[1])))); \
396
0
        store(output + 14 * stride, add(w41[0], sub(sub(k_weight0, mul(k_weight3, w88[0])), mul(k_weight4, w88[1])))); \
397
0
        store(output + 15 * stride, add(w40[0], sub(sub(k_weight0, mul(k_weight5, w87[0])), mul(k_weight6, w87[1])))); \
398
0
        store(output + 16 * stride, sub(w38, w85));                                                                    \
399
0
        store(output + 17 * stride, add(w40[1], sub(mul(k_weight5, w87[1]), mul(k_weight6, w87[0]))));                 \
400
0
        store(output + 18 * stride, add(w41[1], sub(mul(k_weight3, w88[1]), mul(k_weight4, w88[0]))));                 \
401
0
        store(output + 19 * stride, add(w42[1], sub(mul(k_weight7, w89[1]), mul(k_weight8, w89[0]))));                 \
402
0
        store(output + 20 * stride, sub(sub(k_weight0, w34), mul(k_weight2, add(w81, w62))));                          \
403
0
        store(output + 21 * stride, add(w44[1], sub(mul(k_weight8, w91[1]), mul(k_weight7, w91[0]))));                 \
404
0
        store(output + 22 * stride, add(w45[1], sub(mul(k_weight4, w92[1]), mul(k_weight3, w92[0]))));                 \
405
0
        store(output + 23 * stride, add(w46[1], sub(mul(k_weight6, w93[1]), mul(k_weight5, w93[0]))));                 \
406
0
        store(output + 24 * stride, sub(k_weight0, w86));                                                              \
407
0
        store(output + 25 * stride, sub(sub(k_weight0, w46[1]), sub(mul(k_weight5, w93[0]), mul(k_weight6, w93[1])))); \
408
0
        store(output + 26 * stride, sub(sub(k_weight0, w45[1]), sub(mul(k_weight3, w92[0]), mul(k_weight4, w92[1])))); \
409
0
        store(output + 27 * stride, sub(sub(k_weight0, w44[1]), sub(mul(k_weight7, w91[0]), mul(k_weight8, w91[1])))); \
410
0
        store(output + 28 * stride, sub(w34, mul(k_weight2, add(w81, w62))));                                          \
411
0
        store(output + 29 * stride, sub(sub(k_weight0, w42[1]), sub(mul(k_weight8, w89[0]), mul(k_weight7, w89[1])))); \
412
0
        store(output + 30 * stride, sub(sub(k_weight0, w41[1]), sub(mul(k_weight4, w88[0]), mul(k_weight3, w88[1])))); \
413
0
        store(output + 31 * stride, sub(sub(k_weight0, w40[1]), sub(mul(k_weight6, w87[0]), mul(k_weight5, w87[1])))); \
414
0
    }
415
416
/*!\brief Generates svt_aom_ifft1d_2_<suffix>, a 2-point inverse 1-D transform.
 *
 * Macro parameters:
 *   ret    - tokens emitted in front of the function name (e.g. the return
 *            type); the generated body contains no return statement.
 *   suffix - pasted onto "svt_aom_ifft1d_2_" to form the function name.
 *   T      - element type of the input and output pointers.
 *   T_VEC  - type of the two values produced by load.
 *   load   - applied to (input + k * stride) to read element k.
 *   store  - invoked as store(output + k * stride, value) to write element k.
 *
 * The generated function reads input elements 0 and 1, writes their sum to
 * output element 0 and their difference (element 0 minus element 1) to output
 * element 1. No scaling factor is applied.
 *
 * NOTE(review): unlike the larger GEN_IFFT_* macros, this one takes no
 * add/sub parameters and applies the built-in + and - operators to T_VEC
 * values directly, so T_VEC has to be a type for which those operators are
 * valid on the target compiler.
 */
#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store)                         \
417
0
    ret svt_aom_ifft1d_2_##suffix(const T* input, T* output, int32_t stride) { \
418
0
        const T_VEC i0 = load(input + 0 * stride);                             \
419
0
        const T_VEC i1 = load(input + 1 * stride);                             \
420
0
        store(output + 0 * stride, i0 + i1);                                   \
421
0
        store(output + 1 * stride, i0 - i1);                                   \
422
0
    }
423
424
/*!\brief Generates svt_aom_ifft1d_4_<suffix>, a 4-point inverse 1-D transform.
 *
 * Macro parameters:
 *   ret      - tokens emitted in front of the function name (e.g. the return
 *              type); the generated body contains no return statement.
 *   suffix   - pasted onto "svt_aom_ifft1d_4_" to form the function name.
 *   T        - element type of the input and output pointers.
 *   T_VEC    - type of every loaded value, constant and intermediate.
 *   load     - applied to (input + k * stride) to read element k.
 *   store    - invoked as store(output + k * stride, value) to write element k.
 *   constant - builds a T_VEC from a float literal (used here only for 0.0f).
 *   add, sub - two-operand addition / subtraction on T_VEC values.
 *
 * With i0..i3 denoting the four loaded elements, the stores evaluate to:
 *   output[0] = (i0 + i2) + (i1 + i1)
 *   output[1] = (i0 - i2) + ((0 - i3) - i3)
 *   output[2] = (i0 + i2) - (i1 + i1)
 *   output[3] = (i0 - i2) - ((0 - i3) - i3)
 * No scaling factor is applied. This is consistent with an unscaled inverse
 * DFT of the packed layout [r_0, r_1, r_2, i_1] described in the function
 * pointer documentation at the top of this file.
 *
 * NOTE(review): w4[1] (sub(i3, i3)) and w5[0] (sub(i1, i1)) are computed but
 * never read by any store.
 */
#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub)     \
425
0
    ret svt_aom_ifft1d_4_##suffix(const T* input, T* output, int32_t stride) { \
426
0
        const T_VEC k_weight0 = constant(0.0f);                                \
427
0
        const T_VEC i0        = load(input + 0 * stride);                      \
428
0
        const T_VEC i1        = load(input + 1 * stride);                      \
429
0
        const T_VEC i2        = load(input + 2 * stride);                      \
430
0
        const T_VEC i3        = load(input + 3 * stride);                      \
431
0
        const T_VEC w2        = add(i0, i2);                                   \
432
0
        const T_VEC w3        = sub(i0, i2);                                   \
433
0
        const T_VEC w4[2]     = {add(i1, i1), sub(i3, i3)};                    \
434
0
        const T_VEC w5[2]     = {sub(i1, i1), sub(sub(k_weight0, i3), i3)};    \
435
0
        store(output + 0 * stride, add(w2, w4[0]));                            \
436
0
        store(output + 1 * stride, add(w3, w5[1]));                            \
437
0
        store(output + 2 * stride, sub(w2, w4[0]));                            \
438
0
        store(output + 3 * stride, sub(w3, w5[1]));                            \
439
0
    }
440
441
/*!\brief Generates svt_aom_ifft1d_8_<suffix>, an 8-point inverse 1-D transform.
 *
 * Macro parameters:
 *   ret      - tokens emitted in front of the function name (e.g. the return
 *              type); the generated body contains no return statement.
 *   suffix   - pasted onto "svt_aom_ifft1d_8_" to form the function name.
 *   T        - element type of the input and output pointers.
 *   T_VEC    - type of every loaded value, constant and intermediate.
 *   load     - applied to (input + k * stride) to read element k.
 *   store    - invoked as store(output + k * stride, value) to write element k.
 *   constant - builds a T_VEC from a float literal.
 *   add, sub, mul - two-operand arithmetic on T_VEC values.
 *
 * Structure of the generated body:
 *   - k_weight0 is 0.0f and is used to negate values via sub(k_weight0, x).
 *   - k_weight2 is 0.707107f (numerically sqrt(2)/2; presumably the
 *     cos(pi/4) twiddle factor).
 *   - w6..w13 are built only from inputs 0, 2, 4 and 6.
 *   - w14..w21 are built only from inputs 1, 3, 5 and 7.
 *   - Outputs k and k + 4 (k = 0..3) combine the same pair of terms, one
 *     adding and the other subtracting the contribution of the second group;
 *     outputs 1, 3, 5 and 7 scale that contribution by k_weight2.
 * No overall scaling factor is applied.
 *
 * The two-element arrays appear to hold (real, imaginary) pairs of
 * intermediate complex values, with the input laid out as described in the
 * function pointer documentation at the top of this file
 * ([r_0..r_4, i_1..i_3]) - TODO confirm against the generator.
 *
 * NOTE(review): element [1] of w10, w11, w12 and w13 is computed but never
 * read by any store.
 */
#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul)                                       \
442
0
    ret svt_aom_ifft1d_8_##suffix(const T* input, T* output, int32_t stride) {                                        \
443
0
        const T_VEC k_weight0 = constant(0.0f);                                                                       \
444
0
        const T_VEC k_weight2 = constant(0.707107f);                                                                  \
445
0
        const T_VEC i0        = load(input + 0 * stride);                                                             \
446
0
        const T_VEC i1        = load(input + 1 * stride);                                                             \
447
0
        const T_VEC i2        = load(input + 2 * stride);                                                             \
448
0
        const T_VEC i3        = load(input + 3 * stride);                                                             \
449
0
        const T_VEC i4        = load(input + 4 * stride);                                                             \
450
0
        const T_VEC i5        = load(input + 5 * stride);                                                             \
451
0
        const T_VEC i6        = load(input + 6 * stride);                                                             \
452
0
        const T_VEC i7        = load(input + 7 * stride);                                                             \
453
0
        const T_VEC w6        = add(i0, i4);                                                                          \
454
0
        const T_VEC w7        = sub(i0, i4);                                                                          \
455
0
        const T_VEC w8[2]     = {add(i2, i2), sub(i6, i6)};                                                           \
456
0
        const T_VEC w9[2]     = {sub(i2, i2), sub(sub(k_weight0, i6), i6)};                                           \
457
0
        const T_VEC w10[2]    = {add(w6, w8[0]), w8[1]};                                                              \
458
0
        const T_VEC w11[2]    = {sub(w6, w8[0]), sub(k_weight0, w8[1])};                                              \
459
0
        const T_VEC w12[2]    = {add(w7, w9[1]), sub(k_weight0, w9[0])};                                              \
460
0
        const T_VEC w13[2]    = {sub(w7, w9[1]), w9[0]};                                                              \
461
0
        const T_VEC w14[2]    = {add(i1, i3), sub(i7, i5)};                                                           \
462
0
        const T_VEC w15[2]    = {sub(i1, i3), sub(sub(k_weight0, i5), i7)};                                           \
463
0
        const T_VEC w16[2]    = {add(i3, i1), sub(i5, i7)};                                                           \
464
0
        const T_VEC w17[2]    = {sub(i3, i1), sub(sub(k_weight0, i7), i5)};                                           \
465
0
        const T_VEC w18[2]    = {add(w14[0], w16[0]), add(w14[1], w16[1])};                                           \
466
0
        const T_VEC w19[2]    = {sub(w14[0], w16[0]), sub(w14[1], w16[1])};                                           \
467
0
        const T_VEC w20[2]    = {add(w15[0], w17[1]), sub(w15[1], w17[0])};                                           \
468
0
        const T_VEC w21[2]    = {sub(w15[0], w17[1]), add(w15[1], w17[0])};                                           \
469
0
        store(output + 0 * stride, add(w10[0], w18[0]));                                                              \
470
0
        store(output + 1 * stride, add(w12[0], mul(k_weight2, add(w20[0], w20[1]))));                                 \
471
0
        store(output + 2 * stride, add(w11[0], w19[1]));                                                              \
472
0
        store(output + 3 * stride, sub(w13[0], mul(k_weight2, sub(w21[0], w21[1]))));                                 \
473
0
        store(output + 4 * stride, sub(w10[0], w18[0]));                                                              \
474
0
        store(output + 5 * stride, add(w12[0], sub(sub(k_weight0, mul(k_weight2, w20[0])), mul(k_weight2, w20[1])))); \
475
0
        store(output + 6 * stride, sub(w11[0], w19[1]));                                                              \
476
0
        store(output + 7 * stride, add(w13[0], mul(k_weight2, sub(w21[0], w21[1]))));                                 \
477
0
    }
478
479
/*!\brief Generates svt_aom_ifft1d_16_<suffix>, a 16-point inverse 1-D transform.
 *
 * Macro parameters:
 *   ret      - tokens emitted in front of the function name (e.g. the return
 *              type); the generated body contains no return statement.
 *   suffix   - pasted onto "svt_aom_ifft1d_16_" to form the function name.
 *   T        - element type of the input and output pointers.
 *   T_VEC    - type of every loaded value, constant and intermediate.
 *   load     - applied to (input + k * stride) to read element k.
 *   store    - invoked as store(output + k * stride, value) to write element k.
 *   constant - builds a T_VEC from a float literal.
 *   add, sub, mul - two-operand arithmetic on T_VEC values.
 *
 * Structure of the generated body:
 *   - k_weight0 is 0.0f and is used to negate values via sub(k_weight0, x).
 *   - k_weight2 = 0.707107f, k_weight3 = 0.92388f, k_weight4 = 0.382683f
 *     (presumably the twiddle factors cos(pi/4), cos(pi/8) and sin(pi/8)).
 *   - w14..w37 are built only from the even-numbered inputs (0, 2, ..., 14).
 *   - w38..w61 are built only from the odd-numbered inputs (1, 3, ..., 15).
 *   - Outputs k and k + 8 (k = 0..7) combine the same pair of terms, one
 *     adding and the other subtracting the contribution of the odd-input
 *     group.
 * No overall scaling factor is applied.
 *
 * The two-element arrays appear to hold (real, imaginary) pairs of
 * intermediate complex values, with the input laid out as described in the
 * function pointer documentation at the top of this file
 * ([r_0..r_8, i_1..i_7]) - TODO confirm against the generator.
 *
 * NOTE(review): element [1] of w30 through w37 is computed but never read by
 * any store.
 */
#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul)                                       \
480
0
    ret svt_aom_ifft1d_16_##suffix(const T* input, T* output, int32_t stride) {                                        \
481
0
        const T_VEC k_weight0 = constant(0.0f);                                                                        \
482
0
        const T_VEC k_weight2 = constant(0.707107f);                                                                   \
483
0
        const T_VEC k_weight3 = constant(0.92388f);                                                                    \
484
0
        const T_VEC k_weight4 = constant(0.382683f);                                                                   \
485
0
        const T_VEC i0        = load(input + 0 * stride);                                                              \
486
0
        const T_VEC i1        = load(input + 1 * stride);                                                              \
487
0
        const T_VEC i2        = load(input + 2 * stride);                                                              \
488
0
        const T_VEC i3        = load(input + 3 * stride);                                                              \
489
0
        const T_VEC i4        = load(input + 4 * stride);                                                              \
490
0
        const T_VEC i5        = load(input + 5 * stride);                                                              \
491
0
        const T_VEC i6        = load(input + 6 * stride);                                                              \
492
0
        const T_VEC i7        = load(input + 7 * stride);                                                              \
493
0
        const T_VEC i8        = load(input + 8 * stride);                                                              \
494
0
        const T_VEC i9        = load(input + 9 * stride);                                                              \
495
0
        const T_VEC i10       = load(input + 10 * stride);                                                             \
496
0
        const T_VEC i11       = load(input + 11 * stride);                                                             \
497
0
        const T_VEC i12       = load(input + 12 * stride);                                                             \
498
0
        const T_VEC i13       = load(input + 13 * stride);                                                             \
499
0
        const T_VEC i14       = load(input + 14 * stride);                                                             \
500
0
        const T_VEC i15       = load(input + 15 * stride);                                                             \
501
0
        const T_VEC w14       = add(i0, i8);                                                                           \
502
0
        const T_VEC w15       = sub(i0, i8);                                                                           \
503
0
        const T_VEC w16[2]    = {add(i4, i4), sub(i12, i12)};                                                          \
504
0
        const T_VEC w17[2]    = {sub(i4, i4), sub(sub(k_weight0, i12), i12)};                                          \
505
0
        const T_VEC w18[2]    = {add(w14, w16[0]), w16[1]};                                                            \
506
0
        const T_VEC w19[2]    = {sub(w14, w16[0]), sub(k_weight0, w16[1])};                                            \
507
0
        const T_VEC w20[2]    = {add(w15, w17[1]), sub(k_weight0, w17[0])};                                            \
508
0
        const T_VEC w21[2]    = {sub(w15, w17[1]), w17[0]};                                                            \
509
0
        const T_VEC w22[2]    = {add(i2, i6), sub(i14, i10)};                                                          \
510
0
        const T_VEC w23[2]    = {sub(i2, i6), sub(sub(k_weight0, i10), i14)};                                          \
511
0
        const T_VEC w24[2]    = {add(i6, i2), sub(i10, i14)};                                                          \
512
0
        const T_VEC w25[2]    = {sub(i6, i2), sub(sub(k_weight0, i14), i10)};                                          \
513
0
        const T_VEC w26[2]    = {add(w22[0], w24[0]), add(w22[1], w24[1])};                                            \
514
0
        const T_VEC w27[2]    = {sub(w22[0], w24[0]), sub(w22[1], w24[1])};                                            \
515
0
        const T_VEC w28[2]    = {add(w23[0], w25[1]), sub(w23[1], w25[0])};                                            \
516
0
        const T_VEC w29[2]    = {sub(w23[0], w25[1]), add(w23[1], w25[0])};                                            \
517
0
        const T_VEC w30[2]    = {add(w18[0], w26[0]), add(w18[1], w26[1])};                                            \
518
0
        const T_VEC w31[2]    = {sub(w18[0], w26[0]), sub(w18[1], w26[1])};                                            \
519
0
        const T_VEC w32[2]    = {add(w20[0], mul(k_weight2, add(w28[0], w28[1]))),                                     \
520
0
                                 add(w20[1], mul(k_weight2, sub(w28[1], w28[0])))};                                    \
521
0
        const T_VEC w33[2]    = {add(w20[0], sub(sub(k_weight0, mul(k_weight2, w28[0])), mul(k_weight2, w28[1]))),     \
522
0
                                 add(w20[1], mul(k_weight2, sub(w28[0], w28[1])))};                                    \
523
0
        const T_VEC w34[2]    = {add(w19[0], w27[1]), sub(w19[1], w27[0])};                                            \
524
0
        const T_VEC w35[2]    = {sub(w19[0], w27[1]), add(w19[1], w27[0])};                                            \
525
0
        const T_VEC w36[2]    = {sub(w21[0], mul(k_weight2, sub(w29[0], w29[1]))),                                     \
526
0
                                 sub(w21[1], mul(k_weight2, add(w29[1], w29[0])))};                                    \
527
0
        const T_VEC w37[2]    = {add(w21[0], mul(k_weight2, sub(w29[0], w29[1]))),                                     \
528
0
                                 add(w21[1], mul(k_weight2, add(w29[1], w29[0])))};                                    \
529
0
        const T_VEC w38[2]    = {add(i1, i7), sub(i15, i9)};                                                           \
530
0
        const T_VEC w39[2]    = {sub(i1, i7), sub(sub(k_weight0, i9), i15)};                                           \
531
0
        const T_VEC w40[2]    = {add(i5, i3), sub(i11, i13)};                                                          \
532
0
        const T_VEC w41[2]    = {sub(i5, i3), sub(sub(k_weight0, i13), i11)};                                          \
533
0
        const T_VEC w42[2]    = {add(w38[0], w40[0]), add(w38[1], w40[1])};                                            \
534
0
        const T_VEC w43[2]    = {sub(w38[0], w40[0]), sub(w38[1], w40[1])};                                            \
535
0
        const T_VEC w44[2]    = {add(w39[0], w41[1]), sub(w39[1], w41[0])};                                            \
536
0
        const T_VEC w45[2]    = {sub(w39[0], w41[1]), add(w39[1], w41[0])};                                            \
537
0
        const T_VEC w46[2]    = {add(i3, i5), sub(i13, i11)};                                                          \
538
0
        const T_VEC w47[2]    = {sub(i3, i5), sub(sub(k_weight0, i11), i13)};                                          \
539
0
        const T_VEC w48[2]    = {add(i7, i1), sub(i9, i15)};                                                           \
540
0
        const T_VEC w49[2]    = {sub(i7, i1), sub(sub(k_weight0, i15), i9)};                                           \
541
0
        const T_VEC w50[2]    = {add(w46[0], w48[0]), add(w46[1], w48[1])};                                            \
542
0
        const T_VEC w51[2]    = {sub(w46[0], w48[0]), sub(w46[1], w48[1])};                                            \
543
0
        const T_VEC w52[2]    = {add(w47[0], w49[1]), sub(w47[1], w49[0])};                                            \
544
0
        const T_VEC w53[2]    = {sub(w47[0], w49[1]), add(w47[1], w49[0])};                                            \
545
0
        const T_VEC w54[2]    = {add(w42[0], w50[0]), add(w42[1], w50[1])};                                            \
546
0
        const T_VEC w55[2]    = {sub(w42[0], w50[0]), sub(w42[1], w50[1])};                                            \
547
0
        const T_VEC w56[2]    = {add(w44[0], mul(k_weight2, add(w52[0], w52[1]))),                                     \
548
0
                                 add(w44[1], mul(k_weight2, sub(w52[1], w52[0])))};                                    \
549
0
        const T_VEC w57[2]    = {add(w44[0], sub(sub(k_weight0, mul(k_weight2, w52[0])), mul(k_weight2, w52[1]))),     \
550
0
                                 add(w44[1], mul(k_weight2, sub(w52[0], w52[1])))};                                    \
551
0
        const T_VEC w58[2]    = {add(w43[0], w51[1]), sub(w43[1], w51[0])};                                            \
552
0
        const T_VEC w59[2]    = {sub(w43[0], w51[1]), add(w43[1], w51[0])};                                            \
553
0
        const T_VEC w60[2]    = {sub(w45[0], mul(k_weight2, sub(w53[0], w53[1]))),                                     \
554
0
                                 sub(w45[1], mul(k_weight2, add(w53[1], w53[0])))};                                    \
555
0
        const T_VEC w61[2]    = {add(w45[0], mul(k_weight2, sub(w53[0], w53[1]))),                                     \
556
0
                                 add(w45[1], mul(k_weight2, add(w53[1], w53[0])))};                                    \
557
0
        store(output + 0 * stride, add(w30[0], w54[0]));                                                               \
558
0
        store(output + 1 * stride, add(w32[0], add(mul(k_weight3, w56[0]), mul(k_weight4, w56[1]))));                  \
559
0
        store(output + 2 * stride, add(w34[0], mul(k_weight2, add(w58[0], w58[1]))));                                  \
560
0
        store(output + 3 * stride, add(w36[0], add(mul(k_weight4, w60[0]), mul(k_weight3, w60[1]))));                  \
561
0
        store(output + 4 * stride, add(w31[0], w55[1]));                                                               \
562
0
        store(output + 5 * stride, sub(w33[0], sub(mul(k_weight4, w57[0]), mul(k_weight3, w57[1]))));                  \
563
0
        store(output + 6 * stride, sub(w35[0], mul(k_weight2, sub(w59[0], w59[1]))));                                  \
564
0
        store(output + 7 * stride, sub(w37[0], sub(mul(k_weight3, w61[0]), mul(k_weight4, w61[1]))));                  \
565
0
        store(output + 8 * stride, sub(w30[0], w54[0]));                                                               \
566
0
        store(output + 9 * stride, add(w32[0], sub(sub(k_weight0, mul(k_weight3, w56[0])), mul(k_weight4, w56[1]))));  \
567
0
        store(output + 10 * stride, add(w34[0], sub(sub(k_weight0, mul(k_weight2, w58[0])), mul(k_weight2, w58[1])))); \
568
0
        store(output + 11 * stride, add(w36[0], sub(sub(k_weight0, mul(k_weight4, w60[0])), mul(k_weight3, w60[1])))); \
569
0
        store(output + 12 * stride, sub(w31[0], w55[1]));                                                              \
570
0
        store(output + 13 * stride, add(w33[0], sub(mul(k_weight4, w57[0]), mul(k_weight3, w57[1]))));                 \
571
0
        store(output + 14 * stride, add(w35[0], mul(k_weight2, sub(w59[0], w59[1]))));                                 \
572
0
        store(output + 15 * stride, add(w37[0], sub(mul(k_weight3, w61[0]), mul(k_weight4, w61[1]))));                 \
573
0
    }
574
/*!\brief Generates a 32-point 1d inverse-fft function.
 *
 * Expands to the definition of svt_aom_ifft1d_32_<suffix>(input, output, stride).
 * The generated function performs 32 load() calls at input + k * stride
 * (k = 0..31), combines the loaded values through a fixed, fully unrolled
 * sequence of add/sub/mul operations, and performs 32 store() calls at
 * output + k * stride. The body has no loops, branches or return statement,
 * and no overall scaling constant is applied here (the only multipliers are
 * k_weight2..k_weight8).
 *
 * Going by the packing described for the 1d function pointer type at the top
 * of this file, input[0..16] should be the real parts r_0..r_16 and
 * input[17..31] the imaginary parts i_1..i_15 -- TODO confirm against callers.
 * The two-element w* temporaries presumably hold {real, imaginary} pairs.
 *
 * \param ret       Tokens placed before the function name (its return type).
 * \param suffix    Pasted onto svt_aom_ifft1d_32_ to form the function name.
 * \param T         Element type of the input and output pointers.
 * \param T_VEC     Type of the values produced by load/constant/add/sub/mul.
 * \param load      load(ptr): reads a T_VEC through a const T pointer.
 * \param store     store(ptr, v): writes the T_VEC v through a T pointer.
 * \param constant  constant(f): builds a T_VEC from a float literal.
 * \param add       add(a, b) on T_VEC operands.
 * \param sub       sub(a, b) on T_VEC operands.
 * \param mul       mul(a, b) on T_VEC operands.
 */
#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul)                                       \
575
0
    ret svt_aom_ifft1d_32_##suffix(const T* input, T* output, int32_t stride) {                                        \
576
0
        const T_VEC k_weight0 = constant(0.0f); /* zero; sub(k_weight0, x) negates x */                                \
577
0
        const T_VEC k_weight2 = constant(0.707107f); /* ~cos(pi/4) = sin(pi/4) */                                      \
578
0
        const T_VEC k_weight3 = constant(0.92388f); /* ~cos(pi/8) */                                                   \
579
0
        const T_VEC k_weight4 = constant(0.382683f); /* ~sin(pi/8) */                                                  \
580
0
        const T_VEC k_weight5 = constant(0.980785f); /* ~cos(pi/16) */                                                 \
581
0
        const T_VEC k_weight6 = constant(0.19509f); /* ~sin(pi/16) */                                                  \
582
0
        const T_VEC k_weight7 = constant(0.83147f); /* ~cos(3*pi/16) */                                                \
583
0
        const T_VEC k_weight8 = constant(0.55557f); /* ~sin(3*pi/16) */                                                \
584
0
        const T_VEC i0        = load(input + 0 * stride); /* i0..i31: one load per stride step */                      \
585
0
        const T_VEC i1        = load(input + 1 * stride);                                                              \
586
0
        const T_VEC i2        = load(input + 2 * stride);                                                              \
587
0
        const T_VEC i3        = load(input + 3 * stride);                                                              \
588
0
        const T_VEC i4        = load(input + 4 * stride);                                                              \
589
0
        const T_VEC i5        = load(input + 5 * stride);                                                              \
590
0
        const T_VEC i6        = load(input + 6 * stride);                                                              \
591
0
        const T_VEC i7        = load(input + 7 * stride);                                                              \
592
0
        const T_VEC i8        = load(input + 8 * stride);                                                              \
593
0
        const T_VEC i9        = load(input + 9 * stride);                                                              \
594
0
        const T_VEC i10       = load(input + 10 * stride);                                                             \
595
0
        const T_VEC i11       = load(input + 11 * stride);                                                             \
596
0
        const T_VEC i12       = load(input + 12 * stride);                                                             \
597
0
        const T_VEC i13       = load(input + 13 * stride);                                                             \
598
0
        const T_VEC i14       = load(input + 14 * stride);                                                             \
599
0
        const T_VEC i15       = load(input + 15 * stride);                                                             \
600
0
        const T_VEC i16       = load(input + 16 * stride);                                                             \
601
0
        const T_VEC i17       = load(input + 17 * stride);                                                             \
602
0
        const T_VEC i18       = load(input + 18 * stride);                                                             \
603
0
        const T_VEC i19       = load(input + 19 * stride);                                                             \
604
0
        const T_VEC i20       = load(input + 20 * stride);                                                             \
605
0
        const T_VEC i21       = load(input + 21 * stride);                                                             \
606
0
        const T_VEC i22       = load(input + 22 * stride);                                                             \
607
0
        const T_VEC i23       = load(input + 23 * stride);                                                             \
608
0
        const T_VEC i24       = load(input + 24 * stride);                                                             \
609
0
        const T_VEC i25       = load(input + 25 * stride);                                                             \
610
0
        const T_VEC i26       = load(input + 26 * stride);                                                             \
611
0
        const T_VEC i27       = load(input + 27 * stride);                                                             \
612
0
        const T_VEC i28       = load(input + 28 * stride);                                                             \
613
0
        const T_VEC i29       = load(input + 29 * stride);                                                             \
614
0
        const T_VEC i30       = load(input + 30 * stride);                                                             \
615
0
        const T_VEC i31       = load(input + 31 * stride);                                                             \
616
0
        const T_VEC w30       = add(i0, i16); /* w30..w157: intermediate terms */                                      \
617
0
        const T_VEC w31       = sub(i0, i16);                                                                          \
618
0
        const T_VEC w32[2]    = {add(i8, i8), sub(i24, i24)};                                                          \
619
0
        const T_VEC w33[2]    = {sub(i8, i8), sub(sub(k_weight0, i24), i24)};                                          \
620
0
        const T_VEC w34[2]    = {add(w30, w32[0]), w32[1]};                                                            \
621
0
        const T_VEC w35[2]    = {sub(w30, w32[0]), sub(k_weight0, w32[1])};                                            \
622
0
        const T_VEC w36[2]    = {add(w31, w33[1]), sub(k_weight0, w33[0])};                                            \
623
0
        const T_VEC w37[2]    = {sub(w31, w33[1]), w33[0]};                                                            \
624
0
        const T_VEC w38[2]    = {add(i4, i12), sub(i28, i20)};                                                         \
625
0
        const T_VEC w39[2]    = {sub(i4, i12), sub(sub(k_weight0, i20), i28)};                                         \
626
0
        const T_VEC w40[2]    = {add(i12, i4), sub(i20, i28)};                                                         \
627
0
        const T_VEC w41[2]    = {sub(i12, i4), sub(sub(k_weight0, i28), i20)};                                         \
628
0
        const T_VEC w42[2]    = {add(w38[0], w40[0]), add(w38[1], w40[1])};                                            \
629
0
        const T_VEC w43[2]    = {sub(w38[0], w40[0]), sub(w38[1], w40[1])};                                            \
630
0
        const T_VEC w44[2]    = {add(w39[0], w41[1]), sub(w39[1], w41[0])};                                            \
631
0
        const T_VEC w45[2]    = {sub(w39[0], w41[1]), add(w39[1], w41[0])};                                            \
632
0
        const T_VEC w46[2]    = {add(w34[0], w42[0]), add(w34[1], w42[1])};                                            \
633
0
        const T_VEC w47[2]    = {sub(w34[0], w42[0]), sub(w34[1], w42[1])};                                            \
634
0
        const T_VEC w48[2]    = {add(w36[0], mul(k_weight2, add(w44[0], w44[1]))),                                     \
635
0
                                 add(w36[1], mul(k_weight2, sub(w44[1], w44[0])))};                                    \
636
0
        const T_VEC w49[2]    = {add(w36[0], sub(sub(k_weight0, mul(k_weight2, w44[0])), mul(k_weight2, w44[1]))),     \
637
0
                                 add(w36[1], mul(k_weight2, sub(w44[0], w44[1])))};                                    \
638
0
        const T_VEC w50[2]    = {add(w35[0], w43[1]), sub(w35[1], w43[0])};                                            \
639
0
        const T_VEC w51[2]    = {sub(w35[0], w43[1]), add(w35[1], w43[0])};                                            \
640
0
        const T_VEC w52[2]    = {sub(w37[0], mul(k_weight2, sub(w45[0], w45[1]))),                                     \
641
0
                                 sub(w37[1], mul(k_weight2, add(w45[1], w45[0])))};                                    \
642
0
        const T_VEC w53[2]    = {add(w37[0], mul(k_weight2, sub(w45[0], w45[1]))),                                     \
643
0
                                 add(w37[1], mul(k_weight2, add(w45[1], w45[0])))};                                    \
644
0
        const T_VEC w54[2]    = {add(i2, i14), sub(i30, i18)};                                                         \
645
0
        const T_VEC w55[2]    = {sub(i2, i14), sub(sub(k_weight0, i18), i30)};                                         \
646
0
        const T_VEC w56[2]    = {add(i10, i6), sub(i22, i26)};                                                         \
647
0
        const T_VEC w57[2]    = {sub(i10, i6), sub(sub(k_weight0, i26), i22)};                                         \
648
0
        const T_VEC w58[2]    = {add(w54[0], w56[0]), add(w54[1], w56[1])};                                            \
649
0
        const T_VEC w59[2]    = {sub(w54[0], w56[0]), sub(w54[1], w56[1])};                                            \
650
0
        const T_VEC w60[2]    = {add(w55[0], w57[1]), sub(w55[1], w57[0])};                                            \
651
0
        const T_VEC w61[2]    = {sub(w55[0], w57[1]), add(w55[1], w57[0])};                                            \
652
0
        const T_VEC w62[2]    = {add(i6, i10), sub(i26, i22)};                                                         \
653
0
        const T_VEC w63[2]    = {sub(i6, i10), sub(sub(k_weight0, i22), i26)};                                         \
654
0
        const T_VEC w64[2]    = {add(i14, i2), sub(i18, i30)};                                                         \
655
0
        const T_VEC w65[2]    = {sub(i14, i2), sub(sub(k_weight0, i30), i18)};                                         \
656
0
        const T_VEC w66[2]    = {add(w62[0], w64[0]), add(w62[1], w64[1])};                                            \
657
0
        const T_VEC w67[2]    = {sub(w62[0], w64[0]), sub(w62[1], w64[1])};                                            \
658
0
        const T_VEC w68[2]    = {add(w63[0], w65[1]), sub(w63[1], w65[0])};                                            \
659
0
        const T_VEC w69[2]    = {sub(w63[0], w65[1]), add(w63[1], w65[0])};                                            \
660
0
        const T_VEC w70[2]    = {add(w58[0], w66[0]), add(w58[1], w66[1])};                                            \
661
0
        const T_VEC w71[2]    = {sub(w58[0], w66[0]), sub(w58[1], w66[1])};                                            \
662
0
        const T_VEC w72[2]    = {add(w60[0], mul(k_weight2, add(w68[0], w68[1]))),                                     \
663
0
                                 add(w60[1], mul(k_weight2, sub(w68[1], w68[0])))};                                    \
664
0
        const T_VEC w73[2]    = {add(w60[0], sub(sub(k_weight0, mul(k_weight2, w68[0])), mul(k_weight2, w68[1]))),     \
665
0
                                 add(w60[1], mul(k_weight2, sub(w68[0], w68[1])))};                                    \
666
0
        const T_VEC w74[2]    = {add(w59[0], w67[1]), sub(w59[1], w67[0])};                                            \
667
0
        const T_VEC w75[2]    = {sub(w59[0], w67[1]), add(w59[1], w67[0])};                                            \
668
0
        const T_VEC w76[2]    = {sub(w61[0], mul(k_weight2, sub(w69[0], w69[1]))),                                     \
669
0
                                 sub(w61[1], mul(k_weight2, add(w69[1], w69[0])))};                                    \
670
0
        const T_VEC w77[2]    = {add(w61[0], mul(k_weight2, sub(w69[0], w69[1]))),                                     \
671
0
                                 add(w61[1], mul(k_weight2, add(w69[1], w69[0])))};                                    \
672
0
        const T_VEC w78[2]    = {add(w46[0], w70[0]), add(w46[1], w70[1])};                                            \
673
0
        const T_VEC w79[2]    = {sub(w46[0], w70[0]), sub(w46[1], w70[1])};                                            \
674
0
        const T_VEC w80[2]    = {add(w48[0], add(mul(k_weight3, w72[0]), mul(k_weight4, w72[1]))),                     \
675
0
                                 add(w48[1], sub(mul(k_weight3, w72[1]), mul(k_weight4, w72[0])))};                    \
676
0
        const T_VEC w81[2]    = {add(w48[0], sub(sub(k_weight0, mul(k_weight3, w72[0])), mul(k_weight4, w72[1]))),     \
677
0
                                 add(w48[1], sub(mul(k_weight4, w72[0]), mul(k_weight3, w72[1])))};                    \
678
0
        const T_VEC w82[2]    = {add(w50[0], mul(k_weight2, add(w74[0], w74[1]))),                                     \
679
0
                                 add(w50[1], mul(k_weight2, sub(w74[1], w74[0])))};                                    \
680
0
        const T_VEC w83[2]    = {add(w50[0], sub(sub(k_weight0, mul(k_weight2, w74[0])), mul(k_weight2, w74[1]))),     \
681
0
                                 add(w50[1], mul(k_weight2, sub(w74[0], w74[1])))};                                    \
682
0
        const T_VEC w84[2]    = {add(w52[0], add(mul(k_weight4, w76[0]), mul(k_weight3, w76[1]))),                     \
683
0
                                 add(w52[1], sub(mul(k_weight4, w76[1]), mul(k_weight3, w76[0])))};                    \
684
0
        const T_VEC w85[2]    = {add(w52[0], sub(sub(k_weight0, mul(k_weight4, w76[0])), mul(k_weight3, w76[1]))),     \
685
0
                                 add(w52[1], sub(mul(k_weight3, w76[0]), mul(k_weight4, w76[1])))};                    \
686
0
        const T_VEC w86[2]    = {add(w47[0], w71[1]), sub(w47[1], w71[0])};                                            \
687
0
        const T_VEC w87[2]    = {sub(w47[0], w71[1]), add(w47[1], w71[0])};                                            \
688
0
        const T_VEC w88[2]    = {sub(w49[0], sub(mul(k_weight4, w73[0]), mul(k_weight3, w73[1]))),                     \
689
0
                                 add(w49[1], sub(sub(k_weight0, mul(k_weight4, w73[1])), mul(k_weight3, w73[0])))};    \
690
0
        const T_VEC w89[2]    = {add(w49[0], sub(mul(k_weight4, w73[0]), mul(k_weight3, w73[1]))),                     \
691
0
                                 add(w49[1], add(mul(k_weight4, w73[1]), mul(k_weight3, w73[0])))};                    \
692
0
        const T_VEC w90[2]    = {sub(w51[0], mul(k_weight2, sub(w75[0], w75[1]))),                                     \
693
0
                                 sub(w51[1], mul(k_weight2, add(w75[1], w75[0])))};                                    \
694
0
        const T_VEC w91[2]    = {add(w51[0], mul(k_weight2, sub(w75[0], w75[1]))),                                     \
695
0
                                 add(w51[1], mul(k_weight2, add(w75[1], w75[0])))};                                    \
696
0
        const T_VEC w92[2]    = {sub(w53[0], sub(mul(k_weight3, w77[0]), mul(k_weight4, w77[1]))),                     \
697
0
                                 add(w53[1], sub(sub(k_weight0, mul(k_weight3, w77[1])), mul(k_weight4, w77[0])))};    \
698
0
        const T_VEC w93[2]    = {add(w53[0], sub(mul(k_weight3, w77[0]), mul(k_weight4, w77[1]))),                     \
699
0
                                 add(w53[1], add(mul(k_weight3, w77[1]), mul(k_weight4, w77[0])))};                    \
700
0
        const T_VEC w94[2]    = {add(i1, i15), sub(i31, i17)};                                                         \
701
0
        const T_VEC w95[2]    = {sub(i1, i15), sub(sub(k_weight0, i17), i31)};                                         \
702
0
        const T_VEC w96[2]    = {add(i9, i7), sub(i23, i25)};                                                          \
703
0
        const T_VEC w97[2]    = {sub(i9, i7), sub(sub(k_weight0, i25), i23)};                                          \
704
0
        const T_VEC w98[2]    = {add(w94[0], w96[0]), add(w94[1], w96[1])};                                            \
705
0
        const T_VEC w99[2]    = {sub(w94[0], w96[0]), sub(w94[1], w96[1])};                                            \
706
0
        const T_VEC w100[2]   = {add(w95[0], w97[1]), sub(w95[1], w97[0])};                                            \
707
0
        const T_VEC w101[2]   = {sub(w95[0], w97[1]), add(w95[1], w97[0])};                                            \
708
0
        const T_VEC w102[2]   = {add(i5, i11), sub(i27, i21)};                                                         \
709
0
        const T_VEC w103[2]   = {sub(i5, i11), sub(sub(k_weight0, i21), i27)};                                         \
710
0
        const T_VEC w104[2]   = {add(i13, i3), sub(i19, i29)};                                                         \
711
0
        const T_VEC w105[2]   = {sub(i13, i3), sub(sub(k_weight0, i29), i19)};                                         \
712
0
        const T_VEC w106[2]   = {add(w102[0], w104[0]), add(w102[1], w104[1])};                                        \
713
0
        const T_VEC w107[2]   = {sub(w102[0], w104[0]), sub(w102[1], w104[1])};                                        \
714
0
        const T_VEC w108[2]   = {add(w103[0], w105[1]), sub(w103[1], w105[0])};                                        \
715
0
        const T_VEC w109[2]   = {sub(w103[0], w105[1]), add(w103[1], w105[0])};                                        \
716
0
        const T_VEC w110[2]   = {add(w98[0], w106[0]), add(w98[1], w106[1])};                                          \
717
0
        const T_VEC w111[2]   = {sub(w98[0], w106[0]), sub(w98[1], w106[1])};                                          \
718
0
        const T_VEC w112[2]   = {add(w100[0], mul(k_weight2, add(w108[0], w108[1]))),                                  \
719
0
                                 add(w100[1], mul(k_weight2, sub(w108[1], w108[0])))};                                 \
720
0
        const T_VEC w113[2]   = {add(w100[0], sub(sub(k_weight0, mul(k_weight2, w108[0])), mul(k_weight2, w108[1]))),  \
721
0
                                 add(w100[1], mul(k_weight2, sub(w108[0], w108[1])))};                                 \
722
0
        const T_VEC w114[2]   = {add(w99[0], w107[1]), sub(w99[1], w107[0])};                                          \
723
0
        const T_VEC w115[2]   = {sub(w99[0], w107[1]), add(w99[1], w107[0])};                                          \
724
0
        const T_VEC w116[2]   = {sub(w101[0], mul(k_weight2, sub(w109[0], w109[1]))),                                  \
725
0
                                 sub(w101[1], mul(k_weight2, add(w109[1], w109[0])))};                                 \
726
0
        const T_VEC w117[2]   = {add(w101[0], mul(k_weight2, sub(w109[0], w109[1]))),                                  \
727
0
                                 add(w101[1], mul(k_weight2, add(w109[1], w109[0])))};                                 \
728
0
        const T_VEC w118[2]   = {add(i3, i13), sub(i29, i19)};                                                         \
729
0
        const T_VEC w119[2]   = {sub(i3, i13), sub(sub(k_weight0, i19), i29)};                                         \
730
0
        const T_VEC w120[2]   = {add(i11, i5), sub(i21, i27)};                                                         \
731
0
        const T_VEC w121[2]   = {sub(i11, i5), sub(sub(k_weight0, i27), i21)};                                         \
732
0
        const T_VEC w122[2]   = {add(w118[0], w120[0]), add(w118[1], w120[1])};                                        \
733
0
        const T_VEC w123[2]   = {sub(w118[0], w120[0]), sub(w118[1], w120[1])};                                        \
734
0
        const T_VEC w124[2]   = {add(w119[0], w121[1]), sub(w119[1], w121[0])};                                        \
735
0
        const T_VEC w125[2]   = {sub(w119[0], w121[1]), add(w119[1], w121[0])};                                        \
736
0
        const T_VEC w126[2]   = {add(i7, i9), sub(i25, i23)};                                                          \
737
0
        const T_VEC w127[2]   = {sub(i7, i9), sub(sub(k_weight0, i23), i25)};                                          \
738
0
        const T_VEC w128[2]   = {add(i15, i1), sub(i17, i31)};                                                         \
739
0
        const T_VEC w129[2]   = {sub(i15, i1), sub(sub(k_weight0, i31), i17)};                                         \
740
0
        const T_VEC w130[2]   = {add(w126[0], w128[0]), add(w126[1], w128[1])};                                        \
741
0
        const T_VEC w131[2]   = {sub(w126[0], w128[0]), sub(w126[1], w128[1])};                                        \
742
0
        const T_VEC w132[2]   = {add(w127[0], w129[1]), sub(w127[1], w129[0])};                                        \
743
0
        const T_VEC w133[2]   = {sub(w127[0], w129[1]), add(w127[1], w129[0])};                                        \
744
0
        const T_VEC w134[2]   = {add(w122[0], w130[0]), add(w122[1], w130[1])};                                        \
745
0
        const T_VEC w135[2]   = {sub(w122[0], w130[0]), sub(w122[1], w130[1])};                                        \
746
0
        const T_VEC w136[2]   = {add(w124[0], mul(k_weight2, add(w132[0], w132[1]))),                                  \
747
0
                                 add(w124[1], mul(k_weight2, sub(w132[1], w132[0])))};                                 \
748
0
        const T_VEC w137[2]   = {add(w124[0], sub(sub(k_weight0, mul(k_weight2, w132[0])), mul(k_weight2, w132[1]))),  \
749
0
                                 add(w124[1], mul(k_weight2, sub(w132[0], w132[1])))};                                 \
750
0
        const T_VEC w138[2]   = {add(w123[0], w131[1]), sub(w123[1], w131[0])};                                        \
751
0
        const T_VEC w139[2]   = {sub(w123[0], w131[1]), add(w123[1], w131[0])};                                        \
752
0
        const T_VEC w140[2]   = {sub(w125[0], mul(k_weight2, sub(w133[0], w133[1]))),                                  \
753
0
                                 sub(w125[1], mul(k_weight2, add(w133[1], w133[0])))};                                 \
754
0
        const T_VEC w141[2]   = {add(w125[0], mul(k_weight2, sub(w133[0], w133[1]))),                                  \
755
0
                                 add(w125[1], mul(k_weight2, add(w133[1], w133[0])))};                                 \
756
0
        const T_VEC w142[2]   = {add(w110[0], w134[0]), add(w110[1], w134[1])};                                        \
757
0
        const T_VEC w143[2]   = {sub(w110[0], w134[0]), sub(w110[1], w134[1])};                                        \
758
0
        const T_VEC w144[2]   = {add(w112[0], add(mul(k_weight3, w136[0]), mul(k_weight4, w136[1]))),                  \
759
0
                                 add(w112[1], sub(mul(k_weight3, w136[1]), mul(k_weight4, w136[0])))};                 \
760
0
        const T_VEC w145[2]   = {add(w112[0], sub(sub(k_weight0, mul(k_weight3, w136[0])), mul(k_weight4, w136[1]))),  \
761
0
                                 add(w112[1], sub(mul(k_weight4, w136[0]), mul(k_weight3, w136[1])))};                 \
762
0
        const T_VEC w146[2]   = {add(w114[0], mul(k_weight2, add(w138[0], w138[1]))),                                  \
763
0
                                 add(w114[1], mul(k_weight2, sub(w138[1], w138[0])))};                                 \
764
0
        const T_VEC w147[2]   = {add(w114[0], sub(sub(k_weight0, mul(k_weight2, w138[0])), mul(k_weight2, w138[1]))),  \
765
0
                                 add(w114[1], mul(k_weight2, sub(w138[0], w138[1])))};                                 \
766
0
        const T_VEC w148[2]   = {add(w116[0], add(mul(k_weight4, w140[0]), mul(k_weight3, w140[1]))),                  \
767
0
                                 add(w116[1], sub(mul(k_weight4, w140[1]), mul(k_weight3, w140[0])))};                 \
768
0
        const T_VEC w149[2]   = {add(w116[0], sub(sub(k_weight0, mul(k_weight4, w140[0])), mul(k_weight3, w140[1]))),  \
769
0
                                 add(w116[1], sub(mul(k_weight3, w140[0]), mul(k_weight4, w140[1])))};                 \
770
0
        const T_VEC w150[2]   = {add(w111[0], w135[1]), sub(w111[1], w135[0])};                                        \
771
0
        const T_VEC w151[2]   = {sub(w111[0], w135[1]), add(w111[1], w135[0])};                                        \
772
0
        const T_VEC w152[2]   = {sub(w113[0], sub(mul(k_weight4, w137[0]), mul(k_weight3, w137[1]))),                  \
773
0
                                 add(w113[1], sub(sub(k_weight0, mul(k_weight4, w137[1])), mul(k_weight3, w137[0])))}; \
774
0
        const T_VEC w153[2]   = {add(w113[0], sub(mul(k_weight4, w137[0]), mul(k_weight3, w137[1]))),                  \
775
0
                                 add(w113[1], add(mul(k_weight4, w137[1]), mul(k_weight3, w137[0])))};                 \
776
0
        const T_VEC w154[2]   = {sub(w115[0], mul(k_weight2, sub(w139[0], w139[1]))),                                  \
777
0
                                 sub(w115[1], mul(k_weight2, add(w139[1], w139[0])))};                                 \
778
0
        const T_VEC w155[2]   = {add(w115[0], mul(k_weight2, sub(w139[0], w139[1]))),                                  \
779
0
                                 add(w115[1], mul(k_weight2, add(w139[1], w139[0])))};                                 \
780
0
        const T_VEC w156[2]   = {sub(w117[0], sub(mul(k_weight3, w141[0]), mul(k_weight4, w141[1]))),                  \
781
0
                                 add(w117[1], sub(sub(k_weight0, mul(k_weight3, w141[1])), mul(k_weight4, w141[0])))}; \
782
0
        const T_VEC w157[2]   = {add(w117[0], sub(mul(k_weight3, w141[0]), mul(k_weight4, w141[1]))),                  \
783
0
                                 add(w117[1], add(mul(k_weight3, w141[1]), mul(k_weight4, w141[0])))};                 \
784
0
        store(output + 0 * stride, add(w78[0], w142[0])); /* 32 stores, one per stride step */                         \
785
0
        store(output + 1 * stride, add(w80[0], add(mul(k_weight5, w144[0]), mul(k_weight6, w144[1]))));                \
786
0
        store(output + 2 * stride, add(w82[0], add(mul(k_weight3, w146[0]), mul(k_weight4, w146[1]))));                \
787
0
        store(output + 3 * stride, add(w84[0], add(mul(k_weight7, w148[0]), mul(k_weight8, w148[1]))));                \
788
0
        store(output + 4 * stride, add(w86[0], mul(k_weight2, add(w150[0], w150[1]))));                                \
789
0
        store(output + 5 * stride, add(w88[0], add(mul(k_weight8, w152[0]), mul(k_weight7, w152[1]))));                \
790
0
        store(output + 6 * stride, add(w90[0], add(mul(k_weight4, w154[0]), mul(k_weight3, w154[1]))));                \
791
0
        store(output + 7 * stride, add(w92[0], add(mul(k_weight6, w156[0]), mul(k_weight5, w156[1]))));                \
792
0
        store(output + 8 * stride, add(w79[0], w143[1]));                                                              \
793
0
        store(output + 9 * stride, sub(w81[0], sub(mul(k_weight6, w145[0]), mul(k_weight5, w145[1]))));                \
794
0
        store(output + 10 * stride, sub(w83[0], sub(mul(k_weight4, w147[0]), mul(k_weight3, w147[1]))));               \
795
0
        store(output + 11 * stride, sub(w85[0], sub(mul(k_weight8, w149[0]), mul(k_weight7, w149[1]))));               \
796
0
        store(output + 12 * stride, sub(w87[0], mul(k_weight2, sub(w151[0], w151[1]))));                               \
797
0
        store(output + 13 * stride, sub(w89[0], sub(mul(k_weight7, w153[0]), mul(k_weight8, w153[1]))));               \
798
0
        store(output + 14 * stride, sub(w91[0], sub(mul(k_weight3, w155[0]), mul(k_weight4, w155[1]))));               \
799
0
        store(output + 15 * stride, sub(w93[0], sub(mul(k_weight5, w157[0]), mul(k_weight6, w157[1]))));               \
800
0
        store(output + 16 * stride, sub(w78[0], w142[0]));                                                             \
801
0
        store(output + 17 * stride,                                                                                    \
802
0
              add(w80[0], sub(sub(k_weight0, mul(k_weight5, w144[0])), mul(k_weight6, w144[1]))));                     \
803
0
        store(output + 18 * stride,                                                                                    \
804
0
              add(w82[0], sub(sub(k_weight0, mul(k_weight3, w146[0])), mul(k_weight4, w146[1]))));                     \
805
0
        store(output + 19 * stride,                                                                                    \
806
0
              add(w84[0], sub(sub(k_weight0, mul(k_weight7, w148[0])), mul(k_weight8, w148[1]))));                     \
807
0
        store(output + 20 * stride,                                                                                    \
808
0
              add(w86[0], sub(sub(k_weight0, mul(k_weight2, w150[0])), mul(k_weight2, w150[1]))));                     \
809
0
        store(output + 21 * stride,                                                                                    \
810
0
              add(w88[0], sub(sub(k_weight0, mul(k_weight8, w152[0])), mul(k_weight7, w152[1]))));                     \
811
0
        store(output + 22 * stride,                                                                                    \
812
0
              add(w90[0], sub(sub(k_weight0, mul(k_weight4, w154[0])), mul(k_weight3, w154[1]))));                     \
813
0
        store(output + 23 * stride,                                                                                    \
814
0
              add(w92[0], sub(sub(k_weight0, mul(k_weight6, w156[0])), mul(k_weight5, w156[1]))));                     \
815
0
        store(output + 24 * stride, sub(w79[0], w143[1]));                                                             \
816
0
        store(output + 25 * stride, add(w81[0], sub(mul(k_weight6, w145[0]), mul(k_weight5, w145[1]))));               \
817
0
        store(output + 26 * stride, add(w83[0], sub(mul(k_weight4, w147[0]), mul(k_weight3, w147[1]))));               \
818
0
        store(output + 27 * stride, add(w85[0], sub(mul(k_weight8, w149[0]), mul(k_weight7, w149[1]))));               \
819
0
        store(output + 28 * stride, add(w87[0], mul(k_weight2, sub(w151[0], w151[1]))));                               \
820
0
        store(output + 29 * stride, add(w89[0], sub(mul(k_weight7, w153[0]), mul(k_weight8, w153[1]))));               \
821
0
        store(output + 30 * stride, add(w91[0], sub(mul(k_weight3, w155[0]), mul(k_weight4, w155[1]))));               \
822
0
        store(output + 31 * stride, add(w93[0], sub(mul(k_weight5, w157[0]), mul(k_weight6, w157[1]))));               \
823
0
    }
824
825
#endif // AOM_AOM_DSP_FFT_COMMON_H_