/work/svt-av1/Source/Lib/Codec/fft_common.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_FFT_COMMON_H_ |
13 | | #define AOM_AOM_DSP_FFT_COMMON_H_ |
14 | | |
15 | | #ifdef __cplusplus |
16 | | extern "C" { |
17 | | #endif |
18 | | |
/*!\brief A function pointer for computing 1d fft and ifft.
 *
 * The function will point to an implementation for a specific transform size,
 * and may perform the transforms using vectorized instructions.
 *
 * For a non-vectorized forward transform of size n, the input and output
 * buffers will be size n. The output takes advantage of conjugate symmetry and
 * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
 * (r_{j}, i_{j}) is the complex output for index j.
 *
 * An inverse transform will assume that the complex "input" is packed
 * similarly. Its output will be real.
 *
 * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
 *
 * Vectorized implementations are parallelized along the columns so that the fft
 * can be performed on multiple columns at a time. In such cases the data block
 * for input and output is typically square (n x n) and the stride will
 * correspond to the spacing between rows. At minimum, the input size must be
 * n x simd_vector_length.
 *
 * \param[in]  input   Input buffer. See above for size restrictions.
 * \param[out] output  Output buffer. See above for size restrictions.
 * \param[in]  stride  The spacing in number of elements between rows
 *                     (or elements)
 */
typedef void (*AomFft1dFunc)(const float* input, float* output, int32_t stride);
46 | | |
// Scalar (non-vectorized) forward transforms. They are declared here because
// some of the vectorized implementations use them.
void svt_aom_fft1d_4_float(const float* input, float* output, int32_t stride);
void svt_aom_fft1d_8_float(const float* input, float* output, int32_t stride);
void svt_aom_fft1d_16_float(const float* input, float* output, int32_t stride);
void svt_aom_fft1d_32_float(const float* input, float* output, int32_t stride);
53 | | |
/*!\brief Function pointer for transposing a matrix of floats.
 *
 * \param[in]  input   Input buffer (size n x n)
 * \param[out] output  Output buffer (size n x n)
 * \param[in]  n       Extent of one dimension of the square matrix.
 */
typedef void (*AomFftTransposeFunc)(const float* input, float* output, int32_t n);
61 | | |
/*!\brief Function pointer for re-arranging intermediate 2d transform results.
 *
 * After re-arrangement, the real and imaginary components will be packed
 * tightly next to each other.
 *
 * \param[in]  input   Input buffer (size n x n)
 * \param[out] output  Output buffer (size 2 x n x n)
 * \param[in]  n       Extent of one dimension of the square matrix.
 */
typedef void (*AomFftUnpackFunc)(const float* input, float* output, int32_t n);
72 | | |
73 | | /*!\brief Performs a 2d fft with the given functions. |
74 | | * |
75 | | * This generator function allows for multiple different implementations of 2d |
76 | | * fft with different vector operations, without having to redefine the main |
77 | | * body multiple times. |
78 | | * |
79 | | * \param[in] input Input buffer to run the transform on (size n x n) |
80 | | * \param[out] temp Working buffer for computing the transform (size n x n) |
81 | | * \param[out] output Output buffer (size 2 x n x n) |
82 | | * \param[in] tform Forward transform function |
83 | | * \param[in] transpose Transpose function (for n x n matrix) |
84 | | * \param[in] unpack Unpack function used to massage outputs to correct form |
85 | | * \param[in] vec_size Vector size (the transform is done vec_size units at |
86 | | * a time) |
87 | | */ |
88 | | void svt_aom_fft_2d_gen(const float* input, float* temp, float* output, int32_t n, AomFft1dFunc tform, |
89 | | AomFftTransposeFunc transpose, AomFftUnpackFunc unpack, int32_t vec_size); |
90 | | |
91 | | /*!\brief Perform a 2d inverse fft with the given helper functions |
92 | | * |
93 | | * \param[in] input Input buffer to run the transform on (size 2 x n x n) |
94 | | * \param[out] temp Working buffer for computations (size 2 x n x n) |
95 | | * \param[out] output Output buffer (size n x n) |
96 | | * \param[in] fft_single Forward transform function (non vectorized) |
97 | | * \param[in] fft_multi Forward transform function (vectorized) |
98 | | * \param[in] ifft_multi Inverse transform function (vectorized) |
99 | | * \param[in] transpose Transpose function (for n x n matrix) |
100 | | * \param[in] vec_size Vector size (the transform is done vec_size |
101 | | * units at a time) |
102 | | */ |
103 | | void svt_aom_ifft_2d_gen(const float* input, float* temp, float* output, int32_t n, AomFft1dFunc fft_single, |
104 | | AomFft1dFunc fft_multi, AomFft1dFunc ifft_multi, AomFftTransposeFunc transpose, |
105 | | int32_t vec_size); |
106 | | #ifdef __cplusplus |
107 | | } |
108 | | #endif |
109 | | |
110 | | // The macros below define 1D fft/ifft for different data types and for |
111 | | // different simd vector intrinsic types. |
112 | | |
/*!\brief Generates svt_aom_fft1d_2_<suffix>, a 2-point forward transform.
 *
 * The generated function reads two values spaced `stride` elements apart from
 * `input` and writes their sum and their difference to `output` with the same
 * spacing.
 *
 * Unlike the larger generators, this macro combines values with the built-in
 * `+` and `-` operators, so T_VEC must be a type those operators accept.
 *
 * \param ret     Return type (and any specifiers) of the generated function
 * \param suffix  Token appended to the generated function name
 * \param T       Element type of the input and output buffers
 * \param T_VEC   Type returned by load and accepted by store
 * \param load    load(ptr): reads a T_VEC starting at ptr
 * \param store   store(ptr, value): writes a T_VEC starting at ptr
 */
#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \
    ret svt_aom_fft1d_2_##suffix(const T* input, T* output, int32_t stride) { \
        const T_VEC i0 = load(input + 0 * stride); \
        const T_VEC i1 = load(input + 1 * stride); \
        store(output + 0 * stride, i0 + i1); \
        store(output + 1 * stride, i0 - i1); \
    }
120 | | |
/*!\brief Generates svt_aom_fft1d_4_<suffix>, a 4-point forward transform.
 *
 * The generated function reads four values spaced `stride` elements apart from
 * `input` and writes four results to `output` with the same spacing, in the
 * packed order [r_0, r_1, r_2, i_1].
 *
 * \param ret       Return type (and any specifiers) of the generated function
 * \param suffix    Token appended to the generated function name
 * \param T         Element type of the input and output buffers
 * \param T_VEC     Type returned by load/constant/add/sub and accepted by store
 * \param load      load(ptr): reads a T_VEC starting at ptr
 * \param store     store(ptr, value): writes a T_VEC starting at ptr
 * \param constant  constant(x): produces a T_VEC from a float literal
 * \param add       add(a, b): a + b on T_VEC values
 * \param sub       sub(a, b): a - b on T_VEC values
 */
#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
    ret svt_aom_fft1d_4_##suffix(const T* input, T* output, int32_t stride) { \
        const T_VEC k_weight0 = constant(0.0f); \
        const T_VEC i0 = load(input + 0 * stride); \
        const T_VEC i1 = load(input + 1 * stride); \
        const T_VEC i2 = load(input + 2 * stride); \
        const T_VEC i3 = load(input + 3 * stride); \
        const T_VEC w0 = add(i0, i2); \
        const T_VEC w1 = sub(i0, i2); \
        const T_VEC w2 = add(i1, i3); \
        const T_VEC w3 = sub(i1, i3); \
        store(output + 0 * stride, add(w0, w2)); \
        store(output + 1 * stride, w1); \
        store(output + 2 * stride, sub(w0, w2)); \
        /* Negation is expressed as 0 - x because only add/sub are available. */ \
        store(output + 3 * stride, sub(k_weight0, w3)); \
    }
137 | | |
/*!\brief Generates svt_aom_fft1d_8_<suffix>, an 8-point forward transform.
 *
 * The generated function reads eight values spaced `stride` elements apart
 * from `input` and writes eight results to `output` with the same spacing, in
 * the packed order [r_0, ..., r_4, i_1, i_2, i_3].
 *
 * \param ret       Return type (and any specifiers) of the generated function
 * \param suffix    Token appended to the generated function name
 * \param T         Element type of the input and output buffers
 * \param T_VEC     Type returned by load/constant/add/sub/mul and accepted by
 *                  store
 * \param load      load(ptr): reads a T_VEC starting at ptr
 * \param store     store(ptr, value): writes a T_VEC starting at ptr
 * \param constant  constant(x): produces a T_VEC from a float literal
 * \param add       add(a, b): a + b on T_VEC values
 * \param sub       sub(a, b): a - b on T_VEC values
 * \param mul       mul(a, b): a * b on T_VEC values
 */
#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
    ret svt_aom_fft1d_8_##suffix(const T* input, T* output, int32_t stride) { \
        const T_VEC k_weight0 = constant(0.0f); \
        /* 0.707107 ~= cos(pi/4) = sin(pi/4) */ \
        const T_VEC k_weight2 = constant(0.707107f); \
        const T_VEC i0 = load(input + 0 * stride); \
        const T_VEC i1 = load(input + 1 * stride); \
        const T_VEC i2 = load(input + 2 * stride); \
        const T_VEC i3 = load(input + 3 * stride); \
        const T_VEC i4 = load(input + 4 * stride); \
        const T_VEC i5 = load(input + 5 * stride); \
        const T_VEC i6 = load(input + 6 * stride); \
        const T_VEC i7 = load(input + 7 * stride); \
        /* Terms built from the even-indexed inputs. */ \
        const T_VEC w0 = add(i0, i4); \
        const T_VEC w1 = sub(i0, i4); \
        const T_VEC w2 = add(i2, i6); \
        const T_VEC w3 = sub(i2, i6); \
        const T_VEC w4 = add(w0, w2); \
        const T_VEC w5 = sub(w0, w2); \
        /* Terms built from the odd-indexed inputs. */ \
        const T_VEC w7 = add(i1, i5); \
        const T_VEC w8 = sub(i1, i5); \
        const T_VEC w9 = add(i3, i7); \
        const T_VEC w10 = sub(i3, i7); \
        const T_VEC w11 = add(w7, w9); \
        const T_VEC w12 = sub(w7, w9); \
        store(output + 0 * stride, add(w4, w11)); \
        store(output + 1 * stride, add(w1, mul(k_weight2, sub(w8, w10)))); \
        store(output + 2 * stride, w5); \
        store(output + 3 * stride, sub(w1, mul(k_weight2, sub(w8, w10)))); \
        store(output + 4 * stride, sub(w4, w11)); \
        store(output + 5 * stride, sub(sub(k_weight0, w3), mul(k_weight2, add(w10, w8)))); \
        store(output + 6 * stride, sub(k_weight0, w12)); \
        store(output + 7 * stride, sub(w3, mul(k_weight2, add(w10, w8)))); \
    }
171 | | |
/*!\brief Generates svt_aom_fft1d_16_<suffix>, a 16-point forward transform.
 *
 * The generated function reads sixteen values spaced `stride` elements apart
 * from `input` and writes sixteen results to `output` with the same spacing,
 * in the packed order [r_0, ..., r_8, i_1, ..., i_7].
 *
 * \param ret       Return type (and any specifiers) of the generated function
 * \param suffix    Token appended to the generated function name
 * \param T         Element type of the input and output buffers
 * \param T_VEC     Type returned by load/constant/add/sub/mul and accepted by
 *                  store
 * \param load      load(ptr): reads a T_VEC starting at ptr
 * \param store     store(ptr, value): writes a T_VEC starting at ptr
 * \param constant  constant(x): produces a T_VEC from a float literal
 * \param add       add(a, b): a + b on T_VEC values
 * \param sub       sub(a, b): a - b on T_VEC values
 * \param mul       mul(a, b): a * b on T_VEC values
 */
#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
    ret svt_aom_fft1d_16_##suffix(const T* input, T* output, int32_t stride) { \
        const T_VEC k_weight0 = constant(0.0f); \
        /* 0.707107 ~= cos(pi/4); 0.92388 ~= cos(pi/8); 0.382683 ~= sin(pi/8) */ \
        const T_VEC k_weight2 = constant(0.707107f); \
        const T_VEC k_weight3 = constant(0.92388f); \
        const T_VEC k_weight4 = constant(0.382683f); \
        const T_VEC i0 = load(input + 0 * stride); \
        const T_VEC i1 = load(input + 1 * stride); \
        const T_VEC i2 = load(input + 2 * stride); \
        const T_VEC i3 = load(input + 3 * stride); \
        const T_VEC i4 = load(input + 4 * stride); \
        const T_VEC i5 = load(input + 5 * stride); \
        const T_VEC i6 = load(input + 6 * stride); \
        const T_VEC i7 = load(input + 7 * stride); \
        const T_VEC i8 = load(input + 8 * stride); \
        const T_VEC i9 = load(input + 9 * stride); \
        const T_VEC i10 = load(input + 10 * stride); \
        const T_VEC i11 = load(input + 11 * stride); \
        const T_VEC i12 = load(input + 12 * stride); \
        const T_VEC i13 = load(input + 13 * stride); \
        const T_VEC i14 = load(input + 14 * stride); \
        const T_VEC i15 = load(input + 15 * stride); \
        /* Terms built from the even-indexed inputs. */ \
        const T_VEC w0 = add(i0, i8); \
        const T_VEC w1 = sub(i0, i8); \
        const T_VEC w2 = add(i4, i12); \
        const T_VEC w3 = sub(i4, i12); \
        const T_VEC w4 = add(w0, w2); \
        const T_VEC w5 = sub(w0, w2); \
        const T_VEC w7 = add(i2, i10); \
        const T_VEC w8 = sub(i2, i10); \
        const T_VEC w9 = add(i6, i14); \
        const T_VEC w10 = sub(i6, i14); \
        const T_VEC w11 = add(w7, w9); \
        const T_VEC w12 = sub(w7, w9); \
        const T_VEC w14 = add(w4, w11); \
        const T_VEC w15 = sub(w4, w11); \
        const T_VEC w16[2] = {add(w1, mul(k_weight2, sub(w8, w10))), \
                              sub(sub(k_weight0, w3), mul(k_weight2, add(w10, w8)))}; \
        const T_VEC w18[2] = {sub(w1, mul(k_weight2, sub(w8, w10))), sub(w3, mul(k_weight2, add(w10, w8)))}; \
        /* Terms built from the odd-indexed inputs. */ \
        const T_VEC w19 = add(i1, i9); \
        const T_VEC w20 = sub(i1, i9); \
        const T_VEC w21 = add(i5, i13); \
        const T_VEC w22 = sub(i5, i13); \
        const T_VEC w23 = add(w19, w21); \
        const T_VEC w24 = sub(w19, w21); \
        const T_VEC w26 = add(i3, i11); \
        const T_VEC w27 = sub(i3, i11); \
        const T_VEC w28 = add(i7, i15); \
        const T_VEC w29 = sub(i7, i15); \
        const T_VEC w30 = add(w26, w28); \
        const T_VEC w31 = sub(w26, w28); \
        const T_VEC w33 = add(w23, w30); \
        const T_VEC w34 = sub(w23, w30); \
        const T_VEC w35[2] = {add(w20, mul(k_weight2, sub(w27, w29))), \
                              sub(sub(k_weight0, w22), mul(k_weight2, add(w29, w27)))}; \
        const T_VEC w37[2] = {sub(w20, mul(k_weight2, sub(w27, w29))), sub(w22, mul(k_weight2, add(w29, w27)))}; \
        /* Outputs 0..8 hold the real parts, outputs 9..15 the imaginary parts. */ \
        store(output + 0 * stride, add(w14, w33)); \
        store(output + 1 * stride, add(w16[0], add(mul(k_weight3, w35[0]), mul(k_weight4, w35[1])))); \
        store(output + 2 * stride, add(w5, mul(k_weight2, sub(w24, w31)))); \
        store(output + 3 * stride, add(w18[0], add(mul(k_weight4, w37[0]), mul(k_weight3, w37[1])))); \
        store(output + 4 * stride, w15); \
        store(output + 5 * stride, add(w18[0], sub(sub(k_weight0, mul(k_weight4, w37[0])), mul(k_weight3, w37[1])))); \
        store(output + 6 * stride, sub(w5, mul(k_weight2, sub(w24, w31)))); \
        store(output + 7 * stride, add(w16[0], sub(sub(k_weight0, mul(k_weight3, w35[0])), mul(k_weight4, w35[1])))); \
        store(output + 8 * stride, sub(w14, w33)); \
        store(output + 9 * stride, add(w16[1], sub(mul(k_weight3, w35[1]), mul(k_weight4, w35[0])))); \
        store(output + 10 * stride, sub(sub(k_weight0, w12), mul(k_weight2, add(w31, w24)))); \
        store(output + 11 * stride, add(w18[1], sub(mul(k_weight4, w37[1]), mul(k_weight3, w37[0])))); \
        store(output + 12 * stride, sub(k_weight0, w34)); \
        store(output + 13 * stride, sub(sub(k_weight0, w18[1]), sub(mul(k_weight3, w37[0]), mul(k_weight4, w37[1])))); \
        store(output + 14 * stride, sub(w12, mul(k_weight2, add(w31, w24)))); \
        store(output + 15 * stride, sub(sub(k_weight0, w16[1]), sub(mul(k_weight4, w35[0]), mul(k_weight3, w35[1])))); \
    }
245 | | |
/*!\brief Generates svt_aom_fft1d_32_<suffix>, a 32-point forward transform.
 *
 * The generated function reads 32 values spaced `stride` elements apart from
 * `input` and writes 32 results to `output` with the same spacing, in the
 * packed order [r_0, ..., r_16, i_1, ..., i_15].
 *
 * \param ret       Return type (and any specifiers) of the generated function
 * \param suffix    Token appended to the generated function name
 * \param T         Element type of the input and output buffers
 * \param T_VEC     Type returned by load/constant/add/sub/mul and accepted by
 *                  store
 * \param load      load(ptr): reads a T_VEC starting at ptr
 * \param store     store(ptr, value): writes a T_VEC starting at ptr
 * \param constant  constant(x): produces a T_VEC from a float literal
 * \param add       add(a, b): a + b on T_VEC values
 * \param sub       sub(a, b): a - b on T_VEC values
 * \param mul       mul(a, b): a * b on T_VEC values
 */
#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
    ret svt_aom_fft1d_32_##suffix(const T* input, T* output, int32_t stride) { \
        const T_VEC k_weight0 = constant(0.0f); \
        /* 0.707107 ~= cos(pi/4); 0.92388 ~= cos(pi/8); 0.382683 ~= sin(pi/8) */ \
        const T_VEC k_weight2 = constant(0.707107f); \
        const T_VEC k_weight3 = constant(0.92388f); \
        const T_VEC k_weight4 = constant(0.382683f); \
        /* 0.980785 ~= cos(pi/16); 0.19509 ~= sin(pi/16) */ \
        const T_VEC k_weight5 = constant(0.980785f); \
        const T_VEC k_weight6 = constant(0.19509f); \
        /* 0.83147 ~= cos(3*pi/16); 0.55557 ~= sin(3*pi/16) */ \
        const T_VEC k_weight7 = constant(0.83147f); \
        const T_VEC k_weight8 = constant(0.55557f); \
        const T_VEC i0 = load(input + 0 * stride); \
        const T_VEC i1 = load(input + 1 * stride); \
        const T_VEC i2 = load(input + 2 * stride); \
        const T_VEC i3 = load(input + 3 * stride); \
        const T_VEC i4 = load(input + 4 * stride); \
        const T_VEC i5 = load(input + 5 * stride); \
        const T_VEC i6 = load(input + 6 * stride); \
        const T_VEC i7 = load(input + 7 * stride); \
        const T_VEC i8 = load(input + 8 * stride); \
        const T_VEC i9 = load(input + 9 * stride); \
        const T_VEC i10 = load(input + 10 * stride); \
        const T_VEC i11 = load(input + 11 * stride); \
        const T_VEC i12 = load(input + 12 * stride); \
        const T_VEC i13 = load(input + 13 * stride); \
        const T_VEC i14 = load(input + 14 * stride); \
        const T_VEC i15 = load(input + 15 * stride); \
        const T_VEC i16 = load(input + 16 * stride); \
        const T_VEC i17 = load(input + 17 * stride); \
        const T_VEC i18 = load(input + 18 * stride); \
        const T_VEC i19 = load(input + 19 * stride); \
        const T_VEC i20 = load(input + 20 * stride); \
        const T_VEC i21 = load(input + 21 * stride); \
        const T_VEC i22 = load(input + 22 * stride); \
        const T_VEC i23 = load(input + 23 * stride); \
        const T_VEC i24 = load(input + 24 * stride); \
        const T_VEC i25 = load(input + 25 * stride); \
        const T_VEC i26 = load(input + 26 * stride); \
        const T_VEC i27 = load(input + 27 * stride); \
        const T_VEC i28 = load(input + 28 * stride); \
        const T_VEC i29 = load(input + 29 * stride); \
        const T_VEC i30 = load(input + 30 * stride); \
        const T_VEC i31 = load(input + 31 * stride); \
        /* Terms built from the even-indexed inputs (w0 .. w46). */ \
        const T_VEC w0 = add(i0, i16); \
        const T_VEC w1 = sub(i0, i16); \
        const T_VEC w2 = add(i8, i24); \
        const T_VEC w3 = sub(i8, i24); \
        const T_VEC w4 = add(w0, w2); \
        const T_VEC w5 = sub(w0, w2); \
        const T_VEC w7 = add(i4, i20); \
        const T_VEC w8 = sub(i4, i20); \
        const T_VEC w9 = add(i12, i28); \
        const T_VEC w10 = sub(i12, i28); \
        const T_VEC w11 = add(w7, w9); \
        const T_VEC w12 = sub(w7, w9); \
        const T_VEC w14 = add(w4, w11); \
        const T_VEC w15 = sub(w4, w11); \
        const T_VEC w16[2] = {add(w1, mul(k_weight2, sub(w8, w10))), \
                              sub(sub(k_weight0, w3), mul(k_weight2, add(w10, w8)))}; \
        const T_VEC w18[2] = {sub(w1, mul(k_weight2, sub(w8, w10))), sub(w3, mul(k_weight2, add(w10, w8)))}; \
        const T_VEC w19 = add(i2, i18); \
        const T_VEC w20 = sub(i2, i18); \
        const T_VEC w21 = add(i10, i26); \
        const T_VEC w22 = sub(i10, i26); \
        const T_VEC w23 = add(w19, w21); \
        const T_VEC w24 = sub(w19, w21); \
        const T_VEC w26 = add(i6, i22); \
        const T_VEC w27 = sub(i6, i22); \
        const T_VEC w28 = add(i14, i30); \
        const T_VEC w29 = sub(i14, i30); \
        const T_VEC w30 = add(w26, w28); \
        const T_VEC w31 = sub(w26, w28); \
        const T_VEC w33 = add(w23, w30); \
        const T_VEC w34 = sub(w23, w30); \
        const T_VEC w35[2] = {add(w20, mul(k_weight2, sub(w27, w29))), \
                              sub(sub(k_weight0, w22), mul(k_weight2, add(w29, w27)))}; \
        const T_VEC w37[2] = {sub(w20, mul(k_weight2, sub(w27, w29))), sub(w22, mul(k_weight2, add(w29, w27)))}; \
        const T_VEC w38 = add(w14, w33); \
        const T_VEC w39 = sub(w14, w33); \
        const T_VEC w40[2] = {add(w16[0], add(mul(k_weight3, w35[0]), mul(k_weight4, w35[1]))), \
                              add(w16[1], sub(mul(k_weight3, w35[1]), mul(k_weight4, w35[0])))}; \
        const T_VEC w41[2] = {add(w5, mul(k_weight2, sub(w24, w31))), \
                              sub(sub(k_weight0, w12), mul(k_weight2, add(w31, w24)))}; \
        const T_VEC w42[2] = {add(w18[0], add(mul(k_weight4, w37[0]), mul(k_weight3, w37[1]))), \
                              add(w18[1], sub(mul(k_weight4, w37[1]), mul(k_weight3, w37[0])))}; \
        const T_VEC w44[2] = {add(w18[0], sub(sub(k_weight0, mul(k_weight4, w37[0])), mul(k_weight3, w37[1]))), \
                              sub(sub(k_weight0, w18[1]), sub(mul(k_weight3, w37[0]), mul(k_weight4, w37[1])))}; \
        const T_VEC w45[2] = {sub(w5, mul(k_weight2, sub(w24, w31))), sub(w12, mul(k_weight2, add(w31, w24)))}; \
        const T_VEC w46[2] = {add(w16[0], sub(sub(k_weight0, mul(k_weight3, w35[0])), mul(k_weight4, w35[1]))), \
                              sub(sub(k_weight0, w16[1]), sub(mul(k_weight4, w35[0]), mul(k_weight3, w35[1])))}; \
        /* Terms built from the odd-indexed inputs (w47 .. w93). */ \
        const T_VEC w47 = add(i1, i17); \
        const T_VEC w48 = sub(i1, i17); \
        const T_VEC w49 = add(i9, i25); \
        const T_VEC w50 = sub(i9, i25); \
        const T_VEC w51 = add(w47, w49); \
        const T_VEC w52 = sub(w47, w49); \
        const T_VEC w54 = add(i5, i21); \
        const T_VEC w55 = sub(i5, i21); \
        const T_VEC w56 = add(i13, i29); \
        const T_VEC w57 = sub(i13, i29); \
        const T_VEC w58 = add(w54, w56); \
        const T_VEC w59 = sub(w54, w56); \
        const T_VEC w61 = add(w51, w58); \
        const T_VEC w62 = sub(w51, w58); \
        const T_VEC w63[2] = {add(w48, mul(k_weight2, sub(w55, w57))), \
                              sub(sub(k_weight0, w50), mul(k_weight2, add(w57, w55)))}; \
        const T_VEC w65[2] = {sub(w48, mul(k_weight2, sub(w55, w57))), sub(w50, mul(k_weight2, add(w57, w55)))}; \
        const T_VEC w66 = add(i3, i19); \
        const T_VEC w67 = sub(i3, i19); \
        const T_VEC w68 = add(i11, i27); \
        const T_VEC w69 = sub(i11, i27); \
        const T_VEC w70 = add(w66, w68); \
        const T_VEC w71 = sub(w66, w68); \
        const T_VEC w73 = add(i7, i23); \
        const T_VEC w74 = sub(i7, i23); \
        const T_VEC w75 = add(i15, i31); \
        const T_VEC w76 = sub(i15, i31); \
        const T_VEC w77 = add(w73, w75); \
        const T_VEC w78 = sub(w73, w75); \
        const T_VEC w80 = add(w70, w77); \
        const T_VEC w81 = sub(w70, w77); \
        const T_VEC w82[2] = {add(w67, mul(k_weight2, sub(w74, w76))), \
                              sub(sub(k_weight0, w69), mul(k_weight2, add(w76, w74)))}; \
        const T_VEC w84[2] = {sub(w67, mul(k_weight2, sub(w74, w76))), sub(w69, mul(k_weight2, add(w76, w74)))}; \
        const T_VEC w85 = add(w61, w80); \
        const T_VEC w86 = sub(w61, w80); \
        const T_VEC w87[2] = {add(w63[0], add(mul(k_weight3, w82[0]), mul(k_weight4, w82[1]))), \
                              add(w63[1], sub(mul(k_weight3, w82[1]), mul(k_weight4, w82[0])))}; \
        const T_VEC w88[2] = {add(w52, mul(k_weight2, sub(w71, w78))), \
                              sub(sub(k_weight0, w59), mul(k_weight2, add(w78, w71)))}; \
        const T_VEC w89[2] = {add(w65[0], add(mul(k_weight4, w84[0]), mul(k_weight3, w84[1]))), \
                              add(w65[1], sub(mul(k_weight4, w84[1]), mul(k_weight3, w84[0])))}; \
        const T_VEC w91[2] = {add(w65[0], sub(sub(k_weight0, mul(k_weight4, w84[0])), mul(k_weight3, w84[1]))), \
                              sub(sub(k_weight0, w65[1]), sub(mul(k_weight3, w84[0]), mul(k_weight4, w84[1])))}; \
        const T_VEC w92[2] = {sub(w52, mul(k_weight2, sub(w71, w78))), sub(w59, mul(k_weight2, add(w78, w71)))}; \
        const T_VEC w93[2] = {add(w63[0], sub(sub(k_weight0, mul(k_weight3, w82[0])), mul(k_weight4, w82[1]))), \
                              sub(sub(k_weight0, w63[1]), sub(mul(k_weight4, w82[0]), mul(k_weight3, w82[1])))}; \
        /* Outputs 0..16 hold the real parts, outputs 17..31 the imaginary parts. */ \
        store(output + 0 * stride, add(w38, w85)); \
        store(output + 1 * stride, add(w40[0], add(mul(k_weight5, w87[0]), mul(k_weight6, w87[1])))); \
        store(output + 2 * stride, add(w41[0], add(mul(k_weight3, w88[0]), mul(k_weight4, w88[1])))); \
        store(output + 3 * stride, add(w42[0], add(mul(k_weight7, w89[0]), mul(k_weight8, w89[1])))); \
        store(output + 4 * stride, add(w15, mul(k_weight2, sub(w62, w81)))); \
        store(output + 5 * stride, add(w44[0], add(mul(k_weight8, w91[0]), mul(k_weight7, w91[1])))); \
        store(output + 6 * stride, add(w45[0], add(mul(k_weight4, w92[0]), mul(k_weight3, w92[1])))); \
        store(output + 7 * stride, add(w46[0], add(mul(k_weight6, w93[0]), mul(k_weight5, w93[1])))); \
        store(output + 8 * stride, w39); \
        store(output + 9 * stride, add(w46[0], sub(sub(k_weight0, mul(k_weight6, w93[0])), mul(k_weight5, w93[1])))); \
        store(output + 10 * stride, add(w45[0], sub(sub(k_weight0, mul(k_weight4, w92[0])), mul(k_weight3, w92[1])))); \
        store(output + 11 * stride, add(w44[0], sub(sub(k_weight0, mul(k_weight8, w91[0])), mul(k_weight7, w91[1])))); \
        store(output + 12 * stride, sub(w15, mul(k_weight2, sub(w62, w81)))); \
        store(output + 13 * stride, add(w42[0], sub(sub(k_weight0, mul(k_weight7, w89[0])), mul(k_weight8, w89[1])))); \
        store(output + 14 * stride, add(w41[0], sub(sub(k_weight0, mul(k_weight3, w88[0])), mul(k_weight4, w88[1])))); \
        store(output + 15 * stride, add(w40[0], sub(sub(k_weight0, mul(k_weight5, w87[0])), mul(k_weight6, w87[1])))); \
        store(output + 16 * stride, sub(w38, w85)); \
        store(output + 17 * stride, add(w40[1], sub(mul(k_weight5, w87[1]), mul(k_weight6, w87[0])))); \
        store(output + 18 * stride, add(w41[1], sub(mul(k_weight3, w88[1]), mul(k_weight4, w88[0])))); \
        store(output + 19 * stride, add(w42[1], sub(mul(k_weight7, w89[1]), mul(k_weight8, w89[0])))); \
        store(output + 20 * stride, sub(sub(k_weight0, w34), mul(k_weight2, add(w81, w62)))); \
        store(output + 21 * stride, add(w44[1], sub(mul(k_weight8, w91[1]), mul(k_weight7, w91[0])))); \
        store(output + 22 * stride, add(w45[1], sub(mul(k_weight4, w92[1]), mul(k_weight3, w92[0])))); \
        store(output + 23 * stride, add(w46[1], sub(mul(k_weight6, w93[1]), mul(k_weight5, w93[0])))); \
        store(output + 24 * stride, sub(k_weight0, w86)); \
        store(output + 25 * stride, sub(sub(k_weight0, w46[1]), sub(mul(k_weight5, w93[0]), mul(k_weight6, w93[1])))); \
        store(output + 26 * stride, sub(sub(k_weight0, w45[1]), sub(mul(k_weight3, w92[0]), mul(k_weight4, w92[1])))); \
        store(output + 27 * stride, sub(sub(k_weight0, w44[1]), sub(mul(k_weight7, w91[0]), mul(k_weight8, w91[1])))); \
        store(output + 28 * stride, sub(w34, mul(k_weight2, add(w81, w62)))); \
        store(output + 29 * stride, sub(sub(k_weight0, w42[1]), sub(mul(k_weight8, w89[0]), mul(k_weight7, w89[1])))); \
        store(output + 30 * stride, sub(sub(k_weight0, w41[1]), sub(mul(k_weight4, w88[0]), mul(k_weight3, w88[1])))); \
        store(output + 31 * stride, sub(sub(k_weight0, w40[1]), sub(mul(k_weight6, w87[0]), mul(k_weight5, w87[1])))); \
    }
415 | | |
/*!\brief Generates svt_aom_ifft1d_2_<suffix>, a 2-point inverse transform.
 *
 * The generated function reads two values spaced `stride` elements apart from
 * `input` and writes their sum and their difference to `output` with the same
 * spacing. No scaling is applied.
 *
 * This macro combines values with the built-in `+` and `-` operators, so
 * T_VEC must be a type those operators accept.
 *
 * \param ret     Return type (and any specifiers) of the generated function
 * \param suffix  Token appended to the generated function name
 * \param T       Element type of the input and output buffers
 * \param T_VEC   Type returned by load and accepted by store
 * \param load    load(ptr): reads a T_VEC starting at ptr
 * \param store   store(ptr, value): writes a T_VEC starting at ptr
 */
#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \
    ret svt_aom_ifft1d_2_##suffix(const T* input, T* output, int32_t stride) { \
        const T_VEC i0 = load(input + 0 * stride); \
        const T_VEC i1 = load(input + 1 * stride); \
        store(output + 0 * stride, i0 + i1); \
        store(output + 1 * stride, i0 - i1); \
    }
423 | | |
/*!\brief Generates svt_aom_ifft1d_4_<suffix>, a 4-point inverse transform.
 *
 * The generated function reads four values spaced `stride` elements apart from
 * `input`, interpreted in the packed order [r_0, r_1, r_2, i_1], and writes
 * four real results to `output` with the same spacing. No scaling is applied:
 * feeding it the output of the matching 4-point forward transform yields the
 * original samples multiplied by 4.
 *
 * \param ret       Return type (and any specifiers) of the generated function
 * \param suffix    Token appended to the generated function name
 * \param T         Element type of the input and output buffers
 * \param T_VEC     Type returned by load/constant/add/sub and accepted by store
 * \param load      load(ptr): reads a T_VEC starting at ptr
 * \param store     store(ptr, value): writes a T_VEC starting at ptr
 * \param constant  constant(x): produces a T_VEC from a float literal
 * \param add       add(a, b): a + b on T_VEC values
 * \param sub       sub(a, b): a - b on T_VEC values
 */
#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
    ret svt_aom_ifft1d_4_##suffix(const T* input, T* output, int32_t stride) { \
        const T_VEC k_weight0 = constant(0.0f); \
        const T_VEC i0 = load(input + 0 * stride); \
        const T_VEC i1 = load(input + 1 * stride); \
        const T_VEC i2 = load(input + 2 * stride); \
        const T_VEC i3 = load(input + 3 * stride); \
        const T_VEC w2 = add(i0, i2); \
        const T_VEC w3 = sub(i0, i2); \
        /* Only the lanes that reach an output are kept: 2*r_1 and -2*i_1. */ \
        const T_VEC w4 = add(i1, i1); \
        const T_VEC w5 = sub(sub(k_weight0, i3), i3); \
        store(output + 0 * stride, add(w2, w4)); \
        store(output + 1 * stride, add(w3, w5)); \
        store(output + 2 * stride, sub(w2, w4)); \
        store(output + 3 * stride, sub(w3, w5)); \
    }
440 | | |
/*!\brief Generates svt_aom_ifft1d_8_<suffix>, an 8-point inverse transform.
 *
 * The generated function reads eight values spaced `stride` elements apart
 * from `input`, interpreted in the packed order
 * [r_0, ..., r_4, i_1, i_2, i_3], and writes eight real results to `output`
 * with the same spacing. No scaling is applied, so a round trip through the
 * matching forward transform multiplies the samples by 8.
 *
 * \param ret       Return type (and any specifiers) of the generated function
 * \param suffix    Token appended to the generated function name
 * \param T         Element type of the input and output buffers
 * \param T_VEC     Type returned by load/constant/add/sub/mul and accepted by
 *                  store
 * \param load      load(ptr): reads a T_VEC starting at ptr
 * \param store     store(ptr, value): writes a T_VEC starting at ptr
 * \param constant  constant(x): produces a T_VEC from a float literal
 * \param add       add(a, b): a + b on T_VEC values
 * \param sub       sub(a, b): a - b on T_VEC values
 * \param mul       mul(a, b): a * b on T_VEC values
 */
#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
    ret svt_aom_ifft1d_8_##suffix(const T* input, T* output, int32_t stride) { \
        const T_VEC k_weight0 = constant(0.0f); \
        /* 0.707107 ~= cos(pi/4) = sin(pi/4) */ \
        const T_VEC k_weight2 = constant(0.707107f); \
        const T_VEC i0 = load(input + 0 * stride); \
        const T_VEC i1 = load(input + 1 * stride); \
        const T_VEC i2 = load(input + 2 * stride); \
        const T_VEC i3 = load(input + 3 * stride); \
        const T_VEC i4 = load(input + 4 * stride); \
        const T_VEC i5 = load(input + 5 * stride); \
        const T_VEC i6 = load(input + 6 * stride); \
        const T_VEC i7 = load(input + 7 * stride); \
        /* Terms from r_0, r_2, r_4 and i_2; only lanes that reach an output are kept. */ \
        const T_VEC w6 = add(i0, i4); \
        const T_VEC w7 = sub(i0, i4); \
        const T_VEC w8 = add(i2, i2); \
        const T_VEC w9 = sub(sub(k_weight0, i6), i6); \
        const T_VEC w10 = add(w6, w8); \
        const T_VEC w11 = sub(w6, w8); \
        const T_VEC w12 = add(w7, w9); \
        const T_VEC w13 = sub(w7, w9); \
        /* Terms from r_1, r_3, i_1 and i_3. */ \
        const T_VEC w14[2] = {add(i1, i3), sub(i7, i5)}; \
        const T_VEC w15[2] = {sub(i1, i3), sub(sub(k_weight0, i5), i7)}; \
        const T_VEC w16[2] = {add(i3, i1), sub(i5, i7)}; \
        const T_VEC w17[2] = {sub(i3, i1), sub(sub(k_weight0, i7), i5)}; \
        const T_VEC w18 = add(w14[0], w16[0]); \
        const T_VEC w19 = sub(w14[1], w16[1]); \
        const T_VEC w20[2] = {add(w15[0], w17[1]), sub(w15[1], w17[0])}; \
        const T_VEC w21[2] = {sub(w15[0], w17[1]), add(w15[1], w17[0])}; \
        store(output + 0 * stride, add(w10, w18)); \
        store(output + 1 * stride, add(w12, mul(k_weight2, add(w20[0], w20[1])))); \
        store(output + 2 * stride, add(w11, w19)); \
        store(output + 3 * stride, sub(w13, mul(k_weight2, sub(w21[0], w21[1])))); \
        store(output + 4 * stride, sub(w10, w18)); \
        store(output + 5 * stride, add(w12, sub(sub(k_weight0, mul(k_weight2, w20[0])), mul(k_weight2, w20[1])))); \
        store(output + 6 * stride, sub(w11, w19)); \
        store(output + 7 * stride, add(w13, mul(k_weight2, sub(w21[0], w21[1])))); \
    }
478 | | |
479 | | #define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \ |
480 | 0 | ret svt_aom_ifft1d_16_##suffix(const T* input, T* output, int32_t stride) { \ |
481 | 0 | const T_VEC k_weight0 = constant(0.0f); \ |
482 | 0 | const T_VEC k_weight2 = constant(0.707107f); \ |
483 | 0 | const T_VEC k_weight3 = constant(0.92388f); \ |
484 | 0 | const T_VEC k_weight4 = constant(0.382683f); \ |
485 | 0 | const T_VEC i0 = load(input + 0 * stride); \ |
486 | 0 | const T_VEC i1 = load(input + 1 * stride); \ |
487 | 0 | const T_VEC i2 = load(input + 2 * stride); \ |
488 | 0 | const T_VEC i3 = load(input + 3 * stride); \ |
489 | 0 | const T_VEC i4 = load(input + 4 * stride); \ |
490 | 0 | const T_VEC i5 = load(input + 5 * stride); \ |
491 | 0 | const T_VEC i6 = load(input + 6 * stride); \ |
492 | 0 | const T_VEC i7 = load(input + 7 * stride); \ |
493 | 0 | const T_VEC i8 = load(input + 8 * stride); \ |
494 | 0 | const T_VEC i9 = load(input + 9 * stride); \ |
495 | 0 | const T_VEC i10 = load(input + 10 * stride); \ |
496 | 0 | const T_VEC i11 = load(input + 11 * stride); \ |
497 | 0 | const T_VEC i12 = load(input + 12 * stride); \ |
498 | 0 | const T_VEC i13 = load(input + 13 * stride); \ |
499 | 0 | const T_VEC i14 = load(input + 14 * stride); \ |
500 | 0 | const T_VEC i15 = load(input + 15 * stride); \ |
501 | 0 | const T_VEC w14 = add(i0, i8); \ |
502 | 0 | const T_VEC w15 = sub(i0, i8); \ |
503 | 0 | const T_VEC w16[2] = {add(i4, i4), sub(i12, i12)}; \ |
504 | 0 | const T_VEC w17[2] = {sub(i4, i4), sub(sub(k_weight0, i12), i12)}; \ |
505 | 0 | const T_VEC w18[2] = {add(w14, w16[0]), w16[1]}; \ |
506 | 0 | const T_VEC w19[2] = {sub(w14, w16[0]), sub(k_weight0, w16[1])}; \ |
507 | 0 | const T_VEC w20[2] = {add(w15, w17[1]), sub(k_weight0, w17[0])}; \ |
508 | 0 | const T_VEC w21[2] = {sub(w15, w17[1]), w17[0]}; \ |
509 | 0 | const T_VEC w22[2] = {add(i2, i6), sub(i14, i10)}; \ |
510 | 0 | const T_VEC w23[2] = {sub(i2, i6), sub(sub(k_weight0, i10), i14)}; \ |
511 | 0 | const T_VEC w24[2] = {add(i6, i2), sub(i10, i14)}; \ |
512 | 0 | const T_VEC w25[2] = {sub(i6, i2), sub(sub(k_weight0, i14), i10)}; \ |
513 | 0 | const T_VEC w26[2] = {add(w22[0], w24[0]), add(w22[1], w24[1])}; \ |
514 | 0 | const T_VEC w27[2] = {sub(w22[0], w24[0]), sub(w22[1], w24[1])}; \ |
515 | 0 | const T_VEC w28[2] = {add(w23[0], w25[1]), sub(w23[1], w25[0])}; \ |
516 | 0 | const T_VEC w29[2] = {sub(w23[0], w25[1]), add(w23[1], w25[0])}; \ |
517 | 0 | const T_VEC w30[2] = {add(w18[0], w26[0]), add(w18[1], w26[1])}; \ |
518 | 0 | const T_VEC w31[2] = {sub(w18[0], w26[0]), sub(w18[1], w26[1])}; \ |
519 | 0 | const T_VEC w32[2] = {add(w20[0], mul(k_weight2, add(w28[0], w28[1]))), \ |
520 | 0 | add(w20[1], mul(k_weight2, sub(w28[1], w28[0])))}; \ |
521 | 0 | const T_VEC w33[2] = {add(w20[0], sub(sub(k_weight0, mul(k_weight2, w28[0])), mul(k_weight2, w28[1]))), \ |
522 | 0 | add(w20[1], mul(k_weight2, sub(w28[0], w28[1])))}; \ |
523 | 0 | const T_VEC w34[2] = {add(w19[0], w27[1]), sub(w19[1], w27[0])}; \ |
524 | 0 | const T_VEC w35[2] = {sub(w19[0], w27[1]), add(w19[1], w27[0])}; \ |
525 | 0 | const T_VEC w36[2] = {sub(w21[0], mul(k_weight2, sub(w29[0], w29[1]))), \ |
526 | 0 | sub(w21[1], mul(k_weight2, add(w29[1], w29[0])))}; \ |
527 | 0 | const T_VEC w37[2] = {add(w21[0], mul(k_weight2, sub(w29[0], w29[1]))), \ |
528 | 0 | add(w21[1], mul(k_weight2, add(w29[1], w29[0])))}; \ |
529 | 0 | const T_VEC w38[2] = {add(i1, i7), sub(i15, i9)}; \ |
530 | 0 | const T_VEC w39[2] = {sub(i1, i7), sub(sub(k_weight0, i9), i15)}; \ |
531 | 0 | const T_VEC w40[2] = {add(i5, i3), sub(i11, i13)}; \ |
532 | 0 | const T_VEC w41[2] = {sub(i5, i3), sub(sub(k_weight0, i13), i11)}; \ |
533 | 0 | const T_VEC w42[2] = {add(w38[0], w40[0]), add(w38[1], w40[1])}; \ |
534 | 0 | const T_VEC w43[2] = {sub(w38[0], w40[0]), sub(w38[1], w40[1])}; \ |
535 | 0 | const T_VEC w44[2] = {add(w39[0], w41[1]), sub(w39[1], w41[0])}; \ |
536 | 0 | const T_VEC w45[2] = {sub(w39[0], w41[1]), add(w39[1], w41[0])}; \ |
537 | 0 | const T_VEC w46[2] = {add(i3, i5), sub(i13, i11)}; \ |
538 | 0 | const T_VEC w47[2] = {sub(i3, i5), sub(sub(k_weight0, i11), i13)}; \ |
539 | 0 | const T_VEC w48[2] = {add(i7, i1), sub(i9, i15)}; \ |
540 | 0 | const T_VEC w49[2] = {sub(i7, i1), sub(sub(k_weight0, i15), i9)}; \ |
541 | 0 | const T_VEC w50[2] = {add(w46[0], w48[0]), add(w46[1], w48[1])}; \ |
542 | 0 | const T_VEC w51[2] = {sub(w46[0], w48[0]), sub(w46[1], w48[1])}; \ |
543 | 0 | const T_VEC w52[2] = {add(w47[0], w49[1]), sub(w47[1], w49[0])}; \ |
544 | 0 | const T_VEC w53[2] = {sub(w47[0], w49[1]), add(w47[1], w49[0])}; \ |
545 | 0 | const T_VEC w54[2] = {add(w42[0], w50[0]), add(w42[1], w50[1])}; \ |
546 | 0 | const T_VEC w55[2] = {sub(w42[0], w50[0]), sub(w42[1], w50[1])}; \ |
547 | 0 | const T_VEC w56[2] = {add(w44[0], mul(k_weight2, add(w52[0], w52[1]))), \ |
548 | 0 | add(w44[1], mul(k_weight2, sub(w52[1], w52[0])))}; \ |
549 | 0 | const T_VEC w57[2] = {add(w44[0], sub(sub(k_weight0, mul(k_weight2, w52[0])), mul(k_weight2, w52[1]))), \ |
550 | 0 | add(w44[1], mul(k_weight2, sub(w52[0], w52[1])))}; \ |
551 | 0 | const T_VEC w58[2] = {add(w43[0], w51[1]), sub(w43[1], w51[0])}; \ |
552 | 0 | const T_VEC w59[2] = {sub(w43[0], w51[1]), add(w43[1], w51[0])}; \ |
553 | 0 | const T_VEC w60[2] = {sub(w45[0], mul(k_weight2, sub(w53[0], w53[1]))), \ |
554 | 0 | sub(w45[1], mul(k_weight2, add(w53[1], w53[0])))}; \ |
555 | 0 | const T_VEC w61[2] = {add(w45[0], mul(k_weight2, sub(w53[0], w53[1]))), \ |
556 | 0 | add(w45[1], mul(k_weight2, add(w53[1], w53[0])))}; \ |
557 | 0 | store(output + 0 * stride, add(w30[0], w54[0])); \ |
558 | 0 | store(output + 1 * stride, add(w32[0], add(mul(k_weight3, w56[0]), mul(k_weight4, w56[1])))); \ |
559 | 0 | store(output + 2 * stride, add(w34[0], mul(k_weight2, add(w58[0], w58[1])))); \ |
560 | 0 | store(output + 3 * stride, add(w36[0], add(mul(k_weight4, w60[0]), mul(k_weight3, w60[1])))); \ |
561 | 0 | store(output + 4 * stride, add(w31[0], w55[1])); \ |
562 | 0 | store(output + 5 * stride, sub(w33[0], sub(mul(k_weight4, w57[0]), mul(k_weight3, w57[1])))); \ |
563 | 0 | store(output + 6 * stride, sub(w35[0], mul(k_weight2, sub(w59[0], w59[1])))); \ |
564 | 0 | store(output + 7 * stride, sub(w37[0], sub(mul(k_weight3, w61[0]), mul(k_weight4, w61[1])))); \ |
565 | 0 | store(output + 8 * stride, sub(w30[0], w54[0])); \ |
566 | 0 | store(output + 9 * stride, add(w32[0], sub(sub(k_weight0, mul(k_weight3, w56[0])), mul(k_weight4, w56[1])))); \ |
567 | 0 | store(output + 10 * stride, add(w34[0], sub(sub(k_weight0, mul(k_weight2, w58[0])), mul(k_weight2, w58[1])))); \ |
568 | 0 | store(output + 11 * stride, add(w36[0], sub(sub(k_weight0, mul(k_weight4, w60[0])), mul(k_weight3, w60[1])))); \ |
569 | 0 | store(output + 12 * stride, sub(w31[0], w55[1])); \ |
570 | 0 | store(output + 13 * stride, add(w33[0], sub(mul(k_weight4, w57[0]), mul(k_weight3, w57[1])))); \ |
571 | 0 | store(output + 14 * stride, add(w35[0], mul(k_weight2, sub(w59[0], w59[1])))); \ |
572 | 0 | store(output + 15 * stride, add(w37[0], sub(mul(k_weight3, w61[0]), mul(k_weight4, w61[1])))); \ |
573 | 0 | } |
/*
 * GEN_IFFT_32: expands to the definition of svt_aom_ifft1d_32_<suffix>(), a fully
 * unrolled 32-point 1d inverse FFT built only from the supplied primitives.
 *
 * Macro parameters:
 *   ret         - return type written in front of the generated function (the body
 *                 contains no return statement).
 *   suffix      - token pasted onto the function name.
 *   T           - element type of the input/output pointers.
 *   T_VEC       - type of the values produced by load()/constant() and consumed by store().
 *   load        - load(ptr): reads one T_VEC starting at a const T*.
 *   store       - store(ptr, v): writes one T_VEC starting at a T*.
 *   constant    - constant(f): builds a T_VEC from a float literal.
 *   add/sub/mul - binary T_VEC arithmetic. There is no negate primitive, so -x is
 *                 written as sub(k_weight0, x) with k_weight0 == 0.
 *
 * Generated function:
 *   Reads 32 values at input + k * stride and writes 32 values at
 *   output + k * stride, for k = 0..31. Following the packed layout described in the
 *   comment at the top of this file, i0..i16 are used as real parts r_0..r_16 and
 *   i17..i31 as imaginary parts i_1..i_15; every stored value is real.
 *   No 1/32 factor appears in the body, so any normalization is presumably left to
 *   the caller -- TODO confirm.
 *
 * Twiddle constants (numerically):
 *   k_weight2 ~= cos(pi/4),  k_weight3 ~= cos(pi/8),    k_weight4 ~= sin(pi/8),
 *   k_weight5 ~= cos(pi/16), k_weight6 ~= sin(pi/16),
 *   k_weight7 ~= cos(3pi/16), k_weight8 ~= sin(3pi/16).
 *
 * NOTE(review): the statement list looks machine-generated (numbered temporaries
 * w30..w157, expressions such as sub(i8, i8) that evaluate to zero for finite
 * inputs). It is kept verbatim because the floating-point result depends on the
 * exact operation order.
 */
574 | | #define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \ |
575 | 0 | ret svt_aom_ifft1d_32_##suffix(const T* input, T* output, int32_t stride) { \ |
576 | 0 | const T_VEC k_weight0 = constant(0.0f); \ |
577 | 0 | const T_VEC k_weight2 = constant(0.707107f); \ |
578 | 0 | const T_VEC k_weight3 = constant(0.92388f); \ |
579 | 0 | const T_VEC k_weight4 = constant(0.382683f); \ |
580 | 0 | const T_VEC k_weight5 = constant(0.980785f); \ |
581 | 0 | const T_VEC k_weight6 = constant(0.19509f); \ |
582 | 0 | const T_VEC k_weight7 = constant(0.83147f); \ |
583 | 0 | const T_VEC k_weight8 = constant(0.55557f); \ |
584 | 0 | const T_VEC i0 = load(input + 0 * stride); \ |
585 | 0 | const T_VEC i1 = load(input + 1 * stride); \ |
586 | 0 | const T_VEC i2 = load(input + 2 * stride); \ |
587 | 0 | const T_VEC i3 = load(input + 3 * stride); \ |
588 | 0 | const T_VEC i4 = load(input + 4 * stride); \ |
589 | 0 | const T_VEC i5 = load(input + 5 * stride); \ |
590 | 0 | const T_VEC i6 = load(input + 6 * stride); \ |
591 | 0 | const T_VEC i7 = load(input + 7 * stride); \ |
592 | 0 | const T_VEC i8 = load(input + 8 * stride); \ |
593 | 0 | const T_VEC i9 = load(input + 9 * stride); \ |
594 | 0 | const T_VEC i10 = load(input + 10 * stride); \ |
595 | 0 | const T_VEC i11 = load(input + 11 * stride); \ |
596 | 0 | const T_VEC i12 = load(input + 12 * stride); \ |
597 | 0 | const T_VEC i13 = load(input + 13 * stride); \ |
598 | 0 | const T_VEC i14 = load(input + 14 * stride); \ |
599 | 0 | const T_VEC i15 = load(input + 15 * stride); \ |
600 | 0 | const T_VEC i16 = load(input + 16 * stride); \ |
601 | 0 | const T_VEC i17 = load(input + 17 * stride); \ |
602 | 0 | const T_VEC i18 = load(input + 18 * stride); \ |
603 | 0 | const T_VEC i19 = load(input + 19 * stride); \ |
604 | 0 | const T_VEC i20 = load(input + 20 * stride); \ |
605 | 0 | const T_VEC i21 = load(input + 21 * stride); \ |
606 | 0 | const T_VEC i22 = load(input + 22 * stride); \ |
607 | 0 | const T_VEC i23 = load(input + 23 * stride); \ |
608 | 0 | const T_VEC i24 = load(input + 24 * stride); \ |
609 | 0 | const T_VEC i25 = load(input + 25 * stride); \ |
610 | 0 | const T_VEC i26 = load(input + 26 * stride); \ |
611 | 0 | const T_VEC i27 = load(input + 27 * stride); \ |
612 | 0 | const T_VEC i28 = load(input + 28 * stride); \ |
613 | 0 | const T_VEC i29 = load(input + 29 * stride); \ |
614 | 0 | const T_VEC i30 = load(input + 30 * stride); \ |
615 | 0 | const T_VEC i31 = load(input + 31 * stride); \ |
616 | 0 | const T_VEC w30 = add(i0, i16); \ |
617 | 0 | const T_VEC w31 = sub(i0, i16); \ |
618 | 0 | const T_VEC w32[2] = {add(i8, i8), sub(i24, i24)}; \ |
619 | 0 | const T_VEC w33[2] = {sub(i8, i8), sub(sub(k_weight0, i24), i24)}; \ |
620 | 0 | const T_VEC w34[2] = {add(w30, w32[0]), w32[1]}; \ |
621 | 0 | const T_VEC w35[2] = {sub(w30, w32[0]), sub(k_weight0, w32[1])}; \ |
622 | 0 | const T_VEC w36[2] = {add(w31, w33[1]), sub(k_weight0, w33[0])}; \ |
623 | 0 | const T_VEC w37[2] = {sub(w31, w33[1]), w33[0]}; \ |
624 | 0 | const T_VEC w38[2] = {add(i4, i12), sub(i28, i20)}; \ |
625 | 0 | const T_VEC w39[2] = {sub(i4, i12), sub(sub(k_weight0, i20), i28)}; \ |
626 | 0 | const T_VEC w40[2] = {add(i12, i4), sub(i20, i28)}; \ |
627 | 0 | const T_VEC w41[2] = {sub(i12, i4), sub(sub(k_weight0, i28), i20)}; \ |
628 | 0 | const T_VEC w42[2] = {add(w38[0], w40[0]), add(w38[1], w40[1])}; \ |
629 | 0 | const T_VEC w43[2] = {sub(w38[0], w40[0]), sub(w38[1], w40[1])}; \ |
630 | 0 | const T_VEC w44[2] = {add(w39[0], w41[1]), sub(w39[1], w41[0])}; \ |
631 | 0 | const T_VEC w45[2] = {sub(w39[0], w41[1]), add(w39[1], w41[0])}; \ |
632 | 0 | const T_VEC w46[2] = {add(w34[0], w42[0]), add(w34[1], w42[1])}; \ |
633 | 0 | const T_VEC w47[2] = {sub(w34[0], w42[0]), sub(w34[1], w42[1])}; \ |
634 | 0 | const T_VEC w48[2] = {add(w36[0], mul(k_weight2, add(w44[0], w44[1]))), \ |
635 | 0 | add(w36[1], mul(k_weight2, sub(w44[1], w44[0])))}; \ |
636 | 0 | const T_VEC w49[2] = {add(w36[0], sub(sub(k_weight0, mul(k_weight2, w44[0])), mul(k_weight2, w44[1]))), \ |
637 | 0 | add(w36[1], mul(k_weight2, sub(w44[0], w44[1])))}; \ |
638 | 0 | const T_VEC w50[2] = {add(w35[0], w43[1]), sub(w35[1], w43[0])}; \ |
639 | 0 | const T_VEC w51[2] = {sub(w35[0], w43[1]), add(w35[1], w43[0])}; \ |
640 | 0 | const T_VEC w52[2] = {sub(w37[0], mul(k_weight2, sub(w45[0], w45[1]))), \ |
641 | 0 | sub(w37[1], mul(k_weight2, add(w45[1], w45[0])))}; \ |
642 | 0 | const T_VEC w53[2] = {add(w37[0], mul(k_weight2, sub(w45[0], w45[1]))), \ |
643 | 0 | add(w37[1], mul(k_weight2, add(w45[1], w45[0])))}; \ |
644 | 0 | const T_VEC w54[2] = {add(i2, i14), sub(i30, i18)}; \ |
645 | 0 | const T_VEC w55[2] = {sub(i2, i14), sub(sub(k_weight0, i18), i30)}; \ |
646 | 0 | const T_VEC w56[2] = {add(i10, i6), sub(i22, i26)}; \ |
647 | 0 | const T_VEC w57[2] = {sub(i10, i6), sub(sub(k_weight0, i26), i22)}; \ |
648 | 0 | const T_VEC w58[2] = {add(w54[0], w56[0]), add(w54[1], w56[1])}; \ |
649 | 0 | const T_VEC w59[2] = {sub(w54[0], w56[0]), sub(w54[1], w56[1])}; \ |
650 | 0 | const T_VEC w60[2] = {add(w55[0], w57[1]), sub(w55[1], w57[0])}; \ |
651 | 0 | const T_VEC w61[2] = {sub(w55[0], w57[1]), add(w55[1], w57[0])}; \ |
652 | 0 | const T_VEC w62[2] = {add(i6, i10), sub(i26, i22)}; \ |
653 | 0 | const T_VEC w63[2] = {sub(i6, i10), sub(sub(k_weight0, i22), i26)}; \ |
654 | 0 | const T_VEC w64[2] = {add(i14, i2), sub(i18, i30)}; \ |
655 | 0 | const T_VEC w65[2] = {sub(i14, i2), sub(sub(k_weight0, i30), i18)}; \ |
656 | 0 | const T_VEC w66[2] = {add(w62[0], w64[0]), add(w62[1], w64[1])}; \ |
657 | 0 | const T_VEC w67[2] = {sub(w62[0], w64[0]), sub(w62[1], w64[1])}; \ |
658 | 0 | const T_VEC w68[2] = {add(w63[0], w65[1]), sub(w63[1], w65[0])}; \ |
659 | 0 | const T_VEC w69[2] = {sub(w63[0], w65[1]), add(w63[1], w65[0])}; \ |
660 | 0 | const T_VEC w70[2] = {add(w58[0], w66[0]), add(w58[1], w66[1])}; \ |
661 | 0 | const T_VEC w71[2] = {sub(w58[0], w66[0]), sub(w58[1], w66[1])}; \ |
662 | 0 | const T_VEC w72[2] = {add(w60[0], mul(k_weight2, add(w68[0], w68[1]))), \ |
663 | 0 | add(w60[1], mul(k_weight2, sub(w68[1], w68[0])))}; \ |
664 | 0 | const T_VEC w73[2] = {add(w60[0], sub(sub(k_weight0, mul(k_weight2, w68[0])), mul(k_weight2, w68[1]))), \ |
665 | 0 | add(w60[1], mul(k_weight2, sub(w68[0], w68[1])))}; \ |
666 | 0 | const T_VEC w74[2] = {add(w59[0], w67[1]), sub(w59[1], w67[0])}; \ |
667 | 0 | const T_VEC w75[2] = {sub(w59[0], w67[1]), add(w59[1], w67[0])}; \ |
668 | 0 | const T_VEC w76[2] = {sub(w61[0], mul(k_weight2, sub(w69[0], w69[1]))), \ |
669 | 0 | sub(w61[1], mul(k_weight2, add(w69[1], w69[0])))}; \ |
670 | 0 | const T_VEC w77[2] = {add(w61[0], mul(k_weight2, sub(w69[0], w69[1]))), \ |
671 | 0 | add(w61[1], mul(k_weight2, add(w69[1], w69[0])))}; \ |
672 | 0 | const T_VEC w78[2] = {add(w46[0], w70[0]), add(w46[1], w70[1])}; \ |
673 | 0 | const T_VEC w79[2] = {sub(w46[0], w70[0]), sub(w46[1], w70[1])}; \ |
674 | 0 | const T_VEC w80[2] = {add(w48[0], add(mul(k_weight3, w72[0]), mul(k_weight4, w72[1]))), \ |
675 | 0 | add(w48[1], sub(mul(k_weight3, w72[1]), mul(k_weight4, w72[0])))}; \ |
676 | 0 | const T_VEC w81[2] = {add(w48[0], sub(sub(k_weight0, mul(k_weight3, w72[0])), mul(k_weight4, w72[1]))), \ |
677 | 0 | add(w48[1], sub(mul(k_weight4, w72[0]), mul(k_weight3, w72[1])))}; \ |
678 | 0 | const T_VEC w82[2] = {add(w50[0], mul(k_weight2, add(w74[0], w74[1]))), \ |
679 | 0 | add(w50[1], mul(k_weight2, sub(w74[1], w74[0])))}; \ |
680 | 0 | const T_VEC w83[2] = {add(w50[0], sub(sub(k_weight0, mul(k_weight2, w74[0])), mul(k_weight2, w74[1]))), \ |
681 | 0 | add(w50[1], mul(k_weight2, sub(w74[0], w74[1])))}; \ |
682 | 0 | const T_VEC w84[2] = {add(w52[0], add(mul(k_weight4, w76[0]), mul(k_weight3, w76[1]))), \ |
683 | 0 | add(w52[1], sub(mul(k_weight4, w76[1]), mul(k_weight3, w76[0])))}; \ |
684 | 0 | const T_VEC w85[2] = {add(w52[0], sub(sub(k_weight0, mul(k_weight4, w76[0])), mul(k_weight3, w76[1]))), \ |
685 | 0 | add(w52[1], sub(mul(k_weight3, w76[0]), mul(k_weight4, w76[1])))}; \ |
686 | 0 | const T_VEC w86[2] = {add(w47[0], w71[1]), sub(w47[1], w71[0])}; \ |
687 | 0 | const T_VEC w87[2] = {sub(w47[0], w71[1]), add(w47[1], w71[0])}; \ |
688 | 0 | const T_VEC w88[2] = {sub(w49[0], sub(mul(k_weight4, w73[0]), mul(k_weight3, w73[1]))), \ |
689 | 0 | add(w49[1], sub(sub(k_weight0, mul(k_weight4, w73[1])), mul(k_weight3, w73[0])))}; \ |
690 | 0 | const T_VEC w89[2] = {add(w49[0], sub(mul(k_weight4, w73[0]), mul(k_weight3, w73[1]))), \ |
691 | 0 | add(w49[1], add(mul(k_weight4, w73[1]), mul(k_weight3, w73[0])))}; \ |
692 | 0 | const T_VEC w90[2] = {sub(w51[0], mul(k_weight2, sub(w75[0], w75[1]))), \ |
693 | 0 | sub(w51[1], mul(k_weight2, add(w75[1], w75[0])))}; \ |
694 | 0 | const T_VEC w91[2] = {add(w51[0], mul(k_weight2, sub(w75[0], w75[1]))), \ |
695 | 0 | add(w51[1], mul(k_weight2, add(w75[1], w75[0])))}; \ |
696 | 0 | const T_VEC w92[2] = {sub(w53[0], sub(mul(k_weight3, w77[0]), mul(k_weight4, w77[1]))), \ |
697 | 0 | add(w53[1], sub(sub(k_weight0, mul(k_weight3, w77[1])), mul(k_weight4, w77[0])))}; \ |
698 | 0 | const T_VEC w93[2] = {add(w53[0], sub(mul(k_weight3, w77[0]), mul(k_weight4, w77[1]))), \ |
699 | 0 | add(w53[1], add(mul(k_weight3, w77[1]), mul(k_weight4, w77[0])))}; \ |
700 | 0 | const T_VEC w94[2] = {add(i1, i15), sub(i31, i17)}; \ |
701 | 0 | const T_VEC w95[2] = {sub(i1, i15), sub(sub(k_weight0, i17), i31)}; \ |
702 | 0 | const T_VEC w96[2] = {add(i9, i7), sub(i23, i25)}; \ |
703 | 0 | const T_VEC w97[2] = {sub(i9, i7), sub(sub(k_weight0, i25), i23)}; \ |
704 | 0 | const T_VEC w98[2] = {add(w94[0], w96[0]), add(w94[1], w96[1])}; \ |
705 | 0 | const T_VEC w99[2] = {sub(w94[0], w96[0]), sub(w94[1], w96[1])}; \ |
706 | 0 | const T_VEC w100[2] = {add(w95[0], w97[1]), sub(w95[1], w97[0])}; \ |
707 | 0 | const T_VEC w101[2] = {sub(w95[0], w97[1]), add(w95[1], w97[0])}; \ |
708 | 0 | const T_VEC w102[2] = {add(i5, i11), sub(i27, i21)}; \ |
709 | 0 | const T_VEC w103[2] = {sub(i5, i11), sub(sub(k_weight0, i21), i27)}; \ |
710 | 0 | const T_VEC w104[2] = {add(i13, i3), sub(i19, i29)}; \ |
711 | 0 | const T_VEC w105[2] = {sub(i13, i3), sub(sub(k_weight0, i29), i19)}; \ |
712 | 0 | const T_VEC w106[2] = {add(w102[0], w104[0]), add(w102[1], w104[1])}; \ |
713 | 0 | const T_VEC w107[2] = {sub(w102[0], w104[0]), sub(w102[1], w104[1])}; \ |
714 | 0 | const T_VEC w108[2] = {add(w103[0], w105[1]), sub(w103[1], w105[0])}; \ |
715 | 0 | const T_VEC w109[2] = {sub(w103[0], w105[1]), add(w103[1], w105[0])}; \ |
716 | 0 | const T_VEC w110[2] = {add(w98[0], w106[0]), add(w98[1], w106[1])}; \ |
717 | 0 | const T_VEC w111[2] = {sub(w98[0], w106[0]), sub(w98[1], w106[1])}; \ |
718 | 0 | const T_VEC w112[2] = {add(w100[0], mul(k_weight2, add(w108[0], w108[1]))), \ |
719 | 0 | add(w100[1], mul(k_weight2, sub(w108[1], w108[0])))}; \ |
720 | 0 | const T_VEC w113[2] = {add(w100[0], sub(sub(k_weight0, mul(k_weight2, w108[0])), mul(k_weight2, w108[1]))), \ |
721 | 0 | add(w100[1], mul(k_weight2, sub(w108[0], w108[1])))}; \ |
722 | 0 | const T_VEC w114[2] = {add(w99[0], w107[1]), sub(w99[1], w107[0])}; \ |
723 | 0 | const T_VEC w115[2] = {sub(w99[0], w107[1]), add(w99[1], w107[0])}; \ |
724 | 0 | const T_VEC w116[2] = {sub(w101[0], mul(k_weight2, sub(w109[0], w109[1]))), \ |
725 | 0 | sub(w101[1], mul(k_weight2, add(w109[1], w109[0])))}; \ |
726 | 0 | const T_VEC w117[2] = {add(w101[0], mul(k_weight2, sub(w109[0], w109[1]))), \ |
727 | 0 | add(w101[1], mul(k_weight2, add(w109[1], w109[0])))}; \ |
728 | 0 | const T_VEC w118[2] = {add(i3, i13), sub(i29, i19)}; \ |
729 | 0 | const T_VEC w119[2] = {sub(i3, i13), sub(sub(k_weight0, i19), i29)}; \ |
730 | 0 | const T_VEC w120[2] = {add(i11, i5), sub(i21, i27)}; \ |
731 | 0 | const T_VEC w121[2] = {sub(i11, i5), sub(sub(k_weight0, i27), i21)}; \ |
732 | 0 | const T_VEC w122[2] = {add(w118[0], w120[0]), add(w118[1], w120[1])}; \ |
733 | 0 | const T_VEC w123[2] = {sub(w118[0], w120[0]), sub(w118[1], w120[1])}; \ |
734 | 0 | const T_VEC w124[2] = {add(w119[0], w121[1]), sub(w119[1], w121[0])}; \ |
735 | 0 | const T_VEC w125[2] = {sub(w119[0], w121[1]), add(w119[1], w121[0])}; \ |
736 | 0 | const T_VEC w126[2] = {add(i7, i9), sub(i25, i23)}; \ |
737 | 0 | const T_VEC w127[2] = {sub(i7, i9), sub(sub(k_weight0, i23), i25)}; \ |
738 | 0 | const T_VEC w128[2] = {add(i15, i1), sub(i17, i31)}; \ |
739 | 0 | const T_VEC w129[2] = {sub(i15, i1), sub(sub(k_weight0, i31), i17)}; \ |
740 | 0 | const T_VEC w130[2] = {add(w126[0], w128[0]), add(w126[1], w128[1])}; \ |
741 | 0 | const T_VEC w131[2] = {sub(w126[0], w128[0]), sub(w126[1], w128[1])}; \ |
742 | 0 | const T_VEC w132[2] = {add(w127[0], w129[1]), sub(w127[1], w129[0])}; \ |
743 | 0 | const T_VEC w133[2] = {sub(w127[0], w129[1]), add(w127[1], w129[0])}; \ |
744 | 0 | const T_VEC w134[2] = {add(w122[0], w130[0]), add(w122[1], w130[1])}; \ |
745 | 0 | const T_VEC w135[2] = {sub(w122[0], w130[0]), sub(w122[1], w130[1])}; \ |
746 | 0 | const T_VEC w136[2] = {add(w124[0], mul(k_weight2, add(w132[0], w132[1]))), \ |
747 | 0 | add(w124[1], mul(k_weight2, sub(w132[1], w132[0])))}; \ |
748 | 0 | const T_VEC w137[2] = {add(w124[0], sub(sub(k_weight0, mul(k_weight2, w132[0])), mul(k_weight2, w132[1]))), \ |
749 | 0 | add(w124[1], mul(k_weight2, sub(w132[0], w132[1])))}; \ |
750 | 0 | const T_VEC w138[2] = {add(w123[0], w131[1]), sub(w123[1], w131[0])}; \ |
751 | 0 | const T_VEC w139[2] = {sub(w123[0], w131[1]), add(w123[1], w131[0])}; \ |
752 | 0 | const T_VEC w140[2] = {sub(w125[0], mul(k_weight2, sub(w133[0], w133[1]))), \ |
753 | 0 | sub(w125[1], mul(k_weight2, add(w133[1], w133[0])))}; \ |
754 | 0 | const T_VEC w141[2] = {add(w125[0], mul(k_weight2, sub(w133[0], w133[1]))), \ |
755 | 0 | add(w125[1], mul(k_weight2, add(w133[1], w133[0])))}; \ |
756 | 0 | const T_VEC w142[2] = {add(w110[0], w134[0]), add(w110[1], w134[1])}; \ |
757 | 0 | const T_VEC w143[2] = {sub(w110[0], w134[0]), sub(w110[1], w134[1])}; \ |
758 | 0 | const T_VEC w144[2] = {add(w112[0], add(mul(k_weight3, w136[0]), mul(k_weight4, w136[1]))), \ |
759 | 0 | add(w112[1], sub(mul(k_weight3, w136[1]), mul(k_weight4, w136[0])))}; \ |
760 | 0 | const T_VEC w145[2] = {add(w112[0], sub(sub(k_weight0, mul(k_weight3, w136[0])), mul(k_weight4, w136[1]))), \ |
761 | 0 | add(w112[1], sub(mul(k_weight4, w136[0]), mul(k_weight3, w136[1])))}; \ |
762 | 0 | const T_VEC w146[2] = {add(w114[0], mul(k_weight2, add(w138[0], w138[1]))), \ |
763 | 0 | add(w114[1], mul(k_weight2, sub(w138[1], w138[0])))}; \ |
764 | 0 | const T_VEC w147[2] = {add(w114[0], sub(sub(k_weight0, mul(k_weight2, w138[0])), mul(k_weight2, w138[1]))), \ |
765 | 0 | add(w114[1], mul(k_weight2, sub(w138[0], w138[1])))}; \ |
766 | 0 | const T_VEC w148[2] = {add(w116[0], add(mul(k_weight4, w140[0]), mul(k_weight3, w140[1]))), \ |
767 | 0 | add(w116[1], sub(mul(k_weight4, w140[1]), mul(k_weight3, w140[0])))}; \ |
768 | 0 | const T_VEC w149[2] = {add(w116[0], sub(sub(k_weight0, mul(k_weight4, w140[0])), mul(k_weight3, w140[1]))), \ |
769 | 0 | add(w116[1], sub(mul(k_weight3, w140[0]), mul(k_weight4, w140[1])))}; \ |
770 | 0 | const T_VEC w150[2] = {add(w111[0], w135[1]), sub(w111[1], w135[0])}; \ |
771 | 0 | const T_VEC w151[2] = {sub(w111[0], w135[1]), add(w111[1], w135[0])}; \ |
772 | 0 | const T_VEC w152[2] = {sub(w113[0], sub(mul(k_weight4, w137[0]), mul(k_weight3, w137[1]))), \ |
773 | 0 | add(w113[1], sub(sub(k_weight0, mul(k_weight4, w137[1])), mul(k_weight3, w137[0])))}; \ |
774 | 0 | const T_VEC w153[2] = {add(w113[0], sub(mul(k_weight4, w137[0]), mul(k_weight3, w137[1]))), \ |
775 | 0 | add(w113[1], add(mul(k_weight4, w137[1]), mul(k_weight3, w137[0])))}; \ |
776 | 0 | const T_VEC w154[2] = {sub(w115[0], mul(k_weight2, sub(w139[0], w139[1]))), \ |
777 | 0 | sub(w115[1], mul(k_weight2, add(w139[1], w139[0])))}; \ |
778 | 0 | const T_VEC w155[2] = {add(w115[0], mul(k_weight2, sub(w139[0], w139[1]))), \ |
779 | 0 | add(w115[1], mul(k_weight2, add(w139[1], w139[0])))}; \ |
780 | 0 | const T_VEC w156[2] = {sub(w117[0], sub(mul(k_weight3, w141[0]), mul(k_weight4, w141[1]))), \ |
781 | 0 | add(w117[1], sub(sub(k_weight0, mul(k_weight3, w141[1])), mul(k_weight4, w141[0])))}; \ |
782 | 0 | const T_VEC w157[2] = {add(w117[0], sub(mul(k_weight3, w141[0]), mul(k_weight4, w141[1]))), \ |
783 | 0 | add(w117[1], add(mul(k_weight3, w141[1]), mul(k_weight4, w141[0])))}; \ |
784 | 0 | store(output + 0 * stride, add(w78[0], w142[0])); \ |
785 | 0 | store(output + 1 * stride, add(w80[0], add(mul(k_weight5, w144[0]), mul(k_weight6, w144[1])))); \ |
786 | 0 | store(output + 2 * stride, add(w82[0], add(mul(k_weight3, w146[0]), mul(k_weight4, w146[1])))); \ |
787 | 0 | store(output + 3 * stride, add(w84[0], add(mul(k_weight7, w148[0]), mul(k_weight8, w148[1])))); \ |
788 | 0 | store(output + 4 * stride, add(w86[0], mul(k_weight2, add(w150[0], w150[1])))); \ |
789 | 0 | store(output + 5 * stride, add(w88[0], add(mul(k_weight8, w152[0]), mul(k_weight7, w152[1])))); \ |
790 | 0 | store(output + 6 * stride, add(w90[0], add(mul(k_weight4, w154[0]), mul(k_weight3, w154[1])))); \ |
791 | 0 | store(output + 7 * stride, add(w92[0], add(mul(k_weight6, w156[0]), mul(k_weight5, w156[1])))); \ |
792 | 0 | store(output + 8 * stride, add(w79[0], w143[1])); \ |
793 | 0 | store(output + 9 * stride, sub(w81[0], sub(mul(k_weight6, w145[0]), mul(k_weight5, w145[1])))); \ |
794 | 0 | store(output + 10 * stride, sub(w83[0], sub(mul(k_weight4, w147[0]), mul(k_weight3, w147[1])))); \ |
795 | 0 | store(output + 11 * stride, sub(w85[0], sub(mul(k_weight8, w149[0]), mul(k_weight7, w149[1])))); \ |
796 | 0 | store(output + 12 * stride, sub(w87[0], mul(k_weight2, sub(w151[0], w151[1])))); \ |
797 | 0 | store(output + 13 * stride, sub(w89[0], sub(mul(k_weight7, w153[0]), mul(k_weight8, w153[1])))); \ |
798 | 0 | store(output + 14 * stride, sub(w91[0], sub(mul(k_weight3, w155[0]), mul(k_weight4, w155[1])))); \ |
799 | 0 | store(output + 15 * stride, sub(w93[0], sub(mul(k_weight5, w157[0]), mul(k_weight6, w157[1])))); \ |
800 | 0 | store(output + 16 * stride, sub(w78[0], w142[0])); \ |
801 | 0 | store(output + 17 * stride, \ |
802 | 0 | add(w80[0], sub(sub(k_weight0, mul(k_weight5, w144[0])), mul(k_weight6, w144[1])))); \ |
803 | 0 | store(output + 18 * stride, \ |
804 | 0 | add(w82[0], sub(sub(k_weight0, mul(k_weight3, w146[0])), mul(k_weight4, w146[1])))); \ |
805 | 0 | store(output + 19 * stride, \ |
806 | 0 | add(w84[0], sub(sub(k_weight0, mul(k_weight7, w148[0])), mul(k_weight8, w148[1])))); \ |
807 | 0 | store(output + 20 * stride, \ |
808 | 0 | add(w86[0], sub(sub(k_weight0, mul(k_weight2, w150[0])), mul(k_weight2, w150[1])))); \ |
809 | 0 | store(output + 21 * stride, \ |
810 | 0 | add(w88[0], sub(sub(k_weight0, mul(k_weight8, w152[0])), mul(k_weight7, w152[1])))); \ |
811 | 0 | store(output + 22 * stride, \ |
812 | 0 | add(w90[0], sub(sub(k_weight0, mul(k_weight4, w154[0])), mul(k_weight3, w154[1])))); \ |
813 | 0 | store(output + 23 * stride, \ |
814 | 0 | add(w92[0], sub(sub(k_weight0, mul(k_weight6, w156[0])), mul(k_weight5, w156[1])))); \ |
815 | 0 | store(output + 24 * stride, sub(w79[0], w143[1])); \ |
816 | 0 | store(output + 25 * stride, add(w81[0], sub(mul(k_weight6, w145[0]), mul(k_weight5, w145[1])))); \ |
817 | 0 | store(output + 26 * stride, add(w83[0], sub(mul(k_weight4, w147[0]), mul(k_weight3, w147[1])))); \ |
818 | 0 | store(output + 27 * stride, add(w85[0], sub(mul(k_weight8, w149[0]), mul(k_weight7, w149[1])))); \ |
819 | 0 | store(output + 28 * stride, add(w87[0], mul(k_weight2, sub(w151[0], w151[1])))); \ |
820 | 0 | store(output + 29 * stride, add(w89[0], sub(mul(k_weight7, w153[0]), mul(k_weight8, w153[1])))); \ |
821 | 0 | store(output + 30 * stride, add(w91[0], sub(mul(k_weight3, w155[0]), mul(k_weight4, w155[1])))); \ |
822 | 0 | store(output + 31 * stride, add(w93[0], sub(mul(k_weight5, w157[0]), mul(k_weight6, w157[1])))); \ |
823 | 0 | }
824 | | |
825 | | #endif // AOM_AOM_DSP_FFT_COMMON_H_ |