/work/svt-av1/Source/Lib/Codec/fft.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license. |
10 | | */ |
11 | | |
12 | | #include "definitions.h" |
13 | | #include "fft_common.h" |
14 | | |
15 | | static INLINE void simple_transpose(const float* A, float* b, int32_t n) { |
16 | | for (int32_t y = 0; y < n; y++) { |
17 | | for (int32_t x = 0; x < n; x++) { |
18 | | b[y * n + x] = A[x * n + y]; |
19 | | } |
20 | | } |
21 | | } |
22 | | |
23 | | // The 1d transform is real to complex and packs the complex results in |
24 | | // a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real |
25 | | // components, followed by the n/2 - 1 imaginary components). After the |
26 | | // transform is done on the rows, the first n/2 + 1 columns are real, and |
27 | | // the remaining are the imaginary components. After the transform on the |
28 | | // columns, the region of [0, n/2]x[0, n/2] contains the real part of |
29 | | // fft of the real columns. The real part of the 2d fft also includes the |
30 | | // imaginary part of transformed imaginary columns. This function assembles |
31 | | // the correct outputs while putting the real and imaginary components |
32 | | // next to each other. |
33 | | static INLINE void unpack_2d_output(const float* col_fft, float* output, int32_t n) { |
34 | | for (int32_t y = 0; y <= n / 2; ++y) { |
35 | | const int32_t y2 = y + n / 2; |
36 | | const int32_t y_extra = y2 > n / 2 && y2 < n; |
37 | | |
38 | | for (int32_t x = 0; x <= n / 2; ++x) { |
39 | | const int32_t x2 = x + n / 2; |
40 | | const int32_t x_extra = x2 > n / 2 && x2 < n; |
41 | | output[2 * (y * n + x)] = col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); |
42 | | output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) + (x_extra ? col_fft[y * n + x2] : 0); |
43 | | if (y_extra) { |
44 | | output[2 * ((n - y) * n + x)] = col_fft[y * n + x] + (x_extra ? col_fft[y2 * n + x2] : 0); |
45 | | output[2 * ((n - y) * n + x) + 1] = -col_fft[y2 * n + x] + (x_extra ? col_fft[y * n + x2] : 0); |
46 | | } |
47 | | } |
48 | | } |
49 | | } |
50 | | |
51 | | void svt_aom_fft_2d_gen(const float* input, float* temp, float* output, int32_t n, AomFft1dFunc tform, |
52 | 0 | AomFftTransposeFunc transpose, AomFftUnpackFunc unpack, int32_t vec_size) { |
53 | 0 | for (int32_t x = 0; x < n; x += vec_size) { |
54 | 0 | tform(input + x, output + x, n); |
55 | 0 | } |
56 | 0 | transpose(output, temp, n); |
57 | |
|
58 | 0 | for (int32_t x = 0; x < n; x += vec_size) { |
59 | 0 | tform(temp + x, output + x, n); |
60 | 0 | } |
61 | 0 | transpose(output, temp, n); |
62 | |
|
63 | 0 | unpack(temp, output, n); |
64 | 0 | } |
65 | | |
66 | | static INLINE void store_float(float* output, float input) { |
67 | | *output = input; |
68 | | } |
69 | | |
70 | | static INLINE float add_float(float a, float b) { |
71 | | return a + b; |
72 | | } |
73 | | |
74 | | static INLINE float sub_float(float a, float b) { |
75 | | return a - b; |
76 | | } |
77 | | |
78 | | static INLINE float mul_float(float a, float b) { |
79 | | return a * b; |
80 | | } |
81 | | |
82 | | GEN_FFT_2(void, float, float, float, *, store_float); |
83 | | GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, sub_float); |
84 | | GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float); |
85 | | GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float); |
86 | | GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float); |
87 | | |
88 | 0 | void svt_aom_fft2x2_float_c(const float* input, float* temp, float* output) { |
89 | 0 | svt_aom_fft_2d_gen(input, temp, output, 2, svt_aom_fft1d_2_float, simple_transpose, unpack_2d_output, 1); |
90 | 0 | } |
91 | | |
92 | 0 | void svt_aom_fft4x4_float_c(const float* input, float* temp, float* output) { |
93 | 0 | svt_aom_fft_2d_gen(input, temp, output, 4, svt_aom_fft1d_4_float, simple_transpose, unpack_2d_output, 1); |
94 | 0 | } |
95 | | |
96 | 0 | void svt_aom_fft8x8_float_c(const float* input, float* temp, float* output) { |
97 | 0 | svt_aom_fft_2d_gen(input, temp, output, 8, svt_aom_fft1d_8_float, simple_transpose, unpack_2d_output, 1); |
98 | 0 | } |
99 | | |
100 | 0 | void svt_aom_fft16x16_float_c(const float* input, float* temp, float* output) { |
101 | 0 | svt_aom_fft_2d_gen(input, temp, output, 16, svt_aom_fft1d_16_float, simple_transpose, unpack_2d_output, 1); |
102 | 0 | } |
103 | | |
104 | 0 | void svt_aom_fft32x32_float_c(const float* input, float* temp, float* output) { |
105 | 0 | svt_aom_fft_2d_gen(input, temp, output, 32, svt_aom_fft1d_32_float, simple_transpose, unpack_2d_output, 1); |
106 | 0 | } |
107 | | |
108 | | void svt_aom_ifft_2d_gen(const float* input, float* temp, float* output, int32_t n, AomFft1dFunc fft_single, |
109 | | AomFft1dFunc fft_multi, AomFft1dFunc ifft_multi, AomFftTransposeFunc transpose, |
110 | 0 | int32_t vec_size) { |
111 | | // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft |
112 | | // and get real outputs. |
113 | 0 | for (int32_t y = 0; y <= n / 2; ++y) { |
114 | 0 | output[y * n] = input[2 * y * n]; |
115 | 0 | output[y * n + 1] = input[2 * (y * n + n / 2)]; |
116 | 0 | } |
117 | 0 | for (int32_t y = n / 2 + 1; y < n; ++y) { |
118 | 0 | output[y * n] = input[2 * (y - n / 2) * n + 1]; |
119 | 0 | output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1]; |
120 | 0 | } |
121 | |
|
122 | 0 | for (int32_t i = 0; i < 2; i += vec_size) { |
123 | 0 | ifft_multi(output + i, temp + i, n); |
124 | 0 | } |
125 | | // For the other columns, since we don't have a full ifft for complex inputs |
126 | | // we have to split them into the real and imaginary counterparts. |
127 | | // Pack the real component, then the imaginary components. |
128 | 0 | for (int32_t y = 0; y < n; ++y) { |
129 | 0 | for (int32_t x = 1; x < n / 2; ++x) { |
130 | 0 | output[y * n + (x + 1)] = input[2 * (y * n + x)]; |
131 | 0 | } |
132 | 0 | for (int32_t x = 1; x < n / 2; ++x) { |
133 | 0 | output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1]; |
134 | 0 | } |
135 | 0 | } |
136 | 0 | for (int32_t y = 2; y < vec_size; y++) { |
137 | 0 | fft_single(output + y, temp + y, n); |
138 | 0 | } |
139 | | // This is the part that can be sped up with SIMD |
140 | 0 | for (int32_t y = AOMMAX(2, vec_size); y < n; y += vec_size) { |
141 | 0 | fft_multi(output + y, temp + y, n); |
142 | 0 | } |
143 | | // Put the 0 and n/2 th results in the correct place. |
144 | 0 | for (int32_t x = 0; x < n; ++x) { |
145 | 0 | output[x] = temp[x * n]; |
146 | 0 | output[(n / 2) * n + x] = temp[x * n + 1]; |
147 | 0 | } |
148 | | // This rearranges and transposes. |
149 | 0 | for (int32_t y = 1; y < n / 2; ++y) { |
150 | | // Fill in the real columns |
151 | 0 | for (int32_t x = 0; x <= n / 2; ++x) { |
152 | 0 | output[x + y * n] = temp[(y + 1) + x * n] + |
153 | 0 | ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0); |
154 | 0 | } |
155 | 0 | for (int32_t x = n / 2 + 1; x < n; ++x) { |
156 | 0 | output[x + y * n] = temp[(y + 1) + (n - x) * n] - temp[(y + n / 2) + ((n - x) + n / 2) * n]; |
157 | 0 | } |
158 | | // Fill in the imag columns |
159 | 0 | for (int32_t x = 0; x <= n / 2; ++x) { |
160 | 0 | output[x + (y + n / 2) * n] = temp[(y + n / 2) + x * n] - |
161 | 0 | ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0); |
162 | 0 | } |
163 | 0 | for (int32_t x = n / 2 + 1; x < n; ++x) { |
164 | 0 | output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] + temp[(y + n / 2) + (n - x) * n]; |
165 | 0 | } |
166 | 0 | } |
167 | 0 | for (int32_t y = 0; y < n; y += vec_size) { |
168 | 0 | ifft_multi(output + y, temp + y, n); |
169 | 0 | } |
170 | 0 | transpose(temp, output, n); |
171 | 0 | } |
172 | | |
173 | | GEN_IFFT_2(void, float, float, float, *, store_float); |
174 | | GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float, sub_float); |
175 | | GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float); |
176 | | GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float); |
177 | | GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float); |
178 | | |
179 | 0 | void svt_aom_ifft2x2_float_c(const float* input, float* temp, float* output) { |
180 | 0 | svt_aom_ifft_2d_gen(input, |
181 | 0 | temp, |
182 | 0 | output, |
183 | 0 | 2, |
184 | 0 | svt_aom_fft1d_2_float, |
185 | 0 | svt_aom_fft1d_2_float, |
186 | 0 | svt_aom_ifft1d_2_float, |
187 | 0 | simple_transpose, |
188 | 0 | 1); |
189 | 0 | } |
190 | | |
191 | 0 | void svt_aom_ifft4x4_float_c(const float* input, float* temp, float* output) { |
192 | 0 | svt_aom_ifft_2d_gen(input, |
193 | 0 | temp, |
194 | 0 | output, |
195 | 0 | 4, |
196 | 0 | svt_aom_fft1d_4_float, |
197 | 0 | svt_aom_fft1d_4_float, |
198 | 0 | svt_aom_ifft1d_4_float, |
199 | 0 | simple_transpose, |
200 | 0 | 1); |
201 | 0 | } |
202 | | |
203 | 0 | void svt_aom_ifft8x8_float_c(const float* input, float* temp, float* output) { |
204 | 0 | svt_aom_ifft_2d_gen(input, |
205 | 0 | temp, |
206 | 0 | output, |
207 | 0 | 8, |
208 | 0 | svt_aom_fft1d_8_float, |
209 | 0 | svt_aom_fft1d_8_float, |
210 | 0 | svt_aom_ifft1d_8_float, |
211 | 0 | simple_transpose, |
212 | 0 | 1); |
213 | 0 | } |
214 | | |
215 | 0 | void svt_aom_ifft16x16_float_c(const float* input, float* temp, float* output) { |
216 | 0 | svt_aom_ifft_2d_gen(input, |
217 | 0 | temp, |
218 | 0 | output, |
219 | 0 | 16, |
220 | 0 | svt_aom_fft1d_16_float, |
221 | 0 | svt_aom_fft1d_16_float, |
222 | 0 | svt_aom_ifft1d_16_float, |
223 | 0 | simple_transpose, |
224 | 0 | 1); |
225 | 0 | } |
226 | | |
227 | 0 | void svt_aom_ifft32x32_float_c(const float* input, float* temp, float* output) { |
228 | 0 | svt_aom_ifft_2d_gen(input, |
229 | 0 | temp, |
230 | 0 | output, |
231 | 0 | 32, |
232 | 0 | svt_aom_fft1d_32_float, |
233 | 0 | svt_aom_fft1d_32_float, |
234 | 0 | svt_aom_ifft1d_32_float, |
235 | 0 | simple_transpose, |
236 | 0 | 1); |
237 | 0 | } |