Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include "aom_dsp/aom_dsp_common.h" |
13 | | #include "aom_dsp/fft_common.h" |
14 | | #include "config/aom_dsp_rtcd.h" |
15 | | |
16 | 0 | static inline void simple_transpose(const float *A, float *B, int n) { |
17 | 0 | for (int y = 0; y < n; y++) { |
18 | 0 | for (int x = 0; x < n; x++) { |
19 | 0 | B[y * n + x] = A[x * n + y]; |
20 | 0 | } |
21 | 0 | } |
22 | 0 | } |
23 | | |
24 | | // The 1d transform is real to complex and packs the complex results in |
25 | | // a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real |
26 | | // components, followed by the n/2 - 1 imaginary components). After the |
27 | | // transform is done on the rows, the first n/2 + 1 columns are real, and |
28 | | // the remaining are the imaginary components. After the transform on the |
29 | | // columns, the region of [0, n/2]x[0, n/2] contains the real part of |
30 | | // fft of the real columns. The real part of the 2d fft also includes the |
31 | | // imaginary part of transformed imaginary columns. This function assembles |
32 | | // the correct outputs while putting the real and imaginary components |
33 | | // next to each other. |
34 | | static inline void unpack_2d_output(const float *col_fft, float *output, |
35 | 0 | int n) { |
36 | 0 | for (int y = 0; y <= n / 2; ++y) { |
37 | 0 | const int y2 = y + n / 2; |
38 | 0 | const int y_extra = y2 > n / 2 && y2 < n; |
39 | |
|
40 | 0 | for (int x = 0; x <= n / 2; ++x) { |
41 | 0 | const int x2 = x + n / 2; |
42 | 0 | const int x_extra = x2 > n / 2 && x2 < n; |
43 | 0 | output[2 * (y * n + x)] = |
44 | 0 | col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); |
45 | 0 | output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) + |
46 | 0 | (x_extra ? col_fft[y * n + x2] : 0); |
47 | 0 | if (y_extra) { |
48 | 0 | output[2 * ((n - y) * n + x)] = |
49 | 0 | col_fft[y * n + x] + |
50 | 0 | (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); |
51 | 0 | output[2 * ((n - y) * n + x) + 1] = |
52 | 0 | -(y_extra ? col_fft[y2 * n + x] : 0) + |
53 | 0 | (x_extra ? col_fft[y * n + x2] : 0); |
54 | 0 | } |
55 | 0 | } |
56 | 0 | } |
57 | 0 | } |
58 | | |
59 | | void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, |
60 | | aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, |
61 | 0 | aom_fft_unpack_func_t unpack, int vec_size) { |
62 | 0 | for (int x = 0; x < n; x += vec_size) { |
63 | 0 | tform(input + x, output + x, n); |
64 | 0 | } |
65 | 0 | transpose(output, temp, n); |
66 | |
|
67 | 0 | for (int x = 0; x < n; x += vec_size) { |
68 | 0 | tform(temp + x, output + x, n); |
69 | 0 | } |
70 | 0 | transpose(output, temp, n); |
71 | |
|
72 | 0 | unpack(temp, output, n); |
73 | 0 | } |
74 | | |
75 | 0 | static inline void store_float(float *output, float input) { *output = input; } |
76 | 0 | static inline float add_float(float a, float b) { return a + b; } |
77 | 0 | static inline float sub_float(float a, float b) { return a - b; } |
78 | 0 | static inline float mul_float(float a, float b) { return a * b; } |
79 | | |
80 | | GEN_FFT_2(void, float, float, float, *, store_float) |
81 | | GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, |
82 | | sub_float) |
83 | | GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float, |
84 | | sub_float, mul_float) |
85 | | GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float, |
86 | | sub_float, mul_float) |
87 | | GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float, |
88 | | sub_float, mul_float) |
89 | | |
90 | 0 | void aom_fft2x2_float_c(const float *input, float *temp, float *output) { |
91 | 0 | aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose, |
92 | 0 | unpack_2d_output, 1); |
93 | 0 | } |
94 | | |
95 | 0 | void aom_fft4x4_float_c(const float *input, float *temp, float *output) { |
96 | 0 | aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose, |
97 | 0 | unpack_2d_output, 1); |
98 | 0 | } |
99 | | |
100 | 0 | void aom_fft8x8_float_c(const float *input, float *temp, float *output) { |
101 | 0 | aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose, |
102 | 0 | unpack_2d_output, 1); |
103 | 0 | } |
104 | | |
105 | 0 | void aom_fft16x16_float_c(const float *input, float *temp, float *output) { |
106 | 0 | aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose, |
107 | 0 | unpack_2d_output, 1); |
108 | 0 | } |
109 | | |
110 | 0 | void aom_fft32x32_float_c(const float *input, float *temp, float *output) { |
111 | 0 | aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose, |
112 | 0 | unpack_2d_output, 1); |
113 | 0 | } |
114 | | |
115 | | void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, |
116 | | aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, |
117 | | aom_fft_1d_func_t ifft_multi, |
118 | 0 | aom_fft_transpose_func_t transpose, int vec_size) { |
119 | | // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft |
120 | | // and get real outputs. |
121 | 0 | for (int y = 0; y <= n / 2; ++y) { |
122 | 0 | output[y * n] = input[2 * y * n]; |
123 | 0 | output[y * n + 1] = input[2 * (y * n + n / 2)]; |
124 | 0 | } |
125 | 0 | for (int y = n / 2 + 1; y < n; ++y) { |
126 | 0 | output[y * n] = input[2 * (y - n / 2) * n + 1]; |
127 | 0 | output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1]; |
128 | 0 | } |
129 | |
|
130 | 0 | for (int i = 0; i < 2; i += vec_size) { |
131 | 0 | ifft_multi(output + i, temp + i, n); |
132 | 0 | } |
133 | | |
134 | | // For the other columns, since we don't have a full ifft for complex inputs |
135 | | // we have to split them into the real and imaginary counterparts. |
136 | | // Pack the real component, then the imaginary components. |
137 | 0 | for (int y = 0; y < n; ++y) { |
138 | 0 | for (int x = 1; x < n / 2; ++x) { |
139 | 0 | output[y * n + (x + 1)] = input[2 * (y * n + x)]; |
140 | 0 | } |
141 | 0 | for (int x = 1; x < n / 2; ++x) { |
142 | 0 | output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1]; |
143 | 0 | } |
144 | 0 | } |
145 | 0 | for (int y = 2; y < vec_size; y++) { |
146 | 0 | fft_single(output + y, temp + y, n); |
147 | 0 | } |
148 | | // This is the part that can be sped up with SIMD |
149 | 0 | for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) { |
150 | 0 | fft_multi(output + y, temp + y, n); |
151 | 0 | } |
152 | | |
153 | | // Put the 0 and n/2 th results in the correct place. |
154 | 0 | for (int x = 0; x < n; ++x) { |
155 | 0 | output[x] = temp[x * n]; |
156 | 0 | output[(n / 2) * n + x] = temp[x * n + 1]; |
157 | 0 | } |
158 | | // This rearranges and transposes. |
159 | 0 | for (int y = 1; y < n / 2; ++y) { |
160 | | // Fill in the real columns |
161 | 0 | for (int x = 0; x <= n / 2; ++x) { |
162 | 0 | output[x + y * n] = |
163 | 0 | temp[(y + 1) + x * n] + |
164 | 0 | ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0); |
165 | 0 | } |
166 | 0 | for (int x = n / 2 + 1; x < n; ++x) { |
167 | 0 | output[x + y * n] = temp[(y + 1) + (n - x) * n] - |
168 | 0 | temp[(y + n / 2) + ((n - x) + n / 2) * n]; |
169 | 0 | } |
170 | | // Fill in the imag columns |
171 | 0 | for (int x = 0; x <= n / 2; ++x) { |
172 | 0 | output[x + (y + n / 2) * n] = |
173 | 0 | temp[(y + n / 2) + x * n] - |
174 | 0 | ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0); |
175 | 0 | } |
176 | 0 | for (int x = n / 2 + 1; x < n; ++x) { |
177 | 0 | output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] + |
178 | 0 | temp[(y + n / 2) + (n - x) * n]; |
179 | 0 | } |
180 | 0 | } |
181 | 0 | for (int y = 0; y < n; y += vec_size) { |
182 | 0 | ifft_multi(output + y, temp + y, n); |
183 | 0 | } |
184 | 0 | transpose(temp, output, n); |
185 | 0 | } |
186 | | |
187 | | GEN_IFFT_2(static void, float, float, float, *, store_float) |
188 | | GEN_IFFT_4(static void, float, float, float, *, store_float, (float), add_float, |
189 | | sub_float) |
190 | | GEN_IFFT_8(static void, float, float, float, *, store_float, (float), add_float, |
191 | | sub_float, mul_float) |
192 | | GEN_IFFT_16(static void, float, float, float, *, store_float, (float), |
193 | | add_float, sub_float, mul_float) |
194 | | GEN_IFFT_32(static void, float, float, float, *, store_float, (float), |
195 | | add_float, sub_float, mul_float) |
196 | | |
197 | 0 | void aom_ifft2x2_float_c(const float *input, float *temp, float *output) { |
198 | 0 | aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float, |
199 | 0 | aom_ifft1d_2_float, simple_transpose, 1); |
200 | 0 | } |
201 | | |
202 | 0 | void aom_ifft4x4_float_c(const float *input, float *temp, float *output) { |
203 | 0 | aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float, |
204 | 0 | aom_ifft1d_4_float, simple_transpose, 1); |
205 | 0 | } |
206 | | |
207 | 0 | void aom_ifft8x8_float_c(const float *input, float *temp, float *output) { |
208 | 0 | aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float, |
209 | 0 | aom_ifft1d_8_float, simple_transpose, 1); |
210 | 0 | } |
211 | | |
212 | 0 | void aom_ifft16x16_float_c(const float *input, float *temp, float *output) { |
213 | 0 | aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, |
214 | 0 | aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1); |
215 | 0 | } |
216 | | |
217 | 0 | void aom_ifft32x32_float_c(const float *input, float *temp, float *output) { |
218 | 0 | aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, |
219 | 0 | aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1); |
220 | 0 | } |