Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include "aom_dsp/aom_dsp_common.h" |
13 | | #include "aom_dsp/fft_common.h" |
14 | | |
15 | 0 | static INLINE void simple_transpose(const float *A, float *B, int n) { |
16 | 0 | for (int y = 0; y < n; y++) { |
17 | 0 | for (int x = 0; x < n; x++) { |
18 | 0 | B[y * n + x] = A[x * n + y]; |
19 | 0 | } |
20 | 0 | } |
21 | 0 | } |
22 | | |
23 | | // The 1d transform is real to complex and packs the complex results in |
24 | | // a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real |
25 | | // components, followed by the n/2 - 1 imaginary components). After the |
26 | | // transform is done on the rows, the first n/2 + 1 columns are real, and |
27 | | // the remaining are the imaginary components. After the transform on the |
28 | | // columns, the region of [0, n/2]x[0, n/2] contains the real part of |
29 | | // fft of the real columns. The real part of the 2d fft also includes the |
30 | | // imaginary part of transformed imaginary columns. This function assembles |
31 | | // the correct outputs while putting the real and imaginary components |
32 | | // next to each other. |
33 | | static INLINE void unpack_2d_output(const float *col_fft, float *output, |
34 | 0 | int n) { |
35 | 0 | for (int y = 0; y <= n / 2; ++y) { |
36 | 0 | const int y2 = y + n / 2; |
37 | 0 | const int y_extra = y2 > n / 2 && y2 < n; |
38 | |
|
39 | 0 | for (int x = 0; x <= n / 2; ++x) { |
40 | 0 | const int x2 = x + n / 2; |
41 | 0 | const int x_extra = x2 > n / 2 && x2 < n; |
42 | 0 | output[2 * (y * n + x)] = |
43 | 0 | col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); |
44 | 0 | output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) + |
45 | 0 | (x_extra ? col_fft[y * n + x2] : 0); |
46 | 0 | if (y_extra) { |
47 | 0 | output[2 * ((n - y) * n + x)] = |
48 | 0 | col_fft[y * n + x] + |
49 | 0 | (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); |
50 | 0 | output[2 * ((n - y) * n + x) + 1] = |
51 | 0 | -(y_extra ? col_fft[y2 * n + x] : 0) + |
52 | 0 | (x_extra ? col_fft[y * n + x2] : 0); |
53 | 0 | } |
54 | 0 | } |
55 | 0 | } |
56 | 0 | } |
57 | | |
58 | | void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, |
59 | | aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, |
60 | 0 | aom_fft_unpack_func_t unpack, int vec_size) { |
61 | 0 | for (int x = 0; x < n; x += vec_size) { |
62 | 0 | tform(input + x, output + x, n); |
63 | 0 | } |
64 | 0 | transpose(output, temp, n); |
65 | |
|
66 | 0 | for (int x = 0; x < n; x += vec_size) { |
67 | 0 | tform(temp + x, output + x, n); |
68 | 0 | } |
69 | 0 | transpose(output, temp, n); |
70 | |
|
71 | 0 | unpack(temp, output, n); |
72 | 0 | } |
73 | | |
74 | 0 | static INLINE void store_float(float *output, float input) { *output = input; } |
75 | 0 | static INLINE float add_float(float a, float b) { return a + b; } |
76 | 0 | static INLINE float sub_float(float a, float b) { return a - b; } |
77 | 0 | static INLINE float mul_float(float a, float b) { return a * b; } |
78 | | |
79 | | GEN_FFT_2(void, float, float, float, *, store_float); |
80 | | GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, |
81 | | sub_float); |
82 | | GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float, |
83 | | sub_float, mul_float); |
84 | | GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float, |
85 | | sub_float, mul_float); |
86 | | GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float, |
87 | | sub_float, mul_float); |
88 | | |
89 | 0 | void aom_fft2x2_float_c(const float *input, float *temp, float *output) { |
90 | 0 | aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose, |
91 | 0 | unpack_2d_output, 1); |
92 | 0 | } |
93 | | |
94 | 0 | void aom_fft4x4_float_c(const float *input, float *temp, float *output) { |
95 | 0 | aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose, |
96 | 0 | unpack_2d_output, 1); |
97 | 0 | } |
98 | | |
99 | 0 | void aom_fft8x8_float_c(const float *input, float *temp, float *output) { |
100 | 0 | aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose, |
101 | 0 | unpack_2d_output, 1); |
102 | 0 | } |
103 | | |
104 | 0 | void aom_fft16x16_float_c(const float *input, float *temp, float *output) { |
105 | 0 | aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose, |
106 | 0 | unpack_2d_output, 1); |
107 | 0 | } |
108 | | |
109 | 0 | void aom_fft32x32_float_c(const float *input, float *temp, float *output) { |
110 | 0 | aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose, |
111 | 0 | unpack_2d_output, 1); |
112 | 0 | } |
113 | | |
114 | | void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, |
115 | | aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, |
116 | | aom_fft_1d_func_t ifft_multi, |
117 | 0 | aom_fft_transpose_func_t transpose, int vec_size) { |
118 | | // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft |
119 | | // and get real outputs. |
120 | 0 | for (int y = 0; y <= n / 2; ++y) { |
121 | 0 | output[y * n] = input[2 * y * n]; |
122 | 0 | output[y * n + 1] = input[2 * (y * n + n / 2)]; |
123 | 0 | } |
124 | 0 | for (int y = n / 2 + 1; y < n; ++y) { |
125 | 0 | output[y * n] = input[2 * (y - n / 2) * n + 1]; |
126 | 0 | output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1]; |
127 | 0 | } |
128 | |
|
129 | 0 | for (int i = 0; i < 2; i += vec_size) { |
130 | 0 | ifft_multi(output + i, temp + i, n); |
131 | 0 | } |
132 | | |
133 | | // For the other columns, since we don't have a full ifft for complex inputs |
134 | | // we have to split them into the real and imaginary counterparts. |
135 | | // Pack the real component, then the imaginary components. |
136 | 0 | for (int y = 0; y < n; ++y) { |
137 | 0 | for (int x = 1; x < n / 2; ++x) { |
138 | 0 | output[y * n + (x + 1)] = input[2 * (y * n + x)]; |
139 | 0 | } |
140 | 0 | for (int x = 1; x < n / 2; ++x) { |
141 | 0 | output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1]; |
142 | 0 | } |
143 | 0 | } |
144 | 0 | for (int y = 2; y < vec_size; y++) { |
145 | 0 | fft_single(output + y, temp + y, n); |
146 | 0 | } |
147 | | // This is the part that can be sped up with SIMD |
148 | 0 | for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) { |
149 | 0 | fft_multi(output + y, temp + y, n); |
150 | 0 | } |
151 | | |
152 | | // Put the 0 and n/2 th results in the correct place. |
153 | 0 | for (int x = 0; x < n; ++x) { |
154 | 0 | output[x] = temp[x * n]; |
155 | 0 | output[(n / 2) * n + x] = temp[x * n + 1]; |
156 | 0 | } |
157 | | // This rearranges and transposes. |
158 | 0 | for (int y = 1; y < n / 2; ++y) { |
159 | | // Fill in the real columns |
160 | 0 | for (int x = 0; x <= n / 2; ++x) { |
161 | 0 | output[x + y * n] = |
162 | 0 | temp[(y + 1) + x * n] + |
163 | 0 | ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0); |
164 | 0 | } |
165 | 0 | for (int x = n / 2 + 1; x < n; ++x) { |
166 | 0 | output[x + y * n] = temp[(y + 1) + (n - x) * n] - |
167 | 0 | temp[(y + n / 2) + ((n - x) + n / 2) * n]; |
168 | 0 | } |
169 | | // Fill in the imag columns |
170 | 0 | for (int x = 0; x <= n / 2; ++x) { |
171 | 0 | output[x + (y + n / 2) * n] = |
172 | 0 | temp[(y + n / 2) + x * n] - |
173 | 0 | ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0); |
174 | 0 | } |
175 | 0 | for (int x = n / 2 + 1; x < n; ++x) { |
176 | 0 | output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] + |
177 | 0 | temp[(y + n / 2) + (n - x) * n]; |
178 | 0 | } |
179 | 0 | } |
180 | 0 | for (int y = 0; y < n; y += vec_size) { |
181 | 0 | ifft_multi(output + y, temp + y, n); |
182 | 0 | } |
183 | 0 | transpose(temp, output, n); |
184 | 0 | } |
185 | | |
186 | | GEN_IFFT_2(void, float, float, float, *, store_float); |
187 | | GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float, |
188 | | sub_float); |
189 | | GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float, |
190 | | sub_float, mul_float); |
191 | | GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float, |
192 | | sub_float, mul_float); |
193 | | GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float, |
194 | | sub_float, mul_float); |
195 | | |
196 | 0 | void aom_ifft2x2_float_c(const float *input, float *temp, float *output) { |
197 | 0 | aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float, |
198 | 0 | aom_ifft1d_2_float, simple_transpose, 1); |
199 | 0 | } |
200 | | |
201 | 0 | void aom_ifft4x4_float_c(const float *input, float *temp, float *output) { |
202 | 0 | aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float, |
203 | 0 | aom_ifft1d_4_float, simple_transpose, 1); |
204 | 0 | } |
205 | | |
206 | 0 | void aom_ifft8x8_float_c(const float *input, float *temp, float *output) { |
207 | 0 | aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float, |
208 | 0 | aom_ifft1d_8_float, simple_transpose, 1); |
209 | 0 | } |
210 | | |
211 | 0 | void aom_ifft16x16_float_c(const float *input, float *temp, float *output) { |
212 | 0 | aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, |
213 | 0 | aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1); |
214 | 0 | } |
215 | | |
216 | 0 | void aom_ifft32x32_float_c(const float *input, float *temp, float *output) { |
217 | 0 | aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, |
218 | 0 | aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1); |
219 | 0 | } |