/src/libvpx/vpx_dsp/x86/inv_txfm_avx2.c
Line | Count | Source |
1 | | /*
2 | | * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
3 | | *
4 | | * Use of this source code is governed by a BSD-style license
5 | | * that can be found in the LICENSE file in the root of the source
6 | | * tree. An additional intellectual property rights grant can be found
7 | | * in the file PATENTS. All contributing project authors may
8 | | * be found in the AUTHORS file in the root of the source tree.
9 | | */
10 | |
11 | | #include <immintrin.h> // AVX2
12 | | |
13 | | #include "./vpx_dsp_rtcd.h" |
14 | | #include "vpx_dsp/txfm_common.h" |
15 | | |
16 | | #define PAIR256_SET_EPI16(a, b) \ |
17 | 758M | _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ |
18 | 758M | (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ |
19 | 758M | (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ |
20 | 758M | (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) |
21 | | |
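// A minimal sketch (this helper is not part of the original file) of how the
// interleaved constant built by PAIR256_SET_EPI16 is meant to be used: with a
// data register holding interleaved (x, y) 16-bit pairs, _mm256_madd_epi16
// leaves x * a + y * b in every 32-bit lane, i.e. one term of a rotation.
static INLINE __m256i pair_madd_sketch(__m256i xy_pairs, int a, int b) {
  const __m256i cst = PAIR256_SET_EPI16(a, b);
  return _mm256_madd_epi16(xy_pairs, cst);
}
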
22 | | static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in, |
23 | 12.3M | int stride) { |
24 | 12.3M | int i; |
25 | | // Load 16x16 values |
26 | 209M | for (i = 0; i < 16; i++) { |
27 | 197M | #if CONFIG_VP9_HIGHBITDEPTH |
28 | 197M | const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride)); |
29 | 197M | const __m128i in1 = |
30 | 197M | _mm_loadu_si128((const __m128i *)((input + i * stride) + 4)); |
31 | 197M | const __m128i in2 = |
32 | 197M | _mm_loadu_si128((const __m128i *)((input + i * stride) + 8)); |
33 | 197M | const __m128i in3 = |
34 | 197M | _mm_loadu_si128((const __m128i *)((input + i * stride) + 12)); |
35 | 197M | const __m128i ls = _mm_packs_epi32(in0, in1); |
36 | 197M | const __m128i rs = _mm_packs_epi32(in2, in3); |
37 | 197M | in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1); |
38 | | #else |
39 | | in[i] = _mm256_load_si256((const __m256i *)(input + i * stride)); |
40 | | #endif |
41 | 197M | } |
42 | 12.3M | } |
43 | | |
44 | 1.51G | static INLINE __m256i dct_round_shift_avx2(__m256i in) { |
45 | 1.51G | const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING)); |
46 | 1.51G | return _mm256_srai_epi32(t, DCT_CONST_BITS); |
47 | 1.51G | } |
48 | | |
49 | 1.51G | static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) { |
50 | 1.51G | const __m256i t = _mm256_madd_epi16(*in, *cospi); |
51 | 1.51G | return dct_round_shift_avx2(t); |
52 | 1.51G | } |
53 | | |
54 | | // Compute the dot products of in0/in1 with x, round-shift, and pack to int16.
55 | | static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1, |
56 | 758M | __m256i *x) { |
57 | 758M | const __m256i t0 = idct_madd_round_shift_avx2(in0, x); |
58 | 758M | const __m256i t1 = idct_madd_round_shift_avx2(in1, x); |
59 | 758M | return _mm256_packs_epi32(t0, t1); |
60 | 758M | } |
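
// Scalar model of the helpers above (a sketch; calc_wraplow_scalar is
// illustrative and not part of libvpx): each 32-bit product sum is rounded by
// DCT_CONST_ROUNDING and shifted down by DCT_CONST_BITS, and the closing
// _mm256_packs_epi32 saturates the result to the int16 range, which is the
// "wraplow" part of the name.
static INLINE int16_t calc_wraplow_scalar(int32_t v) {
  const int32_t r = (v + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
  return (int16_t)(r > 32767 ? 32767 : r < -32768 ? -32768 : r);
}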
61 | | |
62 | | // Multiply the element pairs by the constants c0/c1 and combine them (a butterfly rotation).
63 | | static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1, |
64 | 379M | __m256i *out0, __m256i *out1) { |
65 | 379M | __m256i cst0 = PAIR256_SET_EPI16(c0, -c1); |
66 | 379M | __m256i cst1 = PAIR256_SET_EPI16(c1, c0); |
67 | 379M | __m256i lo = _mm256_unpacklo_epi16(in0, in1); |
68 | 379M | __m256i hi = _mm256_unpackhi_epi16(in0, in1); |
69 | 379M | *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0); |
70 | 379M | *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1); |
71 | 379M | } |
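
// Per-lane reference for butterfly16() (a sketch, not a helper the file uses;
// the saturation applied by the final pack is omitted): with x taken from in0
// and y from in1, the constant pairs above yield
//   *out0 = round_shift(x * c0 - y * c1)
//   *out1 = round_shift(x * c1 + y * c0)
// which is the standard rotation used throughout the inverse DCT.
static INLINE void butterfly16_scalar_sketch(int16_t x, int16_t y, int c0,
                                             int c1, int16_t *out0,
                                             int16_t *out1) {
  *out0 = (int16_t)(((int32_t)x * c0 - (int32_t)y * c1 + DCT_CONST_ROUNDING) >>
                    DCT_CONST_BITS);
  *out1 = (int16_t)(((int32_t)x * c1 + (int32_t)y * c0 + DCT_CONST_ROUNDING) >>
                    DCT_CONST_BITS);
}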
72 | | |
73 | 8.26M | static INLINE void idct16_16col(__m256i *in, __m256i *out) { |
74 | 8.26M | __m256i step1[16], step2[16]; |
75 | | |
76 | | // stage 2 |
77 | 8.26M | butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); |
78 | 8.26M | butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); |
79 | 8.26M | butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); |
80 | 8.26M | butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); |
81 | | |
82 | | // stage 3 |
83 | 8.26M | butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); |
84 | 8.26M | butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); |
85 | 8.26M | step1[8] = _mm256_add_epi16(step2[8], step2[9]); |
86 | 8.26M | step1[9] = _mm256_sub_epi16(step2[8], step2[9]); |
87 | 8.26M | step1[10] = _mm256_sub_epi16(step2[11], step2[10]); |
88 | 8.26M | step1[11] = _mm256_add_epi16(step2[10], step2[11]); |
89 | 8.26M | step1[12] = _mm256_add_epi16(step2[12], step2[13]); |
90 | 8.26M | step1[13] = _mm256_sub_epi16(step2[12], step2[13]); |
91 | 8.26M | step1[14] = _mm256_sub_epi16(step2[15], step2[14]); |
92 | 8.26M | step1[15] = _mm256_add_epi16(step2[14], step2[15]); |
93 | | |
94 | | // stage 4 |
95 | 8.26M | butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); |
96 | 8.26M | butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); |
97 | 8.26M | butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], |
98 | 8.26M | &step2[14]); |
99 | 8.26M | butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], |
100 | 8.26M | &step2[10]); |
101 | 8.26M | step2[5] = _mm256_sub_epi16(step1[4], step1[5]); |
102 | 8.26M | step1[4] = _mm256_add_epi16(step1[4], step1[5]); |
103 | 8.26M | step2[6] = _mm256_sub_epi16(step1[7], step1[6]); |
104 | 8.26M | step1[7] = _mm256_add_epi16(step1[6], step1[7]); |
105 | 8.26M | step2[8] = step1[8]; |
106 | 8.26M | step2[11] = step1[11]; |
107 | 8.26M | step2[12] = step1[12]; |
108 | 8.26M | step2[15] = step1[15]; |
109 | | |
110 | | // stage 5 |
111 | 8.26M | step1[0] = _mm256_add_epi16(step2[0], step2[3]); |
112 | 8.26M | step1[1] = _mm256_add_epi16(step2[1], step2[2]); |
113 | 8.26M | step1[2] = _mm256_sub_epi16(step2[1], step2[2]); |
114 | 8.26M | step1[3] = _mm256_sub_epi16(step2[0], step2[3]); |
115 | 8.26M | butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], |
116 | 8.26M | &step1[6]); |
117 | 8.26M | step1[8] = _mm256_add_epi16(step2[8], step2[11]); |
118 | 8.26M | step1[9] = _mm256_add_epi16(step2[9], step2[10]); |
119 | 8.26M | step1[10] = _mm256_sub_epi16(step2[9], step2[10]); |
120 | 8.26M | step1[11] = _mm256_sub_epi16(step2[8], step2[11]); |
121 | 8.26M | step1[12] = _mm256_sub_epi16(step2[15], step2[12]); |
122 | 8.26M | step1[13] = _mm256_sub_epi16(step2[14], step2[13]); |
123 | 8.26M | step1[14] = _mm256_add_epi16(step2[14], step2[13]); |
124 | 8.26M | step1[15] = _mm256_add_epi16(step2[15], step2[12]); |
125 | | |
126 | | // stage 6 |
127 | 8.26M | step2[0] = _mm256_add_epi16(step1[0], step1[7]); |
128 | 8.26M | step2[1] = _mm256_add_epi16(step1[1], step1[6]); |
129 | 8.26M | step2[2] = _mm256_add_epi16(step1[2], step1[5]); |
130 | 8.26M | step2[3] = _mm256_add_epi16(step1[3], step1[4]); |
131 | 8.26M | step2[4] = _mm256_sub_epi16(step1[3], step1[4]); |
132 | 8.26M | step2[5] = _mm256_sub_epi16(step1[2], step1[5]); |
133 | 8.26M | step2[6] = _mm256_sub_epi16(step1[1], step1[6]); |
134 | 8.26M | step2[7] = _mm256_sub_epi16(step1[0], step1[7]); |
135 | 8.26M | butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], |
136 | 8.26M | &step2[13]); |
137 | 8.26M | butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], |
138 | 8.26M | &step2[12]); |
139 | | |
140 | | // stage 7 |
141 | 8.26M | out[0] = _mm256_add_epi16(step2[0], step1[15]); |
142 | 8.26M | out[1] = _mm256_add_epi16(step2[1], step1[14]); |
143 | 8.26M | out[2] = _mm256_add_epi16(step2[2], step2[13]); |
144 | 8.26M | out[3] = _mm256_add_epi16(step2[3], step2[12]); |
145 | 8.26M | out[4] = _mm256_add_epi16(step2[4], step2[11]); |
146 | 8.26M | out[5] = _mm256_add_epi16(step2[5], step2[10]); |
147 | 8.26M | out[6] = _mm256_add_epi16(step2[6], step1[9]); |
148 | 8.26M | out[7] = _mm256_add_epi16(step2[7], step1[8]); |
149 | 8.26M | out[8] = _mm256_sub_epi16(step2[7], step1[8]); |
150 | 8.26M | out[9] = _mm256_sub_epi16(step2[6], step1[9]); |
151 | 8.26M | out[10] = _mm256_sub_epi16(step2[5], step2[10]); |
152 | 8.26M | out[11] = _mm256_sub_epi16(step2[4], step2[11]); |
153 | 8.26M | out[12] = _mm256_sub_epi16(step2[3], step2[12]); |
154 | 8.26M | out[13] = _mm256_sub_epi16(step2[2], step2[13]); |
155 | 8.26M | out[14] = _mm256_sub_epi16(step2[1], step1[14]); |
156 | 8.26M | out[15] = _mm256_sub_epi16(step2[0], step1[15]); |
157 | 8.26M | } |
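
// Sanity-check sketch (illustrative, not used by the code): when only in[0] is
// non-zero, every lane of every output row of idct16_16col() collapses to the
// value below. Applying it twice (row pass, then column pass) is the shortcut
// behind the separate DC-only vpx_idct16x16_1_add() path.
static INLINE int16_t idct16_dc_only_sketch(int16_t dc) {
  return (int16_t)(((int32_t)dc * cospi_16_64 + DCT_CONST_ROUNDING) >>
                   DCT_CONST_BITS);
}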
158 | | |
159 | 198M | static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) { |
160 | 198M | const __m256i zero = _mm256_setzero_si256(); |
161 | 198M | __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest))); |
162 | 198M | d0 = _mm256_permute4x64_epi64(d0, 0xd8); |
163 | 198M | d0 = _mm256_unpacklo_epi8(d0, zero); |
164 | 198M | d0 = _mm256_add_epi16(in_x, d0); |
165 | 198M | d0 = _mm256_packus_epi16( |
166 | 198M | d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1))); |
167 | | |
168 | 198M | _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0)); |
169 | 198M | } |
170 | | |
171 | 66.1M | static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) { |
172 | 66.1M | const __m256i final_rounding = _mm256_set1_epi16(1 << 5); |
173 | 66.1M | __m256i out; |
174 | 66.1M | out = _mm256_adds_epi16(in, final_rounding); |
175 | 66.1M | out = _mm256_srai_epi16(out, 6); |
176 | 66.1M | recon_and_store16(dest, out); |
177 | 66.1M | } |
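
// Scalar sketch of the reconstruction above (assumes the 8-bit pipeline;
// recon_pixel_sketch is illustrative only): round the accumulated transform
// output by 1/64, add it to the predicted pixel, and clamp to [0, 255], which
// is what the adds/srai/packus sequence computes for each lane.
static INLINE uint8_t recon_pixel_sketch(uint8_t pred, int16_t residual) {
  const int v = pred + ((residual + 32) >> 6);
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}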
178 | | |
179 | 4.12M | static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) { |
180 | 4.12M | const __m256i final_rounding = _mm256_set1_epi16(1 << 5); |
181 | 4.12M | int j = 0; |
182 | 70.1M | while (j < 32) { |
183 | 66.0M | in[j] = _mm256_adds_epi16(in[j], final_rounding); |
184 | 66.0M | in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding); |
185 | | |
186 | 66.0M | in[j] = _mm256_srai_epi16(in[j], 6); |
187 | 66.0M | in[j + 1] = _mm256_srai_epi16(in[j + 1], 6); |
188 | | |
189 | 66.0M | recon_and_store16(dst, in[j]); |
190 | 66.0M | dst += stride; |
191 | 66.0M | recon_and_store16(dst, in[j + 1]); |
192 | 66.0M | dst += stride; |
193 | 66.0M | j += 2; |
194 | 66.0M | } |
195 | 4.12M | } |
196 | | |
197 | 49.3M | static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) { |
198 | 49.3M | int i; |
199 | 49.3M | __m256i t[16], u[16]; |
200 | | // (1st, 2nd) ==> (lo, hi) |
201 | | // (0, 1) ==> (0, 1) |
202 | | // (2, 3) ==> (2, 3) |
203 | | // (4, 5) ==> (4, 5) |
204 | | // (6, 7) ==> (6, 7) |
205 | 246M | for (i = 0; i < 4; i++) { |
206 | 197M | t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); |
207 | 197M | t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); |
208 | 197M | } |
209 | | |
210 | | // (1st, 2nd) ==> (lo, hi) |
211 | | // (0, 2) ==> (0, 2) |
212 | | // (1, 3) ==> (1, 3) |
213 | | // (4, 6) ==> (4, 6) |
214 | | // (5, 7) ==> (5, 7) |
215 | 147M | for (i = 0; i < 2; i++) { |
216 | 98.6M | u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); |
217 | 98.6M | u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); |
218 | | |
219 | 98.6M | u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); |
220 | 98.6M | u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); |
221 | 98.6M | } |
222 | | |
223 | | // (1st, 2nd) ==> (lo, hi) |
224 | | // (0, 4) ==> (0, 1) |
225 | | // (1, 5) ==> (4, 5) |
226 | | // (2, 6) ==> (2, 3) |
227 | | // (3, 7) ==> (6, 7) |
228 | 147M | for (i = 0; i < 2; i++) { |
229 | 98.6M | out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); |
230 | 98.6M | out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); |
231 | | |
232 | 98.6M | out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); |
233 | 98.6M | out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); |
234 | 98.6M | } |
235 | 49.3M | } |
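
// Behavioural note with a scalar model (sketch only): the unpack sequence
// above transposes the 8x8 block in the low 128-bit lane and the 8x8 block in
// the high lane independently; transpose_16bit_16x16_avx2() below arranges its
// loads so that running this on both halves yields a full 16x16 transpose.
static INLINE void transpose_8x8_scalar_sketch(const int16_t in[8][8],
                                               int16_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) out[c][r] = in[r][c];
  }
}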
236 | | |
237 | 24.6M | static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) { |
238 | 24.6M | __m256i t[16]; |
239 | | |
240 | 24.6M | #define LOADL(idx) \ |
241 | 197M | t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ |
242 | 197M | t[idx] = _mm256_inserti128_si256( \ |
243 | 197M | t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1); |
244 | | |
245 | 24.6M | #define LOADR(idx) \ |
246 | 197M | t[8 + (idx)] = \ |
247 | 197M | _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ |
248 | 197M | t[8 + (idx)] = _mm256_inserti128_si256( \ |
249 | 197M | t[8 + (idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1); |
250 | | |
251 | | // load left 8x16 |
252 | 24.6M | LOADL(0) |
253 | 24.6M | LOADL(1) |
254 | 24.6M | LOADL(2) |
255 | 24.6M | LOADL(3) |
256 | 24.6M | LOADL(4) |
257 | 24.6M | LOADL(5) |
258 | 24.6M | LOADL(6) |
259 | 24.6M | LOADL(7) |
260 | | |
261 | | // load right 8x16 |
262 | 24.6M | LOADR(0) |
263 | 24.6M | LOADR(1) |
264 | 24.6M | LOADR(2) |
265 | 24.6M | LOADR(3) |
266 | 24.6M | LOADR(4) |
267 | 24.6M | LOADR(5) |
268 | 24.6M | LOADR(6) |
269 | 24.6M | LOADR(7) |
270 | | |
271 | | // get the top 16x8 result |
272 | 24.6M | transpose2_8x8_avx2(t, out); |
273 | | // get the bottom 16x8 result |
274 | 24.6M | transpose2_8x8_avx2(&t[8], &out[8]); |
275 | 24.6M | } |
276 | | |
277 | | void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, |
278 | 4.13M | int stride) { |
279 | 4.13M | int i; |
280 | 4.13M | __m256i in[16]; |
281 | | |
282 | | // Load 16x16 values |
283 | 4.13M | idct_load16x16(input, in, 16); |
284 | | |
285 | 4.13M | transpose_16bit_16x16_avx2(in, in); |
286 | 4.13M | idct16_16col(in, in); |
287 | | |
288 | 4.13M | transpose_16bit_16x16_avx2(in, in); |
289 | 4.13M | idct16_16col(in, in); |
290 | | |
291 | 70.2M | for (i = 0; i < 16; ++i) { |
292 | 66.1M | write_buffer_16x1(dest + i * stride, in[i]); |
293 | 66.1M | } |
294 | 4.13M | } |
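
// Usage sketch (hypothetical wrapper, not part of libvpx): callers normally
// reach this kernel through the run-time-dispatched vpx_idct16x16_256_add(),
// handing it a 16x16 block of dequantized coefficients and the 8-bit
// prediction block that the reconstructed residual is added into.
static INLINE void idct16x16_add_usage_sketch(const tran_low_t *dequant_coeffs,
                                              uint8_t *pred_block,
                                              int pred_stride) {
  vpx_idct16x16_256_add(dequant_coeffs, pred_block, pred_stride);
}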
295 | | |
296 | | // Butterfly stage that uses only additions and subtractions; size is 16 or 32.
297 | 16.4M | static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) { |
298 | 16.4M | int i = 0; |
299 | 16.4M | const int num = size >> 1; |
300 | 16.4M | const int bound = size - 1; |
301 | 213M | while (i < num) { |
302 | 197M | out[i] = _mm256_add_epi16(in[i], in[bound - i]); |
303 | 197M | out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); |
304 | 197M | i++; |
305 | 197M | } |
306 | 16.4M | } |
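
// Scalar view of add_sub_butterfly_avx2() (a sketch; wrapping vs. saturating
// arithmetic aside): the first half of out[] receives mirrored sums and the
// second half mirrored differences, the final "unfold" stage shared by the
// 16- and 32-point transforms.
static INLINE void add_sub_butterfly_scalar_sketch(const int16_t *in,
                                                   int16_t *out, int size) {
  int i;
  for (i = 0; i < size / 2; ++i) {
    out[i] = (int16_t)(in[i] + in[size - 1 - i]);
    out[size - 1 - i] = (int16_t)(in[i] - in[size - 1 - i]);
  }
}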
307 | | |
308 | | // For each 16x32 block __m256i in[32]:
309 | | // inputs are the rows with indices 0, 4, 8, 12, 16, 20, 24, 28;
310 | | // outputs are rows 0-7 of __m256i out[32].
311 | 8.22M | static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) { |
312 | 8.22M | __m256i step1[8], step2[8]; |
313 | | |
314 | | // stage 3 |
315 | 8.22M | butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); |
316 | 8.22M | butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); |
317 | | |
318 | | // stage 4 |
319 | 8.22M | butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); |
320 | 8.22M | butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); |
321 | 8.22M | step2[4] = _mm256_add_epi16(step1[4], step1[5]); |
322 | 8.22M | step2[5] = _mm256_sub_epi16(step1[4], step1[5]); |
323 | 8.22M | step2[6] = _mm256_sub_epi16(step1[7], step1[6]); |
324 | 8.22M | step2[7] = _mm256_add_epi16(step1[7], step1[6]); |
325 | | |
326 | | // stage 5 |
327 | 8.22M | step1[0] = _mm256_add_epi16(step2[0], step2[3]); |
328 | 8.22M | step1[1] = _mm256_add_epi16(step2[1], step2[2]); |
329 | 8.22M | step1[2] = _mm256_sub_epi16(step2[1], step2[2]); |
330 | 8.22M | step1[3] = _mm256_sub_epi16(step2[0], step2[3]); |
331 | 8.22M | step1[4] = step2[4]; |
332 | 8.22M | butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], |
333 | 8.22M | &step1[6]); |
334 | 8.22M | step1[7] = step2[7]; |
335 | | |
336 | | // stage 6 |
337 | 8.22M | out[0] = _mm256_add_epi16(step1[0], step1[7]); |
338 | 8.22M | out[1] = _mm256_add_epi16(step1[1], step1[6]); |
339 | 8.22M | out[2] = _mm256_add_epi16(step1[2], step1[5]); |
340 | 8.22M | out[3] = _mm256_add_epi16(step1[3], step1[4]); |
341 | 8.22M | out[4] = _mm256_sub_epi16(step1[3], step1[4]); |
342 | 8.22M | out[5] = _mm256_sub_epi16(step1[2], step1[5]); |
343 | 8.22M | out[6] = _mm256_sub_epi16(step1[1], step1[6]); |
344 | 8.22M | out[7] = _mm256_sub_epi16(step1[0], step1[7]); |
345 | 8.22M | } |
346 | | |
347 | | static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1, |
348 | 8.22M | __m256i *out) { |
349 | 8.22M | __m256i step2[32]; |
350 | | |
351 | | // stage 4 |
352 | 8.22M | step2[8] = step1[8]; |
353 | 8.22M | step2[15] = step1[15]; |
354 | 8.22M | butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], |
355 | 8.22M | &step2[14]); |
356 | 8.22M | butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], |
357 | 8.22M | &step2[13]); |
358 | 8.22M | step2[11] = step1[11]; |
359 | 8.22M | step2[12] = step1[12]; |
360 | | |
361 | | // stage 5 |
362 | 8.22M | step1[8] = _mm256_add_epi16(step2[8], step2[11]); |
363 | 8.22M | step1[9] = _mm256_add_epi16(step2[9], step2[10]); |
364 | 8.22M | step1[10] = _mm256_sub_epi16(step2[9], step2[10]); |
365 | 8.22M | step1[11] = _mm256_sub_epi16(step2[8], step2[11]); |
366 | 8.22M | step1[12] = _mm256_sub_epi16(step2[15], step2[12]); |
367 | 8.22M | step1[13] = _mm256_sub_epi16(step2[14], step2[13]); |
368 | 8.22M | step1[14] = _mm256_add_epi16(step2[14], step2[13]); |
369 | 8.22M | step1[15] = _mm256_add_epi16(step2[15], step2[12]); |
370 | | |
371 | | // stage 6 |
372 | 8.22M | out[8] = step1[8]; |
373 | 8.22M | out[9] = step1[9]; |
374 | 8.22M | butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], |
375 | 8.22M | &out[13]); |
376 | 8.22M | butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], |
377 | 8.22M | &out[12]); |
378 | 8.22M | out[14] = step1[14]; |
379 | 8.22M | out[15] = step1[15]; |
380 | 8.22M | } |
381 | | |
382 | | // For each 16x32 block __m256i in[32]:
383 | | // inputs are the rows with indices 2, 6, 10, 14, 18, 22, 26, 30;
384 | | // outputs are rows 8-15 of __m256i out[32].
385 | 8.22M | static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) { |
386 | 8.22M | __m256i step1[16], step2[16]; |
387 | | |
388 | | // stage 2 |
389 | 8.22M | butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); |
390 | 8.22M | butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); |
391 | 8.22M | butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); |
392 | 8.22M | butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); |
393 | | |
394 | | // stage 3 |
395 | 8.22M | step1[8] = _mm256_add_epi16(step2[8], step2[9]); |
396 | 8.22M | step1[9] = _mm256_sub_epi16(step2[8], step2[9]); |
397 | 8.22M | step1[10] = _mm256_sub_epi16(step2[11], step2[10]); |
398 | 8.22M | step1[11] = _mm256_add_epi16(step2[11], step2[10]); |
399 | 8.22M | step1[12] = _mm256_add_epi16(step2[12], step2[13]); |
400 | 8.22M | step1[13] = _mm256_sub_epi16(step2[12], step2[13]); |
401 | 8.22M | step1[14] = _mm256_sub_epi16(step2[15], step2[14]); |
402 | 8.22M | step1[15] = _mm256_add_epi16(step2[15], step2[14]); |
403 | | |
404 | 8.22M | idct32_16x32_quarter_2_stage_4_to_6(step1, out); |
405 | 8.22M | } |
406 | | |
407 | | static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1, |
408 | 8.22M | __m256i *out) { |
409 | 8.22M | __m256i step2[32]; |
410 | | |
411 | | // stage 4 |
412 | 8.22M | step2[16] = _mm256_add_epi16(step1[16], step1[19]); |
413 | 8.22M | step2[17] = _mm256_add_epi16(step1[17], step1[18]); |
414 | 8.22M | step2[18] = _mm256_sub_epi16(step1[17], step1[18]); |
415 | 8.22M | step2[19] = _mm256_sub_epi16(step1[16], step1[19]); |
416 | 8.22M | step2[20] = _mm256_sub_epi16(step1[23], step1[20]); |
417 | 8.22M | step2[21] = _mm256_sub_epi16(step1[22], step1[21]); |
418 | 8.22M | step2[22] = _mm256_add_epi16(step1[22], step1[21]); |
419 | 8.22M | step2[23] = _mm256_add_epi16(step1[23], step1[20]); |
420 | | |
421 | 8.22M | step2[24] = _mm256_add_epi16(step1[24], step1[27]); |
422 | 8.22M | step2[25] = _mm256_add_epi16(step1[25], step1[26]); |
423 | 8.22M | step2[26] = _mm256_sub_epi16(step1[25], step1[26]); |
424 | 8.22M | step2[27] = _mm256_sub_epi16(step1[24], step1[27]); |
425 | 8.22M | step2[28] = _mm256_sub_epi16(step1[31], step1[28]); |
426 | 8.22M | step2[29] = _mm256_sub_epi16(step1[30], step1[29]); |
427 | 8.22M | step2[30] = _mm256_add_epi16(step1[29], step1[30]); |
428 | 8.22M | step2[31] = _mm256_add_epi16(step1[28], step1[31]); |
429 | | |
430 | | // stage 5 |
431 | 8.22M | step1[16] = step2[16]; |
432 | 8.22M | step1[17] = step2[17]; |
433 | 8.22M | butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], |
434 | 8.22M | &step1[29]); |
435 | 8.22M | butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], |
436 | 8.22M | &step1[28]); |
437 | 8.22M | butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], |
438 | 8.22M | &step1[27]); |
439 | 8.22M | butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], |
440 | 8.22M | &step1[26]); |
441 | 8.22M | step1[22] = step2[22]; |
442 | 8.22M | step1[23] = step2[23]; |
443 | 8.22M | step1[24] = step2[24]; |
444 | 8.22M | step1[25] = step2[25]; |
445 | 8.22M | step1[30] = step2[30]; |
446 | 8.22M | step1[31] = step2[31]; |
447 | | |
448 | | // stage 6 |
449 | 8.22M | out[16] = _mm256_add_epi16(step1[16], step1[23]); |
450 | 8.22M | out[17] = _mm256_add_epi16(step1[17], step1[22]); |
451 | 8.22M | out[18] = _mm256_add_epi16(step1[18], step1[21]); |
452 | 8.22M | out[19] = _mm256_add_epi16(step1[19], step1[20]); |
453 | 8.22M | step2[20] = _mm256_sub_epi16(step1[19], step1[20]); |
454 | 8.22M | step2[21] = _mm256_sub_epi16(step1[18], step1[21]); |
455 | 8.22M | step2[22] = _mm256_sub_epi16(step1[17], step1[22]); |
456 | 8.22M | step2[23] = _mm256_sub_epi16(step1[16], step1[23]); |
457 | | |
458 | 8.22M | step2[24] = _mm256_sub_epi16(step1[31], step1[24]); |
459 | 8.22M | step2[25] = _mm256_sub_epi16(step1[30], step1[25]); |
460 | 8.22M | step2[26] = _mm256_sub_epi16(step1[29], step1[26]); |
461 | 8.22M | step2[27] = _mm256_sub_epi16(step1[28], step1[27]); |
462 | 8.22M | out[28] = _mm256_add_epi16(step1[27], step1[28]); |
463 | 8.22M | out[29] = _mm256_add_epi16(step1[26], step1[29]); |
464 | 8.22M | out[30] = _mm256_add_epi16(step1[25], step1[30]); |
465 | 8.22M | out[31] = _mm256_add_epi16(step1[24], step1[31]); |
466 | | |
467 | | // stage 7 |
468 | 8.22M | butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], |
469 | 8.22M | &out[27]); |
470 | 8.22M | butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], |
471 | 8.22M | &out[26]); |
472 | 8.22M | butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], |
473 | 8.22M | &out[25]); |
474 | 8.22M | butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], |
475 | 8.22M | &out[24]); |
476 | 8.22M | } |
477 | | |
478 | 8.22M | static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) { |
479 | 8.22M | __m256i temp[16]; |
480 | | |
481 | | // For each 16x32 block __m256i in[32]:
482 | | // inputs are the rows with indices 0, 4, 8, 12, 16, 20, 24, 28;
483 | | // outputs are rows 0-7 of __m256i out[32].
484 | 8.22M | idct32_1024_16x32_quarter_1(in, temp); |
485 | | |
486 | | // Inputs are the rows with indices 2, 6, 10, 14, 18, 22, 26, 30;
487 | | // outputs are rows 8-15 of __m256i out[32].
488 | 8.22M | idct32_1024_16x32_quarter_2(in, temp); |
489 | | |
490 | | // stage 7 |
491 | 8.22M | add_sub_butterfly_avx2(temp, out, 16); |
492 | 8.22M | } |
493 | | |
494 | | // For each 16x32 block __m256i in[32]:
495 | | // inputs are the odd-indexed rows
496 | | // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31;
497 | | // outputs are rows 16-23 and 24-31 of __m256i out[32].
498 | 8.22M | static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) { |
499 | 8.22M | __m256i step1[32], step2[32]; |
500 | | |
501 | | // stage 1 |
502 | 8.22M | butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); |
503 | 8.22M | butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); |
504 | 8.22M | butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); |
505 | 8.22M | butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); |
506 | | |
507 | 8.22M | butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); |
508 | 8.22M | butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); |
509 | | |
510 | 8.22M | butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); |
511 | 8.22M | butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); |
512 | | |
513 | | // stage 2 |
514 | 8.22M | step2[16] = _mm256_add_epi16(step1[16], step1[17]); |
515 | 8.22M | step2[17] = _mm256_sub_epi16(step1[16], step1[17]); |
516 | 8.22M | step2[18] = _mm256_sub_epi16(step1[19], step1[18]); |
517 | 8.22M | step2[19] = _mm256_add_epi16(step1[19], step1[18]); |
518 | 8.22M | step2[20] = _mm256_add_epi16(step1[20], step1[21]); |
519 | 8.22M | step2[21] = _mm256_sub_epi16(step1[20], step1[21]); |
520 | 8.22M | step2[22] = _mm256_sub_epi16(step1[23], step1[22]); |
521 | 8.22M | step2[23] = _mm256_add_epi16(step1[23], step1[22]); |
522 | | |
523 | 8.22M | step2[24] = _mm256_add_epi16(step1[24], step1[25]); |
524 | 8.22M | step2[25] = _mm256_sub_epi16(step1[24], step1[25]); |
525 | 8.22M | step2[26] = _mm256_sub_epi16(step1[27], step1[26]); |
526 | 8.22M | step2[27] = _mm256_add_epi16(step1[27], step1[26]); |
527 | 8.22M | step2[28] = _mm256_add_epi16(step1[28], step1[29]); |
528 | 8.22M | step2[29] = _mm256_sub_epi16(step1[28], step1[29]); |
529 | 8.22M | step2[30] = _mm256_sub_epi16(step1[31], step1[30]); |
530 | 8.22M | step2[31] = _mm256_add_epi16(step1[31], step1[30]); |
531 | | |
532 | | // stage 3 |
533 | 8.22M | step1[16] = step2[16]; |
534 | 8.22M | step1[31] = step2[31]; |
535 | 8.22M | butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], |
536 | 8.22M | &step1[30]); |
537 | 8.22M | butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], |
538 | 8.22M | &step1[29]); |
539 | 8.22M | step1[19] = step2[19]; |
540 | 8.22M | step1[20] = step2[20]; |
541 | 8.22M | butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], |
542 | 8.22M | &step1[26]); |
543 | 8.22M | butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], |
544 | 8.22M | &step1[25]); |
545 | 8.22M | step1[23] = step2[23]; |
546 | 8.22M | step1[24] = step2[24]; |
547 | 8.22M | step1[27] = step2[27]; |
548 | 8.22M | step1[28] = step2[28]; |
549 | | |
550 | 8.22M | idct32_16x32_quarter_3_4_stage_4_to_7(step1, out); |
551 | 8.22M | } |
552 | | |
553 | 8.22M | static INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) { |
554 | 8.22M | __m256i temp[32]; |
555 | | |
556 | | // For each 16x32 block __m256i in[32]:
557 | | // inputs are the rows with indices 0, 4, 8, 12, 16, 20, 24, 28,
558 | | // which produce rows 0-7 of __m256i out[32],
559 | | // and
560 | | // the rows with indices 2, 6, 10, 14, 18, 22, 26, 30,
561 | | // which produce rows 8-15 of __m256i out[32].
562 | 8.22M | idct32_1024_16x32_quarter_1_2(in, temp); |
563 | | |
564 | | // For each 16x32 block __m256i in[32]:
565 | | // inputs are the odd-indexed rows
566 | | // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31;
567 | | // outputs are rows 16-23 and 24-31 of __m256i out[32].
568 | 8.22M | idct32_1024_16x32_quarter_3_4(in, temp); |
569 | | |
570 | | // final stage |
571 | 8.22M | add_sub_butterfly_avx2(temp, out, 32); |
572 | 8.22M | } |
573 | | |
574 | | void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, |
575 | 2.03M | int stride) { |
576 | 2.03M | __m256i l[32], r[32], out[32], *in; |
577 | 2.03M | int i; |
578 | | |
579 | 2.03M | in = l; |
580 | | |
581 | 6.11M | for (i = 0; i < 2; i++) { |
582 | 4.07M | idct_load16x16(input, in, 32); |
583 | 4.07M | transpose_16bit_16x16_avx2(in, in); |
584 | | |
585 | 4.07M | idct_load16x16(input + 16, in + 16, 32); |
586 | 4.07M | transpose_16bit_16x16_avx2(in + 16, in + 16); |
587 | 4.07M | idct32_1024_16x32(in, in); |
588 | | |
589 | 4.07M | in = r; |
590 | 4.07M | input += 32 << 4; |
591 | 4.07M | } |
592 | | |
593 | 6.11M | for (i = 0; i < 32; i += 16) { |
594 | 4.07M | transpose_16bit_16x16_avx2(l + i, out); |
595 | 4.07M | transpose_16bit_16x16_avx2(r + i, out + 16); |
596 | 4.07M | idct32_1024_16x32(out, out); |
597 | | |
598 | 4.07M | store_buffer_16x32(out, dest, stride); |
599 | 4.07M | dest += 16; |
600 | 4.07M | } |
601 | 2.03M | } |
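
// Layout sketch for the function above: each __m256i row holds 16 int16
// values, so the 32x32 block is processed as a left half l[32] and a right
// half r[32] of 16 columns each, and `input += 32 << 4` advances by 16 rows of
// 32 coefficients between the two row-pass iterations.
static INLINE const tran_low_t *second_rowpass_input_sketch(
    const tran_low_t *input) {
  return input + 16 * 32;  // illustrative; the same offset as 32 << 4
}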
602 | | |
603 | | // Case when only the upper-left 16x16 block has non-zero coefficients.
604 | | void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, |
605 | 22.6k | int stride) { |
606 | 22.6k | __m256i in[32], io[32], out[32]; |
607 | 22.6k | int i; |
608 | | |
609 | 385k | for (i = 16; i < 32; i++) { |
610 | 363k | in[i] = _mm256_setzero_si256(); |
611 | 363k | } |
612 | | |
613 | | // rows |
614 | 22.6k | idct_load16x16(input, in, 32); |
615 | 22.6k | transpose_16bit_16x16_avx2(in, in); |
616 | 22.6k | idct32_1024_16x32(in, io); |
617 | | |
618 | | // columns |
619 | 68.0k | for (i = 0; i < 32; i += 16) { |
620 | 45.3k | transpose_16bit_16x16_avx2(io + i, in); |
621 | 45.3k | idct32_1024_16x32(in, out); |
622 | | |
623 | 45.3k | store_buffer_16x32(out, dest, stride); |
624 | 45.3k | dest += 16; |
625 | 45.3k | } |
626 | 22.6k | } |
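
// Caller-side sketch (illustrative; mirrors how VP9 is expected to choose an
// idct32x32 flavour from eob, the scan index of the last non-zero
// coefficient): the full 1024-coefficient kernel is only needed when
// coefficients extend past the upper-left 16x16 region handled above.
static INLINE void idct32x32_add_dispatch_sketch(const tran_low_t *input,
                                                 uint8_t *dest, int stride,
                                                 int eob) {
  if (eob == 1)
    vpx_idct32x32_1_add(input, dest, stride);
  else if (eob <= 34)
    vpx_idct32x32_34_add(input, dest, stride);
  else if (eob <= 135)
    vpx_idct32x32_135_add(input, dest, stride);
  else
    vpx_idct32x32_1024_add(input, dest, stride);
}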