/src/aom/av1/common/x86/av1_inv_txfm_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include "config/aom_config.h" |
13 | | |
14 | | #include "config/av1_rtcd.h" |
15 | | |
16 | | #include "av1/common/av1_inv_txfm1d_cfg.h" |
17 | | #include "av1/common/x86/av1_txfm_sse2.h" |
18 | | #include "av1/common/x86/av1_inv_txfm_avx2.h" |
19 | | #include "av1/common/x86/av1_inv_txfm_ssse3.h" |
20 | | |
21 | | // TODO(venkatsanampudi@ittiam.com): move this to header file |
22 | | |
23 | | // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5, in Q12 fixed point (x4096) |
24 | | static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, |
25 | | 4 * 4096, 4 * 5793 }; |
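 | | // Editor's sketch (not part of the original file): how the Q12 entries |
 | | // above come about. The 4096 scale corresponds to NewSqrt2Bits == 12 |
 | | // from av1/common/av1_txfm.h (an assumption worth checking there); odd |
 | | // powers reuse the rounded base 5793 = round(sqrt(2) * 4096) rather than |
 | | // re-rounding sqrt(2)^n directly. |
 | | #if 0  // illustrative only, kept out of the build |
 | | #include <math.h> |
 | | static int32_t new_sqrt2_q12(int n) {  // round(sqrt(2)^n * 2^12) |
 | |   return (int32_t)lround(pow(M_SQRT2, n) * 4096.0); |
 | | } |
 | | // new_sqrt2_q12(1) == 5793 and new_sqrt2_q12(2) == 8192 == 2 * 4096; for |
 | | // n == 3 the table stores 2 * 5793 == 11586, one more than the directly |
 | | // rounded 11585, because it compounds the rounded base. |
 | | #endif |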
26 | | |
27 | | static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, |
28 | 1.40M | const __m256i _r, int8_t cos_bit) { |
29 | 1.40M | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
30 | 1.40M | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
31 | 1.40M | btf_16_adds_subs_avx2(&x1[0], &x1[3]); |
32 | 1.40M | btf_16_adds_subs_avx2(&x1[1], &x1[2]); |
33 | 1.40M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); |
34 | | |
35 | 1.40M | btf_16_adds_subs_avx2(&x1[8], &x1[11]); |
36 | 1.40M | btf_16_adds_subs_avx2(&x1[9], &x1[10]); |
37 | 1.40M | btf_16_adds_subs_avx2(&x1[15], &x1[12]); |
38 | 1.40M | btf_16_adds_subs_avx2(&x1[14], &x1[13]); |
39 | 1.40M | } |
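 | | // Editor's sketch (not part of the original file): scalar model of the |
 | | // two 16-bit primitives the stage helpers are built from, assuming the |
 | | // usual libaom semantics where pair_set_w16_epi16(a, b) packs an (a, b) |
 | | // weight pair for _mm256_madd_epi16 and _r holds the rounding offset |
 | | // 1 << (cos_bit - 1). Per 16-bit lane: |
 | | #if 0  // illustrative only |
 | | static int16_t sat16(int32_t v) { |
 | |   return (int16_t)(v < -32768 ? -32768 : v > 32767 ? 32767 : v); |
 | | } |
 | | // btf_16_adds_subs_avx2(&x, &y):  x' = sat16(x + y); y' = sat16(x - y); |
 | | // btf_16_w16_avx2(w0, w1, &x, &y, _r, bit), w0 = (a0, b0), w1 = (a1, b1): |
 | | //   x' = (int16_t)((a0 * x + b0 * y + _r) >> bit); |
 | | //   y' = (int16_t)((a1 * x + b1 * y + _r) >> bit); |
 | | #endif |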
40 | | |
41 | | static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, |
42 | 1.40M | const __m256i _r, int8_t cos_bit) { |
43 | 1.40M | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
44 | 1.40M | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
45 | 1.40M | btf_16_adds_subs_avx2(&x[0], &x[7]); |
46 | 1.40M | btf_16_adds_subs_avx2(&x[1], &x[6]); |
47 | 1.40M | btf_16_adds_subs_avx2(&x[2], &x[5]); |
48 | 1.40M | btf_16_adds_subs_avx2(&x[3], &x[4]); |
49 | 1.40M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); |
50 | 1.40M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); |
51 | 1.40M | } |
52 | | |
53 | 1.40M | static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { |
54 | 1.40M | btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); |
55 | 1.40M | btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); |
56 | 1.40M | btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); |
57 | 1.40M | btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]); |
58 | 1.40M | btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]); |
59 | 1.40M | btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]); |
60 | 1.40M | btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]); |
61 | 1.40M | btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); |
62 | 1.40M | } |
63 | | |
64 | 459k | static void idct16_avx2(const __m256i *input, __m256i *output) { |
65 | 459k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
66 | 459k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
67 | | |
68 | 459k | __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); |
69 | 459k | __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); |
70 | 459k | __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); |
71 | 459k | __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); |
72 | 459k | __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); |
73 | 459k | __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); |
74 | 459k | __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); |
75 | 459k | __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); |
76 | 459k | __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); |
77 | 459k | __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); |
78 | 459k | __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); |
79 | 459k | __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); |
80 | 459k | __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
81 | 459k | __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); |
82 | 459k | __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); |
83 | 459k | __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); |
84 | 459k | __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); |
85 | 459k | __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); |
86 | 459k | __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); |
87 | | |
88 | | // stage 1 |
89 | 459k | __m256i x1[16]; |
90 | 459k | x1[0] = input[0]; |
91 | 459k | x1[1] = input[8]; |
92 | 459k | x1[2] = input[4]; |
93 | 459k | x1[3] = input[12]; |
94 | 459k | x1[4] = input[2]; |
95 | 459k | x1[5] = input[10]; |
96 | 459k | x1[6] = input[6]; |
97 | 459k | x1[7] = input[14]; |
98 | 459k | x1[8] = input[1]; |
99 | 459k | x1[9] = input[9]; |
100 | 459k | x1[10] = input[5]; |
101 | 459k | x1[11] = input[13]; |
102 | 459k | x1[12] = input[3]; |
103 | 459k | x1[13] = input[11]; |
104 | 459k | x1[14] = input[7]; |
105 | 459k | x1[15] = input[15]; |
106 | | |
107 | | // stage 2 |
108 | 459k | btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, |
109 | 459k | INV_COS_BIT); |
110 | 459k | btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, |
111 | 459k | INV_COS_BIT); |
112 | 459k | btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, |
113 | 459k | INV_COS_BIT); |
114 | 459k | btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, |
115 | 459k | INV_COS_BIT); |
116 | | |
117 | | // stage 3 |
118 | 459k | btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, |
119 | 459k | INV_COS_BIT); |
120 | 459k | btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, |
121 | 459k | INV_COS_BIT); |
122 | 459k | btf_16_adds_subs_avx2(&x1[8], &x1[9]); |
123 | 459k | btf_16_adds_subs_avx2(&x1[11], &x1[10]); |
124 | 459k | btf_16_adds_subs_avx2(&x1[12], &x1[13]); |
125 | 459k | btf_16_adds_subs_avx2(&x1[15], &x1[14]); |
126 | | |
127 | | // stage 4 |
128 | 459k | btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, |
129 | 459k | INV_COS_BIT); |
130 | 459k | btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, |
131 | 459k | INV_COS_BIT); |
132 | 459k | btf_16_adds_subs_avx2(&x1[4], &x1[5]); |
133 | 459k | btf_16_adds_subs_avx2(&x1[7], &x1[6]); |
134 | 459k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, |
135 | 459k | INV_COS_BIT); |
136 | 459k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, |
137 | 459k | INV_COS_BIT); |
138 | | |
139 | 459k | idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT); |
140 | 459k | idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); |
141 | 459k | idct16_stage7_avx2(output, x1); |
142 | 459k | } |
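 | | // Editor's note: the stage-1 loads above are the 4-bit bit-reversal |
 | | // permutation x1[i] = input[bitrev4(i)], the classic decimation-in-time |
 | | // input ordering; idct32_avx2 below uses the analogous 5-bit reversal. |
 | | #if 0  // illustrative only |
 | | static int bitrev4(int i) { |
 | |   return ((i & 1) << 3) | ((i & 2) << 1) | ((i & 4) >> 1) | ((i & 8) >> 3); |
 | | } |
 | | // bitrev4(0..15): 0 8 4 12 2 10 6 14 1 9 5 13 3 11 7 15 |
 | | #endif |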
143 | | |
144 | 944k | static void idct16_low8_avx2(const __m256i *input, __m256i *output) { |
145 | 944k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
146 | 944k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
147 | | |
148 | 944k | const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); |
149 | 944k | const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); |
150 | 944k | const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); |
151 | | |
152 | | // stage 1 |
153 | 944k | __m256i x1[16]; |
154 | 944k | x1[0] = input[0]; |
155 | 944k | x1[2] = input[4]; |
156 | 944k | x1[4] = input[2]; |
157 | 944k | x1[6] = input[6]; |
158 | 944k | x1[8] = input[1]; |
159 | 944k | x1[10] = input[5]; |
160 | 944k | x1[12] = input[3]; |
161 | 944k | x1[14] = input[7]; |
162 | | |
163 | | // stage 2 |
164 | 944k | btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]); |
165 | 944k | btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]); |
166 | 944k | btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]); |
167 | 944k | btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]); |
168 | | |
169 | | // stage 3 |
170 | 944k | btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); |
171 | 944k | btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); |
172 | 944k | btf_16_adds_subs_avx2(&x1[8], &x1[9]); |
173 | 944k | btf_16_adds_subs_avx2(&x1[11], &x1[10]); |
174 | 944k | btf_16_adds_subs_avx2(&x1[12], &x1[13]); |
175 | 944k | btf_16_adds_subs_avx2(&x1[15], &x1[14]); |
176 | | |
177 | | // stage 4 |
178 | 944k | btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); |
179 | 944k | btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); |
180 | 944k | btf_16_adds_subs_avx2(&x1[4], &x1[5]); |
181 | 944k | btf_16_adds_subs_avx2(&x1[7], &x1[6]); |
182 | 944k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, |
183 | 944k | INV_COS_BIT); |
184 | 944k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, |
185 | 944k | INV_COS_BIT); |
186 | | |
187 | 944k | idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT); |
188 | 944k | idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); |
189 | 944k | idct16_stage7_avx2(output, x1); |
190 | 944k | } |
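 | | // Editor's sketch: the _0 butterfly variant used in the pruned paths |
 | | // feeds a known-zero partner input, so the rotation collapses to two |
 | | // independent rounded multiplies per lane (the real macro uses a |
 | | // mulhrs-style saturating rounded multiply): |
 | | #if 0  // illustrative only |
 | | // btf_16_w16_0_avx2(w0, w1, in, out0, out1): |
 | | //   out0 = (int16_t)((w0 * in + (1 << (INV_COS_BIT - 1))) >> INV_COS_BIT); |
 | | //   out1 = (int16_t)((w1 * in + (1 << (INV_COS_BIT - 1))) >> INV_COS_BIT); |
 | | #endif |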
191 | | |
192 | 640k | static void idct16_low1_avx2(const __m256i *input, __m256i *output) { |
193 | 640k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
194 | | |
195 | | // stage 1 |
196 | 640k | __m256i x1[2]; |
197 | 640k | x1[0] = input[0]; |
198 | | |
199 | | // stage 2 |
200 | | // stage 3 |
201 | | // stage 4 |
202 | 640k | btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); |
203 | | |
204 | | // stage 5 |
205 | | // stage 6 |
206 | 640k | output[0] = x1[0]; |
207 | 640k | output[1] = x1[1]; |
208 | 640k | output[2] = x1[1]; |
209 | 640k | output[3] = x1[0]; |
210 | 640k | output[4] = x1[0]; |
211 | 640k | output[5] = x1[1]; |
212 | 640k | output[6] = x1[1]; |
213 | 640k | output[7] = x1[0]; |
214 | 640k | output[8] = x1[0]; |
215 | 640k | output[9] = x1[1]; |
216 | 640k | output[10] = x1[1]; |
217 | 640k | output[11] = x1[0]; |
218 | 640k | output[12] = x1[0]; |
219 | 640k | output[13] = x1[1]; |
220 | 640k | output[14] = x1[1]; |
221 | 640k | output[15] = x1[0]; |
222 | 640k | } |
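 | | // Editor's note: the low1/low8 entry points are pruned variants that |
 | | // assume only the first 1 or 8 rows of coefficients can be nonzero (as |
 | | // determined by the eob). In the DC-only case above, both weights are |
 | | // cospi[32], so x1[0] == x1[1] lane-for-lane and the whole transform |
 | | // reduces to broadcasting one rounded multiply: |
 | | #if 0  // illustrative only |
 | | //   dc = (cospi[32] * in0 + (1 << (INV_COS_BIT - 1))) >> INV_COS_BIT; |
 | | //   for (int i = 0; i < 16; ++i) output[i] = dc; |
 | | #endif |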
223 | | |
224 | 458k | static INLINE void iadst16_stage3_avx2(__m256i *x) { |
225 | 458k | btf_16_adds_subs_avx2(&x[0], &x[8]); |
226 | 458k | btf_16_adds_subs_avx2(&x[1], &x[9]); |
227 | 458k | btf_16_adds_subs_avx2(&x[2], &x[10]); |
228 | 458k | btf_16_adds_subs_avx2(&x[3], &x[11]); |
229 | 458k | btf_16_adds_subs_avx2(&x[4], &x[12]); |
230 | 458k | btf_16_adds_subs_avx2(&x[5], &x[13]); |
231 | 458k | btf_16_adds_subs_avx2(&x[6], &x[14]); |
232 | 458k | btf_16_adds_subs_avx2(&x[7], &x[15]); |
233 | 458k | } |
234 | | |
235 | | static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, |
236 | 458k | const __m256i _r, int8_t cos_bit) { |
237 | 458k | const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); |
238 | 458k | const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); |
239 | 458k | const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); |
240 | 458k | const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); |
241 | 458k | const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); |
242 | 458k | const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); |
243 | 458k | btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit); |
244 | 458k | btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit); |
245 | 458k | btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit); |
246 | 458k | btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); |
247 | 458k | } |
248 | | |
249 | 458k | static INLINE void iadst16_stage5_avx2(__m256i *x) { |
250 | 458k | btf_16_adds_subs_avx2(&x[0], &x[4]); |
251 | 458k | btf_16_adds_subs_avx2(&x[1], &x[5]); |
252 | 458k | btf_16_adds_subs_avx2(&x[2], &x[6]); |
253 | 458k | btf_16_adds_subs_avx2(&x[3], &x[7]); |
254 | 458k | btf_16_adds_subs_avx2(&x[8], &x[12]); |
255 | 458k | btf_16_adds_subs_avx2(&x[9], &x[13]); |
256 | 458k | btf_16_adds_subs_avx2(&x[10], &x[14]); |
257 | 458k | btf_16_adds_subs_avx2(&x[11], &x[15]); |
258 | 458k | } |
259 | | |
260 | | static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, |
261 | 458k | const __m256i _r, int8_t cos_bit) { |
262 | 458k | const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); |
263 | 458k | const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); |
264 | 458k | const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); |
265 | 458k | btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit); |
266 | 458k | btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit); |
267 | 458k | btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit); |
268 | 458k | btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); |
269 | 458k | } |
270 | | |
271 | 458k | static INLINE void iadst16_stage7_avx2(__m256i *x) { |
272 | 458k | btf_16_adds_subs_avx2(&x[0], &x[2]); |
273 | 458k | btf_16_adds_subs_avx2(&x[1], &x[3]); |
274 | 458k | btf_16_adds_subs_avx2(&x[4], &x[6]); |
275 | 458k | btf_16_adds_subs_avx2(&x[5], &x[7]); |
276 | 458k | btf_16_adds_subs_avx2(&x[8], &x[10]); |
277 | 458k | btf_16_adds_subs_avx2(&x[9], &x[11]); |
278 | 458k | btf_16_adds_subs_avx2(&x[12], &x[14]); |
279 | 458k | btf_16_adds_subs_avx2(&x[13], &x[15]); |
280 | 458k | } |
281 | | |
282 | | static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, |
283 | 577k | const __m256i _r, int8_t cos_bit) { |
284 | 577k | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
285 | 577k | const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); |
286 | 577k | btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); |
287 | 577k | btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); |
288 | 577k | btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); |
289 | 577k | btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); |
290 | 577k | } |
291 | | |
292 | 577k | static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { |
293 | 577k | const __m256i __zero = _mm256_setzero_si256(); |
294 | 577k | output[0] = x1[0]; |
295 | 577k | output[1] = _mm256_subs_epi16(__zero, x1[8]); |
296 | 577k | output[2] = x1[12]; |
297 | 577k | output[3] = _mm256_subs_epi16(__zero, x1[4]); |
298 | 577k | output[4] = x1[6]; |
299 | 577k | output[5] = _mm256_subs_epi16(__zero, x1[14]); |
300 | 577k | output[6] = x1[10]; |
301 | 577k | output[7] = _mm256_subs_epi16(__zero, x1[2]); |
302 | 577k | output[8] = x1[3]; |
303 | 577k | output[9] = _mm256_subs_epi16(__zero, x1[11]); |
304 | 577k | output[10] = x1[15]; |
305 | 577k | output[11] = _mm256_subs_epi16(__zero, x1[7]); |
306 | 577k | output[12] = x1[5]; |
307 | 577k | output[13] = _mm256_subs_epi16(__zero, x1[13]); |
308 | 577k | output[14] = x1[9]; |
309 | 577k | output[15] = _mm256_subs_epi16(__zero, x1[1]); |
310 | 577k | } |
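 | | // Editor's note: stage 9 is the final ADST output permutation; each odd |
 | | // output row is negated with a saturating subtract from zero, giving the |
 | | // alternating sign pattern of the inverse ADST basis. |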
311 | | |
312 | 128k | static void iadst16_avx2(const __m256i *input, __m256i *output) { |
313 | 128k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
314 | | |
315 | 128k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
316 | | |
317 | 128k | __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); |
318 | 128k | __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); |
319 | 128k | __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); |
320 | 128k | __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); |
321 | 128k | __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); |
322 | 128k | __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); |
323 | 128k | __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); |
324 | 128k | __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); |
325 | 128k | __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); |
326 | 128k | __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); |
327 | 128k | __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); |
328 | 128k | __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); |
329 | 128k | __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); |
330 | 128k | __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); |
331 | 128k | __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); |
332 | 128k | __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); |
333 | | |
334 | | // stage 1 |
335 | 128k | __m256i x1[16]; |
336 | 128k | x1[0] = input[15]; |
337 | 128k | x1[1] = input[0]; |
338 | 128k | x1[2] = input[13]; |
339 | 128k | x1[3] = input[2]; |
340 | 128k | x1[4] = input[11]; |
341 | 128k | x1[5] = input[4]; |
342 | 128k | x1[6] = input[9]; |
343 | 128k | x1[7] = input[6]; |
344 | 128k | x1[8] = input[7]; |
345 | 128k | x1[9] = input[8]; |
346 | 128k | x1[10] = input[5]; |
347 | 128k | x1[11] = input[10]; |
348 | 128k | x1[12] = input[3]; |
349 | 128k | x1[13] = input[12]; |
350 | 128k | x1[14] = input[1]; |
351 | 128k | x1[15] = input[14]; |
352 | | |
353 | | // stage 2 |
354 | 128k | btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, |
355 | 128k | INV_COS_BIT); |
356 | 128k | btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, |
357 | 128k | INV_COS_BIT); |
358 | 128k | btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, |
359 | 128k | INV_COS_BIT); |
360 | 128k | btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, |
361 | 128k | INV_COS_BIT); |
362 | 128k | btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, |
363 | 128k | INV_COS_BIT); |
364 | 128k | btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, |
365 | 128k | INV_COS_BIT); |
366 | 128k | btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, |
367 | 128k | INV_COS_BIT); |
368 | 128k | btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, |
369 | 128k | INV_COS_BIT); |
370 | | |
371 | 128k | iadst16_stage3_avx2(x1); |
372 | 128k | iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); |
373 | 128k | iadst16_stage5_avx2(x1); |
374 | 128k | iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); |
375 | 128k | iadst16_stage7_avx2(x1); |
376 | 128k | iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); |
377 | 128k | iadst16_stage9_avx2(output, x1); |
378 | 128k | } |
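 | | // Editor's note: the stage-1 loads above follow the pattern |
 | | //   x1[2 * k]     = input[15 - 2 * k]; |
 | | //   x1[2 * k + 1] = input[2 * k];        // k = 0 .. 7 |
 | | // pairing each coefficient with its mirror-image partner for the stage-2 |
 | | // rotations. |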
379 | | |
380 | 330k | static void iadst16_low8_avx2(const __m256i *input, __m256i *output) { |
381 | 330k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
382 | 330k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
383 | | |
384 | | // stage 1 |
385 | 330k | __m256i x1[16]; |
386 | 330k | x1[1] = input[0]; |
387 | 330k | x1[3] = input[2]; |
388 | 330k | x1[5] = input[4]; |
389 | 330k | x1[7] = input[6]; |
390 | 330k | x1[8] = input[7]; |
391 | 330k | x1[10] = input[5]; |
392 | 330k | x1[12] = input[3]; |
393 | 330k | x1[14] = input[1]; |
394 | | |
395 | | // stage 2 |
396 | 330k | btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); |
397 | 330k | btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]); |
398 | 330k | btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]); |
399 | 330k | btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]); |
400 | 330k | btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]); |
401 | 330k | btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]); |
402 | 330k | btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]); |
403 | 330k | btf_16_w16_0_avx2(cospi[58], cospi[6], x1[14], x1[14], x1[15]); |
404 | | |
405 | 330k | iadst16_stage3_avx2(x1); |
406 | 330k | iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); |
407 | 330k | iadst16_stage5_avx2(x1); |
408 | 330k | iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); |
409 | 330k | iadst16_stage7_avx2(x1); |
410 | 330k | iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); |
411 | 330k | iadst16_stage9_avx2(output, x1); |
412 | 330k | } |
413 | | |
414 | 118k | static void iadst16_low1_avx2(const __m256i *input, __m256i *output) { |
415 | 118k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
416 | 118k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
417 | | |
418 | 118k | const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); |
419 | 118k | const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); |
420 | 118k | const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); |
421 | 118k | const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); |
422 | | |
423 | | // stage 1 |
424 | 118k | __m256i x1[16]; |
425 | 118k | x1[1] = input[0]; |
426 | | |
427 | | // stage 2 |
428 | 118k | btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); |
429 | | |
430 | | // stage 3 |
431 | 118k | x1[8] = x1[0]; |
432 | 118k | x1[9] = x1[1]; |
433 | | |
434 | | // stage 4 |
435 | 118k | btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, |
436 | 118k | INV_COS_BIT); |
437 | | |
438 | | // stage 5 |
439 | 118k | x1[4] = x1[0]; |
440 | 118k | x1[5] = x1[1]; |
441 | | |
442 | 118k | x1[12] = x1[8]; |
443 | 118k | x1[13] = x1[9]; |
444 | | |
445 | | // stage 6 |
446 | 118k | btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, |
447 | 118k | INV_COS_BIT); |
448 | 118k | btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, |
449 | 118k | INV_COS_BIT); |
450 | | |
451 | | // stage 7 |
452 | 118k | x1[2] = x1[0]; |
453 | 118k | x1[3] = x1[1]; |
454 | 118k | x1[6] = x1[4]; |
455 | 118k | x1[7] = x1[5]; |
456 | 118k | x1[10] = x1[8]; |
457 | 118k | x1[11] = x1[9]; |
458 | 118k | x1[14] = x1[12]; |
459 | 118k | x1[15] = x1[13]; |
460 | | |
461 | 118k | iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); |
462 | 118k | iadst16_stage9_avx2(output, x1); |
463 | 118k | } |
464 | | |
465 | 389k | static INLINE void idct32_high16_stage3_avx2(__m256i *x) { |
466 | 389k | btf_16_adds_subs_avx2(&x[16], &x[17]); |
467 | 389k | btf_16_adds_subs_avx2(&x[19], &x[18]); |
468 | 389k | btf_16_adds_subs_avx2(&x[20], &x[21]); |
469 | 389k | btf_16_adds_subs_avx2(&x[23], &x[22]); |
470 | 389k | btf_16_adds_subs_avx2(&x[24], &x[25]); |
471 | 389k | btf_16_adds_subs_avx2(&x[27], &x[26]); |
472 | 389k | btf_16_adds_subs_avx2(&x[28], &x[29]); |
473 | 389k | btf_16_adds_subs_avx2(&x[31], &x[30]); |
474 | 389k | } |
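 | | // Editor's note: in the helpers that follow, "highN" names how many of |
 | | // the topmost intermediate values a stage still touches (e.g. high16 |
 | | // updates x[16..31], high48 updates x[16..63]); the remaining lanes need |
 | | // no work at that stage for the pruned inputs these paths serve. |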
475 | | |
476 | | static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, |
477 | 1.10M | const __m256i _r, int8_t cos_bit) { |
478 | 1.10M | const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); |
479 | 1.10M | const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); |
480 | 1.10M | const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); |
481 | 1.10M | const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); |
482 | 1.10M | const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); |
483 | 1.10M | const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); |
484 | 1.10M | btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); |
485 | 1.10M | btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); |
486 | 1.10M | btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); |
487 | 1.10M | btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); |
488 | 1.10M | } |
489 | | |
490 | | static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, |
491 | 1.10M | const __m256i _r, int8_t cos_bit) { |
492 | 1.10M | const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); |
493 | 1.10M | const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); |
494 | 1.10M | const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); |
495 | 1.10M | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); |
496 | 1.10M | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); |
497 | 1.10M | btf_16_adds_subs_avx2(&x[16], &x[19]); |
498 | 1.10M | btf_16_adds_subs_avx2(&x[17], &x[18]); |
499 | 1.10M | btf_16_adds_subs_avx2(&x[23], &x[20]); |
500 | 1.10M | btf_16_adds_subs_avx2(&x[22], &x[21]); |
501 | 1.10M | btf_16_adds_subs_avx2(&x[24], &x[27]); |
502 | 1.10M | btf_16_adds_subs_avx2(&x[25], &x[26]); |
503 | 1.10M | btf_16_adds_subs_avx2(&x[31], &x[28]); |
504 | 1.10M | btf_16_adds_subs_avx2(&x[30], &x[29]); |
505 | 1.10M | } |
506 | | |
507 | | static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, |
508 | 1.10M | const __m256i _r, int8_t cos_bit) { |
509 | 1.10M | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
510 | 1.10M | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
511 | 1.10M | const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); |
512 | 1.10M | const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); |
513 | 1.10M | const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); |
514 | 1.10M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); |
515 | 1.10M | btf_16_adds_subs_avx2(&x[8], &x[11]); |
516 | 1.10M | btf_16_adds_subs_avx2(&x[9], &x[10]); |
517 | 1.10M | btf_16_adds_subs_avx2(&x[15], &x[12]); |
518 | 1.10M | btf_16_adds_subs_avx2(&x[14], &x[13]); |
519 | 1.10M | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); |
520 | 1.10M | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); |
521 | 1.10M | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); |
522 | 1.10M | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); |
523 | 1.10M | } |
524 | | |
525 | | static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, |
526 | 1.10M | const __m256i _r, int8_t cos_bit) { |
527 | 1.10M | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
528 | 1.10M | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
529 | 1.10M | btf_16_adds_subs_avx2(&x[0], &x[7]); |
530 | 1.10M | btf_16_adds_subs_avx2(&x[1], &x[6]); |
531 | 1.10M | btf_16_adds_subs_avx2(&x[2], &x[5]); |
532 | 1.10M | btf_16_adds_subs_avx2(&x[3], &x[4]); |
533 | 1.10M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); |
534 | 1.10M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); |
535 | 1.10M | btf_16_adds_subs_avx2(&x[16], &x[23]); |
536 | 1.10M | btf_16_adds_subs_avx2(&x[17], &x[22]); |
537 | 1.10M | btf_16_adds_subs_avx2(&x[18], &x[21]); |
538 | 1.10M | btf_16_adds_subs_avx2(&x[19], &x[20]); |
539 | 1.10M | btf_16_adds_subs_avx2(&x[31], &x[24]); |
540 | 1.10M | btf_16_adds_subs_avx2(&x[30], &x[25]); |
541 | 1.10M | btf_16_adds_subs_avx2(&x[29], &x[26]); |
542 | 1.10M | btf_16_adds_subs_avx2(&x[28], &x[27]); |
543 | 1.10M | } |
544 | | |
545 | | static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, |
546 | 1.10M | const __m256i _r, int8_t cos_bit) { |
547 | 1.10M | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
548 | 1.10M | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
549 | 1.10M | btf_16_adds_subs_avx2(&x[0], &x[15]); |
550 | 1.10M | btf_16_adds_subs_avx2(&x[1], &x[14]); |
551 | 1.10M | btf_16_adds_subs_avx2(&x[2], &x[13]); |
552 | 1.10M | btf_16_adds_subs_avx2(&x[3], &x[12]); |
553 | 1.10M | btf_16_adds_subs_avx2(&x[4], &x[11]); |
554 | 1.10M | btf_16_adds_subs_avx2(&x[5], &x[10]); |
555 | 1.10M | btf_16_adds_subs_avx2(&x[6], &x[9]); |
556 | 1.10M | btf_16_adds_subs_avx2(&x[7], &x[8]); |
557 | 1.10M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); |
558 | 1.10M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); |
559 | 1.10M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); |
560 | 1.10M | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); |
561 | 1.10M | } |
562 | | |
563 | 1.10M | static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { |
564 | 1.10M | btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]); |
565 | 1.10M | btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); |
566 | 1.10M | btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); |
567 | 1.10M | btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); |
568 | 1.10M | btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); |
569 | 1.10M | btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); |
570 | 1.10M | btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); |
571 | 1.10M | btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]); |
572 | 1.10M | btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); |
573 | 1.10M | btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); |
574 | 1.10M | btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); |
575 | 1.10M | btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); |
576 | 1.10M | btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); |
577 | 1.10M | btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); |
578 | 1.10M | btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); |
579 | 1.10M | btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); |
580 | 1.10M | } |
581 | | |
582 | 544k | static void idct32_low1_avx2(const __m256i *input, __m256i *output) { |
583 | 544k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
584 | | |
585 | | // stage 1 |
586 | 544k | __m256i x[2]; |
587 | 544k | x[0] = input[0]; |
588 | | |
589 | | // stage 2 |
590 | | // stage 3 |
591 | | // stage 4 |
592 | | // stage 5 |
593 | 544k | btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); |
594 | | |
595 | | // stage 6 |
596 | | // stage 7 |
597 | | // stage 8 |
598 | | // stage 9 |
599 | 544k | output[0] = x[0]; |
600 | 544k | output[31] = x[0]; |
601 | 544k | output[1] = x[1]; |
602 | 544k | output[30] = x[1]; |
603 | 544k | output[2] = x[1]; |
604 | 544k | output[29] = x[1]; |
605 | 544k | output[3] = x[0]; |
606 | 544k | output[28] = x[0]; |
607 | 544k | output[4] = x[0]; |
608 | 544k | output[27] = x[0]; |
609 | 544k | output[5] = x[1]; |
610 | 544k | output[26] = x[1]; |
611 | 544k | output[6] = x[1]; |
612 | 544k | output[25] = x[1]; |
613 | 544k | output[7] = x[0]; |
614 | 544k | output[24] = x[0]; |
615 | 544k | output[8] = x[0]; |
616 | 544k | output[23] = x[0]; |
617 | 544k | output[9] = x[1]; |
618 | 544k | output[22] = x[1]; |
619 | 544k | output[10] = x[1]; |
620 | 544k | output[21] = x[1]; |
621 | 544k | output[11] = x[0]; |
622 | 544k | output[20] = x[0]; |
623 | 544k | output[12] = x[0]; |
624 | 544k | output[19] = x[0]; |
625 | 544k | output[13] = x[1]; |
626 | 544k | output[18] = x[1]; |
627 | 544k | output[14] = x[1]; |
628 | 544k | output[17] = x[1]; |
629 | 544k | output[15] = x[0]; |
630 | 544k | output[16] = x[0]; |
631 | 544k | } |
632 | | |
633 | 720k | static void idct32_low8_avx2(const __m256i *input, __m256i *output) { |
634 | 720k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
635 | 720k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
636 | | |
637 | | // stage 1 |
638 | 720k | __m256i x[32]; |
639 | 720k | x[0] = input[0]; |
640 | 720k | x[4] = input[4]; |
641 | 720k | x[8] = input[2]; |
642 | 720k | x[12] = input[6]; |
643 | 720k | x[16] = input[1]; |
644 | 720k | x[20] = input[5]; |
645 | 720k | x[24] = input[3]; |
646 | 720k | x[28] = input[7]; |
647 | | |
648 | | // stage 2 |
649 | 720k | btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); |
650 | 720k | btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); |
651 | 720k | btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); |
652 | 720k | btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); |
653 | | |
654 | | // stage 3 |
655 | 720k | btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); |
656 | 720k | btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); |
657 | 720k | x[17] = x[16]; |
658 | 720k | x[18] = x[19]; |
659 | 720k | x[21] = x[20]; |
660 | 720k | x[22] = x[23]; |
661 | 720k | x[25] = x[24]; |
662 | 720k | x[26] = x[27]; |
663 | 720k | x[29] = x[28]; |
664 | 720k | x[30] = x[31]; |
665 | | |
666 | | // stage 4 |
667 | 720k | btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); |
668 | 720k | x[9] = x[8]; |
669 | 720k | x[10] = x[11]; |
670 | 720k | x[13] = x[12]; |
671 | 720k | x[14] = x[15]; |
672 | 720k | idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); |
673 | | |
674 | | // stage 5 |
675 | 720k | btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); |
676 | 720k | x[5] = x[4]; |
677 | 720k | x[6] = x[7]; |
678 | 720k | idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); |
679 | | // stage 6 |
680 | 720k | x[3] = x[0]; |
681 | 720k | x[2] = x[1]; |
682 | 720k | idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT); |
683 | | |
684 | 720k | idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); |
685 | 720k | idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); |
686 | 720k | idct32_stage9_avx2(output, x); |
687 | 720k | } |
688 | | |
689 | 214k | static void idct32_low16_avx2(const __m256i *input, __m256i *output) { |
690 | 214k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
691 | 214k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
692 | | |
693 | | // stage 1 |
694 | 214k | __m256i x[32]; |
695 | 214k | x[0] = input[0]; |
696 | 214k | x[2] = input[8]; |
697 | 214k | x[4] = input[4]; |
698 | 214k | x[6] = input[12]; |
699 | 214k | x[8] = input[2]; |
700 | 214k | x[10] = input[10]; |
701 | 214k | x[12] = input[6]; |
702 | 214k | x[14] = input[14]; |
703 | 214k | x[16] = input[1]; |
704 | 214k | x[18] = input[9]; |
705 | 214k | x[20] = input[5]; |
706 | 214k | x[22] = input[13]; |
707 | 214k | x[24] = input[3]; |
708 | 214k | x[26] = input[11]; |
709 | 214k | x[28] = input[7]; |
710 | 214k | x[30] = input[15]; |
711 | | |
712 | | // stage 2 |
713 | 214k | btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); |
714 | 214k | btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); |
715 | 214k | btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); |
716 | 214k | btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); |
717 | 214k | btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); |
718 | 214k | btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); |
719 | 214k | btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); |
720 | 214k | btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); |
721 | | |
722 | | // stage 3 |
723 | 214k | btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); |
724 | 214k | btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); |
725 | 214k | btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); |
726 | 214k | btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); |
727 | 214k | idct32_high16_stage3_avx2(x); |
728 | | |
729 | | // stage 4 |
730 | 214k | btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); |
731 | 214k | btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); |
732 | 214k | btf_16_adds_subs_avx2(&x[8], &x[9]); |
733 | 214k | btf_16_adds_subs_avx2(&x[11], &x[10]); |
734 | 214k | btf_16_adds_subs_avx2(&x[12], &x[13]); |
735 | 214k | btf_16_adds_subs_avx2(&x[15], &x[14]); |
736 | 214k | idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); |
737 | | |
738 | | // stage 5 |
739 | 214k | btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); |
740 | 214k | btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); |
741 | 214k | btf_16_adds_subs_avx2(&x[4], &x[5]); |
742 | 214k | btf_16_adds_subs_avx2(&x[7], &x[6]); |
743 | 214k | idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); |
744 | | // stage 6 |
745 | 214k | btf_16_adds_subs_avx2(&x[0], &x[3]); |
746 | 214k | btf_16_adds_subs_avx2(&x[1], &x[2]); |
747 | 214k | idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT); |
748 | | |
749 | 214k | idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); |
750 | 214k | idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); |
751 | 214k | idct32_stage9_avx2(output, x); |
752 | 214k | } |
753 | | |
754 | 175k | static void idct32_avx2(const __m256i *input, __m256i *output) { |
755 | 175k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
756 | 175k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
757 | | |
758 | 175k | __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); |
759 | 175k | __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); |
760 | 175k | __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); |
761 | 175k | __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); |
762 | 175k | __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); |
763 | 175k | __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); |
764 | 175k | __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); |
765 | 175k | __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); |
766 | 175k | __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); |
767 | 175k | __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); |
768 | 175k | __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); |
769 | 175k | __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); |
770 | 175k | __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); |
771 | 175k | __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); |
772 | 175k | __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); |
773 | 175k | __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); |
774 | 175k | __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); |
775 | 175k | __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); |
776 | 175k | __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); |
777 | 175k | __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); |
778 | 175k | __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); |
779 | 175k | __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); |
780 | 175k | __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); |
781 | 175k | __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); |
782 | 175k | __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); |
783 | 175k | __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); |
784 | 175k | __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); |
785 | 175k | __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); |
786 | 175k | __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
787 | 175k | __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); |
788 | 175k | __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); |
789 | 175k | __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); |
790 | | |
791 | | // stage 1 |
792 | 175k | __m256i x1[32]; |
793 | 175k | x1[0] = input[0]; |
794 | 175k | x1[1] = input[16]; |
795 | 175k | x1[2] = input[8]; |
796 | 175k | x1[3] = input[24]; |
797 | 175k | x1[4] = input[4]; |
798 | 175k | x1[5] = input[20]; |
799 | 175k | x1[6] = input[12]; |
800 | 175k | x1[7] = input[28]; |
801 | 175k | x1[8] = input[2]; |
802 | 175k | x1[9] = input[18]; |
803 | 175k | x1[10] = input[10]; |
804 | 175k | x1[11] = input[26]; |
805 | 175k | x1[12] = input[6]; |
806 | 175k | x1[13] = input[22]; |
807 | 175k | x1[14] = input[14]; |
808 | 175k | x1[15] = input[30]; |
809 | 175k | x1[16] = input[1]; |
810 | 175k | x1[17] = input[17]; |
811 | 175k | x1[18] = input[9]; |
812 | 175k | x1[19] = input[25]; |
813 | 175k | x1[20] = input[5]; |
814 | 175k | x1[21] = input[21]; |
815 | 175k | x1[22] = input[13]; |
816 | 175k | x1[23] = input[29]; |
817 | 175k | x1[24] = input[3]; |
818 | 175k | x1[25] = input[19]; |
819 | 175k | x1[26] = input[11]; |
820 | 175k | x1[27] = input[27]; |
821 | 175k | x1[28] = input[7]; |
822 | 175k | x1[29] = input[23]; |
823 | 175k | x1[30] = input[15]; |
824 | 175k | x1[31] = input[31]; |
825 | | |
826 | | // stage 2 |
827 | 175k | btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, |
828 | 175k | INV_COS_BIT); |
829 | 175k | btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, |
830 | 175k | INV_COS_BIT); |
831 | 175k | btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, |
832 | 175k | INV_COS_BIT); |
833 | 175k | btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, |
834 | 175k | INV_COS_BIT); |
835 | 175k | btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, |
836 | 175k | INV_COS_BIT); |
837 | 175k | btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, |
838 | 175k | INV_COS_BIT); |
839 | 175k | btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, |
840 | 175k | INV_COS_BIT); |
841 | 175k | btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, |
842 | 175k | INV_COS_BIT); |
843 | | |
844 | | // stage 3 |
845 | 175k | btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, |
846 | 175k | INV_COS_BIT); |
847 | 175k | btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, |
848 | 175k | INV_COS_BIT); |
849 | 175k | btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, |
850 | 175k | INV_COS_BIT); |
851 | 175k | btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, |
852 | 175k | INV_COS_BIT); |
853 | 175k | idct32_high16_stage3_avx2(x1); |
854 | | |
855 | | // stage 4 |
856 | 175k | btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, |
857 | 175k | INV_COS_BIT); |
858 | 175k | btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, |
859 | 175k | INV_COS_BIT); |
860 | 175k | btf_16_adds_subs_avx2(&x1[8], &x1[9]); |
861 | 175k | btf_16_adds_subs_avx2(&x1[11], &x1[10]); |
862 | 175k | btf_16_adds_subs_avx2(&x1[12], &x1[13]); |
863 | 175k | btf_16_adds_subs_avx2(&x1[15], &x1[14]); |
864 | 175k | idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); |
865 | | |
866 | | // stage 5 |
867 | 175k | btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, |
868 | 175k | INV_COS_BIT); |
869 | 175k | btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, |
870 | 175k | INV_COS_BIT); |
871 | 175k | btf_16_adds_subs_avx2(&x1[4], &x1[5]); |
872 | 175k | btf_16_adds_subs_avx2(&x1[7], &x1[6]); |
873 | 175k | idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT); |
874 | | |
875 | | // stage 6 |
876 | 175k | btf_16_adds_subs_avx2(&x1[0], &x1[3]); |
877 | 175k | btf_16_adds_subs_avx2(&x1[1], &x1[2]); |
878 | 175k | idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT); |
879 | | |
880 | 175k | idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT); |
881 | 175k | idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT); |
882 | 175k | idct32_stage9_avx2(output, x1); |
883 | 175k | } |
884 | | |
885 | | static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, |
886 | 263k | const __m256i _r, int8_t cos_bit) { |
887 | 263k | (void)cos_bit; |
888 | 263k | const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); |
889 | 263k | const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); |
890 | 263k | const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); |
891 | 263k | const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); |
892 | 263k | const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); |
893 | 263k | const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); |
894 | 263k | const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); |
895 | 263k | const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); |
896 | 263k | const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); |
897 | 263k | const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); |
898 | 263k | const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); |
899 | 263k | const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); |
900 | 263k | btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); |
901 | 263k | btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit); |
902 | 263k | btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit); |
903 | 263k | btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); |
904 | 263k | btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); |
905 | 263k | btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit); |
906 | 263k | btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit); |
907 | 263k | btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); |
908 | 263k | } |
909 | | |
910 | | static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, |
911 | 263k | const __m256i _r, int8_t cos_bit) { |
912 | 263k | (void)cos_bit; |
913 | 263k | const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); |
914 | 263k | const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); |
915 | 263k | const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); |
916 | 263k | const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); |
917 | 263k | const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); |
918 | 263k | const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); |
919 | 263k | btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); |
920 | 263k | btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); |
921 | 263k | btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); |
922 | 263k | btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); |
923 | 263k | btf_16_adds_subs_avx2(&x[32], &x[35]); |
924 | 263k | btf_16_adds_subs_avx2(&x[33], &x[34]); |
925 | 263k | btf_16_adds_subs_avx2(&x[39], &x[36]); |
926 | 263k | btf_16_adds_subs_avx2(&x[38], &x[37]); |
927 | 263k | btf_16_adds_subs_avx2(&x[40], &x[43]); |
928 | 263k | btf_16_adds_subs_avx2(&x[41], &x[42]); |
929 | 263k | btf_16_adds_subs_avx2(&x[47], &x[44]); |
930 | 263k | btf_16_adds_subs_avx2(&x[46], &x[45]); |
931 | 263k | btf_16_adds_subs_avx2(&x[48], &x[51]); |
932 | 263k | btf_16_adds_subs_avx2(&x[49], &x[50]); |
933 | 263k | btf_16_adds_subs_avx2(&x[55], &x[52]); |
934 | 263k | btf_16_adds_subs_avx2(&x[54], &x[53]); |
935 | 263k | btf_16_adds_subs_avx2(&x[56], &x[59]); |
936 | 263k | btf_16_adds_subs_avx2(&x[57], &x[58]); |
937 | 263k | btf_16_adds_subs_avx2(&x[63], &x[60]); |
938 | 263k | btf_16_adds_subs_avx2(&x[62], &x[61]); |
939 | 263k | } |
940 | | |
941 | | static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, |
942 | 506k | const __m256i _r, int8_t cos_bit) { |
943 | 506k | (void)cos_bit; |
944 | 506k | const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); |
945 | 506k | const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); |
946 | 506k | const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); |
947 | 506k | const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); |
948 | 506k | const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); |
949 | 506k | const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); |
950 | 506k | btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit); |
951 | 506k | btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit); |
952 | 506k | btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit); |
953 | 506k | btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit); |
954 | 506k | btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit); |
955 | 506k | btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit); |
956 | 506k | btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit); |
957 | 506k | btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); |
958 | 506k | } |
959 | | |
960 | | static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, |
961 | 263k | const __m256i _r, int8_t cos_bit) { |
962 | 263k | btf_16_adds_subs_avx2(&x[16], &x[19]); |
963 | 263k | btf_16_adds_subs_avx2(&x[17], &x[18]); |
964 | 263k | btf_16_adds_subs_avx2(&x[23], &x[20]); |
965 | 263k | btf_16_adds_subs_avx2(&x[22], &x[21]); |
966 | 263k | btf_16_adds_subs_avx2(&x[24], &x[27]); |
967 | 263k | btf_16_adds_subs_avx2(&x[25], &x[26]); |
968 | 263k | btf_16_adds_subs_avx2(&x[31], &x[28]); |
969 | 263k | btf_16_adds_subs_avx2(&x[30], &x[29]); |
970 | 263k | idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); |
971 | 263k | } |
972 | | |
973 | | static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, |
974 | 506k | const __m256i _r, int8_t cos_bit) { |
975 | 506k | (void)cos_bit; |
976 | 506k | const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); |
977 | 506k | const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); |
978 | 506k | const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); |
979 | 506k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); |
980 | 506k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); |
981 | 506k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); |
982 | 506k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); |
983 | 506k | btf_16_adds_subs_avx2(&x[32], &x[39]); |
984 | 506k | btf_16_adds_subs_avx2(&x[33], &x[38]); |
985 | 506k | btf_16_adds_subs_avx2(&x[34], &x[37]); |
986 | 506k | btf_16_adds_subs_avx2(&x[35], &x[36]); |
987 | 506k | btf_16_adds_subs_avx2(&x[47], &x[40]); |
988 | 506k | btf_16_adds_subs_avx2(&x[46], &x[41]); |
989 | 506k | btf_16_adds_subs_avx2(&x[45], &x[42]); |
990 | 506k | btf_16_adds_subs_avx2(&x[44], &x[43]); |
991 | 506k | btf_16_adds_subs_avx2(&x[48], &x[55]); |
992 | 506k | btf_16_adds_subs_avx2(&x[49], &x[54]); |
993 | 506k | btf_16_adds_subs_avx2(&x[50], &x[53]); |
994 | 506k | btf_16_adds_subs_avx2(&x[51], &x[52]); |
995 | 506k | btf_16_adds_subs_avx2(&x[63], &x[56]); |
996 | 506k | btf_16_adds_subs_avx2(&x[62], &x[57]); |
997 | 506k | btf_16_adds_subs_avx2(&x[61], &x[58]); |
998 | 506k | btf_16_adds_subs_avx2(&x[60], &x[59]); |
999 | 506k | } |
1000 | | |
1001 | | static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, |
1002 | 506k | const __m256i _r, int8_t cos_bit) { |
1003 | 506k | (void)cos_bit; |
1004 | 506k | const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); |
1005 | 506k | const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); |
1006 | 506k | const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); |
1007 | 506k | btf_16_adds_subs_avx2(&x[16], &x[23]); |
1008 | 506k | btf_16_adds_subs_avx2(&x[17], &x[22]); |
1009 | 506k | btf_16_adds_subs_avx2(&x[18], &x[21]); |
1010 | 506k | btf_16_adds_subs_avx2(&x[19], &x[20]); |
1011 | 506k | btf_16_adds_subs_avx2(&x[31], &x[24]); |
1012 | 506k | btf_16_adds_subs_avx2(&x[30], &x[25]); |
1013 | 506k | btf_16_adds_subs_avx2(&x[29], &x[26]); |
1014 | 506k | btf_16_adds_subs_avx2(&x[28], &x[27]); |
1015 | 506k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit); |
1016 | 506k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit); |
1017 | 506k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit); |
1018 | 506k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit); |
1019 | 506k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit); |
1020 | 506k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit); |
1021 | 506k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit); |
1022 | 506k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); |
1023 | 506k | } |
1024 | | |
1025 | | static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, |
1026 | 506k | const __m256i _r, int8_t cos_bit) { |
1027 | 506k | (void)cos_bit; |
1028 | 506k | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
1029 | 506k | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
1030 | 506k | btf_16_adds_subs_avx2(&x[0], &x[15]); |
1031 | 506k | btf_16_adds_subs_avx2(&x[1], &x[14]); |
1032 | 506k | btf_16_adds_subs_avx2(&x[2], &x[13]); |
1033 | 506k | btf_16_adds_subs_avx2(&x[3], &x[12]); |
1034 | 506k | btf_16_adds_subs_avx2(&x[4], &x[11]); |
1035 | 506k | btf_16_adds_subs_avx2(&x[5], &x[10]); |
1036 | 506k | btf_16_adds_subs_avx2(&x[6], &x[9]); |
1037 | 506k | btf_16_adds_subs_avx2(&x[7], &x[8]); |
1038 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); |
1039 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); |
1040 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); |
1041 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); |
1042 | 506k | btf_16_adds_subs_avx2(&x[32], &x[47]); |
1043 | 506k | btf_16_adds_subs_avx2(&x[33], &x[46]); |
1044 | 506k | btf_16_adds_subs_avx2(&x[34], &x[45]); |
1045 | 506k | btf_16_adds_subs_avx2(&x[35], &x[44]); |
1046 | 506k | btf_16_adds_subs_avx2(&x[36], &x[43]); |
1047 | 506k | btf_16_adds_subs_avx2(&x[37], &x[42]); |
1048 | 506k | btf_16_adds_subs_avx2(&x[38], &x[41]); |
1049 | 506k | btf_16_adds_subs_avx2(&x[39], &x[40]); |
1050 | 506k | btf_16_adds_subs_avx2(&x[63], &x[48]); |
1051 | 506k | btf_16_adds_subs_avx2(&x[62], &x[49]); |
1052 | 506k | btf_16_adds_subs_avx2(&x[61], &x[50]); |
1053 | 506k | btf_16_adds_subs_avx2(&x[60], &x[51]); |
1054 | 506k | btf_16_adds_subs_avx2(&x[59], &x[52]); |
1055 | 506k | btf_16_adds_subs_avx2(&x[58], &x[53]); |
1056 | 506k | btf_16_adds_subs_avx2(&x[57], &x[54]); |
1057 | 506k | btf_16_adds_subs_avx2(&x[56], &x[55]); |
1058 | 506k | } |
1059 | | |
1060 | | static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, |
1061 | 506k | const __m256i _r, int8_t cos_bit) { |
1062 | 506k | (void)cos_bit; |
1063 | 506k | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
1064 | 506k | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
1065 | 506k | btf_16_adds_subs_avx2(&x[0], &x[31]); |
1066 | 506k | btf_16_adds_subs_avx2(&x[1], &x[30]); |
1067 | 506k | btf_16_adds_subs_avx2(&x[2], &x[29]); |
1068 | 506k | btf_16_adds_subs_avx2(&x[3], &x[28]); |
1069 | 506k | btf_16_adds_subs_avx2(&x[4], &x[27]); |
1070 | 506k | btf_16_adds_subs_avx2(&x[5], &x[26]); |
1071 | 506k | btf_16_adds_subs_avx2(&x[6], &x[25]); |
1072 | 506k | btf_16_adds_subs_avx2(&x[7], &x[24]); |
1073 | 506k | btf_16_adds_subs_avx2(&x[8], &x[23]); |
1074 | 506k | btf_16_adds_subs_avx2(&x[9], &x[22]); |
1075 | 506k | btf_16_adds_subs_avx2(&x[10], &x[21]); |
1076 | 506k | btf_16_adds_subs_avx2(&x[11], &x[20]); |
1077 | 506k | btf_16_adds_subs_avx2(&x[12], &x[19]); |
1078 | 506k | btf_16_adds_subs_avx2(&x[13], &x[18]); |
1079 | 506k | btf_16_adds_subs_avx2(&x[14], &x[17]); |
1080 | 506k | btf_16_adds_subs_avx2(&x[15], &x[16]); |
1081 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit); |
1082 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit); |
1083 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit); |
1084 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit); |
1085 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit); |
1086 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit); |
1087 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit); |
1088 | 506k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); |
1089 | 506k | } |
1090 | | |
1091 | 506k | static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { |
1092 | 506k | btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); |
1093 | 506k | btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); |
1094 | 506k | btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); |
1095 | 506k | btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]); |
1096 | 506k | btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]); |
1097 | 506k | btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]); |
1098 | 506k | btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]); |
1099 | 506k | btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]); |
1100 | 506k | btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]); |
1101 | 506k | btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]); |
1102 | 506k | btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]); |
1103 | 506k | btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]); |
1104 | 506k | btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]); |
1105 | 506k | btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]); |
1106 | 506k | btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]); |
1107 | 506k | btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]); |
1108 | 506k | btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]); |
1109 | 506k | btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]); |
1110 | 506k | btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]); |
1111 | 506k | btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]); |
1112 | 506k | btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]); |
1113 | 506k | btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]); |
1114 | 506k | btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]); |
1115 | 506k | btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]); |
1116 | 506k | btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]); |
1117 | 506k | btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]); |
1118 | 506k | btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]); |
1119 | 506k | btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]); |
1120 | 506k | btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]); |
1121 | 506k | btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]); |
1122 | 506k | btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]); |
1123 | 506k | btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); |
1124 | 506k | } |
1125 | | |
1126 | 170k | static void idct64_low1_avx2(const __m256i *input, __m256i *output) { |
1127 | 170k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
1128 | | |
1129 | | // stage 1 |
1130 | 170k | __m256i x[32]; |
1131 | 170k | x[0] = input[0]; |
1132 | | |
1133 | | // stage 2 |
1134 | | // stage 3 |
1135 | | // stage 4 |
1136 | | // stage 5 |
1137 | | // stage 6 |
1138 | 170k | btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); |
1139 | | |
1140 | | // stage 7 |
1141 | | // stage 8 |
1142 | | // stage 9 |
1143 | | // stage 10 |
1144 | | // stage 11 |
1145 | 170k | output[0] = x[0]; |
1146 | 170k | output[63] = x[0]; |
1147 | 170k | output[1] = x[1]; |
1148 | 170k | output[62] = x[1]; |
1149 | 170k | output[2] = x[1]; |
1150 | 170k | output[61] = x[1]; |
1151 | 170k | output[3] = x[0]; |
1152 | 170k | output[60] = x[0]; |
1153 | 170k | output[4] = x[0]; |
1154 | 170k | output[59] = x[0]; |
1155 | 170k | output[5] = x[1]; |
1156 | 170k | output[58] = x[1]; |
1157 | 170k | output[6] = x[1]; |
1158 | 170k | output[57] = x[1]; |
1159 | 170k | output[7] = x[0]; |
1160 | 170k | output[56] = x[0]; |
1161 | 170k | output[8] = x[0]; |
1162 | 170k | output[55] = x[0]; |
1163 | 170k | output[9] = x[1]; |
1164 | 170k | output[54] = x[1]; |
1165 | 170k | output[10] = x[1]; |
1166 | 170k | output[53] = x[1]; |
1167 | 170k | output[11] = x[0]; |
1168 | 170k | output[52] = x[0]; |
1169 | 170k | output[12] = x[0]; |
1170 | 170k | output[51] = x[0]; |
1171 | 170k | output[13] = x[1]; |
1172 | 170k | output[50] = x[1]; |
1173 | 170k | output[14] = x[1]; |
1174 | 170k | output[49] = x[1]; |
1175 | 170k | output[15] = x[0]; |
1176 | 170k | output[48] = x[0]; |
1177 | 170k | output[16] = x[0]; |
1178 | 170k | output[47] = x[0]; |
1179 | 170k | output[17] = x[1]; |
1180 | 170k | output[46] = x[1]; |
1181 | 170k | output[18] = x[1]; |
1182 | 170k | output[45] = x[1]; |
1183 | 170k | output[19] = x[0]; |
1184 | 170k | output[44] = x[0]; |
1185 | 170k | output[20] = x[0]; |
1186 | 170k | output[43] = x[0]; |
1187 | 170k | output[21] = x[1]; |
1188 | 170k | output[42] = x[1]; |
1189 | 170k | output[22] = x[1]; |
1190 | 170k | output[41] = x[1]; |
1191 | 170k | output[23] = x[0]; |
1192 | 170k | output[40] = x[0]; |
1193 | 170k | output[24] = x[0]; |
1194 | 170k | output[39] = x[0]; |
1195 | 170k | output[25] = x[1]; |
1196 | 170k | output[38] = x[1]; |
1197 | 170k | output[26] = x[1]; |
1198 | 170k | output[37] = x[1]; |
1199 | 170k | output[27] = x[0]; |
1200 | 170k | output[36] = x[0]; |
1201 | 170k | output[28] = x[0]; |
1202 | 170k | output[35] = x[0]; |
1203 | 170k | output[29] = x[1]; |
1204 | 170k | output[34] = x[1]; |
1205 | 170k | output[30] = x[1]; |
1206 | 170k | output[33] = x[1]; |
1207 | 170k | output[31] = x[0]; |
1208 | 170k | output[32] = x[0]; |
1209 | 170k | } |
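
// A scalar model (illustrative, not part of the library) of what the
// btf_16_w16_0_avx2 half-butterfly above computes per coefficient, assuming
// it follows the same mulhrs pattern as the ssse3 btf_16 helpers: each
// output is in * w, rounded down from Q(INV_COS_BIT). For the DC-only path,
// w0 == w1 == cospi[32], i.e. a scale by ~1/sqrt(2); the 16-bit saturation
// of mulhrs is omitted here.
static int16_t half_butterfly_scalar(int16_t in, int32_t w) {
  const int32_t p = in * w;  // w is a cospi value in Q(INV_COS_BIT)
  return (int16_t)((p + (1 << (INV_COS_BIT - 1))) >> INV_COS_BIT);
}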
1210 | | |
1211 | 242k | static void idct64_low8_avx2(const __m256i *input, __m256i *output) { |
1212 | 242k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
1213 | 242k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
1214 | 242k | const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); |
1215 | 242k | const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); |
1216 | 242k | const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); |
1217 | 242k | const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); |
1218 | 242k | const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); |
1219 | 242k | const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); |
1220 | 242k | const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); |
1221 | 242k | const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); |
1222 | 242k | const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); |
1223 | 242k | const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); |
1224 | 242k | const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); |
1225 | 242k | const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); |
1226 | 242k | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
1227 | 242k | const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); |
1228 | 242k | const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); |
1229 | 242k | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
1230 | | |
1231 | | // stage 1 |
1232 | 242k | __m256i x[64]; |
1233 | 242k | x[0] = input[0]; |
1234 | 242k | x[8] = input[4]; |
1235 | 242k | x[16] = input[2]; |
1236 | 242k | x[24] = input[6]; |
1237 | 242k | x[32] = input[1]; |
1238 | 242k | x[40] = input[5]; |
1239 | 242k | x[48] = input[3]; |
1240 | 242k | x[56] = input[7]; |
1241 | | |
1242 | | // stage 2 |
1243 | 242k | btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); |
1244 | 242k | btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); |
1245 | 242k | btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); |
1246 | 242k | btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); |
1247 | | |
1248 | | // stage 3 |
1249 | 242k | btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); |
1250 | 242k | btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); |
1251 | 242k | x[33] = x[32]; |
1252 | 242k | x[38] = x[39]; |
1253 | 242k | x[41] = x[40]; |
1254 | 242k | x[46] = x[47]; |
1255 | 242k | x[49] = x[48]; |
1256 | 242k | x[54] = x[55]; |
1257 | 242k | x[57] = x[56]; |
1258 | 242k | x[62] = x[63]; |
1259 | | |
1260 | | // stage 4 |
1261 | 242k | btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); |
1262 | 242k | x[17] = x[16]; |
1263 | 242k | x[22] = x[23]; |
1264 | 242k | x[25] = x[24]; |
1265 | 242k | x[30] = x[31]; |
1266 | 242k | btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, |
1267 | 242k | INV_COS_BIT); |
1268 | 242k | btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, |
1269 | 242k | INV_COS_BIT); |
1270 | 242k | btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, |
1271 | 242k | INV_COS_BIT); |
1272 | 242k | btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, |
1273 | 242k | INV_COS_BIT); |
1274 | | |
1275 | | // stage 5 |
1276 | 242k | x[9] = x[8]; |
1277 | 242k | x[14] = x[15]; |
1278 | 242k | btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, |
1279 | 242k | INV_COS_BIT); |
1280 | 242k | btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, |
1281 | 242k | INV_COS_BIT); |
1282 | 242k | x[35] = x[32]; |
1283 | 242k | x[34] = x[33]; |
1284 | 242k | x[36] = x[39]; |
1285 | 242k | x[37] = x[38]; |
1286 | 242k | x[43] = x[40]; |
1287 | 242k | x[42] = x[41]; |
1288 | 242k | x[44] = x[47]; |
1289 | 242k | x[45] = x[46]; |
1290 | 242k | x[51] = x[48]; |
1291 | 242k | x[50] = x[49]; |
1292 | 242k | x[52] = x[55]; |
1293 | 242k | x[53] = x[54]; |
1294 | 242k | x[59] = x[56]; |
1295 | 242k | x[58] = x[57]; |
1296 | 242k | x[60] = x[63]; |
1297 | 242k | x[61] = x[62]; |
1298 | | |
1299 | | // stage 6 |
1300 | 242k | btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); |
1301 | 242k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT); |
1302 | 242k | x[19] = x[16]; |
1303 | 242k | x[18] = x[17]; |
1304 | 242k | x[20] = x[23]; |
1305 | 242k | x[21] = x[22]; |
1306 | 242k | x[27] = x[24]; |
1307 | 242k | x[26] = x[25]; |
1308 | 242k | x[28] = x[31]; |
1309 | 242k | x[29] = x[30]; |
1310 | 242k | idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT); |
1311 | | |
1312 | | // stage 7 |
1313 | 242k | x[3] = x[0]; |
1314 | 242k | x[2] = x[1]; |
1315 | 242k | x[11] = x[8]; |
1316 | 242k | x[10] = x[9]; |
1317 | 242k | x[12] = x[15]; |
1318 | 242k | x[13] = x[14]; |
1319 | 242k | idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1320 | | |
1321 | | // stage 8 |
1322 | 242k | x[7] = x[0]; |
1323 | 242k | x[6] = x[1]; |
1324 | 242k | x[5] = x[2]; |
1325 | 242k | x[4] = x[3]; |
1326 | 242k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, |
1327 | 242k | INV_COS_BIT); |
1328 | 242k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, |
1329 | 242k | INV_COS_BIT); |
1330 | 242k | idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1331 | | |
1332 | 242k | idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); |
1333 | 242k | idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT); |
1334 | 242k | idct64_stage11_avx2(output, x); |
1335 | 242k | } |
1336 | | |
1337 | 138k | static void idct64_low16_avx2(const __m256i *input, __m256i *output) { |
1338 | 138k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
1339 | 138k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
1340 | | |
1341 | 138k | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
1342 | 138k | const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); |
1343 | 138k | const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); |
1344 | 138k | const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); |
1345 | 138k | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
1346 | | |
1347 | | // stage 1 |
1348 | 138k | __m256i x[64]; |
1349 | 138k | x[0] = input[0]; |
1350 | 138k | x[4] = input[8]; |
1351 | 138k | x[8] = input[4]; |
1352 | 138k | x[12] = input[12]; |
1353 | 138k | x[16] = input[2]; |
1354 | 138k | x[20] = input[10]; |
1355 | 138k | x[24] = input[6]; |
1356 | 138k | x[28] = input[14]; |
1357 | 138k | x[32] = input[1]; |
1358 | 138k | x[36] = input[9]; |
1359 | 138k | x[40] = input[5]; |
1360 | 138k | x[44] = input[13]; |
1361 | 138k | x[48] = input[3]; |
1362 | 138k | x[52] = input[11]; |
1363 | 138k | x[56] = input[7]; |
1364 | 138k | x[60] = input[15]; |
1365 | | |
1366 | | // stage 2 |
1367 | 138k | btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); |
1368 | 138k | btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); |
1369 | 138k | btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); |
1370 | 138k | btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); |
1371 | 138k | btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); |
1372 | 138k | btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); |
1373 | 138k | btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); |
1374 | 138k | btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); |
1375 | | |
1376 | | // stage 3 |
1377 | 138k | btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); |
1378 | 138k | btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); |
1379 | 138k | btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); |
1380 | 138k | btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); |
1381 | 138k | x[33] = x[32]; |
1382 | 138k | x[34] = x[35]; |
1383 | 138k | x[37] = x[36]; |
1384 | 138k | x[38] = x[39]; |
1385 | 138k | x[41] = x[40]; |
1386 | 138k | x[42] = x[43]; |
1387 | 138k | x[45] = x[44]; |
1388 | 138k | x[46] = x[47]; |
1389 | 138k | x[49] = x[48]; |
1390 | 138k | x[50] = x[51]; |
1391 | 138k | x[53] = x[52]; |
1392 | 138k | x[54] = x[55]; |
1393 | 138k | x[57] = x[56]; |
1394 | 138k | x[58] = x[59]; |
1395 | 138k | x[61] = x[60]; |
1396 | 138k | x[62] = x[63]; |
1397 | | |
1398 | | // stage 4 |
1399 | 138k | btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); |
1400 | 138k | btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); |
1401 | 138k | x[17] = x[16]; |
1402 | 138k | x[18] = x[19]; |
1403 | 138k | x[21] = x[20]; |
1404 | 138k | x[22] = x[23]; |
1405 | 138k | x[25] = x[24]; |
1406 | 138k | x[26] = x[27]; |
1407 | 138k | x[29] = x[28]; |
1408 | 138k | x[30] = x[31]; |
1409 | 138k | idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT); |
1410 | | |
1411 | | // stage 5 |
1412 | 138k | btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); |
1413 | 138k | x[9] = x[8]; |
1414 | 138k | x[10] = x[11]; |
1415 | 138k | x[13] = x[12]; |
1416 | 138k | x[14] = x[15]; |
1417 | 138k | idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1418 | | |
1419 | | // stage 6 |
1420 | 138k | btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); |
1421 | 138k | x[5] = x[4]; |
1422 | 138k | x[6] = x[7]; |
1423 | 138k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT); |
1424 | 138k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, |
1425 | 138k | INV_COS_BIT); |
1426 | 138k | idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1427 | | |
1428 | | // stage 7 |
1429 | 138k | x[3] = x[0]; |
1430 | 138k | x[2] = x[1]; |
1431 | 138k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT); |
1432 | 138k | btf_16_adds_subs_avx2(&x[8], &x[11]); |
1433 | 138k | btf_16_adds_subs_avx2(&x[9], &x[10]); |
1434 | 138k | btf_16_adds_subs_avx2(&x[15], &x[12]); |
1435 | 138k | btf_16_adds_subs_avx2(&x[14], &x[13]); |
1436 | 138k | idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1437 | | |
1438 | | // stage 8 |
1439 | 138k | btf_16_adds_subs_avx2(&x[0], &x[7]); |
1440 | 138k | btf_16_adds_subs_avx2(&x[1], &x[6]); |
1441 | 138k | btf_16_adds_subs_avx2(&x[2], &x[5]); |
1442 | 138k | btf_16_adds_subs_avx2(&x[3], &x[4]); |
1443 | 138k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, |
1444 | 138k | INV_COS_BIT); |
1445 | 138k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, |
1446 | 138k | INV_COS_BIT); |
1447 | 138k | idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1448 | | |
1449 | 138k | idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); |
1450 | 138k | idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT); |
1451 | 138k | idct64_stage11_avx2(output, x); |
1452 | 138k | } |
1453 | | |
1454 | 125k | static void idct64_low32_avx2(const __m256i *input, __m256i *output) { |
1455 | 125k | const int32_t *cospi = cospi_arr(INV_COS_BIT); |
1456 | 125k | const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); |
1457 | | |
1458 | 125k | const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); |
1459 | 125k | const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); |
1460 | 125k | const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); |
1461 | 125k | const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); |
1462 | 125k | const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); |
1463 | | |
1464 | | // stage 1 |
1465 | 125k | __m256i x[64]; |
1466 | 125k | x[0] = input[0]; |
1467 | 125k | x[2] = input[16]; |
1468 | 125k | x[4] = input[8]; |
1469 | 125k | x[6] = input[24]; |
1470 | 125k | x[8] = input[4]; |
1471 | 125k | x[10] = input[20]; |
1472 | 125k | x[12] = input[12]; |
1473 | 125k | x[14] = input[28]; |
1474 | 125k | x[16] = input[2]; |
1475 | 125k | x[18] = input[18]; |
1476 | 125k | x[20] = input[10]; |
1477 | 125k | x[22] = input[26]; |
1478 | 125k | x[24] = input[6]; |
1479 | 125k | x[26] = input[22]; |
1480 | 125k | x[28] = input[14]; |
1481 | 125k | x[30] = input[30]; |
1482 | 125k | x[32] = input[1]; |
1483 | 125k | x[34] = input[17]; |
1484 | 125k | x[36] = input[9]; |
1485 | 125k | x[38] = input[25]; |
1486 | 125k | x[40] = input[5]; |
1487 | 125k | x[42] = input[21]; |
1488 | 125k | x[44] = input[13]; |
1489 | 125k | x[46] = input[29]; |
1490 | 125k | x[48] = input[3]; |
1491 | 125k | x[50] = input[19]; |
1492 | 125k | x[52] = input[11]; |
1493 | 125k | x[54] = input[27]; |
1494 | 125k | x[56] = input[7]; |
1495 | 125k | x[58] = input[23]; |
1496 | 125k | x[60] = input[15]; |
1497 | 125k | x[62] = input[31]; |
1498 | | |
1499 | | // stage 2 |
1500 | 125k | btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); |
1501 | 125k | btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]); |
1502 | 125k | btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]); |
1503 | 125k | btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); |
1504 | 125k | btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); |
1505 | 125k | btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]); |
1506 | 125k | btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]); |
1507 | 125k | btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); |
1508 | 125k | btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); |
1509 | 125k | btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]); |
1510 | 125k | btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]); |
1511 | 125k | btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); |
1512 | 125k | btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); |
1513 | 125k | btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]); |
1514 | 125k | btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]); |
1515 | 125k | btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); |
1516 | | |
1517 | | // stage 3 |
1518 | 125k | btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); |
1519 | 125k | btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); |
1520 | 125k | btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); |
1521 | 125k | btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); |
1522 | 125k | btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); |
1523 | 125k | btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); |
1524 | 125k | btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); |
1525 | 125k | btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); |
1526 | 125k | btf_16_adds_subs_avx2(&x[32], &x[33]); |
1527 | 125k | btf_16_adds_subs_avx2(&x[35], &x[34]); |
1528 | 125k | btf_16_adds_subs_avx2(&x[36], &x[37]); |
1529 | 125k | btf_16_adds_subs_avx2(&x[39], &x[38]); |
1530 | 125k | btf_16_adds_subs_avx2(&x[40], &x[41]); |
1531 | 125k | btf_16_adds_subs_avx2(&x[43], &x[42]); |
1532 | 125k | btf_16_adds_subs_avx2(&x[44], &x[45]); |
1533 | 125k | btf_16_adds_subs_avx2(&x[47], &x[46]); |
1534 | 125k | btf_16_adds_subs_avx2(&x[48], &x[49]); |
1535 | 125k | btf_16_adds_subs_avx2(&x[51], &x[50]); |
1536 | 125k | btf_16_adds_subs_avx2(&x[52], &x[53]); |
1537 | 125k | btf_16_adds_subs_avx2(&x[55], &x[54]); |
1538 | 125k | btf_16_adds_subs_avx2(&x[56], &x[57]); |
1539 | 125k | btf_16_adds_subs_avx2(&x[59], &x[58]); |
1540 | 125k | btf_16_adds_subs_avx2(&x[60], &x[61]); |
1541 | 125k | btf_16_adds_subs_avx2(&x[63], &x[62]); |
1542 | | |
1543 | | // stage 4 |
1544 | 125k | btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); |
1545 | 125k | btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); |
1546 | 125k | btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); |
1547 | 125k | btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); |
1548 | 125k | btf_16_adds_subs_avx2(&x[16], &x[17]); |
1549 | 125k | btf_16_adds_subs_avx2(&x[19], &x[18]); |
1550 | 125k | btf_16_adds_subs_avx2(&x[20], &x[21]); |
1551 | 125k | btf_16_adds_subs_avx2(&x[23], &x[22]); |
1552 | 125k | btf_16_adds_subs_avx2(&x[24], &x[25]); |
1553 | 125k | btf_16_adds_subs_avx2(&x[27], &x[26]); |
1554 | 125k | btf_16_adds_subs_avx2(&x[28], &x[29]); |
1555 | 125k | btf_16_adds_subs_avx2(&x[31], &x[30]); |
1556 | 125k | idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT); |
1557 | | |
1558 | | // stage 5 |
1559 | 125k | btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); |
1560 | 125k | btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); |
1561 | 125k | btf_16_adds_subs_avx2(&x[8], &x[9]); |
1562 | 125k | btf_16_adds_subs_avx2(&x[11], &x[10]); |
1563 | 125k | btf_16_adds_subs_avx2(&x[12], &x[13]); |
1564 | 125k | btf_16_adds_subs_avx2(&x[15], &x[14]); |
1565 | 125k | idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1566 | | |
1567 | | // stage 6 |
1568 | 125k | btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); |
1569 | 125k | btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); |
1570 | 125k | btf_16_adds_subs_avx2(&x[4], &x[5]); |
1571 | 125k | btf_16_adds_subs_avx2(&x[7], &x[6]); |
1572 | 125k | btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT); |
1573 | 125k | btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, |
1574 | 125k | INV_COS_BIT); |
1575 | 125k | idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1576 | | |
1577 | | // stage 7 |
1578 | 125k | btf_16_adds_subs_avx2(&x[0], &x[3]); |
1579 | 125k | btf_16_adds_subs_avx2(&x[1], &x[2]); |
1580 | 125k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT); |
1581 | 125k | btf_16_adds_subs_avx2(&x[8], &x[11]); |
1582 | 125k | btf_16_adds_subs_avx2(&x[9], &x[10]); |
1583 | 125k | btf_16_adds_subs_avx2(&x[15], &x[12]); |
1584 | 125k | btf_16_adds_subs_avx2(&x[14], &x[13]); |
1585 | 125k | idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1586 | | |
1587 | | // stage 8 |
1588 | 125k | btf_16_adds_subs_avx2(&x[0], &x[7]); |
1589 | 125k | btf_16_adds_subs_avx2(&x[1], &x[6]); |
1590 | 125k | btf_16_adds_subs_avx2(&x[2], &x[5]); |
1591 | 125k | btf_16_adds_subs_avx2(&x[3], &x[4]); |
1592 | 125k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, |
1593 | 125k | INV_COS_BIT); |
1594 | 125k | btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, |
1595 | 125k | INV_COS_BIT); |
1596 | 125k | idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT); |
1597 | | |
1598 | | // stages 9-11
1599 | 125k | idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); |
1600 | 125k | idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT); |
1601 | 125k | idct64_stage11_avx2(output, x); |
1602 | 125k | } |
1603 | | |
1604 | | typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output); |
1605 | | |
1606 | | // 1D functions process 16 pixels at a time.
1607 | | static const transform_1d_avx2 |
1608 | | lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { |
1609 | | { |
1610 | | { NULL, NULL, NULL, NULL }, |
1611 | | { NULL, NULL, NULL, NULL }, |
1612 | | { NULL, NULL, NULL, NULL }, |
1613 | | }, |
1614 | | { { NULL, NULL, NULL, NULL }, |
1615 | | { NULL, NULL, NULL, NULL }, |
1616 | | { NULL, NULL, NULL, NULL } }, |
1617 | | { |
1618 | | { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, |
1619 | | { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, |
1620 | | { NULL, NULL, NULL, NULL }, |
1621 | | }, |
1622 | | { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, |
1623 | | { NULL, NULL, NULL, NULL }, |
1624 | | { NULL, NULL, NULL, NULL } }, |
1625 | | { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, |
1626 | | idct64_low32_avx2 }, |
1627 | | { NULL, NULL, NULL, NULL }, |
1628 | | { NULL, NULL, NULL, NULL } } |
1629 | | }; |
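
// The table is indexed by 1-D size, 1-D transform family, and an eob-derived
// bucket that selects the cheapest kernel with enough nonzero inputs. A
// minimal sketch of the lookup, mirroring the indexing the functions below
// perform (pick_row_txfm itself is illustrative, not part of the library):
static INLINE transform_1d_avx2 pick_row_txfm(TX_SIZE tx_size,
                                              TX_TYPE tx_type, int eobx) {
  const int txw_idx = get_txw_idx(tx_size);
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];  // low1/low8/low16/full
  return lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
}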
1630 | | |
1631 | | // Only processes blocks with w >= 16 and h >= 16.
1632 | | static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( |
1633 | | const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, |
1634 | 1.74M | TX_SIZE tx_size, int eob) { |
1635 | 1.74M | __m256i buf1[64 * 16]; |
1636 | 1.74M | int eobx, eoby; |
1637 | 1.74M | get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); |
1638 | 1.74M | const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; |
1639 | 1.74M | const int txw_idx = get_txw_idx(tx_size); |
1640 | 1.74M | const int txh_idx = get_txh_idx(tx_size); |
1641 | 1.74M | const int txfm_size_col = tx_size_wide[tx_size]; |
1642 | 1.74M | const int txfm_size_row = tx_size_high[tx_size]; |
1643 | 1.74M | const int buf_size_w_div16 = txfm_size_col >> 4; |
1644 | 1.74M | const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4; |
1645 | 1.74M | const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4; |
1646 | 1.74M | const int input_stride = AOMMIN(32, txfm_size_row); |
1647 | 1.74M | const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); |
1648 | | |
1649 | 1.74M | const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; |
1650 | 1.74M | const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; |
1651 | 1.74M | const transform_1d_avx2 row_txfm = |
1652 | 1.74M | lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; |
1653 | 1.74M | const transform_1d_avx2 col_txfm = |
1654 | 1.74M | lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; |
1655 | | |
1656 | 1.74M | assert(col_txfm != NULL); |
1657 | 1.74M | assert(row_txfm != NULL); |
1658 | 1.74M | int ud_flip, lr_flip; |
1659 | 1.74M | get_flip_cfg(tx_type, &ud_flip, &lr_flip); |
1660 | 1.74M | const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0])); |
1661 | 3.55M | for (int i = 0; i < buf_size_nonzero_h_div16; i++) { |
1662 | 1.81M | __m256i buf0[64]; |
1663 | 1.81M | load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0, |
1664 | 1.81M | buf_size_nonzero_w); |
1665 | 1.81M | if (rect_type == 1 || rect_type == -1) { |
1666 | 300k | round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code |
1667 | 300k | } |
1668 | 1.81M | row_txfm(buf0, buf0); |
1669 | 54.1M | for (int j = 0; j < txfm_size_col; ++j) { |
1670 | 52.3M | buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0); |
1671 | 52.3M | } |
1672 | | |
1673 | 1.81M | __m256i *buf1_cur = buf1 + (i << 4); |
1674 | 1.81M | if (lr_flip) { |
1675 | 60.3k | for (int j = 0; j < buf_size_w_div16; ++j) { |
1676 | 30.1k | __m256i temp[16]; |
1677 | 30.1k | flip_buf_avx2(buf0 + 16 * j, temp, 16); |
1678 | 30.1k | int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); |
1679 | 30.1k | transpose_16bit_16x16_avx2(temp, buf1_cur + offset); |
1680 | 30.1k | } |
1681 | 1.78M | } else { |
1682 | 5.02M | for (int j = 0; j < buf_size_w_div16; ++j) { |
1683 | 3.24M | transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j); |
1684 | 3.24M | } |
1685 | 1.78M | } |
1686 | 1.81M | } |
1687 | 1.74M | const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1])); |
1688 | 4.85M | for (int i = 0; i < buf_size_w_div16; i++) { |
1689 | 3.10M | __m256i *buf1_cur = buf1 + i * txfm_size_row; |
1690 | 3.10M | col_txfm(buf1_cur, buf1_cur); |
1691 | 88.4M | for (int j = 0; j < txfm_size_row; ++j) { |
1692 | 85.2M | buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1); |
1693 | 85.2M | } |
1694 | 3.10M | } |
1695 | 4.85M | for (int i = 0; i < buf_size_w_div16; i++) { |
1696 | 3.10M | lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, |
1697 | 3.10M | stride, ud_flip, txfm_size_row); |
1698 | 3.10M | } |
1699 | 1.74M | } |
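
// The scale0/scale1 loops above fold each stage's round-shift into a single
// multiply: with scale = 1 << (15 + shift) and shift < 0,
// _mm256_mulhrs_epi16(x, scale) evaluates to (x * scale + (1 << 14)) >> 15,
// which is exactly a rounding right shift by -shift. A scalar sketch
// (hypothetical helper name):
static int16_t mulhrs_round_shift(int16_t x, int shift) {
  assert(shift < 0);
  return (int16_t)((x + (1 << (-shift - 1))) >> (-shift));
}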
1700 | | |
1701 | | static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, |
1702 | | int stride, int shift, int height, |
1703 | 57.7k | int txw_idx, int rect_type) { |
1704 | 57.7k | const int32_t *input_row = input; |
1705 | 57.7k | const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); |
1706 | 57.7k | const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + |
1707 | 57.7k | (1 << (NewSqrt2Bits - shift - 1))); |
1708 | 57.7k | const __m256i one = _mm256_set1_epi16(1); |
1709 | 57.7k | const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); |
1710 | 57.7k | if (rect_type != 1 && rect_type != -1) { |
1711 | 761k | for (int i = 0; i < height; ++i) { |
1712 | 717k | const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); |
1713 | 717k | input_row += stride; |
1714 | 717k | __m256i lo = _mm256_unpacklo_epi16(src, one); |
1715 | 717k | __m256i hi = _mm256_unpackhi_epi16(src, one); |
1716 | 717k | lo = _mm256_madd_epi16(lo, scale__r); |
1717 | 717k | hi = _mm256_madd_epi16(hi, scale__r); |
1718 | 717k | lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); |
1719 | 717k | hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); |
1720 | 717k | out[i] = _mm256_packs_epi32(lo, hi); |
1721 | 717k | } |
1722 | 44.8k | } else { |
1723 | 12.9k | const __m256i rect_scale = |
1724 | 12.9k | _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); |
1725 | 219k | for (int i = 0; i < height; ++i) { |
1726 | 206k | __m256i src = load_32bit_to_16bit_w16_avx2(input_row); |
1727 | 206k | src = _mm256_mulhrs_epi16(src, rect_scale); |
1728 | 206k | input_row += stride; |
1729 | 206k | __m256i lo = _mm256_unpacklo_epi16(src, one); |
1730 | 206k | __m256i hi = _mm256_unpackhi_epi16(src, one); |
1731 | 206k | lo = _mm256_madd_epi16(lo, scale__r); |
1732 | 206k | hi = _mm256_madd_epi16(hi, scale__r); |
1733 | 206k | lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); |
1734 | 206k | hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); |
1735 | 206k | out[i] = _mm256_packs_epi32(lo, hi); |
1736 | 206k | } |
1737 | 12.9k | } |
1738 | 57.7k | } |
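
// The unpack-with-one trick above lets _mm256_madd_epi16 on (src, 1) and
// (scale, _r) pairs compute src * scale + _r in one instruction, with the
// NewSqrt2 rounding and the stage round-shift (the callers pass a negative
// shift) folded into a single constant. A scalar sketch of one sample, with
// the rect pre-scale and the final 16-bit pack omitted (the helper is
// illustrative):
static int32_t iidentity_row_sample(int16_t src, int txw_idx, int shift) {
  const int32_t scale = NewSqrt2list[txw_idx];
  const int32_t r =
      (1 << (NewSqrt2Bits - 1)) + (1 << (NewSqrt2Bits - shift - 1));
  return (src * scale + r) >> (NewSqrt2Bits - shift);
}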
1739 | | |
1740 | | static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, |
1741 | | __m256i *buf, int shift, int height, |
1742 | 66.2k | int txh_idx) { |
1743 | 66.2k | const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); |
1744 | 66.2k | const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); |
1745 | 66.2k | const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1)); |
1746 | 66.2k | const __m256i one = _mm256_set1_epi16(1); |
1747 | 66.2k | const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r); |
1748 | 1.12M | for (int h = 0; h < height; ++h) { |
1749 | 1.05M | __m256i lo = _mm256_unpacklo_epi16(buf[h], one); |
1750 | 1.05M | __m256i hi = _mm256_unpackhi_epi16(buf[h], one); |
1751 | 1.05M | lo = _mm256_madd_epi16(lo, scale_coeff); |
1752 | 1.05M | hi = _mm256_madd_epi16(hi, scale_coeff); |
1753 | 1.05M | lo = _mm256_srai_epi32(lo, NewSqrt2Bits); |
1754 | 1.05M | hi = _mm256_srai_epi32(hi, NewSqrt2Bits); |
1755 | 1.05M | lo = _mm256_add_epi32(lo, shift__r); |
1756 | 1.05M | hi = _mm256_add_epi32(hi, shift__r); |
1757 | 1.05M | lo = _mm256_srai_epi32(lo, -shift); |
1758 | 1.05M | hi = _mm256_srai_epi32(hi, -shift); |
1759 | 1.05M | const __m256i x = _mm256_packs_epi32(lo, hi); |
1760 | 1.05M | write_recon_w16_avx2(x, output); |
1761 | 1.05M | output += stride; |
1762 | 1.05M | } |
1763 | 66.2k | } |
1764 | | |
1765 | | static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, |
1766 | | uint8_t *output, int stride, |
1767 | | TX_SIZE tx_size, |
1768 | 33.6k | int32_t eob) { |
1769 | 33.6k | (void)eob; |
1770 | 33.6k | const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; |
1771 | 33.6k | const int txw_idx = get_txw_idx(tx_size); |
1772 | 33.6k | const int txh_idx = get_txh_idx(tx_size); |
1773 | 33.6k | const int txfm_size_col = tx_size_wide[tx_size]; |
1774 | 33.6k | const int txfm_size_row = tx_size_high[tx_size]; |
1775 | 33.6k | const int col_max = AOMMIN(32, txfm_size_col); |
1776 | 33.6k | const int row_max = AOMMIN(32, txfm_size_row); |
1777 | 33.6k | const int input_stride = row_max; |
1778 | 33.6k | const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); |
1779 | 33.6k | __m256i buf[32]; |
1780 | | |
1781 | 73.0k | for (int i = 0; i < (col_max >> 4); ++i) { |
1782 | 85.7k | for (int j = 0; j < (row_max >> 4); j++) { |
1783 | 46.2k | iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride, |
1784 | 46.2k | row_max, shift[0], 16, txw_idx, rect_type); |
1785 | 46.2k | transpose_16bit_16x16_avx2(buf, buf); |
1786 | 46.2k | iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf, |
1787 | 46.2k | shift[1], 16, txh_idx); |
1788 | 46.2k | } |
1789 | 39.4k | } |
1790 | 33.6k | } |
1791 | | |
1792 | | static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( |
1793 | | const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, |
1794 | 11.4k | TX_SIZE tx_size, int eob) { |
1795 | 11.4k | int eobx, eoby; |
1796 | 11.4k | get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); |
1797 | 11.4k | const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; |
1798 | 11.4k | const int txw_idx = get_txw_idx(tx_size); |
1799 | 11.4k | const int txh_idx = get_txh_idx(tx_size); |
1800 | 11.4k | const int txfm_size_col = tx_size_wide[tx_size]; |
1801 | 11.4k | const int txfm_size_row = tx_size_high[tx_size]; |
1802 | 11.4k | const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row); |
1803 | 11.4k | const int input_stride = txfm_size_row_notzero; |
1804 | 11.4k | const int buf_size_w_div16 = (eobx + 16) >> 4; |
1805 | 11.4k | const int buf_size_h_div16 = (eoby + 16) >> 4; |
1806 | 11.4k | const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); |
1807 | | |
1808 | 11.4k | const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; |
1809 | 11.4k | const transform_1d_avx2 col_txfm = |
1810 | 11.4k | lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; |
1811 | | |
1812 | 11.4k | assert(col_txfm != NULL); |
1813 | | |
1814 | 11.4k | int ud_flip, lr_flip; |
1815 | 11.4k | get_flip_cfg(tx_type, &ud_flip, &lr_flip); |
1816 | 22.8k | for (int i = 0; i < buf_size_w_div16; i++) { |
1817 | 11.4k | __m256i buf0[64]; |
1818 | 22.8k | for (int j = 0; j < buf_size_h_div16; j++) { |
1819 | 11.4k | __m256i *buf0_cur = buf0 + j * 16; |
1820 | 11.4k | const int32_t *input_cur = input + i * 16 * input_stride + j * 16; |
1821 | 11.4k | iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16, |
1822 | 11.4k | txw_idx, rect_type); |
1823 | 11.4k | transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); |
1824 | 11.4k | } |
1825 | 11.4k | col_txfm(buf0, buf0); |
1826 | 11.4k | __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1])); |
1827 | 11.4k | int k = ud_flip ? (txfm_size_row - 1) : 0; |
1828 | 11.4k | const int step = ud_flip ? -1 : 1; |
1829 | 194k | for (int j = 0; j < txfm_size_row; ++j, k += step) { |
1830 | 182k | __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift); |
1831 | 182k | write_recon_w16_avx2(res, output + (i << 4) + j * stride); |
1832 | 182k | } |
1833 | 11.4k | } |
1834 | 11.4k | } |
1835 | | |
1836 | | static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( |
1837 | | const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, |
1838 | 19.9k | TX_SIZE tx_size, int eob) { |
1839 | 19.9k | __m256i buf1[64]; |
1840 | 19.9k | int eobx, eoby; |
1841 | 19.9k | get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); |
1842 | 19.9k | const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; |
1843 | 19.9k | const int txw_idx = get_txw_idx(tx_size); |
1844 | 19.9k | const int txh_idx = get_txh_idx(tx_size); |
1845 | 19.9k | const int txfm_size_col = tx_size_wide[tx_size]; |
1846 | 19.9k | const int txfm_size_row = tx_size_high[tx_size]; |
1847 | 19.9k | const int buf_size_w_div16 = txfm_size_col >> 4; |
1848 | 19.9k | const int buf_size_h_div16 = (eoby + 16) >> 4; |
1849 | 19.9k | const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; |
1850 | 19.9k | const int input_stride = AOMMIN(32, txfm_size_row); |
1851 | 19.9k | const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); |
1852 | | |
1853 | 19.9k | const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; |
1854 | 19.9k | const transform_1d_avx2 row_txfm = |
1855 | 19.9k | lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; |
1856 | | |
1857 | 19.9k | assert(row_txfm != NULL); |
1858 | | |
1859 | 19.9k | int ud_flip, lr_flip; |
1860 | 19.9k | get_flip_cfg(tx_type, &ud_flip, &lr_flip); |
1861 | 39.8k | for (int i = 0; i < buf_size_h_div16; i++) { |
1862 | 19.9k | __m256i buf0[64]; |
1863 | 19.9k | load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0, |
1864 | 19.9k | buf_size_nonzero_w); |
1865 | 19.9k | if (rect_type == 1 || rect_type == -1) { |
1866 | 0 | round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code |
1867 | 0 | } |
1868 | 19.9k | row_txfm(buf0, buf0); |
1869 | 19.9k | round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); |
1870 | 19.9k | __m256i *_buf1 = buf1; |
1871 | 19.9k | if (lr_flip) { |
1872 | 0 | for (int j = 0; j < buf_size_w_div16; ++j) { |
1873 | 0 | __m256i temp[16]; |
1874 | 0 | flip_buf_avx2(buf0 + 16 * j, temp, 16); |
1875 | 0 | transpose_16bit_16x16_avx2(temp, |
1876 | 0 | _buf1 + 16 * (buf_size_w_div16 - 1 - j)); |
1877 | 0 | } |
1878 | 19.9k | } else { |
1879 | 39.8k | for (int j = 0; j < buf_size_w_div16; ++j) { |
1880 | 19.9k | transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j); |
1881 | 19.9k | } |
1882 | 19.9k | } |
1883 | 39.8k | for (int j = 0; j < buf_size_w_div16; ++j) { |
1884 | 19.9k | iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride, |
1885 | 19.9k | buf1 + j * 16, shift[1], 16, txh_idx); |
1886 | 19.9k | } |
1887 | 19.9k | } |
1888 | 19.9k | } |
1889 | | |
1890 | | static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = { |
1891 | | { av1_idct8_low1_ssse3, av1_idct8_sse2 }, |
1892 | | { av1_iadst8_low1_ssse3, av1_iadst8_sse2 } |
1893 | | }; |
1894 | | |
1895 | | static INLINE void load_buffer_avx2(const int32_t *in, int stride, |
1896 | 956k | __m128i *out) { |
1897 | 956k | const __m256i a = _mm256_load_si256((const __m256i *)in); |
1898 | 956k | const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1)); |
1899 | 956k | const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2)); |
1900 | 956k | const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3)); |
1901 | 956k | const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4)); |
1902 | 956k | const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5)); |
1903 | 956k | const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6)); |
1904 | 956k | const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7)); |
1905 | | |
1906 | | // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 |
1907 | 956k | const __m256i ab_16bit = _mm256_packs_epi32(a, b); |
1908 | | // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7 |
1909 | 956k | const __m256i cd_16bit = _mm256_packs_epi32(c, d); |
1910 | | // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7 |
1911 | 956k | const __m256i ef_16bit = _mm256_packs_epi32(e, f); |
1912 | | // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7 |
1913 | 956k | const __m256i gh_16bit = _mm256_packs_epi32(g, h); |
1914 | | |
1915 | | // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7 |
1916 | 956k | const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8); |
1917 | | // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7 |
1918 | 956k | const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8); |
1919 | | // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7 |
1920 | 956k | const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8); |
1921 | | // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7 |
1922 | 956k | const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8); |
1923 | | |
1924 | 956k | out[0] = _mm256_castsi256_si128(ab); |
1925 | 956k | out[1] = _mm256_extractf128_si256(ab, 1); |
1926 | 956k | out[2] = _mm256_castsi256_si128(cd); |
1927 | 956k | out[3] = _mm256_extractf128_si256(cd, 1); |
1928 | 956k | out[4] = _mm256_castsi256_si128(ef); |
1929 | 956k | out[5] = _mm256_extractf128_si256(ef, 1); |
1930 | 956k | out[6] = _mm256_castsi256_si128(gh); |
1931 | 956k | out[7] = _mm256_extractf128_si256(gh, 1); |
1932 | 956k | } |
1933 | | |
1934 | | static INLINE void round_and_transpose_avx2(const __m128i *const in, |
1935 | | __m128i *const out, int bit, |
1936 | 956k | int *lr_flip) { |
1937 | 956k | __m256i buf_temp[4]; |
1938 | 956k | const __m256i scale = _mm256_set1_epi16(1 << (15 + bit)); |
1939 | 956k | int j = *lr_flip ? 7 : 0; |
1940 | 956k | const int step = *lr_flip ? -1 : 1; |
1941 | | |
1942 | | // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37 |
1943 | 956k | buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), |
1944 | 956k | in[j + 4 * step], 1); |
1945 | 956k | j += step; |
1946 | | // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27 |
1947 | 956k | buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), |
1948 | 956k | in[j + 4 * step], 1); |
1949 | 956k | j += step; |
1950 | | // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17 |
1951 | 956k | buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), |
1952 | 956k | in[j + 4 * step], 1); |
1953 | 956k | j += step; |
1954 | | // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07 |
1955 | 956k | buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), |
1956 | 956k | in[j + 4 * step], 1); |
1957 | | |
1958 | | // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37 |
1959 | 956k | buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale); |
1960 | | // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27 |
1961 | 956k | buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale); |
1962 | | // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17 |
1963 | 956k | buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale); |
1964 | | // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07 |
1965 | 956k | buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale); |
1966 | | |
1967 | | // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23 |
1968 | 956k | const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]); |
1969 | | // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27 |
1970 | 956k | const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]); |
1971 | | // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03 |
1972 | 956k | const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]); |
1973 | | // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07 |
1974 | 956k | const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]); |
1975 | | |
1976 | | // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01 |
1977 | 956k | const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1); |
1978 | | // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03 |
1979 | 956k | const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1); |
1980 | | // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05 |
1981 | 956k | const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1); |
1982 | | // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07 |
1983 | 956k | const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1); |
1984 | | |
1985 | | // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01 |
1986 | 956k | const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8); |
1987 | | // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03 |
1988 | 956k | const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8); |
1989 | | // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05 |
1990 | 956k | const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8); |
1991 | | // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07 |
1992 | 956k | const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8); |
1993 | | |
1994 | | // 70 60 50 40 30 20 10 00 |
1995 | 956k | out[0] = _mm256_castsi256_si128(reg_00); |
1996 | | // 71 61 51 41 31 21 11 01 |
1997 | 956k | out[1] = _mm256_extracti128_si256(reg_00, 1); |
1998 | | // 72 62 52 42 32 22 12 02 |
1999 | 956k | out[2] = _mm256_castsi256_si128(reg_01); |
2000 | | // 73 63 53 43 33 23 13 03 |
2001 | 956k | out[3] = _mm256_extracti128_si256(reg_01, 1); |
2002 | | // 74 64 54 44 34 24 14 04 |
2003 | 956k | out[4] = _mm256_castsi256_si128(reg_10); |
2004 | | // 75 65 55 45 35 25 15 05 |
2005 | 956k | out[5] = _mm256_extracti128_si256(reg_10, 1); |
2006 | | // 76 66 56 46 36 26 16 06 |
2007 | 956k | out[6] = _mm256_castsi256_si128(reg_11); |
2008 | | // 77 67 57 47 37 27 17 07 |
2009 | 956k | out[7] = _mm256_extracti128_si256(reg_11, 1); |
2010 | 956k | } |
2011 | | |
2012 | | static INLINE void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit, |
2013 | | uint8_t *output, |
2014 | 956k | int stride, int flipud) { |
2015 | 956k | __m256i in_256[4], v_256[4]; |
2016 | 956k | int j = flipud ? 7 : 0; |
2017 | 956k | const int step = flipud ? -1 : 1; |
2018 | 956k | const __m256i scale = _mm256_set1_epi16(1 << (15 + bit)); |
2019 | 956k | const __m256i zero = _mm256_setzero_si256(); |
2020 | | // in[0], in[1] |
2021 | 956k | in_256[0] = |
2022 | 956k | _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); |
2023 | 956k | j += 2 * step; |
2024 | | // in[2], in[3] |
2025 | 956k | in_256[1] = |
2026 | 956k | _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); |
2027 | 956k | j += 2 * step; |
2028 | | // in[4], in[5] |
2029 | 956k | in_256[2] = |
2030 | 956k | _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); |
2031 | 956k | j += 2 * step; |
2032 | | // in[6], in[7] |
2033 | 956k | in_256[3] = |
2034 | 956k | _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); |
2035 | | |
2036 | | // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17 |
2037 | 956k | in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale); |
2038 | | // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37 |
2039 | 956k | in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale); |
2040 | | // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57 |
2041 | 956k | in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale); |
2042 | | // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77 |
2043 | 956k | in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale); |
2044 | | |
2045 | 956k | const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output)); |
2046 | 956k | const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride)); |
2047 | 956k | const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); |
2048 | 956k | const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); |
2049 | 956k | const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride)); |
2050 | 956k | const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride)); |
2051 | 956k | const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride)); |
2052 | 956k | const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride)); |
2053 | | |
2054 | 956k | v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1); |
2055 | 956k | v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1); |
2056 | 956k | v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1); |
2057 | 956k | v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1); |
2058 | | |
2059 | 956k | const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero); |
2060 | 956k | const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero); |
2061 | 956k | const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero); |
2062 | 956k | const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero); |
2063 | | // 00 01 10 11 |
2064 | 956k | const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0); |
2065 | | // 20 21 30 31 |
2066 | 956k | const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1); |
2067 | | // 40 41 50 51 |
2068 | 956k | const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2); |
2069 | | // 60 61 70 71 |
2070 | 956k | const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3); |
2071 | | |
2072 | | // 00 01 20 21 10 11 30 31 |
2073 | 956k | const __m256i res_0123 = _mm256_packus_epi16(x0, x1); |
2074 | | // 40 41 60 61 50 51 70 71 |
2075 | 956k | const __m256i res_4567 = _mm256_packus_epi16(x2, x3); |
2076 | | |
2077 | | // 00 01 20 21 |
2078 | 956k | const __m128i res_02 = _mm256_castsi256_si128(res_0123); |
2079 | | // 10 11 30 31 |
2080 | 956k | const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1); |
2081 | | // 40 41 60 61 |
2082 | 956k | const __m128i res_46 = _mm256_castsi256_si128(res_4567); |
2083 | | // 50 51 70 71 |
2084 | 956k | const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1); |
2085 | | |
2086 | | // 00 01 |
2087 | 956k | _mm_storel_epi64((__m128i *)(output), res_02); |
2088 | | // 10 11 |
2089 | 956k | _mm_storel_epi64((__m128i *)(output + stride), res_13); |
2090 | | // 20 21 |
2091 | 956k | _mm_storel_epi64((__m128i *)(output + 2 * stride), |
2092 | 956k | _mm_unpackhi_epi64(res_02, res_02)); |
2093 | | // 30 31 |
2094 | 956k | _mm_storel_epi64((__m128i *)(output + 3 * stride), |
2095 | 956k | _mm_unpackhi_epi64(res_13, res_13)); |
2096 | | // 40 41 |
2097 | 956k | _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46); |
2098 | | // 50 51 |
2099 | 956k | _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57); |
2100 | | // 60 61 |
2101 | 956k | _mm_storel_epi64((__m128i *)(output + 6 * stride), |
2102 | 956k | _mm_unpackhi_epi64(res_46, res_46)); |
2103 | | // 70 71 |
2104 | 956k | _mm_storel_epi64((__m128i *)(output + 7 * stride), |
2105 | 956k | _mm_unpackhi_epi64(res_57, res_57)); |
2106 | 956k | } |
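
// The store path above fuses the final round-shift, the add to the
// prediction, and the unsigned 8-bit pack. Per pixel it behaves like the
// following scalar sketch (hypothetical helper; the saturating 16-bit add
// is approximated by the final clamp, and flipud only reorders rows):
static uint8_t recon_pixel(int16_t residual, uint8_t pred, int bit) {
  assert(bit < 0);
  const int32_t r = ((residual + (1 << (-bit - 1))) >> (-bit)) + pred;
  return (uint8_t)(r < 0 ? 0 : (r > 255 ? 255 : r));
}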
2107 | | |
2108 | | // The AVX2 implementation has an advantage when multiple operations are
2109 | | // combined.
2110 | | static INLINE void lowbd_inv_txfm2d_8x8_no_identity_avx2( |
2111 | | const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, |
2112 | 956k | TX_SIZE tx_size, int eob) { |
2113 | 956k | __m128i buf1[8]; |
2114 | 956k | const int input_stride = 8; |
2115 | 956k | const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; |
2116 | 956k | assert(hitx_1d_tab[tx_type] < 2); |
2117 | 956k | assert(vitx_1d_tab[tx_type] < 2); |
2118 | 956k | const transform_1d_ssse3 row_txfm = |
2119 | 956k | lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1]; |
2120 | 956k | const transform_1d_ssse3 col_txfm = |
2121 | 956k | lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1]; |
2122 | | |
2123 | 956k | assert(col_txfm != NULL); |
2124 | 956k | assert(row_txfm != NULL); |
2125 | 956k | int ud_flip, lr_flip; |
2126 | 956k | get_flip_cfg(tx_type, &ud_flip, &lr_flip); |
2127 | | |
2128 | 956k | __m128i buf0[8]; |
2129 | 956k | __m128i *buf0_cur = buf0; |
2130 | 956k | load_buffer_avx2(input, input_stride, buf0_cur); |
2131 | 956k | row_txfm(buf0, buf0); |
2132 | | |
2133 | 956k | assert(shift[0] < 0); |
2134 | 956k | __m128i *_buf1 = buf1; |
2135 | 956k | round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip); |
2136 | 956k | assert(shift[1] < 0); |
2137 | 956k | col_txfm(buf1, buf1); |
2138 | 956k | round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip); |
2139 | 956k | } |
2140 | | |
2141 | | // AVX2 implementation of the 8x8 inverse transform. It was observed that an
2142 | | // AVX2 path for tx_types with identity in either direction has no advantage.
2143 | | static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output, |
2144 | | int stride, TX_TYPE tx_type, |
2145 | 1.18M | TX_SIZE tx_size, int eob) { |
2146 | 1.18M | switch (tx_type) { |
2147 | 118k | case IDTX: |
2148 | 118k | av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
2150 | 118k | break;
2151 | 22.8k | case V_DCT: |
2152 | 32.8k | case V_ADST: |
2153 | 39.2k | case V_FLIPADST: |
2154 | 39.2k | av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, |
2155 | 39.2k | tx_size, eob); |
2156 | 39.2k | break; |
2157 | 69.2k | case H_DCT: |
2158 | 72.0k | case H_ADST: |
2159 | 75.8k | case H_FLIPADST: |
2160 | 75.8k | av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, |
2161 | 75.8k | tx_size, eob); |
2162 | 75.8k | break; |
2163 | 956k | default: |
2164 | 956k | lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type, |
2165 | 956k | tx_size, eob); |
2166 | 1.18M | } |
2167 | 1.18M | } |
2168 | | |
2169 | | // For 32x32, 32x64, 64x32, 64x64, 16x32, 32x16, 64x16 and 16x64.
2170 | | static INLINE void lowbd_inv_txfm2d_add_universe_avx2( |
2171 | | const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, |
2172 | 1.81M | TX_SIZE tx_size, int eob) { |
2173 | 1.81M | (void)eob; |
2174 | 1.81M | switch (tx_type) { |
2175 | 1.29M | case DCT_DCT: |
2176 | 1.40M | case ADST_DCT: // ADST in vertical, DCT in horizontal |
2177 | 1.59M | case DCT_ADST: // DCT in vertical, ADST in horizontal |
2178 | 1.69M | case ADST_ADST: // ADST in both directions |
2179 | 1.70M | case FLIPADST_DCT: |
2180 | 1.71M | case DCT_FLIPADST: |
2181 | 1.72M | case FLIPADST_FLIPADST: |
2182 | 1.73M | case ADST_FLIPADST: |
2183 | 1.74M | case FLIPADST_ADST: |
2184 | 1.74M | lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, |
2185 | 1.74M | tx_size, eob); |
2186 | 1.74M | break; |
2187 | 33.6k | case IDTX: |
2188 | 33.6k | lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); |
2189 | 33.6k | break; |
2190 | 11.4k | case V_DCT: |
2191 | 11.4k | case V_ADST: |
2192 | 11.4k | case V_FLIPADST: |
2193 | 11.4k | lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, |
2194 | 11.4k | tx_size, eob); |
2195 | 11.4k | break; |
2196 | 19.9k | case H_DCT: |
2197 | 19.9k | case H_ADST: |
2198 | 19.9k | case H_FLIPADST: |
2199 | 19.9k | lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, |
2200 | 19.9k | tx_size, eob); |
2201 | 19.9k | break; |
2202 | 0 | default: |
2203 | 0 | av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, |
2204 | 0 | eob); |
2205 | 0 | break; |
2206 | 1.81M | } |
2207 | 1.81M | } |
2208 | | |
2209 | | void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, |
2210 | | int stride, TX_TYPE tx_type, TX_SIZE tx_size, |
2211 | 7.47M | int eob) { |
2212 | 7.47M | switch (tx_size) { |
2213 | 1.00M | case TX_4X4: |
2214 | 1.51M | case TX_4X8: |
2215 | 2.24M | case TX_8X4: |
2216 | 2.54M | case TX_8X16: |
2217 | 3.10M | case TX_16X8: |
2218 | 3.35M | case TX_4X16: |
2219 | 3.97M | case TX_16X4: |
2220 | 4.07M | case TX_8X32: |
2221 | 4.47M | case TX_32X8: |
2222 | 4.47M | av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, |
2223 | 4.47M | eob); |
2224 | 4.47M | break; |
2225 | 1.18M | case TX_8X8: |
2226 | 1.18M | lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size, |
2227 | 1.18M | eob); |
2228 | 1.18M | break; |
2229 | 864k | case TX_16X16: |
2230 | 1.27M | case TX_32X32: |
2231 | 1.35M | case TX_64X64: |
2232 | 1.45M | case TX_16X32: |
2233 | 1.60M | case TX_32X16: |
2234 | 1.62M | case TX_32X64: |
2235 | 1.65M | case TX_64X32: |
2236 | 1.66M | case TX_16X64: |
2237 | 1.81M | case TX_64X16: |
2238 | 1.81M | default: |
2239 | 1.81M | lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, |
2240 | 1.81M | tx_size, eob); |
2241 | 1.81M | break; |
2242 | 7.47M | } |
2243 | 7.47M | } |
2244 | | |
2245 | | void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, |
2246 | 9.02M | const TxfmParam *txfm_param) { |
2247 | 9.02M | const TX_TYPE tx_type = txfm_param->tx_type; |
2248 | 9.02M | if (!txfm_param->lossless) { |
2249 | 7.47M | av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, |
2250 | 7.47M | txfm_param->tx_size, txfm_param->eob); |
2251 | 7.47M | } else { |
2252 | 1.54M | av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); |
2253 | 1.54M | } |
2254 | 9.02M | } |