/rust/registry/src/index.crates.io-1949cf8c6b5b557f/jpeg-encoder-0.7.0/src/avx2/fdct.rs
Line | Count | Source |
1 | | /* |
2 | | * Ported from mozjpeg / jfdctint-avx2.asm to rust |
3 | | * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
4 | | * Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. |
5 | | * |
6 | | * Based on the x86 SIMD extension for IJG JPEG library |
7 | | * Copyright (C) 1999-2006, MIYASAKA Masaru. |
8 | | */ |
9 | | |
10 | | #[cfg(target_arch = "x86")] |
11 | | use core::arch::x86::{ |
12 | | __m256i, _mm256_add_epi16, _mm256_add_epi32, _mm256_loadu_si256, _mm256_madd_epi16, |
13 | | _mm256_packs_epi32, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set_epi16, |
14 | | _mm256_set_epi32, _mm256_sign_epi16, _mm256_slli_epi16, _mm256_srai_epi16, _mm256_srai_epi32, |
15 | | _mm256_storeu_si256, _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpackhi_epi32, |
16 | | _mm256_unpacklo_epi16, _mm256_unpacklo_epi32, |
17 | | }; |
18 | | |
19 | | #[cfg(target_arch = "x86_64")] |
20 | | use core::arch::x86_64::{ |
21 | | __m256i, _mm256_add_epi16, _mm256_add_epi32, _mm256_loadu_si256, _mm256_madd_epi16, |
22 | | _mm256_packs_epi32, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set_epi16, |
23 | | _mm256_set_epi32, _mm256_sign_epi16, _mm256_slli_epi16, _mm256_srai_epi16, _mm256_srai_epi32, |
24 | | _mm256_storeu_si256, _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpackhi_epi32, |
25 | | _mm256_unpacklo_epi16, _mm256_unpacklo_epi32, |
26 | | }; |
27 | | |
// Fixed-point precision used for the DCT multipliers: FIX(x) below denotes
// round(x * 2^CONST_BITS), i.e. x in 13-bit fixed point (jfdctint convention).
const CONST_BITS: i32 = 13;
// Extra precision kept in the intermediate results between the two 1-D
// passes; removed again by the final descale of pass 2.
const PASS1_BITS: i32 = 2;

// FIX(0.298631336)
const F_0_298: i16 = 2446;
// FIX(0.390180644)
const F_0_390: i16 = 3196;
// FIX(0.541196100)
const F_0_541: i16 = 4433;
// FIX(0.765366865)
const F_0_765: i16 = 6270;
// FIX(0.899976223)
const F_0_899: i16 = 7373;
// FIX(1.175875602)
const F_1_175: i16 = 9633;
// FIX(1.501321110)
const F_1_501: i16 = 12299;
// FIX(1.847759065)
const F_1_847: i16 = 15137;
// FIX(1.961570560)
const F_1_961: i16 = 16069;
// FIX(2.053119869)
const F_2_053: i16 = 16819;
// FIX(2.562915447)
const F_2_562: i16 = 20995;
// FIX(3.072711026)
const F_3_072: i16 = 25172;

// Right-shift amounts that undo the fixed-point scaling after pass 1
// (rows) and pass 2 (columns) respectively.
const DESCALE_P1: i32 = CONST_BITS - PASS1_BITS;
const DESCALE_P2: i32 = CONST_BITS + PASS1_BITS;
58 | | |
/// In-place forward DCT of one 8x8 block of 16-bit samples (AVX2 path),
/// ported from mozjpeg's jfdctint-avx2 (see file header).
#[inline(always)]
pub fn fdct_avx2(data: &mut [i16; 64]) {
    unsafe {
        // SAFETY: `fdct_avx2_internal` is compiled with
        // `#[target_feature(enable = "avx2")]`; executing it on a CPU without
        // AVX2 is undefined behavior. No runtime feature check is done here —
        // presumably callers gate this behind AVX2 detection.
        // NOTE(review): confirm all call sites check for AVX2 support first.
        fdct_avx2_internal(data);
    }
}
65 | | |
66 | | #[target_feature(enable = "avx2")] |
67 | 0 | fn fdct_avx2_internal(data: &mut [i16; 64]) { |
68 | | #[target_feature(enable = "avx2")] |
69 | | #[allow(non_snake_case)] |
70 | | #[inline] |
71 | 0 | fn PW_F130_F054_MF130_F054() -> __m256i { |
72 | 0 | _mm256_set_epi16( |
73 | | F_0_541, |
74 | 0 | F_0_541 - F_1_847, |
75 | | F_0_541, |
76 | 0 | F_0_541 - F_1_847, |
77 | | F_0_541, |
78 | 0 | F_0_541 - F_1_847, |
79 | | F_0_541, |
80 | 0 | F_0_541 - F_1_847, |
81 | | F_0_541, |
82 | 0 | F_0_541 + F_0_765, |
83 | | F_0_541, |
84 | 0 | F_0_541 + F_0_765, |
85 | | F_0_541, |
86 | 0 | F_0_541 + F_0_765, |
87 | | F_0_541, |
88 | 0 | F_0_541 + F_0_765, |
89 | | ) |
90 | 0 | } |
91 | | |
92 | | #[target_feature(enable = "avx2")] |
93 | | #[allow(non_snake_case)] |
94 | | #[inline] |
95 | 0 | fn PW_MF078_F117_F078_F117() -> __m256i { |
96 | 0 | _mm256_set_epi16( |
97 | | F_1_175, |
98 | 0 | F_1_175 - F_0_390, |
99 | | F_1_175, |
100 | 0 | F_1_175 - F_0_390, |
101 | | F_1_175, |
102 | 0 | F_1_175 - F_0_390, |
103 | | F_1_175, |
104 | 0 | F_1_175 - F_0_390, |
105 | | F_1_175, |
106 | 0 | F_1_175 - F_1_961, |
107 | | F_1_175, |
108 | 0 | F_1_175 - F_1_961, |
109 | | F_1_175, |
110 | 0 | F_1_175 - F_1_961, |
111 | | F_1_175, |
112 | 0 | F_1_175 - F_1_961, |
113 | | ) |
114 | 0 | } |
115 | | |
116 | | #[target_feature(enable = "avx2")] |
117 | | #[allow(non_snake_case)] |
118 | | #[inline] |
119 | 0 | fn PW_MF060_MF089_MF050_MF256() -> __m256i { |
120 | 0 | _mm256_set_epi16( |
121 | 0 | -F_2_562, |
122 | 0 | F_2_053 - F_2_562, |
123 | 0 | -F_2_562, |
124 | 0 | F_2_053 - F_2_562, |
125 | 0 | -F_2_562, |
126 | 0 | F_2_053 - F_2_562, |
127 | 0 | -F_2_562, |
128 | 0 | F_2_053 - F_2_562, |
129 | 0 | -F_0_899, |
130 | 0 | F_0_298 - F_0_899, |
131 | 0 | -F_0_899, |
132 | 0 | F_0_298 - F_0_899, |
133 | 0 | -F_0_899, |
134 | 0 | F_0_298 - F_0_899, |
135 | 0 | -F_0_899, |
136 | 0 | F_0_298 - F_0_899, |
137 | | ) |
138 | 0 | } |
139 | | |
140 | | #[target_feature(enable = "avx2")] |
141 | | #[allow(non_snake_case)] |
142 | | #[inline] |
143 | 0 | fn PW_F050_MF256_F060_MF089() -> __m256i { |
144 | 0 | _mm256_set_epi16( |
145 | 0 | -F_0_899, |
146 | 0 | F_1_501 - F_0_899, |
147 | 0 | -F_0_899, |
148 | 0 | F_1_501 - F_0_899, |
149 | 0 | -F_0_899, |
150 | 0 | F_1_501 - F_0_899, |
151 | 0 | -F_0_899, |
152 | 0 | F_1_501 - F_0_899, |
153 | 0 | -F_2_562, |
154 | 0 | F_3_072 - F_2_562, |
155 | 0 | -F_2_562, |
156 | 0 | F_3_072 - F_2_562, |
157 | 0 | -F_2_562, |
158 | 0 | F_3_072 - F_2_562, |
159 | 0 | -F_2_562, |
160 | 0 | F_3_072 - F_2_562, |
161 | | ) |
162 | 0 | } |
163 | | |
164 | | #[target_feature(enable = "avx2")] |
165 | | #[allow(non_snake_case)] |
166 | | #[inline] |
167 | 0 | fn PD_DESCALE_P(first_pass: bool) -> __m256i { |
168 | 0 | if first_pass { |
169 | 0 | _mm256_set_epi32( |
170 | 0 | 1 << (DESCALE_P1 - 1), |
171 | 0 | 1 << (DESCALE_P1 - 1), |
172 | 0 | 1 << (DESCALE_P1 - 1), |
173 | 0 | 1 << (DESCALE_P1 - 1), |
174 | 0 | 1 << (DESCALE_P1 - 1), |
175 | 0 | 1 << (DESCALE_P1 - 1), |
176 | 0 | 1 << (DESCALE_P1 - 1), |
177 | 0 | 1 << (DESCALE_P1 - 1), |
178 | | ) |
179 | | } else { |
180 | 0 | _mm256_set_epi32( |
181 | 0 | 1 << (DESCALE_P2 - 1), |
182 | 0 | 1 << (DESCALE_P2 - 1), |
183 | 0 | 1 << (DESCALE_P2 - 1), |
184 | 0 | 1 << (DESCALE_P2 - 1), |
185 | 0 | 1 << (DESCALE_P2 - 1), |
186 | 0 | 1 << (DESCALE_P2 - 1), |
187 | 0 | 1 << (DESCALE_P2 - 1), |
188 | 0 | 1 << (DESCALE_P2 - 1), |
189 | | ) |
190 | | } |
191 | 0 | } |
192 | | |
193 | | #[target_feature(enable = "avx2")] |
194 | | #[allow(non_snake_case)] |
195 | | #[inline] |
196 | 0 | fn PW_DESCALE_P2X() -> __m256i { |
197 | 0 | _mm256_set_epi32( |
198 | 0 | 1 << (PASS1_BITS - 1), |
199 | 0 | 1 << (PASS1_BITS - 1), |
200 | 0 | 1 << (PASS1_BITS - 1), |
201 | 0 | 1 << (PASS1_BITS - 1), |
202 | 0 | 1 << (PASS1_BITS - 1), |
203 | 0 | 1 << (PASS1_BITS - 1), |
204 | 0 | 1 << (PASS1_BITS - 1), |
205 | 0 | 1 << (PASS1_BITS - 1), |
206 | | ) |
207 | 0 | } |
208 | | |
209 | | // In-place 8x8x16-bit matrix transpose using AVX2 instructions |
210 | | #[target_feature(enable = "avx2")] |
211 | | #[inline] |
212 | 0 | fn do_transpose( |
213 | 0 | i1: __m256i, |
214 | 0 | i2: __m256i, |
215 | 0 | i3: __m256i, |
216 | 0 | i4: __m256i, |
217 | 0 | ) -> (__m256i, __m256i, __m256i, __m256i) { |
218 | | //i1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) |
219 | | //i2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) |
220 | | //i3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) |
221 | | //i4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) |
222 | | |
223 | 0 | let t5 = _mm256_unpacklo_epi16(i1, i2); |
224 | 0 | let t6 = _mm256_unpackhi_epi16(i1, i2); |
225 | 0 | let t7 = _mm256_unpacklo_epi16(i3, i4); |
226 | 0 | let t8 = _mm256_unpackhi_epi16(i3, i4); |
227 | | |
228 | | // transpose coefficients(phase 1) |
229 | | // t1=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53) |
230 | | // t2=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57) |
231 | | // t3=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73) |
232 | | // t4=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77) |
233 | | |
234 | 0 | let t1 = _mm256_unpacklo_epi32(t5, t7); |
235 | 0 | let t2 = _mm256_unpackhi_epi32(t5, t7); |
236 | 0 | let t3 = _mm256_unpacklo_epi32(t6, t8); |
237 | 0 | let t4 = _mm256_unpackhi_epi32(t6, t8); |
238 | | |
239 | | // transpose coefficients(phase 2) |
240 | | // t5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71) |
241 | | // t6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73) |
242 | | // t7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75) |
243 | | // t8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77) |
244 | | |
245 | 0 | ( |
246 | 0 | _mm256_permute4x64_epi64(t1, 0x8D), |
247 | 0 | _mm256_permute4x64_epi64(t2, 0x8D), |
248 | 0 | _mm256_permute4x64_epi64(t3, 0xD8), |
249 | 0 | _mm256_permute4x64_epi64(t4, 0xD8), |
250 | 0 | ) |
251 | 0 | } |
252 | | |
253 | | // In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions |
254 | | #[target_feature(enable = "avx2")] |
255 | | #[inline] |
256 | 0 | fn do_dct( |
257 | 0 | first_pass: bool, |
258 | 0 | i1: __m256i, |
259 | 0 | i2: __m256i, |
260 | 0 | i3: __m256i, |
261 | 0 | i4: __m256i, |
262 | 0 | ) -> (__m256i, __m256i, __m256i, __m256i) { |
263 | 0 | let t5 = _mm256_sub_epi16(i1, i4); // data1_0 - data6_7 = tmp6_7 |
264 | 0 | let t6 = _mm256_add_epi16(i1, i4); // data1_0 + data6_7 = tmp1_0 |
265 | 0 | let t7 = _mm256_add_epi16(i2, i3); // data3_2 + data4_5 = tmp3_2 |
266 | 0 | let t8 = _mm256_sub_epi16(i2, i3); // data3_2 - data4_5 = tmp4_5 |
267 | | |
268 | | // Even part |
269 | | |
270 | 0 | let t6 = _mm256_permute2x128_si256(t6, t6, 0x01); // t6=tmp0_1 |
271 | 0 | let t1 = _mm256_add_epi16(t6, t7); // t1 = tmp0_1 + tmp3_2 = tmp10_11 |
272 | 0 | let t6 = _mm256_sub_epi16(t6, t7); // t6 = tmp0_1 - tmp3_2 = tmp13_12 |
273 | | |
274 | 0 | let t7 = _mm256_permute2x128_si256(t1, t1, 0x01); // t7 = tmp11_10 |
275 | 0 | let t1 = _mm256_sign_epi16( |
276 | 0 | t1, |
277 | 0 | _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1), |
278 | | ); // tmp10_neg11 |
279 | | |
280 | 0 | let t7 = _mm256_add_epi16(t7, t1); // t7 = (tmp10 + tmp11)_(tmp10 - tmp11) |
281 | | |
282 | 0 | let t1 = if first_pass { |
283 | 0 | _mm256_slli_epi16(t7, PASS1_BITS) |
284 | | } else { |
285 | 0 | let t7 = _mm256_add_epi16(t7, PW_DESCALE_P2X()); |
286 | 0 | _mm256_srai_epi16(t7, PASS1_BITS) |
287 | | }; |
288 | | |
289 | | // (Original) |
290 | | // z1 = (tmp12 + tmp13) * 0.541196100; |
291 | | // data2 = z1 + tmp13 * 0.765366865; |
292 | | // data6 = z1 + tmp12 * -1.847759065; |
293 | | // |
294 | | // (This implementation) |
295 | | // data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; |
296 | | // data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); |
297 | | |
298 | 0 | let t7 = _mm256_permute2x128_si256(t6, t6, 0x01); // t7 = tmp12_13 |
299 | 0 | let t2 = _mm256_unpacklo_epi16(t6, t7); |
300 | 0 | let t6 = _mm256_unpackhi_epi16(t6, t7); |
301 | | |
302 | 0 | let t2 = _mm256_madd_epi16(t2, PW_F130_F054_MF130_F054()); // t2 = data2_6L |
303 | 0 | let t6 = _mm256_madd_epi16(t6, PW_F130_F054_MF130_F054()); // t6 = data2_6H |
304 | | |
305 | 0 | let t2 = _mm256_add_epi32(t2, PD_DESCALE_P(first_pass)); |
306 | 0 | let t6 = _mm256_add_epi32(t6, PD_DESCALE_P(first_pass)); |
307 | | |
308 | 0 | let t2 = if first_pass { |
309 | 0 | _mm256_srai_epi32(t2, DESCALE_P1) |
310 | | } else { |
311 | 0 | _mm256_srai_epi32(t2, DESCALE_P2) |
312 | | }; |
313 | 0 | let t6 = if first_pass { |
314 | 0 | _mm256_srai_epi32(t6, DESCALE_P1) |
315 | | } else { |
316 | 0 | _mm256_srai_epi32(t6, DESCALE_P2) |
317 | | }; |
318 | | |
319 | 0 | let t3 = _mm256_packs_epi32(t2, t6); // t6 = data2_6 |
320 | | |
321 | | // Odd part |
322 | | |
323 | 0 | let t7 = _mm256_add_epi16(t8, t5); // t7 = tmp4_5 + tmp6_7 = z3_4 |
324 | | |
325 | | // (Original) |
326 | | // z5 = (z3 + z4) * 1.175875602; |
327 | | // z3 = z3 * -1.961570560; |
328 | | // z4 = z4 * -0.390180644; |
329 | | // z3 += z5; |
330 | | // z4 += z5; |
331 | | // |
332 | | // (This implementation) |
333 | | // z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; |
334 | | // z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); |
335 | | |
336 | 0 | let t2 = _mm256_permute2x128_si256(t7, t7, 0x01); // t2 = z4_3 |
337 | 0 | let t6 = _mm256_unpacklo_epi16(t7, t2); |
338 | 0 | let t7 = _mm256_unpackhi_epi16(t7, t2); |
339 | | |
340 | 0 | let t6 = _mm256_madd_epi16(t6, PW_MF078_F117_F078_F117()); // t6 = z3_4L |
341 | 0 | let t7 = _mm256_madd_epi16(t7, PW_MF078_F117_F078_F117()); // t7 = z3_4H |
342 | | |
343 | | // (Original) |
344 | | // z1 = tmp4 + tmp7; |
345 | | // z2 = tmp5 + tmp6; |
346 | | // tmp4 = tmp4 * 0.298631336; |
347 | | // tmp5 = tmp5 * 2.053119869; |
348 | | // tmp6 = tmp6 * 3.072711026; |
349 | | // tmp7 = tmp7 * 1.501321110; |
350 | | // z1 = z1 * -0.899976223; |
351 | | // z2 = z2 * -2.562915447; |
352 | | // data7 = tmp4 + z1 + z3; |
353 | | // data5 = tmp5 + z2 + z4; |
354 | | // data3 = tmp6 + z2 + z3; |
355 | | // data1 = tmp7 + z1 + z4; |
356 | | // |
357 | | // (This implementation) |
358 | | // tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; |
359 | | // tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; |
360 | | // tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); |
361 | | // tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); |
362 | | // data7 = tmp4 + z3; |
363 | | // data5 = tmp5 + z4; |
364 | | // data3 = tmp6 + z3; |
365 | | // data1 = tmp7 + z4; |
366 | | |
367 | 0 | let t4 = _mm256_permute2x128_si256(t5, t5, 0x01); // t4 = tmp7_6 |
368 | 0 | let t2 = _mm256_unpacklo_epi16(t8, t4); |
369 | 0 | let t4 = _mm256_unpackhi_epi16(t8, t4); |
370 | | |
371 | 0 | let t2 = _mm256_madd_epi16(t2, PW_MF060_MF089_MF050_MF256()); //t2 = tmp4_5L |
372 | 0 | let t4 = _mm256_madd_epi16(t4, PW_MF060_MF089_MF050_MF256()); // t4 = tmp4_5H |
373 | | |
374 | 0 | let t2 = _mm256_add_epi32(t2, t6); // t2 = data7_5L |
375 | 0 | let t4 = _mm256_add_epi32(t4, t7); // t4 = data7_5H |
376 | | |
377 | 0 | let t2 = _mm256_add_epi32(t2, PD_DESCALE_P(first_pass)); |
378 | 0 | let t4 = _mm256_add_epi32(t4, PD_DESCALE_P(first_pass)); |
379 | | |
380 | 0 | let t2 = if first_pass { |
381 | 0 | _mm256_srai_epi32(t2, DESCALE_P1) |
382 | | } else { |
383 | 0 | _mm256_srai_epi32(t2, DESCALE_P2) |
384 | | }; |
385 | 0 | let t4 = if first_pass { |
386 | 0 | _mm256_srai_epi32(t4, DESCALE_P1) |
387 | | } else { |
388 | 0 | _mm256_srai_epi32(t4, DESCALE_P2) |
389 | | }; |
390 | | |
391 | 0 | let t4 = _mm256_packs_epi32(t2, t4); // t4 = data7_5 |
392 | | |
393 | 0 | let t2 = _mm256_permute2x128_si256(t8, t8, 0x01); // t2 = tmp5_4 |
394 | | |
395 | 0 | let t8 = _mm256_unpacklo_epi16(t5, t2); |
396 | 0 | let t5 = _mm256_unpackhi_epi16(t5, t2); |
397 | | |
398 | 0 | let t8 = _mm256_madd_epi16(t8, PW_F050_MF256_F060_MF089()); // t8 = tmp6_7L |
399 | 0 | let t5 = _mm256_madd_epi16(t5, PW_F050_MF256_F060_MF089()); // t5 = tmp6_7H |
400 | | |
401 | 0 | let t8 = _mm256_add_epi32(t8, t6); // t8 = data3_1L |
402 | 0 | let t5 = _mm256_add_epi32(t5, t7); // t5 = data3_1H |
403 | | |
404 | 0 | let t8 = _mm256_add_epi32(t8, PD_DESCALE_P(first_pass)); |
405 | 0 | let t5 = _mm256_add_epi32(t5, PD_DESCALE_P(first_pass)); |
406 | | |
407 | 0 | let t8 = if first_pass { |
408 | 0 | _mm256_srai_epi32(t8, DESCALE_P1) |
409 | | } else { |
410 | 0 | _mm256_srai_epi32(t8, DESCALE_P2) |
411 | | }; |
412 | 0 | let t5 = if first_pass { |
413 | 0 | _mm256_srai_epi32(t5, DESCALE_P1) |
414 | | } else { |
415 | 0 | _mm256_srai_epi32(t5, DESCALE_P2) |
416 | | }; |
417 | | |
418 | 0 | let t2 = _mm256_packs_epi32(t8, t5); // t2 = data3_1 |
419 | | |
420 | 0 | (t1, t2, t3, t4) |
421 | 0 | } |
422 | | |
423 | 0 | let ymm4 = avx_load(&data[0..16]); |
424 | 0 | let ymm5 = avx_load(&data[16..32]); |
425 | 0 | let ymm6 = avx_load(&data[32..48]); |
426 | 0 | let ymm7 = avx_load(&data[48..64]); |
427 | | |
428 | | // ---- Pass 1: process rows. |
429 | | // ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) |
430 | | // ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) |
431 | | // ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) |
432 | | // ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) |
433 | | |
434 | 0 | let ymm0 = _mm256_permute2x128_si256(ymm4, ymm6, 0x20); |
435 | 0 | let ymm1 = _mm256_permute2x128_si256(ymm4, ymm6, 0x31); |
436 | 0 | let ymm2 = _mm256_permute2x128_si256(ymm5, ymm7, 0x20); |
437 | 0 | let ymm3 = _mm256_permute2x128_si256(ymm5, ymm7, 0x31); |
438 | | |
439 | | // ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) |
440 | | // ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) |
441 | | // ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) |
442 | | // ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) |
443 | | |
444 | 0 | let (ymm0, ymm1, ymm2, ymm3) = do_transpose(ymm0, ymm1, ymm2, ymm3); |
445 | 0 | let (ymm0, ymm1, ymm2, ymm3) = do_dct(true, ymm0, ymm1, ymm2, ymm3); |
446 | | |
447 | | // ---- Pass 2: process columns. |
448 | | |
449 | 0 | let ymm4 = _mm256_permute2x128_si256(ymm1, ymm3, 0x20); // ymm4=data3_7 |
450 | 0 | let ymm1 = _mm256_permute2x128_si256(ymm1, ymm3, 0x31); // ymm1=data1_5 |
451 | | |
452 | 0 | let (ymm0, ymm1, ymm2, ymm4) = do_transpose(ymm0, ymm1, ymm2, ymm4); |
453 | 0 | let (ymm0, ymm1, ymm2, ymm4) = do_dct(false, ymm0, ymm1, ymm2, ymm4); |
454 | | |
455 | 0 | let ymm3 = _mm256_permute2x128_si256(ymm0, ymm1, 0x30); // ymm3=data0_1 |
456 | 0 | let ymm5 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20); // ymm5=data2_3 |
457 | 0 | let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5 |
458 | 0 | let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7 |
459 | | |
460 | 0 | avx_store(ymm3, &mut data[0..16]); |
461 | 0 | avx_store(ymm5, &mut data[16..32]); |
462 | 0 | avx_store(ymm6, &mut data[32..48]); |
463 | 0 | avx_store(ymm7, &mut data[48..64]); |
464 | 0 | } |
465 | | |
/// Safe wrapper for an unaligned AVX load of 16 `i16`s into a `__m256i`.
///
/// # Panics
/// Panics if `input` is not exactly 16 elements long.
#[target_feature(enable = "avx2")]
#[inline]
fn avx_load(input: &[i16]) -> __m256i {
    assert!(input.len() == 16);
    // 16 i16s are exactly 32 bytes, the size of a __m256i.
    assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>());
    // SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements.
    unsafe { _mm256_loadu_si256(input.as_ptr() as *const __m256i) }
}
475 | | |
/// Safe wrapper for an unaligned AVX store of a `__m256i` into 16 `i16`s.
///
/// # Panics
/// Panics if `output` is not exactly 16 elements long.
#[target_feature(enable = "avx2")]
#[inline]
fn avx_store(input: __m256i, output: &mut [i16]) {
    assert!(output.len() == 16);
    // 16 i16s are exactly 32 bytes, the size of a __m256i.
    assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>());
    // SAFETY: we've checked sizes above. The store is unaligned, so no alignment requirements.
    unsafe { _mm256_storeu_si256(output.as_mut_ptr() as *mut __m256i, input) }
}