/rust/registry/src/index.crates.io-1949cf8c6b5b557f/jpeg-encoder-0.6.1/src/avx2/fdct.rs
/*
 * Ported from mozjpeg / jfdctint-avx2.asm to rust
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 */

#[cfg(target_arch = "x86")]
use core::arch::x86::{
    __m256i, _mm256_add_epi16, _mm256_add_epi32, _mm256_loadu_si256, _mm256_madd_epi16,
    _mm256_packs_epi32, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set_epi16,
    _mm256_set_epi32, _mm256_sign_epi16, _mm256_slli_epi16, _mm256_srai_epi16, _mm256_srai_epi32,
    _mm256_storeu_si256, _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpackhi_epi32,
    _mm256_unpacklo_epi16, _mm256_unpacklo_epi32,
};

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m256i, _mm256_add_epi16, _mm256_add_epi32, _mm256_loadu_si256, _mm256_madd_epi16,
    _mm256_packs_epi32, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set_epi16,
    _mm256_set_epi32, _mm256_sign_epi16, _mm256_slli_epi16, _mm256_srai_epi16, _mm256_srai_epi32,
    _mm256_storeu_si256, _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpackhi_epi32,
    _mm256_unpacklo_epi16, _mm256_unpacklo_epi32,
};

const CONST_BITS: i32 = 13;
const PASS1_BITS: i32 = 2;

// FIX(0.298631336)
const F_0_298: i16 = 2446;
// FIX(0.390180644)
const F_0_390: i16 = 3196;
// FIX(0.541196100)
const F_0_541: i16 = 4433;
// FIX(0.765366865)
const F_0_765: i16 = 6270;
// FIX(0.899976223)
const F_0_899: i16 = 7373;
// FIX(1.175875602)
const F_1_175: i16 = 9633;
// FIX(1.501321110)
const F_1_501: i16 = 12299;
// FIX(1.847759065)
const F_1_847: i16 = 15137;
// FIX(1.961570560)
const F_1_961: i16 = 16069;
// FIX(2.053119869)
const F_2_053: i16 = 16819;
// FIX(2.562915447)
const F_2_562: i16 = 20995;
// FIX(3.072711026)
const F_3_072: i16 = 25172;
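
// The FIX(x) comments above use libjpeg's fixed-point convention
// FIX(x) = round(x * 2^CONST_BITS); for example,
// FIX(0.541196100) = round(0.541196100 * 8192) = 4433 = F_0_541.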

const DESCALE_P1: i32 = CONST_BITS - PASS1_BITS;
const DESCALE_P2: i32 = CONST_BITS + PASS1_BITS;
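// These are the shift amounts for the two descale steps: do_dct adds a rounding bias of
// 1 << (n - 1) (built by PD_DESCALE_P / PW_DESCALE_P2X) before each arithmetic right
// shift by n, matching libjpeg's DESCALE(x, n) = (x + (1 << (n - 1))) >> n.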

#[inline(always)]
pub fn fdct_avx2(data: &mut [i16; 64]) {
    unsafe {
        fdct_avx2_internal(data);
    }
}
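
// A minimal usage sketch (names below are illustrative), assuming the caller dispatches
// to this module only after a runtime AVX2 check: the safe wrapper above performs no
// feature check itself, and calling the `#[target_feature]` path on a CPU without AVX2
// would be undefined behavior.
//
//     if is_x86_feature_detected!("avx2") {
//         let mut block = [0i16; 64]; // one level-shifted 8x8 sample block, row-major
//         fdct_avx2(&mut block);      // forward DCT computed in place
//     }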

#[target_feature(enable = "avx2")]
unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
    #[allow(non_snake_case)]
    #[inline(always)]
    unsafe fn PW_F130_F054_MF130_F054() -> __m256i {
        _mm256_set_epi16(
            F_0_541,
            F_0_541 - F_1_847,
            F_0_541,
            F_0_541 - F_1_847,
            F_0_541,
            F_0_541 - F_1_847,
            F_0_541,
            F_0_541 - F_1_847,
            F_0_541,
            F_0_541 + F_0_765,
            F_0_541,
            F_0_541 + F_0_765,
            F_0_541,
            F_0_541 + F_0_765,
            F_0_541,
            F_0_541 + F_0_765,
        )
    }

    #[allow(non_snake_case)]
    #[inline(always)]
    unsafe fn PW_MF078_F117_F078_F117() -> __m256i {
        _mm256_set_epi16(
            F_1_175,
            F_1_175 - F_0_390,
            F_1_175,
            F_1_175 - F_0_390,
            F_1_175,
            F_1_175 - F_0_390,
            F_1_175,
            F_1_175 - F_0_390,
            F_1_175,
            F_1_175 - F_1_961,
            F_1_175,
            F_1_175 - F_1_961,
            F_1_175,
            F_1_175 - F_1_961,
            F_1_175,
            F_1_175 - F_1_961,
        )
    }

    #[allow(non_snake_case)]
    #[inline(always)]
    unsafe fn PW_MF060_MF089_MF050_MF256() -> __m256i {
        _mm256_set_epi16(
            -F_2_562,
            F_2_053 - F_2_562,
            -F_2_562,
            F_2_053 - F_2_562,
            -F_2_562,
            F_2_053 - F_2_562,
            -F_2_562,
            F_2_053 - F_2_562,
            -F_0_899,
            F_0_298 - F_0_899,
            -F_0_899,
            F_0_298 - F_0_899,
            -F_0_899,
            F_0_298 - F_0_899,
            -F_0_899,
            F_0_298 - F_0_899,
        )
    }

    #[allow(non_snake_case)]
    #[inline(always)]
    unsafe fn PW_F050_MF256_F060_MF089() -> __m256i {
        _mm256_set_epi16(
            -F_0_899,
            F_1_501 - F_0_899,
            -F_0_899,
            F_1_501 - F_0_899,
            -F_0_899,
            F_1_501 - F_0_899,
            -F_0_899,
            F_1_501 - F_0_899,
            -F_2_562,
            F_3_072 - F_2_562,
            -F_2_562,
            F_3_072 - F_2_562,
            -F_2_562,
            F_3_072 - F_2_562,
            -F_2_562,
            F_3_072 - F_2_562,
        )
    }

    #[allow(non_snake_case)]
    #[inline(always)]
    unsafe fn PD_DESCALE_P(first_pass: bool) -> __m256i {
        if first_pass {
            _mm256_set_epi32(
                1 << (DESCALE_P1 - 1),
                1 << (DESCALE_P1 - 1),
                1 << (DESCALE_P1 - 1),
                1 << (DESCALE_P1 - 1),
                1 << (DESCALE_P1 - 1),
                1 << (DESCALE_P1 - 1),
                1 << (DESCALE_P1 - 1),
                1 << (DESCALE_P1 - 1),
            )
        } else {
            _mm256_set_epi32(
                1 << (DESCALE_P2 - 1),
                1 << (DESCALE_P2 - 1),
                1 << (DESCALE_P2 - 1),
                1 << (DESCALE_P2 - 1),
                1 << (DESCALE_P2 - 1),
                1 << (DESCALE_P2 - 1),
                1 << (DESCALE_P2 - 1),
                1 << (DESCALE_P2 - 1),
            )
        }
    }

    #[allow(non_snake_case)]
    #[inline(always)]
    unsafe fn PW_DESCALE_P2X() -> __m256i {
        _mm256_set_epi32(
            1 << (PASS1_BITS - 1),
            1 << (PASS1_BITS - 1),
            1 << (PASS1_BITS - 1),
            1 << (PASS1_BITS - 1),
            1 << (PASS1_BITS - 1),
            1 << (PASS1_BITS - 1),
            1 << (PASS1_BITS - 1),
            1 << (PASS1_BITS - 1),
        )
    }
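    // Note: _mm256_set_epi32 places 1 << (PASS1_BITS - 1) only in the low 16-bit half of
    // each 32-bit element, so the _mm256_add_epi16 descale step in do_dct adds this
    // rounding bias to every other word; the remaining words are simply truncated by the
    // arithmetic shift.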

    // In-place 8x8x16-bit matrix transpose using AVX2 instructions
    #[inline(always)]
    unsafe fn do_transpose(
        i1: __m256i,
        i2: __m256i,
        i3: __m256i,
        i4: __m256i,
    ) -> (__m256i, __m256i, __m256i, __m256i) {
        // i1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
        // i2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
        // i3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
        // i4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)

        let t5 = _mm256_unpacklo_epi16(i1, i2);
        let t6 = _mm256_unpackhi_epi16(i1, i2);
        let t7 = _mm256_unpacklo_epi16(i3, i4);
        let t8 = _mm256_unpackhi_epi16(i3, i4);

        // transpose coefficients (phase 1)
        // t5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
        // t6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
        // t7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
        // t8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)

        let t1 = _mm256_unpacklo_epi32(t5, t7);
        let t2 = _mm256_unpackhi_epi32(t5, t7);
        let t3 = _mm256_unpacklo_epi32(t6, t8);
        let t4 = _mm256_unpackhi_epi32(t6, t8);

        // transpose coefficients (phase 2)
        // t1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
        // t2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
        // t3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
        // t4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)

        (
            _mm256_permute4x64_epi64(t1, 0x8D),
            _mm256_permute4x64_epi64(t2, 0x8D),
            _mm256_permute4x64_epi64(t3, 0xD8),
            _mm256_permute4x64_epi64(t4, 0xD8),
        )
    }

    // In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
    #[inline(always)]
    unsafe fn do_dct(
        first_pass: bool,
        i1: __m256i,
        i2: __m256i,
        i3: __m256i,
        i4: __m256i,
    ) -> (__m256i, __m256i, __m256i, __m256i) {
        let t5 = _mm256_sub_epi16(i1, i4); // data1_0 - data6_7 = tmp6_7
        let t6 = _mm256_add_epi16(i1, i4); // data1_0 + data6_7 = tmp1_0
        let t7 = _mm256_add_epi16(i2, i3); // data3_2 + data4_5 = tmp3_2
        let t8 = _mm256_sub_epi16(i2, i3); // data3_2 - data4_5 = tmp4_5

        // Even part

        let t6 = _mm256_permute2x128_si256(t6, t6, 0x01); // t6=tmp0_1
        let t1 = _mm256_add_epi16(t6, t7); // t1 = tmp0_1 + tmp3_2 = tmp10_11
        let t6 = _mm256_sub_epi16(t6, t7); // t6 = tmp0_1 - tmp3_2 = tmp13_12

        let t7 = _mm256_permute2x128_si256(t1, t1, 0x01); // t7 = tmp11_10
        let t1 = _mm256_sign_epi16(
            t1,
            _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1),
        ); // tmp10_neg11

        let t7 = _mm256_add_epi16(t7, t1); // t7 = (tmp10 + tmp11)_(tmp10 - tmp11)

        let t1 = if first_pass {
            _mm256_slli_epi16(t7, PASS1_BITS)
        } else {
            let t7 = _mm256_add_epi16(t7, PW_DESCALE_P2X());
            _mm256_srai_epi16(t7, PASS1_BITS)
        };

        // (Original)
        // z1 = (tmp12 + tmp13) * 0.541196100;
        // data2 = z1 + tmp13 * 0.765366865;
        // data6 = z1 + tmp12 * -1.847759065;
        //
        // (This implementation)
        // data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
        // data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
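        //
        // (Expanding z1 shows the two forms agree:
        //  data2 = (tmp12 + tmp13) * 0.541196100 + tmp13 * 0.765366865
        //        = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100,
        //  and likewise for data6; PW_F130_F054_MF130_F054 feeds exactly these
        //  coefficient pairs to _mm256_madd_epi16 below.)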

        let t7 = _mm256_permute2x128_si256(t6, t6, 0x01); // t7 = tmp12_13
        let t2 = _mm256_unpacklo_epi16(t6, t7);
        let t6 = _mm256_unpackhi_epi16(t6, t7);

        let t2 = _mm256_madd_epi16(t2, PW_F130_F054_MF130_F054()); // t2 = data2_6L
        let t6 = _mm256_madd_epi16(t6, PW_F130_F054_MF130_F054()); // t6 = data2_6H

        let t2 = _mm256_add_epi32(t2, PD_DESCALE_P(first_pass));
        let t6 = _mm256_add_epi32(t6, PD_DESCALE_P(first_pass));

        let t2 = if first_pass {
            _mm256_srai_epi32(t2, DESCALE_P1)
        } else {
            _mm256_srai_epi32(t2, DESCALE_P2)
        };
        let t6 = if first_pass {
            _mm256_srai_epi32(t6, DESCALE_P1)
        } else {
            _mm256_srai_epi32(t6, DESCALE_P2)
        };

        let t3 = _mm256_packs_epi32(t2, t6); // t3 = data2_6

        // Odd part

        let t7 = _mm256_add_epi16(t8, t5); // t7 = tmp4_5 + tmp6_7 = z3_4

        // (Original)
        // z5 = (z3 + z4) * 1.175875602;
        // z3 = z3 * -1.961570560;
        // z4 = z4 * -0.390180644;
        // z3 += z5;
        // z4 += z5;
        //
        // (This implementation)
        // z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
        // z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
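        //
        // (Here z5 has been folded in: z3 + z5 = z3 * (1.175875602 - 1.961570560)
        //  + z4 * 1.175875602, and z4 + z5 expands the same way; these are the
        //  coefficient pairs held by PW_MF078_F117_F078_F117 below.)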

        let t2 = _mm256_permute2x128_si256(t7, t7, 0x01); // t2 = z4_3
        let t6 = _mm256_unpacklo_epi16(t7, t2);
        let t7 = _mm256_unpackhi_epi16(t7, t2);

        let t6 = _mm256_madd_epi16(t6, PW_MF078_F117_F078_F117()); // t6 = z3_4L
        let t7 = _mm256_madd_epi16(t7, PW_MF078_F117_F078_F117()); // t7 = z3_4H

        // (Original)
        // z1 = tmp4 + tmp7;
        // z2 = tmp5 + tmp6;
        // tmp4 = tmp4 * 0.298631336;
        // tmp5 = tmp5 * 2.053119869;
        // tmp6 = tmp6 * 3.072711026;
        // tmp7 = tmp7 * 1.501321110;
        // z1 = z1 * -0.899976223;
        // z2 = z2 * -2.562915447;
        // data7 = tmp4 + z1 + z3;
        // data5 = tmp5 + z2 + z4;
        // data3 = tmp6 + z2 + z3;
        // data1 = tmp7 + z1 + z4;
        //
        // (This implementation)
        // tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
        // tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
        // tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
        // tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
        // data7 = tmp4 + z3;
        // data5 = tmp5 + z4;
        // data3 = tmp6 + z3;
        // data1 = tmp7 + z4;
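        //
        // (The folding uses z1 = tmp4 + tmp7 and z2 = tmp5 + tmp6, e.g.
        //  tmp4 * 0.298631336 + z1 * -0.899976223
        //    = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
        //  PW_MF060_MF089_MF050_MF256 and PW_F050_MF256_F060_MF089 hold these pairs.)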

        let t4 = _mm256_permute2x128_si256(t5, t5, 0x01); // t4 = tmp7_6
        let t2 = _mm256_unpacklo_epi16(t8, t4);
        let t4 = _mm256_unpackhi_epi16(t8, t4);

        let t2 = _mm256_madd_epi16(t2, PW_MF060_MF089_MF050_MF256()); // t2 = tmp4_5L
        let t4 = _mm256_madd_epi16(t4, PW_MF060_MF089_MF050_MF256()); // t4 = tmp4_5H

        let t2 = _mm256_add_epi32(t2, t6); // t2 = data7_5L
        let t4 = _mm256_add_epi32(t4, t7); // t4 = data7_5H

        let t2 = _mm256_add_epi32(t2, PD_DESCALE_P(first_pass));
        let t4 = _mm256_add_epi32(t4, PD_DESCALE_P(first_pass));

        let t2 = if first_pass {
            _mm256_srai_epi32(t2, DESCALE_P1)
        } else {
            _mm256_srai_epi32(t2, DESCALE_P2)
        };
        let t4 = if first_pass {
            _mm256_srai_epi32(t4, DESCALE_P1)
        } else {
            _mm256_srai_epi32(t4, DESCALE_P2)
        };

        let t4 = _mm256_packs_epi32(t2, t4); // t4 = data7_5

        let t2 = _mm256_permute2x128_si256(t8, t8, 0x01); // t2 = tmp5_4

        let t8 = _mm256_unpacklo_epi16(t5, t2);
        let t5 = _mm256_unpackhi_epi16(t5, t2);

        let t8 = _mm256_madd_epi16(t8, PW_F050_MF256_F060_MF089()); // t8 = tmp6_7L
        let t5 = _mm256_madd_epi16(t5, PW_F050_MF256_F060_MF089()); // t5 = tmp6_7H

        let t8 = _mm256_add_epi32(t8, t6); // t8 = data3_1L
        let t5 = _mm256_add_epi32(t5, t7); // t5 = data3_1H

        let t8 = _mm256_add_epi32(t8, PD_DESCALE_P(first_pass));
        let t5 = _mm256_add_epi32(t5, PD_DESCALE_P(first_pass));

        let t8 = if first_pass {
            _mm256_srai_epi32(t8, DESCALE_P1)
        } else {
            _mm256_srai_epi32(t8, DESCALE_P2)
        };
        let t5 = if first_pass {
            _mm256_srai_epi32(t5, DESCALE_P1)
        } else {
            _mm256_srai_epi32(t5, DESCALE_P2)
        };

        let t2 = _mm256_packs_epi32(t8, t5); // t2 = data3_1

        (t1, t2, t3, t4)
    }

    let in_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr());

    let ymm4 = _mm256_loadu_si256(in_data);
    let ymm5 = _mm256_loadu_si256(in_data.add(1));
    let ymm6 = _mm256_loadu_si256(in_data.add(2));
    let ymm7 = _mm256_loadu_si256(in_data.add(3));

    // ---- Pass 1: process rows.
    // ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    // ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    // ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    // ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

    let ymm0 = _mm256_permute2x128_si256(ymm4, ymm6, 0x20);
    let ymm1 = _mm256_permute2x128_si256(ymm4, ymm6, 0x31);
    let ymm2 = _mm256_permute2x128_si256(ymm5, ymm7, 0x20);
    let ymm3 = _mm256_permute2x128_si256(ymm5, ymm7, 0x31);

    // ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
    // ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
    // ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
    // ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)

    let (ymm0, ymm1, ymm2, ymm3) = do_transpose(ymm0, ymm1, ymm2, ymm3);
    let (ymm0, ymm1, ymm2, ymm3) = do_dct(true, ymm0, ymm1, ymm2, ymm3);

    // ---- Pass 2: process columns.

    let ymm4 = _mm256_permute2x128_si256(ymm1, ymm3, 0x20); // ymm4=data3_7
    let ymm1 = _mm256_permute2x128_si256(ymm1, ymm3, 0x31); // ymm1=data1_5

    let (ymm0, ymm1, ymm2, ymm4) = do_transpose(ymm0, ymm1, ymm2, ymm4);
    let (ymm0, ymm1, ymm2, ymm4) = do_dct(false, ymm0, ymm1, ymm2, ymm4);

    let ymm3 = _mm256_permute2x128_si256(ymm0, ymm1, 0x30); // ymm3=data0_1
    let ymm5 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20); // ymm5=data2_3
    let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5
    let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7

    let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr());

    _mm256_storeu_si256(out_data, ymm3);
    _mm256_storeu_si256(out_data.add(1), ymm5);
    _mm256_storeu_si256(out_data.add(2), ymm6);
    _mm256_storeu_si256(out_data.add(3), ymm7);
}
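
// A hedged test sketch (test names and values are illustrative; assumptions: the tests
// build with `std` available and the test host supports AVX2, with the runtime check
// skipping the DCT assertion otherwise). For a constant input block, an accurate integer
// FDCT should produce DC = 64 * value and all-zero AC coefficients.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fix_constants_match_their_scale() {
        // FIX(x) = round(x * 2^CONST_BITS), as in libjpeg's jfdctint.
        let fix = |x: f64| (x * f64::from(1i32 << CONST_BITS)).round() as i16;
        assert_eq!(F_0_541, fix(0.541196100));
        assert_eq!(F_0_765, fix(0.765366865));
        assert_eq!(F_1_847, fix(1.847759065));
    }

    #[test]
    fn constant_block_has_dc_only() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let mut block = [3i16; 64];
        fdct_avx2(&mut block);
        assert_eq!(block[0], 64 * 3);
        assert!(block[1..].iter().all(|&c| c == 0));
    }
}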