Coverage Report

Created: 2025-12-05 07:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/jpeg-encoder-0.6.1/src/avx2/fdct.rs
Line
Count
Source
1
/*
2
 * Ported from mozjpeg / jfdctint-avx2.asm to rust
3
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
4
 * Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
5
 *
6
 * Based on the x86 SIMD extension for IJG JPEG library
7
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
8
 */
9
10
#[cfg(target_arch = "x86")]
11
use core::arch::x86::{
12
    __m256i, _mm256_add_epi16, _mm256_add_epi32, _mm256_loadu_si256, _mm256_madd_epi16,
13
    _mm256_packs_epi32, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set_epi16,
14
    _mm256_set_epi32, _mm256_sign_epi16, _mm256_slli_epi16, _mm256_srai_epi16, _mm256_srai_epi32,
15
    _mm256_storeu_si256, _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpackhi_epi32,
16
    _mm256_unpacklo_epi16, _mm256_unpacklo_epi32,
17
};
18
19
#[cfg(target_arch = "x86_64")]
20
use core::arch::x86_64::{
21
    __m256i, _mm256_add_epi16, _mm256_add_epi32, _mm256_loadu_si256, _mm256_madd_epi16,
22
    _mm256_packs_epi32, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set_epi16,
23
    _mm256_set_epi32, _mm256_sign_epi16, _mm256_slli_epi16, _mm256_srai_epi16, _mm256_srai_epi32,
24
    _mm256_storeu_si256, _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpackhi_epi32,
25
    _mm256_unpacklo_epi16, _mm256_unpacklo_epi32,
26
};
27
28
/// Number of fractional bits in the fixed-point multipliers below.
const CONST_BITS: i32 = 13;
/// Extra precision bits: pass 1 shifts its plain sums left by this amount and
/// pass 2 shifts them back out.
const PASS1_BITS: i32 = 2;

// Fixed-point multipliers. Each entry is FIX(x) = round(x * 2^CONST_BITS).

/// FIX(0.298631336)
const F_0_298: i16 = 2446;
/// FIX(0.390180644)
const F_0_390: i16 = 3196;
/// FIX(0.541196100)
const F_0_541: i16 = 4433;
/// FIX(0.765366865)
const F_0_765: i16 = 6270;
/// FIX(0.899976223)
const F_0_899: i16 = 7373;
/// FIX(1.175875602)
const F_1_175: i16 = 9633;
/// FIX(1.501321110)
const F_1_501: i16 = 12299;
/// FIX(1.847759065)
const F_1_847: i16 = 15137;
/// FIX(1.961570560)
const F_1_961: i16 = 16069;
/// FIX(2.053119869)
const F_2_053: i16 = 16819;
/// FIX(2.562915447)
const F_2_562: i16 = 20995;
/// FIX(3.072711026)
const F_3_072: i16 = 25172;

/// Right shift applied to 32-bit products in pass 1.
const DESCALE_P1: i32 = CONST_BITS - PASS1_BITS;
/// Right shift applied to 32-bit products in pass 2.
const DESCALE_P2: i32 = CONST_BITS + PASS1_BITS;
58
59
#[inline(always)]
60
0
pub fn fdct_avx2(data: &mut [i16; 64]) {
61
0
    unsafe {
62
0
        fdct_avx2_internal(data);
63
0
    }
64
0
}
65
66
#[target_feature(enable = "avx2")]
67
0
unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
68
    #[allow(non_snake_case)]
69
    #[inline(always)]
70
0
    unsafe fn PW_F130_F054_MF130_F054() -> __m256i {
71
0
        _mm256_set_epi16(
72
            F_0_541,
73
0
            F_0_541 - F_1_847,
74
            F_0_541,
75
0
            F_0_541 - F_1_847,
76
            F_0_541,
77
0
            F_0_541 - F_1_847,
78
            F_0_541,
79
0
            F_0_541 - F_1_847,
80
            F_0_541,
81
0
            F_0_541 + F_0_765,
82
            F_0_541,
83
0
            F_0_541 + F_0_765,
84
            F_0_541,
85
0
            F_0_541 + F_0_765,
86
            F_0_541,
87
0
            F_0_541 + F_0_765,
88
        )
89
0
    }
90
91
    #[allow(non_snake_case)]
92
    #[inline(always)]
93
0
    unsafe fn PW_MF078_F117_F078_F117() -> __m256i {
94
0
        _mm256_set_epi16(
95
            F_1_175,
96
0
            F_1_175 - F_0_390,
97
            F_1_175,
98
0
            F_1_175 - F_0_390,
99
            F_1_175,
100
0
            F_1_175 - F_0_390,
101
            F_1_175,
102
0
            F_1_175 - F_0_390,
103
            F_1_175,
104
0
            F_1_175 - F_1_961,
105
            F_1_175,
106
0
            F_1_175 - F_1_961,
107
            F_1_175,
108
0
            F_1_175 - F_1_961,
109
            F_1_175,
110
0
            F_1_175 - F_1_961,
111
        )
112
0
    }
113
114
    #[allow(non_snake_case)]
115
    #[inline(always)]
116
0
    unsafe fn PW_MF060_MF089_MF050_MF256() -> __m256i {
117
0
        _mm256_set_epi16(
118
0
            -F_2_562,
119
0
            F_2_053 - F_2_562,
120
0
            -F_2_562,
121
0
            F_2_053 - F_2_562,
122
0
            -F_2_562,
123
0
            F_2_053 - F_2_562,
124
0
            -F_2_562,
125
0
            F_2_053 - F_2_562,
126
0
            -F_0_899,
127
0
            F_0_298 - F_0_899,
128
0
            -F_0_899,
129
0
            F_0_298 - F_0_899,
130
0
            -F_0_899,
131
0
            F_0_298 - F_0_899,
132
0
            -F_0_899,
133
0
            F_0_298 - F_0_899,
134
        )
135
0
    }
136
137
    #[allow(non_snake_case)]
138
    #[inline(always)]
139
0
    unsafe fn PW_F050_MF256_F060_MF089() -> __m256i {
140
0
        _mm256_set_epi16(
141
0
            -F_0_899,
142
0
            F_1_501 - F_0_899,
143
0
            -F_0_899,
144
0
            F_1_501 - F_0_899,
145
0
            -F_0_899,
146
0
            F_1_501 - F_0_899,
147
0
            -F_0_899,
148
0
            F_1_501 - F_0_899,
149
0
            -F_2_562,
150
0
            F_3_072 - F_2_562,
151
0
            -F_2_562,
152
0
            F_3_072 - F_2_562,
153
0
            -F_2_562,
154
0
            F_3_072 - F_2_562,
155
0
            -F_2_562,
156
0
            F_3_072 - F_2_562,
157
        )
158
0
    }
159
160
    #[allow(non_snake_case)]
161
    #[inline(always)]
162
0
    unsafe fn PD_DESCALE_P(first_pass: bool) -> __m256i {
163
0
        if first_pass {
164
0
            _mm256_set_epi32(
165
0
                1 << (DESCALE_P1 - 1),
166
0
                1 << (DESCALE_P1 - 1),
167
0
                1 << (DESCALE_P1 - 1),
168
0
                1 << (DESCALE_P1 - 1),
169
0
                1 << (DESCALE_P1 - 1),
170
0
                1 << (DESCALE_P1 - 1),
171
0
                1 << (DESCALE_P1 - 1),
172
0
                1 << (DESCALE_P1 - 1),
173
            )
174
        } else {
175
0
            _mm256_set_epi32(
176
0
                1 << (DESCALE_P2 - 1),
177
0
                1 << (DESCALE_P2 - 1),
178
0
                1 << (DESCALE_P2 - 1),
179
0
                1 << (DESCALE_P2 - 1),
180
0
                1 << (DESCALE_P2 - 1),
181
0
                1 << (DESCALE_P2 - 1),
182
0
                1 << (DESCALE_P2 - 1),
183
0
                1 << (DESCALE_P2 - 1),
184
            )
185
        }
186
0
    }
187
188
    #[allow(non_snake_case)]
189
    #[inline(always)]
190
0
    unsafe fn PW_DESCALE_P2X() -> __m256i {
191
0
        _mm256_set_epi32(
192
0
            1 << (PASS1_BITS - 1),
193
0
            1 << (PASS1_BITS - 1),
194
0
            1 << (PASS1_BITS - 1),
195
0
            1 << (PASS1_BITS - 1),
196
0
            1 << (PASS1_BITS - 1),
197
0
            1 << (PASS1_BITS - 1),
198
0
            1 << (PASS1_BITS - 1),
199
0
            1 << (PASS1_BITS - 1),
200
        )
201
0
    }
202
203
    // In-place 8x8x16-bit matrix transpose using AVX2 instructions
204
    #[inline(always)]
205
0
    unsafe fn do_transpose(
206
0
        i1: __m256i,
207
0
        i2: __m256i,
208
0
        i3: __m256i,
209
0
        i4: __m256i,
210
0
    ) -> (__m256i, __m256i, __m256i, __m256i) {
211
        //i1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
212
        //i2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
213
        //i3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
214
        //i4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
215
216
0
        let t5 = _mm256_unpacklo_epi16(i1, i2);
217
0
        let t6 = _mm256_unpackhi_epi16(i1, i2);
218
0
        let t7 = _mm256_unpacklo_epi16(i3, i4);
219
0
        let t8 = _mm256_unpackhi_epi16(i3, i4);
220
221
        // transpose coefficients(phase 1)
222
        // t1=(00 10 01 11 02 12 03 13  40 50 41 51 42 52 43 53)
223
        // t2=(04 14 05 15 06 16 07 17  44 54 45 55 46 56 47 57)
224
        // t3=(20 30 21 31 22 32 23 33  60 70 61 71 62 72 63 73)
225
        // t4=(24 34 25 35 26 36 27 37  64 74 65 75 66 76 67 77)
226
227
0
        let t1 = _mm256_unpacklo_epi32(t5, t7);
228
0
        let t2 = _mm256_unpackhi_epi32(t5, t7);
229
0
        let t3 = _mm256_unpacklo_epi32(t6, t8);
230
0
        let t4 = _mm256_unpackhi_epi32(t6, t8);
231
232
        // transpose coefficients(phase 2)
233
        // t5=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
234
        // t6=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
235
        // t7=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
236
        // t8=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
237
238
0
        (
239
0
            _mm256_permute4x64_epi64(t1, 0x8D),
240
0
            _mm256_permute4x64_epi64(t2, 0x8D),
241
0
            _mm256_permute4x64_epi64(t3, 0xD8),
242
0
            _mm256_permute4x64_epi64(t4, 0xD8),
243
0
        )
244
0
    }
245
246
    // In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
247
    #[inline(always)]
248
0
    unsafe fn do_dct(
249
0
        first_pass: bool,
250
0
        i1: __m256i,
251
0
        i2: __m256i,
252
0
        i3: __m256i,
253
0
        i4: __m256i,
254
0
    ) -> (__m256i, __m256i, __m256i, __m256i) {
255
0
        let t5 = _mm256_sub_epi16(i1, i4); // data1_0 - data6_7 = tmp6_7
256
0
        let t6 = _mm256_add_epi16(i1, i4); // data1_0 + data6_7 = tmp1_0
257
0
        let t7 = _mm256_add_epi16(i2, i3); // data3_2 + data4_5 = tmp3_2
258
0
        let t8 = _mm256_sub_epi16(i2, i3); // data3_2 - data4_5 = tmp4_5
259
260
        // Even part
261
262
0
        let t6 = _mm256_permute2x128_si256(t6, t6, 0x01); // t6=tmp0_1
263
0
        let t1 = _mm256_add_epi16(t6, t7); // t1 = tmp0_1 + tmp3_2 = tmp10_11
264
0
        let t6 = _mm256_sub_epi16(t6, t7); // t6 = tmp0_1 - tmp3_2 = tmp13_12
265
266
0
        let t7 = _mm256_permute2x128_si256(t1, t1, 0x01); // t7 = tmp11_10
267
0
        let t1 = _mm256_sign_epi16(
268
0
            t1,
269
0
            _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1),
270
        ); // tmp10_neg11
271
272
0
        let t7 = _mm256_add_epi16(t7, t1); // t7 = (tmp10 + tmp11)_(tmp10 - tmp11)
273
274
0
        let t1 = if first_pass {
275
0
            _mm256_slli_epi16(t7, PASS1_BITS)
276
        } else {
277
0
            let t7 = _mm256_add_epi16(t7, PW_DESCALE_P2X());
278
0
            _mm256_srai_epi16(t7, PASS1_BITS)
279
        };
280
281
        // (Original)
282
        // z1 = (tmp12 + tmp13) * 0.541196100;
283
        // data2 = z1 + tmp13 * 0.765366865;
284
        // data6 = z1 + tmp12 * -1.847759065;
285
        //
286
        // (This implementation)
287
        // data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
288
        // data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
289
290
0
        let t7 = _mm256_permute2x128_si256(t6, t6, 0x01); // t7 = tmp12_13
291
0
        let t2 = _mm256_unpacklo_epi16(t6, t7);
292
0
        let t6 = _mm256_unpackhi_epi16(t6, t7);
293
294
0
        let t2 = _mm256_madd_epi16(t2, PW_F130_F054_MF130_F054()); // t2 = data2_6L
295
0
        let t6 = _mm256_madd_epi16(t6, PW_F130_F054_MF130_F054()); // t6 = data2_6H
296
297
0
        let t2 = _mm256_add_epi32(t2, PD_DESCALE_P(first_pass));
298
0
        let t6 = _mm256_add_epi32(t6, PD_DESCALE_P(first_pass));
299
300
0
        let t2 = if first_pass {
301
0
            _mm256_srai_epi32(t2, DESCALE_P1)
302
        } else {
303
0
            _mm256_srai_epi32(t2, DESCALE_P2)
304
        };
305
0
        let t6 = if first_pass {
306
0
            _mm256_srai_epi32(t6, DESCALE_P1)
307
        } else {
308
0
            _mm256_srai_epi32(t6, DESCALE_P2)
309
        };
310
311
0
        let t3 = _mm256_packs_epi32(t2, t6); // t6 = data2_6
312
313
        // Odd part
314
315
0
        let t7 = _mm256_add_epi16(t8, t5); // t7 = tmp4_5 + tmp6_7 = z3_4
316
317
        // (Original)
318
        // z5 = (z3 + z4) * 1.175875602;
319
        // z3 = z3 * -1.961570560;
320
        // z4 = z4 * -0.390180644;
321
        // z3 += z5;
322
        // z4 += z5;
323
        //
324
        // (This implementation)
325
        // z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
326
        // z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
327
328
0
        let t2 = _mm256_permute2x128_si256(t7, t7, 0x01); // t2 = z4_3
329
0
        let t6 = _mm256_unpacklo_epi16(t7, t2);
330
0
        let t7 = _mm256_unpackhi_epi16(t7, t2);
331
332
0
        let t6 = _mm256_madd_epi16(t6, PW_MF078_F117_F078_F117()); // t6 = z3_4L
333
0
        let t7 = _mm256_madd_epi16(t7, PW_MF078_F117_F078_F117()); // t7 = z3_4H
334
335
        // (Original)
336
        // z1 = tmp4 + tmp7;
337
        // z2 = tmp5 + tmp6;
338
        // tmp4 = tmp4 * 0.298631336;
339
        // tmp5 = tmp5 * 2.053119869;
340
        // tmp6 = tmp6 * 3.072711026;
341
        // tmp7 = tmp7 * 1.501321110;
342
        // z1 = z1 * -0.899976223;
343
        // z2 = z2 * -2.562915447;
344
        // data7 = tmp4 + z1 + z3;
345
        // data5 = tmp5 + z2 + z4;
346
        // data3 = tmp6 + z2 + z3;
347
        // data1 = tmp7 + z1 + z4;
348
        //
349
        // (This implementation)
350
        // tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
351
        // tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
352
        // tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
353
        // tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
354
        // data7 = tmp4 + z3;
355
        // data5 = tmp5 + z4;
356
        // data3 = tmp6 + z3;
357
        // data1 = tmp7 + z4;
358
359
0
        let t4 = _mm256_permute2x128_si256(t5, t5, 0x01); // t4 = tmp7_6
360
0
        let t2 = _mm256_unpacklo_epi16(t8, t4);
361
0
        let t4 = _mm256_unpackhi_epi16(t8, t4);
362
363
0
        let t2 = _mm256_madd_epi16(t2, PW_MF060_MF089_MF050_MF256()); //t2 = tmp4_5L
364
0
        let t4 = _mm256_madd_epi16(t4, PW_MF060_MF089_MF050_MF256()); // t4 = tmp4_5H
365
366
0
        let t2 = _mm256_add_epi32(t2, t6); // t2 = data7_5L
367
0
        let t4 = _mm256_add_epi32(t4, t7); // t4 = data7_5H
368
369
0
        let t2 = _mm256_add_epi32(t2, PD_DESCALE_P(first_pass));
370
0
        let t4 = _mm256_add_epi32(t4, PD_DESCALE_P(first_pass));
371
372
0
        let t2 = if first_pass {
373
0
            _mm256_srai_epi32(t2, DESCALE_P1)
374
        } else {
375
0
            _mm256_srai_epi32(t2, DESCALE_P2)
376
        };
377
0
        let t4 = if first_pass {
378
0
            _mm256_srai_epi32(t4, DESCALE_P1)
379
        } else {
380
0
            _mm256_srai_epi32(t4, DESCALE_P2)
381
        };
382
383
0
        let t4 = _mm256_packs_epi32(t2, t4); // t4 = data7_5
384
385
0
        let t2 = _mm256_permute2x128_si256(t8, t8, 0x01); // t2 = tmp5_4
386
387
0
        let t8 = _mm256_unpacklo_epi16(t5, t2);
388
0
        let t5 = _mm256_unpackhi_epi16(t5, t2);
389
390
0
        let t8 = _mm256_madd_epi16(t8, PW_F050_MF256_F060_MF089()); // t8 = tmp6_7L
391
0
        let t5 = _mm256_madd_epi16(t5, PW_F050_MF256_F060_MF089()); // t5 = tmp6_7H
392
393
0
        let t8 = _mm256_add_epi32(t8, t6); // t8 = data3_1L
394
0
        let t5 = _mm256_add_epi32(t5, t7); // t5 = data3_1H
395
396
0
        let t8 = _mm256_add_epi32(t8, PD_DESCALE_P(first_pass));
397
0
        let t5 = _mm256_add_epi32(t5, PD_DESCALE_P(first_pass));
398
399
0
        let t8 = if first_pass {
400
0
            _mm256_srai_epi32(t8, DESCALE_P1)
401
        } else {
402
0
            _mm256_srai_epi32(t8, DESCALE_P2)
403
        };
404
0
        let t5 = if first_pass {
405
0
            _mm256_srai_epi32(t5, DESCALE_P1)
406
        } else {
407
0
            _mm256_srai_epi32(t5, DESCALE_P2)
408
        };
409
410
0
        let t2 = _mm256_packs_epi32(t8, t5); // t2 = data3_1
411
412
0
        (t1, t2, t3, t4)
413
0
    }
414
415
0
    let in_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr());
416
417
0
    let ymm4 = _mm256_loadu_si256(in_data);
418
0
    let ymm5 = _mm256_loadu_si256(in_data.add(1));
419
0
    let ymm6 = _mm256_loadu_si256(in_data.add(2));
420
0
    let ymm7 = _mm256_loadu_si256(in_data.add(3));
421
422
    // ---- Pass 1: process rows.
423
    // ymm4=(00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17)
424
    // ymm5=(20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37)
425
    // ymm6=(40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57)
426
    // ymm7=(60 61 62 63 64 65 66 67  70 71 72 73 74 75 76 77)
427
428
0
    let ymm0 = _mm256_permute2x128_si256(ymm4, ymm6, 0x20);
429
0
    let ymm1 = _mm256_permute2x128_si256(ymm4, ymm6, 0x31);
430
0
    let ymm2 = _mm256_permute2x128_si256(ymm5, ymm7, 0x20);
431
0
    let ymm3 = _mm256_permute2x128_si256(ymm5, ymm7, 0x31);
432
433
    // ymm0=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
434
    // ymm1=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
435
    // ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
436
    // ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
437
438
0
    let (ymm0, ymm1, ymm2, ymm3) = do_transpose(ymm0, ymm1, ymm2, ymm3);
439
0
    let (ymm0, ymm1, ymm2, ymm3) = do_dct(true, ymm0, ymm1, ymm2, ymm3);
440
441
    // ---- Pass 2: process columns.
442
443
0
    let ymm4 = _mm256_permute2x128_si256(ymm1, ymm3, 0x20); // ymm4=data3_7
444
0
    let ymm1 = _mm256_permute2x128_si256(ymm1, ymm3, 0x31); // ymm1=data1_5
445
446
0
    let (ymm0, ymm1, ymm2, ymm4) = do_transpose(ymm0, ymm1, ymm2, ymm4);
447
0
    let (ymm0, ymm1, ymm2, ymm4) = do_dct(false, ymm0, ymm1, ymm2, ymm4);
448
449
0
    let ymm3 = _mm256_permute2x128_si256(ymm0, ymm1, 0x30); // ymm3=data0_1
450
0
    let ymm5 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20); // ymm5=data2_3
451
0
    let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5
452
0
    let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7
453
454
0
    let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr());
455
456
0
    _mm256_storeu_si256(out_data, ymm3);
457
0
    _mm256_storeu_si256(out_data.add(1), ymm5);
458
0
    _mm256_storeu_si256(out_data.add(2), ymm6);
459
0
    _mm256_storeu_si256(out_data.add(3), ymm7);
460
0
}