Coverage Report

Created: 2025-12-14 07:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zune-jpeg-0.5.6/src/idct/avx2.rs
Line
Count
Source
1
/*
2
 * Copyright (c) 2023.
3
 *
4
 * This software is free software;
5
 *
6
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7
 */
8
9
#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
10
//! AVX optimised IDCT.
11
//!
12
//! Okay not that optimised.
13
//!
14
//!
15
//! # The implementation
16
//! The implementation is neatly broken down into two operations.
17
//!
18
//! 1. Test for zeroes
19
//! > There is a shortcut method for idct where when all AC values are zero, we can get the answer really quickly.
20
//!  by scaling the 1/8th of the DCT coefficient of the block to the whole block and level shifting.
21
//!
22
//! 2. If above fails, we proceed to carry out IDCT as a two pass one dimensional algorithm.
23
//! It does two whole scans where it carries out IDCT on all items
24
//! After each successive scan, data is transposed in register (thank you, x86 SIMD powers), and the second
25
//! pass is carried out.
26
//!
27
//! The code is not super optimized, it produces bit identical results with scalar code hence it's
28
//! `mm256_add_epi16`
29
//! and it also has the advantage of making this implementation easy to maintain.
30
31
#![cfg(feature = "x86")]
32
#![allow(dead_code)]
33
34
#[cfg(target_arch = "x86")]
35
use core::arch::x86::*;
36
#[cfg(target_arch = "x86_64")]
37
use core::arch::x86_64::*;
38
39
use crate::unsafe_utils::{transpose, YmmRegister};
40
41
const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);
42
43
// Pack i32 to i16's,
44
// clamp them to be between 0-255
45
// Undo shuffling
46
// Store back to array
47
macro_rules! permute_store {
48
    ($x:tt,$y:tt,$index:tt,$out:tt,$stride:tt) => {
49
        let a = _mm256_packs_epi32($x, $y);
50
51
        // Clamp the values after packing, we can clamp more values at once
52
        let b = clamp_avx(a);
53
54
        // Undo shuffling
55
        let c = _mm256_permute4x64_epi64(b, shuffle(3, 1, 2, 0));
56
57
        // store first vector
58
        _mm_storeu_si128(
59
            ($out)
60
                .get_mut($index..$index + 8)
61
                .unwrap()
62
                .as_mut_ptr()
63
                .cast(),
64
            _mm256_extractf128_si256::<0>(c),
65
        );
66
        $index += $stride;
67
        // second vector
68
        _mm_storeu_si128(
69
            ($out)
70
                .get_mut($index..$index + 8)
71
                .unwrap()
72
                .as_mut_ptr()
73
                .cast(),
74
            _mm256_extractf128_si256::<1>(c),
75
        );
76
        $index += $stride;
77
    };
78
}
79
80
#[target_feature(enable = "avx2")]
81
#[allow(
82
    clippy::too_many_lines,
83
    clippy::cast_possible_truncation,
84
    clippy::similar_names,
85
    clippy::op_ref,
86
    unused_assignments,
87
    clippy::zero_prefixed_literal
88
)]
89
0
pub unsafe fn idct_avx2(
90
0
    in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize,
91
0
) {
92
0
    let mut pos = 0;
93
94
    // load into registers
95
    //
96
    // We sign extend i16's to i32's and calculate them with extended precision and
97
    // later reduce them to i16's when we are done carrying out IDCT
98
99
0
    let rw0 = _mm256_loadu_si256(in_vector[00..].as_ptr().cast());
100
0
    let rw1 = _mm256_loadu_si256(in_vector[08..].as_ptr().cast());
101
0
    let rw2 = _mm256_loadu_si256(in_vector[16..].as_ptr().cast());
102
0
    let rw3 = _mm256_loadu_si256(in_vector[24..].as_ptr().cast());
103
0
    let rw4 = _mm256_loadu_si256(in_vector[32..].as_ptr().cast());
104
0
    let rw5 = _mm256_loadu_si256(in_vector[40..].as_ptr().cast());
105
0
    let rw6 = _mm256_loadu_si256(in_vector[48..].as_ptr().cast());
106
0
    let rw7 = _mm256_loadu_si256(in_vector[56..].as_ptr().cast());
107
108
    // Forward DCT and quantization may cause all the AC terms to be zero, for such
109
    // cases we can try to accelerate it
110
111
    // Basically the poop is that whenever the array has 63 zeroes, its idct is
112
    // (arr[0]>>3)or (arr[0]/8) propagated to all the elements.
113
    // We first test to see if the array contains zero elements and if it does, we go the
114
    // short way.
115
    //
116
    // This reduces IDCT overhead from about 39% to 18%, almost half
117
118
    // Do another load for the first row, we don't want to check DC value, because
119
    // we only care about AC terms
120
0
    let rw8 = _mm256_loadu_si256(in_vector[1..].as_ptr().cast());
121
122
0
    let zero = _mm256_setzero_si256();
123
124
0
    let mut non_zero = 0;
125
126
0
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw8, zero));
127
0
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw1, zero));
128
0
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw2, zero));
129
0
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw3, zero));
130
131
0
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw4, zero));
132
0
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw5, zero));
133
0
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw6, zero));
134
0
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw7, zero));
135
136
0
    if non_zero == -8 {
137
        // AC terms all zero, idct of the block is ( coeff[0] * qt[0] )/8 + 128 (bias)
138
        // (and clamped to 255)
139
        // Round by adding 0.5 * (1 << 3) and offset by adding (128 << 3) before scaling
140
0
        let coeff = ((in_vector[0] + 4 + 1024) >> 3).clamp(0, 255) as i16;
141
0
        let idct_value = _mm_set1_epi16(coeff);
142
143
        macro_rules! store {
144
            ($pos:tt,$value:tt) => {
145
                // store
146
                _mm_storeu_si128(
147
                    out_vector
148
0
                        .get_mut($pos..$pos + 8)
149
                        .unwrap()
150
                        .as_mut_ptr()
151
                        .cast(),
152
                    $value,
153
                );
154
                $pos += stride;
155
            };
156
        }
157
0
        store!(pos, idct_value);
158
0
        store!(pos, idct_value);
159
0
        store!(pos, idct_value);
160
0
        store!(pos, idct_value);
161
162
0
        store!(pos, idct_value);
163
0
        store!(pos, idct_value);
164
0
        store!(pos, idct_value);
165
0
        store!(pos, idct_value);
166
167
0
        return;
168
0
    }
169
170
0
    let mut row0 = YmmRegister { mm256: rw0 };
171
0
    let mut row1 = YmmRegister { mm256: rw1 };
172
0
    let mut row2 = YmmRegister { mm256: rw2 };
173
0
    let mut row3 = YmmRegister { mm256: rw3 };
174
175
0
    let mut row4 = YmmRegister { mm256: rw4 };
176
0
    let mut row5 = YmmRegister { mm256: rw5 };
177
0
    let mut row6 = YmmRegister { mm256: rw6 };
178
0
    let mut row7 = YmmRegister { mm256: rw7 };
179
180
    macro_rules! dct_pass {
181
        ($SCALE_BITS:tt,$scale:tt) => {
182
            // There are a lot of ways to do this
183
            // but to keep it simple(and beautiful), ill make a direct translation of the
184
            // scalar code to also make this code fully transparent(this version and the non
185
            // avx one should produce identical code.)
186
187
            // even part
188
            let p1 = (row2 + row6) * 2217;
189
190
            let mut t2 = p1 + row6 * -7567;
191
            let mut t3 = p1 + row2 * 3135;
192
193
            let mut t0 = YmmRegister {
194
                mm256: _mm256_slli_epi32((row0 + row4).mm256, 12),
195
            };
196
            let mut t1 = YmmRegister {
197
                mm256: _mm256_slli_epi32((row0 - row4).mm256, 12),
198
            };
199
200
            let x0 = t0 + t3 + $SCALE_BITS;
201
            let x3 = t0 - t3 + $SCALE_BITS;
202
            let x1 = t1 + t2 + $SCALE_BITS;
203
            let x2 = t1 - t2 + $SCALE_BITS;
204
205
            let p3 = row7 + row3;
206
            let p4 = row5 + row1;
207
            let p1 = row7 + row1;
208
            let p2 = row5 + row3;
209
            let p5 = (p3 + p4) * 4816;
210
211
            t0 = row7 * 1223;
212
            t1 = row5 * 8410;
213
            t2 = row3 * 12586;
214
            t3 = row1 * 6149;
215
216
            let p1 = p5 + p1 * -3685;
217
            let p2 = p5 + (p2 * -10497);
218
            let p3 = p3 * -8034;
219
            let p4 = p4 * -1597;
220
221
            t3 += p1 + p4;
222
            t2 += p2 + p3;
223
            t1 += p2 + p4;
224
            t0 += p1 + p3;
225
226
            row0.mm256 = _mm256_srai_epi32((x0 + t3).mm256, $scale);
227
            row1.mm256 = _mm256_srai_epi32((x1 + t2).mm256, $scale);
228
            row2.mm256 = _mm256_srai_epi32((x2 + t1).mm256, $scale);
229
            row3.mm256 = _mm256_srai_epi32((x3 + t0).mm256, $scale);
230
231
            row4.mm256 = _mm256_srai_epi32((x3 - t0).mm256, $scale);
232
            row5.mm256 = _mm256_srai_epi32((x2 - t1).mm256, $scale);
233
            row6.mm256 = _mm256_srai_epi32((x1 - t2).mm256, $scale);
234
            row7.mm256 = _mm256_srai_epi32((x0 - t3).mm256, $scale);
235
        };
236
    }
237
238
    // Process rows
239
0
    dct_pass!(512, 10);
240
0
    transpose(
241
0
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7,
242
    );
243
244
    // process columns
245
0
    dct_pass!(SCALE_BITS, 17);
246
0
    transpose(
247
0
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7,
248
    );
249
    // Pack and write the values back to the array
250
0
    permute_store!((row0.mm256), (row1.mm256), pos, out_vector, stride);
251
0
    permute_store!((row2.mm256), (row3.mm256), pos, out_vector, stride);
252
0
    permute_store!((row4.mm256), (row5.mm256), pos, out_vector, stride);
253
0
    permute_store!((row6.mm256), (row7.mm256), pos, out_vector, stride);
254
0
}
255
256
257
#[target_feature(enable = "avx2")]
258
#[allow(
259
    clippy::too_many_lines,
260
    clippy::cast_possible_truncation,
261
    clippy::similar_names,
262
    clippy::op_ref,
263
    unused_assignments,
264
    clippy::zero_prefixed_literal
265
)]
266
0
pub unsafe fn idct_avx2_4x4(
267
0
    in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize,
268
0
) {
269
0
    let rw0 = _mm256_loadu_si256(in_vector[00..].as_ptr().cast());
270
0
    let rw1 = _mm256_loadu_si256(in_vector[08..].as_ptr().cast());
271
0
    let rw2 = _mm256_loadu_si256(in_vector[16..].as_ptr().cast());
272
0
    let rw3 = _mm256_loadu_si256(in_vector[24..].as_ptr().cast());
273
274
0
    let mut row0 = YmmRegister { mm256: rw0 };
275
0
    let mut row1 = YmmRegister { mm256: rw1 };
276
0
    let mut row2 = YmmRegister { mm256: rw2 };
277
0
    let mut row3 = YmmRegister { mm256: rw3 };
278
279
0
    let mut row4 = YmmRegister { mm256: rw0 };
280
0
    let mut row5 = YmmRegister { mm256: rw0 };
281
0
    let mut row6 = YmmRegister { mm256: rw0 };
282
0
    let mut row7 = YmmRegister { mm256: rw0 };
283
284
0
    {
285
0
        row0.mm256 = _mm256_slli_epi32(row0.mm256, 12);
286
0
        row0 += 512;
287
0
288
0
        let i2 = row2;
289
0
290
0
        let p1 = i2 * 2217;
291
0
        let p3 = i2 * 5352;
292
0
293
0
        let x0 = row0 + p3;
294
0
        let x1 = row0 + p1;
295
0
        let x2 = row0 - p1;
296
0
        let x3 = row0 - p3;
297
0
298
0
        // odd part
299
0
        let i4 = row3;
300
0
        let i3 = row1;
301
0
302
0
        let p5 = (i4 + i3) * 4816;
303
0
304
0
        let p1 = p5 + i3 * -3685;
305
0
        let p2 = p5 + i4 * -10497;
306
0
307
0
        let t3 = p5 + i3 * 867;
308
0
        let t2 = p5 + i4 * -5945;
309
0
310
0
        let t1 = p2 + i3 * -1597;
311
0
        let t0 = p1 + i4 * -8034;
312
0
313
0
        row0.mm256 = _mm256_srai_epi32((x0 + t3).mm256, 10);
314
0
        row1.mm256 = _mm256_srai_epi32((x1 + t2).mm256, 10);
315
0
        row2.mm256 = _mm256_srai_epi32((x2 + t1).mm256, 10);
316
0
        row3.mm256 = _mm256_srai_epi32((x3 + t0).mm256, 10);
317
0
318
0
        row4.mm256 = _mm256_srai_epi32((x3 - t0).mm256, 10);
319
0
        row5.mm256 = _mm256_srai_epi32((x2 - t1).mm256, 10);
320
0
        row6.mm256 = _mm256_srai_epi32((x1 - t2).mm256, 10);
321
0
        row7.mm256 = _mm256_srai_epi32((x0 - t3).mm256, 10);
322
0
    }
323
324
0
    transpose(
325
0
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7,
326
    );
327
328
0
    {
329
0
        let i2 = row2;
330
0
        let i0 = row0;
331
0
332
0
        row0.mm256 = _mm256_slli_epi32(i0.mm256, 12);
333
0
        let t0 = row0 + SCALE_BITS;
334
0
335
0
        let t2 = i2 * 2217;
336
0
        let t3 = i2 * 5352;
337
0
338
0
        // constants scaled things up by 1<<12, plus we had 1<<2 from first
339
0
        // loop, plus horizontal and vertical each scale by sqrt(8) so together
340
0
        // we've got an extra 1<<3, so 1<<17 total we need to remove.
341
0
        // so we want to round that, which means adding 0.5 * 1<<17,
342
0
        // aka 65536. Also, we'll end up with -128 to 127 that we want
343
0
        // to encode as 0..255 by adding 128, so we'll add that before the shift
344
0
        // Rounding constant is already added into `t0`
345
0
        let x0 = t0 + t3;
346
0
        let x3 = t0 - t3;
347
0
        let x1 = t0 + t2;
348
0
        let x2 = t0 - t2;
349
0
350
0
        // odd part
351
0
        let i3 = row3;
352
0
        let i1 = row1;
353
0
354
0
        let p5 = (i3 + i1) * 4816;
355
0
356
0
        let p1 = p5 + i1 * -3685;
357
0
        let p2 = p5 + i3 * -10497;
358
0
359
0
        let t3 = p5 + i1 * 867;
360
0
        let t2 = p5 + i3 * -5945;
361
0
362
0
        let t1 = p2 + i1 * -1597;
363
0
        let t0 = p1 + i3 * -8034;
364
0
365
0
        row0.mm256 = _mm256_srai_epi32((x0 + t3).mm256, 17);
366
0
        row1.mm256 = _mm256_srai_epi32((x1 + t2).mm256, 17);
367
0
        row2.mm256 = _mm256_srai_epi32((x2 + t1).mm256, 17);
368
0
        row3.mm256 = _mm256_srai_epi32((x3 + t0).mm256, 17);
369
0
        row4.mm256 = _mm256_srai_epi32((x3 - t0).mm256, 17);
370
0
        row5.mm256 = _mm256_srai_epi32((x2 - t1).mm256, 17);
371
0
        row6.mm256 = _mm256_srai_epi32((x1 - t2).mm256, 17);
372
0
        row7.mm256 = _mm256_srai_epi32((x0 - t3).mm256, 17);
373
0
    }
374
375
0
    transpose(
376
0
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7,
377
    );
378
379
0
    let mut pos = 0;
380
381
    // Pack and write the values back to the array
382
0
    permute_store!((row0.mm256), (row1.mm256), pos, out_vector, stride);
383
0
    permute_store!((row2.mm256), (row3.mm256), pos, out_vector, stride);
384
0
    permute_store!((row4.mm256), (row5.mm256), pos, out_vector, stride);
385
0
    permute_store!((row6.mm256), (row7.mm256), pos, out_vector, stride);
386
0
}
387
388
#[inline]
389
#[target_feature(enable = "avx2")]
390
0
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
391
0
    let min_s = _mm256_set1_epi16(0);
392
0
    let max_s = _mm256_set1_epi16(255);
393
394
0
    let max_v = _mm256_max_epi16(reg, min_s); //max(a,0)
395
0
    let min_v = _mm256_min_epi16(max_v, max_s); //min(max(a,0),255)
396
0
    return min_v;
397
0
}
398
399
/// A copy of `_MM_SHUFFLE()` that doesn't require
400
/// a nightly compiler
401
#[inline]
402
0
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
403
0
    ((z << 6) | (y << 4) | (x << 2) | w)
404
0
}