Coverage Report

Created: 2025-07-11 07:25

/rust/registry/src/index.crates.io-6f17d22bba15001f/zune-jpeg-0.4.19/src/idct/scalar.rs
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2023.
3
 *
4
 * This software is free software;
5
 *
6
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7
 */
8
9
//! Platform independent IDCT algorithm
10
//!
11
//! Not as fast as AVX one.
12
13
/// Bias added to every even-part term of the row pass before the final `>> 17`.
///
/// * `128 << 17` - the +128 level shift (maps -128..=127 onto 0..=255), pre-scaled
///   so that it survives the shift.
/// * `1 << 16` (65536) - half of `1 << 17`, so the shift rounds to nearest.
/// * `1 << 9` (512) - NOTE(review): not explained anywhere in this file; it is the
///   same value the column pass uses as its rounding term - confirm intent.
const SCALE_BITS: i32 = (128 << 17) + (1 << 16) + (1 << 9);
14
15
#[allow(unused_assignments)]
16
#[allow(
17
    clippy::too_many_lines,
18
    clippy::op_ref,
19
    clippy::cast_possible_truncation
20
)]
21
0
pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
22
0
    // Temporary variables.
23
0
24
0
    let mut pos = 0;
25
0
26
0
    let mut i = 0;
27
0
    // Don't check for zeroes inside loop, lift it and check outside
28
0
    // we want to accelerate the case with 63 0 ac coeff
29
0
    if &in_vector[1..] == &[0_i32; 63] {
30
0
        // okay then if you work, yay, let's write you really quick
31
0
        let coeff = [((in_vector[0] + 4 + 1024) >> 3).clamp(0, 255) as i16; 8];
32
0
33
0
        macro_rules! store {
34
            ($index:tt) => {
35
                // position of the MCU
36
                let mcu_stride: &mut [i16; 8] = out_vector
37
                    .get_mut($index..$index + 8)
38
                    .unwrap()
39
                    .try_into()
40
                    .unwrap();
41
                // copy coefficients
42
                mcu_stride.copy_from_slice(&coeff);
43
                // increment index
44
                $index += stride;
45
            };
46
0
        }
47
0
        // write to four positions
48
0
        store!(pos);
49
0
        store!(pos);
50
0
        store!(pos);
51
0
        store!(pos);
52
0
53
0
        store!(pos);
54
0
        store!(pos);
55
0
        store!(pos);
56
0
        store!(pos);
57
0
    } else {
58
        // because the compiler fails to see that it can be auto_vectorised so i'll
59
        // leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9
60
0
        for ptr in 0..8 {
61
0
            let p2 = in_vector[ptr + 16];
62
0
            let p3 = in_vector[ptr + 48];
63
0
64
0
            let p1 = (p2 + p3).wrapping_mul(2217);
65
0
66
0
            let t2 = p1 + p3 * -7567;
67
0
            let t3 = p1 + p2 * 3135;
68
0
69
0
            let p2 = in_vector[ptr];
70
0
            let p3 = in_vector[32 + ptr];
71
0
            let t0 = fsh(p2 + p3);
72
0
            let t1 = fsh(p2 - p3);
73
0
74
0
            let x0 = t0 + t3 + 512;
75
0
            let x3 = t0 - t3 + 512;
76
0
            let x1 = t1 + t2 + 512;
77
0
            let x2 = t1 - t2 + 512;
78
0
79
0
            // odd part
80
0
            let mut t0 = in_vector[ptr + 56];
81
0
            let mut t1 = in_vector[ptr + 40];
82
0
            let mut t2 = in_vector[ptr + 24];
83
0
            let mut t3 = in_vector[ptr + 8];
84
0
85
0
            let p3 = t0 + t2;
86
0
            let p4 = t1 + t3;
87
0
            let p1 = t0 + t3;
88
0
            let p2 = t1 + t2;
89
0
            let p5 = (p3 + p4) * 4816;
90
0
91
0
            t0 *= 1223;
92
0
            t1 *= 8410;
93
0
            t2 *= 12586;
94
0
            t3 *= 6149;
95
0
96
0
            let p1 = p5 + p1 * -3685;
97
0
            let p2 = p5 + p2 * -10497;
98
0
            let p3 = p3 * -8034;
99
0
            let p4 = p4 * -1597;
100
0
101
0
            t3 += p1 + p4;
102
0
            t2 += p2 + p3;
103
0
            t1 += p2 + p4;
104
0
            t0 += p1 + p3;
105
0
106
0
            // constants scaled things up by 1<<12; let's bring them back
107
0
            // down, but keep 2 extra bits of precision
108
0
            in_vector[ptr] = (x0 + t3) >> 10;
109
0
            in_vector[ptr + 8] = (x1 + t2) >> 10;
110
0
            in_vector[ptr + 16] = (x2 + t1) >> 10;
111
0
            in_vector[ptr + 24] = (x3 + t0) >> 10;
112
0
            in_vector[ptr + 32] = (x3 - t0) >> 10;
113
0
            in_vector[ptr + 40] = (x2 - t1) >> 10;
114
0
            in_vector[ptr + 48] = (x1 - t2) >> 10;
115
0
            in_vector[ptr + 56] = (x0 - t3) >> 10;
116
0
        }
117
118
        // This is vectorised in architectures supporting SSE 4.1
119
0
        while i < 64 {
120
0
            // We won't try to short circuit here because it rarely works
121
0
122
0
            // Even part
123
0
            let p2 = in_vector[i + 2];
124
0
            let p3 = in_vector[i + 6];
125
0
126
0
            let p1 = (p2 + p3) * 2217;
127
0
            let t2 = p1 + p3 * -7567;
128
0
            let t3 = p1 + p2 * 3135;
129
0
130
0
            let p2 = in_vector[i];
131
0
            let p3 = in_vector[i + 4];
132
0
133
0
            let t0 = fsh(p2 + p3);
134
0
            let t1 = fsh(p2 - p3);
135
0
            // constants scaled things up by 1<<12, plus we had 1<<2 from first
136
0
            // loop, plus horizontal and vertical each scale by sqrt(8) so together
137
0
            // we've got an extra 1<<3, so 1<<17 total we need to remove.
138
0
            // so we want to round that, which means adding 0.5 * 1<<17,
139
0
            // aka 65536. Also, we'll end up with -128 to 127 that we want
140
0
            // to encode as 0..255 by adding 128, so we'll add that before the shift
141
0
            let x0 = t0 + t3 + SCALE_BITS;
142
0
            let x3 = t0 - t3 + SCALE_BITS;
143
0
            let x1 = t1 + t2 + SCALE_BITS;
144
0
            let x2 = t1 - t2 + SCALE_BITS;
145
0
            // odd part
146
0
            let mut t0 = in_vector[i + 7];
147
0
            let mut t1 = in_vector[i + 5];
148
0
            let mut t2 = in_vector[i + 3];
149
0
            let mut t3 = in_vector[i + 1];
150
0
151
0
            let p3 = t0 + t2;
152
0
            let p4 = t1 + t3;
153
0
            let p1 = t0 + t3;
154
0
            let p2 = t1 + t2;
155
0
            let p5 = (p3 + p4) * f2f(1.175875602);
156
0
157
0
            t0 = t0.wrapping_mul(1223);
158
0
            t1 = t1.wrapping_mul(8410);
159
0
            t2 = t2.wrapping_mul(12586);
160
0
            t3 = t3.wrapping_mul(6149);
161
0
162
0
            let p1 = p5 + p1 * -3685;
163
0
            let p2 = p5 + p2 * -10497;
164
0
            let p3 = p3 * -8034;
165
0
            let p4 = p4 * -1597;
166
0
167
0
            t3 += p1 + p4;
168
0
            t2 += p2 + p3;
169
0
            t1 += p2 + p4;
170
0
            t0 += p1 + p3;
171
0
172
0
            let out: &mut [i16; 8] = out_vector
173
0
                .get_mut(pos..pos + 8)
174
0
                .unwrap()
175
0
                .try_into()
176
0
                .unwrap();
177
0
178
0
            out[0] = clamp((x0 + t3) >> 17);
179
0
            out[1] = clamp((x1 + t2) >> 17);
180
0
            out[2] = clamp((x2 + t1) >> 17);
181
0
            out[3] = clamp((x3 + t0) >> 17);
182
0
            out[4] = clamp((x3 - t0) >> 17);
183
0
            out[5] = clamp((x2 - t1) >> 17);
184
0
            out[6] = clamp((x1 - t2) >> 17);
185
0
            out[7] = clamp((x0 - t3) >> 17);
186
0
187
0
            i += 8;
188
0
189
0
            pos += stride;
190
0
        }
191
    }
192
0
}
193
194
/// Convert a floating-point constant to fixed point with 12 fractional bits.
///
/// Computes `x * 4096 + 0.5` and truncates toward zero, which rounds to the
/// nearest integer for non-negative `x` (e.g. `1.175875602` becomes `4816`).
#[inline]
// The float-to-int cast is the intended truncation step of the rounding.
#[allow(clippy::cast_possible_truncation)]
fn f2f(x: f32) -> i32 {
    (x * 4096.0 + 0.5) as i32
}
200
201
/// Multiply an integer by 4096 (shift left by 12 bits).
///
/// Bits shifted out of the top are discarded; the shift itself never panics.
#[inline]
fn fsh(x: i32) -> i32 {
    x << 12
}
206
207
/// Clamp a value to the range `0..=255` and return it as an `i16`.
#[inline]
// The value is within 0..=255 after clamping, so the narrowing cast is lossless.
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    a.clamp(0, 255) as i16
}