/rust/registry/src/index.crates.io-6f17d22bba15001f/zune-jpeg-0.4.19/src/idct/scalar.rs

Source (jump to first uncovered line)
/*
 * Copyright (c) 2023.
 *
 * This software is free software;
 *
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
 */

//! Platform independent IDCT algorithm
//!
//! Slower than the AVX implementation, but usable on every platform.

/// Bias added to every row-pass value in `idct_int` before the final `>> 17`.
///
/// It is the sum of three terms:
/// - `128 << 17`: becomes `+128` after the shift, moving the signed result
///   into the `0..=255` range,
/// - `1 << 16` (65536): half of `1 << 17`, so the shift rounds instead of
///   truncating,
/// - `1 << 9` (512): an additional small offset (its purpose is not stated
///   in this file).
const SCALE_BITS: i32 = (128 << 17) + (1 << 16) + (1 << 9);

/// Scalar, integer-only inverse DCT of one 8x8 block.
///
/// `in_vector` holds the 64 coefficients of the block in row-major order
/// (index `row * 8 + column`). The result is clamped to `0..=255` and written
/// to `out_vector` as eight rows of eight samples; consecutive rows start
/// `stride` elements apart, the first one at index 0.
///
/// When every coefficient except `in_vector[0]` is zero, the block is flat and
/// a single value is computed and replicated. Otherwise a column pass is run
/// in place over `in_vector` (so its contents are overwritten with
/// intermediate values), followed by a row pass that produces the output.
///
/// All additions, subtractions and multiplications wrap on overflow. This
/// matches what a release build computes anyway, and keeps builds with
/// overflow checks enabled from panicking on extreme coefficient values.
///
/// # Panics
///
/// Panics if `out_vector` does not contain eight elements starting at each of
/// the offsets `0, stride, 2 * stride, ..., 7 * stride`.
#[allow(unused_assignments)]
#[allow(
    clippy::too_many_lines,
    clippy::op_ref,
    clippy::cast_possible_truncation
)]
pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    // Start of the output row currently being written.
    let mut pos = 0;

    // Don't check for zeroes inside the loops, lift the check outside:
    // we want to accelerate the case where all 63 AC coefficients are zero.
    if &in_vector[1..] == &[0_i32; 63] {
        // Flat block: `+ 4` rounds the `>> 3`, `+ 1024` is `128 << 3`, i.e. the
        // +128 level shift applied before the shift.
        let dc = (in_vector[0].wrapping_add(4 + 1024) >> 3).clamp(0, 255) as i16;
        let row = [dc; 8];

        // Write the same row to all eight output rows.
        for _ in 0..8 {
            out_vector
                .get_mut(pos..pos + 8)
                .unwrap()
                .copy_from_slice(&row);
            pos += stride;
        }
    } else {
        // Column pass. The two passes are written out separately because the
        // compiler fails to auto-vectorise a shared 1-D helper, see
        // [idct_int_slow and idct_int_1D] at https://godbolt.org/z/8hqW9z9j9
        for ptr in 0..8 {
            // even part
            let p2 = in_vector[ptr + 16];
            let p3 = in_vector[ptr + 48];

            let p1 = p2.wrapping_add(p3).wrapping_mul(2217);

            let t2 = p1.wrapping_add(p3.wrapping_mul(-7567));
            let t3 = p1.wrapping_add(p2.wrapping_mul(3135));

            let p2 = in_vector[ptr];
            let p3 = in_vector[ptr + 32];
            let t0 = fsh(p2.wrapping_add(p3));
            let t1 = fsh(p2.wrapping_sub(p3));

            // 512 is half of `1 << 10`, so the `>> 10` below rounds.
            let x0 = t0.wrapping_add(t3).wrapping_add(512);
            let x3 = t0.wrapping_sub(t3).wrapping_add(512);
            let x1 = t1.wrapping_add(t2).wrapping_add(512);
            let x2 = t1.wrapping_sub(t2).wrapping_add(512);

            // odd part
            let t0 = in_vector[ptr + 56];
            let t1 = in_vector[ptr + 40];
            let t2 = in_vector[ptr + 24];
            let t3 = in_vector[ptr + 8];

            let p3 = t0.wrapping_add(t2);
            let p4 = t1.wrapping_add(t3);
            let p1 = t0.wrapping_add(t3);
            let p2 = t1.wrapping_add(t2);
            let p5 = p3.wrapping_add(p4).wrapping_mul(4816);

            let p1 = p5.wrapping_add(p1.wrapping_mul(-3685));
            let p2 = p5.wrapping_add(p2.wrapping_mul(-10497));
            let p3 = p3.wrapping_mul(-8034);
            let p4 = p4.wrapping_mul(-1597);

            let t3 = t3.wrapping_mul(6149).wrapping_add(p1.wrapping_add(p4));
            let t2 = t2.wrapping_mul(12586).wrapping_add(p2.wrapping_add(p3));
            let t1 = t1.wrapping_mul(8410).wrapping_add(p2.wrapping_add(p4));
            let t0 = t0.wrapping_mul(1223).wrapping_add(p1.wrapping_add(p3));

            // constants scaled things up by 1<<12; let's bring them back
            // down, but keep 2 extra bits of precision
            in_vector[ptr] = x0.wrapping_add(t3) >> 10;
            in_vector[ptr + 8] = x1.wrapping_add(t2) >> 10;
            in_vector[ptr + 16] = x2.wrapping_add(t1) >> 10;
            in_vector[ptr + 24] = x3.wrapping_add(t0) >> 10;
            in_vector[ptr + 32] = x3.wrapping_sub(t0) >> 10;
            in_vector[ptr + 40] = x2.wrapping_sub(t1) >> 10;
            in_vector[ptr + 48] = x1.wrapping_sub(t2) >> 10;
            in_vector[ptr + 56] = x0.wrapping_sub(t3) >> 10;
        }

        // Loop-invariant fixed-point constant for the odd part of the row pass.
        let odd_scale = f2f(1.175875602);

        // Row pass. This is vectorised on architectures supporting SSE 4.1.
        // We won't try to short circuit here because it rarely works.
        for row in 0..8 {
            let i = row * 8;

            // even part
            let p2 = in_vector[i + 2];
            let p3 = in_vector[i + 6];

            let p1 = p2.wrapping_add(p3).wrapping_mul(2217);
            let t2 = p1.wrapping_add(p3.wrapping_mul(-7567));
            let t3 = p1.wrapping_add(p2.wrapping_mul(3135));

            let p2 = in_vector[i];
            let p3 = in_vector[i + 4];

            let t0 = fsh(p2.wrapping_add(p3));
            let t1 = fsh(p2.wrapping_sub(p3));

            // constants scaled things up by 1<<12, plus we had 1<<2 from first
            // loop, plus horizontal and vertical each scale by sqrt(8) so together
            // we've got an extra 1<<3, so 1<<17 total we need to remove.
            // so we want to round that, which means adding 0.5 * 1<<17,
            // aka 65536. Also, we'll end up with -128 to 127 that we want
            // to encode as 0..255 by adding 128, so we'll add that before the shift
            let x0 = t0.wrapping_add(t3).wrapping_add(SCALE_BITS);
            let x3 = t0.wrapping_sub(t3).wrapping_add(SCALE_BITS);
            let x1 = t1.wrapping_add(t2).wrapping_add(SCALE_BITS);
            let x2 = t1.wrapping_sub(t2).wrapping_add(SCALE_BITS);

            // odd part
            let t0 = in_vector[i + 7];
            let t1 = in_vector[i + 5];
            let t2 = in_vector[i + 3];
            let t3 = in_vector[i + 1];

            let p3 = t0.wrapping_add(t2);
            let p4 = t1.wrapping_add(t3);
            let p1 = t0.wrapping_add(t3);
            let p2 = t1.wrapping_add(t2);
            let p5 = p3.wrapping_add(p4).wrapping_mul(odd_scale);

            let p1 = p5.wrapping_add(p1.wrapping_mul(-3685));
            let p2 = p5.wrapping_add(p2.wrapping_mul(-10497));
            let p3 = p3.wrapping_mul(-8034);
            let p4 = p4.wrapping_mul(-1597);

            let t3 = t3.wrapping_mul(6149).wrapping_add(p1.wrapping_add(p4));
            let t2 = t2.wrapping_mul(12586).wrapping_add(p2.wrapping_add(p3));
            let t1 = t1.wrapping_mul(8410).wrapping_add(p2.wrapping_add(p4));
            let t0 = t0.wrapping_mul(1223).wrapping_add(p1.wrapping_add(p3));

            // Fixed-size view of the output row, so the eight stores below
            // need a single length check.
            let out: &mut [i16; 8] = out_vector
                .get_mut(pos..pos + 8)
                .unwrap()
                .try_into()
                .unwrap();

            out[0] = clamp(x0.wrapping_add(t3) >> 17);
            out[1] = clamp(x1.wrapping_add(t2) >> 17);
            out[2] = clamp(x2.wrapping_add(t1) >> 17);
            out[3] = clamp(x3.wrapping_add(t0) >> 17);
            out[4] = clamp(x3.wrapping_sub(t0) >> 17);
            out[5] = clamp(x2.wrapping_sub(t1) >> 17);
            out[6] = clamp(x1.wrapping_sub(t2) >> 17);
            out[7] = clamp(x0.wrapping_sub(t3) >> 17);

            pos += stride;
        }
    }
}

/// Convert a floating-point factor to 12-bit fixed point.
///
/// The value is scaled by 4096, 0.5 is added, and the result is truncated
/// towards zero by the cast to `i32`.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn f2f(x: f32) -> i32 {
    const FIXED_ONE: f32 = 4096.0;

    let biased = x * FIXED_ONE + 0.5;
    biased as i32
}

/// Scale an integer by 4096 with a 12-bit left shift.
///
/// Bits shifted out past the top of the `i32` are discarded.
#[inline]
fn fsh(x: i32) -> i32 {
    const FRACTION_BITS: u32 = 12;

    x << FRACTION_BITS
}

/// Saturate `a` to the range `0..=255` and return it as an `i16`.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    const SAMPLE_MIN: i32 = 0;
    const SAMPLE_MAX: i32 = 255;

    let bounded = a.clamp(SAMPLE_MIN, SAMPLE_MAX);
    // `bounded` lies in 0..=255, so narrowing to i16 cannot lose information.
    bounded as i16
}

Coverage Report

Created: 2025-07-11 07:25

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2023.
3		*
4		* This software is free software;
5		*
6		* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7		*/
8
9		//! Platform independent IDCT algorithm
10		//!
11		//! Not as fast as AVX one.
12
13		const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);
14
15		#[allow(unused_assignments)]
16		#[allow(
17		clippy::too_many_lines,
18		clippy::op_ref,
19		clippy::cast_possible_truncation
20		)]
21	0	pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
22	0	// Temporary variables.
23	0
24	0	let mut pos = 0;
25	0
26	0	let mut i = 0;
27	0	// Don't check for zeroes inside loop, lift it and check outside
28	0	// we want to accelerate the case with 63 0 ac coeff
29	0	if &in_vector[1..] == &[0_i32; 63] {
30	0	// okay then if you work, yay, let's write you really quick
31	0	let coeff = [((in_vector[0] + 4 + 1024) >> 3).clamp(0, 255) as i16; 8];
32	0
33	0	macro_rules! store {
34		($index:tt) => {
35		// position of the MCU
36		let mcu_stride: &mut [i16; 8] = out_vector
37		.get_mut($index..$index + 8)
38		.unwrap()
39		.try_into()
40		.unwrap();
41		// copy coefficients
42		mcu_stride.copy_from_slice(&coeff);
43		// increment index
44		$index += stride;
45		};
46	0	}
47	0	// write to four positions
48	0	store!(pos);
49	0	store!(pos);
50	0	store!(pos);
51	0	store!(pos);
52	0
53	0	store!(pos);
54	0	store!(pos);
55	0	store!(pos);
56	0	store!(pos);
57	0	} else {
58		// because the compiler fails to see that it can be auto_vectorised so i'll
59		// leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9
60	0	for ptr in 0..8 {
61	0	let p2 = in_vector[ptr + 16];
62	0	let p3 = in_vector[ptr + 48];
63	0
64	0	let p1 = (p2 + p3).wrapping_mul(2217);
65	0
66	0	let t2 = p1 + p3 * -7567;
67	0	let t3 = p1 + p2 * 3135;
68	0
69	0	let p2 = in_vector[ptr];
70	0	let p3 = in_vector[32 + ptr];
71	0	let t0 = fsh(p2 + p3);
72	0	let t1 = fsh(p2 - p3);
73	0
74	0	let x0 = t0 + t3 + 512;
75	0	let x3 = t0 - t3 + 512;
76	0	let x1 = t1 + t2 + 512;
77	0	let x2 = t1 - t2 + 512;
78	0
79	0	// odd part
80	0	let mut t0 = in_vector[ptr + 56];
81	0	let mut t1 = in_vector[ptr + 40];
82	0	let mut t2 = in_vector[ptr + 24];
83	0	let mut t3 = in_vector[ptr + 8];
84	0
85	0	let p3 = t0 + t2;
86	0	let p4 = t1 + t3;
87	0	let p1 = t0 + t3;
88	0	let p2 = t1 + t2;
89	0	let p5 = (p3 + p4) * 4816;
90	0
91	0	t0 *= 1223;
92	0	t1 *= 8410;
93	0	t2 *= 12586;
94	0	t3 *= 6149;
95	0
96	0	let p1 = p5 + p1 * -3685;
97	0	let p2 = p5 + p2 * -10497;
98	0	let p3 = p3 * -8034;
99	0	let p4 = p4 * -1597;
100	0
101	0	t3 += p1 + p4;
102	0	t2 += p2 + p3;
103	0	t1 += p2 + p4;
104	0	t0 += p1 + p3;
105	0
106	0	// constants scaled things up by 1<<12; let's bring them back
107	0	// down, but keep 2 extra bits of precision
108	0	in_vector[ptr] = (x0 + t3) >> 10;
109	0	in_vector[ptr + 8] = (x1 + t2) >> 10;
110	0	in_vector[ptr + 16] = (x2 + t1) >> 10;
111	0	in_vector[ptr + 24] = (x3 + t0) >> 10;
112	0	in_vector[ptr + 32] = (x3 - t0) >> 10;
113	0	in_vector[ptr + 40] = (x2 - t1) >> 10;
114	0	in_vector[ptr + 48] = (x1 - t2) >> 10;
115	0	in_vector[ptr + 56] = (x0 - t3) >> 10;
116	0	}
117
118		// This is vectorised in architectures supporting SSE 4.1
119	0	while i < 64 {
120	0	// We won't try to short circuit here because it rarely works
121	0
122	0	// Even part
123	0	let p2 = in_vector[i + 2];
124	0	let p3 = in_vector[i + 6];
125	0
126	0	let p1 = (p2 + p3) * 2217;
127	0	let t2 = p1 + p3 * -7567;
128	0	let t3 = p1 + p2 * 3135;
129	0
130	0	let p2 = in_vector[i];
131	0	let p3 = in_vector[i + 4];
132	0
133	0	let t0 = fsh(p2 + p3);
134	0	let t1 = fsh(p2 - p3);
135	0	// constants scaled things up by 1<<12, plus we had 1<<2 from first
136	0	// loop, plus horizontal and vertical each scale by sqrt(8) so together
137	0	// we've got an extra 1<<3, so 1<<17 total we need to remove.
138	0	// so we want to round that, which means adding 0.5 * 1<<17,
139	0	// aka 65536. Also, we'll end up with -128 to 127 that we want
140	0	// to encode as 0..255 by adding 128, so we'll add that before the shift
141	0	let x0 = t0 + t3 + SCALE_BITS;
142	0	let x3 = t0 - t3 + SCALE_BITS;
143	0	let x1 = t1 + t2 + SCALE_BITS;
144	0	let x2 = t1 - t2 + SCALE_BITS;
145	0	// odd part
146	0	let mut t0 = in_vector[i + 7];
147	0	let mut t1 = in_vector[i + 5];
148	0	let mut t2 = in_vector[i + 3];
149	0	let mut t3 = in_vector[i + 1];
150	0
151	0	let p3 = t0 + t2;
152	0	let p4 = t1 + t3;
153	0	let p1 = t0 + t3;
154	0	let p2 = t1 + t2;
155	0	let p5 = (p3 + p4) * f2f(1.175875602);
156	0
157	0	t0 = t0.wrapping_mul(1223);
158	0	t1 = t1.wrapping_mul(8410);
159	0	t2 = t2.wrapping_mul(12586);
160	0	t3 = t3.wrapping_mul(6149);
161	0
162	0	let p1 = p5 + p1 * -3685;
163	0	let p2 = p5 + p2 * -10497;
164	0	let p3 = p3 * -8034;
165	0	let p4 = p4 * -1597;
166	0
167	0	t3 += p1 + p4;
168	0	t2 += p2 + p3;
169	0	t1 += p2 + p4;
170	0	t0 += p1 + p3;
171	0
172	0	let out: &mut [i16; 8] = out_vector
173	0	.get_mut(pos..pos + 8)
174	0	.unwrap()
175	0	.try_into()
176	0	.unwrap();
177	0
178	0	out[0] = clamp((x0 + t3) >> 17);
179	0	out[1] = clamp((x1 + t2) >> 17);
180	0	out[2] = clamp((x2 + t1) >> 17);
181	0	out[3] = clamp((x3 + t0) >> 17);
182	0	out[4] = clamp((x3 - t0) >> 17);
183	0	out[5] = clamp((x2 - t1) >> 17);
184	0	out[6] = clamp((x1 - t2) >> 17);
185	0	out[7] = clamp((x0 - t3) >> 17);
186	0
187	0	i += 8;
188	0
189	0	pos += stride;
190	0	}
191		}
192	0	}
193
194		#[inline]
195		#[allow(clippy::cast_possible_truncation)]
196		/// Multiply a number by 4096
197	0	fn f2f(x: f32) -> i32 {
198	0	(x * 4096.0 + 0.5) as i32
199	0	}
200
201		#[inline]
202		/// Multiply a number by 4096
203	0	fn fsh(x: i32) -> i32 {
204	0	x << 12
205	0	}
206
207		/// Clamp values between 0 and 255
208		#[inline]
209		#[allow(clippy::cast_possible_truncation)]
210	0	fn clamp(a: i32) -> i16 {
211	0	a.clamp(0, 255) as i16
212	0	}