/rust/registry/src/index.crates.io-6f17d22bba15001f/zune-jpeg-0.4.19/src/idct/scalar.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2023. |
3 | | * |
4 | | * This software is free software; |
5 | | * |
6 | | * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license |
7 | | */ |
8 | | |
9 | | //! Platform independent IDCT algorithm |
10 | | //! |
11 | | //! Not as fast as the AVX implementation. |
12 | | |
/// Bias added to every even-part term in the row pass of `idct_int`, just
/// before the final `>> 17`.
///
/// * `128 << 17` becomes `+128` after the shift, which moves samples from
///   `-128..=127` into `0..=255`.
/// * `65536` is half of `1 << 17`, so the shift rounds instead of truncating.
/// * `512` — NOTE(review): not explained by the comments in this file; it is
///   the same value the column pass adds before its `>> 10`. Confirm intent.
const SCALE_BITS: i32 = (128 << 17) + 65536 + 512;
14 | | |
15 | | #[allow(unused_assignments)] |
16 | | #[allow( |
17 | | clippy::too_many_lines, |
18 | | clippy::op_ref, |
19 | | clippy::cast_possible_truncation |
20 | | )] |
21 | 0 | pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) { |
22 | 0 | // Temporary variables. |
23 | 0 |
|
24 | 0 | let mut pos = 0; |
25 | 0 |
|
26 | 0 | let mut i = 0; |
27 | 0 | // Don't check for zeroes inside loop, lift it and check outside |
28 | 0 | // we want to accelerate the case with 63 0 ac coeff |
29 | 0 | if &in_vector[1..] == &[0_i32; 63] { |
30 | 0 | // okay then if you work, yay, let's write you really quick |
31 | 0 | let coeff = [((in_vector[0] + 4 + 1024) >> 3).clamp(0, 255) as i16; 8]; |
32 | 0 |
|
33 | 0 | macro_rules! store { |
34 | | ($index:tt) => { |
35 | | // position of the MCU |
36 | | let mcu_stride: &mut [i16; 8] = out_vector |
37 | | .get_mut($index..$index + 8) |
38 | | .unwrap() |
39 | | .try_into() |
40 | | .unwrap(); |
41 | | // copy coefficients |
42 | | mcu_stride.copy_from_slice(&coeff); |
43 | | // increment index |
44 | | $index += stride; |
45 | | }; |
46 | 0 | } |
47 | 0 | // write to four positions |
48 | 0 | store!(pos); |
49 | 0 | store!(pos); |
50 | 0 | store!(pos); |
51 | 0 | store!(pos); |
52 | 0 |
|
53 | 0 | store!(pos); |
54 | 0 | store!(pos); |
55 | 0 | store!(pos); |
56 | 0 | store!(pos); |
57 | 0 | } else { |
58 | | // because the compiler fails to see that it can be auto_vectorised so i'll |
59 | | // leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9 |
60 | 0 | for ptr in 0..8 { |
61 | 0 | let p2 = in_vector[ptr + 16]; |
62 | 0 | let p3 = in_vector[ptr + 48]; |
63 | 0 |
|
64 | 0 | let p1 = (p2 + p3).wrapping_mul(2217); |
65 | 0 |
|
66 | 0 | let t2 = p1 + p3 * -7567; |
67 | 0 | let t3 = p1 + p2 * 3135; |
68 | 0 |
|
69 | 0 | let p2 = in_vector[ptr]; |
70 | 0 | let p3 = in_vector[32 + ptr]; |
71 | 0 | let t0 = fsh(p2 + p3); |
72 | 0 | let t1 = fsh(p2 - p3); |
73 | 0 |
|
74 | 0 | let x0 = t0 + t3 + 512; |
75 | 0 | let x3 = t0 - t3 + 512; |
76 | 0 | let x1 = t1 + t2 + 512; |
77 | 0 | let x2 = t1 - t2 + 512; |
78 | 0 |
|
79 | 0 | // odd part |
80 | 0 | let mut t0 = in_vector[ptr + 56]; |
81 | 0 | let mut t1 = in_vector[ptr + 40]; |
82 | 0 | let mut t2 = in_vector[ptr + 24]; |
83 | 0 | let mut t3 = in_vector[ptr + 8]; |
84 | 0 |
|
85 | 0 | let p3 = t0 + t2; |
86 | 0 | let p4 = t1 + t3; |
87 | 0 | let p1 = t0 + t3; |
88 | 0 | let p2 = t1 + t2; |
89 | 0 | let p5 = (p3 + p4) * 4816; |
90 | 0 |
|
91 | 0 | t0 *= 1223; |
92 | 0 | t1 *= 8410; |
93 | 0 | t2 *= 12586; |
94 | 0 | t3 *= 6149; |
95 | 0 |
|
96 | 0 | let p1 = p5 + p1 * -3685; |
97 | 0 | let p2 = p5 + p2 * -10497; |
98 | 0 | let p3 = p3 * -8034; |
99 | 0 | let p4 = p4 * -1597; |
100 | 0 |
|
101 | 0 | t3 += p1 + p4; |
102 | 0 | t2 += p2 + p3; |
103 | 0 | t1 += p2 + p4; |
104 | 0 | t0 += p1 + p3; |
105 | 0 |
|
106 | 0 | // constants scaled things up by 1<<12; let's bring them back |
107 | 0 | // down, but keep 2 extra bits of precision |
108 | 0 | in_vector[ptr] = (x0 + t3) >> 10; |
109 | 0 | in_vector[ptr + 8] = (x1 + t2) >> 10; |
110 | 0 | in_vector[ptr + 16] = (x2 + t1) >> 10; |
111 | 0 | in_vector[ptr + 24] = (x3 + t0) >> 10; |
112 | 0 | in_vector[ptr + 32] = (x3 - t0) >> 10; |
113 | 0 | in_vector[ptr + 40] = (x2 - t1) >> 10; |
114 | 0 | in_vector[ptr + 48] = (x1 - t2) >> 10; |
115 | 0 | in_vector[ptr + 56] = (x0 - t3) >> 10; |
116 | 0 | } |
117 | | |
118 | | // This is vectorised in architectures supporting SSE 4.1 |
119 | 0 | while i < 64 { |
120 | 0 | // We won't try to short circuit here because it rarely works |
121 | 0 |
|
122 | 0 | // Even part |
123 | 0 | let p2 = in_vector[i + 2]; |
124 | 0 | let p3 = in_vector[i + 6]; |
125 | 0 |
|
126 | 0 | let p1 = (p2 + p3) * 2217; |
127 | 0 | let t2 = p1 + p3 * -7567; |
128 | 0 | let t3 = p1 + p2 * 3135; |
129 | 0 |
|
130 | 0 | let p2 = in_vector[i]; |
131 | 0 | let p3 = in_vector[i + 4]; |
132 | 0 |
|
133 | 0 | let t0 = fsh(p2 + p3); |
134 | 0 | let t1 = fsh(p2 - p3); |
135 | 0 | // constants scaled things up by 1<<12, plus we had 1<<2 from first |
136 | 0 | // loop, plus horizontal and vertical each scale by sqrt(8) so together |
137 | 0 | // we've got an extra 1<<3, so 1<<17 total we need to remove. |
138 | 0 | // so we want to round that, which means adding 0.5 * 1<<17, |
139 | 0 | // aka 65536. Also, we'll end up with -128 to 127 that we want |
140 | 0 | // to encode as 0..255 by adding 128, so we'll add that before the shift |
141 | 0 | let x0 = t0 + t3 + SCALE_BITS; |
142 | 0 | let x3 = t0 - t3 + SCALE_BITS; |
143 | 0 | let x1 = t1 + t2 + SCALE_BITS; |
144 | 0 | let x2 = t1 - t2 + SCALE_BITS; |
145 | 0 | // odd part |
146 | 0 | let mut t0 = in_vector[i + 7]; |
147 | 0 | let mut t1 = in_vector[i + 5]; |
148 | 0 | let mut t2 = in_vector[i + 3]; |
149 | 0 | let mut t3 = in_vector[i + 1]; |
150 | 0 |
|
151 | 0 | let p3 = t0 + t2; |
152 | 0 | let p4 = t1 + t3; |
153 | 0 | let p1 = t0 + t3; |
154 | 0 | let p2 = t1 + t2; |
155 | 0 | let p5 = (p3 + p4) * f2f(1.175875602); |
156 | 0 |
|
157 | 0 | t0 = t0.wrapping_mul(1223); |
158 | 0 | t1 = t1.wrapping_mul(8410); |
159 | 0 | t2 = t2.wrapping_mul(12586); |
160 | 0 | t3 = t3.wrapping_mul(6149); |
161 | 0 |
|
162 | 0 | let p1 = p5 + p1 * -3685; |
163 | 0 | let p2 = p5 + p2 * -10497; |
164 | 0 | let p3 = p3 * -8034; |
165 | 0 | let p4 = p4 * -1597; |
166 | 0 |
|
167 | 0 | t3 += p1 + p4; |
168 | 0 | t2 += p2 + p3; |
169 | 0 | t1 += p2 + p4; |
170 | 0 | t0 += p1 + p3; |
171 | 0 |
|
172 | 0 | let out: &mut [i16; 8] = out_vector |
173 | 0 | .get_mut(pos..pos + 8) |
174 | 0 | .unwrap() |
175 | 0 | .try_into() |
176 | 0 | .unwrap(); |
177 | 0 |
|
178 | 0 | out[0] = clamp((x0 + t3) >> 17); |
179 | 0 | out[1] = clamp((x1 + t2) >> 17); |
180 | 0 | out[2] = clamp((x2 + t1) >> 17); |
181 | 0 | out[3] = clamp((x3 + t0) >> 17); |
182 | 0 | out[4] = clamp((x3 - t0) >> 17); |
183 | 0 | out[5] = clamp((x2 - t1) >> 17); |
184 | 0 | out[6] = clamp((x1 - t2) >> 17); |
185 | 0 | out[7] = clamp((x0 - t3) >> 17); |
186 | 0 |
|
187 | 0 | i += 8; |
188 | 0 |
|
189 | 0 | pos += stride; |
190 | 0 | } |
191 | | } |
192 | 0 | } |
193 | | |
/// Convert a floating point constant to fixed point with 12 fractional bits.
///
/// The value is multiplied by 4096, `0.5` is added and the result is
/// truncated towards zero by the `as i32` cast, so non-negative inputs are
/// rounded to the nearest integer.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn f2f(x: f32) -> i32 {
    let scaled = x * 4096.0;
    let biased = scaled + 0.5;
    biased as i32
}
200 | | |
/// Scale an integer into the 12-bit fixed point domain, i.e. multiply it by
/// 4096 using a left shift (bits shifted past the top are discarded).
#[inline]
fn fsh(x: i32) -> i32 {
    const FRACTIONAL_BITS: u32 = 12;
    x << FRACTIONAL_BITS
}
206 | | |
/// Saturate a value to the range `0..=255` and narrow it to `i16`.
///
/// The narrowing cast cannot truncate because the value has already been
/// limited to `0..=255`.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    let saturated = a.max(0).min(255);
    saturated as i16
}