/rust/registry/src/index.crates.io-1949cf8c6b5b557f/jpeg-encoder-0.7.0/src/avx2/ycbcr.rs
Line | Count | Source |
1 | | #[cfg(target_arch = "x86")] |
2 | | use core::arch::x86::{ |
3 | | __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set_epi32, _mm256_set1_epi32, |
4 | | _mm256_srli_epi32, _mm256_sub_epi32, |
5 | | }; |
6 | | |
7 | | #[cfg(target_arch = "x86_64")] |
8 | | use core::arch::x86_64::{ |
9 | | __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set_epi32, _mm256_set1_epi32, |
10 | | _mm256_srli_epi32, _mm256_sub_epi32, |
11 | | }; |
12 | | |
13 | | use alloc::vec::Vec; |
14 | | |
15 | | use crate::{ImageBuffer, JpegColorType, rgb_to_ycbcr}; |
16 | | |
17 | | macro_rules! ycbcr_image_avx2 { |
18 | | ($name:ident, $num_colors:expr, $o1:expr, $o2:expr, $o3:expr) => { |
19 | | pub struct $name<'a>(pub &'a [u8], pub u16, pub u16); |
20 | | |
21 | | impl<'a> $name<'a> { |
22 | | #[target_feature(enable = "avx2")] |
23 | 0 | fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec<u8>; 4]) { |
24 | | // TODO: this compiles to many separate scalar loads and could be optimized further. |
25 | | // But the gains are no more than 3% end to end and it doesn't seem to be worth the complexity. |
26 | | #[inline] |
27 | | #[target_feature(enable = "avx2")] |
28 | 0 | fn load3(data: &[u8]) -> __m256i { |
29 | 0 | _ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later |
30 | 0 | _mm256_set_epi32( |
31 | 0 | data[0] as i32, |
32 | 0 | data[1 * $num_colors] as i32, |
33 | 0 | data[2 * $num_colors] as i32, |
34 | 0 | data[3 * $num_colors] as i32, |
35 | 0 | data[4 * $num_colors] as i32, |
36 | 0 | data[5 * $num_colors] as i32, |
37 | 0 | data[6 * $num_colors] as i32, |
38 | 0 | data[7 * $num_colors] as i32, |
39 | | ) |
40 | 0 | } Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::load3 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::load3 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::load3 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::load3 |
41 | | |
42 | | #[inline] |
43 | | #[target_feature(enable = "avx2")] |
44 | 0 | fn avx_as_i32_array(data: __m256i) -> [i32; 8] { |
45 | | // Safety preconditions. Optimized away in release mode, no runtime cost. |
46 | 0 | assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>()); |
47 | 0 | assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>()); |
48 | | // SAFETY: size and alignment preconditions checked above. |
49 | | // Both types are plain old data: no pointers, lifetimes, etc. |
50 | 0 | unsafe { core::mem::transmute(data) } |
51 | 0 | } Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::avx_as_i32_array Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::avx_as_i32_array Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::avx_as_i32_array Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::avx_as_i32_array |
52 | | |
53 | 0 | let [y_buffer, cb_buffer, cr_buffer, _] = buffers; |
54 | 0 | y_buffer.reserve(self.width() as usize); |
55 | 0 | cb_buffer.reserve(self.width() as usize); |
56 | 0 | cr_buffer.reserve(self.width() as usize); |
57 | | |
58 | 0 | let ymulr = _mm256_set1_epi32(19595); |
59 | 0 | let ymulg = _mm256_set1_epi32(38470); |
60 | 0 | let ymulb = _mm256_set1_epi32(7471); |
61 | | |
62 | 0 | let cbmulr = _mm256_set1_epi32(-11059); |
63 | 0 | let cbmulg = _mm256_set1_epi32(21709); |
64 | 0 | let cbmulb = _mm256_set1_epi32(32768); |
65 | | |
66 | 0 | let crmulr = _mm256_set1_epi32(32768); |
67 | 0 | let crmulg = _mm256_set1_epi32(27439); |
68 | 0 | let crmulb = _mm256_set1_epi32(5329); |
69 | | |
70 | 0 | let mut data = &self.0[(y as usize * self.1 as usize * $num_colors)..]; |
71 | | |
72 | 0 | for _ in 0..self.width() / 8 { |
73 | 0 | let r = load3(&data[$o1..]); |
74 | 0 | let g = load3(&data[$o2..]); |
75 | 0 | let b = load3(&data[$o3..]); |
76 | | |
77 | 0 | data = &data[($num_colors * 8)..]; |
78 | | |
79 | 0 | let yr = _mm256_mullo_epi32(ymulr, r); |
80 | 0 | let yg = _mm256_mullo_epi32(ymulg, g); |
81 | 0 | let yb = _mm256_mullo_epi32(ymulb, b); |
82 | | |
83 | 0 | let y = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb); |
84 | 0 | let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF)); |
85 | 0 | let y = _mm256_srli_epi32(y, 16); |
86 | 0 | let y: [i32; 8] = avx_as_i32_array(y); |
87 | 0 | let mut y: [u8; 8] = y.map(|x| x as u8); Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::{closure#0}Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::{closure#0}Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::{closure#0}Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::{closure#0} |
88 | 0 | y.reverse(); |
89 | 0 | y_buffer.extend_from_slice(&y); |
90 | | |
91 | 0 | let cbr = _mm256_mullo_epi32(cbmulr, r); |
92 | 0 | let cbg = _mm256_mullo_epi32(cbmulg, g); |
93 | 0 | let cbb = _mm256_mullo_epi32(cbmulb, b); |
94 | | |
95 | 0 | let cb = _mm256_add_epi32(_mm256_sub_epi32(cbr, cbg), cbb); |
96 | 0 | let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(128 << 16)); |
97 | 0 | let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF)); |
98 | 0 | let cb = _mm256_srli_epi32(cb, 16); |
99 | 0 | let cb: [i32; 8] = avx_as_i32_array(cb); |
100 | 0 | let mut cb: [u8; 8] = cb.map(|x| x as u8); Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::{closure#1}Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::{closure#1}Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::{closure#1}Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::{closure#1} |
101 | 0 | cb.reverse(); |
102 | 0 | cb_buffer.extend_from_slice(&cb); |
103 | | |
104 | 0 | let crr = _mm256_mullo_epi32(crmulr, r); |
105 | 0 | let crg = _mm256_mullo_epi32(crmulg, g); |
106 | 0 | let crb = _mm256_mullo_epi32(crmulb, b); |
107 | | |
108 | 0 | let cr = _mm256_sub_epi32(_mm256_sub_epi32(crr, crg), crb); |
109 | 0 | let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(128 << 16)); |
110 | 0 | let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF)); |
111 | 0 | let cr = _mm256_srli_epi32(cr, 16); |
112 | 0 | let cr: [i32; 8] = avx_as_i32_array(cr); |
113 | 0 | let mut cr: [u8; 8] = cr.map(|x| x as u8); Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::{closure#2}Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::{closure#2}Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::{closure#2}Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::{closure#2} |
114 | 0 | cr.reverse(); |
115 | 0 | cr_buffer.extend_from_slice(&cr); |
116 | | } |
117 | | |
118 | 0 | for _ in 0..self.width() % 8 { |
119 | 0 | let (y, cb, cr) = rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]); |
120 | 0 | data = &data[$num_colors..]; |
121 | 0 |
|
122 | 0 | y_buffer.push(y); |
123 | 0 | cb_buffer.push(cb); |
124 | 0 | cr_buffer.push(cr); |
125 | 0 | } |
126 | 0 | } Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2 |
127 | | } |
128 | | |
129 | | impl<'a> ImageBuffer for $name<'a> { |
130 | 0 | fn get_jpeg_color_type(&self) -> JpegColorType { |
131 | 0 | JpegColorType::Ycbcr |
132 | 0 | } Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::get_jpeg_color_type Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::get_jpeg_color_type Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::get_jpeg_color_type Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::get_jpeg_color_type |
133 | | |
134 | 0 | fn width(&self) -> u16 { |
135 | 0 | self.1 |
136 | 0 | } Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::width Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::width Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::width Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::width |
137 | | |
138 | 0 | fn height(&self) -> u16 { |
139 | 0 | self.2 |
140 | 0 | } Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::height Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::height Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::height Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::height |
141 | | |
142 | | #[inline(always)] |
143 | 0 | fn fill_buffers(&self, y: u16, buffers: &mut [Vec<u8>; 4]) { |
144 | 0 | unsafe { |
145 | 0 | self.fill_buffers_avx2(y, buffers); |
146 | 0 | } |
147 | 0 | } Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::fill_buffers Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::fill_buffers Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::fill_buffers Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::fill_buffers |
148 | | } |
149 | | }; |
150 | | } |
151 | | |
152 | | ycbcr_image_avx2!(RgbImageAVX2, 3, 0, 1, 2); |
153 | | ycbcr_image_avx2!(RgbaImageAVX2, 4, 0, 1, 2); |
154 | | ycbcr_image_avx2!(BgrImageAVX2, 3, 2, 1, 0); |
155 | | ycbcr_image_avx2!(BgraImageAVX2, 4, 2, 1, 0); |
156 | | |
157 | | #[cfg(test)] |
158 | | mod tests { |
159 | | use super::*; |
160 | | use std::vec::Vec; |
161 | | |
// A very basic linear congruential generator (LCG) to avoid external dependencies.
// Test-only helper: fully deterministic for a given seed.
pub struct SimpleRng {
    state: u64,
}

impl SimpleRng {
    /// Create a new RNG with a given seed.
    pub fn new(seed: u64) -> Self {
        Self { state: seed }
    }

    /// Advance the generator and return the new 64-bit state.
    pub fn next_u64(&mut self) -> u64 {
        // Multiplier from Knuth's MMIX LCG, paired with an increment of 1.
        // The modulus is 2^64 via wrapping arithmetic.
        self.state = self.state.wrapping_mul(6364136223846793005).wrapping_add(1);
        self.state
    }

    /// Generate a random byte in 0..=255
    pub fn next_byte(&mut self) -> u8 {
        // Use the most significant byte. With a power-of-two modulus the low k bits of an
        // LCG repeat with period 2^k, so the low byte would cycle every 256 calls and the
        // generated test data would contain only 256 distinct positions.
        (self.next_u64() >> 56) as u8
    }

    /// Fill a Vec<u8> with random bytes of the given length.
    pub fn random_bytes(&mut self, len: usize) -> Vec<u8> {
        (0..len).map(|_| self.next_byte()).collect()
    }
}
190 | | |
/// Converts one row of pseudo-random RGB pixels with both the scalar `rgb_to_ycbcr` and
/// `RgbImageAVX2::fill_buffers_avx2`, and requires the results to be byte-for-byte equal.
#[test]
#[cfg(feature = "simd")]
fn avx_matches_scalar_rgb() {
    // Do not run AVX2 test on machines without it
    if !std::is_x86_feature_detected!("avx2") {
        return;
    }

    const BYTES_PER_PIXEL: usize = 3;
    // Power of two plus a bit, so both the 8-pixel loop and the remainder handling run.
    let width = 512 + 3;
    let height = 1;

    let input = SimpleRng::new(42).random_bytes(width * height * BYTES_PER_PIXEL);

    // Reference output: one [Y, Cb, Cr] triple per input pixel.
    let scalar_result: Vec<[u8; 3]> = input
        .chunks_exact(BYTES_PER_PIXEL)
        .map(|rgb| {
            let (y, cb, cr) = rgb_to_ycbcr(rgb[0], rgb[1], rgb[2]);
            [y, cb, cr]
        })
        .collect();

    let mut buffers = [Vec::new(), Vec::new(), Vec::new(), Vec::new()];
    let avx_input = RgbImageAVX2(
        &input,
        width.try_into().unwrap(),
        height.try_into().unwrap(),
    );
    // SAFETY: we've checked above that AVX2 is present
    unsafe {
        avx_input.fill_buffers_avx2(0, &mut buffers);
    }

    // Each of the three planes must hold exactly one byte per pixel.
    for plane in &buffers[..3] {
        assert_eq!(plane.len(), input.len() / 3);
    }

    for (i, pixel) in scalar_result.iter().copied().enumerate() {
        let avx_pixel: [u8; 3] = [buffers[0][i], buffers[1][i], buffers[2][i]];
        assert!(
            pixel == avx_pixel,
            "Mismatch at index {i}: scalar result is {pixel:?}, avx result is {avx_pixel:?}"
        );
    }
}
238 | | } |