/rust/registry/src/index.crates.io-1949cf8c6b5b557f/jpeg-encoder-0.7.0/src/avx2/ycbcr.rs

Source
#[cfg(target_arch = "x86")]
use core::arch::x86::{
    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set_epi32, _mm256_set1_epi32,
    _mm256_srli_epi32, _mm256_sub_epi32,
};

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set_epi32, _mm256_set1_epi32,
    _mm256_srli_epi32, _mm256_sub_epi32,
};

use alloc::vec::Vec;

use crate::{ImageBuffer, JpegColorType, rgb_to_ycbcr};

macro_rules! ycbcr_image_avx2 {
    ($name:ident, $num_colors:expr, $o1:expr, $o2:expr, $o3:expr) => {
        /// Borrowed view of interleaved 8-bit pixel data: `(bytes, width, height)`.
        ///
        /// Rows are addressed as `row * width * bytes_per_pixel`, i.e. they are expected to be
        /// stored back to back without padding.
        pub struct $name<'a>(pub &'a [u8], pub u16, pub u16);

        impl<'a> $name<'a> {
            /// Converts row `y` to Y/Cb/Cr and appends one byte per pixel to the first three
            /// `buffers`; the fourth buffer is left untouched.
            ///
            /// Pixels are processed eight at a time with AVX2; the remaining `width % 8`
            /// pixels go through the scalar `rgb_to_ycbcr`.
            ///
            /// # Panics
            ///
            /// Panics if the pixel slice is too short for the requested row.
            #[target_feature(enable = "avx2")]
            fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
                /// Collects one channel of eight consecutive pixels into 32-bit lanes.
                ///
                /// `src` must start at the channel byte of the first pixel. Because
                /// `_mm256_set_epi32` takes its arguments highest lane first, the first
                /// pixel ends up in lane 7 and the eighth pixel in lane 0.
                // TODO: this compiles to many separate scalar loads and could be optimized further.
                // But the gains are no more than 3% end to end and it doesn't seem to be worth the complexity.
                #[inline]
                #[target_feature(enable = "avx2")]
                fn gather_channel(src: &[u8]) -> __m256i {
                    // Index the furthest byte first so the remaining accesses need no bounds checks.
                    _ = src[7 * $num_colors];
                    _mm256_set_epi32(
                        i32::from(src[0]),
                        i32::from(src[$num_colors]),
                        i32::from(src[2 * $num_colors]),
                        i32::from(src[3 * $num_colors]),
                        i32::from(src[4 * $num_colors]),
                        i32::from(src[5 * $num_colors]),
                        i32::from(src[6 * $num_colors]),
                        i32::from(src[7 * $num_colors]),
                    )
                }

                /// Reinterprets a 256-bit vector as its eight 32-bit lanes (lane 0 first).
                #[inline]
                #[target_feature(enable = "avx2")]
                fn lanes_as_i32(vector: __m256i) -> [i32; 8] {
                    // Safety preconditions. Optimized away in release mode, no runtime cost.
                    assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>());
                    assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>());
                    // SAFETY: size and alignment preconditions checked above.
                    // Both types are plain old data: no pointers, lifetimes, etc.
                    unsafe { core::mem::transmute::<__m256i, [i32; 8]>(vector) }
                }

                /// Rounds eight 16.16 fixed-point lanes to integers and returns them as bytes
                /// in pixel order (lane 7 first, undoing the lane order of `gather_channel`).
                #[inline]
                #[target_feature(enable = "avx2")]
                fn round_and_narrow(fixed: __m256i) -> [u8; 8] {
                    let rounded = _mm256_add_epi32(fixed, _mm256_set1_epi32(0x7FFF));
                    let lanes = lanes_as_i32(_mm256_srli_epi32(rounded, 16));
                    core::array::from_fn(|i| lanes[7 - i] as u8)
                }

                let [luma_out, cb_out, cr_out, _] = buffers;
                let pixels_per_row = self.width() as usize;
                luma_out.reserve(pixels_per_row);
                cb_out.reserve(pixels_per_row);
                cr_out.reserve(pixels_per_row);

                // Per-channel weights in 16.16 fixed point. Subtracted terms are stored as
                // positive magnitudes and applied with `_mm256_sub_epi32` below.
                let luma_r = _mm256_set1_epi32(19595);
                let luma_g = _mm256_set1_epi32(38470);
                let luma_b = _mm256_set1_epi32(7471);

                let cb_r = _mm256_set1_epi32(-11059);
                let cb_g = _mm256_set1_epi32(21709);
                let cb_b = _mm256_set1_epi32(32768);

                let cr_r = _mm256_set1_epi32(32768);
                let cr_g = _mm256_set1_epi32(27439);
                let cr_b = _mm256_set1_epi32(5329);

                // Chroma is centred on 128, expressed in the same fixed-point scale.
                let chroma_offset = _mm256_set1_epi32(128 << 16);

                // Everything from the first byte of row `y` to the end of the image.
                let mut row = &self.0[(y as usize * self.1 as usize * $num_colors)..];

                for _ in 0..self.width() / 8 {
                    let red = gather_channel(&row[$o1..]);
                    let green = gather_channel(&row[$o2..]);
                    let blue = gather_channel(&row[$o3..]);

                    row = &row[(8 * $num_colors)..];

                    let luma = _mm256_add_epi32(
                        _mm256_add_epi32(
                            _mm256_mullo_epi32(luma_r, red),
                            _mm256_mullo_epi32(luma_g, green),
                        ),
                        _mm256_mullo_epi32(luma_b, blue),
                    );
                    luma_out.extend_from_slice(&round_and_narrow(luma));

                    let cb = _mm256_add_epi32(
                        _mm256_sub_epi32(
                            _mm256_mullo_epi32(cb_r, red),
                            _mm256_mullo_epi32(cb_g, green),
                        ),
                        _mm256_mullo_epi32(cb_b, blue),
                    );
                    let cb = _mm256_add_epi32(cb, chroma_offset);
                    cb_out.extend_from_slice(&round_and_narrow(cb));

                    let cr = _mm256_sub_epi32(
                        _mm256_sub_epi32(
                            _mm256_mullo_epi32(cr_r, red),
                            _mm256_mullo_epi32(cr_g, green),
                        ),
                        _mm256_mullo_epi32(cr_b, blue),
                    );
                    let cr = _mm256_add_epi32(cr, chroma_offset);
                    cr_out.extend_from_slice(&round_and_narrow(cr));
                }

                // Scalar tail for the pixels that do not fill a whole group of eight.
                for _ in 0..self.width() % 8 {
                    let (luma, cb, cr) = rgb_to_ycbcr(row[$o1], row[$o2], row[$o3]);
                    row = &row[$num_colors..];

                    luma_out.push(luma);
                    cb_out.push(cb);
                    cr_out.push(cr);
                }
            }
        }

        impl<'a> ImageBuffer for $name<'a> {
            /// The buffers produced by `fill_buffers` hold Y, Cb and Cr samples.
            fn get_jpeg_color_type(&self) -> JpegColorType {
                JpegColorType::Ycbcr
            }

            /// Image width in pixels (second tuple field).
            fn width(&self) -> u16 {
                self.1
            }

            /// Image height in pixels (third tuple field).
            fn height(&self) -> u16 {
                self.2
            }

            /// Forwards to the AVX2 implementation for row `y`.
            #[inline(always)]
            fn fill_buffers(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
                // NOTE(review): this call is only sound on CPUs that support AVX2, and nothing
                // in this type enforces that. Presumably values are only constructed after a
                // runtime feature check; confirm at the construction sites.
                unsafe { self.fill_buffers_avx2(y, buffers) }
            }
        }
    };
}

// One AVX2-backed image type per supported interleaved pixel layout.
// Macro arguments: type name, bytes per pixel, then the byte offsets within a pixel of the
// channels that are fed to the conversion as R, G and B respectively.
ycbcr_image_avx2!(RgbImageAVX2, 3, 0, 1, 2);
// Four bytes per pixel; the byte at offset 3 is never read by the generated code.
ycbcr_image_avx2!(RgbaImageAVX2, 4, 0, 1, 2);
// Same as the RGB variant but with the first and third channel offsets swapped.
ycbcr_image_avx2!(BgrImageAVX2, 3, 2, 1, 0);
// Four bytes per pixel with swapped channel offsets; offset 3 is never read.
ycbcr_image_avx2!(BgraImageAVX2, 4, 2, 1, 0);

#[cfg(test)]
mod tests {
    use super::*;
    use std::vec::Vec;

    /// A very basic 64-bit linear congruential generator (LCG), used so the tests need no
    /// external dependencies. Deterministic for a given seed; only meant for test data.
    pub struct SimpleRng {
        state: u64,
    }

    impl SimpleRng {
        /// Create a new RNG with a given seed.
        pub fn new(seed: u64) -> Self {
            Self { state: seed }
        }

        /// Advance the generator and return its new 64-bit state.
        pub fn next_u64(&mut self) -> u64 {
            // The multiplier is the 64-bit LCG multiplier used by Knuth's MMIX.
            self.state = self.state.wrapping_mul(6364136223846793005).wrapping_add(1);
            self.state
        }

        /// Generate a pseudo-random byte in 0..=255.
        pub fn next_byte(&mut self) -> u8 {
            // Take the most significant byte. With a power-of-two modulus the low k bits of
            // an LCG repeat with period 2^k, so the low byte would cycle every 256 draws.
            (self.next_u64() >> 56) as u8
        }

        /// Build a `Vec<u8>` of `len` pseudo-random bytes.
        pub fn random_bytes(&mut self, len: usize) -> Vec<u8> {
            (0..len).map(|_| self.next_byte()).collect()
        }
    }

    /// The AVX2 row conversion must produce exactly the same bytes as the scalar
    /// `rgb_to_ycbcr` for every pixel, including the pixels handled by the scalar tail.
    #[test]
    #[cfg(feature = "simd")]
    fn avx_matches_scalar_rgb() {
        // Do not run AVX2 test on machines without it
        if !std::is_x86_feature_detected!("avx2") {
            return;
        }
        let mut rng = SimpleRng::new(42);
        let width = 512 + 3; // power of two plus a bit to stress remainder handling
        let height = 1;
        let bpp = 3;

        let input = rng.random_bytes(width * height * bpp);

        let scalar_result: Vec<[u8; 3]> = input
            .chunks_exact(bpp)
            .map(|chunk| {
                let [r, g, b, ..] = chunk else { unreachable!() };
                let (y, cb, cr) = rgb_to_ycbcr(*r, *g, *b);
                [y, cb, cr]
            })
            .collect();

        let mut buffers = [Vec::new(), Vec::new(), Vec::new(), Vec::new()];
        let avx_input = RgbImageAVX2(
            &input,
            width.try_into().unwrap(),
            height.try_into().unwrap(),
        );
        // SAFETY: we've checked above that AVX2 is present
        unsafe {
            avx_input.fill_buffers_avx2(0, &mut buffers);
        }

        // One output byte per pixel in each of the Y, Cb and Cr buffers.
        for buffer in &buffers[..3] {
            assert_eq!(buffer.len(), width * height);
        }
        // The fourth buffer is not used by the YCbCr conversion.
        assert!(buffers[3].is_empty());

        for (i, pixel) in scalar_result.iter().copied().enumerate() {
            let avx_pixel: [u8; 3] = [buffers[0][i], buffers[1][i], buffers[2][i]];
            assert_eq!(
                pixel, avx_pixel,
                "Mismatch at index {i}: scalar result (left) vs avx result (right)"
            );
        }
    }
}

Coverage Report

Created: 2026-03-10 07:34

Line	Count	Source
1		#[cfg(target_arch = "x86")]
2		use core::arch::x86::{
3		__m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set_epi32, _mm256_set1_epi32,
4		_mm256_srli_epi32, _mm256_sub_epi32,
5		};
6
7		#[cfg(target_arch = "x86_64")]
8		use core::arch::x86_64::{
9		__m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set_epi32, _mm256_set1_epi32,
10		_mm256_srli_epi32, _mm256_sub_epi32,
11		};
12
13		use alloc::vec::Vec;
14
15		use crate::{ImageBuffer, JpegColorType, rgb_to_ycbcr};
16
17		macro_rules! ycbcr_image_avx2 {
18		($name:ident, $num_colors:expr, $o1:expr, $o2:expr, $o3:expr) => {
19		pub struct $name<'a>(pub &'a [u8], pub u16, pub u16);
20
21		impl<'a> $name<'a> {
22		#[target_feature(enable = "avx2")]
23	0	fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
24		// TODO: this compiles to many separate scalar loads and could be optimized further.
25		// But the gains are no more than 3% end to end and it doesn't seem to be worth the complexity.
26		#[inline]
27		#[target_feature(enable = "avx2")]
28	0	fn load3(data: &[u8]) -> __m256i {
29	0	_ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later
30	0	_mm256_set_epi32(
31	0	data[0] as i32,
32	0	data[1 * $num_colors] as i32,
33	0	data[2 * $num_colors] as i32,
34	0	data[3 * $num_colors] as i32,
35	0	data[4 * $num_colors] as i32,
36	0	data[5 * $num_colors] as i32,
37	0	data[6 * $num_colors] as i32,
38	0	data[7 * $num_colors] as i32,
39		)
40	0	} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::load3 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::load3 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::load3 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::load3
41
42		#[inline]
43		#[target_feature(enable = "avx2")]
44	0	fn avx_as_i32_array(data: __m256i) -> [i32; 8] {
45		// Safety preconditions. Optimized away in release mode, no runtime cost.
46	0	assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>());
47	0	assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>());
48		// SAFETY: size and alignment preconditions checked above.
49		// Both types are plain old data: no pointers, lifetimes, etc.
50	0	unsafe { core::mem::transmute(data) }
51	0	} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::avx_as_i32_array Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::avx_as_i32_array Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::avx_as_i32_array Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::avx_as_i32_array
52
53	0	let [y_buffer, cb_buffer, cr_buffer, _] = buffers;
54	0	y_buffer.reserve(self.width() as usize);
55	0	cb_buffer.reserve(self.width() as usize);
56	0	cr_buffer.reserve(self.width() as usize);
57
58	0	let ymulr = _mm256_set1_epi32(19595);
59	0	let ymulg = _mm256_set1_epi32(38470);
60	0	let ymulb = _mm256_set1_epi32(7471);
61
62	0	let cbmulr = _mm256_set1_epi32(-11059);
63	0	let cbmulg = _mm256_set1_epi32(21709);
64	0	let cbmulb = _mm256_set1_epi32(32768);
65
66	0	let crmulr = _mm256_set1_epi32(32768);
67	0	let crmulg = _mm256_set1_epi32(27439);
68	0	let crmulb = _mm256_set1_epi32(5329);
69
70	0	let mut data = &self.0[(y as usize * self.1 as usize * $num_colors)..];
71
72	0	for _ in 0..self.width() / 8 {
73	0	let r = load3(&data[$o1..]);
74	0	let g = load3(&data[$o2..]);
75	0	let b = load3(&data[$o3..]);
76
77	0	data = &data[($num_colors * 8)..];
78
79	0	let yr = _mm256_mullo_epi32(ymulr, r);
80	0	let yg = _mm256_mullo_epi32(ymulg, g);
81	0	let yb = _mm256_mullo_epi32(ymulb, b);
82
83	0	let y = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb);
84	0	let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF));
85	0	let y = _mm256_srli_epi32(y, 16);
86	0	let y: [i32; 8] = avx_as_i32_array(y);
87	0	let mut y: [u8; 8] = y.map(|x| x as u8); Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::{closure#0} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::{closure#0} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::{closure#0} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::{closure#0}
88	0	y.reverse();
89	0	y_buffer.extend_from_slice(&y);
90
91	0	let cbr = _mm256_mullo_epi32(cbmulr, r);
92	0	let cbg = _mm256_mullo_epi32(cbmulg, g);
93	0	let cbb = _mm256_mullo_epi32(cbmulb, b);
94
95	0	let cb = _mm256_add_epi32(_mm256_sub_epi32(cbr, cbg), cbb);
96	0	let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(128 << 16));
97	0	let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF));
98	0	let cb = _mm256_srli_epi32(cb, 16);
99	0	let cb: [i32; 8] = avx_as_i32_array(cb);
100	0	let mut cb: [u8; 8] = cb.map(|x| x as u8); Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::{closure#1} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::{closure#1} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::{closure#1} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::{closure#1}
101	0	cb.reverse();
102	0	cb_buffer.extend_from_slice(&cb);
103
104	0	let crr = _mm256_mullo_epi32(crmulr, r);
105	0	let crg = _mm256_mullo_epi32(crmulg, g);
106	0	let crb = _mm256_mullo_epi32(crmulb, b);
107
108	0	let cr = _mm256_sub_epi32(_mm256_sub_epi32(crr, crg), crb);
109	0	let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(128 << 16));
110	0	let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF));
111	0	let cr = _mm256_srli_epi32(cr, 16);
112	0	let cr: [i32; 8] = avx_as_i32_array(cr);
113	0	let mut cr: [u8; 8] = cr.map(|x| x as u8); Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2::{closure#2} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2::{closure#2} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2::{closure#2} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2::{closure#2}
114	0	cr.reverse();
115	0	cr_buffer.extend_from_slice(&cr);
116		}
117
118	0	for _ in 0..self.width() % 8 {
119	0	let (y, cb, cr) = rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]);
120	0	data = &data[$num_colors..];
121	0
122	0	y_buffer.push(y);
123	0	cb_buffer.push(cb);
124	0	cr_buffer.push(cr);
125	0	}
126	0	} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2>::fill_buffers_avx2 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2>::fill_buffers_avx2 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2>::fill_buffers_avx2 Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2>::fill_buffers_avx2
127		}
128
129		impl<'a> ImageBuffer for $name<'a> {
130	0	fn get_jpeg_color_type(&self) -> JpegColorType {
131	0	JpegColorType::Ycbcr
132	0	} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::get_jpeg_color_type Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::get_jpeg_color_type Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::get_jpeg_color_type Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::get_jpeg_color_type
133
134	0	fn width(&self) -> u16 {
135	0	self.1
136	0	} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::width Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::width Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::width Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::width
137
138	0	fn height(&self) -> u16 {
139	0	self.2
140	0	} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::height Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::height Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::height Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::height
141
142		#[inline(always)]
143	0	fn fill_buffers(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
144	0	unsafe {
145	0	self.fill_buffers_avx2(y, buffers);
146	0	}
147	0	} Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbaImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::fill_buffers Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgrImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::fill_buffers Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::BgraImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::fill_buffers Unexecuted instantiation: <jpeg_encoder::avx2::ycbcr::RgbImageAVX2 as jpeg_encoder::image_buffer::ImageBuffer>::fill_buffers
148		}
149		};
150		}
151
152		ycbcr_image_avx2!(RgbImageAVX2, 3, 0, 1, 2);
153		ycbcr_image_avx2!(RgbaImageAVX2, 4, 0, 1, 2);
154		ycbcr_image_avx2!(BgrImageAVX2, 3, 2, 1, 0);
155		ycbcr_image_avx2!(BgraImageAVX2, 4, 2, 1, 0);
156
157		#[cfg(test)]
158		mod tests {
159		use super::*;
160		use std::vec::Vec;
161
162		// A very basic linear congruential generator (LCG) to avoid external dependencies.
163		pub struct SimpleRng {
164		state: u64,
165		}
166
167		impl SimpleRng {
168		/// Create a new RNG with a given seed.
169		pub fn new(seed: u64) -> Self {
170		Self { state: seed }
171		}
172
173		/// Generate the next random u64 value.
174		pub fn next_u64(&mut self) -> u64 {
175		// Constants from Numerical Recipes
176		self.state = self.state.wrapping_mul(6364136223846793005).wrapping_add(1);
177		self.state
178		}
179
180		/// Generate a random byte in 0..=255
181		pub fn next_byte(&mut self) -> u8 {
182		(self.next_u64() & 0xFF) as u8
183		}
184
185		/// Fill a Vec<u8> with random bytes of the given length.
186		pub fn random_bytes(&mut self, len: usize) -> Vec<u8> {
187		(0..len).map(|_| self.next_byte()).collect()
188		}
189		}
190
191		#[test]
192		#[cfg(feature = "simd")]
193		fn avx_matches_scalar_rgb() {
194		// Do not run AVX2 test on machines without it
195		if !std::is_x86_feature_detected!("avx2") {
196		return;
197		}
198		let mut rng = SimpleRng::new(42);
199		let width = 512 + 3; // power of two plus a bit to stress remainder handling
200		let height = 1;
201		let bpp = 3;
202
203		let input = rng.random_bytes(width * height * bpp); // power of two plus a bit to exercise remainder handling
204
205		let scalar_result: Vec<[u8; 3]> = input
206		.chunks_exact(bpp)
207		.map(|chunk| {
208		let [r, g, b, ..] = chunk else { unreachable!() };
209		let (y, cb, cr) = rgb_to_ycbcr(*r, *g, *b);
210		[y, cb, cr]
211		})
212		.collect();
213
214		let mut buffers = [Vec::new(), Vec::new(), Vec::new(), Vec::new()];
215		let avx_input = RgbImageAVX2(
216		&input,
217		width.try_into().unwrap(),
218		height.try_into().unwrap(),
219		);
220		// SAFETY: we've checked above that AVX2 is present
221		unsafe {
222		avx_input.fill_buffers_avx2(0, &mut buffers);
223		}
224
225		for i in 0..3 {
226		assert_eq!(buffers[i].len(), input.len() / 3);
227		}
228
229		for (i, pixel) in scalar_result.iter().copied().enumerate() {
230		let avx_pixel: [u8; 3] = [buffers[0][i], buffers[1][i], buffers[2][i]];
231		if pixel != avx_pixel {
232		panic!(
233		"Mismatch at index {i}: scalar result is {pixel:?}, avx result is {avx_pixel:?}"
234		);
235		}
236		}
237		}
238		}