/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zune-jpeg-0.5.15/src/upsampler/avx2.rs

Source
/*
 * Copyright (c) 2025.
 *
 * This software is free software;
 *
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
 */

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

/// Doubles the length of a row of samples with a 3:1 weighted filter, using AVX2.
///
/// For every interior input sample `input[i]` two output samples are produced:
///
/// * `output[2 * i]     = (3 * input[i] + input[i - 1] + 2) >> 2`
/// * `output[2 * i + 1] = (3 * input[i] + input[i + 1] + 2) >> 2`
///
/// The outermost output samples are plain copies of the first and last input
/// samples. Rows shorter than 18 samples are forwarded unchanged to
/// `super::scalar::upsample_horizontal`; `in_near`, `in_far` and `scratch`
/// are only used by that fallback.
///
/// # Panics
///
/// Panics if `output.len() != input.len() * 2` or if `input.len() <= 2`.
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe fn upsample_horizontal_avx2(
    input: &[i16],
    in_near: &[i16],
    in_far: &[i16],
    scratch: &mut [i16],
    output: &mut [i16],
) {
    assert_eq!(input.len() * 2, output.len());
    assert!(input.len() > 2);

    let len = input.len();
    let out_len = output.len();

    // The vector kernel needs a full 18-sample window; shorter rows go scalar.
    if len < 18 {
        return super::scalar::upsample_horizontal(input, in_near, in_far, scratch, output);
    }

    // Left edge: the first sample has no left neighbour, so it is copied.
    let (first, second) = (input[0], input[1]);
    output[0] = first;
    output[1] = (first * 3 + second + 2) >> 2;

    let three = _mm256_set1_epi16(3);
    let rounding = _mm256_set1_epi16(2);

    // Consumes an 18-sample window and emits the 32 outputs belonging to its
    // 16 interior samples (window positions 1..=16).
    let kernel = |src: &[i16; 18], dst: &mut [i16; 32]| {
        // SAFETY: `src` holds 18 samples, so 16-lane loads starting at
        // offsets 0, 1 and 2 all stay inside the array.
        let (left, centre, right) = unsafe {
            let base = src.as_ptr();
            (
                _mm256_loadu_si256(base as *const __m256i),
                _mm256_loadu_si256(base.add(1) as *const __m256i),
                _mm256_loadu_si256(base.add(2) as *const __m256i),
            )
        };

        // 3 * centre + 2 is shared by both output phases.
        let weighted = _mm256_add_epi16(_mm256_mullo_epi16(centre, three), rounding);

        let even = _mm256_srai_epi16(_mm256_add_epi16(weighted, left), 2);
        let odd = _mm256_srai_epi16(_mm256_add_epi16(weighted, right), 2);

        // The unpack instructions interleave within each 128-bit lane, so the
        // lanes have to be shuffled afterwards to get sequential output order.
        let mixed_lo = _mm256_unpacklo_epi16(even, odd);
        let mixed_hi = _mm256_unpackhi_epi16(even, odd);

        let first_half = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x20);
        let second_half = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x31);

        // SAFETY: `dst` holds 32 samples, exactly enough for two 16-lane stores.
        unsafe {
            let base = dst.as_mut_ptr();
            _mm256_storeu_si256(base as *mut __m256i, first_half);
            _mm256_storeu_si256(base.add(16) as *mut __m256i, second_half);
        }
    };

    // Windows advance by 16 samples and overlap by two, since every window
    // needs one neighbour on each side of its 16 interior samples.
    let windows = input.windows(18).step_by(16);
    let blocks = output[2..].chunks_exact_mut(32);
    for (src, dst) in windows.zip(blocks) {
        kernel(src.try_into().unwrap(), dst.try_into().unwrap());
    }

    // Whatever the loop above did not reach is handled by one more window
    // aligned to the end of the row; recomputing overlapping samples is harmless.
    let tail_dst = output[..out_len - 2].last_chunk_mut::<32>();
    if let (Some(tail_src), Some(tail_dst)) = (input.last_chunk::<18>(), tail_dst) {
        kernel(tail_src, tail_dst);
    }

    // Right edge: mirror image of the left edge.
    let (penultimate, last) = (input[len - 2], input[len - 1]);
    output[out_len - 2] = (3 * last + penultimate + 2) >> 2;
    output[out_len - 1] = last;
}

/// Produces two output rows from one input row and its two neighbours, using AVX2.
///
/// `output` is treated as two rows of `input.len()` samples each. For every
/// column `i`:
///
/// * first half:  `(3 * input[i] + in_near[i] + 2) >> 2`
/// * second half: `(3 * input[i] + in_far[i] + 2) >> 2`
///
/// Rows shorter than 16 samples are forwarded unchanged to
/// `super::scalar::upsample_vertical`; `scratch` is only used by that fallback.
///
/// # Panics
///
/// Panics if `output.len() != input.len() * 2`, or if `in_near` or `in_far`
/// differ in length from `input`.
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe fn upsample_vertical_avx2(
    input: &[i16],
    in_near: &[i16],
    in_far: &[i16],
    scratch: &mut [i16],
    output: &mut [i16],
) {
    assert_eq!(input.len() * 2, output.len());
    assert_eq!(in_near.len(), input.len());
    assert_eq!(in_far.len(), input.len());

    // The vector kernel works on whole 16-sample groups; shorter rows go scalar.
    if input.len() < 16 {
        return super::scalar::upsample_vertical(input, in_near, in_far, scratch, output);
    }

    let half = output.len() / 2;
    let (out_top, out_bottom) = output.split_at_mut(half);

    let three = _mm256_set1_epi16(3);
    let rounding = _mm256_set1_epi16(2);

    // Blends 16 columns of the current row with each neighbouring row.
    let kernel = |centre: &[i16; 16],
                  near: &[i16; 16],
                  far: &[i16; 16],
                  top: &mut [i16; 16],
                  bottom: &mut [i16; 16]| {
        // SAFETY: each source array holds exactly 16 samples, i.e. one full
        // 256-bit unaligned load.
        let (v_centre, v_near, v_far) = unsafe {
            (
                _mm256_loadu_si256(centre.as_ptr() as *const __m256i),
                _mm256_loadu_si256(near.as_ptr() as *const __m256i),
                _mm256_loadu_si256(far.as_ptr() as *const __m256i),
            )
        };

        // 3 * centre + 2 is shared by both output rows.
        let weighted = _mm256_add_epi16(_mm256_mullo_epi16(v_centre, three), rounding);

        let v_top = _mm256_srai_epi16(_mm256_add_epi16(weighted, v_near), 2);
        let v_bottom = _mm256_srai_epi16(_mm256_add_epi16(weighted, v_far), 2);

        // SAFETY: each destination array holds exactly 16 samples, i.e. one
        // full 256-bit unaligned store.
        unsafe {
            _mm256_storeu_si256(top.as_mut_ptr() as *mut __m256i, v_top);
            _mm256_storeu_si256(bottom.as_mut_ptr() as *mut __m256i, v_bottom);
        }
    };

    let sources = input
        .chunks_exact(16)
        .zip(in_near.chunks_exact(16))
        .zip(in_far.chunks_exact(16));
    let sinks = out_top
        .chunks_exact_mut(16)
        .zip(out_bottom.chunks_exact_mut(16));

    for (((centre, near), far), (top, bottom)) in sources.zip(sinks) {
        kernel(
            centre.try_into().unwrap(),
            near.try_into().unwrap(),
            far.try_into().unwrap(),
            top.try_into().unwrap(),
            bottom.try_into().unwrap(),
        );
    }

    // Columns left over after the last whole group are covered by one more
    // group aligned to the end of the rows; recomputing overlapping columns
    // is harmless.
    if let (Some(centre), Some(near), Some(far), Some(top), Some(bottom)) = (
        input.last_chunk::<16>(),
        in_near.last_chunk::<16>(),
        in_far.last_chunk::<16>(),
        out_top.last_chunk_mut::<16>(),
        out_bottom.last_chunk_mut::<16>(),
    ) {
        kernel(centre, near, far, top, bottom);
    }
}

/// Upsamples one input row by two in both directions, using AVX2.
///
/// The work is done in two passes: `upsample_vertical_avx2` first writes two
/// rows of `input.len()` samples into the front of `scratch_space`, then
/// `upsample_horizontal_avx2` widens each of those rows into one half of
/// `output`.
///
/// # Panics
///
/// Panics if `output.len() != input.len() * 4` or if `scratch_space` holds
/// fewer than `input.len() * 2` samples. The assertions inside the two passes
/// apply as well.
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe fn upsample_hv_avx2(
    input: &[i16],
    in_near: &[i16],
    in_far: &[i16],
    scratch_space: &mut [i16],
    output: &mut [i16],
) {
    assert_eq!(input.len() * 4, output.len());
    assert!(input.len() * 2 <= scratch_space.len());

    let row_len = input.len();

    // Only the first two rows' worth of scratch are used; the vertical pass
    // insists on an exactly sized destination.
    let rows = &mut scratch_space[..row_len * 2];

    // Vertical pass; it gets an empty scratch buffer of its own.
    upsample_vertical_avx2(input, in_near, in_far, &mut [], rows);

    // Each intermediate row of `row_len` samples becomes `2 * row_len`
    // output samples.
    let (row_above, row_below) = rows.split_at(row_len);
    let (dst_above, dst_below) = output.split_at_mut(row_len * 2);

    // Horizontal pass; the neighbour rows are passed empty and a one-element
    // placeholder serves as scratch.
    let mut placeholder = [0];
    upsample_horizontal_avx2(row_above, &[], &[], &mut placeholder, dst_above);
    upsample_horizontal_avx2(row_below, &[], &[], &mut placeholder, dst_below);
}

Line	Count	Source
1		/*
2		* Copyright (c) 2025.
3		*
4		* This software is free software;
5		*
6		* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7		*/
8
9		#[cfg(target_arch = "x86")]
10		use core::arch::x86::*;
11		#[cfg(target_arch = "x86_64")]
12		use core::arch::x86_64::*;
13
14		#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
15		#[target_feature(enable = "avx2")]
16	2.01M	pub unsafe fn upsample_horizontal_avx2(
17	2.01M	input: &[i16],
18	2.01M	in_near: &[i16],
19	2.01M	in_far: &[i16],
20	2.01M	scratch: &mut [i16],
21	2.01M	output: &mut [i16],
22	2.01M	) {
23	2.01M	assert_eq!(input.len() * 2, output.len());
24	2.01M	assert!(input.len() > 2);
25
26	2.01M	let len = input.len();
27
28	2.01M	if len < 18 {
29	824k	return super::scalar::upsample_horizontal(input, in_near, in_far, scratch, output);
30	1.19M	}
31
32		// First two pixels
33	1.19M	output[0] = input[0];
34	1.19M	output[1] = (input[0] * 3 + input[1] + 2) >> 2;
35
36	1.19M	let v_three = _mm256_set1_epi16(3);
37	1.19M	let v_two = _mm256_set1_epi16(2);
38
39	47.2M	let upsample16 = \|input: &[i16; 18], output: &mut [i16; 32]\| {
40	47.2M	let in_ptr = input.as_ptr();
41	47.2M	let out_ptr = output.as_mut_ptr();
42
43		// SAFETY: The input is 18 * 16 bit long, so the loads are safe.
44	47.2M	let (v_prev, v_curr, v_next) = unsafe {
45	47.2M	(
46	47.2M	_mm256_loadu_si256(in_ptr.add(0) as *const __m256i),
47	47.2M	_mm256_loadu_si256(in_ptr.add(1) as *const __m256i),
48	47.2M	_mm256_loadu_si256(in_ptr.add(2) as *const __m256i),
49	47.2M	)
50	47.2M	};
51
52	47.2M	let v_common = _mm256_add_epi16(_mm256_mullo_epi16(v_curr, v_three), v_two);
53
54	47.2M	let v_even = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_prev), 2);
55	47.2M	let v_odd = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_next), 2);
56
57	47.2M	let v_res_1 = _mm256_unpacklo_epi16(v_even, v_odd);
58	47.2M	let v_res_2 = _mm256_unpackhi_epi16(v_even, v_odd);
59
60	47.2M	let v_final_1 = _mm256_permute2x128_si256(v_res_1, v_res_2, 0x20);
61	47.2M	let v_final_2 = _mm256_permute2x128_si256(v_res_1, v_res_2, 0x31);
62
63		// SAFETY: The output is 32 * 16 bit long, so the stores are safe.
64	47.2M	unsafe {
65	47.2M	_mm256_storeu_si256(out_ptr as *mut __m256i, v_final_1);
66	47.2M	_mm256_storeu_si256(out_ptr.add(16) as *mut __m256i, v_final_2);
67	47.2M	}
68	47.2M	};
69
70	46.1M	for (input, output) in input
71	1.19M	.windows(18)
72	1.19M	.step_by(16)
73	1.19M	.zip(output[2..].chunks_exact_mut(32))
74	46.1M	{
75	46.1M	upsample16(input.try_into().unwrap(), output.try_into().unwrap());
76	46.1M	}
77
78		// Upsample the remainder. This may have some overlap, but that's fine.
79	1.19M	if let Some(rest_input) = input.last_chunk::<18>() {
80	1.19M	let end = output.len() - 2;
81	1.19M	if let Some(rest_output) = output[..end].last_chunk_mut::<32>() {
82	1.19M	upsample16(rest_input, rest_output);
83	1.19M	}
84	0	}
85
86		// Last two pixels.
87	1.19M	output[output.len() - 2] = (3 * input[len - 1] + input[len - 2] + 2) >> 2;
88	1.19M	output[output.len() - 1] = input[len - 1];
89	2.01M	}
90
91		#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
92		#[target_feature(enable = "avx2")]
93	5.43M	pub unsafe fn upsample_vertical_avx2(
94	5.43M	input: &[i16],
95	5.43M	in_near: &[i16],
96	5.43M	in_far: &[i16],
97	5.43M	scratch: &mut [i16],
98	5.43M	output: &mut [i16],
99	5.43M	) {
100	5.43M	assert_eq!(input.len() * 2, output.len());
101	5.43M	assert_eq!(in_near.len(), input.len());
102	5.43M	assert_eq!(in_far.len(), input.len());
103
104	5.43M	let len = input.len();
105
106	5.43M	if len < 16 {
107	329k	return super::scalar::upsample_vertical(input, in_near, in_far, scratch, output);
108	5.10M	}
109
110	5.10M	let middle = output.len() / 2;
111	5.10M	let (out_top, out_bottom) = output.split_at_mut(middle);
112
113	5.10M	let v_three = _mm256_set1_epi16(3);
114	5.10M	let v_two = _mm256_set1_epi16(2);
115
116	5.10M	let upsample16 = \|input: &[i16; 16],
117		in_near: &[i16; 16],
118		in_far: &[i16; 16],
119		out_top: &mut [i16; 16],
120	202M	out_bottom: &mut [i16; 16]\| {
121		// SAFETY: Inputs are all 16 * 16 bit long, so the loads are safe.
122	202M	let (v_in, v_near, v_far) = unsafe {
123	202M	(
124	202M	_mm256_loadu_si256(input.as_ptr() as *const __m256i),
125	202M	_mm256_loadu_si256(in_near.as_ptr() as *const __m256i),
126	202M	_mm256_loadu_si256(in_far.as_ptr() as *const __m256i),
127	202M	)
128	202M	};
129
130	202M	let v_common = _mm256_add_epi16(_mm256_mullo_epi16(v_in, v_three), v_two);
131
132	202M	let v_out_top = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_near), 2);
133	202M	let v_out_bottom = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_far), 2);
134
135		// SAFETY: Outputs are 16 * 16 bit long, so the stores are safe.
136	202M	unsafe {
137	202M	_mm256_storeu_si256(out_top.as_mut_ptr() as *mut __m256i, v_out_top);
138	202M	_mm256_storeu_si256(out_bottom.as_mut_ptr() as *mut __m256i, v_out_bottom);
139	202M	}
140	202M	};
141
142	5.10M	let chunks = input
143	5.10M	.chunks_exact(16)
144	5.10M	.zip(in_near.chunks_exact(16))
145	5.10M	.zip(in_far.chunks_exact(16))
146	5.10M	.zip(out_top.chunks_exact_mut(16))
147	5.10M	.zip(out_bottom.chunks_exact_mut(16));
148
149	202M	for ((((input, in_near), in_far), out_top), out_bottom) in chunks {
150	197M	upsample16(
151	197M	input.try_into().unwrap(),
152	197M	in_near.try_into().unwrap(),
153	197M	in_far.try_into().unwrap(),
154	197M	out_top.try_into().unwrap(),
155	197M	out_bottom.try_into().unwrap(),
156	197M	);
157	197M	}
158
159		// Upsample the remainder. This may have some overlap, but that's fine.
160		// Edition upgrade will fix this nested awfulness.
161	5.10M	if let Some(rest) = input.last_chunk::<16>() {
162	5.10M	if let Some(rest_near) = in_near.last_chunk::<16>() {
163	5.10M	if let Some(rest_far) = in_far.last_chunk::<16>() {
164	5.10M	if let Some(mut rest_top) = out_top.last_chunk_mut::<16>() {
165	5.10M	if let Some(mut rest_bottom) = out_bottom.last_chunk_mut::<16>() {
166	5.10M	upsample16(rest, rest_near, rest_far, &mut rest_top, &mut rest_bottom);
167	5.10M	}
168	0	}
169	0	}
170	0	}
171	0	}
172	5.43M	}
173
174		#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
175		#[target_feature(enable = "avx2")]
176	529k	pub unsafe fn upsample_hv_avx2(
177	529k	input: &[i16],
178	529k	in_near: &[i16],
179	529k	in_far: &[i16],
180	529k	scratch_space: &mut [i16],
181	529k	output: &mut [i16],
182	529k	) {
183	529k	assert_eq!(input.len() * 4, output.len());
184	529k	assert!(input.len() * 2 <= scratch_space.len());
185	529k	let scratch_space = &mut scratch_space[..input.len() * 2];
186
187
188	529k	upsample_vertical_avx2(input, in_near, in_far, &mut [], scratch_space);
189
190	529k	let scratch_half = scratch_space.len() / 2;
191	529k	let output_half = output.len() / 2;
192
193	529k	let (scratch_top, scratch_bottom) = scratch_space.split_at_mut(scratch_half);
194	529k	let (out_top, out_bottom) = output.split_at_mut(output_half);
195
196	529k	let mut t = [0];
197	529k	upsample_horizontal_avx2(scratch_top, &[], &[], &mut t, out_top);
198	529k	upsample_horizontal_avx2(scratch_bottom, &[], &[], &mut t, out_bottom);
199	529k	}

Coverage Report

Created: 2026-05-16 07:04