Coverage Report

Created: 2026-05-16 07:04

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zune-jpeg-0.5.15/src/upsampler/avx2.rs
Line
Count
Source
1
/*
2
 * Copyright (c) 2025.
3
 *
4
 * This software is free software;
5
 *
6
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7
 */
8
9
#[cfg(target_arch = "x86")]
10
use core::arch::x86::*;
11
#[cfg(target_arch = "x86_64")]
12
use core::arch::x86_64::*;
13
14
/// Horizontally upsamples `input` into `output`, producing two output samples
/// per input sample, using AVX2 for the interior of the row.
///
/// Output layout for an input of length `len`:
/// - `output[0] = input[0]`
/// - `output[2 * i - 1] = (3 * input[i - 1] + input[i] + 2) >> 2` (right of `i - 1`)
/// - `output[2 * i]     = (3 * input[i] + input[i - 1] + 2) >> 2` (left of `i`)
/// - `output[2 * len - 1] = input[len - 1]`
///
/// Rows shorter than 18 samples are forwarded unchanged to
/// `super::scalar::upsample_horizontal`. `in_near`, `in_far` and `scratch` are
/// only passed to that fallback; the AVX2 path never reads or writes them.
///
/// The vector path uses 16-bit lane arithmetic (`_mm256_mullo_epi16`,
/// `_mm256_add_epi16`), which wraps on overflow.
///
/// # Panics
///
/// Panics if `output.len() != input.len() * 2` or if `input.len() <= 2`.
///
/// # Safety
///
/// The function is compiled with `#[target_feature(enable = "avx2")]`; the
/// caller must ensure the executing CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
15
#[target_feature(enable = "avx2")]
16
2.01M
pub unsafe fn upsample_horizontal_avx2(
17
2.01M
    input: &[i16],
18
2.01M
    in_near: &[i16],
19
2.01M
    in_far: &[i16],
20
2.01M
    scratch: &mut [i16],
21
2.01M
    output: &mut [i16],
22
2.01M
) {
23
2.01M
    assert_eq!(input.len() * 2, output.len());
24
2.01M
    assert!(input.len() > 2);
25
26
2.01M
    let len = input.len();
27
28
2.01M
    // The vector kernel needs a full 18-sample window, so shorter rows take
    // the scalar implementation.
    if len < 18 {
29
824k
        return super::scalar::upsample_horizontal(input, in_near, in_far, scratch, output);
30
1.19M
    }
31
32
    // First two pixels
33
1.19M
    output[0] = input[0];
34
1.19M
    output[1] = (input[0] * 3 + input[1] + 2) >> 2;
35
36
1.19M
    let v_three = _mm256_set1_epi16(3);
37
1.19M
    let v_two = _mm256_set1_epi16(2);
38
39
47.2M
    // Kernel: from an 18-sample window, produce the 32 outputs belonging to
    // the 16 "current" samples input[1..17] (left and right neighbour blend
    // for each, interleaved).
    let upsample16 = |input: &[i16; 18], output: &mut [i16; 32]| {
40
47.2M
        let in_ptr = input.as_ptr();
41
47.2M
        let out_ptr = output.as_mut_ptr();
42
43
        // SAFETY: The input is 18 * 16 bit long, so the loads are safe.
44
47.2M
        // Three overlapping unaligned loads at element offsets 0, 1 and 2 give
        // the previous, current and next sample for each of the 16 lanes.
        let (v_prev, v_curr, v_next) = unsafe {
45
47.2M
            (
46
47.2M
                _mm256_loadu_si256(in_ptr.add(0) as *const __m256i),
47
47.2M
                _mm256_loadu_si256(in_ptr.add(1) as *const __m256i),
48
47.2M
                _mm256_loadu_si256(in_ptr.add(2) as *const __m256i),
49
47.2M
            )
50
47.2M
        };
51
52
47.2M
        // Shared term 3 * curr + 2 (weight plus rounding bias).
        let v_common = _mm256_add_epi16(_mm256_mullo_epi16(v_curr, v_three), v_two);
53
54
47.2M
        let v_even = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_prev), 2);
55
47.2M
        let v_odd = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_next), 2);
56
57
47.2M
        // unpacklo/unpackhi interleave within each 128-bit lane separately, so
        // the results still need a cross-lane reorder below.
        let v_res_1 = _mm256_unpacklo_epi16(v_even, v_odd);
58
47.2M
        let v_res_2 = _mm256_unpackhi_epi16(v_even, v_odd);
59
60
47.2M
        // 0x20 picks the low 128-bit lanes of both operands, 0x31 the high
        // lanes, restoring sequential output order.
        let v_final_1 = _mm256_permute2x128_si256(v_res_1, v_res_2, 0x20);
61
47.2M
        let v_final_2 = _mm256_permute2x128_si256(v_res_1, v_res_2, 0x31);
62
63
        // SAFETY: The output is 32 * 16 bit long, so the stores are safe.
64
47.2M
        unsafe {
65
47.2M
            _mm256_storeu_si256(out_ptr as *mut __m256i, v_final_1);
66
47.2M
            _mm256_storeu_si256(out_ptr.add(16) as *mut __m256i, v_final_2);
67
47.2M
        }
68
47.2M
    };
69
70
46.1M
    // Each iteration takes an 18-sample window advanced by 16 samples and
    // fills the next 32 outputs, starting at output[2] (the first two outputs
    // were written above).
    for (input, output) in input
71
1.19M
        .windows(18)
72
1.19M
        .step_by(16)
73
1.19M
        .zip(output[2..].chunks_exact_mut(32))
74
46.1M
    {
75
46.1M
        upsample16(input.try_into().unwrap(), output.try_into().unwrap());
76
46.1M
    }
77
78
    // Upsample the remainder. This may have some overlap, but that's fine.
79
1.19M
    // The last 18 inputs map onto the last 32 outputs that precede the final
    // two; overlapping positions are recomputed from the same inputs.
    if let Some(rest_input) = input.last_chunk::<18>() {
80
1.19M
        let end = output.len() - 2;
81
1.19M
        if let Some(rest_output) = output[..end].last_chunk_mut::<32>() {
82
1.19M
            upsample16(rest_input, rest_output);
83
1.19M
        }
84
0
    }
85
86
    // Last two pixels.
87
1.19M
    output[output.len() - 2] = (3 * input[len - 1] + input[len - 2] + 2) >> 2;
88
1.19M
    output[output.len() - 1] = input[len - 1];
89
2.01M
}
90
91
/// Vertically upsamples one row into two rows using AVX2.
///
/// `output` is treated as two consecutive halves of `input.len()` samples:
/// - first half:  `(3 * input[i] + in_near[i] + 2) >> 2`
/// - second half: `(3 * input[i] + in_far[i] + 2) >> 2`
///
/// Rows shorter than 16 samples are forwarded unchanged to
/// `super::scalar::upsample_vertical`. `scratch` is only passed to that
/// fallback; the AVX2 path never touches it.
///
/// The vector path uses 16-bit lane arithmetic (`_mm256_mullo_epi16`,
/// `_mm256_add_epi16`), which wraps on overflow.
///
/// # Panics
///
/// Panics if `output.len() != input.len() * 2`, or if `in_near` or `in_far`
/// differ in length from `input`.
///
/// # Safety
///
/// The function is compiled with `#[target_feature(enable = "avx2")]`; the
/// caller must ensure the executing CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
92
#[target_feature(enable = "avx2")]
93
5.43M
pub unsafe fn upsample_vertical_avx2(
94
5.43M
    input: &[i16],
95
5.43M
    in_near: &[i16],
96
5.43M
    in_far: &[i16],
97
5.43M
    scratch: &mut [i16],
98
5.43M
    output: &mut [i16],
99
5.43M
) {
100
5.43M
    assert_eq!(input.len() * 2, output.len());
101
5.43M
    assert_eq!(in_near.len(), input.len());
102
5.43M
    assert_eq!(in_far.len(), input.len());
103
104
5.43M
    let len = input.len();
105
106
5.43M
    // The vector kernel works on full 16-sample chunks, so shorter rows take
    // the scalar implementation.
    if len < 16 {
107
329k
        return super::scalar::upsample_vertical(input, in_near, in_far, scratch, output);
108
5.10M
    }
109
110
5.10M
    // Split the output into the row blended with `in_near` (first half) and
    // the row blended with `in_far` (second half).
    let middle = output.len() / 2;
111
5.10M
    let (out_top, out_bottom) = output.split_at_mut(middle);
112
113
5.10M
    let v_three = _mm256_set1_epi16(3);
114
5.10M
    let v_two = _mm256_set1_epi16(2);
115
116
5.10M
    // Kernel: blend 16 samples of `input` with the matching 16 samples of
    // each neighbouring row and store one 16-sample chunk per output row.
    let upsample16 = |input: &[i16; 16],
117
                      in_near: &[i16; 16],
118
                      in_far: &[i16; 16],
119
                      out_top: &mut [i16; 16],
120
202M
                      out_bottom: &mut [i16; 16]| {
121
        // SAFETY: Inputs are all 16 * 16 bit long, so the loads are safe.
122
202M
        let (v_in, v_near, v_far) = unsafe {
123
202M
            (
124
202M
                _mm256_loadu_si256(input.as_ptr() as *const __m256i),
125
202M
                _mm256_loadu_si256(in_near.as_ptr() as *const __m256i),
126
202M
                _mm256_loadu_si256(in_far.as_ptr() as *const __m256i),
127
202M
            )
128
202M
        };
129
130
202M
        // Shared term 3 * in + 2 (weight plus rounding bias).
        let v_common = _mm256_add_epi16(_mm256_mullo_epi16(v_in, v_three), v_two);
131
132
202M
        let v_out_top = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_near), 2);
133
202M
        let v_out_bottom = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_far), 2);
134
135
        // SAFETY: Outputs are 16 * 16 bit long, so the stores are safe.
136
202M
        unsafe {
137
202M
            _mm256_storeu_si256(out_top.as_mut_ptr() as *mut __m256i, v_out_top);
138
202M
            _mm256_storeu_si256(out_bottom.as_mut_ptr() as *mut __m256i, v_out_bottom);
139
202M
        }
140
202M
    };
141
142
5.10M
    // Walk all five slices in lock-step, 16 samples at a time; any tail
    // shorter than 16 is handled after the loop.
    let chunks = input
143
5.10M
        .chunks_exact(16)
144
5.10M
        .zip(in_near.chunks_exact(16))
145
5.10M
        .zip(in_far.chunks_exact(16))
146
5.10M
        .zip(out_top.chunks_exact_mut(16))
147
5.10M
        .zip(out_bottom.chunks_exact_mut(16));
148
149
202M
    for ((((input, in_near), in_far), out_top), out_bottom) in chunks {
150
197M
        upsample16(
151
197M
            input.try_into().unwrap(),
152
197M
            in_near.try_into().unwrap(),
153
197M
            in_far.try_into().unwrap(),
154
197M
            out_top.try_into().unwrap(),
155
197M
            out_bottom.try_into().unwrap(),
156
197M
        );
157
197M
    }
158
159
    // Upsample the remainder. This may have some overlap, but that's fine.
160
    // Edition upgrade will fix this nested awfulness.
161
5.10M
    // Re-run the kernel on the last 16 samples of every slice; positions that
    // overlap the loop above are recomputed from the same inputs.
    if let Some(rest) = input.last_chunk::<16>() {
162
5.10M
        if let Some(rest_near) = in_near.last_chunk::<16>() {
163
5.10M
            if let Some(rest_far) = in_far.last_chunk::<16>() {
164
5.10M
                if let Some(mut rest_top) = out_top.last_chunk_mut::<16>() {
165
5.10M
                    if let Some(mut rest_bottom) = out_bottom.last_chunk_mut::<16>() {
166
5.10M
                        upsample16(rest, rest_near, rest_far, &mut rest_top, &mut rest_bottom);
167
5.10M
                    }
168
0
                }
169
0
            }
170
0
        }
171
0
    }
172
5.43M
}
173
174
/// Upsamples one row by two in both directions using AVX2, producing
/// `4 * input.len()` output samples.
///
/// Runs in two passes:
/// 1. `upsample_vertical_avx2` writes two rows of `input.len()` samples into
///    the first `2 * input.len()` elements of `scratch_space`.
/// 2. `upsample_horizontal_avx2` expands each of those rows into the
///    corresponding half of `output`.
///
/// # Panics
///
/// Panics if `output.len() != input.len() * 4` or if `scratch_space` holds
/// fewer than `input.len() * 2` elements. The assertions of the two callees
/// also apply: `in_near` and `in_far` must match `input` in length, and
/// `input.len()` must be greater than 2.
///
/// # Safety
///
/// The function is compiled with `#[target_feature(enable = "avx2")]`; the
/// caller must ensure the executing CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
175
#[target_feature(enable = "avx2")]
176
529k
pub unsafe fn upsample_hv_avx2(
177
529k
    input: &[i16],
178
529k
    in_near: &[i16],
179
529k
    in_far: &[i16],
180
529k
    scratch_space: &mut [i16],
181
529k
    output: &mut [i16],
182
529k
) {
183
529k
    assert_eq!(input.len() * 4, output.len());
184
529k
    assert!(input.len() * 2 <= scratch_space.len());
185
529k
    // Trim the scratch buffer to exactly two rows so the vertical pass's
    // length assertion holds.
    let scratch_space = &mut scratch_space[..input.len() * 2];
186
187
188
529k
    // Pass 1: vertical. The empty slice fills the `scratch` parameter, which
    // the AVX2 path does not use.
    // NOTE(review): for short rows the scalar fallback receives this empty
    // slice too; presumably it does not use it either -- confirm.
    upsample_vertical_avx2(input, in_near, in_far, &mut [], scratch_space);
189
190
529k
    let scratch_half = scratch_space.len() / 2;
191
529k
    let output_half = output.len() / 2;
192
193
529k
    let (scratch_top, scratch_bottom) = scratch_space.split_at_mut(scratch_half);
194
529k
    let (out_top, out_bottom) = output.split_at_mut(output_half);
195
196
529k
    // Pass 2: horizontal, once per intermediate row. `t` and the empty slices
    // are placeholders for parameters the AVX2 path ignores.
    // NOTE(review): the scalar fallback for rows shorter than 18 also gets
    // these placeholders; presumably it ignores them as well -- confirm.
    let mut t = [0];
197
529k
    upsample_horizontal_avx2(scratch_top, &[], &[], &mut t, out_top);
198
529k
    upsample_horizontal_avx2(scratch_bottom, &[], &[], &mut t, out_bottom);
199
529k
}