/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zune-jpeg-0.5.15/src/upsampler/avx2.rs
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2025. |
3 | | * |
4 | | * This software is free software; |
5 | | * |
6 | | * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license |
7 | | */ |
8 | | |
9 | | #[cfg(target_arch = "x86")] |
10 | | use core::arch::x86::*; |
11 | | #[cfg(target_arch = "x86_64")] |
12 | | use core::arch::x86_64::*; |
13 | | |
14 | | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
15 | | #[target_feature(enable = "avx2")] |
16 | 2.01M | pub unsafe fn upsample_horizontal_avx2( |
17 | 2.01M | input: &[i16], |
18 | 2.01M | in_near: &[i16], |
19 | 2.01M | in_far: &[i16], |
20 | 2.01M | scratch: &mut [i16], |
21 | 2.01M | output: &mut [i16], |
22 | 2.01M | ) { |
23 | 2.01M | assert_eq!(input.len() * 2, output.len()); |
24 | 2.01M | assert!(input.len() > 2); |
25 | | |
26 | 2.01M | let len = input.len(); |
27 | | |
28 | 2.01M | if len < 18 { |
29 | 824k | return super::scalar::upsample_horizontal(input, in_near, in_far, scratch, output); |
30 | 1.19M | } |
31 | | |
32 | | // First two pixels |
33 | 1.19M | output[0] = input[0]; |
34 | 1.19M | output[1] = (input[0] * 3 + input[1] + 2) >> 2; |
35 | | |
36 | 1.19M | let v_three = _mm256_set1_epi16(3); |
37 | 1.19M | let v_two = _mm256_set1_epi16(2); |
38 | | |
39 | 47.2M | let upsample16 = |input: &[i16; 18], output: &mut [i16; 32]| { |
40 | 47.2M | let in_ptr = input.as_ptr(); |
41 | 47.2M | let out_ptr = output.as_mut_ptr(); |
42 | | |
43 | | // SAFETY: The input is 18 * 16 bit long, so the loads are safe. |
44 | 47.2M | let (v_prev, v_curr, v_next) = unsafe { |
45 | 47.2M | ( |
46 | 47.2M | _mm256_loadu_si256(in_ptr.add(0) as *const __m256i), |
47 | 47.2M | _mm256_loadu_si256(in_ptr.add(1) as *const __m256i), |
48 | 47.2M | _mm256_loadu_si256(in_ptr.add(2) as *const __m256i), |
49 | 47.2M | ) |
50 | 47.2M | }; |
51 | | |
52 | 47.2M | let v_common = _mm256_add_epi16(_mm256_mullo_epi16(v_curr, v_three), v_two); |
53 | | |
54 | 47.2M | let v_even = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_prev), 2); |
55 | 47.2M | let v_odd = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_next), 2); |
56 | | |
57 | 47.2M | let v_res_1 = _mm256_unpacklo_epi16(v_even, v_odd); |
58 | 47.2M | let v_res_2 = _mm256_unpackhi_epi16(v_even, v_odd); |
59 | | |
60 | 47.2M | let v_final_1 = _mm256_permute2x128_si256(v_res_1, v_res_2, 0x20); |
61 | 47.2M | let v_final_2 = _mm256_permute2x128_si256(v_res_1, v_res_2, 0x31); |
62 | | |
63 | | // SAFETY: The output is 32 * 16 bit long, so the stores are safe. |
64 | 47.2M | unsafe { |
65 | 47.2M | _mm256_storeu_si256(out_ptr as *mut __m256i, v_final_1); |
66 | 47.2M | _mm256_storeu_si256(out_ptr.add(16) as *mut __m256i, v_final_2); |
67 | 47.2M | } |
68 | 47.2M | }; |
69 | | |
70 | 46.1M | for (input, output) in input |
71 | 1.19M | .windows(18) |
72 | 1.19M | .step_by(16) |
73 | 1.19M | .zip(output[2..].chunks_exact_mut(32)) |
74 | 46.1M | { |
75 | 46.1M | upsample16(input.try_into().unwrap(), output.try_into().unwrap()); |
76 | 46.1M | } |
77 | | |
78 | | // Upsample the remainder. This may have some overlap, but that's fine. |
79 | 1.19M | if let Some(rest_input) = input.last_chunk::<18>() { |
80 | 1.19M | let end = output.len() - 2; |
81 | 1.19M | if let Some(rest_output) = output[..end].last_chunk_mut::<32>() { |
82 | 1.19M | upsample16(rest_input, rest_output); |
83 | 1.19M | } |
84 | 0 | } |
85 | | |
86 | | // Last two pixels. |
87 | 1.19M | output[output.len() - 2] = (3 * input[len - 1] + input[len - 2] + 2) >> 2; |
88 | 1.19M | output[output.len() - 1] = input[len - 1]; |
89 | 2.01M | } |
90 | | |
91 | | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
92 | | #[target_feature(enable = "avx2")] |
93 | 5.43M | pub unsafe fn upsample_vertical_avx2( |
94 | 5.43M | input: &[i16], |
95 | 5.43M | in_near: &[i16], |
96 | 5.43M | in_far: &[i16], |
97 | 5.43M | scratch: &mut [i16], |
98 | 5.43M | output: &mut [i16], |
99 | 5.43M | ) { |
100 | 5.43M | assert_eq!(input.len() * 2, output.len()); |
101 | 5.43M | assert_eq!(in_near.len(), input.len()); |
102 | 5.43M | assert_eq!(in_far.len(), input.len()); |
103 | | |
104 | 5.43M | let len = input.len(); |
105 | | |
106 | 5.43M | if len < 16 { |
107 | 329k | return super::scalar::upsample_vertical(input, in_near, in_far, scratch, output); |
108 | 5.10M | } |
109 | | |
110 | 5.10M | let middle = output.len() / 2; |
111 | 5.10M | let (out_top, out_bottom) = output.split_at_mut(middle); |
112 | | |
113 | 5.10M | let v_three = _mm256_set1_epi16(3); |
114 | 5.10M | let v_two = _mm256_set1_epi16(2); |
115 | | |
116 | 5.10M | let upsample16 = |input: &[i16; 16], |
117 | | in_near: &[i16; 16], |
118 | | in_far: &[i16; 16], |
119 | | out_top: &mut [i16; 16], |
120 | 202M | out_bottom: &mut [i16; 16]| { |
121 | | // SAFETY: Inputs are all 16 * 16 bit long, so the loads are safe. |
122 | 202M | let (v_in, v_near, v_far) = unsafe { |
123 | 202M | ( |
124 | 202M | _mm256_loadu_si256(input.as_ptr() as *const __m256i), |
125 | 202M | _mm256_loadu_si256(in_near.as_ptr() as *const __m256i), |
126 | 202M | _mm256_loadu_si256(in_far.as_ptr() as *const __m256i), |
127 | 202M | ) |
128 | 202M | }; |
129 | | |
130 | 202M | let v_common = _mm256_add_epi16(_mm256_mullo_epi16(v_in, v_three), v_two); |
131 | | |
132 | 202M | let v_out_top = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_near), 2); |
133 | 202M | let v_out_bottom = _mm256_srai_epi16(_mm256_add_epi16(v_common, v_far), 2); |
134 | | |
135 | | // SAFETY: Outputs are 16 * 16 bit long, so the stores are safe. |
136 | 202M | unsafe { |
137 | 202M | _mm256_storeu_si256(out_top.as_mut_ptr() as *mut __m256i, v_out_top); |
138 | 202M | _mm256_storeu_si256(out_bottom.as_mut_ptr() as *mut __m256i, v_out_bottom); |
139 | 202M | } |
140 | 202M | }; |
141 | | |
142 | 5.10M | let chunks = input |
143 | 5.10M | .chunks_exact(16) |
144 | 5.10M | .zip(in_near.chunks_exact(16)) |
145 | 5.10M | .zip(in_far.chunks_exact(16)) |
146 | 5.10M | .zip(out_top.chunks_exact_mut(16)) |
147 | 5.10M | .zip(out_bottom.chunks_exact_mut(16)); |
148 | | |
149 | 202M | for ((((input, in_near), in_far), out_top), out_bottom) in chunks { |
150 | 197M | upsample16( |
151 | 197M | input.try_into().unwrap(), |
152 | 197M | in_near.try_into().unwrap(), |
153 | 197M | in_far.try_into().unwrap(), |
154 | 197M | out_top.try_into().unwrap(), |
155 | 197M | out_bottom.try_into().unwrap(), |
156 | 197M | ); |
157 | 197M | } |
158 | | |
159 | | // Upsample the remainder. This may have some overlap, but that's fine. |
160 | | // Edition upgrade will fix this nested awfulness. |
161 | 5.10M | if let Some(rest) = input.last_chunk::<16>() { |
162 | 5.10M | if let Some(rest_near) = in_near.last_chunk::<16>() { |
163 | 5.10M | if let Some(rest_far) = in_far.last_chunk::<16>() { |
164 | 5.10M | if let Some(mut rest_top) = out_top.last_chunk_mut::<16>() { |
165 | 5.10M | if let Some(mut rest_bottom) = out_bottom.last_chunk_mut::<16>() { |
166 | 5.10M | upsample16(rest, rest_near, rest_far, &mut rest_top, &mut rest_bottom); |
167 | 5.10M | } |
168 | 0 | } |
169 | 0 | } |
170 | 0 | } |
171 | 0 | } |
172 | 5.43M | } |
173 | | |
174 | | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
175 | | #[target_feature(enable = "avx2")] |
176 | 529k | pub unsafe fn upsample_hv_avx2( |
177 | 529k | input: &[i16], |
178 | 529k | in_near: &[i16], |
179 | 529k | in_far: &[i16], |
180 | 529k | scratch_space: &mut [i16], |
181 | 529k | output: &mut [i16], |
182 | 529k | ) { |
183 | 529k | assert_eq!(input.len() * 4, output.len()); |
184 | 529k | assert!(input.len() * 2 <= scratch_space.len()); |
185 | 529k | let scratch_space = &mut scratch_space[..input.len() * 2]; |
186 | | |
187 | | |
188 | 529k | upsample_vertical_avx2(input, in_near, in_far, &mut [], scratch_space); |
189 | | |
190 | 529k | let scratch_half = scratch_space.len() / 2; |
191 | 529k | let output_half = output.len() / 2; |
192 | | |
193 | 529k | let (scratch_top, scratch_bottom) = scratch_space.split_at_mut(scratch_half); |
194 | 529k | let (out_top, out_bottom) = output.split_at_mut(output_half); |
195 | | |
196 | 529k | let mut t = [0]; |
197 | 529k | upsample_horizontal_avx2(scratch_top, &[], &[], &mut t, out_top); |
198 | 529k | upsample_horizontal_avx2(scratch_bottom, &[], &[], &mut t, out_bottom); |
199 | 529k | } |