/rust/registry/src/index.crates.io-1949cf8c6b5b557f/sha2-0.10.9/src/sha512/x86.rs
//! SHA-512 `x86`/`x86_64` backend

#![allow(clippy::many_single_char_names)]

use core::mem::size_of;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::consts::K64;

cpufeatures::new!(avx2_cpuid, "avx2");
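
// Runtime dispatch: use the AVX2 backend when the CPU supports it,
// otherwise fall back to the portable software implementation.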
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
    // after stabilization
    if avx2_cpuid::get() {
        unsafe {
            sha512_compress_x86_64_avx2(state, blocks);
        }
    } else {
        super::soft::compress(state, blocks);
    }
}
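
// Two blocks are processed per iteration: the message schedules of block N
// and block N + 1 travel in the low and high 128-bit lanes of each 256-bit
// register. An odd leading block is compressed first via the AVX path.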
#[target_feature(enable = "avx2")]
unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
    let mut start_block = 0;

    if blocks.len() & 0b1 != 0 {
        sha512_compress_x86_64_avx(state, &blocks[0]);
        start_block += 1;
    }

    let mut ms: MsgSchedule = [_mm_setzero_si128(); 8];
    let mut t2: RoundStates = [_mm_setzero_si128(); 40];
    let mut x = [_mm256_setzero_si256(); 8];

    for i in (start_block..blocks.len()).step_by(2) {
        load_data_avx2(&mut x, &mut ms, &mut t2, blocks.as_ptr().add(i) as *const _);

        // First block
        let mut current_state = *state;
        rounds_0_63_avx2(&mut current_state, &mut x, &mut ms, &mut t2);
        rounds_64_79(&mut current_state, &ms);
        accumulate_state(state, &current_state);

        // Second block
        current_state = *state;
        process_second_block(&mut current_state, &t2);
        accumulate_state(state, &current_state);
    }
}

#[inline(always)]
unsafe fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) {
    let mut ms = [_mm_setzero_si128(); 8];
    let mut x = [_mm_setzero_si128(); 8];

    // Reduced to single iteration
    let mut current_state = *state;
    load_data_avx(&mut x, &mut ms, block.as_ptr() as *const _);
    rounds_0_63_avx(&mut current_state, &mut x, &mut ms);
    rounds_64_79(&mut current_state, &ms);
    accumulate_state(state, &current_state);
}
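
// MASK reverses the byte order within each 64-bit lane, converting the
// big-endian message words to native byte order; the K64 round constants
// for the first 16 rounds are added as the words are loaded.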
#[inline(always)]
unsafe fn load_data_avx(x: &mut [__m128i; 8], ms: &mut MsgSchedule, data: *const __m128i) {
    #[allow(non_snake_case)]
    let MASK = _mm_setr_epi32(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);

    macro_rules! unrolled_iterations {
        ($($i:literal),*) => {$(
            x[$i] = _mm_loadu_si128(data.add($i) as *const _);
            x[$i] = _mm_shuffle_epi8(x[$i], MASK);

            let y = _mm_add_epi64(
                x[$i],
                _mm_loadu_si128(&K64[2 * $i] as *const u64 as *const _),
            );

            ms[$i] = y;
        )*};
    }

    unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
}
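
// AVX2 variant: loads the same 16-word range from two consecutive blocks,
// placing block N in the low lane and block N + 1 in the high lane; the
// constant-added schedule words for the second block are stashed in `t2`.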
#[inline(always)]
unsafe fn load_data_avx2(
    x: &mut [__m256i; 8],
    ms: &mut MsgSchedule,
    t2: &mut RoundStates,
    data: *const __m128i,
) {
    #[allow(non_snake_case)]
    let MASK = _mm256_set_epi64x(
        0x0809_0A0B_0C0D_0E0F_i64,
        0x0001_0203_0405_0607_i64,
        0x0809_0A0B_0C0D_0E0F_i64,
        0x0001_0203_0405_0607_i64,
    );

    macro_rules! unrolled_iterations {
        ($($i:literal),*) => {$(
            x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add(8 + $i) as *const _), 1);
            x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i) as *const _), 0);

            x[$i] = _mm256_shuffle_epi8(x[$i], MASK);

            let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _);
            let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t));

            ms[$i] = _mm256_extracti128_si256(y, 0);
            t2[$i] = _mm256_extracti128_si256(y, 1);
        )*};
    }

    unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
}
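
// Rounds 0..63: each of the 32 iterations performs two scalar rounds with
// schedule words prepared earlier (round constants already added), then
// computes the next two schedule words and pre-adds their K64 constants.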
#[inline(always)]
unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &mut MsgSchedule) {
    let mut k64_idx: usize = SHA512_BLOCK_WORDS_NUM;

    for _ in 0..4 {
        for j in 0..8 {
            let k64 = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _);
            let y = sha512_update_x_avx(x, k64);

            {
                let ms = cast_ms(ms);
                sha_round(current_state, ms[2 * j]);
                sha_round(current_state, ms[2 * j + 1]);
            }

            ms[j] = y;
            k64_idx += 2;
        }
    }
}

#[inline(always)]
unsafe fn rounds_0_63_avx2(
    current_state: &mut State,
    x: &mut [__m256i; 8],
    ms: &mut MsgSchedule,
    t2: &mut RoundStates,
) {
    let mut k64x4_idx: usize = SHA512_BLOCK_WORDS_NUM;

    for i in 1..5 {
        for j in 0..8 {
            let t = _mm_loadu_si128(K64.as_ptr().add(k64x4_idx) as *const u64 as *const _);
            let y = sha512_update_x_avx2(x, _mm256_set_m128i(t, t));

            {
                let ms = cast_ms(ms);
                sha_round(current_state, ms[2 * j]);
                sha_round(current_state, ms[2 * j + 1]);
            }

            ms[j] = _mm256_extracti128_si256(y, 0);
            t2[8 * i + j] = _mm256_extracti128_si256(y, 1);

            k64x4_idx += 2;
        }
    }
}
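
// The final 16 rounds consume the remaining schedule words without
// computing any new ones.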
#[inline(always)]
fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) {
    let ms = cast_ms(ms);
    for i in 64..80 {
        sha_round(current_state, ms[i & 0xf]);
    }
}
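
// The second block's 80 rounds reuse the schedule words saved in `t2`
// during the first pass, so no schedule computation is repeated.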
#[inline(always)]
fn process_second_block(current_state: &mut State, t2: &RoundStates) {
    for t2 in cast_rs(t2).iter() {
        sha_round(current_state, *t2);
    }
}
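
// One scalar SHA-512 round. `x` is a message-schedule word with its round
// constant already folded in; instead of renaming the eight working
// variables a..h, the state array is rotated at the end of each round.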
#[inline(always)]
fn sha_round(s: &mut State, x: u64) {
    macro_rules! big_sigma0 {
        ($a:expr) => {
            $a.rotate_right(28) ^ $a.rotate_right(34) ^ $a.rotate_right(39)
        };
    }
    macro_rules! big_sigma1 {
        ($a:expr) => {
            $a.rotate_right(14) ^ $a.rotate_right(18) ^ $a.rotate_right(41)
        };
    }
    macro_rules! bool3ary_202 {
        ($a:expr, $b:expr, $c:expr) => {
            $c ^ ($a & ($b ^ $c))
        };
    } // Choose, MD5F, SHA1C
    macro_rules! bool3ary_232 {
        ($a:expr, $b:expr, $c:expr) => {
            ($a & $b) ^ ($a & $c) ^ ($b & $c)
        };
    } // Majority, SHA1M

    macro_rules! rotate_state {
        ($s:ident) => {{
            let tmp = $s[7];
            $s[7] = $s[6];
            $s[6] = $s[5];
            $s[5] = $s[4];
            $s[4] = $s[3];
            $s[3] = $s[2];
            $s[2] = $s[1];
            $s[1] = $s[0];
            $s[0] = tmp;
        }};
    }

    let t = x
        .wrapping_add(s[7])
        .wrapping_add(big_sigma1!(s[4]))
        .wrapping_add(bool3ary_202!(s[4], s[5], s[6]));

    s[7] = t
        .wrapping_add(big_sigma0!(s[0]))
        .wrapping_add(bool3ary_232!(s[0], s[1], s[2]));
    s[3] = s[3].wrapping_add(t);

    rotate_state!(s);
}

#[inline(always)]
fn accumulate_state(dst: &mut State, src: &State) {
    for i in 0..SHA512_HASH_WORDS_NUM {
        dst[i] = dst[i].wrapping_add(src[i]);
    }
}
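
// Generates the message-schedule update for both the AVX (`__m128i`) and
// AVX2 (`__m256i`) paths: computes sigma0/sigma1 for two schedule words at a
// time, rotates the 16-word window, and returns the new words with `k64`
// already added.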
macro_rules! fn_sha512_update_x {
    ($name:ident, $ty:ident, {
        ADD64 = $ADD64:ident,
        ALIGNR8 = $ALIGNR8:ident,
        SRL64 = $SRL64:ident,
        SLL64 = $SLL64:ident,
        XOR = $XOR:ident,
    }) => {
        unsafe fn $name(x: &mut [$ty; 8], k64: $ty) -> $ty {
            // q[2:1]
            let mut t0 = $ALIGNR8(x[1], x[0], 8);
            // q[10:9]
            let mut t3 = $ALIGNR8(x[5], x[4], 8);
            // q[2:1] >> s0[0]
            let mut t2 = $SRL64(t0, 1);
            // q[1:0] + q[10:9]
            x[0] = $ADD64(x[0], t3);
            // q[2:1] >> s0[2]
            t3 = $SRL64(t0, 7);
            // q[2:1] << (64 - s0[1])
            let mut t1 = $SLL64(t0, 64 - 8);
            // (q[2:1] >> s0[2]) ^
            // (q[2:1] >> s0[0])
            t0 = $XOR(t3, t2);
            // q[2:1] >> s0[1]
            t2 = $SRL64(t2, 8 - 1);
            // (q[2:1] >> s0[2]) ^
            // (q[2:1] >> s0[0]) ^
            // q[2:1] << (64 - s0[1])
            t0 = $XOR(t0, t1);
            // q[2:1] << (64 - s0[0])
            t1 = $SLL64(t1, 8 - 1);
            // sigma0(q[2:1])
            t0 = $XOR(t0, t2);
            t0 = $XOR(t0, t1);
            // q[15:14] >> s1[2]
            t3 = $SRL64(x[7], 6);
            // q[15:14] << (64 - s1[1])
            t2 = $SLL64(x[7], 64 - 61);
            // q[1:0] + sigma0(q[2:1])
            x[0] = $ADD64(x[0], t0);
            // q[15:14] >> s1[0]
            t1 = $SRL64(x[7], 19);
            // q[15:14] >> s1[2] ^
            // q[15:14] << (64 - s1[1])
            t3 = $XOR(t3, t2);
            // q[15:14] << (64 - s1[0])
            t2 = $SLL64(t2, 61 - 19);
            // q[15:14] >> s1[2] ^
            // q[15:14] << (64 - s1[1]) ^
            // q[15:14] >> s1[0]
            t3 = $XOR(t3, t1);
            // q[15:14] >> s1[1]
            t1 = $SRL64(t1, 61 - 19);
            // sigma1(q[15:14])
            t3 = $XOR(t3, t2);
            t3 = $XOR(t3, t1);

            // q[1:0] + q[10:9] + sigma1(q[15:14]) + sigma0(q[2:1])
            x[0] = $ADD64(x[0], t3);

            // rotate
            let temp = x[0];
            x[0] = x[1];
            x[1] = x[2];
            x[2] = x[3];
            x[3] = x[4];
            x[4] = x[5];
            x[5] = x[6];
            x[6] = x[7];
            x[7] = temp;

            $ADD64(x[7], k64)
        }
    };
}

fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
    ADD64 = _mm_add_epi64,
    ALIGNR8 = _mm_alignr_epi8,
    SRL64 = _mm_srli_epi64,
    SLL64 = _mm_slli_epi64,
    XOR = _mm_xor_si128,
});

fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
    ADD64 = _mm256_add_epi64,
    ALIGNR8 = _mm256_alignr_epi8,
    SRL64 = _mm256_srli_epi64,
    SLL64 = _mm256_slli_epi64,
    XOR = _mm256_xor_si256,
});
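
// These casts reinterpret the SIMD word arrays as flat `u64` arrays so the
// scalar round function can consume them; each `__m128i` holds two `u64`
// schedule words, so the element counts line up exactly.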
#[inline(always)]
fn cast_ms(ms: &MsgSchedule) -> &[u64; SHA512_BLOCK_WORDS_NUM] {
    unsafe { &*(ms as *const MsgSchedule as *const _) }
}

#[inline(always)]
fn cast_rs(rs: &RoundStates) -> &[u64; SHA512_ROUNDS_NUM] {
    unsafe { &*(rs as *const RoundStates as *const _) }
}

type State = [u64; SHA512_HASH_WORDS_NUM];
type MsgSchedule = [__m128i; SHA512_BLOCK_WORDS_NUM / 2];
type RoundStates = [__m128i; SHA512_ROUNDS_NUM / 2];

const SHA512_BLOCK_BYTE_LEN: usize = 128;
const SHA512_ROUNDS_NUM: usize = 80;
const SHA512_HASH_BYTE_LEN: usize = 64;
const SHA512_HASH_WORDS_NUM: usize = SHA512_HASH_BYTE_LEN / size_of::<u64>();
const SHA512_BLOCK_WORDS_NUM: usize = SHA512_BLOCK_BYTE_LEN / size_of::<u64>();
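
// A minimal sanity-check sketch, not part of the original file: compressing
// the padded single-block message "abc" from the standard SHA-512 initial
// hash value should yield the well-known FIPS 180-4 test-vector digest. The
// test name and inline constants are illustrative additions.
#[cfg(test)]
mod tests {
    #[test]
    fn compress_abc_vector() {
        // SHA-512 initial hash value (FIPS 180-4, section 5.3.5).
        let mut state: [u64; 8] = [
            0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
            0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
        ];

        // "abc" padded to one 128-byte block: message bytes, a 0x80 marker,
        // zeros, and the 128-bit message bit length (24) in the final bytes.
        let mut block = [0u8; 128];
        block[..3].copy_from_slice(b"abc");
        block[3] = 0x80;
        block[127] = 24;

        super::compress(&mut state, &[block]);

        // Expected digest of "abc" as eight big-endian 64-bit words.
        assert_eq!(
            state,
            [
                0xddaf35a193617aba, 0xcc417349ae204131, 0x12e6fa4e89a97ea2,
                0x0a9eeee64b55d39a, 0x2192992a274fc1a8, 0x36ba3c23a3feebbd,
                0x454d4423643ce80e, 0x2a9ac94fa54ca49f,
            ]
        );
    }
}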