/rust/registry/src/index.crates.io-6f17d22bba15001f/sha2-0.10.9/src/sha256/x86.rs
//! SHA-256 `x86`/`x86_64` backend

#![allow(clippy::many_single_char_names)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

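// Extends the message schedule by four words: `_mm_sha256msg1_epu32` applies
// the sigma0 step, `_mm_alignr_epi8` supplies the `W[t-7]` addends, and
// `_mm_sha256msg2_epu32` finishes with the sigma1 step.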
unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
    let t1 = _mm_sha256msg1_epu32(v0, v1);
    let t2 = _mm_alignr_epi8(v3, v2, 4);
    let t3 = _mm_add_epi32(t1, t2);
    _mm_sha256msg2_epu32(t3, v3)
}

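// Performs four SHA-256 rounds on the state pair. Each `_mm_sha256rnds2_epu32`
// executes two rounds, reading its `W + K` inputs from the low half of the
// third operand; the 0x0E shuffle moves the upper pair of words into place.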
macro_rules! rounds4 {
    ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
        let k = crate::consts::K32X4[$i];
        let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
        let t1 = _mm_add_epi32($rest, kv);
        $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
        let t2 = _mm_shuffle_epi32(t1, 0x0E);
        $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
    }};
}

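// Extends the schedule by four words (stored into `$w4`) and immediately runs
// the corresponding four rounds.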
macro_rules! schedule_rounds4 {
    (
        $abef:ident, $cdgh:ident,
        $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
        $i:expr
    ) => {{
        $w4 = schedule($w0, $w1, $w2, $w3);
        rounds4!($abef, $cdgh, $w4, $i);
    }};
}

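// Compresses the given blocks into `state` using the SHA-NI instruction set.
// Safety: the caller must ensure the `sha`, `sse2`, `ssse3`, and `sse4.1`
// target features are available at runtime.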
// we use unaligned loads with `__m128i` pointers
#[allow(clippy::cast_ptr_alignment)]
#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
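    // Shuffle mask that byte-swaps each 32-bit word, converting the
    // big-endian message words into the little-endian form the intrinsics use.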
    #[allow(non_snake_case)]
    let MASK: __m128i = _mm_set_epi64x(
        0x0C0D_0E0F_0809_0A0Bu64 as i64,
        0x0405_0607_0001_0203u64 as i64,
    );

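    // Load the state and rearrange it from linear `a..h` order into the
    // `ABEF`/`CDGH` register layout expected by `_mm_sha256rnds2_epu32`.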
    let state_ptr = state.as_ptr() as *const __m128i;
    let dcba = _mm_loadu_si128(state_ptr.add(0));
    let efgh = _mm_loadu_si128(state_ptr.add(1));

    let cdab = _mm_shuffle_epi32(dcba, 0xB1);
    let efgh = _mm_shuffle_epi32(efgh, 0x1B);
    let mut abef = _mm_alignr_epi8(cdab, efgh, 8);
    let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0);

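    // Compress each 64-byte block in turn, carrying the state in `abef`/`cdgh`.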
    for block in blocks {
        let abef_save = abef;
        let cdgh_save = cdgh;

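        // Load the sixteen message words and byte-swap them with `MASK`.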
        let data_ptr = block.as_ptr() as *const __m128i;
        let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(0)), MASK);
        let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK);
        let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK);
        let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(3)), MASK);
        let mut w4;

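        // 64 rounds in groups of four: the first 16 consume the message words
        // directly, the remaining 48 interleave schedule extension and rounds.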
        rounds4!(abef, cdgh, w0, 0);
        rounds4!(abef, cdgh, w1, 1);
        rounds4!(abef, cdgh, w2, 2);
        rounds4!(abef, cdgh, w3, 3);
        schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
        schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
        schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
        schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
        schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
        schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
        schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
        schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
        schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
        schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
        schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
        schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);

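        // Davies-Meyer feed-forward: add the block's starting state back in.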
        abef = _mm_add_epi32(abef, abef_save);
        cdgh = _mm_add_epi32(cdgh, cdgh_save);
    }

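    // Rearrange the working registers back into linear order and store them.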
    let feba = _mm_shuffle_epi32(abef, 0x1B);
    let dchg = _mm_shuffle_epi32(cdgh, 0xB1);
    let dcba = _mm_blend_epi16(feba, dchg, 0xF0);
    let hgef = _mm_alignr_epi8(dchg, feba, 8);

    let state_ptr_mut = state.as_mut_ptr() as *mut __m128i;
    _mm_storeu_si128(state_ptr_mut.add(0), dcba);
    _mm_storeu_si128(state_ptr_mut.add(1), hgef);
}

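// Generates the `shani_cpuid` module, which caches the runtime check for the
// required target features.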
cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1");

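// Dispatches to the SHA-NI implementation when the CPU supports it and to the
// portable software implementation otherwise.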
pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
    // after stabilization
    if shani_cpuid::get() {
        unsafe {
            digest_blocks(state, blocks);
        }
    } else {
        super::soft::compress(state, blocks);
    }
}
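
// A minimal sanity-check sketch (an illustrative addition, not part of the
// upstream crate): compressing the padded empty message from the SHA-256
// initial state should produce the well-known empty-string digest.
#[cfg(test)]
mod tests {
    #[test]
    fn compress_empty_message_block() {
        // SHA-256 initialization vector.
        let mut state: [u32; 8] = [
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
        ];
        // Padding for the empty message: a single 1 bit, zeros, and a
        // 64-bit message length of zero.
        let mut block = [0u8; 64];
        block[0] = 0x80;
        super::compress(&mut state, &[block]);
        // SHA-256("") = e3b0c442...7852b855, split into eight state words.
        assert_eq!(
            state,
            [
                0xe3b0c442, 0x98fc1c14, 0x9afbf4c8, 0x996fb924,
                0x27ae41e4, 0x649b934c, 0xa495991b, 0x7852b855,
            ]
        );
    }
}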