/rust/registry/src/index.crates.io-6f17d22bba15001f/polyval-0.5.3/src/backend/clmul.rs
Line | Count | Source (jump to first uncovered line) |
1 | | //! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs |
2 | | //! (i.e. Intel Sandy Bridge-compatible or newer) |
3 | | |
4 | | use crate::{Block, Key}; |
5 | | use universal_hash::{consts::U16, NewUniversalHash, Output, UniversalHash}; |
6 | | |
7 | | #[cfg(target_arch = "x86")] |
8 | | use core::arch::x86::*; |
9 | | #[cfg(target_arch = "x86_64")] |
10 | | use core::arch::x86_64::*; |
11 | | |
12 | | /// **POLYVAL**: GHASH-like universal hash over GF(2^128). |
13 | | #[derive(Clone)] |
14 | | pub struct Polyval { |
15 | | h: __m128i, |
16 | | y: __m128i, |
17 | | } |
18 | | |
19 | | impl NewUniversalHash for Polyval { |
20 | | type KeySize = U16; |
21 | | |
22 | | /// Initialize POLYVAL with the given `H` field element |
23 | 3.12k | fn new(h: &Key) -> Self { |
24 | 3.12k | unsafe { |
25 | 3.12k | // `_mm_loadu_si128` performs an unaligned load |
26 | 3.12k | #[allow(clippy::cast_ptr_alignment)] |
27 | 3.12k | Self { |
28 | 3.12k | h: _mm_loadu_si128(h.as_ptr() as *const __m128i), |
29 | 3.12k | y: _mm_setzero_si128(), |
30 | 3.12k | } |
31 | 3.12k | } |
32 | 3.12k | } <polyval::backend::clmul::Polyval as universal_hash::NewUniversalHash>::new Line | Count | Source | 23 | 1.79k | fn new(h: &Key) -> Self { | 24 | 1.79k | unsafe { | 25 | 1.79k | // `_mm_loadu_si128` performs an unaligned load | 26 | 1.79k | #[allow(clippy::cast_ptr_alignment)] | 27 | 1.79k | Self { | 28 | 1.79k | h: _mm_loadu_si128(h.as_ptr() as *const __m128i), | 29 | 1.79k | y: _mm_setzero_si128(), | 30 | 1.79k | } | 31 | 1.79k | } | 32 | 1.79k | } |
<polyval::backend::clmul::Polyval as universal_hash::NewUniversalHash>::new Line | Count | Source | 23 | 1.33k | fn new(h: &Key) -> Self { | 24 | 1.33k | unsafe { | 25 | 1.33k | // `_mm_loadu_si128` performs an unaligned load | 26 | 1.33k | #[allow(clippy::cast_ptr_alignment)] | 27 | 1.33k | Self { | 28 | 1.33k | h: _mm_loadu_si128(h.as_ptr() as *const __m128i), | 29 | 1.33k | y: _mm_setzero_si128(), | 30 | 1.33k | } | 31 | 1.33k | } | 32 | 1.33k | } |
|
33 | | } |
34 | | |
35 | | impl UniversalHash for Polyval { |
36 | | type BlockSize = U16; |
37 | | |
38 | | #[inline] |
39 | 311k | fn update(&mut self, x: &Block) { |
40 | 311k | unsafe { |
41 | 311k | self.mul(x); |
42 | 311k | } |
43 | 311k | } <polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::update Line | Count | Source | 39 | 193k | fn update(&mut self, x: &Block) { | 40 | 193k | unsafe { | 41 | 193k | self.mul(x); | 42 | 193k | } | 43 | 193k | } |
<polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::update Line | Count | Source | 39 | 118k | fn update(&mut self, x: &Block) { | 40 | 118k | unsafe { | 41 | 118k | self.mul(x); | 42 | 118k | } | 43 | 118k | } |
|
44 | | |
45 | | /// Reset internal state |
46 | 0 | fn reset(&mut self) { |
47 | 0 | unsafe { |
48 | 0 | self.y = _mm_setzero_si128(); |
49 | 0 | } |
50 | 0 | } Unexecuted instantiation: <polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::reset Unexecuted instantiation: <polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::reset |
51 | | |
52 | | /// Get POLYVAL output |
53 | 5.15k | fn finalize(self) -> Output<Self> { |
54 | 5.15k | unsafe { core::mem::transmute(self.y) } |
55 | 5.15k | } <polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::finalize Line | Count | Source | 53 | 1.38k | fn finalize(self) -> Output<Self> { | 54 | 1.38k | unsafe { core::mem::transmute(self.y) } | 55 | 1.38k | } |
<polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::finalize Line | Count | Source | 53 | 3.76k | fn finalize(self) -> Output<Self> { | 54 | 3.76k | unsafe { core::mem::transmute(self.y) } | 55 | 3.76k | } |
|
56 | | } |
57 | | |
58 | | impl Polyval { |
59 | | #[inline] |
60 | | #[target_feature(enable = "pclmulqdq")] |
61 | | #[target_feature(enable = "sse4.1")] |
62 | 311k | unsafe fn mul(&mut self, x: &Block) { |
63 | 311k | let h = self.h; |
64 | 311k | |
65 | 311k | // `_mm_loadu_si128` performs an unaligned load |
66 | 311k | #[allow(clippy::cast_ptr_alignment)] |
67 | 311k | let x = _mm_loadu_si128(x.as_ptr() as *const __m128i); |
68 | 311k | let y = _mm_xor_si128(self.y, x); |
69 | 311k | |
70 | 311k | let h0 = h; |
71 | 311k | let h1 = _mm_shuffle_epi32(h, 0x0E); |
72 | 311k | let h2 = _mm_xor_si128(h0, h1); |
73 | 311k | let y0 = y; |
74 | 311k | |
75 | 311k | // Multiply values partitioned to 64-bit parts |
76 | 311k | let y1 = _mm_shuffle_epi32(y, 0x0E); |
77 | 311k | let y2 = _mm_xor_si128(y0, y1); |
78 | 311k | let t0 = _mm_clmulepi64_si128(y0, h0, 0x00); |
79 | 311k | let t1 = _mm_clmulepi64_si128(y, h, 0x11); |
80 | 311k | let t2 = _mm_clmulepi64_si128(y2, h2, 0x00); |
81 | 311k | let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1)); |
82 | 311k | let v0 = t0; |
83 | 311k | let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2); |
84 | 311k | let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E)); |
85 | 311k | let v3 = _mm_shuffle_epi32(t1, 0x0E); |
86 | 311k | |
87 | 311k | // Polynomial reduction |
88 | 311k | let v2 = xor5( |
89 | 311k | v2, |
90 | 311k | v0, |
91 | 311k | _mm_srli_epi64(v0, 1), |
92 | 311k | _mm_srli_epi64(v0, 2), |
93 | 311k | _mm_srli_epi64(v0, 7), |
94 | 311k | ); |
95 | 311k | |
96 | 311k | let v1 = xor4( |
97 | 311k | v1, |
98 | 311k | _mm_slli_epi64(v0, 63), |
99 | 311k | _mm_slli_epi64(v0, 62), |
100 | 311k | _mm_slli_epi64(v0, 57), |
101 | 311k | ); |
102 | 311k | |
103 | 311k | let v3 = xor5( |
104 | 311k | v3, |
105 | 311k | v1, |
106 | 311k | _mm_srli_epi64(v1, 1), |
107 | 311k | _mm_srli_epi64(v1, 2), |
108 | 311k | _mm_srli_epi64(v1, 7), |
109 | 311k | ); |
110 | 311k | |
111 | 311k | let v2 = xor4( |
112 | 311k | v2, |
113 | 311k | _mm_slli_epi64(v1, 63), |
114 | 311k | _mm_slli_epi64(v1, 62), |
115 | 311k | _mm_slli_epi64(v1, 57), |
116 | 311k | ); |
117 | 311k | |
118 | 311k | self.y = _mm_unpacklo_epi64(v2, v3); |
119 | 311k | } <polyval::backend::clmul::Polyval>::mul Line | Count | Source | 62 | 193k | unsafe fn mul(&mut self, x: &Block) { | 63 | 193k | let h = self.h; | 64 | 193k | | 65 | 193k | // `_mm_loadu_si128` performs an unaligned load | 66 | 193k | #[allow(clippy::cast_ptr_alignment)] | 67 | 193k | let x = _mm_loadu_si128(x.as_ptr() as *const __m128i); | 68 | 193k | let y = _mm_xor_si128(self.y, x); | 69 | 193k | | 70 | 193k | let h0 = h; | 71 | 193k | let h1 = _mm_shuffle_epi32(h, 0x0E); | 72 | 193k | let h2 = _mm_xor_si128(h0, h1); | 73 | 193k | let y0 = y; | 74 | 193k | | 75 | 193k | // Multiply values partitioned to 64-bit parts | 76 | 193k | let y1 = _mm_shuffle_epi32(y, 0x0E); | 77 | 193k | let y2 = _mm_xor_si128(y0, y1); | 78 | 193k | let t0 = _mm_clmulepi64_si128(y0, h0, 0x00); | 79 | 193k | let t1 = _mm_clmulepi64_si128(y, h, 0x11); | 80 | 193k | let t2 = _mm_clmulepi64_si128(y2, h2, 0x00); | 81 | 193k | let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1)); | 82 | 193k | let v0 = t0; | 83 | 193k | let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2); | 84 | 193k | let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E)); | 85 | 193k | let v3 = _mm_shuffle_epi32(t1, 0x0E); | 86 | 193k | | 87 | 193k | // Polynomial reduction | 88 | 193k | let v2 = xor5( | 89 | 193k | v2, | 90 | 193k | v0, | 91 | 193k | _mm_srli_epi64(v0, 1), | 92 | 193k | _mm_srli_epi64(v0, 2), | 93 | 193k | _mm_srli_epi64(v0, 7), | 94 | 193k | ); | 95 | 193k | | 96 | 193k | let v1 = xor4( | 97 | 193k | v1, | 98 | 193k | _mm_slli_epi64(v0, 63), | 99 | 193k | _mm_slli_epi64(v0, 62), | 100 | 193k | _mm_slli_epi64(v0, 57), | 101 | 193k | ); | 102 | 193k | | 103 | 193k | let v3 = xor5( | 104 | 193k | v3, | 105 | 193k | v1, | 106 | 193k | _mm_srli_epi64(v1, 1), | 107 | 193k | _mm_srli_epi64(v1, 2), | 108 | 193k | _mm_srli_epi64(v1, 7), | 109 | 193k | ); | 110 | 193k | | 111 | 193k | let v2 = xor4( | 112 | 193k | v2, | 113 | 193k | _mm_slli_epi64(v1, 63), | 114 | 193k | _mm_slli_epi64(v1, 62), | 
115 | 193k | _mm_slli_epi64(v1, 57), | 116 | 193k | ); | 117 | 193k | | 118 | 193k | self.y = _mm_unpacklo_epi64(v2, v3); | 119 | 193k | } |
<polyval::backend::clmul::Polyval>::mul Line | Count | Source | 62 | 118k | unsafe fn mul(&mut self, x: &Block) { | 63 | 118k | let h = self.h; | 64 | 118k | | 65 | 118k | // `_mm_loadu_si128` performs an unaligned load | 66 | 118k | #[allow(clippy::cast_ptr_alignment)] | 67 | 118k | let x = _mm_loadu_si128(x.as_ptr() as *const __m128i); | 68 | 118k | let y = _mm_xor_si128(self.y, x); | 69 | 118k | | 70 | 118k | let h0 = h; | 71 | 118k | let h1 = _mm_shuffle_epi32(h, 0x0E); | 72 | 118k | let h2 = _mm_xor_si128(h0, h1); | 73 | 118k | let y0 = y; | 74 | 118k | | 75 | 118k | // Multiply values partitioned to 64-bit parts | 76 | 118k | let y1 = _mm_shuffle_epi32(y, 0x0E); | 77 | 118k | let y2 = _mm_xor_si128(y0, y1); | 78 | 118k | let t0 = _mm_clmulepi64_si128(y0, h0, 0x00); | 79 | 118k | let t1 = _mm_clmulepi64_si128(y, h, 0x11); | 80 | 118k | let t2 = _mm_clmulepi64_si128(y2, h2, 0x00); | 81 | 118k | let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1)); | 82 | 118k | let v0 = t0; | 83 | 118k | let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2); | 84 | 118k | let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E)); | 85 | 118k | let v3 = _mm_shuffle_epi32(t1, 0x0E); | 86 | 118k | | 87 | 118k | // Polynomial reduction | 88 | 118k | let v2 = xor5( | 89 | 118k | v2, | 90 | 118k | v0, | 91 | 118k | _mm_srli_epi64(v0, 1), | 92 | 118k | _mm_srli_epi64(v0, 2), | 93 | 118k | _mm_srli_epi64(v0, 7), | 94 | 118k | ); | 95 | 118k | | 96 | 118k | let v1 = xor4( | 97 | 118k | v1, | 98 | 118k | _mm_slli_epi64(v0, 63), | 99 | 118k | _mm_slli_epi64(v0, 62), | 100 | 118k | _mm_slli_epi64(v0, 57), | 101 | 118k | ); | 102 | 118k | | 103 | 118k | let v3 = xor5( | 104 | 118k | v3, | 105 | 118k | v1, | 106 | 118k | _mm_srli_epi64(v1, 1), | 107 | 118k | _mm_srli_epi64(v1, 2), | 108 | 118k | _mm_srli_epi64(v1, 7), | 109 | 118k | ); | 110 | 118k | | 111 | 118k | let v2 = xor4( | 112 | 118k | v2, | 113 | 118k | _mm_slli_epi64(v1, 63), | 114 | 118k | _mm_slli_epi64(v1, 62), | 115 | 118k | 
_mm_slli_epi64(v1, 57), | 116 | 118k | ); | 117 | 118k | | 118 | 118k | self.y = _mm_unpacklo_epi64(v2, v3); | 119 | 118k | } |
|
120 | | } |
121 | | |
122 | | #[cfg(feature = "zeroize")] |
123 | | impl Drop for Polyval { |
124 | | fn drop(&mut self) { |
125 | | use zeroize::Zeroize; |
126 | | self.h.zeroize(); |
127 | | self.y.zeroize(); |
128 | | } |
129 | | } |
130 | | |
131 | | #[inline(always)] |
132 | 623k | unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i { |
133 | 623k | _mm_xor_si128(_mm_xor_si128(e1, e2), _mm_xor_si128(e3, e4)) |
134 | 623k | } polyval::backend::clmul::xor4 Line | Count | Source | 132 | 386k | unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i { | 133 | 386k | _mm_xor_si128(_mm_xor_si128(e1, e2), _mm_xor_si128(e3, e4)) | 134 | 386k | } |
polyval::backend::clmul::xor4 Line | Count | Source | 132 | 236k | unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i { | 133 | 236k | _mm_xor_si128(_mm_xor_si128(e1, e2), _mm_xor_si128(e3, e4)) | 134 | 236k | } |
|
135 | | |
136 | | #[inline(always)] |
137 | 623k | unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i { |
138 | 623k | _mm_xor_si128( |
139 | 623k | e1, |
140 | 623k | _mm_xor_si128(_mm_xor_si128(e2, e3), _mm_xor_si128(e4, e5)), |
141 | 623k | ) |
142 | 623k | } polyval::backend::clmul::xor5 Line | Count | Source | 137 | 386k | unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i { | 138 | 386k | _mm_xor_si128( | 139 | 386k | e1, | 140 | 386k | _mm_xor_si128(_mm_xor_si128(e2, e3), _mm_xor_si128(e4, e5)), | 141 | 386k | ) | 142 | 386k | } |
polyval::backend::clmul::xor5 Line | Count | Source | 137 | 236k | unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i { | 138 | 236k | _mm_xor_si128( | 139 | 236k | e1, | 140 | 236k | _mm_xor_si128(_mm_xor_si128(e2, e3), _mm_xor_si128(e4, e5)), | 141 | 236k | ) | 142 | 236k | } |
|