/rust/registry/src/index.crates.io-1949cf8c6b5b557f/polyval-0.6.2/src/backend/clmul.rs
Line | Count | Source |
1 | | //! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs |
2 | | //! (i.e. Intel Sandy Bridge-compatible or newer) |
3 | | |
4 | | #[cfg(target_arch = "x86")] |
5 | | use core::arch::x86::*; |
6 | | #[cfg(target_arch = "x86_64")] |
7 | | use core::arch::x86_64::*; |
8 | | |
9 | | use universal_hash::{ |
10 | | consts::{U1, U16}, |
11 | | crypto_common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser}, |
12 | | KeyInit, Reset, UhfBackend, |
13 | | }; |
14 | | |
15 | | use crate::{Block, Key, Tag}; |
16 | | |
/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
///
/// Both the key and the running state are stored as `__m128i` values so the
/// `CLMUL` intrinsics can operate on them directly.
#[derive(Clone)]
pub struct Polyval {
    /// Field element `H`, loaded from the key at construction time.
    h: __m128i,
    /// Accumulator; replaced with a new value on every processed block.
    y: __m128i,
}
23 | | |
24 | | impl KeySizeUser for Polyval { |
25 | | type KeySize = U16; |
26 | | } |
27 | | |
28 | | impl Polyval { |
29 | | /// Initialize POLYVAL with the given `H` field element and initial block |
30 | 1.55k | pub fn new_with_init_block(h: &Key, init_block: u128) -> Self { |
31 | | unsafe { |
32 | | // `_mm_loadu_si128` performs an unaligned load |
33 | | #[allow(clippy::cast_ptr_alignment)] |
34 | 1.55k | Self { |
35 | 1.55k | h: _mm_loadu_si128(h.as_ptr() as *const __m128i), |
36 | 1.55k | y: _mm_loadu_si128(&init_block.to_be_bytes()[..] as *const _ as *const __m128i), |
37 | 1.55k | } |
38 | | } |
39 | 1.55k | } |
40 | | } |
41 | | |
42 | | impl KeyInit for Polyval { |
43 | | /// Initialize POLYVAL with the given `H` field element |
44 | 0 | fn new(h: &Key) -> Self { |
45 | 0 | Self::new_with_init_block(h, 0) |
46 | 0 | } |
47 | | } |
48 | | |
49 | | impl BlockSizeUser for Polyval { |
50 | | type BlockSize = U16; |
51 | | } |
52 | | |
53 | | impl ParBlocksSizeUser for Polyval { |
54 | | type ParBlocksSize = U1; |
55 | | } |
56 | | |
57 | | impl UhfBackend for Polyval { |
58 | 90.2k | fn proc_block(&mut self, x: &Block) { |
59 | 90.2k | unsafe { |
60 | 90.2k | self.mul(x); |
61 | 90.2k | } |
62 | 90.2k | } |
63 | | } |
64 | | |
65 | | impl Polyval { |
66 | | /// Get GHASH output |
67 | 1.23k | pub(crate) fn finalize(self) -> Tag { |
68 | 1.23k | unsafe { core::mem::transmute(self.y) } |
69 | 1.23k | } |
70 | | } |
71 | | |
72 | | impl Polyval { |
73 | | #[inline] |
74 | | #[target_feature(enable = "pclmulqdq")] |
75 | 90.2k | unsafe fn mul(&mut self, x: &Block) { |
76 | 90.2k | let h = self.h; |
77 | | |
78 | | // `_mm_loadu_si128` performs an unaligned load |
79 | | #[allow(clippy::cast_ptr_alignment)] |
80 | 90.2k | let x = _mm_loadu_si128(x.as_ptr() as *const __m128i); |
81 | 90.2k | let y = _mm_xor_si128(self.y, x); |
82 | | |
83 | 90.2k | let h0 = h; |
84 | 90.2k | let h1 = _mm_shuffle_epi32(h, 0x0E); |
85 | 90.2k | let h2 = _mm_xor_si128(h0, h1); |
86 | 90.2k | let y0 = y; |
87 | | |
88 | | // Multiply values partitioned to 64-bit parts |
89 | 90.2k | let y1 = _mm_shuffle_epi32(y, 0x0E); |
90 | 90.2k | let y2 = _mm_xor_si128(y0, y1); |
91 | 90.2k | let t0 = _mm_clmulepi64_si128(y0, h0, 0x00); |
92 | 90.2k | let t1 = _mm_clmulepi64_si128(y, h, 0x11); |
93 | 90.2k | let t2 = _mm_clmulepi64_si128(y2, h2, 0x00); |
94 | 90.2k | let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1)); |
95 | 90.2k | let v0 = t0; |
96 | 90.2k | let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2); |
97 | 90.2k | let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E)); |
98 | 90.2k | let v3 = _mm_shuffle_epi32(t1, 0x0E); |
99 | | |
100 | | // Polynomial reduction |
101 | 90.2k | let v2 = xor5( |
102 | 90.2k | v2, |
103 | 90.2k | v0, |
104 | 90.2k | _mm_srli_epi64(v0, 1), |
105 | 90.2k | _mm_srli_epi64(v0, 2), |
106 | 90.2k | _mm_srli_epi64(v0, 7), |
107 | | ); |
108 | | |
109 | 90.2k | let v1 = xor4( |
110 | 90.2k | v1, |
111 | 90.2k | _mm_slli_epi64(v0, 63), |
112 | 90.2k | _mm_slli_epi64(v0, 62), |
113 | 90.2k | _mm_slli_epi64(v0, 57), |
114 | | ); |
115 | | |
116 | 90.2k | let v3 = xor5( |
117 | 90.2k | v3, |
118 | 90.2k | v1, |
119 | 90.2k | _mm_srli_epi64(v1, 1), |
120 | 90.2k | _mm_srli_epi64(v1, 2), |
121 | 90.2k | _mm_srli_epi64(v1, 7), |
122 | | ); |
123 | | |
124 | 90.2k | let v2 = xor4( |
125 | 90.2k | v2, |
126 | 90.2k | _mm_slli_epi64(v1, 63), |
127 | 90.2k | _mm_slli_epi64(v1, 62), |
128 | 90.2k | _mm_slli_epi64(v1, 57), |
129 | | ); |
130 | | |
131 | 90.2k | self.y = _mm_unpacklo_epi64(v2, v3); |
132 | 90.2k | } |
133 | | } |
134 | | |
135 | | impl Reset for Polyval { |
136 | 0 | fn reset(&mut self) { |
137 | 0 | unsafe { |
138 | 0 | self.y = _mm_setzero_si128(); |
139 | 0 | } |
140 | 0 | } |
141 | | } |
142 | | |
#[cfg(feature = "zeroize")]
impl Drop for Polyval {
    /// Zeroize the key `h` and then the accumulator `y` when the value is
    /// dropped.
    fn drop(&mut self) {
        use zeroize::Zeroize;

        Zeroize::zeroize(&mut self.h);
        Zeroize::zeroize(&mut self.y);
    }
}
151 | | |
/// XOR four 128-bit vectors together and return the result.
#[inline(always)]
unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i {
    let first_pair = _mm_xor_si128(e1, e2);
    let second_pair = _mm_xor_si128(e3, e4);
    _mm_xor_si128(first_pair, second_pair)
}
156 | | |
/// XOR five 128-bit vectors together and return the result.
#[inline(always)]
unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i {
    let first_pair = _mm_xor_si128(e2, e3);
    let second_pair = _mm_xor_si128(e4, e5);
    let tail = _mm_xor_si128(first_pair, second_pair);
    _mm_xor_si128(e1, tail)
}