Coverage Report

Created: 2025-07-23 07:29

/rust/registry/src/index.crates.io-6f17d22bba15001f/polyval-0.5.3/src/backend/clmul.rs
Line
Count
Source (jump to first uncovered line)
1
//! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs
2
//! (i.e. Intel Sandy Bridge-compatible or newer)
3
4
use crate::{Block, Key};
5
use universal_hash::{consts::U16, NewUniversalHash, Output, UniversalHash};
6
7
#[cfg(target_arch = "x86")]
8
use core::arch::x86::*;
9
#[cfg(target_arch = "x86_64")]
10
use core::arch::x86_64::*;
11
12
/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
13
#[derive(Clone)]
14
pub struct Polyval {
15
    h: __m128i,
16
    y: __m128i,
17
}
18
19
impl NewUniversalHash for Polyval {
20
    type KeySize = U16;
21
22
    /// Initialize POLYVAL with the given `H` field element
23
3.12k
    fn new(h: &Key) -> Self {
24
3.12k
        unsafe {
25
3.12k
            // `_mm_loadu_si128` performs an unaligned load
26
3.12k
            #[allow(clippy::cast_ptr_alignment)]
27
3.12k
            Self {
28
3.12k
                h: _mm_loadu_si128(h.as_ptr() as *const __m128i),
29
3.12k
                y: _mm_setzero_si128(),
30
3.12k
            }
31
3.12k
        }
32
3.12k
    }
<polyval::backend::clmul::Polyval as universal_hash::NewUniversalHash>::new
Line
Count
Source
23
1.79k
    fn new(h: &Key) -> Self {
24
1.79k
        unsafe {
25
1.79k
            // `_mm_loadu_si128` performs an unaligned load
26
1.79k
            #[allow(clippy::cast_ptr_alignment)]
27
1.79k
            Self {
28
1.79k
                h: _mm_loadu_si128(h.as_ptr() as *const __m128i),
29
1.79k
                y: _mm_setzero_si128(),
30
1.79k
            }
31
1.79k
        }
32
1.79k
    }
<polyval::backend::clmul::Polyval as universal_hash::NewUniversalHash>::new
Line
Count
Source
23
1.33k
    fn new(h: &Key) -> Self {
24
1.33k
        unsafe {
25
1.33k
            // `_mm_loadu_si128` performs an unaligned load
26
1.33k
            #[allow(clippy::cast_ptr_alignment)]
27
1.33k
            Self {
28
1.33k
                h: _mm_loadu_si128(h.as_ptr() as *const __m128i),
29
1.33k
                y: _mm_setzero_si128(),
30
1.33k
            }
31
1.33k
        }
32
1.33k
    }
33
}
34
35
impl UniversalHash for Polyval {
36
    type BlockSize = U16;
37
38
    #[inline]
39
311k
    fn update(&mut self, x: &Block) {
40
311k
        unsafe {
41
311k
            self.mul(x);
42
311k
        }
43
311k
    }
<polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::update
Line
Count
Source
39
193k
    fn update(&mut self, x: &Block) {
40
193k
        unsafe {
41
193k
            self.mul(x);
42
193k
        }
43
193k
    }
<polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::update
Line
Count
Source
39
118k
    fn update(&mut self, x: &Block) {
40
118k
        unsafe {
41
118k
            self.mul(x);
42
118k
        }
43
118k
    }
44
45
    /// Reset internal state
46
0
    fn reset(&mut self) {
47
0
        unsafe {
48
0
            self.y = _mm_setzero_si128();
49
0
        }
50
0
    }
Unexecuted instantiation: <polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::reset
Unexecuted instantiation: <polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::reset
51
52
    /// Get GHASH output
53
5.15k
    fn finalize(self) -> Output<Self> {
54
5.15k
        unsafe { core::mem::transmute(self.y) }
55
5.15k
    }
<polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::finalize
Line
Count
Source
53
1.38k
    fn finalize(self) -> Output<Self> {
54
1.38k
        unsafe { core::mem::transmute(self.y) }
55
1.38k
    }
<polyval::backend::clmul::Polyval as universal_hash::UniversalHash>::finalize
Line
Count
Source
53
3.76k
    fn finalize(self) -> Output<Self> {
54
3.76k
        unsafe { core::mem::transmute(self.y) }
55
3.76k
    }
56
}
57
58
impl Polyval {
59
    #[inline]
60
    #[target_feature(enable = "pclmulqdq")]
61
    #[target_feature(enable = "sse4.1")]
62
311k
    unsafe fn mul(&mut self, x: &Block) {
63
311k
        let h = self.h;
64
311k
65
311k
        // `_mm_loadu_si128` performs an unaligned load
66
311k
        #[allow(clippy::cast_ptr_alignment)]
67
311k
        let x = _mm_loadu_si128(x.as_ptr() as *const __m128i);
68
311k
        let y = _mm_xor_si128(self.y, x);
69
311k
70
311k
        let h0 = h;
71
311k
        let h1 = _mm_shuffle_epi32(h, 0x0E);
72
311k
        let h2 = _mm_xor_si128(h0, h1);
73
311k
        let y0 = y;
74
311k
75
311k
        // Multiply values partitioned to 64-bit parts
76
311k
        let y1 = _mm_shuffle_epi32(y, 0x0E);
77
311k
        let y2 = _mm_xor_si128(y0, y1);
78
311k
        let t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
79
311k
        let t1 = _mm_clmulepi64_si128(y, h, 0x11);
80
311k
        let t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
81
311k
        let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
82
311k
        let v0 = t0;
83
311k
        let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
84
311k
        let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
85
311k
        let v3 = _mm_shuffle_epi32(t1, 0x0E);
86
311k
87
311k
        // Polynomial reduction
88
311k
        let v2 = xor5(
89
311k
            v2,
90
311k
            v0,
91
311k
            _mm_srli_epi64(v0, 1),
92
311k
            _mm_srli_epi64(v0, 2),
93
311k
            _mm_srli_epi64(v0, 7),
94
311k
        );
95
311k
96
311k
        let v1 = xor4(
97
311k
            v1,
98
311k
            _mm_slli_epi64(v0, 63),
99
311k
            _mm_slli_epi64(v0, 62),
100
311k
            _mm_slli_epi64(v0, 57),
101
311k
        );
102
311k
103
311k
        let v3 = xor5(
104
311k
            v3,
105
311k
            v1,
106
311k
            _mm_srli_epi64(v1, 1),
107
311k
            _mm_srli_epi64(v1, 2),
108
311k
            _mm_srli_epi64(v1, 7),
109
311k
        );
110
311k
111
311k
        let v2 = xor4(
112
311k
            v2,
113
311k
            _mm_slli_epi64(v1, 63),
114
311k
            _mm_slli_epi64(v1, 62),
115
311k
            _mm_slli_epi64(v1, 57),
116
311k
        );
117
311k
118
311k
        self.y = _mm_unpacklo_epi64(v2, v3);
119
311k
    }
<polyval::backend::clmul::Polyval>::mul
Line
Count
Source
62
193k
    unsafe fn mul(&mut self, x: &Block) {
63
193k
        let h = self.h;
64
193k
65
193k
        // `_mm_loadu_si128` performs an unaligned load
66
193k
        #[allow(clippy::cast_ptr_alignment)]
67
193k
        let x = _mm_loadu_si128(x.as_ptr() as *const __m128i);
68
193k
        let y = _mm_xor_si128(self.y, x);
69
193k
70
193k
        let h0 = h;
71
193k
        let h1 = _mm_shuffle_epi32(h, 0x0E);
72
193k
        let h2 = _mm_xor_si128(h0, h1);
73
193k
        let y0 = y;
74
193k
75
193k
        // Multiply values partitioned to 64-bit parts
76
193k
        let y1 = _mm_shuffle_epi32(y, 0x0E);
77
193k
        let y2 = _mm_xor_si128(y0, y1);
78
193k
        let t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
79
193k
        let t1 = _mm_clmulepi64_si128(y, h, 0x11);
80
193k
        let t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
81
193k
        let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
82
193k
        let v0 = t0;
83
193k
        let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
84
193k
        let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
85
193k
        let v3 = _mm_shuffle_epi32(t1, 0x0E);
86
193k
87
193k
        // Polynomial reduction
88
193k
        let v2 = xor5(
89
193k
            v2,
90
193k
            v0,
91
193k
            _mm_srli_epi64(v0, 1),
92
193k
            _mm_srli_epi64(v0, 2),
93
193k
            _mm_srli_epi64(v0, 7),
94
193k
        );
95
193k
96
193k
        let v1 = xor4(
97
193k
            v1,
98
193k
            _mm_slli_epi64(v0, 63),
99
193k
            _mm_slli_epi64(v0, 62),
100
193k
            _mm_slli_epi64(v0, 57),
101
193k
        );
102
193k
103
193k
        let v3 = xor5(
104
193k
            v3,
105
193k
            v1,
106
193k
            _mm_srli_epi64(v1, 1),
107
193k
            _mm_srli_epi64(v1, 2),
108
193k
            _mm_srli_epi64(v1, 7),
109
193k
        );
110
193k
111
193k
        let v2 = xor4(
112
193k
            v2,
113
193k
            _mm_slli_epi64(v1, 63),
114
193k
            _mm_slli_epi64(v1, 62),
115
193k
            _mm_slli_epi64(v1, 57),
116
193k
        );
117
193k
118
193k
        self.y = _mm_unpacklo_epi64(v2, v3);
119
193k
    }
<polyval::backend::clmul::Polyval>::mul
Line
Count
Source
62
118k
    unsafe fn mul(&mut self, x: &Block) {
63
118k
        let h = self.h;
64
118k
65
118k
        // `_mm_loadu_si128` performs an unaligned load
66
118k
        #[allow(clippy::cast_ptr_alignment)]
67
118k
        let x = _mm_loadu_si128(x.as_ptr() as *const __m128i);
68
118k
        let y = _mm_xor_si128(self.y, x);
69
118k
70
118k
        let h0 = h;
71
118k
        let h1 = _mm_shuffle_epi32(h, 0x0E);
72
118k
        let h2 = _mm_xor_si128(h0, h1);
73
118k
        let y0 = y;
74
118k
75
118k
        // Multiply values partitioned to 64-bit parts
76
118k
        let y1 = _mm_shuffle_epi32(y, 0x0E);
77
118k
        let y2 = _mm_xor_si128(y0, y1);
78
118k
        let t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
79
118k
        let t1 = _mm_clmulepi64_si128(y, h, 0x11);
80
118k
        let t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
81
118k
        let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
82
118k
        let v0 = t0;
83
118k
        let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
84
118k
        let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
85
118k
        let v3 = _mm_shuffle_epi32(t1, 0x0E);
86
118k
87
118k
        // Polynomial reduction
88
118k
        let v2 = xor5(
89
118k
            v2,
90
118k
            v0,
91
118k
            _mm_srli_epi64(v0, 1),
92
118k
            _mm_srli_epi64(v0, 2),
93
118k
            _mm_srli_epi64(v0, 7),
94
118k
        );
95
118k
96
118k
        let v1 = xor4(
97
118k
            v1,
98
118k
            _mm_slli_epi64(v0, 63),
99
118k
            _mm_slli_epi64(v0, 62),
100
118k
            _mm_slli_epi64(v0, 57),
101
118k
        );
102
118k
103
118k
        let v3 = xor5(
104
118k
            v3,
105
118k
            v1,
106
118k
            _mm_srli_epi64(v1, 1),
107
118k
            _mm_srli_epi64(v1, 2),
108
118k
            _mm_srli_epi64(v1, 7),
109
118k
        );
110
118k
111
118k
        let v2 = xor4(
112
118k
            v2,
113
118k
            _mm_slli_epi64(v1, 63),
114
118k
            _mm_slli_epi64(v1, 62),
115
118k
            _mm_slli_epi64(v1, 57),
116
118k
        );
117
118k
118
118k
        self.y = _mm_unpacklo_epi64(v2, v3);
119
118k
    }
120
}
121
122
#[cfg(feature = "zeroize")]
123
impl Drop for Polyval {
124
    fn drop(&mut self) {
125
        use zeroize::Zeroize;
126
        self.h.zeroize();
127
        self.y.zeroize();
128
    }
129
}
130
131
#[inline(always)]
132
623k
unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i {
133
623k
    _mm_xor_si128(_mm_xor_si128(e1, e2), _mm_xor_si128(e3, e4))
134
623k
}
polyval::backend::clmul::xor4
Line
Count
Source
132
386k
unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i {
133
386k
    _mm_xor_si128(_mm_xor_si128(e1, e2), _mm_xor_si128(e3, e4))
134
386k
}
polyval::backend::clmul::xor4
Line
Count
Source
132
236k
unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i {
133
236k
    _mm_xor_si128(_mm_xor_si128(e1, e2), _mm_xor_si128(e3, e4))
134
236k
}
135
136
#[inline(always)]
137
623k
unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i {
138
623k
    _mm_xor_si128(
139
623k
        e1,
140
623k
        _mm_xor_si128(_mm_xor_si128(e2, e3), _mm_xor_si128(e4, e5)),
141
623k
    )
142
623k
}
polyval::backend::clmul::xor5
Line
Count
Source
137
386k
unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i {
138
386k
    _mm_xor_si128(
139
386k
        e1,
140
386k
        _mm_xor_si128(_mm_xor_si128(e2, e3), _mm_xor_si128(e4, e5)),
141
386k
    )
142
386k
}
polyval::backend::clmul::xor5
Line
Count
Source
137
236k
unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i {
138
236k
    _mm_xor_si128(
139
236k
        e1,
140
236k
        _mm_xor_si128(_mm_xor_si128(e2, e3), _mm_xor_si128(e4, e5)),
141
236k
    )
142
236k
}