Coverage Report

Created: 2026-05-16 07:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/chacha20-0.10.0/src/backends/sse2.rs
Line
Count
Source
1
//! SSE2 backend.
2
3
#![allow(unsafe_op_in_unsafe_fn, reason = "needs triage")]
4
#![allow(clippy::cast_possible_truncation, reason = "needs triage")]
5
#![allow(clippy::cast_possible_wrap, reason = "needs triage")]
6
#![allow(clippy::cast_sign_loss, reason = "needs triage")]
7
#![allow(clippy::undocumented_unsafe_blocks, reason = "TODO")]
8
9
use crate::{Rounds, Variant};
10
11
#[cfg(feature = "rng")]
12
use crate::ChaChaCore;
13
14
#[cfg(feature = "cipher")]
15
use crate::{STATE_WORDS, chacha::Block};
16
#[cfg(feature = "cipher")]
17
use cipher::{
18
    BlockSizeUser, ParBlocksSizeUser, StreamCipherBackend, StreamCipherClosure,
19
    consts::{U4, U64},
20
};
21
use core::marker::PhantomData;
22
23
#[cfg(target_arch = "x86")]
24
use core::arch::x86::*;
25
#[cfg(target_arch = "x86_64")]
26
use core::arch::x86_64::*;
27
28
const PAR_BLOCKS: usize = 4;
29
30
#[inline]
31
#[target_feature(enable = "sse2")]
32
#[cfg(feature = "cipher")]
33
pub(crate) unsafe fn inner<R, F, V>(state: &mut [u32; STATE_WORDS], f: F)
34
where
35
    R: Rounds,
36
    F: StreamCipherClosure<BlockSize = U64>,
37
    V: Variant,
38
{
39
    let state_ptr = state.as_ptr().cast::<__m128i>();
40
    let mut backend = Backend::<R, V> {
41
        v: [
42
            _mm_loadu_si128(state_ptr.add(0)),
43
            _mm_loadu_si128(state_ptr.add(1)),
44
            _mm_loadu_si128(state_ptr.add(2)),
45
            _mm_loadu_si128(state_ptr.add(3)),
46
        ],
47
        _pd: PhantomData,
48
    };
49
50
    f.call(&mut backend);
51
52
    state[12] = _mm_cvtsi128_si32(backend.v[3]) as u32;
53
    if size_of::<V::Counter>() == 8 {
54
        state[13] = _mm_extract_epi32(backend.v[3], 1) as u32;
55
    }
56
}
57
58
struct Backend<R: Rounds, V: Variant> {
59
    v: [__m128i; 4],
60
    _pd: PhantomData<(R, V)>,
61
}
62
63
#[cfg(feature = "cipher")]
64
impl<R: Rounds, V: Variant> BlockSizeUser for Backend<R, V> {
65
    type BlockSize = U64;
66
}
67
68
#[cfg(feature = "cipher")]
69
impl<R: Rounds, V: Variant> ParBlocksSizeUser for Backend<R, V> {
70
    type ParBlocksSize = U4;
71
}
72
73
#[cfg(feature = "cipher")]
74
impl<R: Rounds, V: Variant> StreamCipherBackend for Backend<R, V> {
75
    #[inline(always)]
76
    fn gen_ks_block(&mut self, block: &mut Block) {
77
        unsafe {
78
            let res = rounds::<R, V>(&self.v);
79
            self.v[3] = match size_of::<V::Counter>() {
80
                4 => _mm_add_epi32(self.v[3], _mm_set_epi32(0, 0, 0, 1)),
81
                8 => _mm_add_epi64(self.v[3], _mm_set_epi64x(0, 1)),
82
                _ => unreachable!(),
83
            };
84
85
            let block_ptr = block.as_mut_ptr().cast::<__m128i>();
86
            for i in 0..4 {
87
                _mm_storeu_si128(block_ptr.add(i), res[0][i]);
88
            }
89
        }
90
    }
91
    #[inline(always)]
92
    fn gen_par_ks_blocks(&mut self, blocks: &mut cipher::ParBlocks<Self>) {
93
        unsafe {
94
            let res = rounds::<R, V>(&self.v);
95
            self.v[3] = match size_of::<V::Counter>() {
96
                4 => _mm_add_epi32(self.v[3], _mm_set_epi32(0, 0, 0, PAR_BLOCKS as i32)),
97
                8 => _mm_add_epi64(self.v[3], _mm_set_epi64x(0, PAR_BLOCKS as i64)),
98
                _ => unreachable!(),
99
            };
100
101
            let blocks_ptr = blocks.as_mut_ptr().cast::<__m128i>();
102
            for block in 0..PAR_BLOCKS {
103
                for i in 0..4 {
104
                    _mm_storeu_si128(blocks_ptr.add(i + block * PAR_BLOCKS), res[block][i]);
105
                }
106
            }
107
        }
108
    }
109
}
110
111
#[inline]
112
#[target_feature(enable = "sse2")]
113
#[cfg(feature = "rng")]
114
0
pub(crate) unsafe fn rng_inner<R, V>(core: &mut ChaChaCore<R, V>, buffer: &mut [u32; 64])
115
0
where
116
0
    R: Rounds,
117
0
    V: Variant,
118
{
119
0
    let state_ptr = core.state.as_ptr().cast::<__m128i>();
120
0
    let mut backend = Backend::<R, V> {
121
0
        v: [
122
0
            _mm_loadu_si128(state_ptr.add(0)),
123
0
            _mm_loadu_si128(state_ptr.add(1)),
124
0
            _mm_loadu_si128(state_ptr.add(2)),
125
0
            _mm_loadu_si128(state_ptr.add(3)),
126
0
        ],
127
0
        _pd: PhantomData,
128
0
    };
129
130
0
    backend.gen_ks_blocks(buffer);
131
132
0
    core.state[12] = _mm_cvtsi128_si32(backend.v[3]) as u32;
133
0
    core.state[13] = _mm_extract_epi32(backend.v[3], 1) as u32;
134
0
}
Unexecuted instantiation: chacha20::backends::sse2::rng_inner::<chacha20::R12, chacha20::variants::Legacy>
Unexecuted instantiation: chacha20::backends::sse2::rng_inner::<_, _>
135
136
#[cfg(feature = "rng")]
137
impl<R: Rounds, V: Variant> Backend<R, V> {
138
    #[inline(always)]
139
0
    fn gen_ks_blocks(&mut self, block: &mut [u32; 64]) {
140
        const _: () = assert!(4 * PAR_BLOCKS * size_of::<__m128i>() == size_of::<[u32; 64]>());
141
        unsafe {
142
0
            let res = rounds::<R, V>(&self.v);
143
0
            self.v[3] = _mm_add_epi64(self.v[3], _mm_set_epi64x(0, PAR_BLOCKS as i64));
144
145
0
            let blocks_ptr = block.as_mut_ptr().cast::<__m128i>();
146
0
            for block in 0..PAR_BLOCKS {
147
0
                for i in 0..4 {
148
0
                    _mm_storeu_si128(blocks_ptr.add(i + block * PAR_BLOCKS), res[block][i]);
149
0
                }
150
            }
151
        }
152
0
    }
Unexecuted instantiation: <chacha20::backends::sse2::Backend<chacha20::R12, chacha20::variants::Legacy>>::gen_ks_blocks
Unexecuted instantiation: <chacha20::backends::sse2::Backend<_, _>>::gen_ks_blocks
153
}
154
155
#[inline]
156
#[target_feature(enable = "sse2")]
157
0
unsafe fn rounds<R: Rounds, V: Variant>(v: &[__m128i; 4]) -> [[__m128i; 4]; PAR_BLOCKS] {
158
0
    let mut res = [*v; 4];
159
0
    for block in 1..PAR_BLOCKS {
160
0
        res[block][3] = match size_of::<V::Counter>() {
161
0
            4 => _mm_add_epi32(res[block][3], _mm_set_epi32(0, 0, 0, block as i32)),
162
0
            8 => _mm_add_epi64(res[block][3], _mm_set_epi64x(0, block as i64)),
163
0
            _ => unreachable!(),
164
        }
165
    }
166
167
0
    for _ in 0..R::COUNT {
168
0
        double_quarter_round(&mut res);
169
0
    }
170
171
0
    for block in 0..PAR_BLOCKS {
172
0
        for i in 0..3 {
173
0
            res[block][i] = _mm_add_epi32(res[block][i], v[i]);
174
0
        }
175
0
        let ctr = match size_of::<V::Counter>() {
176
0
            4 => _mm_add_epi32(v[3], _mm_set_epi32(0, 0, 0, block as i32)),
177
0
            8 => _mm_add_epi64(v[3], _mm_set_epi64x(0, block as i64)),
178
0
            _ => unreachable!(),
179
        };
180
0
        res[block][3] = _mm_add_epi32(res[block][3], ctr);
181
    }
182
183
0
    res
184
0
}
Unexecuted instantiation: chacha20::backends::sse2::rounds::<chacha20::R12, chacha20::variants::Legacy>
Unexecuted instantiation: chacha20::backends::sse2::rounds::<_, _>
185
186
#[inline]
187
#[target_feature(enable = "sse2")]
188
0
unsafe fn double_quarter_round(v: &mut [[__m128i; 4]; PAR_BLOCKS]) {
189
0
    add_xor_rot(v);
190
0
    rows_to_cols(v);
191
0
    add_xor_rot(v);
192
0
    cols_to_rows(v);
193
0
}
Unexecuted instantiation: chacha20::backends::sse2::double_quarter_round
Unexecuted instantiation: chacha20::backends::sse2::double_quarter_round
194
195
/// The goal of this function is to transform the state words from:
196
/// ```text
197
/// [a0, a1, a2, a3]    [ 0,  1,  2,  3]
198
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
199
/// [c0, c1, c2, c3]    [ 8,  9, 10, 11]
200
/// [d0, d1, d2, d3]    [12, 13, 14, 15]
201
/// ```
202
///
203
/// to:
204
/// ```text
205
/// [a0, a1, a2, a3]    [ 0,  1,  2,  3]
206
/// [b1, b2, b3, b0] == [ 5,  6,  7,  4]
207
/// [c2, c3, c0, c1]    [10, 11,  8,  9]
208
/// [d3, d0, d1, d2]    [15, 12, 13, 14]
209
/// ```
210
///
211
/// so that we can apply [`add_xor_rot`] to the resulting columns, and have it compute the
212
/// "diagonal rounds" (as defined in RFC 7539) in parallel. In practice, this shuffle is
213
/// non-optimal: the last state word to be altered in `add_xor_rot` is `b`, so the shuffle
214
/// blocks on the result of `b` being calculated.
215
///
216
/// We can optimize this by observing that the four quarter rounds in `add_xor_rot` are
217
/// data-independent: they only access a single column of the state, and thus the order of
218
/// the columns does not matter. We therefore instead shuffle the other three state words,
219
/// to obtain the following equivalent layout:
220
/// ```text
221
/// [a3, a0, a1, a2]    [ 3,  0,  1,  2]
222
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
223
/// [c1, c2, c3, c0]    [ 9, 10, 11,  8]
224
/// [d2, d3, d0, d1]    [14, 15, 12, 13]
225
/// ```
226
///
227
/// See https://github.com/sneves/blake2-avx2/pull/4 for additional details. The earliest
228
/// known occurrence of this optimization is in floodyberry's SSE4 ChaCha code from 2014:
229
/// - https://github.com/floodyberry/chacha-opt/blob/0ab65cb99f5016633b652edebaf3691ceb4ff753/chacha_blocks_ssse3-64.S#L639-L643
230
#[inline]
231
#[target_feature(enable = "sse2")]
232
0
unsafe fn rows_to_cols(blocks: &mut [[__m128i; 4]; PAR_BLOCKS]) {
233
0
    for [a, _, c, d] in blocks.iter_mut() {
234
0
        // c >>>= 32; d >>>= 64; a >>>= 96;
235
0
        *c = _mm_shuffle_epi32(*c, 0b_00_11_10_01); // _MM_SHUFFLE(0, 3, 2, 1)
236
0
        *d = _mm_shuffle_epi32(*d, 0b_01_00_11_10); // _MM_SHUFFLE(1, 0, 3, 2)
237
0
        *a = _mm_shuffle_epi32(*a, 0b_10_01_00_11); // _MM_SHUFFLE(2, 1, 0, 3)
238
0
    }
239
0
}
Unexecuted instantiation: chacha20::backends::sse2::rows_to_cols
Unexecuted instantiation: chacha20::backends::sse2::rows_to_cols
240
241
/// The goal of this function is to transform the state words from:
242
/// ```text
243
/// [a3, a0, a1, a2]    [ 3,  0,  1,  2]
244
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
245
/// [c1, c2, c3, c0]    [ 9, 10, 11,  8]
246
/// [d2, d3, d0, d1]    [14, 15, 12, 13]
247
/// ```
248
///
249
/// to:
250
/// ```text
251
/// [a0, a1, a2, a3]    [ 0,  1,  2,  3]
252
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
253
/// [c0, c1, c2, c3]    [ 8,  9, 10, 11]
254
/// [d0, d1, d2, d3]    [12, 13, 14, 15]
255
/// ```
256
///
257
/// reversing the transformation of [`rows_to_cols`].
258
#[inline]
259
#[target_feature(enable = "sse2")]
260
0
unsafe fn cols_to_rows(blocks: &mut [[__m128i; 4]; PAR_BLOCKS]) {
261
0
    for [a, _, c, d] in blocks.iter_mut() {
262
0
        // c <<<= 32; d <<<= 64; a <<<= 96;
263
0
        *c = _mm_shuffle_epi32(*c, 0b_10_01_00_11); // _MM_SHUFFLE(2, 1, 0, 3)
264
0
        *d = _mm_shuffle_epi32(*d, 0b_01_00_11_10); // _MM_SHUFFLE(1, 0, 3, 2)
265
0
        *a = _mm_shuffle_epi32(*a, 0b_00_11_10_01); // _MM_SHUFFLE(0, 3, 2, 1)
266
0
    }
267
0
}
Unexecuted instantiation: chacha20::backends::sse2::cols_to_rows
Unexecuted instantiation: chacha20::backends::sse2::cols_to_rows
268
269
#[inline]
270
#[target_feature(enable = "sse2")]
271
0
unsafe fn add_xor_rot(blocks: &mut [[__m128i; 4]; PAR_BLOCKS]) {
272
0
    for [a, b, c, d] in blocks.iter_mut() {
273
0
        // a += b; d ^= a; d <<<= (16, 16, 16, 16);
274
0
        *a = _mm_add_epi32(*a, *b);
275
0
        *d = _mm_xor_si128(*d, *a);
276
0
        *d = _mm_xor_si128(_mm_slli_epi32(*d, 16), _mm_srli_epi32(*d, 16));
277
0
278
0
        // c += d; b ^= c; b <<<= (12, 12, 12, 12);
279
0
        *c = _mm_add_epi32(*c, *d);
280
0
        *b = _mm_xor_si128(*b, *c);
281
0
        *b = _mm_xor_si128(_mm_slli_epi32(*b, 12), _mm_srli_epi32(*b, 20));
282
0
283
0
        // a += b; d ^= a; d <<<= (8, 8, 8, 8);
284
0
        *a = _mm_add_epi32(*a, *b);
285
0
        *d = _mm_xor_si128(*d, *a);
286
0
        *d = _mm_xor_si128(_mm_slli_epi32(*d, 8), _mm_srli_epi32(*d, 24));
287
0
288
0
        // c += d; b ^= c; b <<<= (7, 7, 7, 7);
289
0
        *c = _mm_add_epi32(*c, *d);
290
0
        *b = _mm_xor_si128(*b, *c);
291
0
        *b = _mm_xor_si128(_mm_slli_epi32(*b, 7), _mm_srli_epi32(*b, 25));
292
0
    }
293
0
}
Unexecuted instantiation: chacha20::backends::sse2::add_xor_rot
Unexecuted instantiation: chacha20::backends::sse2::add_xor_rot