/rust/registry/src/index.crates.io-1949cf8c6b5b557f/chacha20-0.10.0/src/backends/sse2.rs
Line | Count | Source |
1 | | //! SSE2 backend. |
2 | | |
3 | | #![allow(unsafe_op_in_unsafe_fn, reason = "needs triage")] |
4 | | #![allow(clippy::cast_possible_truncation, reason = "needs triage")] |
5 | | #![allow(clippy::cast_possible_wrap, reason = "needs triage")] |
6 | | #![allow(clippy::cast_sign_loss, reason = "needs triage")] |
7 | | #![allow(clippy::undocumented_unsafe_blocks, reason = "TODO")] |
8 | | |
9 | | use crate::{Rounds, Variant}; |
10 | | |
11 | | #[cfg(feature = "rng")] |
12 | | use crate::ChaChaCore; |
13 | | |
14 | | #[cfg(feature = "cipher")] |
15 | | use crate::{STATE_WORDS, chacha::Block}; |
16 | | #[cfg(feature = "cipher")] |
17 | | use cipher::{ |
18 | | BlockSizeUser, ParBlocksSizeUser, StreamCipherBackend, StreamCipherClosure, |
19 | | consts::{U4, U64}, |
20 | | }; |
21 | | use core::marker::PhantomData; |
22 | | |
23 | | #[cfg(target_arch = "x86")] |
24 | | use core::arch::x86::*; |
25 | | #[cfg(target_arch = "x86_64")] |
26 | | use core::arch::x86_64::*; |
27 | | |
28 | | const PAR_BLOCKS: usize = 4; |
29 | | |
/// Runs the stream-cipher closure `f` against an SSE2 [`Backend`] seeded from `state`.
///
/// The first 64 bytes of `state` are loaded as four 128-bit rows, `f` is called with the
/// resulting backend, and afterwards the block counter kept in the last row is written
/// back into `state`: word 12 always, word 13 only when `V::Counter` is 8 bytes wide.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports SSE2.
// NOTE(review): the four 16-byte loads assume `STATE_WORDS >= 16`; the constant is
// defined outside this file — confirm.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg(feature = "cipher")]
pub(crate) unsafe fn inner<R, F, V>(state: &mut [u32; STATE_WORDS], f: F)
where
    R: Rounds,
    F: StreamCipherClosure<BlockSize = U64>,
    V: Variant,
{
    let state_ptr = state.as_ptr().cast::<__m128i>();
    let mut backend = Backend::<R, V> {
        v: [
            _mm_loadu_si128(state_ptr.add(0)),
            _mm_loadu_si128(state_ptr.add(1)),
            _mm_loadu_si128(state_ptr.add(2)),
            _mm_loadu_si128(state_ptr.add(3)),
        ],
        _pd: PhantomData,
    };

    f.call(&mut backend);

    // Low 32 bits of the counter live in lane 0 of the last row.
    state[12] = _mm_cvtsi128_si32(backend.v[3]) as u32;
    if size_of::<V::Counter>() == 8 {
        // `_mm_extract_epi32` is an SSE4.1 intrinsic and must not be used in a function
        // that only enables SSE2. Move lane 1 into lane 0 with an SSE2 shuffle instead
        // and read it from there.
        let high = _mm_shuffle_epi32(backend.v[3], 0b_00_00_00_01);
        state[13] = _mm_cvtsi128_si32(high) as u32;
    }
}
57 | | |
58 | | struct Backend<R: Rounds, V: Variant> { |
59 | | v: [__m128i; 4], |
60 | | _pd: PhantomData<(R, V)>, |
61 | | } |
62 | | |
// The backend's block size is declared as `U64`.
#[cfg(feature = "cipher")]
impl<R, V> BlockSizeUser for Backend<R, V>
where
    R: Rounds,
    V: Variant,
{
    type BlockSize = U64;
}
67 | | |
// The parallel batch size is declared as `U4`, matching `PAR_BLOCKS`.
#[cfg(feature = "cipher")]
impl<R, V> ParBlocksSizeUser for Backend<R, V>
where
    R: Rounds,
    V: Variant,
{
    type ParBlocksSize = U4;
}
72 | | |
#[cfg(feature = "cipher")]
impl<R, V> StreamCipherBackend for Backend<R, V>
where
    R: Rounds,
    V: Variant,
{
    /// Writes a single keystream block into `block` and advances the counter by one.
    ///
    /// `rounds` always computes `PAR_BLOCKS` blocks; only the first one is stored here.
    #[inline(always)]
    fn gen_ks_block(&mut self, block: &mut Block) {
        unsafe {
            let keystream = rounds::<R, V>(&self.v);

            // Bump the counter with an addition as wide as the variant's counter type.
            let counter_row = self.v[3];
            self.v[3] = match size_of::<V::Counter>() {
                4 => _mm_add_epi32(counter_row, _mm_set_epi32(0, 0, 0, 1)),
                8 => _mm_add_epi64(counter_row, _mm_set_epi64x(0, 1)),
                _ => unreachable!(),
            };

            // Unaligned stores: four 16-byte rows make up the block.
            let out = block.as_mut_ptr().cast::<__m128i>();
            for (idx, row) in keystream[0].iter().enumerate() {
                _mm_storeu_si128(out.add(idx), *row);
            }
        }
    }

    /// Writes `PAR_BLOCKS` consecutive keystream blocks into `blocks` and advances the
    /// counter by `PAR_BLOCKS`.
    #[inline(always)]
    fn gen_par_ks_blocks(&mut self, blocks: &mut cipher::ParBlocks<Self>) {
        unsafe {
            let keystream = rounds::<R, V>(&self.v);

            let counter_row = self.v[3];
            self.v[3] = match size_of::<V::Counter>() {
                4 => _mm_add_epi32(counter_row, _mm_set_epi32(0, 0, 0, PAR_BLOCKS as i32)),
                8 => _mm_add_epi64(counter_row, _mm_set_epi64x(0, PAR_BLOCKS as i64)),
                _ => unreachable!(),
            };

            // Blocks are laid out back to back, each as four 16-byte rows.
            let out = blocks.as_mut_ptr().cast::<__m128i>();
            for (block_idx, rows) in keystream.iter().enumerate() {
                for (row_idx, row) in rows.iter().enumerate() {
                    _mm_storeu_si128(out.add(row_idx + block_idx * PAR_BLOCKS), *row);
                }
            }
        }
    }
}
110 | | |
/// Fills `buffer` with keystream words generated from `core.state` and stores the
/// advanced block counter back into state words 12 and 13.
///
/// The state is loaded as four 128-bit rows into a [`Backend`], which produces
/// `PAR_BLOCKS` blocks in one call to `gen_ks_blocks`.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports SSE2.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg(feature = "rng")]
pub(crate) unsafe fn rng_inner<R, V>(core: &mut ChaChaCore<R, V>, buffer: &mut [u32; 64])
where
    R: Rounds,
    V: Variant,
{
    let state_ptr = core.state.as_ptr().cast::<__m128i>();
    let mut backend = Backend::<R, V> {
        v: [
            _mm_loadu_si128(state_ptr.add(0)),
            _mm_loadu_si128(state_ptr.add(1)),
            _mm_loadu_si128(state_ptr.add(2)),
            _mm_loadu_si128(state_ptr.add(3)),
        ],
        _pd: PhantomData,
    };

    backend.gen_ks_blocks(buffer);

    // Low counter word: lane 0 of the last row.
    core.state[12] = _mm_cvtsi128_si32(backend.v[3]) as u32;
    // High counter word: lane 1. `_mm_extract_epi32` is an SSE4.1 intrinsic and must not
    // be used in a function that only enables SSE2, so shuffle lane 1 down to lane 0
    // with an SSE2 instruction and read it from there.
    let high = _mm_shuffle_epi32(backend.v[3], 0b_00_00_00_01);
    core.state[13] = _mm_cvtsi128_si32(high) as u32;
}
135 | | |
#[cfg(feature = "rng")]
impl<R, V> Backend<R, V>
where
    R: Rounds,
    V: Variant,
{
    /// Generates `PAR_BLOCKS` keystream blocks into `block` and advances the counter,
    /// treated here as a 64-bit value, by `PAR_BLOCKS`.
    #[inline(always)]
    fn gen_ks_blocks(&mut self, block: &mut [u32; 64]) {
        // The raw stores below cover exactly the output buffer; checked at compile time.
        const _: () = assert!(4 * PAR_BLOCKS * size_of::<__m128i>() == size_of::<[u32; 64]>());
        unsafe {
            let keystream = rounds::<R, V>(&self.v);
            self.v[3] = _mm_add_epi64(self.v[3], _mm_set_epi64x(0, PAR_BLOCKS as i64));

            // Blocks are written back to back, each as four unaligned 16-byte rows.
            let out = block.as_mut_ptr().cast::<__m128i>();
            for (block_idx, rows) in keystream.iter().enumerate() {
                for (row_idx, row) in rows.iter().enumerate() {
                    _mm_storeu_si128(out.add(row_idx + block_idx * PAR_BLOCKS), *row);
                }
            }
        }
    }
}
154 | | |
155 | | #[inline] |
156 | | #[target_feature(enable = "sse2")] |
157 | 0 | unsafe fn rounds<R: Rounds, V: Variant>(v: &[__m128i; 4]) -> [[__m128i; 4]; PAR_BLOCKS] { |
158 | 0 | let mut res = [*v; 4]; |
159 | 0 | for block in 1..PAR_BLOCKS { |
160 | 0 | res[block][3] = match size_of::<V::Counter>() { |
161 | 0 | 4 => _mm_add_epi32(res[block][3], _mm_set_epi32(0, 0, 0, block as i32)), |
162 | 0 | 8 => _mm_add_epi64(res[block][3], _mm_set_epi64x(0, block as i64)), |
163 | 0 | _ => unreachable!(), |
164 | | } |
165 | | } |
166 | | |
167 | 0 | for _ in 0..R::COUNT { |
168 | 0 | double_quarter_round(&mut res); |
169 | 0 | } |
170 | | |
171 | 0 | for block in 0..PAR_BLOCKS { |
172 | 0 | for i in 0..3 { |
173 | 0 | res[block][i] = _mm_add_epi32(res[block][i], v[i]); |
174 | 0 | } |
175 | 0 | let ctr = match size_of::<V::Counter>() { |
176 | 0 | 4 => _mm_add_epi32(v[3], _mm_set_epi32(0, 0, 0, block as i32)), |
177 | 0 | 8 => _mm_add_epi64(v[3], _mm_set_epi64x(0, block as i64)), |
178 | 0 | _ => unreachable!(), |
179 | | }; |
180 | 0 | res[block][3] = _mm_add_epi32(res[block][3], ctr); |
181 | | } |
182 | | |
183 | 0 | res |
184 | 0 | } Unexecuted instantiation: chacha20::backends::sse2::rounds::<chacha20::R12, chacha20::variants::Legacy> Unexecuted instantiation: chacha20::backends::sse2::rounds::<_, _> |
185 | | |
186 | | #[inline] |
187 | | #[target_feature(enable = "sse2")] |
188 | 0 | unsafe fn double_quarter_round(v: &mut [[__m128i; 4]; PAR_BLOCKS]) { |
189 | 0 | add_xor_rot(v); |
190 | 0 | rows_to_cols(v); |
191 | 0 | add_xor_rot(v); |
192 | 0 | cols_to_rows(v); |
193 | 0 | } Unexecuted instantiation: chacha20::backends::sse2::double_quarter_round Unexecuted instantiation: chacha20::backends::sse2::double_quarter_round |
194 | | |
195 | | /// The goal of this function is to transform the state words from: |
196 | | /// ```text |
197 | | /// [a0, a1, a2, a3] [ 0, 1, 2, 3] |
198 | | /// [b0, b1, b2, b3] == [ 4, 5, 6, 7] |
199 | | /// [c0, c1, c2, c3] [ 8, 9, 10, 11] |
200 | | /// [d0, d1, d2, d3] [12, 13, 14, 15] |
201 | | /// ``` |
202 | | /// |
203 | | /// to: |
204 | | /// ```text |
205 | | /// [a0, a1, a2, a3] [ 0, 1, 2, 3] |
206 | | /// [b1, b2, b3, b0] == [ 5, 6, 7, 4] |
207 | | /// [c2, c3, c0, c1] [10, 11, 8, 9] |
208 | | /// [d3, d0, d1, d2] [15, 12, 13, 14] |
209 | | /// ``` |
210 | | /// |
211 | | /// so that we can apply [`add_xor_rot`] to the resulting columns, and have it compute the |
212 | | /// "diagonal rounds" (as defined in RFC 7539) in parallel. In practice, this shuffle is |
213 | | /// non-optimal: the last state word to be altered in `add_xor_rot` is `b`, so the shuffle |
214 | | /// blocks on the result of `b` being calculated. |
215 | | /// |
216 | | /// We can optimize this by observing that the four quarter rounds in `add_xor_rot` are |
217 | | /// data-independent: they only access a single column of the state, and thus the order of |
218 | | /// the columns does not matter. We therefore instead shuffle the other three state words, |
219 | | /// to obtain the following equivalent layout: |
220 | | /// ```text |
221 | | /// [a3, a0, a1, a2] [ 3, 0, 1, 2] |
222 | | /// [b0, b1, b2, b3] == [ 4, 5, 6, 7] |
223 | | /// [c1, c2, c3, c0] [ 9, 10, 11, 8] |
224 | | /// [d2, d3, d0, d1] [14, 15, 12, 13] |
225 | | /// ``` |
226 | | /// |
227 | | /// See https://github.com/sneves/blake2-avx2/pull/4 for additional details. The earliest |
228 | | /// known occurrence of this optimization is in floodyberry's SSE4 ChaCha code from 2014: |
229 | | /// - https://github.com/floodyberry/chacha-opt/blob/0ab65cb99f5016633b652edebaf3691ceb4ff753/chacha_blocks_ssse3-64.S#L639-L643 |
230 | | #[inline] |
231 | | #[target_feature(enable = "sse2")] |
232 | 0 | unsafe fn rows_to_cols(blocks: &mut [[__m128i; 4]; PAR_BLOCKS]) { |
233 | 0 | for [a, _, c, d] in blocks.iter_mut() { |
234 | 0 | // c >>>= 32; d >>>= 64; a >>>= 96; |
235 | 0 | *c = _mm_shuffle_epi32(*c, 0b_00_11_10_01); // _MM_SHUFFLE(0, 3, 2, 1) |
236 | 0 | *d = _mm_shuffle_epi32(*d, 0b_01_00_11_10); // _MM_SHUFFLE(1, 0, 3, 2) |
237 | 0 | *a = _mm_shuffle_epi32(*a, 0b_10_01_00_11); // _MM_SHUFFLE(2, 1, 0, 3) |
238 | 0 | } |
239 | 0 | } Unexecuted instantiation: chacha20::backends::sse2::rows_to_cols Unexecuted instantiation: chacha20::backends::sse2::rows_to_cols |
240 | | |
241 | | /// The goal of this function is to transform the state words from: |
242 | | /// ```text |
243 | | /// [a3, a0, a1, a2] [ 3, 0, 1, 2] |
244 | | /// [b0, b1, b2, b3] == [ 4, 5, 6, 7] |
245 | | /// [c1, c2, c3, c0] [ 9, 10, 11, 8] |
246 | | /// [d2, d3, d0, d1] [14, 15, 12, 13] |
247 | | /// ``` |
248 | | /// |
249 | | /// to: |
250 | | /// ```text |
251 | | /// [a0, a1, a2, a3] [ 0, 1, 2, 3] |
252 | | /// [b0, b1, b2, b3] == [ 4, 5, 6, 7] |
253 | | /// [c0, c1, c2, c3] [ 8, 9, 10, 11] |
254 | | /// [d0, d1, d2, d3] [12, 13, 14, 15] |
255 | | /// ``` |
256 | | /// |
257 | | /// reversing the transformation of [`rows_to_cols`]. |
258 | | #[inline] |
259 | | #[target_feature(enable = "sse2")] |
260 | 0 | unsafe fn cols_to_rows(blocks: &mut [[__m128i; 4]; PAR_BLOCKS]) { |
261 | 0 | for [a, _, c, d] in blocks.iter_mut() { |
262 | 0 | // c <<<= 32; d <<<= 64; a <<<= 96; |
263 | 0 | *c = _mm_shuffle_epi32(*c, 0b_10_01_00_11); // _MM_SHUFFLE(2, 1, 0, 3) |
264 | 0 | *d = _mm_shuffle_epi32(*d, 0b_01_00_11_10); // _MM_SHUFFLE(1, 0, 3, 2) |
265 | 0 | *a = _mm_shuffle_epi32(*a, 0b_00_11_10_01); // _MM_SHUFFLE(0, 3, 2, 1) |
266 | 0 | } |
267 | 0 | } Unexecuted instantiation: chacha20::backends::sse2::cols_to_rows Unexecuted instantiation: chacha20::backends::sse2::cols_to_rows |
268 | | |
269 | | #[inline] |
270 | | #[target_feature(enable = "sse2")] |
271 | 0 | unsafe fn add_xor_rot(blocks: &mut [[__m128i; 4]; PAR_BLOCKS]) { |
272 | 0 | for [a, b, c, d] in blocks.iter_mut() { |
273 | 0 | // a += b; d ^= a; d <<<= (16, 16, 16, 16); |
274 | 0 | *a = _mm_add_epi32(*a, *b); |
275 | 0 | *d = _mm_xor_si128(*d, *a); |
276 | 0 | *d = _mm_xor_si128(_mm_slli_epi32(*d, 16), _mm_srli_epi32(*d, 16)); |
277 | 0 |
|
278 | 0 | // c += d; b ^= c; b <<<= (12, 12, 12, 12); |
279 | 0 | *c = _mm_add_epi32(*c, *d); |
280 | 0 | *b = _mm_xor_si128(*b, *c); |
281 | 0 | *b = _mm_xor_si128(_mm_slli_epi32(*b, 12), _mm_srli_epi32(*b, 20)); |
282 | 0 |
|
283 | 0 | // a += b; d ^= a; d <<<= (8, 8, 8, 8); |
284 | 0 | *a = _mm_add_epi32(*a, *b); |
285 | 0 | *d = _mm_xor_si128(*d, *a); |
286 | 0 | *d = _mm_xor_si128(_mm_slli_epi32(*d, 8), _mm_srli_epi32(*d, 24)); |
287 | 0 |
|
288 | 0 | // c += d; b ^= c; b <<<= (7, 7, 7, 7); |
289 | 0 | *c = _mm_add_epi32(*c, *d); |
290 | 0 | *b = _mm_xor_si128(*b, *c); |
291 | 0 | *b = _mm_xor_si128(_mm_slli_epi32(*b, 7), _mm_srli_epi32(*b, 25)); |
292 | 0 | } |
293 | 0 | } Unexecuted instantiation: chacha20::backends::sse2::add_xor_rot Unexecuted instantiation: chacha20::backends::sse2::add_xor_rot |