/src/cryptopp/chacha_simd.cpp
Line | Count | Source |
1 | | // chacha_simd.cpp - written and placed in the public domain by |
2 | | // Jack Lloyd and Jeffrey Walton |
3 | | // |
4 | | // This source file uses intrinsics and built-ins to gain access to |
5 | | // SSE2, ARM NEON and ARMv8a, Power7 and Altivec instructions. A separate |
6 | | // source file is needed because additional CXXFLAGS are required to enable |
7 | | // the appropriate instructions sets in some build configurations. |
8 | | // |
9 | | // SSE2 implementation based on Botan's chacha_sse2.cpp. Many thanks |
10 | | // to Jack Lloyd and the Botan team for allowing us to use it. |
11 | | // |
12 | | // The SSE2 implementation is kind of unusual among Crypto++ algorithms. |
13 | | // We guard on CRYPTOPP_SSE2_INTRIN_AVAILABLE and use HasSSE2() at runtime. However, |
14 | | // if the compiler says a target machine has SSSE3 or XOP available (say, by |
15 | | // way of -march=native), then we can pull another 150 to 800 MB/s out of |
16 | | // ChaCha. To capture SSSE3 and XOP we use the compiler defines __SSSE3__ and |
17 | | // __XOP__ and forgo runtime tests. |
18 | | // |
19 | | // Runtime tests for HasSSSE3() and HasXop() are too expensive to make a |
20 | | // sub-case of SSE2. The rotates are on a critical path and the runtime tests |
21 | | // crush performance. |
22 | | // |
23 | | // Here are some relative numbers for ChaCha8: |
24 | | // * Intel Skylake, 3.0 GHz: SSE2 at 2160 MB/s; SSSE3 at 2310 MB/s. |
25 | | // * AMD Bulldozer, 3.3 GHz: SSE2 at 1680 MB/s; XOP at 2510 MB/s. |
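// Editor's sketch (not part of the original file): the caller-side dispatch the
// comments above describe. Runtime detection picks the SSE2 kernel once, while
// the SSSE3/XOP upgrades inside the kernel are fixed at compile time by
// __SSSE3__/__XOP__, so the hot rotates carry no per-call branch. The wrapper
// name and the portable fallback below are illustrative assumptions, not the
// verbatim dispatch in chacha.cpp.
//
//   void OperateKeystreamSketch(const word32 *state, const byte *input,
//                               byte *output, unsigned int rounds)
//   {
//   #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
//       if (HasSSE2())   // runtime test happens once per call, not per rotate
//           return ChaCha_OperateKeystream_SSE2(state, input, output, rounds);
//   #endif
//       ChaCha_OperateKeystream_Portable(state, input, output, rounds);  // hypothetical name
//   }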
26 | | |
27 | | #include "pch.h" |
28 | | #include "config.h" |
29 | | |
30 | | #include "chacha.h" |
31 | | #include "misc.h" |
32 | | |
33 | | // Internal compiler error in GCC 3.3 and below |
34 | | #if defined(__GNUC__) && (__GNUC__ < 4) |
35 | | # undef CRYPTOPP_SSE2_INTRIN_AVAILABLE |
36 | | #endif |
37 | | |
38 | | #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) |
39 | | # include <xmmintrin.h> |
40 | | # include <emmintrin.h> |
41 | | #endif |
42 | | |
43 | | #if defined(__SSSE3__) |
44 | | # include <tmmintrin.h> |
45 | | #endif |
46 | | |
47 | | #if defined(__XOP__) |
48 | | # if defined(CRYPTOPP_GCC_COMPATIBLE) |
49 | | # include <x86intrin.h> |
50 | | # endif |
51 | | # include <ammintrin.h> |
52 | | #endif // XOP |
53 | | |
54 | | #if (CRYPTOPP_ARM_NEON_HEADER) |
55 | | # include <arm_neon.h> |
56 | | #endif |
57 | | |
58 | | #if (CRYPTOPP_ARM_ACLE_HEADER) |
59 | | # include <stdint.h> |
60 | | # include <arm_acle.h> |
61 | | #endif |
62 | | |
63 | | #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) |
64 | | # include "ppc_simd.h" |
65 | | #endif |
66 | | |
67 | | // Squash MS LNK4221 and libtool warnings |
68 | | extern const char CHACHA_SIMD_FNAME[] = __FILE__; |
69 | | |
70 | | ANONYMOUS_NAMESPACE_BEGIN |
71 | | |
72 | | // ***************************** NEON ***************************** // |
73 | | |
74 | | #if (CRYPTOPP_ARM_NEON_AVAILABLE) |
75 | | |
76 | | template <unsigned int R> |
77 | | inline uint32x4_t RotateLeft(const uint32x4_t& val) |
78 | | { |
79 | | return vorrq_u32(vshlq_n_u32(val, R), vshrq_n_u32(val, 32 - R)); |
80 | | } |
81 | | |
82 | | template <unsigned int R> |
83 | | inline uint32x4_t RotateRight(const uint32x4_t& val) |
84 | | { |
85 | | return vorrq_u32(vshlq_n_u32(val, 32 - R), vshrq_n_u32(val, R)); |
86 | | } |
87 | | |
88 | | template <> |
89 | | inline uint32x4_t RotateLeft<8>(const uint32x4_t& val) |
90 | | { |
91 | | #if defined(__aarch32__) || defined(__aarch64__) |
92 | | const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 }; |
93 | | const uint8x16_t mask = vld1q_u8(maskb); |
94 | | |
95 | | return vreinterpretq_u32_u8( |
96 | | vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); |
97 | | #else |
98 | | // fallback to slower C++ rotation. |
99 | | return vorrq_u32(vshlq_n_u32(val, 8), |
100 | | vshrq_n_u32(val, 32 - 8)); |
101 | | #endif |
102 | | } |
103 | | |
104 | | template <> |
105 | | inline uint32x4_t RotateLeft<16>(const uint32x4_t& val) |
106 | | { |
107 | | #if defined(__aarch32__) || defined(__aarch64__) |
108 | | return vreinterpretq_u32_u16( |
109 | | vrev32q_u16(vreinterpretq_u16_u32(val))); |
110 | | #else |
111 | | // fallback to slower C++ rotation. |
112 | | return vorrq_u32(vshlq_n_u32(val, 16), |
113 | | vshrq_n_u32(val, 32 - 16)); |
114 | | #endif |
115 | | } |
116 | | |
117 | | template <> |
118 | | inline uint32x4_t RotateRight<8>(const uint32x4_t& val) |
119 | | { |
120 | | #if defined(__aarch32__) || defined(__aarch64__) |
121 | | const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 }; |
122 | | const uint8x16_t mask = vld1q_u8(maskb); |
123 | | |
124 | | return vreinterpretq_u32_u8( |
125 | | vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); |
126 | | #else |
127 | | // fallback to slower C++ rotation. |
128 | | return vorrq_u32(vshrq_n_u32(val, 8), |
129 | | vshlq_n_u32(val, 32 - 8)); |
130 | | #endif |
131 | | } |
132 | | |
133 | | template <> |
134 | | inline uint32x4_t RotateRight<16>(const uint32x4_t& val) |
135 | | { |
136 | | #if defined(__aarch32__) || defined(__aarch64__) |
137 | | return vreinterpretq_u32_u16( |
138 | | vrev32q_u16(vreinterpretq_u16_u32(val))); |
139 | | #else |
140 | | // fallback to slower C++ rotation. |
141 | | return vorrq_u32(vshrq_n_u32(val, 16), |
142 | | vshlq_n_u32(val, 32 - 16)); |
143 | | #endif |
144 | | } |
145 | | |
146 | | // ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte |
147 | | // rotation on the 128-bit vector word: |
148 | | // * [3,2,1,0] => [0,3,2,1] is Extract<1>(x) |
149 | | // * [3,2,1,0] => [1,0,3,2] is Extract<2>(x) |
150 | | // * [3,2,1,0] => [2,1,0,3] is Extract<3>(x) |
151 | | template <unsigned int S> |
152 | | inline uint32x4_t Extract(const uint32x4_t& val) |
153 | | { |
154 | | return vextq_u32(val, val, S); |
155 | | } |
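// Editor's sketch (hypothetical helper, not in the original file): the same lane
// rotation written with scalar indices. With both operands equal, vextq_u32(val,
// val, S) yields result lane i = val lane (i + S) % 4, which is exactly the byte
// rotation described in the comment above Extract.
inline void ExtractScalar(const uint32_t in[4], uint32_t out[4], unsigned int s)
{
    for (unsigned int i = 0; i < 4; ++i)
        out[i] = in[(i + s) % 4];   // lane i takes lane (i+s) mod 4, as vextq_u32 does
}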
156 | | |
157 | | // Helper to perform 64-bit addition across two elements of 32-bit vectors |
158 | | inline uint32x4_t Add64(const uint32x4_t& a, const uint32x4_t& b) |
159 | | { |
160 | | return vreinterpretq_u32_u64( |
161 | | vaddq_u64( |
162 | | vreinterpretq_u64_u32(a), |
163 | | vreinterpretq_u64_u32(b))); |
164 | | } |
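// Editor's sketch (assumption: as in the original ChaCha, state words 12-13 form
// a 64-bit little-endian block counter). Add64 above is the vector analogue of
// this scalar, carry-propagating increment; a plain 32-bit lane add would drop
// the carry from word 12 into word 13.
inline void IncrementCounter64(uint32_t state[16], uint32_t by)
{
    uint64_t ctr = (static_cast<uint64_t>(state[13]) << 32) | state[12];
    ctr += by;
    state[12] = static_cast<uint32_t>(ctr);
    state[13] = static_cast<uint32_t>(ctr >> 32);
}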
165 | | |
166 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
167 | | |
168 | | // ***************************** SSE2 ***************************** // |
169 | | |
170 | | #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) |
171 | | |
172 | | template <unsigned int R> |
173 | | inline __m128i RotateLeft(const __m128i val) |
174 | 0 | { |
175 | | #ifdef __XOP__ |
176 | | return _mm_roti_epi32(val, R); |
177 | | #else |
178 | 0 | return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); |
179 | 0 | #endif |
180 | 0 | } |
Unexecuted instantiation: chacha_simd.cpp:long long __vector(2) (anonymous namespace)::RotateLeft<12u>(long long __vector(2))
Unexecuted instantiation: chacha_simd.cpp:long long __vector(2) (anonymous namespace)::RotateLeft<7u>(long long __vector(2))
181 | | |
182 | | template <> |
183 | | inline __m128i RotateLeft<8>(const __m128i val) |
184 | 0 | { |
185 | | #if defined(__XOP__) |
186 | | return _mm_roti_epi32(val, 8); |
187 | | #elif defined(__SSSE3__) |
188 | | const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); |
189 | | return _mm_shuffle_epi8(val, mask); |
190 | | #else |
191 | 0 | return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 32-8)); |
192 | 0 | #endif |
193 | 0 | } |
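// Editor's note: _mm_set_epi8 lists bytes from most significant (byte 15) down to
// least significant (byte 0), so in memory order the mask above reads
// {3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14}: result byte i takes source byte
// mask[i], rotating each little-endian 32-bit lane left by 8 bits. It is the same
// table the NEON RotateLeft<8> specialization loads with vld1q_u8.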
194 | | |
195 | | template <> |
196 | | inline __m128i RotateLeft<16>(const __m128i val) |
197 | 0 | { |
198 | | #if defined(__XOP__) |
199 | | return _mm_roti_epi32(val, 16); |
200 | | #elif defined(__SSSE3__) |
201 | | const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); |
202 | | return _mm_shuffle_epi8(val, mask); |
203 | | #else |
204 | 0 | return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 32-16)); |
205 | 0 | #endif |
206 | 0 | } |
207 | | |
208 | | #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE |
209 | | |
210 | | // **************************** Altivec **************************** // |
211 | | |
212 | | #if (CRYPTOPP_ALTIVEC_AVAILABLE) |
213 | | |
214 | | // ChaCha_OperateKeystream is optimized for Altivec. However, the Altivec |
215 | | // path relies on vec_ld and vec_st for loads and stores, and on a composite |
216 | | // VecAdd64 that performs 64-bit element adds. vec_ld and vec_st add significant |
217 | | // overhead when memory is not aligned. Despite the drawbacks, Altivec |
218 | | // is still profitable. The numbers for ChaCha8 are: |
219 | | // |
220 | | // PowerMac, C++, 2.0 GHz: 205 MB/s, 9.29 cpb |
221 | | // PowerMac, Altivec, 2.0 GHz: 471 MB/s, 4.09 cpb |
222 | | |
223 | | using CryptoPP::uint8x16_p; |
224 | | using CryptoPP::uint32x4_p; |
225 | | using CryptoPP::VecLoad; |
226 | | using CryptoPP::VecLoadAligned; |
227 | | using CryptoPP::VecStore; |
228 | | using CryptoPP::VecPermute; |
229 | | |
230 | | // Permutes bytes in packed 32-bit words to little endian. |
231 | | // State is already in proper endian order. Input and |
232 | | // output must be permuted during load and save. |
233 | | inline uint32x4_p VecLoad32LE(const uint8_t src[16]) |
234 | | { |
235 | | #if (CRYPTOPP_BIG_ENDIAN) |
236 | | const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; |
237 | | const uint32x4_p val = VecLoad(src); |
238 | | return VecPermute(val, val, mask); |
239 | | #else |
240 | | return VecLoad(src); |
241 | | #endif |
242 | | } |
243 | | |
244 | | // Permutes bytes in packed 32-bit words to little endian. |
245 | | // State is already in proper endian order. Input and |
246 | | // output must be permuted during load and save. |
247 | | inline void VecStore32LE(uint8_t dest[16], const uint32x4_p& val) |
248 | | { |
249 | | #if (CRYPTOPP_BIG_ENDIAN) |
250 | | const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; |
251 | | VecStore(VecPermute(val, val, mask), dest); |
252 | | #else |
253 | | return VecStore(val, dest); |
254 | | #endif |
255 | | } |
256 | | |
257 | | // ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte |
258 | | // rotation on the 128-bit vector word: |
259 | | // * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x) |
260 | | // * [3,2,1,0] => [1,0,3,2] is Shuffle<2>(x) |
261 | | // * [3,2,1,0] => [2,1,0,3] is Shuffle<3>(x) |
262 | | template <unsigned int S> |
263 | | inline uint32x4_p Shuffle(const uint32x4_p& val) |
264 | | { |
265 | | CRYPTOPP_ASSERT(0); |
266 | | return val; |
267 | | } |
268 | | |
269 | | template <> |
270 | | inline uint32x4_p Shuffle<1>(const uint32x4_p& val) |
271 | | { |
272 | | const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3}; |
273 | | return VecPermute(val, val, mask); |
274 | | } |
275 | | |
276 | | template <> |
277 | | inline uint32x4_p Shuffle<2>(const uint32x4_p& val) |
278 | | { |
279 | | const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7}; |
280 | | return VecPermute(val, val, mask); |
281 | | } |
282 | | |
283 | | template <> |
284 | | inline uint32x4_p Shuffle<3>(const uint32x4_p& val) |
285 | | { |
286 | | const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11}; |
287 | | return VecPermute(val, val, mask); |
288 | | } |
289 | | |
290 | | #endif // CRYPTOPP_ALTIVEC_AVAILABLE |
291 | | |
292 | | ANONYMOUS_NAMESPACE_END |
293 | | |
294 | | NAMESPACE_BEGIN(CryptoPP) |
295 | | |
296 | | // ***************************** NEON ***************************** // |
297 | | |
298 | | #if (CRYPTOPP_ARM_NEON_AVAILABLE) |
299 | | |
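// Editor's note: each SIMD kernel in this file produces four ChaCha blocks
// (4 x 64 bytes = 256 bytes) per call. rK_J below holds row J of block K; rows
// 0-2 start identical across blocks, and row 3 of blocks 1-3 has the 64-bit
// block counter advanced by 1, 2 and 3 (Add64 with CTRS) before the rounds run.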
300 | | void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, byte *output, unsigned int rounds) |
301 | | { |
302 | | const uint32x4_t state0 = vld1q_u32(state + 0*4); |
303 | | const uint32x4_t state1 = vld1q_u32(state + 1*4); |
304 | | const uint32x4_t state2 = vld1q_u32(state + 2*4); |
305 | | const uint32x4_t state3 = vld1q_u32(state + 3*4); |
306 | | |
307 | | const unsigned int w[] = {1,0,0,0, 2,0,0,0, 3,0,0,0}; |
308 | | const uint32x4_t CTRS[3] = { |
309 | | vld1q_u32(w+0), vld1q_u32(w+4), vld1q_u32(w+8) |
310 | | }; |
311 | | |
312 | | uint32x4_t r0_0 = state0; |
313 | | uint32x4_t r0_1 = state1; |
314 | | uint32x4_t r0_2 = state2; |
315 | | uint32x4_t r0_3 = state3; |
316 | | |
317 | | uint32x4_t r1_0 = state0; |
318 | | uint32x4_t r1_1 = state1; |
319 | | uint32x4_t r1_2 = state2; |
320 | | uint32x4_t r1_3 = Add64(r0_3, CTRS[0]); |
321 | | |
322 | | uint32x4_t r2_0 = state0; |
323 | | uint32x4_t r2_1 = state1; |
324 | | uint32x4_t r2_2 = state2; |
325 | | uint32x4_t r2_3 = Add64(r0_3, CTRS[1]); |
326 | | |
327 | | uint32x4_t r3_0 = state0; |
328 | | uint32x4_t r3_1 = state1; |
329 | | uint32x4_t r3_2 = state2; |
330 | | uint32x4_t r3_3 = Add64(r0_3, CTRS[2]); |
331 | | |
332 | | for (int i = static_cast<int>(rounds); i > 0; i -= 2) |
333 | | { |
334 | | r0_0 = vaddq_u32(r0_0, r0_1); |
335 | | r1_0 = vaddq_u32(r1_0, r1_1); |
336 | | r2_0 = vaddq_u32(r2_0, r2_1); |
337 | | r3_0 = vaddq_u32(r3_0, r3_1); |
338 | | |
339 | | r0_3 = veorq_u32(r0_3, r0_0); |
340 | | r1_3 = veorq_u32(r1_3, r1_0); |
341 | | r2_3 = veorq_u32(r2_3, r2_0); |
342 | | r3_3 = veorq_u32(r3_3, r3_0); |
343 | | |
344 | | r0_3 = RotateLeft<16>(r0_3); |
345 | | r1_3 = RotateLeft<16>(r1_3); |
346 | | r2_3 = RotateLeft<16>(r2_3); |
347 | | r3_3 = RotateLeft<16>(r3_3); |
348 | | |
349 | | r0_2 = vaddq_u32(r0_2, r0_3); |
350 | | r1_2 = vaddq_u32(r1_2, r1_3); |
351 | | r2_2 = vaddq_u32(r2_2, r2_3); |
352 | | r3_2 = vaddq_u32(r3_2, r3_3); |
353 | | |
354 | | r0_1 = veorq_u32(r0_1, r0_2); |
355 | | r1_1 = veorq_u32(r1_1, r1_2); |
356 | | r2_1 = veorq_u32(r2_1, r2_2); |
357 | | r3_1 = veorq_u32(r3_1, r3_2); |
358 | | |
359 | | r0_1 = RotateLeft<12>(r0_1); |
360 | | r1_1 = RotateLeft<12>(r1_1); |
361 | | r2_1 = RotateLeft<12>(r2_1); |
362 | | r3_1 = RotateLeft<12>(r3_1); |
363 | | |
364 | | r0_0 = vaddq_u32(r0_0, r0_1); |
365 | | r1_0 = vaddq_u32(r1_0, r1_1); |
366 | | r2_0 = vaddq_u32(r2_0, r2_1); |
367 | | r3_0 = vaddq_u32(r3_0, r3_1); |
368 | | |
369 | | r0_3 = veorq_u32(r0_3, r0_0); |
370 | | r1_3 = veorq_u32(r1_3, r1_0); |
371 | | r2_3 = veorq_u32(r2_3, r2_0); |
372 | | r3_3 = veorq_u32(r3_3, r3_0); |
373 | | |
374 | | r0_3 = RotateLeft<8>(r0_3); |
375 | | r1_3 = RotateLeft<8>(r1_3); |
376 | | r2_3 = RotateLeft<8>(r2_3); |
377 | | r3_3 = RotateLeft<8>(r3_3); |
378 | | |
379 | | r0_2 = vaddq_u32(r0_2, r0_3); |
380 | | r1_2 = vaddq_u32(r1_2, r1_3); |
381 | | r2_2 = vaddq_u32(r2_2, r2_3); |
382 | | r3_2 = vaddq_u32(r3_2, r3_3); |
383 | | |
384 | | r0_1 = veorq_u32(r0_1, r0_2); |
385 | | r1_1 = veorq_u32(r1_1, r1_2); |
386 | | r2_1 = veorq_u32(r2_1, r2_2); |
387 | | r3_1 = veorq_u32(r3_1, r3_2); |
388 | | |
389 | | r0_1 = RotateLeft<7>(r0_1); |
390 | | r1_1 = RotateLeft<7>(r1_1); |
391 | | r2_1 = RotateLeft<7>(r2_1); |
392 | | r3_1 = RotateLeft<7>(r3_1); |
393 | | |
394 | | r0_1 = Extract<1>(r0_1); |
395 | | r0_2 = Extract<2>(r0_2); |
396 | | r0_3 = Extract<3>(r0_3); |
397 | | |
398 | | r1_1 = Extract<1>(r1_1); |
399 | | r1_2 = Extract<2>(r1_2); |
400 | | r1_3 = Extract<3>(r1_3); |
401 | | |
402 | | r2_1 = Extract<1>(r2_1); |
403 | | r2_2 = Extract<2>(r2_2); |
404 | | r2_3 = Extract<3>(r2_3); |
405 | | |
406 | | r3_1 = Extract<1>(r3_1); |
407 | | r3_2 = Extract<2>(r3_2); |
408 | | r3_3 = Extract<3>(r3_3); |
409 | | |
410 | | r0_0 = vaddq_u32(r0_0, r0_1); |
411 | | r1_0 = vaddq_u32(r1_0, r1_1); |
412 | | r2_0 = vaddq_u32(r2_0, r2_1); |
413 | | r3_0 = vaddq_u32(r3_0, r3_1); |
414 | | |
415 | | r0_3 = veorq_u32(r0_3, r0_0); |
416 | | r1_3 = veorq_u32(r1_3, r1_0); |
417 | | r2_3 = veorq_u32(r2_3, r2_0); |
418 | | r3_3 = veorq_u32(r3_3, r3_0); |
419 | | |
420 | | r0_3 = RotateLeft<16>(r0_3); |
421 | | r1_3 = RotateLeft<16>(r1_3); |
422 | | r2_3 = RotateLeft<16>(r2_3); |
423 | | r3_3 = RotateLeft<16>(r3_3); |
424 | | |
425 | | r0_2 = vaddq_u32(r0_2, r0_3); |
426 | | r1_2 = vaddq_u32(r1_2, r1_3); |
427 | | r2_2 = vaddq_u32(r2_2, r2_3); |
428 | | r3_2 = vaddq_u32(r3_2, r3_3); |
429 | | |
430 | | r0_1 = veorq_u32(r0_1, r0_2); |
431 | | r1_1 = veorq_u32(r1_1, r1_2); |
432 | | r2_1 = veorq_u32(r2_1, r2_2); |
433 | | r3_1 = veorq_u32(r3_1, r3_2); |
434 | | |
435 | | r0_1 = RotateLeft<12>(r0_1); |
436 | | r1_1 = RotateLeft<12>(r1_1); |
437 | | r2_1 = RotateLeft<12>(r2_1); |
438 | | r3_1 = RotateLeft<12>(r3_1); |
439 | | |
440 | | r0_0 = vaddq_u32(r0_0, r0_1); |
441 | | r1_0 = vaddq_u32(r1_0, r1_1); |
442 | | r2_0 = vaddq_u32(r2_0, r2_1); |
443 | | r3_0 = vaddq_u32(r3_0, r3_1); |
444 | | |
445 | | r0_3 = veorq_u32(r0_3, r0_0); |
446 | | r1_3 = veorq_u32(r1_3, r1_0); |
447 | | r2_3 = veorq_u32(r2_3, r2_0); |
448 | | r3_3 = veorq_u32(r3_3, r3_0); |
449 | | |
450 | | r0_3 = RotateLeft<8>(r0_3); |
451 | | r1_3 = RotateLeft<8>(r1_3); |
452 | | r2_3 = RotateLeft<8>(r2_3); |
453 | | r3_3 = RotateLeft<8>(r3_3); |
454 | | |
455 | | r0_2 = vaddq_u32(r0_2, r0_3); |
456 | | r1_2 = vaddq_u32(r1_2, r1_3); |
457 | | r2_2 = vaddq_u32(r2_2, r2_3); |
458 | | r3_2 = vaddq_u32(r3_2, r3_3); |
459 | | |
460 | | r0_1 = veorq_u32(r0_1, r0_2); |
461 | | r1_1 = veorq_u32(r1_1, r1_2); |
462 | | r2_1 = veorq_u32(r2_1, r2_2); |
463 | | r3_1 = veorq_u32(r3_1, r3_2); |
464 | | |
465 | | r0_1 = RotateLeft<7>(r0_1); |
466 | | r1_1 = RotateLeft<7>(r1_1); |
467 | | r2_1 = RotateLeft<7>(r2_1); |
468 | | r3_1 = RotateLeft<7>(r3_1); |
469 | | |
470 | | r0_1 = Extract<3>(r0_1); |
471 | | r0_2 = Extract<2>(r0_2); |
472 | | r0_3 = Extract<1>(r0_3); |
473 | | |
474 | | r1_1 = Extract<3>(r1_1); |
475 | | r1_2 = Extract<2>(r1_2); |
476 | | r1_3 = Extract<1>(r1_3); |
477 | | |
478 | | r2_1 = Extract<3>(r2_1); |
479 | | r2_2 = Extract<2>(r2_2); |
480 | | r2_3 = Extract<1>(r2_3); |
481 | | |
482 | | r3_1 = Extract<3>(r3_1); |
483 | | r3_2 = Extract<2>(r3_2); |
484 | | r3_3 = Extract<1>(r3_3); |
485 | | } |
486 | | |
487 | | r0_0 = vaddq_u32(r0_0, state0); |
488 | | r0_1 = vaddq_u32(r0_1, state1); |
489 | | r0_2 = vaddq_u32(r0_2, state2); |
490 | | r0_3 = vaddq_u32(r0_3, state3); |
491 | | |
492 | | r1_0 = vaddq_u32(r1_0, state0); |
493 | | r1_1 = vaddq_u32(r1_1, state1); |
494 | | r1_2 = vaddq_u32(r1_2, state2); |
495 | | r1_3 = vaddq_u32(r1_3, state3); |
496 | | r1_3 = Add64(r1_3, CTRS[0]); |
497 | | |
498 | | r2_0 = vaddq_u32(r2_0, state0); |
499 | | r2_1 = vaddq_u32(r2_1, state1); |
500 | | r2_2 = vaddq_u32(r2_2, state2); |
501 | | r2_3 = vaddq_u32(r2_3, state3); |
502 | | r2_3 = Add64(r2_3, CTRS[1]); |
503 | | |
504 | | r3_0 = vaddq_u32(r3_0, state0); |
505 | | r3_1 = vaddq_u32(r3_1, state1); |
506 | | r3_2 = vaddq_u32(r3_2, state2); |
507 | | r3_3 = vaddq_u32(r3_3, state3); |
508 | | r3_3 = Add64(r3_3, CTRS[2]); |
509 | | |
510 | | if (input) |
511 | | { |
512 | | r0_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 0*16)), r0_0); |
513 | | r0_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 1*16)), r0_1); |
514 | | r0_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 2*16)), r0_2); |
515 | | r0_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 3*16)), r0_3); |
516 | | } |
517 | | |
518 | | vst1q_u8(output + 0*16, vreinterpretq_u8_u32(r0_0)); |
519 | | vst1q_u8(output + 1*16, vreinterpretq_u8_u32(r0_1)); |
520 | | vst1q_u8(output + 2*16, vreinterpretq_u8_u32(r0_2)); |
521 | | vst1q_u8(output + 3*16, vreinterpretq_u8_u32(r0_3)); |
522 | | |
523 | | if (input) |
524 | | { |
525 | | r1_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 4*16)), r1_0); |
526 | | r1_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 5*16)), r1_1); |
527 | | r1_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 6*16)), r1_2); |
528 | | r1_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 7*16)), r1_3); |
529 | | } |
530 | | |
531 | | vst1q_u8(output + 4*16, vreinterpretq_u8_u32(r1_0)); |
532 | | vst1q_u8(output + 5*16, vreinterpretq_u8_u32(r1_1)); |
533 | | vst1q_u8(output + 6*16, vreinterpretq_u8_u32(r1_2)); |
534 | | vst1q_u8(output + 7*16, vreinterpretq_u8_u32(r1_3)); |
535 | | |
536 | | if (input) |
537 | | { |
538 | | r2_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 8*16)), r2_0); |
539 | | r2_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 9*16)), r2_1); |
540 | | r2_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 10*16)), r2_2); |
541 | | r2_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 11*16)), r2_3); |
542 | | } |
543 | | |
544 | | vst1q_u8(output + 8*16, vreinterpretq_u8_u32(r2_0)); |
545 | | vst1q_u8(output + 9*16, vreinterpretq_u8_u32(r2_1)); |
546 | | vst1q_u8(output + 10*16, vreinterpretq_u8_u32(r2_2)); |
547 | | vst1q_u8(output + 11*16, vreinterpretq_u8_u32(r2_3)); |
548 | | |
549 | | if (input) |
550 | | { |
551 | | r3_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 12*16)), r3_0); |
552 | | r3_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 13*16)), r3_1); |
553 | | r3_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 14*16)), r3_2); |
554 | | r3_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 15*16)), r3_3); |
555 | | } |
556 | | |
557 | | vst1q_u8(output + 12*16, vreinterpretq_u8_u32(r3_0)); |
558 | | vst1q_u8(output + 13*16, vreinterpretq_u8_u32(r3_1)); |
559 | | vst1q_u8(output + 14*16, vreinterpretq_u8_u32(r3_2)); |
560 | | vst1q_u8(output + 15*16, vreinterpretq_u8_u32(r3_3)); |
561 | | } |
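// Editor's note: a NULL input asks the kernel for raw keystream; a non-NULL
// input supplies 256 bytes of plaintext or ciphertext that are XORed with the
// keystream as it is written, so the caller avoids a separate XOR pass.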
562 | | |
563 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
564 | | |
565 | | // ***************************** SSE2 ***************************** // |
566 | | |
567 | | #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) |
568 | | |
569 | | void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds) |
570 | 0 | { |
571 | 0 | const __m128i state0 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+0*4)); |
572 | 0 | const __m128i state1 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+1*4)); |
573 | 0 | const __m128i state2 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+2*4)); |
574 | 0 | const __m128i state3 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+3*4)); |
575 | |
576 | 0 | __m128i r0_0 = state0; |
577 | 0 | __m128i r0_1 = state1; |
578 | 0 | __m128i r0_2 = state2; |
579 | 0 | __m128i r0_3 = state3; |
580 | |
581 | 0 | __m128i r1_0 = state0; |
582 | 0 | __m128i r1_1 = state1; |
583 | 0 | __m128i r1_2 = state2; |
584 | 0 | __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1)); |
585 | |
586 | 0 | __m128i r2_0 = state0; |
587 | 0 | __m128i r2_1 = state1; |
588 | 0 | __m128i r2_2 = state2; |
589 | 0 | __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2)); |
590 | |
591 | 0 | __m128i r3_0 = state0; |
592 | 0 | __m128i r3_1 = state1; |
593 | 0 | __m128i r3_2 = state2; |
594 | 0 | __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3)); |
595 | |
596 | 0 | for (int i = static_cast<int>(rounds); i > 0; i -= 2) |
597 | 0 | { |
598 | 0 | r0_0 = _mm_add_epi32(r0_0, r0_1); |
599 | 0 | r1_0 = _mm_add_epi32(r1_0, r1_1); |
600 | 0 | r2_0 = _mm_add_epi32(r2_0, r2_1); |
601 | 0 | r3_0 = _mm_add_epi32(r3_0, r3_1); |
602 | |
603 | 0 | r0_3 = _mm_xor_si128(r0_3, r0_0); |
604 | 0 | r1_3 = _mm_xor_si128(r1_3, r1_0); |
605 | 0 | r2_3 = _mm_xor_si128(r2_3, r2_0); |
606 | 0 | r3_3 = _mm_xor_si128(r3_3, r3_0); |
607 | |
608 | 0 | r0_3 = RotateLeft<16>(r0_3); |
609 | 0 | r1_3 = RotateLeft<16>(r1_3); |
610 | 0 | r2_3 = RotateLeft<16>(r2_3); |
611 | 0 | r3_3 = RotateLeft<16>(r3_3); |
612 | |
613 | 0 | r0_2 = _mm_add_epi32(r0_2, r0_3); |
614 | 0 | r1_2 = _mm_add_epi32(r1_2, r1_3); |
615 | 0 | r2_2 = _mm_add_epi32(r2_2, r2_3); |
616 | 0 | r3_2 = _mm_add_epi32(r3_2, r3_3); |
617 | |
618 | 0 | r0_1 = _mm_xor_si128(r0_1, r0_2); |
619 | 0 | r1_1 = _mm_xor_si128(r1_1, r1_2); |
620 | 0 | r2_1 = _mm_xor_si128(r2_1, r2_2); |
621 | 0 | r3_1 = _mm_xor_si128(r3_1, r3_2); |
622 | |
623 | 0 | r0_1 = RotateLeft<12>(r0_1); |
624 | 0 | r1_1 = RotateLeft<12>(r1_1); |
625 | 0 | r2_1 = RotateLeft<12>(r2_1); |
626 | 0 | r3_1 = RotateLeft<12>(r3_1); |
627 | |
628 | 0 | r0_0 = _mm_add_epi32(r0_0, r0_1); |
629 | 0 | r1_0 = _mm_add_epi32(r1_0, r1_1); |
630 | 0 | r2_0 = _mm_add_epi32(r2_0, r2_1); |
631 | 0 | r3_0 = _mm_add_epi32(r3_0, r3_1); |
632 | |
633 | 0 | r0_3 = _mm_xor_si128(r0_3, r0_0); |
634 | 0 | r1_3 = _mm_xor_si128(r1_3, r1_0); |
635 | 0 | r2_3 = _mm_xor_si128(r2_3, r2_0); |
636 | 0 | r3_3 = _mm_xor_si128(r3_3, r3_0); |
637 | |
638 | 0 | r0_3 = RotateLeft<8>(r0_3); |
639 | 0 | r1_3 = RotateLeft<8>(r1_3); |
640 | 0 | r2_3 = RotateLeft<8>(r2_3); |
641 | 0 | r3_3 = RotateLeft<8>(r3_3); |
642 | |
643 | 0 | r0_2 = _mm_add_epi32(r0_2, r0_3); |
644 | 0 | r1_2 = _mm_add_epi32(r1_2, r1_3); |
645 | 0 | r2_2 = _mm_add_epi32(r2_2, r2_3); |
646 | 0 | r3_2 = _mm_add_epi32(r3_2, r3_3); |
647 | |
648 | 0 | r0_1 = _mm_xor_si128(r0_1, r0_2); |
649 | 0 | r1_1 = _mm_xor_si128(r1_1, r1_2); |
650 | 0 | r2_1 = _mm_xor_si128(r2_1, r2_2); |
651 | 0 | r3_1 = _mm_xor_si128(r3_1, r3_2); |
652 | |
653 | 0 | r0_1 = RotateLeft<7>(r0_1); |
654 | 0 | r1_1 = RotateLeft<7>(r1_1); |
655 | 0 | r2_1 = RotateLeft<7>(r2_1); |
656 | 0 | r3_1 = RotateLeft<7>(r3_1); |
657 | |
658 | 0 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); |
659 | 0 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); |
660 | 0 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); |
661 | |
662 | 0 | r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); |
663 | 0 | r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); |
664 | 0 | r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); |
665 | |
666 | 0 | r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); |
667 | 0 | r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); |
668 | 0 | r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3)); |
669 | |
|
670 | 0 | r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); |
671 | 0 | r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); |
672 | 0 | r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); |
673 | |
|
674 | 0 | r0_0 = _mm_add_epi32(r0_0, r0_1); |
675 | 0 | r1_0 = _mm_add_epi32(r1_0, r1_1); |
676 | 0 | r2_0 = _mm_add_epi32(r2_0, r2_1); |
677 | 0 | r3_0 = _mm_add_epi32(r3_0, r3_1); |
678 | |
679 | 0 | r0_3 = _mm_xor_si128(r0_3, r0_0); |
680 | 0 | r1_3 = _mm_xor_si128(r1_3, r1_0); |
681 | 0 | r2_3 = _mm_xor_si128(r2_3, r2_0); |
682 | 0 | r3_3 = _mm_xor_si128(r3_3, r3_0); |
683 | |
684 | 0 | r0_3 = RotateLeft<16>(r0_3); |
685 | 0 | r1_3 = RotateLeft<16>(r1_3); |
686 | 0 | r2_3 = RotateLeft<16>(r2_3); |
687 | 0 | r3_3 = RotateLeft<16>(r3_3); |
688 | |
689 | 0 | r0_2 = _mm_add_epi32(r0_2, r0_3); |
690 | 0 | r1_2 = _mm_add_epi32(r1_2, r1_3); |
691 | 0 | r2_2 = _mm_add_epi32(r2_2, r2_3); |
692 | 0 | r3_2 = _mm_add_epi32(r3_2, r3_3); |
693 | |
694 | 0 | r0_1 = _mm_xor_si128(r0_1, r0_2); |
695 | 0 | r1_1 = _mm_xor_si128(r1_1, r1_2); |
696 | 0 | r2_1 = _mm_xor_si128(r2_1, r2_2); |
697 | 0 | r3_1 = _mm_xor_si128(r3_1, r3_2); |
698 | |
699 | 0 | r0_1 = RotateLeft<12>(r0_1); |
700 | 0 | r1_1 = RotateLeft<12>(r1_1); |
701 | 0 | r2_1 = RotateLeft<12>(r2_1); |
702 | 0 | r3_1 = RotateLeft<12>(r3_1); |
703 | |
704 | 0 | r0_0 = _mm_add_epi32(r0_0, r0_1); |
705 | 0 | r1_0 = _mm_add_epi32(r1_0, r1_1); |
706 | 0 | r2_0 = _mm_add_epi32(r2_0, r2_1); |
707 | 0 | r3_0 = _mm_add_epi32(r3_0, r3_1); |
708 | |
709 | 0 | r0_3 = _mm_xor_si128(r0_3, r0_0); |
710 | 0 | r1_3 = _mm_xor_si128(r1_3, r1_0); |
711 | 0 | r2_3 = _mm_xor_si128(r2_3, r2_0); |
712 | 0 | r3_3 = _mm_xor_si128(r3_3, r3_0); |
713 | |
714 | 0 | r0_3 = RotateLeft<8>(r0_3); |
715 | 0 | r1_3 = RotateLeft<8>(r1_3); |
716 | 0 | r2_3 = RotateLeft<8>(r2_3); |
717 | 0 | r3_3 = RotateLeft<8>(r3_3); |
718 | |
719 | 0 | r0_2 = _mm_add_epi32(r0_2, r0_3); |
720 | 0 | r1_2 = _mm_add_epi32(r1_2, r1_3); |
721 | 0 | r2_2 = _mm_add_epi32(r2_2, r2_3); |
722 | 0 | r3_2 = _mm_add_epi32(r3_2, r3_3); |
723 | |
724 | 0 | r0_1 = _mm_xor_si128(r0_1, r0_2); |
725 | 0 | r1_1 = _mm_xor_si128(r1_1, r1_2); |
726 | 0 | r2_1 = _mm_xor_si128(r2_1, r2_2); |
727 | 0 | r3_1 = _mm_xor_si128(r3_1, r3_2); |
728 | |
729 | 0 | r0_1 = RotateLeft<7>(r0_1); |
730 | 0 | r1_1 = RotateLeft<7>(r1_1); |
731 | 0 | r2_1 = RotateLeft<7>(r2_1); |
732 | 0 | r3_1 = RotateLeft<7>(r3_1); |
733 | |
734 | 0 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); |
735 | 0 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); |
736 | 0 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); |
737 | |
738 | 0 | r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); |
739 | 0 | r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); |
740 | 0 | r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); |
741 | |
742 | 0 | r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); |
743 | 0 | r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); |
744 | 0 | r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); |
745 | |
746 | 0 | r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); |
747 | 0 | r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); |
748 | 0 | r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); |
749 | 0 | } |
750 | |
751 | 0 | r0_0 = _mm_add_epi32(r0_0, state0); |
752 | 0 | r0_1 = _mm_add_epi32(r0_1, state1); |
753 | 0 | r0_2 = _mm_add_epi32(r0_2, state2); |
754 | 0 | r0_3 = _mm_add_epi32(r0_3, state3); |
755 | |
756 | 0 | r1_0 = _mm_add_epi32(r1_0, state0); |
757 | 0 | r1_1 = _mm_add_epi32(r1_1, state1); |
758 | 0 | r1_2 = _mm_add_epi32(r1_2, state2); |
759 | 0 | r1_3 = _mm_add_epi32(r1_3, state3); |
760 | 0 | r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); |
761 | |
762 | 0 | r2_0 = _mm_add_epi32(r2_0, state0); |
763 | 0 | r2_1 = _mm_add_epi32(r2_1, state1); |
764 | 0 | r2_2 = _mm_add_epi32(r2_2, state2); |
765 | 0 | r2_3 = _mm_add_epi32(r2_3, state3); |
766 | 0 | r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); |
767 | |
768 | 0 | r3_0 = _mm_add_epi32(r3_0, state0); |
769 | 0 | r3_1 = _mm_add_epi32(r3_1, state1); |
770 | 0 | r3_2 = _mm_add_epi32(r3_2, state2); |
771 | 0 | r3_3 = _mm_add_epi32(r3_3, state3); |
772 | 0 | r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); |
773 | |
774 | 0 | if (input) |
775 | 0 | { |
776 | 0 | r0_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+0*16)), r0_0); |
777 | 0 | r0_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+1*16)), r0_1); |
778 | 0 | r0_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+2*16)), r0_2); |
779 | 0 | r0_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+3*16)), r0_3); |
780 | 0 | } |
781 | |
782 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+0*16), r0_0); |
783 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+1*16), r0_1); |
784 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+2*16), r0_2); |
785 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+3*16), r0_3); |
786 | |
787 | 0 | if (input) |
788 | 0 | { |
789 | 0 | r1_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+4*16)), r1_0); |
790 | 0 | r1_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+5*16)), r1_1); |
791 | 0 | r1_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+6*16)), r1_2); |
792 | 0 | r1_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+7*16)), r1_3); |
793 | 0 | } |
794 | |
795 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+4*16), r1_0); |
796 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+5*16), r1_1); |
797 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+6*16), r1_2); |
798 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+7*16), r1_3); |
799 | |
800 | 0 | if (input) |
801 | 0 | { |
802 | 0 | r2_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+ 8*16)), r2_0); |
803 | 0 | r2_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+ 9*16)), r2_1); |
804 | 0 | r2_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+10*16)), r2_2); |
805 | 0 | r2_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+11*16)), r2_3); |
806 | 0 | } |
807 | |
808 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+ 8*16), r2_0); |
809 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+ 9*16), r2_1); |
810 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+10*16), r2_2); |
811 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+11*16), r2_3); |
812 | |
813 | 0 | if (input) |
814 | 0 | { |
815 | 0 | r3_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+12*16)), r3_0); |
816 | 0 | r3_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+13*16)), r3_1); |
817 | 0 | r3_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+14*16)), r3_2); |
818 | 0 | r3_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+15*16)), r3_3); |
819 | 0 | } |
820 | |
821 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+12*16), r3_0); |
822 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+13*16), r3_1); |
823 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+14*16), r3_2); |
824 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(output+15*16), r3_3); |
825 | 0 | } |
826 | | |
827 | | #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE |
828 | | |
829 | | #if (CRYPTOPP_ALTIVEC_AVAILABLE) |
830 | | |
831 | | // ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC, |
832 | | // depending on the flags used to compile this source file. The |
833 | | // abstractions are handled in VecLoad, VecStore and friends. In |
834 | | // the future we may provide both POWER7 and ALTIVEC at the same |
835 | | // time to better support distros. |
836 | | inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds) |
837 | | { |
838 | | const uint32x4_p state0 = VecLoadAligned(state + 0*4); |
839 | | const uint32x4_p state1 = VecLoadAligned(state + 1*4); |
840 | | const uint32x4_p state2 = VecLoadAligned(state + 2*4); |
841 | | const uint32x4_p state3 = VecLoadAligned(state + 3*4); |
842 | | |
843 | | const uint32x4_p CTRS[3] = { |
844 | | {1,0,0,0}, {2,0,0,0}, {3,0,0,0} |
845 | | }; |
846 | | |
847 | | uint32x4_p r0_0 = state0; |
848 | | uint32x4_p r0_1 = state1; |
849 | | uint32x4_p r0_2 = state2; |
850 | | uint32x4_p r0_3 = state3; |
851 | | |
852 | | uint32x4_p r1_0 = state0; |
853 | | uint32x4_p r1_1 = state1; |
854 | | uint32x4_p r1_2 = state2; |
855 | | uint32x4_p r1_3 = VecAdd64(r0_3, CTRS[0]); |
856 | | |
857 | | uint32x4_p r2_0 = state0; |
858 | | uint32x4_p r2_1 = state1; |
859 | | uint32x4_p r2_2 = state2; |
860 | | uint32x4_p r2_3 = VecAdd64(r0_3, CTRS[1]); |
861 | | |
862 | | uint32x4_p r3_0 = state0; |
863 | | uint32x4_p r3_1 = state1; |
864 | | uint32x4_p r3_2 = state2; |
865 | | uint32x4_p r3_3 = VecAdd64(r0_3, CTRS[2]); |
866 | | |
867 | | for (int i = static_cast<int>(rounds); i > 0; i -= 2) |
868 | | { |
869 | | r0_0 = VecAdd(r0_0, r0_1); |
870 | | r1_0 = VecAdd(r1_0, r1_1); |
871 | | r2_0 = VecAdd(r2_0, r2_1); |
872 | | r3_0 = VecAdd(r3_0, r3_1); |
873 | | |
874 | | r0_3 = VecXor(r0_3, r0_0); |
875 | | r1_3 = VecXor(r1_3, r1_0); |
876 | | r2_3 = VecXor(r2_3, r2_0); |
877 | | r3_3 = VecXor(r3_3, r3_0); |
878 | | |
879 | | r0_3 = VecRotateLeft<16>(r0_3); |
880 | | r1_3 = VecRotateLeft<16>(r1_3); |
881 | | r2_3 = VecRotateLeft<16>(r2_3); |
882 | | r3_3 = VecRotateLeft<16>(r3_3); |
883 | | |
884 | | r0_2 = VecAdd(r0_2, r0_3); |
885 | | r1_2 = VecAdd(r1_2, r1_3); |
886 | | r2_2 = VecAdd(r2_2, r2_3); |
887 | | r3_2 = VecAdd(r3_2, r3_3); |
888 | | |
889 | | r0_1 = VecXor(r0_1, r0_2); |
890 | | r1_1 = VecXor(r1_1, r1_2); |
891 | | r2_1 = VecXor(r2_1, r2_2); |
892 | | r3_1 = VecXor(r3_1, r3_2); |
893 | | |
894 | | r0_1 = VecRotateLeft<12>(r0_1); |
895 | | r1_1 = VecRotateLeft<12>(r1_1); |
896 | | r2_1 = VecRotateLeft<12>(r2_1); |
897 | | r3_1 = VecRotateLeft<12>(r3_1); |
898 | | |
899 | | r0_0 = VecAdd(r0_0, r0_1); |
900 | | r1_0 = VecAdd(r1_0, r1_1); |
901 | | r2_0 = VecAdd(r2_0, r2_1); |
902 | | r3_0 = VecAdd(r3_0, r3_1); |
903 | | |
904 | | r0_3 = VecXor(r0_3, r0_0); |
905 | | r1_3 = VecXor(r1_3, r1_0); |
906 | | r2_3 = VecXor(r2_3, r2_0); |
907 | | r3_3 = VecXor(r3_3, r3_0); |
908 | | |
909 | | r0_3 = VecRotateLeft<8>(r0_3); |
910 | | r1_3 = VecRotateLeft<8>(r1_3); |
911 | | r2_3 = VecRotateLeft<8>(r2_3); |
912 | | r3_3 = VecRotateLeft<8>(r3_3); |
913 | | |
914 | | r0_2 = VecAdd(r0_2, r0_3); |
915 | | r1_2 = VecAdd(r1_2, r1_3); |
916 | | r2_2 = VecAdd(r2_2, r2_3); |
917 | | r3_2 = VecAdd(r3_2, r3_3); |
918 | | |
919 | | r0_1 = VecXor(r0_1, r0_2); |
920 | | r1_1 = VecXor(r1_1, r1_2); |
921 | | r2_1 = VecXor(r2_1, r2_2); |
922 | | r3_1 = VecXor(r3_1, r3_2); |
923 | | |
924 | | r0_1 = VecRotateLeft<7>(r0_1); |
925 | | r1_1 = VecRotateLeft<7>(r1_1); |
926 | | r2_1 = VecRotateLeft<7>(r2_1); |
927 | | r3_1 = VecRotateLeft<7>(r3_1); |
928 | | |
929 | | r0_1 = Shuffle<1>(r0_1); |
930 | | r0_2 = Shuffle<2>(r0_2); |
931 | | r0_3 = Shuffle<3>(r0_3); |
932 | | |
933 | | r1_1 = Shuffle<1>(r1_1); |
934 | | r1_2 = Shuffle<2>(r1_2); |
935 | | r1_3 = Shuffle<3>(r1_3); |
936 | | |
937 | | r2_1 = Shuffle<1>(r2_1); |
938 | | r2_2 = Shuffle<2>(r2_2); |
939 | | r2_3 = Shuffle<3>(r2_3); |
940 | | |
941 | | r3_1 = Shuffle<1>(r3_1); |
942 | | r3_2 = Shuffle<2>(r3_2); |
943 | | r3_3 = Shuffle<3>(r3_3); |
944 | | |
945 | | r0_0 = VecAdd(r0_0, r0_1); |
946 | | r1_0 = VecAdd(r1_0, r1_1); |
947 | | r2_0 = VecAdd(r2_0, r2_1); |
948 | | r3_0 = VecAdd(r3_0, r3_1); |
949 | | |
950 | | r0_3 = VecXor(r0_3, r0_0); |
951 | | r1_3 = VecXor(r1_3, r1_0); |
952 | | r2_3 = VecXor(r2_3, r2_0); |
953 | | r3_3 = VecXor(r3_3, r3_0); |
954 | | |
955 | | r0_3 = VecRotateLeft<16>(r0_3); |
956 | | r1_3 = VecRotateLeft<16>(r1_3); |
957 | | r2_3 = VecRotateLeft<16>(r2_3); |
958 | | r3_3 = VecRotateLeft<16>(r3_3); |
959 | | |
960 | | r0_2 = VecAdd(r0_2, r0_3); |
961 | | r1_2 = VecAdd(r1_2, r1_3); |
962 | | r2_2 = VecAdd(r2_2, r2_3); |
963 | | r3_2 = VecAdd(r3_2, r3_3); |
964 | | |
965 | | r0_1 = VecXor(r0_1, r0_2); |
966 | | r1_1 = VecXor(r1_1, r1_2); |
967 | | r2_1 = VecXor(r2_1, r2_2); |
968 | | r3_1 = VecXor(r3_1, r3_2); |
969 | | |
970 | | r0_1 = VecRotateLeft<12>(r0_1); |
971 | | r1_1 = VecRotateLeft<12>(r1_1); |
972 | | r2_1 = VecRotateLeft<12>(r2_1); |
973 | | r3_1 = VecRotateLeft<12>(r3_1); |
974 | | |
975 | | r0_0 = VecAdd(r0_0, r0_1); |
976 | | r1_0 = VecAdd(r1_0, r1_1); |
977 | | r2_0 = VecAdd(r2_0, r2_1); |
978 | | r3_0 = VecAdd(r3_0, r3_1); |
979 | | |
980 | | r0_3 = VecXor(r0_3, r0_0); |
981 | | r1_3 = VecXor(r1_3, r1_0); |
982 | | r2_3 = VecXor(r2_3, r2_0); |
983 | | r3_3 = VecXor(r3_3, r3_0); |
984 | | |
985 | | r0_3 = VecRotateLeft<8>(r0_3); |
986 | | r1_3 = VecRotateLeft<8>(r1_3); |
987 | | r2_3 = VecRotateLeft<8>(r2_3); |
988 | | r3_3 = VecRotateLeft<8>(r3_3); |
989 | | |
990 | | r0_2 = VecAdd(r0_2, r0_3); |
991 | | r1_2 = VecAdd(r1_2, r1_3); |
992 | | r2_2 = VecAdd(r2_2, r2_3); |
993 | | r3_2 = VecAdd(r3_2, r3_3); |
994 | | |
995 | | r0_1 = VecXor(r0_1, r0_2); |
996 | | r1_1 = VecXor(r1_1, r1_2); |
997 | | r2_1 = VecXor(r2_1, r2_2); |
998 | | r3_1 = VecXor(r3_1, r3_2); |
999 | | |
1000 | | r0_1 = VecRotateLeft<7>(r0_1); |
1001 | | r1_1 = VecRotateLeft<7>(r1_1); |
1002 | | r2_1 = VecRotateLeft<7>(r2_1); |
1003 | | r3_1 = VecRotateLeft<7>(r3_1); |
1004 | | |
1005 | | r0_1 = Shuffle<3>(r0_1); |
1006 | | r0_2 = Shuffle<2>(r0_2); |
1007 | | r0_3 = Shuffle<1>(r0_3); |
1008 | | |
1009 | | r1_1 = Shuffle<3>(r1_1); |
1010 | | r1_2 = Shuffle<2>(r1_2); |
1011 | | r1_3 = Shuffle<1>(r1_3); |
1012 | | |
1013 | | r2_1 = Shuffle<3>(r2_1); |
1014 | | r2_2 = Shuffle<2>(r2_2); |
1015 | | r2_3 = Shuffle<1>(r2_3); |
1016 | | |
1017 | | r3_1 = Shuffle<3>(r3_1); |
1018 | | r3_2 = Shuffle<2>(r3_2); |
1019 | | r3_3 = Shuffle<1>(r3_3); |
1020 | | } |
1021 | | |
1022 | | r0_0 = VecAdd(r0_0, state0); |
1023 | | r0_1 = VecAdd(r0_1, state1); |
1024 | | r0_2 = VecAdd(r0_2, state2); |
1025 | | r0_3 = VecAdd(r0_3, state3); |
1026 | | |
1027 | | r1_0 = VecAdd(r1_0, state0); |
1028 | | r1_1 = VecAdd(r1_1, state1); |
1029 | | r1_2 = VecAdd(r1_2, state2); |
1030 | | r1_3 = VecAdd(r1_3, state3); |
1031 | | r1_3 = VecAdd64(r1_3, CTRS[0]); |
1032 | | |
1033 | | r2_0 = VecAdd(r2_0, state0); |
1034 | | r2_1 = VecAdd(r2_1, state1); |
1035 | | r2_2 = VecAdd(r2_2, state2); |
1036 | | r2_3 = VecAdd(r2_3, state3); |
1037 | | r2_3 = VecAdd64(r2_3, CTRS[1]); |
1038 | | |
1039 | | r3_0 = VecAdd(r3_0, state0); |
1040 | | r3_1 = VecAdd(r3_1, state1); |
1041 | | r3_2 = VecAdd(r3_2, state2); |
1042 | | r3_3 = VecAdd(r3_3, state3); |
1043 | | r3_3 = VecAdd64(r3_3, CTRS[2]); |
1044 | | |
1045 | | if (input) |
1046 | | { |
1047 | | r0_0 = VecXor(VecLoad32LE(input + 0*16), r0_0); |
1048 | | r0_1 = VecXor(VecLoad32LE(input + 1*16), r0_1); |
1049 | | r0_2 = VecXor(VecLoad32LE(input + 2*16), r0_2); |
1050 | | r0_3 = VecXor(VecLoad32LE(input + 3*16), r0_3); |
1051 | | } |
1052 | | |
1053 | | VecStore32LE(output + 0*16, r0_0); |
1054 | | VecStore32LE(output + 1*16, r0_1); |
1055 | | VecStore32LE(output + 2*16, r0_2); |
1056 | | VecStore32LE(output + 3*16, r0_3); |
1057 | | |
1058 | | if (input) |
1059 | | { |
1060 | | r1_0 = VecXor(VecLoad32LE(input + 4*16), r1_0); |
1061 | | r1_1 = VecXor(VecLoad32LE(input + 5*16), r1_1); |
1062 | | r1_2 = VecXor(VecLoad32LE(input + 6*16), r1_2); |
1063 | | r1_3 = VecXor(VecLoad32LE(input + 7*16), r1_3); |
1064 | | } |
1065 | | |
1066 | | VecStore32LE(output + 4*16, r1_0); |
1067 | | VecStore32LE(output + 5*16, r1_1); |
1068 | | VecStore32LE(output + 6*16, r1_2); |
1069 | | VecStore32LE(output + 7*16, r1_3); |
1070 | | |
1071 | | if (input) |
1072 | | { |
1073 | | r2_0 = VecXor(VecLoad32LE(input + 8*16), r2_0); |
1074 | | r2_1 = VecXor(VecLoad32LE(input + 9*16), r2_1); |
1075 | | r2_2 = VecXor(VecLoad32LE(input + 10*16), r2_2); |
1076 | | r2_3 = VecXor(VecLoad32LE(input + 11*16), r2_3); |
1077 | | } |
1078 | | |
1079 | | VecStore32LE(output + 8*16, r2_0); |
1080 | | VecStore32LE(output + 9*16, r2_1); |
1081 | | VecStore32LE(output + 10*16, r2_2); |
1082 | | VecStore32LE(output + 11*16, r2_3); |
1083 | | |
1084 | | if (input) |
1085 | | { |
1086 | | r3_0 = VecXor(VecLoad32LE(input + 12*16), r3_0); |
1087 | | r3_1 = VecXor(VecLoad32LE(input + 13*16), r3_1); |
1088 | | r3_2 = VecXor(VecLoad32LE(input + 14*16), r3_2); |
1089 | | r3_3 = VecXor(VecLoad32LE(input + 15*16), r3_3); |
1090 | | } |
1091 | | |
1092 | | VecStore32LE(output + 12*16, r3_0); |
1093 | | VecStore32LE(output + 13*16, r3_1); |
1094 | | VecStore32LE(output + 14*16, r3_2); |
1095 | | VecStore32LE(output + 15*16, r3_3); |
1096 | | } |
1097 | | |
1098 | | #endif // CRYPTOPP_ALTIVEC_AVAILABLE |
1099 | | |
1100 | | #if (CRYPTOPP_ALTIVEC_AVAILABLE) |
1101 | | |
1102 | | void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds) |
1103 | | { |
1104 | | ChaCha_OperateKeystream_CORE(state, input, output, rounds); |
1105 | | } |
1106 | | |
1107 | | #endif |
1108 | | |
1109 | | NAMESPACE_END |