/src/cryptopp/speck128_simd.cpp
Line | Count | Source |
1 | | // speck128_simd.cpp - written and placed in the public domain by Jeffrey Walton |
2 | | // |
3 | | // This source file uses intrinsics and built-ins to gain access to |
4 | | // SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate |
5 | | // source file is needed because additional CXXFLAGS are required to enable |
6 | | // the appropriate instruction sets in some build configurations. |
7 | | |
8 | | #include "pch.h" |
9 | | #include "config.h" |
10 | | |
11 | | #include "speck.h" |
12 | | #include "misc.h" |
13 | | |
14 | | // Uncomment for benchmarking C++ against SSE or NEON. |
15 | | // Do so in both speck.cpp and speck128_simd.cpp. |
16 | | // #undef CRYPTOPP_SSSE3_AVAILABLE |
17 | | // #undef CRYPTOPP_ARM_NEON_AVAILABLE |
18 | | |
19 | | #if (CRYPTOPP_SSSE3_AVAILABLE) |
20 | | # include "adv_simd.h" |
21 | | # include <pmmintrin.h> |
22 | | # include <tmmintrin.h> |
23 | | #endif |
24 | | |
25 | | #if defined(__XOP__) |
26 | | # if defined(CRYPTOPP_GCC_COMPATIBLE) |
27 | | # include <x86intrin.h> |
28 | | # endif |
29 | | # include <ammintrin.h> |
30 | | #endif // XOP |
31 | | |
32 | | #if (CRYPTOPP_ARM_NEON_HEADER) |
33 | | # include "adv_simd.h" |
34 | | # include <arm_neon.h> |
35 | | #endif |
36 | | |
37 | | #if (CRYPTOPP_ARM_ACLE_HEADER) |
38 | | # include <stdint.h> |
39 | | # include <arm_acle.h> |
40 | | #endif |
41 | | |
42 | | #if defined(_M_ARM64) |
43 | | # include "adv_simd.h" |
44 | | #endif |
45 | | |
46 | | #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) |
47 | | # include "adv_simd.h" |
48 | | # include "ppc_simd.h" |
49 | | #endif |
50 | | |
51 | | // Squash MS LNK4221 and libtool warnings |
52 | | extern const char SPECK128_SIMD_FNAME[] = __FILE__; |
53 | | |
54 | | ANONYMOUS_NAMESPACE_BEGIN |
55 | | |
56 | | using CryptoPP::byte; |
57 | | using CryptoPP::word32; |
58 | | using CryptoPP::word64; |
59 | | |
60 | | // *************************** ARM NEON ************************** // |
61 | | |
62 | | #if (CRYPTOPP_ARM_NEON_AVAILABLE) |
63 | | |
64 | | // Missing from Microsoft's ARM A-32 implementation |
65 | | #if defined(CRYPTOPP_MSC_VERSION) && !defined(_M_ARM64) |
66 | | inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr) |
67 | | { |
68 | | return vmovq_n_u64(*ptr); |
69 | | } |
70 | | #endif |
71 | | |
72 | | template <class T> |
73 | | inline T UnpackHigh64(const T& a, const T& b) |
74 | | { |
75 | | const uint64x1_t x(vget_high_u64((uint64x2_t)a)); |
76 | | const uint64x1_t y(vget_high_u64((uint64x2_t)b)); |
77 | | return (T)vcombine_u64(x, y); |
78 | | } |
79 | | |
80 | | template <class T> |
81 | | inline T UnpackLow64(const T& a, const T& b) |
82 | | { |
83 | | const uint64x1_t x(vget_low_u64((uint64x2_t)a)); |
84 | | const uint64x1_t y(vget_low_u64((uint64x2_t)b)); |
85 | | return (T)vcombine_u64(x, y); |
86 | | } |
87 | | |
88 | | template <unsigned int R> |
89 | | inline uint64x2_t RotateLeft64(const uint64x2_t& val) |
90 | | { |
91 | | const uint64x2_t a(vshlq_n_u64(val, R)); |
92 | | const uint64x2_t b(vshrq_n_u64(val, 64 - R)); |
93 | | return vorrq_u64(a, b); |
94 | | } |
95 | | |
96 | | template <unsigned int R> |
97 | | inline uint64x2_t RotateRight64(const uint64x2_t& val) |
98 | | { |
99 | | const uint64x2_t a(vshlq_n_u64(val, 64 - R)); |
100 | | const uint64x2_t b(vshrq_n_u64(val, R)); |
101 | | return vorrq_u64(a, b); |
102 | | } |
103 | | |
104 | | #if defined(__aarch32__) || defined(__aarch64__) |
105 | | // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. |
106 | | template <> |
107 | | inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val) |
108 | | { |
109 | | const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 }; |
110 | | const uint8x16_t mask = vld1q_u8(maskb); |
111 | | |
112 | | return vreinterpretq_u64_u8( |
113 | | vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); |
114 | | } |
115 | | |
116 | | // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. |
117 | | template <> |
118 | | inline uint64x2_t RotateRight64<8>(const uint64x2_t& val) |
119 | | { |
120 | | const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 }; |
121 | | const uint8x16_t mask = vld1q_u8(maskb); |
122 | | |
123 | | return vreinterpretq_u64_u8( |
124 | | vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); |
125 | | } |
126 | | #endif |
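
The two specializations above exploit the fact that, on a little-endian target, rotating a 64-bit lane by 8 bits is a pure byte permutation, so a single table lookup replaces two shifts and an OR. A scalar sketch of the identity the masks encode (illustration only, not part of the library):

    inline word64 RotL64By8_Sketch(word64 v)
    {
        // Every byte moves up one position within its 64-bit lane and the
        // top byte wraps to the bottom; vqtbl1q_u8 applies the equivalent
        // byte reordering to both lanes of the vector at once.
        return (v << 8) | (v >> 56);
    }
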
127 | | |
128 | | inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1, |
129 | | const word64 *subkeys, unsigned int rounds) |
130 | | { |
131 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
132 | | uint64x2_t x1 = UnpackHigh64(block0, block1); |
133 | | uint64x2_t y1 = UnpackLow64(block0, block1); |
134 | | |
135 | | for (size_t i=0; i < static_cast<size_t>(rounds); ++i) |
136 | | { |
137 | | const uint64x2_t rk = vld1q_dup_u64(subkeys+i); |
138 | | |
139 | | x1 = RotateRight64<8>(x1); |
140 | | x1 = vaddq_u64(x1, y1); |
141 | | x1 = veorq_u64(x1, rk); |
142 | | y1 = RotateLeft64<3>(y1); |
143 | | y1 = veorq_u64(y1, x1); |
144 | | } |
145 | | |
146 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
147 | | block0 = UnpackLow64(y1, x1); |
148 | | block1 = UnpackHigh64(y1, x1); |
149 | | } |
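
The loop above is the standard SPECK128 encryption round applied to two blocks per vector. A minimal scalar sketch of one round, for reference only (the library's real scalar implementation lives in speck.cpp):

    inline void SPECK128_EncRound_Sketch(word64& x, word64& y, word64 k)
    {
        x = (x >> 8) | (x << 56);   // RotateRight64<8>
        x += y;                     // vaddq_u64
        x ^= k;                     // veorq_u64 with the round key
        y = (y << 3) | (y >> 61);   // RotateLeft64<3>
        y ^= x;                     // veorq_u64
    }
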
150 | | |
151 | | inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, |
152 | | uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, |
153 | | const word64 *subkeys, unsigned int rounds) |
154 | | { |
155 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
156 | | uint64x2_t x1 = UnpackHigh64(block0, block1); |
157 | | uint64x2_t y1 = UnpackLow64(block0, block1); |
158 | | uint64x2_t x2 = UnpackHigh64(block2, block3); |
159 | | uint64x2_t y2 = UnpackLow64(block2, block3); |
160 | | uint64x2_t x3 = UnpackHigh64(block4, block5); |
161 | | uint64x2_t y3 = UnpackLow64(block4, block5); |
162 | | |
163 | | for (size_t i=0; i < static_cast<size_t>(rounds); ++i) |
164 | | { |
165 | | const uint64x2_t rk = vld1q_dup_u64(subkeys+i); |
166 | | |
167 | | x1 = RotateRight64<8>(x1); |
168 | | x2 = RotateRight64<8>(x2); |
169 | | x3 = RotateRight64<8>(x3); |
170 | | x1 = vaddq_u64(x1, y1); |
171 | | x2 = vaddq_u64(x2, y2); |
172 | | x3 = vaddq_u64(x3, y3); |
173 | | x1 = veorq_u64(x1, rk); |
174 | | x2 = veorq_u64(x2, rk); |
175 | | x3 = veorq_u64(x3, rk); |
176 | | y1 = RotateLeft64<3>(y1); |
177 | | y2 = RotateLeft64<3>(y2); |
178 | | y3 = RotateLeft64<3>(y3); |
179 | | y1 = veorq_u64(y1, x1); |
180 | | y2 = veorq_u64(y2, x2); |
181 | | y3 = veorq_u64(y3, x3); |
182 | | } |
183 | | |
184 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
185 | | block0 = UnpackLow64(y1, x1); |
186 | | block1 = UnpackHigh64(y1, x1); |
187 | | block2 = UnpackLow64(y2, x2); |
188 | | block3 = UnpackHigh64(y2, x2); |
189 | | block4 = UnpackLow64(y3, x3); |
190 | | block5 = UnpackHigh64(y3, x3); |
191 | | } |
192 | | |
193 | | inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1, |
194 | | const word64 *subkeys, unsigned int rounds) |
195 | | { |
196 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
197 | | uint64x2_t x1 = UnpackHigh64(block0, block1); |
198 | | uint64x2_t y1 = UnpackLow64(block0, block1); |
199 | | |
200 | | for (int i = static_cast<int>(rounds-1); i >= 0; --i) |
201 | | { |
202 | | const uint64x2_t rk = vld1q_dup_u64(subkeys+i); |
203 | | |
204 | | y1 = veorq_u64(y1, x1); |
205 | | y1 = RotateRight64<3>(y1); |
206 | | x1 = veorq_u64(x1, rk); |
207 | | x1 = vsubq_u64(x1, y1); |
208 | | x1 = RotateLeft64<8>(x1); |
209 | | } |
210 | | |
211 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
212 | | block0 = UnpackLow64(y1, x1); |
213 | | block1 = UnpackHigh64(y1, x1); |
214 | | } |
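
Decryption undoes the same round: the inverse operations run in reverse order and the round keys are consumed from last to first. A matching scalar sketch (illustration only):

    inline void SPECK128_DecRound_Sketch(word64& x, word64& y, word64 k)
    {
        y ^= x;                     // veorq_u64
        y = (y >> 3) | (y << 61);   // RotateRight64<3>
        x ^= k;                     // veorq_u64 with the round key
        x -= y;                     // vsubq_u64
        x = (x << 8) | (x >> 56);   // RotateLeft64<8>
    }
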
215 | | |
216 | | inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, |
217 | | uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, |
218 | | const word64 *subkeys, unsigned int rounds) |
219 | | { |
220 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
221 | | uint64x2_t x1 = UnpackHigh64(block0, block1); |
222 | | uint64x2_t y1 = UnpackLow64(block0, block1); |
223 | | uint64x2_t x2 = UnpackHigh64(block2, block3); |
224 | | uint64x2_t y2 = UnpackLow64(block2, block3); |
225 | | uint64x2_t x3 = UnpackHigh64(block4, block5); |
226 | | uint64x2_t y3 = UnpackLow64(block4, block5); |
227 | | |
228 | | for (int i = static_cast<int>(rounds-1); i >= 0; --i) |
229 | | { |
230 | | const uint64x2_t rk = vld1q_dup_u64(subkeys+i); |
231 | | |
232 | | y1 = veorq_u64(y1, x1); |
233 | | y2 = veorq_u64(y2, x2); |
234 | | y3 = veorq_u64(y3, x3); |
235 | | y1 = RotateRight64<3>(y1); |
236 | | y2 = RotateRight64<3>(y2); |
237 | | y3 = RotateRight64<3>(y3); |
238 | | x1 = veorq_u64(x1, rk); |
239 | | x2 = veorq_u64(x2, rk); |
240 | | x3 = veorq_u64(x3, rk); |
241 | | x1 = vsubq_u64(x1, y1); |
242 | | x2 = vsubq_u64(x2, y2); |
243 | | x3 = vsubq_u64(x3, y3); |
244 | | x1 = RotateLeft64<8>(x1); |
245 | | x2 = RotateLeft64<8>(x2); |
246 | | x3 = RotateLeft64<8>(x3); |
247 | | } |
248 | | |
249 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
250 | | block0 = UnpackLow64(y1, x1); |
251 | | block1 = UnpackHigh64(y1, x1); |
252 | | block2 = UnpackLow64(y2, x2); |
253 | | block3 = UnpackHigh64(y2, x2); |
254 | | block4 = UnpackLow64(y3, x3); |
255 | | block5 = UnpackHigh64(y3, x3); |
256 | | } |
257 | | |
258 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
259 | | |
260 | | // ***************************** IA-32 ***************************** // |
261 | | |
262 | | #if defined(CRYPTOPP_SSSE3_AVAILABLE) |
263 | | |
264 | | // GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html |
265 | | #ifndef DOUBLE_CAST |
266 | | # define DOUBLE_CAST(x) ((double *)(void *)(x)) |
267 | | #endif |
268 | | #ifndef CONST_DOUBLE_CAST |
269 | | # define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x)) |
270 | | #endif |
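
These casts route a word64 key pointer through void* so it can be handed to _mm_loaddup_pd, which broadcasts one 64-bit value into both lanes, without tripping GCC strict-aliasing warnings. A small sketch of the intended use, mirroring the decryption loops below (hypothetical helper name, not part of this file):

    inline __m128i LoadKeySplat_Sketch(const word64* subkeys, size_t i)
    {
        // The double type is only a vehicle for the 64-bit broadcast load;
        // no floating-point arithmetic is performed on the key material.
        return _mm_castpd_si128(_mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
    }
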
271 | | |
272 | | template <unsigned int R> |
273 | | inline __m128i RotateLeft64(const __m128i& val) |
274 | 0 | { |
275 | | #if defined(__XOP__) |
276 | | return _mm_roti_epi64(val, R); |
277 | | #else |
278 | 0 | return _mm_or_si128( |
279 | 0 | _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R)); |
280 | 0 | #endif |
281 | 0 | } |
282 | | |
283 | | template <unsigned int R> |
284 | | inline __m128i RotateRight64(const __m128i& val) |
285 | 64 | { |
286 | | #if defined(__XOP__) |
287 | | return _mm_roti_epi64(val, 64-R); |
288 | | #else |
289 | 64 | return _mm_or_si128( |
290 | 64 | _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R)); |
291 | 64 | #endif |
292 | 64 | } |
293 | | |
294 | | // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. |
295 | | template <> |
296 | | __m128i RotateLeft64<8>(const __m128i& val) |
297 | 64 | { |
298 | | #if defined(__XOP__) |
299 | | return _mm_roti_epi64(val, 8); |
300 | | #else |
301 | 64 | const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); |
302 | 64 | return _mm_shuffle_epi8(val, mask); |
303 | 64 | #endif |
304 | 64 | } |
305 | | |
306 | | // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. |
307 | | template <> |
308 | | __m128i RotateRight64<8>(const __m128i& val) |
309 | 0 | { |
310 | | #if defined(__XOP__) |
311 | | return _mm_roti_epi64(val, 64-8); |
312 | | #else |
313 | 0 | const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); |
314 | 0 | return _mm_shuffle_epi8(val, mask); |
315 | 0 | #endif |
316 | 0 | } |
317 | | |
318 | | inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1, |
319 | | const word64 *subkeys, unsigned int rounds) |
320 | 0 | { |
321 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
322 | 0 | __m128i x1 = _mm_unpackhi_epi64(block0, block1); |
323 | 0 | __m128i y1 = _mm_unpacklo_epi64(block0, block1); |
324 | |
325 | 0 | for (size_t i=0; i < static_cast<size_t>(rounds); ++i) |
326 | 0 | { |
327 | | // Round keys are pre-splated in forward direction |
328 | 0 | const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*2)); |
329 | |
330 | 0 | x1 = RotateRight64<8>(x1); |
331 | 0 | x1 = _mm_add_epi64(x1, y1); |
332 | 0 | x1 = _mm_xor_si128(x1, rk); |
333 | 0 | y1 = RotateLeft64<3>(y1); |
334 | 0 | y1 = _mm_xor_si128(y1, x1); |
335 | 0 | } |
336 | | |
337 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
338 | 0 | block0 = _mm_unpacklo_epi64(y1, x1); |
339 | 0 | block1 = _mm_unpackhi_epi64(y1, x1); |
340 | 0 | } |
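
"Pre-splated in forward direction" means the encryption key schedule is assumed to store each 64-bit round key twice in adjacent, 16-byte aligned slots, so the aligned load above yields the key in both lanes. A hedged sketch of that assumed layout (the real schedule is built in speck.cpp, not here):

    inline void SplatForwardKeys_Sketch(const word64* key, word64* subkeys,
        unsigned int rounds)
    {
        // After this, subkeys[2*i] == subkeys[2*i+1] == key[i], so
        // _mm_load_si128(CONST_M128_CAST(subkeys+i*2)) reads [k_i, k_i].
        for (unsigned int i = 0; i < rounds; ++i)
            subkeys[2*i] = subkeys[2*i+1] = key[i];
    }
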
341 | | |
342 | | inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1, |
343 | | __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, |
344 | | const word64 *subkeys, unsigned int rounds) |
345 | 0 | { |
346 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
347 | 0 | __m128i x1 = _mm_unpackhi_epi64(block0, block1); |
348 | 0 | __m128i y1 = _mm_unpacklo_epi64(block0, block1); |
349 | 0 | __m128i x2 = _mm_unpackhi_epi64(block2, block3); |
350 | 0 | __m128i y2 = _mm_unpacklo_epi64(block2, block3); |
351 | 0 | __m128i x3 = _mm_unpackhi_epi64(block4, block5); |
352 | 0 | __m128i y3 = _mm_unpacklo_epi64(block4, block5); |
353 | |
354 | 0 | for (size_t i=0; i < static_cast<size_t>(rounds); ++i) |
355 | 0 | { |
356 | | // Round keys are pre-splated in forward direction |
357 | 0 | const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*2)); |
358 | |
359 | 0 | x1 = RotateRight64<8>(x1); |
360 | 0 | x2 = RotateRight64<8>(x2); |
361 | 0 | x3 = RotateRight64<8>(x3); |
362 | 0 | x1 = _mm_add_epi64(x1, y1); |
363 | 0 | x2 = _mm_add_epi64(x2, y2); |
364 | 0 | x3 = _mm_add_epi64(x3, y3); |
365 | 0 | x1 = _mm_xor_si128(x1, rk); |
366 | 0 | x2 = _mm_xor_si128(x2, rk); |
367 | 0 | x3 = _mm_xor_si128(x3, rk); |
368 | 0 | y1 = RotateLeft64<3>(y1); |
369 | 0 | y2 = RotateLeft64<3>(y2); |
370 | 0 | y3 = RotateLeft64<3>(y3); |
371 | 0 | y1 = _mm_xor_si128(y1, x1); |
372 | 0 | y2 = _mm_xor_si128(y2, x2); |
373 | 0 | y3 = _mm_xor_si128(y3, x3); |
374 | 0 | } |
375 | | |
376 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
377 | 0 | block0 = _mm_unpacklo_epi64(y1, x1); |
378 | 0 | block1 = _mm_unpackhi_epi64(y1, x1); |
379 | 0 | block2 = _mm_unpacklo_epi64(y2, x2); |
380 | 0 | block3 = _mm_unpackhi_epi64(y2, x2); |
381 | 0 | block4 = _mm_unpacklo_epi64(y3, x3); |
382 | 0 | block5 = _mm_unpackhi_epi64(y3, x3); |
383 | 0 | } |
384 | | |
385 | | inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1, |
386 | | const word64 *subkeys, unsigned int rounds) |
387 | 2 | { |
388 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
389 | 2 | __m128i x1 = _mm_unpackhi_epi64(block0, block1); |
390 | 2 | __m128i y1 = _mm_unpacklo_epi64(block0, block1); |
391 | | |
392 | 66 | for (int i = static_cast<int>(rounds-1); i >= 0; --i) |
393 | 64 | { |
394 | 64 | const __m128i rk = _mm_castpd_si128( |
395 | 64 | _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); |
396 | | |
397 | 64 | y1 = _mm_xor_si128(y1, x1); |
398 | 64 | y1 = RotateRight64<3>(y1); |
399 | 64 | x1 = _mm_xor_si128(x1, rk); |
400 | 64 | x1 = _mm_sub_epi64(x1, y1); |
401 | 64 | x1 = RotateLeft64<8>(x1); |
402 | 64 | } |
403 | | |
404 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
405 | 2 | block0 = _mm_unpacklo_epi64(y1, x1); |
406 | 2 | block1 = _mm_unpackhi_epi64(y1, x1); |
407 | 2 | } |
408 | | |
409 | | inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1, |
410 | | __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, |
411 | | const word64 *subkeys, unsigned int rounds) |
412 | 0 | { |
413 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
414 | 0 | __m128i x1 = _mm_unpackhi_epi64(block0, block1); |
415 | 0 | __m128i y1 = _mm_unpacklo_epi64(block0, block1); |
416 | 0 | __m128i x2 = _mm_unpackhi_epi64(block2, block3); |
417 | 0 | __m128i y2 = _mm_unpacklo_epi64(block2, block3); |
418 | 0 | __m128i x3 = _mm_unpackhi_epi64(block4, block5); |
419 | 0 | __m128i y3 = _mm_unpacklo_epi64(block4, block5); |
420 | |
421 | 0 | for (int i = static_cast<int>(rounds-1); i >= 0; --i) |
422 | 0 | { |
423 | 0 | const __m128i rk = _mm_castpd_si128( |
424 | 0 | _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); |
425 | |
426 | 0 | y1 = _mm_xor_si128(y1, x1); |
427 | 0 | y2 = _mm_xor_si128(y2, x2); |
428 | 0 | y3 = _mm_xor_si128(y3, x3); |
429 | 0 | y1 = RotateRight64<3>(y1); |
430 | 0 | y2 = RotateRight64<3>(y2); |
431 | 0 | y3 = RotateRight64<3>(y3); |
432 | 0 | x1 = _mm_xor_si128(x1, rk); |
433 | 0 | x2 = _mm_xor_si128(x2, rk); |
434 | 0 | x3 = _mm_xor_si128(x3, rk); |
435 | 0 | x1 = _mm_sub_epi64(x1, y1); |
436 | 0 | x2 = _mm_sub_epi64(x2, y2); |
437 | 0 | x3 = _mm_sub_epi64(x3, y3); |
438 | 0 | x1 = RotateLeft64<8>(x1); |
439 | 0 | x2 = RotateLeft64<8>(x2); |
440 | 0 | x3 = RotateLeft64<8>(x3); |
441 | 0 | } |
442 | | |
443 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
444 | 0 | block0 = _mm_unpacklo_epi64(y1, x1); |
445 | 0 | block1 = _mm_unpackhi_epi64(y1, x1); |
446 | 0 | block2 = _mm_unpacklo_epi64(y2, x2); |
447 | 0 | block3 = _mm_unpackhi_epi64(y2, x2); |
448 | 0 | block4 = _mm_unpacklo_epi64(y3, x3); |
449 | 0 | block5 = _mm_unpackhi_epi64(y3, x3); |
450 | 0 | } |
451 | | |
452 | | #endif // CRYPTOPP_SSSE3_AVAILABLE |
453 | | |
454 | | // ***************************** Altivec ***************************** // |
455 | | |
456 | | #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) |
457 | | |
458 | | // Altivec uses native 64-bit types on 64-bit environments, or 32-bit types |
459 | | // in 32-bit environments. Speck128 will use the appropriate type for the |
460 | | // environment. Functions like VecAdd64 have two overloads, one for each |
461 | | // environment. The 32-bit overload treats uint32x4_p like a 64-bit type, |
462 | | // and does things like perform an add with carry or a subtract with borrow. |
463 | | |
464 | | // Speck128 on Power8 performed as expected because of the 64-bit environment. |
465 | | // Performance suffered on old PowerPC machines because of the 32-bit environment. |
466 | | // At Crypto++ 8.3 we added an implementation that operated on 32-bit words. |
467 | | // Native 64-bit Speck128 performance dropped from about 4.1 to 6.3 cpb, but |
468 | | // 32-bit Speck128 improved from 66.5 cpb to 10.4 cpb. Overall it was a |
469 | | // good win even though we lost some performance in 64-bit environments. |
470 | | |
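
The "add with carry" remark above refers to emulating a 64-bit lane addition with 32-bit Altivec lanes on pre-POWER8 hardware: add the low halves, detect the wrap, and fold the carry into the high halves. A scalar sketch of the per-lane arithmetic the 32-bit VecAdd64 overload must reproduce (hypothetical helper, not the library's code):

    inline void Add64Via32_Sketch(word32& hi, word32& lo, word32 hi2, word32 lo2)
    {
        const word32 sum = lo + lo2;              // low halves, wraps mod 2^32
        const word32 carry = (sum < lo) ? 1 : 0;  // 1 if the low half wrapped
        lo = sum;
        hi = hi + hi2 + carry;                    // propagate into the high half
    }
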
471 | | using CryptoPP::uint8x16_p; |
472 | | using CryptoPP::uint32x4_p; |
473 | | #if defined(_ARCH_PWR8) |
474 | | using CryptoPP::uint64x2_p; |
475 | | #endif |
476 | | |
477 | | using CryptoPP::VecAdd64; |
478 | | using CryptoPP::VecSub64; |
479 | | using CryptoPP::VecAnd64; |
480 | | using CryptoPP::VecOr64; |
481 | | using CryptoPP::VecXor64; |
482 | | using CryptoPP::VecSplatWord64; |
483 | | using CryptoPP::VecRotateLeft64; |
484 | | using CryptoPP::VecRotateRight64; |
485 | | using CryptoPP::VecLoad; |
486 | | using CryptoPP::VecLoadAligned; |
487 | | using CryptoPP::VecPermute; |
488 | | |
489 | | #if defined(_ARCH_PWR8) |
490 | | #define speck128_t uint64x2_p |
491 | | #else |
492 | | #define speck128_t uint32x4_p |
493 | | #endif |
494 | | |
495 | | void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds) |
496 | | { |
497 | | #if (CRYPTOPP_BIG_ENDIAN) |
498 | | const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
499 | | const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
500 | | #else |
501 | | const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
502 | | const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
503 | | #endif |
504 | | |
505 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
506 | | speck128_t x1 = (speck128_t)VecPermute(block, block, m1); |
507 | | speck128_t y1 = (speck128_t)VecPermute(block, block, m2); |
508 | | |
509 | | for (size_t i=0; i < static_cast<size_t>(rounds); ++i) |
510 | | { |
511 | | // Round keys are pre-splated in forward direction |
512 | | const word32* ptr = reinterpret_cast<const word32*>(subkeys+i*2); |
513 | | const speck128_t rk = (speck128_t)VecLoadAligned(ptr); |
514 | | |
515 | | x1 = (speck128_t)VecRotateRight64<8>(x1); |
516 | | x1 = (speck128_t)VecAdd64(x1, y1); |
517 | | x1 = (speck128_t)VecXor64(x1, rk); |
518 | | |
519 | | y1 = (speck128_t)VecRotateLeft64<3>(y1); |
520 | | y1 = (speck128_t)VecXor64(y1, x1); |
521 | | } |
522 | | |
523 | | #if (CRYPTOPP_BIG_ENDIAN) |
524 | | const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
525 | | //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
526 | | #else |
527 | | const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
528 | | //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
529 | | #endif |
530 | | |
531 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
532 | | block = (uint32x4_p)VecPermute(x1, y1, m3); |
533 | | } |
534 | | |
535 | | void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds) |
536 | | { |
537 | | #if (CRYPTOPP_BIG_ENDIAN) |
538 | | const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
539 | | const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
540 | | #else |
541 | | const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
542 | | const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
543 | | #endif |
544 | | |
545 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
546 | | speck128_t x1 = (speck128_t)VecPermute(block, block, m1); |
547 | | speck128_t y1 = (speck128_t)VecPermute(block, block, m2); |
548 | | |
549 | | for (int i = static_cast<int>(rounds-1); i >= 0; --i) |
550 | | { |
551 | | const speck128_t rk = (speck128_t)VecSplatWord64(subkeys[i]); |
552 | | |
553 | | y1 = (speck128_t)VecXor64(y1, x1); |
554 | | y1 = (speck128_t)VecRotateRight64<3>(y1); |
555 | | x1 = (speck128_t)VecXor64(x1, rk); |
556 | | x1 = (speck128_t)VecSub64(x1, y1); |
557 | | x1 = (speck128_t)VecRotateLeft64<8>(x1); |
558 | | } |
559 | | |
560 | | #if (CRYPTOPP_BIG_ENDIAN) |
561 | | const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
562 | | //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
563 | | #else |
564 | | const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
565 | | //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
566 | | #endif |
567 | | |
568 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
569 | | block = (uint32x4_p)VecPermute(x1, y1, m3); |
570 | | } |
571 | | |
572 | | void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, |
573 | | uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, |
574 | | uint32x4_p &block5, const word64 *subkeys, unsigned int rounds) |
575 | | { |
576 | | #if (CRYPTOPP_BIG_ENDIAN) |
577 | | const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
578 | | const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
579 | | #else |
580 | | const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
581 | | const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
582 | | #endif |
583 | | |
584 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
585 | | speck128_t x1 = (speck128_t)VecPermute(block0, block1, m1); |
586 | | speck128_t y1 = (speck128_t)VecPermute(block0, block1, m2); |
587 | | speck128_t x2 = (speck128_t)VecPermute(block2, block3, m1); |
588 | | speck128_t y2 = (speck128_t)VecPermute(block2, block3, m2); |
589 | | speck128_t x3 = (speck128_t)VecPermute(block4, block5, m1); |
590 | | speck128_t y3 = (speck128_t)VecPermute(block4, block5, m2); |
591 | | |
592 | | for (size_t i=0; i < static_cast<size_t>(rounds); ++i) |
593 | | { |
594 | | // Round keys are pre-splated in forward direction |
595 | | const word32* ptr = reinterpret_cast<const word32*>(subkeys+i*2); |
596 | | const speck128_t rk = (speck128_t)VecLoadAligned(ptr); |
597 | | |
598 | | x1 = (speck128_t)VecRotateRight64<8>(x1); |
599 | | x2 = (speck128_t)VecRotateRight64<8>(x2); |
600 | | x3 = (speck128_t)VecRotateRight64<8>(x3); |
601 | | x1 = (speck128_t)VecAdd64(x1, y1); |
602 | | x2 = (speck128_t)VecAdd64(x2, y2); |
603 | | x3 = (speck128_t)VecAdd64(x3, y3); |
604 | | x1 = (speck128_t)VecXor64(x1, rk); |
605 | | x2 = (speck128_t)VecXor64(x2, rk); |
606 | | x3 = (speck128_t)VecXor64(x3, rk); |
607 | | |
608 | | y1 = (speck128_t)VecRotateLeft64<3>(y1); |
609 | | y2 = (speck128_t)VecRotateLeft64<3>(y2); |
610 | | y3 = (speck128_t)VecRotateLeft64<3>(y3); |
611 | | y1 = (speck128_t)VecXor64(y1, x1); |
612 | | y2 = (speck128_t)VecXor64(y2, x2); |
613 | | y3 = (speck128_t)VecXor64(y3, x3); |
614 | | } |
615 | | |
616 | | #if (CRYPTOPP_BIG_ENDIAN) |
617 | | const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
618 | | const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
619 | | #else |
620 | | const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
621 | | const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
622 | | #endif |
623 | | |
624 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
625 | | block0 = (uint32x4_p)VecPermute(x1, y1, m3); |
626 | | block1 = (uint32x4_p)VecPermute(x1, y1, m4); |
627 | | block2 = (uint32x4_p)VecPermute(x2, y2, m3); |
628 | | block3 = (uint32x4_p)VecPermute(x2, y2, m4); |
629 | | block4 = (uint32x4_p)VecPermute(x3, y3, m3); |
630 | | block5 = (uint32x4_p)VecPermute(x3, y3, m4); |
631 | | } |
632 | | |
633 | | void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, |
634 | | uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, |
635 | | uint32x4_p &block5, const word64 *subkeys, unsigned int rounds) |
636 | | { |
637 | | #if (CRYPTOPP_BIG_ENDIAN) |
638 | | const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
639 | | const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
640 | | #else |
641 | | const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
642 | | const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
643 | | #endif |
644 | | |
645 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
646 | | speck128_t x1 = (speck128_t)VecPermute(block0, block1, m1); |
647 | | speck128_t y1 = (speck128_t)VecPermute(block0, block1, m2); |
648 | | speck128_t x2 = (speck128_t)VecPermute(block2, block3, m1); |
649 | | speck128_t y2 = (speck128_t)VecPermute(block2, block3, m2); |
650 | | speck128_t x3 = (speck128_t)VecPermute(block4, block5, m1); |
651 | | speck128_t y3 = (speck128_t)VecPermute(block4, block5, m2); |
652 | | |
653 | | for (int i = static_cast<int>(rounds-1); i >= 0; --i) |
654 | | { |
655 | | const speck128_t rk = (speck128_t)VecSplatWord64(subkeys[i]); |
656 | | |
657 | | y1 = (speck128_t)VecXor64(y1, x1); |
658 | | y2 = (speck128_t)VecXor64(y2, x2); |
659 | | y3 = (speck128_t)VecXor64(y3, x3); |
660 | | y1 = (speck128_t)VecRotateRight64<3>(y1); |
661 | | y2 = (speck128_t)VecRotateRight64<3>(y2); |
662 | | y3 = (speck128_t)VecRotateRight64<3>(y3); |
663 | | |
664 | | x1 = (speck128_t)VecXor64(x1, rk); |
665 | | x2 = (speck128_t)VecXor64(x2, rk); |
666 | | x3 = (speck128_t)VecXor64(x3, rk); |
667 | | x1 = (speck128_t)VecSub64(x1, y1); |
668 | | x2 = (speck128_t)VecSub64(x2, y2); |
669 | | x3 = (speck128_t)VecSub64(x3, y3); |
670 | | x1 = (speck128_t)VecRotateLeft64<8>(x1); |
671 | | x2 = (speck128_t)VecRotateLeft64<8>(x2); |
672 | | x3 = (speck128_t)VecRotateLeft64<8>(x3); |
673 | | } |
674 | | |
675 | | #if (CRYPTOPP_BIG_ENDIAN) |
676 | | const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
677 | | const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
678 | | #else |
679 | | const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
680 | | const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
681 | | #endif |
682 | | |
683 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
684 | | block0 = (uint32x4_p)VecPermute(x1, y1, m3); |
685 | | block1 = (uint32x4_p)VecPermute(x1, y1, m4); |
686 | | block2 = (uint32x4_p)VecPermute(x2, y2, m3); |
687 | | block3 = (uint32x4_p)VecPermute(x2, y2, m4); |
688 | | block4 = (uint32x4_p)VecPermute(x3, y3, m3); |
689 | | block5 = (uint32x4_p)VecPermute(x3, y3, m4); |
690 | | } |
691 | | |
692 | | #endif // CRYPTOPP_ALTIVEC_AVAILABLE |
693 | | |
694 | | ANONYMOUS_NAMESPACE_END |
695 | | |
696 | | /////////////////////////////////////////////////////////////////////// |
697 | | |
698 | | NAMESPACE_BEGIN(CryptoPP) |
699 | | |
700 | | // *************************** ARM NEON **************************** // |
701 | | |
702 | | #if (CRYPTOPP_ARM_NEON_AVAILABLE) |
703 | | size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, |
704 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
705 | | { |
706 | | return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks, |
707 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
708 | | } |
709 | | |
710 | | size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, |
711 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
712 | | { |
713 | | return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks, |
714 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
715 | | } |
716 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
717 | | |
718 | | // ***************************** IA-32 ***************************** // |
719 | | |
720 | | #if (CRYPTOPP_SSSE3_AVAILABLE) |
721 | | size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, |
722 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
723 | 0 | { |
724 | 0 | return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks, |
725 | 0 | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
726 | 0 | } |
727 | | |
728 | | size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, |
729 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
730 | 2 | { |
731 | 2 | return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks, |
732 | 2 | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
733 | 2 | } |
734 | | #endif // CRYPTOPP_SSSE3_AVAILABLE |
735 | | |
736 | | // ***************************** Altivec ***************************** // |
737 | | |
738 | | #if (CRYPTOPP_ALTIVEC_AVAILABLE) |
739 | | size_t SPECK128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds, |
740 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
741 | | { |
742 | | return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks, |
743 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
744 | | } |
745 | | |
746 | | size_t SPECK128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds, |
747 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
748 | | { |
749 | | return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks, |
750 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
751 | | } |
752 | | #endif // CRYPTOPP_ALTIVEC_AVAILABLE |
753 | | |
754 | | NAMESPACE_END |