/src/cryptopp/simon128_simd.cpp
Line | Count | Source |
1 | | // simon128_simd.cpp - written and placed in the public domain by Jeffrey Walton |
2 | | // |
3 | | // This source file uses intrinsics and built-ins to gain access to |
4 | | // SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate |
5 | | // source file is needed because additional CXXFLAGS are required to enable |
6 | | // the appropriate instruction sets in some build configurations. |
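 | | // |
 | | // For example (the exact options depend on the makefile and toolchain), a |
 | | // GCC-style build might compile just this translation unit with something like |
 | | //   x86/x64:  -mssse3 |
 | | //   PowerPC:  -maltivec or -mcpu=power8 |
 | | //   ARM A-32: -march=armv7-a -mfpu=neon |
 | | // while the rest of the library keeps the baseline CXXFLAGS. |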
7 | | |
8 | | #include "pch.h" |
9 | | #include "config.h" |
10 | | |
11 | | #include "simon.h" |
12 | | #include "misc.h" |
13 | | |
14 | | // Uncomment for benchmarking C++ against SSE or NEON. |
15 | | // Do so in both simon.cpp and simon128_simd.cpp. |
16 | | // #undef CRYPTOPP_SSSE3_AVAILABLE |
17 | | // #undef CRYPTOPP_ARM_NEON_AVAILABLE |
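 | | // After toggling the #undef lines, rebuild and rerun the benchmark suite |
 | | // (typically "./cryptest.exe b") to compare the C++ and SIMD code paths. |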
18 | | |
19 | | #if (CRYPTOPP_SSSE3_AVAILABLE) |
20 | | # include "adv_simd.h" |
21 | | # include <pmmintrin.h> |
22 | | # include <tmmintrin.h> |
23 | | #endif |
24 | | |
25 | | #if defined(__XOP__) |
26 | | # if defined(CRYPTOPP_GCC_COMPATIBLE) |
27 | | # include <x86intrin.h> |
28 | | # endif |
29 | | # include <ammintrin.h> |
30 | | #endif // XOP |
31 | | |
32 | | #if (CRYPTOPP_ARM_NEON_HEADER) |
33 | | # include "adv_simd.h" |
34 | | # include <arm_neon.h> |
35 | | #endif |
36 | | |
37 | | #if (CRYPTOPP_ARM_ACLE_HEADER) |
38 | | # include <stdint.h> |
39 | | # include <arm_acle.h> |
40 | | #endif |
41 | | |
42 | | #if defined(_M_ARM64) |
43 | | # include "adv_simd.h" |
44 | | #endif |
45 | | |
46 | | #if (CRYPTOPP_ALTIVEC_AVAILABLE) |
47 | | # include "adv_simd.h" |
48 | | # include "ppc_simd.h" |
49 | | #endif |
50 | | |
51 | | // Squash MS LNK4221 and libtool warnings |
52 | | extern const char SIMON128_SIMD_FNAME[] = __FILE__; |
53 | | |
54 | | ANONYMOUS_NAMESPACE_BEGIN |
55 | | |
56 | | using CryptoPP::byte; |
57 | | using CryptoPP::word32; |
58 | | using CryptoPP::word64; |
59 | | using CryptoPP::vec_swap; // SunCC |
60 | | |
61 | | // *************************** ARM NEON ************************** // |
62 | | |
63 | | #if (CRYPTOPP_ARM_NEON_AVAILABLE) |
64 | | |
65 | | // Missing from Microsoft's ARM A-32 implementation |
66 | | #if defined(CRYPTOPP_MSC_VERSION) && !defined(_M_ARM64) |
67 | | inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr) |
68 | | { |
69 | | return vmovq_n_u64(*ptr); |
70 | | } |
71 | | #endif |
72 | | |
73 | | template <class T> |
74 | | inline T UnpackHigh64(const T& a, const T& b) |
75 | | { |
76 | | const uint64x1_t x(vget_high_u64((uint64x2_t)a)); |
77 | | const uint64x1_t y(vget_high_u64((uint64x2_t)b)); |
78 | | return (T)vcombine_u64(x, y); |
79 | | } |
80 | | |
81 | | template <class T> |
82 | | inline T UnpackLow64(const T& a, const T& b) |
83 | | { |
84 | | const uint64x1_t x(vget_low_u64((uint64x2_t)a)); |
85 | | const uint64x1_t y(vget_low_u64((uint64x2_t)b)); |
86 | | return (T)vcombine_u64(x, y); |
87 | | } |
88 | | |
89 | | template <unsigned int R> |
90 | | inline uint64x2_t RotateLeft64(const uint64x2_t& val) |
91 | | { |
92 | | const uint64x2_t a(vshlq_n_u64(val, R)); |
93 | | const uint64x2_t b(vshrq_n_u64(val, 64 - R)); |
94 | | return vorrq_u64(a, b); |
95 | | } |
96 | | |
97 | | template <unsigned int R> |
98 | | inline uint64x2_t RotateRight64(const uint64x2_t& val) |
99 | | { |
100 | | const uint64x2_t a(vshlq_n_u64(val, 64 - R)); |
101 | | const uint64x2_t b(vshrq_n_u64(val, R)); |
102 | | return vorrq_u64(a, b); |
103 | | } |
104 | | |
105 | | #if defined(__aarch32__) || defined(__aarch64__) |
106 | | // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. |
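 | | // Rotating a 64-bit lane by 8 bits is exactly a one-byte rotation of that lane, |
 | | // so a single TBL lookup does the job; e.g. maskb[0] == 7 moves the most |
 | | // significant byte of the low lane into the least significant position. |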
107 | | template <> |
108 | | inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val) |
109 | | { |
110 | | const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 }; |
111 | | const uint8x16_t mask = vld1q_u8(maskb); |
112 | | |
113 | | return vreinterpretq_u64_u8( |
114 | | vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); |
115 | | } |
116 | | |
117 | | // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. |
118 | | template <> |
119 | | inline uint64x2_t RotateRight64<8>(const uint64x2_t& val) |
120 | | { |
121 | | const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 }; |
122 | | const uint8x16_t mask = vld1q_u8(maskb); |
123 | | |
124 | | return vreinterpretq_u64_u8( |
125 | | vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); |
126 | | } |
127 | | #endif |
128 | | |
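 | | // For reference, the SIMON round written as scalar pseudocode (the vector code |
 | | // below computes the same thing on two blocks at a time): |
 | | //   f(x)      = (rotl64(x,1) & rotl64(x,8)) ^ rotl64(x,2) |
 | | //   R_k(x, y) = (y ^ f(x) ^ k, x) |
 | | // Because the update is a pure XOR, each round is its own inverse. |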
129 | | inline uint64x2_t SIMON128_f(const uint64x2_t& val) |
130 | | { |
131 | | return veorq_u64(RotateLeft64<2>(val), |
132 | | vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val))); |
133 | | } |
134 | | |
135 | | inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1, |
136 | | const word64 *subkeys, unsigned int rounds) |
137 | | { |
138 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
139 | | uint64x2_t x1 = UnpackHigh64(block0, block1); |
140 | | uint64x2_t y1 = UnpackLow64(block0, block1); |
141 | | |
142 | | for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2) |
143 | | { |
144 | | const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i); |
145 | | y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1); |
146 | | |
147 | | const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1); |
148 | | x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2); |
149 | | } |
150 | | |
151 | | if (rounds & 1) |
152 | | { |
153 | | const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1); |
154 | | |
155 | | y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk); |
156 | | std::swap(x1, y1); |
157 | | } |
158 | | |
159 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
160 | | block0 = UnpackLow64(y1, x1); |
161 | | block1 = UnpackHigh64(y1, x1); |
162 | | } |
163 | | |
164 | | inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, |
165 | | uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, |
166 | | const word64 *subkeys, unsigned int rounds) |
167 | | { |
168 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
169 | | uint64x2_t x1 = UnpackHigh64(block0, block1); |
170 | | uint64x2_t y1 = UnpackLow64(block0, block1); |
171 | | uint64x2_t x2 = UnpackHigh64(block2, block3); |
172 | | uint64x2_t y2 = UnpackLow64(block2, block3); |
173 | | uint64x2_t x3 = UnpackHigh64(block4, block5); |
174 | | uint64x2_t y3 = UnpackLow64(block4, block5); |
175 | | |
176 | | for (size_t i = 0; i < static_cast<size_t>(rounds & ~1) - 1; i += 2) |
177 | | { |
178 | | const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i); |
179 | | y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1); |
180 | | y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1); |
181 | | y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1); |
182 | | |
183 | | const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1); |
184 | | x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2); |
185 | | x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2); |
186 | | x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2); |
187 | | } |
188 | | |
189 | | if (rounds & 1) |
190 | | { |
191 | | const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1); |
192 | | |
193 | | y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk); |
194 | | y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk); |
195 | | y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk); |
196 | | std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); |
197 | | } |
198 | | |
199 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
200 | | block0 = UnpackLow64(y1, x1); |
201 | | block1 = UnpackHigh64(y1, x1); |
202 | | block2 = UnpackLow64(y2, x2); |
203 | | block3 = UnpackHigh64(y2, x2); |
204 | | block4 = UnpackLow64(y3, x3); |
205 | | block5 = UnpackHigh64(y3, x3); |
206 | | } |
207 | | |
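 | | // Decryption replays the same XOR-based rounds with the keys in reverse order; |
 | | // an odd round count is handled up front by swapping the halves and undoing the |
 | | // final round before entering the main loop. |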
208 | | inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1, |
209 | | const word64 *subkeys, unsigned int rounds) |
210 | | { |
211 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
212 | | uint64x2_t x1 = UnpackHigh64(block0, block1); |
213 | | uint64x2_t y1 = UnpackLow64(block0, block1); |
214 | | |
215 | | if (rounds & 1) |
216 | | { |
217 | | std::swap(x1, y1); |
218 | | const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1); |
219 | | |
220 | | y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1)); |
221 | | rounds--; |
222 | | } |
223 | | |
224 | | for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2) |
225 | | { |
226 | | const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1); |
227 | | x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1); |
228 | | |
229 | | const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i); |
230 | | y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2); |
231 | | } |
232 | | |
233 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
234 | | block0 = UnpackLow64(y1, x1); |
235 | | block1 = UnpackHigh64(y1, x1); |
236 | | } |
237 | | |
238 | | inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, |
239 | | uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, |
240 | | const word64 *subkeys, unsigned int rounds) |
241 | | { |
242 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
243 | | uint64x2_t x1 = UnpackHigh64(block0, block1); |
244 | | uint64x2_t y1 = UnpackLow64(block0, block1); |
245 | | uint64x2_t x2 = UnpackHigh64(block2, block3); |
246 | | uint64x2_t y2 = UnpackLow64(block2, block3); |
247 | | uint64x2_t x3 = UnpackHigh64(block4, block5); |
248 | | uint64x2_t y3 = UnpackLow64(block4, block5); |
249 | | |
250 | | if (rounds & 1) |
251 | | { |
252 | | std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); |
253 | | const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1); |
254 | | |
255 | | y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1)); |
256 | | y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2)); |
257 | | y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3)); |
258 | | rounds--; |
259 | | } |
260 | | |
261 | | for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2) |
262 | | { |
263 | | const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1); |
264 | | x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1); |
265 | | x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1); |
266 | | x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1); |
267 | | |
268 | | const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i); |
269 | | y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2); |
270 | | y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2); |
271 | | y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2); |
272 | | } |
273 | | |
274 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
275 | | block0 = UnpackLow64(y1, x1); |
276 | | block1 = UnpackHigh64(y1, x1); |
277 | | block2 = UnpackLow64(y2, x2); |
278 | | block3 = UnpackHigh64(y2, x2); |
279 | | block4 = UnpackLow64(y3, x3); |
280 | | block5 = UnpackHigh64(y3, x3); |
281 | | } |
282 | | |
283 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
284 | | |
285 | | // ***************************** IA-32 ***************************** // |
286 | | |
287 | | #if (CRYPTOPP_SSSE3_AVAILABLE) |
288 | | |
289 | | // GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html |
290 | | #ifndef DOUBLE_CAST |
291 | | # define DOUBLE_CAST(x) ((double *)(void *)(x)) |
292 | | #endif |
293 | | #ifndef CONST_DOUBLE_CAST |
294 | | # define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x)) |
295 | | #endif |
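 | | // The intermediate void* keeps GCC from warning about alignment/type-punning |
 | | // when the word64 subkey table is read as double for _mm_loaddup_pd below; see |
 | | // the thread linked above. |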
296 | | |
297 | | inline void Swap128(__m128i& a, __m128i& b) |
298 | 32 | { |
299 | | #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) |
300 | | // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11. |
301 | | // SunCC 12.1 - 12.3 fail to consume the swap, while SunCC 12.4 consumes it without -std=c++11. |
302 | | vec_swap(a, b); |
303 | | #else |
304 | 32 | std::swap(a, b); |
305 | 32 | #endif |
306 | 32 | } |
307 | | |
308 | | template <unsigned int R> |
309 | | inline __m128i RotateLeft64(const __m128i& val) |
310 | 6.32k | { |
311 | | #if defined(__XOP__) |
312 | | return _mm_roti_epi64(val, R); |
313 | | #else |
314 | 6.32k | return _mm_or_si128( |
315 | 6.32k | _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R)); |
316 | 6.32k | #endif |
317 | 6.32k | } |
 | | [template instantiation coverage: RotateLeft64<2u> and RotateLeft64<1u>, 3.16k each] |
318 | | |
319 | | template <unsigned int R> |
320 | | inline __m128i RotateRight64(const __m128i& val) |
321 | | { |
322 | | #if defined(__XOP__) |
323 | | return _mm_roti_epi64(val, 64-R); |
324 | | #else |
325 | | return _mm_or_si128( |
326 | | _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R)); |
327 | | #endif |
328 | | } |
329 | | |
330 | | // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. |
331 | | template <> |
332 | | __m128i RotateLeft64<8>(const __m128i& val) |
333 | 3.16k | { |
334 | | #if defined(__XOP__) |
335 | | return _mm_roti_epi64(val, 8); |
336 | | #else |
337 | 3.16k | const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); |
338 | 3.16k | return _mm_shuffle_epi8(val, mask); |
339 | 3.16k | #endif |
340 | 3.16k | } |
341 | | |
342 | | // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. |
343 | | template <> |
344 | | __m128i RotateRight64<8>(const __m128i& val) |
345 | 0 | { |
346 | 0 | #if defined(__XOP__) |
347 | 0 | return _mm_roti_epi64(val, 64-8); |
348 | 0 | #else |
349 | 0 | const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); |
350 | 0 | return _mm_shuffle_epi8(val, mask); |
351 | 0 | #endif |
352 | 0 | } |
353 | | |
354 | | inline __m128i SIMON128_f(const __m128i& v) |
355 | 3.16k | { |
356 | 3.16k | return _mm_xor_si128(RotateLeft64<2>(v), |
357 | 3.16k | _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v))); |
358 | 3.16k | } |
359 | | |
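 | | // The encryption routines below assume a "pre-splatted" key table: round key i |
 | | // occupies both 64-bit halves of a 16-byte aligned slot, so the aligned load at |
 | | // subkeys+i*2 already yields the key broadcast to both lanes. The decryption |
 | | // routines read one word64 per key and broadcast it at load time with |
 | | // _mm_loaddup_pd instead. (The layout is inferred from the indexing here; the |
 | | // table itself is built by the key schedule outside this file.) |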
360 | | inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1, |
361 | | const word64 *subkeys, unsigned int rounds) |
362 | 8 | { |
363 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
364 | 8 | __m128i x1 = _mm_unpackhi_epi64(block0, block1); |
365 | 8 | __m128i y1 = _mm_unpacklo_epi64(block0, block1); |
366 | | |
367 | 280 | for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2) |
368 | 272 | { |
369 | | // Round keys are pre-splatted in forward direction |
370 | 272 | const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*2)); |
371 | 272 | y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1); |
372 | | |
373 | 272 | const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*2)); |
374 | 272 | x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2); |
375 | 272 | } |
376 | | |
377 | 8 | if (rounds & 1) |
378 | 1 | { |
379 | | // Round keys are pre-splatted in forward direction |
380 | 1 | const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*2)); |
381 | | |
382 | 1 | y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk); |
383 | 1 | Swap128(x1, y1); |
384 | 1 | } |
385 | | |
386 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
387 | 8 | block0 = _mm_unpacklo_epi64(y1, x1); |
388 | 8 | block1 = _mm_unpackhi_epi64(y1, x1); |
389 | 8 | } |
390 | | |
391 | | inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1, |
392 | | __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, |
393 | | const word64 *subkeys, unsigned int rounds) |
394 | 2 | { |
395 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
396 | 2 | __m128i x1 = _mm_unpackhi_epi64(block0, block1); |
397 | 2 | __m128i y1 = _mm_unpacklo_epi64(block0, block1); |
398 | 2 | __m128i x2 = _mm_unpackhi_epi64(block2, block3); |
399 | 2 | __m128i y2 = _mm_unpacklo_epi64(block2, block3); |
400 | 2 | __m128i x3 = _mm_unpackhi_epi64(block4, block5); |
401 | 2 | __m128i y3 = _mm_unpacklo_epi64(block4, block5); |
402 | | |
403 | 70 | for (size_t i = 0; i < static_cast<size_t>(rounds & ~1) - 1; i += 2) |
404 | 68 | { |
405 | | // Round keys are pre-splatted in forward direction |
406 | 68 | const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*2)); |
407 | 68 | y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1); |
408 | 68 | y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1); |
409 | 68 | y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1); |
410 | | |
411 | | // Round keys are pre-splatted in forward direction |
412 | 68 | const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*2)); |
413 | 68 | x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2); |
414 | 68 | x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2); |
415 | 68 | x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2); |
416 | 68 | } |
417 | | |
418 | 2 | if (rounds & 1) |
419 | 0 | { |
420 | | // Round keys are pre-splatted in forward direction |
421 | 0 | const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*2)); |
422 | 0 | y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk); |
423 | 0 | y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk); |
424 | 0 | y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk); |
425 | 0 | Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); |
426 | 0 | } |
427 | | |
428 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
429 | 2 | block0 = _mm_unpacklo_epi64(y1, x1); |
430 | 2 | block1 = _mm_unpackhi_epi64(y1, x1); |
431 | 2 | block2 = _mm_unpacklo_epi64(y2, x2); |
432 | 2 | block3 = _mm_unpackhi_epi64(y2, x2); |
433 | 2 | block4 = _mm_unpacklo_epi64(y3, x3); |
434 | 2 | block5 = _mm_unpackhi_epi64(y3, x3); |
435 | 2 | } |
436 | | |
437 | | inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1, |
438 | | const word64 *subkeys, unsigned int rounds) |
439 | 8 | { |
440 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
441 | 8 | __m128i x1 = _mm_unpackhi_epi64(block0, block1); |
442 | 8 | __m128i y1 = _mm_unpacklo_epi64(block0, block1); |
443 | | |
444 | 8 | if (rounds & 1) |
445 | 7 | { |
446 | 7 | const __m128i rk = _mm_castpd_si128( |
447 | 7 | _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1))); |
448 | | |
449 | 7 | Swap128(x1, y1); |
450 | 7 | y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1)); |
451 | 7 | rounds--; |
452 | 7 | } |
453 | | |
454 | 280 | for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2) |
455 | 272 | { |
456 | 272 | const __m128i rk1 = _mm_castpd_si128( |
457 | 272 | _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1))); |
458 | 272 | x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1); |
459 | | |
460 | 272 | const __m128i rk2 = _mm_castpd_si128( |
461 | 272 | _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); |
462 | 272 | y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2); |
463 | 272 | } |
464 | | |
465 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
466 | 8 | block0 = _mm_unpacklo_epi64(y1, x1); |
467 | 8 | block1 = _mm_unpackhi_epi64(y1, x1); |
468 | 8 | } |
469 | | |
470 | | inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1, |
471 | | __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, |
472 | | const word64 *subkeys, unsigned int rounds) |
473 | 8 | { |
474 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
475 | 8 | __m128i x1 = _mm_unpackhi_epi64(block0, block1); |
476 | 8 | __m128i y1 = _mm_unpacklo_epi64(block0, block1); |
477 | 8 | __m128i x2 = _mm_unpackhi_epi64(block2, block3); |
478 | 8 | __m128i y2 = _mm_unpacklo_epi64(block2, block3); |
479 | 8 | __m128i x3 = _mm_unpackhi_epi64(block4, block5); |
480 | 8 | __m128i y3 = _mm_unpacklo_epi64(block4, block5); |
481 | | |
482 | 8 | if (rounds & 1) |
483 | 8 | { |
484 | 8 | const __m128i rk = _mm_castpd_si128( |
485 | 8 | _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1))); |
486 | | |
487 | 8 | Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); |
488 | 8 | y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1)); |
489 | 8 | y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2)); |
490 | 8 | y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3)); |
491 | 8 | rounds--; |
492 | 8 | } |
493 | | |
494 | 280 | for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2) |
495 | 272 | { |
496 | 272 | const __m128i rk1 = _mm_castpd_si128( |
497 | 272 | _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1))); |
498 | 272 | x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1); |
499 | 272 | x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1); |
500 | 272 | x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1); |
501 | | |
502 | 272 | const __m128i rk2 = _mm_castpd_si128( |
503 | 272 | _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i))); |
504 | 272 | y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2); |
505 | 272 | y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2); |
506 | 272 | y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2); |
507 | 272 | } |
508 | | |
509 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
510 | 8 | block0 = _mm_unpacklo_epi64(y1, x1); |
511 | 8 | block1 = _mm_unpackhi_epi64(y1, x1); |
512 | 8 | block2 = _mm_unpacklo_epi64(y2, x2); |
513 | 8 | block3 = _mm_unpackhi_epi64(y2, x2); |
514 | 8 | block4 = _mm_unpacklo_epi64(y3, x3); |
515 | 8 | block5 = _mm_unpackhi_epi64(y3, x3); |
516 | 8 | } |
517 | | |
518 | | #endif // CRYPTOPP_SSSE3_AVAILABLE |
519 | | |
520 | | // ***************************** Altivec ***************************** // |
521 | | |
522 | | #if (CRYPTOPP_ALTIVEC_AVAILABLE) |
523 | | |
524 | | // Altivec uses native 64-bit types on 64-bit environments, or 32-bit types |
525 | | // in 32-bit environments. Simon128 will use the appropriate type for the |
526 | | // environment. Functions like VecAdd64 have two overloads, one for each |
527 | | // environment. The 32-bit overload treats uint32x4_p like a 64-bit type, |
528 | | // and does things like perform an add with carry or subtract with borrow. |
529 | | |
530 | | // Speck128 on Power8 performed as expected because of the 64-bit environment. |
531 | | // Performance sucked on old PowerPC machines because of 32-bit environments. |
532 | | // At Crypto++ 8.3 we added an implementation that operated on 32-bit words. |
533 | | // Native 64-bit Speck128 performance dropped from about 4.1 to 6.3 cpb, but |
534 | | // 32-bit Speck128 improved from 66.5 cpb to 10.4 cpb. Overall it was a |
535 | | // good win even though we lost some performance in 64-bit environments. |
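 | | // Simon needs only XOR, AND and 64-bit rotates (see SIMON128_f below), so on |
 | | // 32-bit targets the main cost is carrying rotate bits across the two 32-bit |
 | | // halves of each lane. The add/subtract helpers imported below belong to the |
 | | // common 64-bit helper set and appear unused in this file. |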
536 | | |
537 | | using CryptoPP::uint8x16_p; |
538 | | using CryptoPP::uint32x4_p; |
539 | | #if defined(_ARCH_PWR8) |
540 | | using CryptoPP::uint64x2_p; |
541 | | #endif |
542 | | |
543 | | using CryptoPP::VecAdd64; |
544 | | using CryptoPP::VecSub64; |
545 | | using CryptoPP::VecAnd64; |
546 | | using CryptoPP::VecOr64; |
547 | | using CryptoPP::VecXor64; |
548 | | using CryptoPP::VecRotateLeft64; |
549 | | using CryptoPP::VecRotateRight64; |
550 | | using CryptoPP::VecSplatElement64; |
551 | | using CryptoPP::VecLoad; |
552 | | using CryptoPP::VecLoadAligned; |
553 | | using CryptoPP::VecPermute; |
554 | | |
555 | | #if defined(_ARCH_PWR8) |
556 | | #define simon128_t uint64x2_p |
557 | | #else |
558 | | #define simon128_t uint32x4_p |
559 | | #endif |
560 | | |
561 | | inline simon128_t SIMON128_f(const simon128_t val) |
562 | | { |
563 | | return (simon128_t)VecXor64(VecRotateLeft64<2>(val), |
564 | | VecAnd64(VecRotateLeft64<1>(val), VecRotateLeft64<8>(val))); |
565 | | } |
566 | | |
567 | | inline void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds) |
568 | | { |
569 | | #if (CRYPTOPP_BIG_ENDIAN) |
570 | | const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
571 | | const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
572 | | #else |
573 | | const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
574 | | const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
575 | | #endif |
576 | | |
577 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
578 | | simon128_t x1 = (simon128_t)VecPermute(block, block, m1); |
579 | | simon128_t y1 = (simon128_t)VecPermute(block, block, m2); |
580 | | |
581 | | for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2) |
582 | | { |
583 | | // Round keys are pre-splatted in forward direction |
584 | | const word32* ptr1 = reinterpret_cast<const word32*>(subkeys+i*2); |
585 | | const simon128_t rk1 = (simon128_t)VecLoadAligned(ptr1); |
586 | | const word32* ptr2 = reinterpret_cast<const word32*>(subkeys+(i+1)*2); |
587 | | const simon128_t rk2 = (simon128_t)VecLoadAligned(ptr2); |
588 | | |
589 | | y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk1); |
590 | | x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk2); |
591 | | } |
592 | | |
593 | | if (rounds & 1) |
594 | | { |
595 | | // Round keys are pre-splatted in forward direction |
596 | | const word32* ptr = reinterpret_cast<const word32*>(subkeys+(rounds-1)*2); |
597 | | const simon128_t rk = (simon128_t)VecLoadAligned(ptr); |
598 | | |
599 | | y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk); |
600 | | |
601 | | std::swap(x1, y1); |
602 | | } |
603 | | |
604 | | #if (CRYPTOPP_BIG_ENDIAN) |
605 | | const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
606 | | //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
607 | | #else |
608 | | const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
609 | | //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
610 | | #endif |
611 | | |
612 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
613 | | block = (uint32x4_p)VecPermute(x1, y1, m3); |
614 | | } |
615 | | |
616 | | inline void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds) |
617 | | { |
618 | | #if (CRYPTOPP_BIG_ENDIAN) |
619 | | const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
620 | | const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
621 | | #else |
622 | | const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
623 | | const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
624 | | #endif |
625 | | |
626 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
627 | | simon128_t x1 = (simon128_t)VecPermute(block, block, m1); |
628 | | simon128_t y1 = (simon128_t)VecPermute(block, block, m2); |
629 | | |
630 | | if (rounds & 1) |
631 | | { |
632 | | std::swap(x1, y1); |
633 | | |
634 | | const word32* ptr = reinterpret_cast<const word32*>(subkeys+rounds-1); |
635 | | const simon128_t tk = (simon128_t)VecLoad(ptr); |
636 | | const simon128_t rk = (simon128_t)VecSplatElement64<0>(tk); |
637 | | |
638 | | y1 = VecXor64(VecXor64(y1, rk), SIMON128_f(x1)); |
639 | | rounds--; |
640 | | } |
641 | | |
642 | | for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2) |
643 | | { |
644 | | const word32* ptr = reinterpret_cast<const word32*>(subkeys+i); |
645 | | const simon128_t tk = (simon128_t)VecLoad(ptr); |
646 | | const simon128_t rk1 = (simon128_t)VecSplatElement64<1>(tk); |
647 | | const simon128_t rk2 = (simon128_t)VecSplatElement64<0>(tk); |
648 | | |
649 | | x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk1); |
650 | | y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk2); |
651 | | } |
652 | | |
653 | | #if (CRYPTOPP_BIG_ENDIAN) |
654 | | const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
655 | | //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
656 | | #else |
657 | | const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
658 | | //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
659 | | #endif |
660 | | |
661 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
662 | | block = (uint32x4_p)VecPermute(x1, y1, m3); |
663 | | } |
664 | | |
665 | | inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, |
666 | | uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, |
667 | | uint32x4_p &block5, const word64 *subkeys, unsigned int rounds) |
668 | | { |
669 | | #if (CRYPTOPP_BIG_ENDIAN) |
670 | | const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
671 | | const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
672 | | #else |
673 | | const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
674 | | const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
675 | | #endif |
676 | | |
677 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
678 | | simon128_t x1 = (simon128_t)VecPermute(block0, block1, m1); |
679 | | simon128_t y1 = (simon128_t)VecPermute(block0, block1, m2); |
680 | | simon128_t x2 = (simon128_t)VecPermute(block2, block3, m1); |
681 | | simon128_t y2 = (simon128_t)VecPermute(block2, block3, m2); |
682 | | simon128_t x3 = (simon128_t)VecPermute(block4, block5, m1); |
683 | | simon128_t y3 = (simon128_t)VecPermute(block4, block5, m2); |
684 | | |
685 | | for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2) |
686 | | { |
687 | | // Round keys are pre-splatted in forward direction |
688 | | const word32* ptr1 = reinterpret_cast<const word32*>(subkeys+i*2); |
689 | | const simon128_t rk1 = (simon128_t)VecLoadAligned(ptr1); |
690 | | |
691 | | const word32* ptr2 = reinterpret_cast<const word32*>(subkeys+(i+1)*2); |
692 | | const simon128_t rk2 = (simon128_t)VecLoadAligned(ptr2); |
693 | | |
694 | | y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk1); |
695 | | y2 = VecXor64(VecXor64(y2, SIMON128_f(x2)), rk1); |
696 | | y3 = VecXor64(VecXor64(y3, SIMON128_f(x3)), rk1); |
697 | | |
698 | | x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk2); |
699 | | x2 = VecXor64(VecXor64(x2, SIMON128_f(y2)), rk2); |
700 | | x3 = VecXor64(VecXor64(x3, SIMON128_f(y3)), rk2); |
701 | | } |
702 | | |
703 | | if (rounds & 1) |
704 | | { |
705 | | // Round keys are pre-splatted in forward direction |
706 | | const word32* ptr = reinterpret_cast<const word32*>(subkeys+(rounds-1)*2); |
707 | | const simon128_t rk = (simon128_t)VecLoadAligned(ptr); |
708 | | |
709 | | y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk); |
710 | | y2 = VecXor64(VecXor64(y2, SIMON128_f(x2)), rk); |
711 | | y3 = VecXor64(VecXor64(y3, SIMON128_f(x3)), rk); |
712 | | |
713 | | std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); |
714 | | } |
715 | | |
716 | | #if (CRYPTOPP_BIG_ENDIAN) |
717 | | const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
718 | | const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
719 | | #else |
720 | | const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
721 | | const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
722 | | #endif |
723 | | |
724 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
725 | | block0 = (uint32x4_p)VecPermute(x1, y1, m3); |
726 | | block1 = (uint32x4_p)VecPermute(x1, y1, m4); |
727 | | block2 = (uint32x4_p)VecPermute(x2, y2, m3); |
728 | | block3 = (uint32x4_p)VecPermute(x2, y2, m4); |
729 | | block4 = (uint32x4_p)VecPermute(x3, y3, m3); |
730 | | block5 = (uint32x4_p)VecPermute(x3, y3, m4); |
731 | | } |
732 | | |
733 | | inline void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, |
734 | | uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, |
735 | | uint32x4_p &block5, const word64 *subkeys, unsigned int rounds) |
736 | | { |
737 | | #if (CRYPTOPP_BIG_ENDIAN) |
738 | | const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
739 | | const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
740 | | #else |
741 | | const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
742 | | const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
743 | | #endif |
744 | | |
745 | | // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... |
746 | | simon128_t x1 = (simon128_t)VecPermute(block0, block1, m1); |
747 | | simon128_t y1 = (simon128_t)VecPermute(block0, block1, m2); |
748 | | simon128_t x2 = (simon128_t)VecPermute(block2, block3, m1); |
749 | | simon128_t y2 = (simon128_t)VecPermute(block2, block3, m2); |
750 | | simon128_t x3 = (simon128_t)VecPermute(block4, block5, m1); |
751 | | simon128_t y3 = (simon128_t)VecPermute(block4, block5, m2); |
752 | | |
753 | | if (rounds & 1) |
754 | | { |
755 | | std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); |
756 | | |
757 | | const word32* ptr = reinterpret_cast<const word32*>(subkeys+rounds-1); |
758 | | const simon128_t tk = (simon128_t)VecLoad(ptr); |
759 | | const simon128_t rk = (simon128_t)VecSplatElement64<0>(tk); |
760 | | |
761 | | y1 = VecXor64(VecXor64(y1, rk), SIMON128_f(x1)); |
762 | | y2 = VecXor64(VecXor64(y2, rk), SIMON128_f(x2)); |
763 | | y3 = VecXor64(VecXor64(y3, rk), SIMON128_f(x3)); |
764 | | rounds--; |
765 | | } |
766 | | |
767 | | for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2) |
768 | | { |
769 | | const word32* ptr = reinterpret_cast<const word32*>(subkeys+i); |
770 | | const simon128_t tk = (simon128_t)VecLoad(ptr); |
771 | | const simon128_t rk1 = (simon128_t)VecSplatElement64<1>(tk); |
772 | | const simon128_t rk2 = (simon128_t)VecSplatElement64<0>(tk); |
773 | | |
774 | | x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk1); |
775 | | x2 = VecXor64(VecXor64(x2, SIMON128_f(y2)), rk1); |
776 | | x3 = VecXor64(VecXor64(x3, SIMON128_f(y3)), rk1); |
777 | | |
778 | | y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk2); |
779 | | y2 = VecXor64(VecXor64(y2, SIMON128_f(x2)), rk2); |
780 | | y3 = VecXor64(VecXor64(y3, SIMON128_f(x3)), rk2); |
781 | | } |
782 | | |
783 | | #if (CRYPTOPP_BIG_ENDIAN) |
784 | | const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8}; |
785 | | const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0}; |
786 | | #else |
787 | | const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; |
788 | | const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24}; |
789 | | #endif |
790 | | |
791 | | // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... |
792 | | block0 = (uint32x4_p)VecPermute(x1, y1, m3); |
793 | | block1 = (uint32x4_p)VecPermute(x1, y1, m4); |
794 | | block2 = (uint32x4_p)VecPermute(x2, y2, m3); |
795 | | block3 = (uint32x4_p)VecPermute(x2, y2, m4); |
796 | | block4 = (uint32x4_p)VecPermute(x3, y3, m3); |
797 | | block5 = (uint32x4_p)VecPermute(x3, y3, m4); |
798 | | } |
799 | | |
800 | | #endif // CRYPTOPP_ALTIVEC_AVAILABLE |
801 | | |
802 | | ANONYMOUS_NAMESPACE_END |
803 | | |
804 | | /////////////////////////////////////////////////////////////////////// |
805 | | |
806 | | NAMESPACE_BEGIN(CryptoPP) |
807 | | |
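 | | // The functions below are this file's external entry points. simon.cpp calls |
 | | // them from AdvancedProcessBlocks after probing the CPU at run time (for |
 | | // example HasSSSE3(), HasNEON() or HasAltivec()), and only the backends |
 | | // enabled for the target are compiled in at all. |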
808 | | // *************************** ARM NEON **************************** // |
809 | | |
810 | | #if (CRYPTOPP_ARM_NEON_AVAILABLE) |
811 | | size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, |
812 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
813 | | { |
814 | | return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks, |
815 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
816 | | } |
817 | | |
818 | | size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, |
819 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
820 | | { |
821 | | return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks, |
822 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
823 | | } |
824 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
825 | | |
826 | | // ***************************** IA-32 ***************************** // |
827 | | |
828 | | #if (CRYPTOPP_SSSE3_AVAILABLE) |
829 | | size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, |
830 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
831 | 7 | { |
832 | 7 | return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks, |
833 | 7 | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
834 | 7 | } |
835 | | |
836 | | size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, |
837 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
838 | 7 | { |
839 | 7 | return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks, |
840 | 7 | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
841 | 7 | } |
842 | | #endif // CRYPTOPP_SSSE3_AVAILABLE |
843 | | |
844 | | // ***************************** Altivec ***************************** // |
845 | | |
846 | | #if (CRYPTOPP_ALTIVEC_AVAILABLE) |
847 | | size_t SIMON128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds, |
848 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
849 | | { |
850 | | return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks, |
851 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
852 | | } |
853 | | |
854 | | size_t SIMON128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds, |
855 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
856 | | { |
857 | | return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks, |
858 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
859 | | } |
860 | | #endif // CRYPTOPP_ALTIVEC_AVAILABLE |
861 | | |
862 | | NAMESPACE_END |