/src/cryptopp/rijndael_simd.cpp
Line | Count | Source |
1 | | // rijndael_simd.cpp - written and placed in the public domain by |
2 | | // Jeffrey Walton, Uri Blumenthal and Marcel Raad. |
3 | | // AES-NI code originally written by Wei Dai. |
4 | | // |
5 | | // This source file uses intrinsics and built-ins to gain access to |
6 | | // AES-NI, ARMv8a AES and Power8 AES instructions. A separate source |
7 | | // file is needed because additional CXXFLAGS are required to enable |
8 | | // the appropriate instruction sets in some build configurations. |
9 | | // |
10 | | // ARMv8a AES code based on CriticalBlue code from Johannes Schneiders, |
11 | | // Skip Hovsmith and Barry O'Rourke for the mbedTLS project. Stepping |
12 | | // mbedTLS under a debugger helped us determine problems with our |
13 | | // subkey generation and scheduling. |
14 | | // |
15 | | // AltiVec and Power8 code based on http://github.com/noloader/AES-Intrinsics and |
16 | | // http://www.ibm.com/developerworks/library/se-power8-in-core-cryptography/ |
17 | | // For Power8 do not remove the casts, even when const-ness is cast away. Removing |
18 | | // them causes failed compiles and a 0.3 to 0.6 cpb drop in performance. The IBM documentation |
19 | | // absolutely sucks. Thanks to Andy Polyakov, Paul R and Trudeaun for answering |
20 | | // questions and filling the gaps in the IBM documentation. |
21 | | // |
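| | // Typical GNU-style flags for this translation unit are, for example, -msse4.1 -maes |
| | // on x86, -march=armv8-a+crypto on AArch64, and -mcpu=power8 on PowerPC; the exact |
| | // flags are compiler- and build-system-specific and only illustrative here. |
| | // |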
22 | | |
23 | | #include "pch.h" |
24 | | #include "config.h" |
25 | | #include "misc.h" |
26 | | |
27 | | #if (CRYPTOPP_AESNI_AVAILABLE) |
28 | | # include "adv_simd.h" |
29 | | # include <emmintrin.h> |
30 | | # include <smmintrin.h> |
31 | | # include <wmmintrin.h> |
32 | | #endif |
33 | | |
34 | | // Android makes <arm_acle.h> available with ARMv7-a |
35 | | #if (CRYPTOPP_BOOL_ARMV8) |
36 | | # include "adv_simd.h" |
37 | | # if (CRYPTOPP_ARM_NEON_HEADER) |
38 | | # include <arm_neon.h> |
39 | | # endif |
40 | | # if (CRYPTOPP_ARM_ACLE_HEADER) |
41 | | # include <stdint.h> |
42 | | # include <arm_acle.h> |
43 | | # endif |
44 | | #endif |
45 | | |
46 | | #if defined(_M_ARM64) |
47 | | # include "adv_simd.h" |
48 | | #endif |
49 | | |
50 | | #if defined(CRYPTOPP_POWER8_AES_AVAILABLE) |
51 | | # include "adv_simd.h" |
52 | | # include "ppc_simd.h" |
53 | | #endif |
54 | | |
55 | | #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY |
56 | | # include <signal.h> |
57 | | # include <setjmp.h> |
58 | | #endif |
59 | | |
60 | | #ifndef EXCEPTION_EXECUTE_HANDLER |
61 | | # define EXCEPTION_EXECUTE_HANDLER 1 |
62 | | #endif |
63 | | |
64 | | // Squash MS LNK4221 and libtool warnings |
65 | | extern const char RIJNDAEL_SIMD_FNAME[] = __FILE__; |
66 | | |
67 | | NAMESPACE_BEGIN(CryptoPP) |
68 | | |
69 | | // ************************* Feature Probes ************************* // |
70 | | |
71 | | #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY |
72 | | extern "C" { |
73 | | typedef void (*SigHandler)(int); |
74 | | |
75 | | static jmp_buf s_jmpSIGILL; |
76 | | static void SigIllHandler(int) |
77 | 0 | { |
78 | 0 | longjmp(s_jmpSIGILL, 1); |
79 | 0 | } |
80 | | } |
81 | | #endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY |
82 | | |
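| | // CPU_ProbeAES() reports whether the AES instructions actually execute on this CPU. It |
| | // performs one encrypt/decrypt round under an exception guard: SEH on MSVC-style builds, |
| | // otherwise a SIGILL handler with setjmp/longjmp. Presumably the library's ARM feature |
| | // detection falls back to this probe when no OS capability query is available. |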
83 | | #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8) |
84 | | bool CPU_ProbeAES() |
85 | | { |
86 | | #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) |
87 | | return false; |
88 | | #elif (CRYPTOPP_ARM_AES_AVAILABLE) |
89 | | # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) |
90 | | volatile bool result = true; |
91 | | __try |
92 | | { |
93 | | // AES encrypt and decrypt |
94 | | uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); |
95 | | uint8x16_t r1 = vaeseq_u8(data, key); |
96 | | uint8x16_t r2 = vaesdq_u8(data, key); |
97 | | r1 = vaesmcq_u8(r1); |
98 | | r2 = vaesimcq_u8(r2); |
99 | | |
100 | | result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); |
101 | | } |
102 | | __except (EXCEPTION_EXECUTE_HANDLER) |
103 | | { |
104 | | return false; |
105 | | } |
106 | | return result; |
107 | | # else |
108 | | |
109 | | // longjmp and clobber warnings. Volatile is required. |
110 | | // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 |
111 | | volatile bool result = true; |
112 | | |
113 | | volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); |
114 | | if (oldHandler == SIG_ERR) |
115 | | return false; |
116 | | |
117 | | volatile sigset_t oldMask; |
118 | | if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) |
119 | | { |
120 | | signal(SIGILL, oldHandler); |
121 | | return false; |
122 | | } |
123 | | |
124 | | if (setjmp(s_jmpSIGILL)) |
125 | | result = false; |
126 | | else |
127 | | { |
128 | | uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); |
129 | | uint8x16_t r1 = vaeseq_u8(data, key); |
130 | | uint8x16_t r2 = vaesdq_u8(data, key); |
131 | | r1 = vaesmcq_u8(r1); |
132 | | r2 = vaesimcq_u8(r2); |
133 | | |
134 | | // Hack... consume the result so GCC does not optimize the probe away (which would leave result == true) |
135 | | result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); |
136 | | } |
137 | | |
138 | | sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); |
139 | | signal(SIGILL, oldHandler); |
140 | | return result; |
141 | | # endif |
142 | | #else |
143 | | return false; |
144 | | #endif // CRYPTOPP_ARM_AES_AVAILABLE |
145 | | } |
146 | | #endif // ARM32 or ARM64 |
147 | | |
148 | | // ***************************** ARMv8 ***************************** // |
149 | | |
150 | | #if (CRYPTOPP_ARM_AES_AVAILABLE) |
151 | | |
152 | | ANONYMOUS_NAMESPACE_BEGIN |
153 | | |
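| | // On ARMv8, AESE folds AddRoundKey, SubBytes and ShiftRows into one instruction, and |
| | // AESMC supplies MixColumns. A full FIPS-197 round is therefore modeled as, for example: |
| | // block = vaesmcq_u8(vaeseq_u8(block, roundkey)); |
| | // while the final round is AESE with the next-to-last subkey followed by a plain XOR |
| | // with the last subkey, as coded below. |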
154 | | inline void ARMV8_Enc_Block(uint64x2_t &data, const word32 *subkeys, unsigned int rounds) |
155 | | { |
156 | | CRYPTOPP_ASSERT(subkeys); |
157 | | const byte *keys = reinterpret_cast<const byte*>(subkeys); |
158 | | uint8x16_t block = vreinterpretq_u8_u64(data); |
159 | | |
160 | | // AES single round encryption |
161 | | block = vaeseq_u8(block, vld1q_u8(keys+0*16)); |
162 | | // AES mix columns |
163 | | block = vaesmcq_u8(block); |
164 | | |
165 | | for (unsigned int i=1; i<rounds-1; i+=2) |
166 | | { |
167 | | // AES single round encryption |
168 | | block = vaeseq_u8(block, vld1q_u8(keys+i*16)); |
169 | | // AES mix columns |
170 | | block = vaesmcq_u8(block); |
171 | | // AES single round encryption |
172 | | block = vaeseq_u8(block, vld1q_u8(keys+(i+1)*16)); |
173 | | // AES mix columns |
174 | | block = vaesmcq_u8(block); |
175 | | } |
176 | | |
177 | | // AES single round encryption |
178 | | block = vaeseq_u8(block, vld1q_u8(keys+(rounds-1)*16)); |
179 | | // Final Add (bitwise Xor) |
180 | | block = veorq_u8(block, vld1q_u8(keys+rounds*16)); |
181 | | |
182 | | data = vreinterpretq_u64_u8(block); |
183 | | } |
184 | | |
185 | | inline void ARMV8_Enc_6_Blocks(uint64x2_t &data0, uint64x2_t &data1, |
186 | | uint64x2_t &data2, uint64x2_t &data3, uint64x2_t &data4, uint64x2_t &data5, |
187 | | const word32 *subkeys, unsigned int rounds) |
188 | | { |
189 | | CRYPTOPP_ASSERT(subkeys); |
190 | | const byte *keys = reinterpret_cast<const byte*>(subkeys); |
191 | | |
192 | | uint8x16_t block0 = vreinterpretq_u8_u64(data0); |
193 | | uint8x16_t block1 = vreinterpretq_u8_u64(data1); |
194 | | uint8x16_t block2 = vreinterpretq_u8_u64(data2); |
195 | | uint8x16_t block3 = vreinterpretq_u8_u64(data3); |
196 | | uint8x16_t block4 = vreinterpretq_u8_u64(data4); |
197 | | uint8x16_t block5 = vreinterpretq_u8_u64(data5); |
198 | | |
199 | | uint8x16_t key; |
200 | | for (unsigned int i=0; i<rounds-1; ++i) |
201 | | { |
202 | | key = vld1q_u8(keys+i*16); |
203 | | // AES single round encryption |
204 | | block0 = vaeseq_u8(block0, key); |
205 | | // AES mix columns |
206 | | block0 = vaesmcq_u8(block0); |
207 | | // AES single round encryption |
208 | | block1 = vaeseq_u8(block1, key); |
209 | | // AES mix columns |
210 | | block1 = vaesmcq_u8(block1); |
211 | | // AES single round encryption |
212 | | block2 = vaeseq_u8(block2, key); |
213 | | // AES mix columns |
214 | | block2 = vaesmcq_u8(block2); |
215 | | // AES single round encryption |
216 | | block3 = vaeseq_u8(block3, key); |
217 | | // AES mix columns |
218 | | block3 = vaesmcq_u8(block3); |
219 | | // AES single round encryption |
220 | | block4 = vaeseq_u8(block4, key); |
221 | | // AES mix columns |
222 | | block4 = vaesmcq_u8(block4); |
223 | | // AES single round encryption |
224 | | block5 = vaeseq_u8(block5, key); |
225 | | // AES mix columns |
226 | | block5 = vaesmcq_u8(block5); |
227 | | } |
228 | | |
229 | | // AES single round encryption |
230 | | key = vld1q_u8(keys+(rounds-1)*16); |
231 | | block0 = vaeseq_u8(block0, key); |
232 | | block1 = vaeseq_u8(block1, key); |
233 | | block2 = vaeseq_u8(block2, key); |
234 | | block3 = vaeseq_u8(block3, key); |
235 | | block4 = vaeseq_u8(block4, key); |
236 | | block5 = vaeseq_u8(block5, key); |
237 | | |
238 | | // Final Add (bitwise Xor) |
239 | | key = vld1q_u8(keys+rounds*16); |
240 | | data0 = vreinterpretq_u64_u8(veorq_u8(block0, key)); |
241 | | data1 = vreinterpretq_u64_u8(veorq_u8(block1, key)); |
242 | | data2 = vreinterpretq_u64_u8(veorq_u8(block2, key)); |
243 | | data3 = vreinterpretq_u64_u8(veorq_u8(block3, key)); |
244 | | data4 = vreinterpretq_u64_u8(veorq_u8(block4, key)); |
245 | | data5 = vreinterpretq_u64_u8(veorq_u8(block5, key)); |
246 | | } |
247 | | |
248 | | inline void ARMV8_Dec_Block(uint64x2_t &data, const word32 *subkeys, unsigned int rounds) |
249 | | { |
250 | | CRYPTOPP_ASSERT(subkeys); |
251 | | const byte *keys = reinterpret_cast<const byte*>(subkeys); |
252 | | uint8x16_t block = vreinterpretq_u8_u64(data); |
253 | | |
254 | | // AES single round decryption |
255 | | block = vaesdq_u8(block, vld1q_u8(keys+0*16)); |
256 | | // AES inverse mix columns |
257 | | block = vaesimcq_u8(block); |
258 | | |
259 | | for (unsigned int i=1; i<rounds-1; i+=2) |
260 | | { |
261 | | // AES single round decryption |
262 | | block = vaesdq_u8(block, vld1q_u8(keys+i*16)); |
263 | | // AES inverse mix columns |
264 | | block = vaesimcq_u8(block); |
265 | | // AES single round decryption |
266 | | block = vaesdq_u8(block, vld1q_u8(keys+(i+1)*16)); |
267 | | // AES inverse mix columns |
268 | | block = vaesimcq_u8(block); |
269 | | } |
270 | | |
271 | | // AES single round decryption |
272 | | block = vaesdq_u8(block, vld1q_u8(keys+(rounds-1)*16)); |
273 | | // Final Add (bitwise Xor) |
274 | | block = veorq_u8(block, vld1q_u8(keys+rounds*16)); |
275 | | |
276 | | data = vreinterpretq_u64_u8(block); |
277 | | } |
278 | | |
279 | | inline void ARMV8_Dec_6_Blocks(uint64x2_t &data0, uint64x2_t &data1, |
280 | | uint64x2_t &data2, uint64x2_t &data3, uint64x2_t &data4, uint64x2_t &data5, |
281 | | const word32 *subkeys, unsigned int rounds) |
282 | | { |
283 | | CRYPTOPP_ASSERT(subkeys); |
284 | | const byte *keys = reinterpret_cast<const byte*>(subkeys); |
285 | | |
286 | | uint8x16_t block0 = vreinterpretq_u8_u64(data0); |
287 | | uint8x16_t block1 = vreinterpretq_u8_u64(data1); |
288 | | uint8x16_t block2 = vreinterpretq_u8_u64(data2); |
289 | | uint8x16_t block3 = vreinterpretq_u8_u64(data3); |
290 | | uint8x16_t block4 = vreinterpretq_u8_u64(data4); |
291 | | uint8x16_t block5 = vreinterpretq_u8_u64(data5); |
292 | | |
293 | | uint8x16_t key; |
294 | | for (unsigned int i=0; i<rounds-1; ++i) |
295 | | { |
296 | | key = vld1q_u8(keys+i*16); |
297 | | // AES single round decryption |
298 | | block0 = vaesdq_u8(block0, key); |
299 | | // AES inverse mix columns |
300 | | block0 = vaesimcq_u8(block0); |
301 | | // AES single round decryption |
302 | | block1 = vaesdq_u8(block1, key); |
303 | | // AES inverse mix columns |
304 | | block1 = vaesimcq_u8(block1); |
305 | | // AES single round decryption |
306 | | block2 = vaesdq_u8(block2, key); |
307 | | // AES inverse mix columns |
308 | | block2 = vaesimcq_u8(block2); |
309 | | // AES single round decryption |
310 | | block3 = vaesdq_u8(block3, key); |
311 | | // AES inverse mix columns |
312 | | block3 = vaesimcq_u8(block3); |
313 | | // AES single round decryption |
314 | | block4 = vaesdq_u8(block4, key); |
315 | | // AES inverse mix columns |
316 | | block4 = vaesimcq_u8(block4); |
317 | | // AES single round decryption |
318 | | block5 = vaesdq_u8(block5, key); |
319 | | // AES inverse mix columns |
320 | | block5 = vaesimcq_u8(block5); |
321 | | } |
322 | | |
323 | | // AES single round decryption |
324 | | key = vld1q_u8(keys+(rounds-1)*16); |
325 | | block0 = vaesdq_u8(block0, key); |
326 | | block1 = vaesdq_u8(block1, key); |
327 | | block2 = vaesdq_u8(block2, key); |
328 | | block3 = vaesdq_u8(block3, key); |
329 | | block4 = vaesdq_u8(block4, key); |
330 | | block5 = vaesdq_u8(block5, key); |
331 | | |
332 | | // Final Add (bitwise Xor) |
333 | | key = vld1q_u8(keys+rounds*16); |
334 | | data0 = vreinterpretq_u64_u8(veorq_u8(block0, key)); |
335 | | data1 = vreinterpretq_u64_u8(veorq_u8(block1, key)); |
336 | | data2 = vreinterpretq_u64_u8(veorq_u8(block2, key)); |
337 | | data3 = vreinterpretq_u64_u8(veorq_u8(block3, key)); |
338 | | data4 = vreinterpretq_u64_u8(veorq_u8(block4, key)); |
339 | | data5 = vreinterpretq_u64_u8(veorq_u8(block5, key)); |
340 | | } |
341 | | |
342 | | ANONYMOUS_NAMESPACE_END |
343 | | |
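| | // Thin adapters into the generic 128-bit NEON dispatcher from adv_simd.h, which handles |
| | // block loads/stores, optional XOR input and the processing flags, and calls back into |
| | // the single-block and 6-block kernels above. |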
344 | | size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subKeys, size_t rounds, |
345 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
346 | | { |
347 | | return AdvancedProcessBlocks128_6x1_NEON(ARMV8_Enc_Block, ARMV8_Enc_6_Blocks, |
348 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
349 | | } |
350 | | |
351 | | size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subKeys, size_t rounds, |
352 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
353 | | { |
354 | | return AdvancedProcessBlocks128_6x1_NEON(ARMV8_Dec_Block, ARMV8_Dec_6_Blocks, |
355 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
356 | | } |
357 | | |
358 | | #endif // CRYPTOPP_ARM_AES_AVAILABLE |
359 | | |
360 | | // ***************************** AES-NI ***************************** // |
361 | | |
362 | | #if (CRYPTOPP_AESNI_AVAILABLE) |
363 | | |
364 | | ANONYMOUS_NAMESPACE_BEGIN |
365 | | |
366 | | /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ |
367 | | CRYPTOPP_ALIGN_DATA(16) |
368 | | const word32 s_rconLE[] = { |
369 | | 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 |
370 | | }; |
371 | | |
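| | // _mm_aesenc_si128 performs one complete AES round (ShiftRows, SubBytes, MixColumns, |
| | // AddRoundKey); _mm_aesenclast_si128 omits MixColumns for the final round. After the |
| | // initial whitening XOR, the loop below runs rounds-1 full rounds, then the last round. |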
372 | | inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST word32 *subkeys, unsigned int rounds) |
373 | 597 | { |
374 | 597 | const __m128i* skeys = reinterpret_cast<const __m128i*>(subkeys); |
375 | | |
376 | 597 | block = _mm_xor_si128(block, skeys[0]); |
377 | 3.29k | for (unsigned int i=1; i<rounds-1; i+=2) |
378 | 2.69k | { |
379 | 2.69k | block = _mm_aesenc_si128(block, skeys[i]); |
380 | 2.69k | block = _mm_aesenc_si128(block, skeys[i+1]); |
381 | 2.69k | } |
382 | 597 | block = _mm_aesenc_si128(block, skeys[rounds-1]); |
383 | 597 | block = _mm_aesenclast_si128(block, skeys[rounds]); |
384 | 597 | } |
385 | | |
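| | // Four-block variant: applying the same round key to four independent blocks keeps |
| | // several AESENC instructions in flight per iteration, hiding the instruction's |
| | // multi-cycle latency on pipelined cores. |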
386 | | inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, |
387 | | MAYBE_CONST word32 *subkeys, unsigned int rounds) |
388 | 2.29k | { |
389 | 2.29k | const __m128i* skeys = reinterpret_cast<const __m128i*>(subkeys); |
390 | | |
391 | 2.29k | __m128i rk = skeys[0]; |
392 | 2.29k | block0 = _mm_xor_si128(block0, rk); |
393 | 2.29k | block1 = _mm_xor_si128(block1, rk); |
394 | 2.29k | block2 = _mm_xor_si128(block2, rk); |
395 | 2.29k | block3 = _mm_xor_si128(block3, rk); |
396 | 29.1k | for (unsigned int i=1; i<rounds; i++) |
397 | 26.8k | { |
398 | 26.8k | rk = skeys[i]; |
399 | 26.8k | block0 = _mm_aesenc_si128(block0, rk); |
400 | 26.8k | block1 = _mm_aesenc_si128(block1, rk); |
401 | 26.8k | block2 = _mm_aesenc_si128(block2, rk); |
402 | 26.8k | block3 = _mm_aesenc_si128(block3, rk); |
403 | 26.8k | } |
404 | 2.29k | rk = skeys[rounds]; |
405 | 2.29k | block0 = _mm_aesenclast_si128(block0, rk); |
406 | 2.29k | block1 = _mm_aesenclast_si128(block1, rk); |
407 | 2.29k | block2 = _mm_aesenclast_si128(block2, rk); |
408 | 2.29k | block3 = _mm_aesenclast_si128(block3, rk); |
409 | 2.29k | } |
410 | | |
411 | | inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST word32 *subkeys, unsigned int rounds) |
412 | 25 | { |
413 | 25 | const __m128i* skeys = reinterpret_cast<const __m128i*>(subkeys); |
414 | | |
415 | 25 | block = _mm_xor_si128(block, skeys[0]); |
416 | 131 | for (unsigned int i=1; i<rounds-1; i+=2) |
417 | 106 | { |
418 | 106 | block = _mm_aesdec_si128(block, skeys[i]); |
419 | 106 | block = _mm_aesdec_si128(block, skeys[i+1]); |
420 | 106 | } |
421 | 25 | block = _mm_aesdec_si128(block, skeys[rounds-1]); |
422 | 25 | block = _mm_aesdeclast_si128(block, skeys[rounds]); |
423 | 25 | } |
424 | | |
425 | | inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, |
426 | | MAYBE_CONST word32 *subkeys, unsigned int rounds) |
427 | 8 | { |
428 | 8 | const __m128i* skeys = reinterpret_cast<const __m128i*>(subkeys); |
429 | | |
430 | 8 | __m128i rk = skeys[0]; |
431 | 8 | block0 = _mm_xor_si128(block0, rk); |
432 | 8 | block1 = _mm_xor_si128(block1, rk); |
433 | 8 | block2 = _mm_xor_si128(block2, rk); |
434 | 8 | block3 = _mm_xor_si128(block3, rk); |
435 | 80 | for (unsigned int i=1; i<rounds; i++) |
436 | 72 | { |
437 | 72 | rk = skeys[i]; |
438 | 72 | block0 = _mm_aesdec_si128(block0, rk); |
439 | 72 | block1 = _mm_aesdec_si128(block1, rk); |
440 | 72 | block2 = _mm_aesdec_si128(block2, rk); |
441 | 72 | block3 = _mm_aesdec_si128(block3, rk); |
442 | 72 | } |
443 | 8 | rk = skeys[rounds]; |
444 | 8 | block0 = _mm_aesdeclast_si128(block0, rk); |
445 | 8 | block1 = _mm_aesdeclast_si128(block1, rk); |
446 | 8 | block2 = _mm_aesdeclast_si128(block2, rk); |
447 | 8 | block3 = _mm_aesdeclast_si128(block3, rk); |
448 | 8 | } |
449 | | |
450 | | ANONYMOUS_NAMESPACE_END |
451 | | |
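| | // Forward key schedule. Each pass of the loop appends keyLen/4 words (4, 6 or 8 for |
| | // AES-128/192/256): _mm_aeskeygenassist_si128 supplies RotWord(SubWord()) of the |
| | // schedule's last word via _mm_extract_epi32, and the round constant is applied from |
| | // s_rconLE above. |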
452 | | void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk) |
453 | 169 | { |
454 | 169 | const size_t rounds = keyLen / 4 + 6; |
455 | 169 | const word32 *rc = s_rconLE; |
456 | | |
457 | 169 | __m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16)); |
458 | 169 | std::memcpy(rk, userKey, keyLen); |
459 | | |
460 | | // keySize: m_key allocates 4*(rounds+1) word32's. |
461 | 169 | const size_t keySize = 4*(rounds+1); |
462 | 169 | const word32* end = rk + keySize; |
463 | | |
464 | 1.51k | while (true) |
465 | 1.51k | { |
466 | 1.51k | rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++); |
467 | 1.51k | rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4]; |
468 | 1.51k | rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1]; |
469 | 1.51k | rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2]; |
470 | | |
471 | 1.51k | if (rk + keyLen/4 + 4 == end) |
472 | 169 | break; |
473 | | |
474 | 1.34k | if (keyLen == 24) |
475 | 147 | { |
476 | 147 | rk[10] = rk[ 4] ^ rk[ 9]; |
477 | 147 | rk[11] = rk[ 5] ^ rk[10]; |
478 | 147 | temp = _mm_insert_epi32(temp, rk[11], 3); |
479 | 147 | } |
480 | 1.20k | else if (keyLen == 32) |
481 | 264 | { |
482 | 264 | temp = _mm_insert_epi32(temp, rk[11], 3); |
483 | 264 | rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2); |
484 | 264 | rk[13] = rk[ 5] ^ rk[12]; |
485 | 264 | rk[14] = rk[ 6] ^ rk[13]; |
486 | 264 | rk[15] = rk[ 7] ^ rk[14]; |
487 | 264 | temp = _mm_insert_epi32(temp, rk[15], 3); |
488 | 264 | } |
489 | 936 | else |
490 | 936 | { |
491 | 936 | temp = _mm_insert_epi32(temp, rk[7], 3); |
492 | 936 | } |
493 | | |
494 | 1.34k | rk += keyLen/4; |
495 | 1.34k | } |
496 | 169 | } |
497 | | |
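| | // Prepares the schedule for AESDEC's Equivalent Inverse Cipher: the first and last round |
| | // keys are swapped unchanged, and every round key in between is reversed in order and |
| | // run through AESIMC (InvMixColumns). |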
498 | | void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds) |
499 | 21 | { |
500 | 21 | unsigned int i, j; |
501 | 21 | __m128i temp; |
502 | | |
503 | 21 | vec_swap(*M128_CAST(key), *M128_CAST(key+4*rounds)); |
504 | | |
505 | 107 | for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4) |
506 | 86 | { |
507 | 86 | temp = _mm_aesimc_si128(*M128_CAST(key+i)); |
508 | 86 | *M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+j)); |
509 | 86 | *M128_CAST(key+j) = temp; |
510 | 86 | } |
511 | | |
512 | 21 | *M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+i)); |
513 | 21 | } |
514 | | |
515 | | size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subKeys, size_t rounds, |
516 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
517 | 493 | { |
518 | | // SunCC workaround |
519 | 493 | MAYBE_CONST word32* sk = MAYBE_UNCONST_CAST(word32*, subKeys); |
520 | 493 | MAYBE_CONST byte* ib = MAYBE_UNCONST_CAST(byte*, inBlocks); |
521 | 493 | MAYBE_CONST byte* xb = MAYBE_UNCONST_CAST(byte*, xorBlocks); |
522 | | |
523 | 493 | return AdvancedProcessBlocks128_4x1_SSE(AESNI_Enc_Block, AESNI_Enc_4_Blocks, |
524 | 493 | sk, rounds, ib, xb, outBlocks, length, flags); |
525 | 493 | } |
526 | | |
527 | | size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subKeys, size_t rounds, |
528 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
529 | 24 | { |
530 | 24 | MAYBE_CONST word32* sk = MAYBE_UNCONST_CAST(word32*, subKeys); |
531 | 24 | MAYBE_CONST byte* ib = MAYBE_UNCONST_CAST(byte*, inBlocks); |
532 | 24 | MAYBE_CONST byte* xb = MAYBE_UNCONST_CAST(byte*, xorBlocks); |
533 | | |
534 | 24 | return AdvancedProcessBlocks128_4x1_SSE(AESNI_Dec_Block, AESNI_Dec_4_Blocks, |
535 | 24 | sk, rounds, ib, xb, outBlocks, length, flags); |
536 | 24 | } |
537 | | |
538 | | #endif // CRYPTOPP_AESNI_AVAILABLE |
539 | | |
540 | | // ************************** Power 8 Crypto ************************** // |
541 | | |
542 | | #if (CRYPTOPP_POWER8_AES_AVAILABLE) |
543 | | |
544 | | ANONYMOUS_NAMESPACE_BEGIN |
545 | | |
546 | | /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ |
547 | | CRYPTOPP_ALIGN_DATA(16) |
548 | | static const uint32_t s_rconBE[] = { |
549 | | 0x01000000, 0x02000000, 0x04000000, 0x08000000, |
550 | | 0x10000000, 0x20000000, 0x40000000, 0x80000000, |
551 | | 0x1B000000, 0x36000000 |
552 | | }; |
553 | | |
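| | // VecEncrypt and VecEncryptLast from ppc_simd.h wrap the POWER8 vcipher/vcipherlast |
| | // built-ins; as with AES-NI, vcipher performs a complete round and vcipherlast the |
| | // final round. VecLoadAligned relies on the 16-byte subkey alignment asserted below. |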
554 | | inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds) |
555 | | { |
556 | | CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); |
557 | | const byte *keys = reinterpret_cast<const byte*>(subkeys); |
558 | | |
559 | | uint32x4_p k = VecLoadAligned(keys); |
560 | | block = VecXor(block, k); |
561 | | |
562 | | for (size_t i=1; i<rounds-1; i+=2) |
563 | | { |
564 | | block = VecEncrypt(block, VecLoadAligned( i*16, keys)); |
565 | | block = VecEncrypt(block, VecLoadAligned((i+1)*16, keys)); |
566 | | } |
567 | | |
568 | | block = VecEncrypt(block, VecLoadAligned((rounds-1)*16, keys)); |
569 | | block = VecEncryptLast(block, VecLoadAligned(rounds*16, keys)); |
570 | | } |
571 | | |
572 | | inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, |
573 | | uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, |
574 | | uint32x4_p &block5, const word32 *subkeys, unsigned int rounds) |
575 | | { |
576 | | CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); |
577 | | const byte *keys = reinterpret_cast<const byte*>(subkeys); |
578 | | |
579 | | uint32x4_p k = VecLoadAligned(keys); |
580 | | block0 = VecXor(block0, k); |
581 | | block1 = VecXor(block1, k); |
582 | | block2 = VecXor(block2, k); |
583 | | block3 = VecXor(block3, k); |
584 | | block4 = VecXor(block4, k); |
585 | | block5 = VecXor(block5, k); |
586 | | |
587 | | for (size_t i=1; i<rounds; ++i) |
588 | | { |
589 | | k = VecLoadAligned(i*16, keys); |
590 | | block0 = VecEncrypt(block0, k); |
591 | | block1 = VecEncrypt(block1, k); |
592 | | block2 = VecEncrypt(block2, k); |
593 | | block3 = VecEncrypt(block3, k); |
594 | | block4 = VecEncrypt(block4, k); |
595 | | block5 = VecEncrypt(block5, k); |
596 | | } |
597 | | |
598 | | k = VecLoadAligned(rounds*16, keys); |
599 | | block0 = VecEncryptLast(block0, k); |
600 | | block1 = VecEncryptLast(block1, k); |
601 | | block2 = VecEncryptLast(block2, k); |
602 | | block3 = VecEncryptLast(block3, k); |
603 | | block4 = VecEncryptLast(block4, k); |
604 | | block5 = VecEncryptLast(block5, k); |
605 | | } |
606 | | |
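| | // Decryption walks the forward schedule backwards: XOR with the last round key, |
| | // VecDecrypt down to key 1, then VecDecryptLast with key 0. Unlike the AES-NI path, |
| | // no separate inverse key schedule is built. |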
607 | | inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds) |
608 | | { |
609 | | CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); |
610 | | const byte *keys = reinterpret_cast<const byte*>(subkeys); |
611 | | |
612 | | uint32x4_p k = VecLoadAligned(rounds*16, keys); |
613 | | block = VecXor(block, k); |
614 | | |
615 | | for (size_t i=rounds-1; i>1; i-=2) |
616 | | { |
617 | | block = VecDecrypt(block, VecLoadAligned( i*16, keys)); |
618 | | block = VecDecrypt(block, VecLoadAligned((i-1)*16, keys)); |
619 | | } |
620 | | |
621 | | block = VecDecrypt(block, VecLoadAligned(16, keys)); |
622 | | block = VecDecryptLast(block, VecLoadAligned(0, keys)); |
623 | | } |
624 | | |
625 | | inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, |
626 | | uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, |
627 | | uint32x4_p &block5, const word32 *subkeys, unsigned int rounds) |
628 | | { |
629 | | CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); |
630 | | const byte *keys = reinterpret_cast<const byte*>(subkeys); |
631 | | |
632 | | uint32x4_p k = VecLoadAligned(rounds*16, keys); |
633 | | block0 = VecXor(block0, k); |
634 | | block1 = VecXor(block1, k); |
635 | | block2 = VecXor(block2, k); |
636 | | block3 = VecXor(block3, k); |
637 | | block4 = VecXor(block4, k); |
638 | | block5 = VecXor(block5, k); |
639 | | |
640 | | for (size_t i=rounds-1; i>0; --i) |
641 | | { |
642 | | k = VecLoadAligned(i*16, keys); |
643 | | block0 = VecDecrypt(block0, k); |
644 | | block1 = VecDecrypt(block1, k); |
645 | | block2 = VecDecrypt(block2, k); |
646 | | block3 = VecDecrypt(block3, k); |
647 | | block4 = VecDecrypt(block4, k); |
648 | | block5 = VecDecrypt(block5, k); |
649 | | } |
650 | | |
651 | | k = VecLoadAligned(0, keys); |
652 | | block0 = VecDecryptLast(block0, k); |
653 | | block1 = VecDecryptLast(block1, k); |
654 | | block2 = VecDecryptLast(block2, k); |
655 | | block3 = VecDecryptLast(block3, k); |
656 | | block4 = VecDecryptLast(block4, k); |
657 | | block5 = VecDecryptLast(block5, k); |
658 | | } |
659 | | |
660 | | ANONYMOUS_NAMESPACE_END |
661 | | |
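| | // Scalar key expansion using the caller-supplied S-box Se, producing the schedule in |
| | // big-endian word order. On little-endian targets each 16-byte round key is then |
| | // byte-permuted so the aligned vector loads in the kernels above see the layout that |
| | // vcipher expects. |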
662 | | void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32* rk, const byte* Se) |
663 | | { |
664 | | const size_t rounds = keyLen / 4 + 6; |
665 | | const word32 *rc = s_rconBE; |
666 | | word32 *rkey = rk, temp; |
667 | | |
668 | | GetUserKey(BIG_ENDIAN_ORDER, rkey, keyLen/4, userKey, keyLen); |
669 | | |
670 | | // keySize: m_key allocates 4*(rounds+1) word32's. |
671 | | const size_t keySize = 4*(rounds+1); |
672 | | const word32* end = rkey + keySize; |
673 | | |
674 | | while (true) |
675 | | { |
676 | | temp = rkey[keyLen/4-1]; |
677 | | word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ |
678 | | (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)]; |
679 | | rkey[keyLen/4] = rkey[0] ^ x ^ *(rc++); |
680 | | rkey[keyLen/4+1] = rkey[1] ^ rkey[keyLen/4]; |
681 | | rkey[keyLen/4+2] = rkey[2] ^ rkey[keyLen/4+1]; |
682 | | rkey[keyLen/4+3] = rkey[3] ^ rkey[keyLen/4+2]; |
683 | | |
684 | | if (rkey + keyLen/4 + 4 == end) |
685 | | break; |
686 | | |
687 | | if (keyLen == 24) |
688 | | { |
689 | | rkey[10] = rkey[ 4] ^ rkey[ 9]; |
690 | | rkey[11] = rkey[ 5] ^ rkey[10]; |
691 | | } |
692 | | else if (keyLen == 32) |
693 | | { |
694 | | temp = rkey[11]; |
695 | | rkey[12] = rkey[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)]; |
696 | | rkey[13] = rkey[ 5] ^ rkey[12]; |
697 | | rkey[14] = rkey[ 6] ^ rkey[13]; |
698 | | rkey[15] = rkey[ 7] ^ rkey[14]; |
699 | | } |
700 | | rkey += keyLen/4; |
701 | | } |
702 | | |
703 | | #if (CRYPTOPP_LITTLE_ENDIAN) |
704 | | rkey = rk; |
705 | | const uint8x16_p mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; |
706 | | |
707 | | unsigned int i=0; |
708 | | for (i=0; i<rounds; i+=2, rkey+=8) |
709 | | { |
710 | | VecStore(VecPermute(VecLoad(rkey+0), mask), rkey+0); |
711 | | VecStore(VecPermute(VecLoad(rkey+4), mask), rkey+4); |
712 | | } |
713 | | |
714 | | for ( ; i<rounds+1; i++, rkey+=4) |
715 | | VecStore(VecPermute(VecLoad(rkey), mask), rkey); |
716 | | #endif |
717 | | } |
718 | | |
719 | | size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subKeys, size_t rounds, |
720 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
721 | | { |
722 | | return AdvancedProcessBlocks128_6x1_ALTIVEC(POWER8_Enc_Block, POWER8_Enc_6_Blocks, |
723 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
724 | | } |
725 | | |
726 | | size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subKeys, size_t rounds, |
727 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
728 | | { |
729 | | return AdvancedProcessBlocks128_6x1_ALTIVEC(POWER8_Dec_Block, POWER8_Dec_6_Blocks, |
730 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
731 | | } |
732 | | |
733 | | #endif // CRYPTOPP_POWER8_AES_AVAILABLE |
734 | | NAMESPACE_END |