/src/cryptopp/rijndael.cpp
Line | Count | Source |
1 | | // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu> |
2 | | // and Wei Dai from Paulo Barreto's Rijndael implementation |
3 | | // The original code and all modifications are in the public domain. |
4 | | |
5 | | // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code |
6 | | |
7 | | /* |
8 | | July 2018: Added support for ARMv7 AES instructions via Cryptogams ASM. |
9 | | See the head notes in aes_armv4.S for copyright and license. |
10 | | */ |
11 | | |
12 | | /* |
13 | | September 2017: Added support for Power8 AES instructions via compiler intrinsics. |
14 | | */ |
15 | | |
16 | | /* |
17 | | July 2017: Added support for ARMv8 AES instructions via compiler intrinsics. |
18 | | */ |
19 | | |
20 | | /* |
21 | | July 2010: Added support for AES-NI instructions via compiler intrinsics. |
22 | | */ |
23 | | |
24 | | /* |
25 | | Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode |
26 | | caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein |
27 | | and Peter Schwabe in their paper "New AES software speed records". The round |
28 | | function was also modified to include a trick similar to one in Brian Gladman's |
29 | | x86 assembly code, doing an 8-bit register move to minimize the number of |
30 | | register spills. Also switched to compressed tables and copying round keys to |
31 | | the stack. |
32 | | |
33 | | The C++ implementation uses compressed tables if |
34 | | CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined. |
35 | | It is defined by default on x86 platforms but not on others. |
36 | | */ |
37 | | |
38 | | /* |
39 | | July 2006: Defense against timing attacks was added by Wei Dai. |
40 | | |
41 | | The code now uses smaller tables in the first and last rounds, |
42 | | and preloads them into L1 cache before usage (by loading at least |
43 | | one element in each cache line). |
44 | | |
45 | | We try to delay subsequent accesses to each table (used in the first |
46 | | and last rounds) until all of the table has been preloaded. Hopefully |
47 | | the compiler isn't smart enough to optimize that code away. |
48 | | |
49 | | After preloading the table, we also try not to access any memory location |
50 | | other than the table and the stack, in order to prevent table entries from |
51 | | being unloaded from L1 cache, until that round is finished. |
52 | | (Some popular CPUs have 2-way associative caches.) |
53 | | */ |
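 | | // (The cache-line preload loops near the top of Enc::ProcessAndXorBlock and |
 | | // Dec::ProcessAndXorBlock below implement the countermeasure described above.) |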
54 | | |
55 | | // This is the original introductory comment: |
56 | | |
57 | | /** |
58 | | * version 3.0 (December 2000) |
59 | | * |
60 | | * Optimised ANSI C code for the Rijndael cipher (now AES) |
61 | | * |
62 | | * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be> |
63 | | * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be> |
64 | | * author Paulo Barreto <paulo.barreto@terra.com.br> |
65 | | * |
66 | | * This code is hereby placed in the public domain. |
67 | | * |
68 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS |
69 | | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
70 | | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
71 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE |
72 | | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
73 | | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
74 | | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
75 | | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
76 | | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
77 | | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
78 | | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
79 | | */ |
80 | | |
81 | | #include "pch.h" |
82 | | #include "config.h" |
83 | | |
84 | | #ifndef CRYPTOPP_IMPORTS |
85 | | #ifndef CRYPTOPP_GENERATE_X64_MASM |
86 | | |
87 | | #include "rijndael.h" |
88 | | #include "misc.h" |
89 | | #include "cpu.h" |
90 | | |
91 | | // VS2017 and global optimization bug. Also see |
92 | | // https://github.com/weidai11/cryptopp/issues/649 |
93 | | #if (CRYPTOPP_MSC_VERSION >= 1910) && (CRYPTOPP_MSC_VERSION <= 1916) |
94 | | # ifndef CRYPTOPP_DEBUG |
95 | | # pragma optimize("", off) |
96 | | # pragma optimize("ts", on) |
97 | | # endif |
98 | | #endif |
99 | | |
100 | | NAMESPACE_BEGIN(CryptoPP) |
101 | | |
102 | | // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132 |
103 | | #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) |
104 | | # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1 |
105 | | #endif |
106 | | |
107 | | // Clang intrinsic casts |
108 | | #define M128I_CAST(x) ((__m128i *)(void *)(x)) |
109 | | #define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x)) |
110 | | |
111 | | #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS) |
112 | | # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) |
113 | | namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];} |
114 | | using namespace rdtable; |
115 | | # else |
116 | | static word64 Te[256]; |
117 | | # endif |
118 | | static word64 Td[256]; |
119 | | #else // Not CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS |
120 | | # if defined(CRYPTOPP_X64_MASM_AVAILABLE) |
121 | | // Unused; avoids linker error on Microsoft X64 non-AESNI platforms |
122 | | namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];} |
123 | | # endif |
124 | | CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4]; |
125 | | CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4]; |
126 | | #endif // CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS |
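 | | // Note on the two layouts above: with unaligned access allowed, Te/Td are |
 | | // "compressed" tables holding one 64-bit entry per S-box byte (the two extra |
 | | // entries in rdtable::Te are zeroed for the SSE2/MASM code). Otherwise each |
 | | // direction keeps four full 1 KB rotated tables (word32 [256*4], 4 KB total). |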
127 | | |
128 | | static volatile bool s_TeFilled = false, s_TdFilled = false; |
129 | | |
130 | | ANONYMOUS_NAMESPACE_BEGIN |
131 | | |
132 | | #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 |
133 | | |
134 | | // Determine whether the range between begin and end overlaps |
135 | | // with the same 4k block offsets as the Te table. Logically, |
136 | | // the code is detecting the following condition: |
137 | | // |
138 | | // Two separate memory pages: |
139 | | // |
140 | | // +-----+ +-----+ |
141 | | // |XXXXX| |YYYYY| |
142 | | // |XXXXX| |YYYYY| |
143 | | // | | | | |
144 | | // | | | | |
145 | | // +-----+ +-----+ |
146 | | // Te Table Locals |
147 | | // |
148 | | // Have a logical cache view of (X and Y may be inverted): |
149 | | // |
150 | | // +-----+ |
151 | | // |XXXXX| |
152 | | // |XXXXX| |
153 | | // |YYYYY| |
154 | | // |YYYYY| |
155 | | // +-----+ |
156 | | // |
157 | | static inline bool AliasedWithTable(const byte *begin, const byte *end) |
158 | 0 | { |
159 | 0 | ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096; |
160 | 0 | ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096; |
161 | 0 | if (t1 > t0) |
162 | 0 | return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1); |
163 | 0 | else |
164 | 0 | return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0); |
165 | 0 | } |
166 | | |
167 | | struct Locals |
168 | | { |
169 | | word32 subkeys[4*12], workspace[8]; |
170 | | const byte *inBlocks, *inXorBlocks, *outXorBlocks; |
171 | | byte *outBlocks; |
172 | | size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement; |
173 | | size_t regSpill, lengthAndCounterFlag, keysBegin; |
174 | | }; |
175 | | |
176 | | const size_t s_aliasPageSize = 4096; |
177 | | const size_t s_aliasBlockSize = 256; |
178 | | const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals); |
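 | | // The alias block is over-allocated by a page plus a block so that |
 | | // AdvancedProcessBlocks can slide the Locals struct to a 256-byte boundary |
 | | // whose 4 KB page offset does not collide with the Te table (see AliasedWithTable). |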
179 | | |
180 | | #endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 |
181 | | |
182 | | ANONYMOUS_NAMESPACE_END |
183 | | |
184 | | // ************************* Portable Code ************************************ |
185 | | |
186 | | #define QUARTER_ROUND(L, T, t, a, b, c, d) \ |
187 | 0 | a ^= L(T, 3, byte(t)); t >>= 8;\ |
188 | 0 | b ^= L(T, 2, byte(t)); t >>= 8;\ |
189 | 0 | c ^= L(T, 1, byte(t)); t >>= 8;\ |
190 | 0 | d ^= L(T, 0, t); |
191 | | |
192 | | #define QUARTER_ROUND_LE(t, a, b, c, d) \ |
193 | 0 | tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\ |
194 | 0 | tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\ |
195 | 0 | tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\ |
196 | 0 | tempBlock[d] = ((byte *)(Te+t))[1]; |
197 | | |
198 | | #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS) |
199 | | #define QUARTER_ROUND_LD(t, a, b, c, d) \ |
200 | 0 | tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\ |
201 | 0 | tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\ |
202 | 0 | tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\ |
203 | 0 | tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7]; |
204 | | #else |
205 | | #define QUARTER_ROUND_LD(t, a, b, c, d) \ |
206 | | tempBlock[a] = Sd[byte(t)]; t >>= 8;\ |
207 | | tempBlock[b] = Sd[byte(t)]; t >>= 8;\ |
208 | | tempBlock[c] = Sd[byte(t)]; t >>= 8;\ |
209 | | tempBlock[d] = Sd[t]; |
210 | | #endif |
211 | | |
212 | 0 | #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d) |
213 | 0 | #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d) |
214 | | |
215 | | #if (CRYPTOPP_LITTLE_ENDIAN) |
216 | 0 | #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a) |
217 | 0 | #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a) |
218 | | #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS) |
219 | 0 | #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1)) |
220 | 0 | #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1)) |
221 | | #else |
222 | | #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8) |
223 | | #define TL_M(T, i, x) T[i*256 + x] |
224 | | #endif |
225 | | #else |
226 | | #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d) |
227 | | #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d) |
228 | | #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS) |
229 | | #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4)) |
230 | | #define TL_M TL_F |
231 | | #else |
232 | | #define TL_F(T, i, x) rotrFixed(T[x], i*8) |
233 | | #define TL_M(T, i, x) T[i*256 + x] |
234 | | #endif |
235 | | #endif |
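 | | // Usage of the macros above: QUARTER_ROUND_FE/FD (via TL_F) perform the first |
 | | // round, compensating for the byte order of the freshly loaded state words; |
 | | // QUARTER_ROUND_E/D (via TL_M) perform the middle rounds; QUARTER_ROUND_LE/LD |
 | | // assemble the last round one byte at a time, reading plain S-box bytes out of |
 | | // Te/Td when compressed tables are in use (or out of Sd otherwise). |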
236 | | |
237 | | |
238 | 0 | #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) |
239 | 0 | #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) |
240 | 0 | #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) |
241 | | |
242 | 0 | #define f3(x) (f2(x) ^ x) |
243 | 0 | #define f9(x) (f8(x) ^ x) |
244 | 0 | #define fb(x) (f8(x) ^ f2(x) ^ x) |
245 | 0 | #define fd(x) (f8(x) ^ f4(x) ^ x) |
246 | 0 | #define fe(x) (f8(x) ^ f4(x) ^ f2(x)) |
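 | | // The macros above multiply a byte by 2, 4, 8, 3, 9, 0x0b, 0x0d and 0x0e in |
 | | // GF(2^8) modulo the AES polynomial x^8+x^4+x^3+x+1 (0x11b). They provide the |
 | | // MixColumns {02,03,01,01} and InvMixColumns {0e,0b,0d,09} coefficients used |
 | | // to build the Te and Td tables below. |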
247 | | |
248 | | unsigned int Rijndael::Base::OptimalDataAlignment() const |
249 | 183 | { |
250 | 183 | #if (CRYPTOPP_AESNI_AVAILABLE) |
251 | 183 | if (HasAESNI()) |
252 | 183 | return 16; // load __m128i |
253 | 0 | #endif |
254 | | #if (CRYPTOPP_ARM_AES_AVAILABLE) |
255 | | if (HasAES()) |
256 | | return 4; // load uint32x4_t |
257 | | #endif |
258 | | #if (CRYPTOGAMS_ARM_AES) |
259 | | // Must use 1 here for Cryptogams AES. Also see |
260 | | // https://github.com/weidai11/cryptopp/issues/683 |
261 | | if (HasARMv7()) |
262 | | return 1; |
263 | | #endif |
264 | | #if (CRYPTOPP_POWER8_AES_AVAILABLE) |
265 | | if (HasAES()) |
266 | | return 16; // load uint32x4_p |
267 | | #endif |
268 | 0 | return BlockTransformation::OptimalDataAlignment(); |
269 | 183 | } |
270 | | |
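 | | // FillEncTable/FillDecTable build the T-tables from Se/Sd at runtime. In the |
 | | // compressed layout each 64-bit Te entry holds the byte sequence |
 | | // {0, S, S, 2S, 3S, S, S, 2S} (little-endian), so an unaligned 4-byte read at |
 | | // offsets 1..4 yields all four rotations of (S, S, 2S, 3S) that would otherwise |
 | | // require four separate 1 KB tables; Td is packed analogously with the |
 | | // InvMixColumns products plus Sd in the low byte. |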
271 | | void Rijndael::Base::FillEncTable() |
272 | 0 | { |
273 | 0 | for (int i=0; i<256; i++) |
274 | 0 | { |
275 | 0 | byte x = Se[i]; |
276 | 0 | #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS) |
277 | 0 | word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24; |
278 | 0 | Te[i] = word64(y | f3(x))<<32 | y; |
279 | | #else |
280 | | word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24; |
281 | | for (int j=0; j<4; j++) |
282 | | { |
283 | | Te[i+j*256] = y; |
284 | | y = rotrConstant<8>(y); |
285 | | } |
286 | | #endif |
287 | 0 | } |
288 | | #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) |
289 | | Te[256] = Te[257] = 0; |
290 | | #endif |
291 | 0 | s_TeFilled = true; |
292 | 0 | } |
293 | | |
294 | | void Rijndael::Base::FillDecTable() |
295 | 0 | { |
296 | 0 | for (int i=0; i<256; i++) |
297 | 0 | { |
298 | 0 | byte x = Sd[i]; |
299 | 0 | #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS) |
300 | 0 | word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24; |
301 | 0 | Td[i] = word64(y | fb(x))<<32 | y | x; |
302 | | #else |
303 | | word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24; |
304 | | for (int j=0; j<4; j++) |
305 | | { |
306 | | Td[i+j*256] = y; |
307 | | y = rotrConstant<8>(y); |
308 | | } |
309 | | #endif |
310 | 0 | } |
311 | 0 | s_TdFilled = true; |
312 | 0 | } |
313 | | |
314 | | #if (CRYPTOPP_AESNI_AVAILABLE) |
315 | | extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk); |
316 | | extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds); |
317 | | |
318 | | extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds, |
319 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); |
320 | | extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds, |
321 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); |
322 | | #endif |
323 | | |
324 | | #if (CRYPTOPP_ARM_AES_AVAILABLE) |
325 | | extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds, |
326 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); |
327 | | extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds, |
328 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); |
329 | | #endif |
330 | | |
331 | | #if (CRYPTOGAMS_ARM_AES) |
332 | | extern "C" int cryptogams_AES_set_encrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey); |
333 | | extern "C" int cryptogams_AES_set_decrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey); |
334 | | extern "C" void cryptogams_AES_encrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey); |
335 | | extern "C" void cryptogams_AES_decrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey); |
336 | | #endif |
337 | | |
338 | | #if (CRYPTOPP_POWER8_AES_AVAILABLE) |
339 | | extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, |
340 | | word32* rk, const byte* Se); |
341 | | |
342 | | extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds, |
343 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); |
344 | | extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds, |
345 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); |
346 | | #endif |
347 | | |
348 | | #if (CRYPTOGAMS_ARM_AES) |
349 | | int CRYPTOGAMS_set_encrypt_key(const byte *userKey, const int bitLen, word32 *rkey) |
350 | | { |
351 | | return cryptogams_AES_set_encrypt_key(userKey, bitLen, rkey); |
352 | | } |
353 | | int CRYPTOGAMS_set_decrypt_key(const byte *userKey, const int bitLen, word32 *rkey) |
354 | | { |
355 | | return cryptogams_AES_set_decrypt_key(userKey, bitLen, rkey); |
356 | | } |
357 | | void CRYPTOGAMS_encrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey) |
358 | | { |
359 | | cryptogams_AES_encrypt_block(inBlock, outBlock, rkey); |
360 | | if (xorBlock) |
361 | | xorbuf (outBlock, xorBlock, 16); |
362 | | } |
363 | | void CRYPTOGAMS_decrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey) |
364 | | { |
365 | | cryptogams_AES_decrypt_block(inBlock, outBlock, rkey); |
366 | | if (xorBlock) |
367 | | xorbuf (outBlock, xorBlock, 16); |
368 | | } |
369 | | #endif |
370 | | |
371 | | std::string Rijndael::Base::AlgorithmProvider() const |
372 | 0 | { |
373 | 0 | #if (CRYPTOPP_AESNI_AVAILABLE) |
374 | 0 | if (HasAESNI()) |
375 | 0 | return "AESNI"; |
376 | 0 | #endif |
377 | | #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) |
378 | | if (HasSSE2()) |
379 | | return "SSE2"; |
380 | | #endif |
381 | | #if (CRYPTOPP_ARM_AES_AVAILABLE) |
382 | | if (HasAES()) |
383 | | return "ARMv8"; |
384 | | #endif |
385 | | #if (CRYPTOGAMS_ARM_AES) |
386 | | if (HasARMv7()) |
387 | | return "ARMv7"; |
388 | | #endif |
389 | | #if (CRYPTOPP_POWER8_AES_AVAILABLE) |
390 | | if (HasAES()) |
391 | | return "Power8"; |
392 | | #endif |
393 | 0 | return "C++"; |
394 | 0 | } |
395 | | |
396 | | void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &) |
397 | 169 | { |
398 | 169 | AssertValidKeyLength(keyLen); |
399 | | |
400 | | #if (CRYPTOGAMS_ARM_AES) |
401 | | if (HasARMv7()) |
402 | | { |
403 | | m_rounds = keyLen/4 + 6; |
404 | | m_key.New(4*(14+1)+4); |
405 | | |
406 | | if (IsForwardTransformation()) |
407 | | CRYPTOGAMS_set_encrypt_key(userKey, keyLen*8, m_key.begin()); |
408 | | else |
409 | | CRYPTOGAMS_set_decrypt_key(userKey, keyLen*8, m_key.begin()); |
410 | | return; |
411 | | } |
412 | | #endif |
413 | | |
414 | 169 | #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 |
415 | 169 | m_aliasBlock.New(s_sizeToAllocate); |
416 | | // The alias block is only used on IA-32 when unaligned data access is in effect. |
417 | | // Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused. |
418 | 169 | m_aliasBlock.SetMark(0); |
419 | 169 | #endif |
420 | | |
421 | 169 | m_rounds = keyLen/4 + 6; |
422 | 169 | m_key.New(4*(m_rounds+1)); |
423 | 169 | word32 *rk = m_key; |
424 | | |
425 | 169 | #if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(CRYPTOPP_MSC_VERSION) || CRYPTOPP_MSC_VERSION >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)) |
426 | | // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64 |
427 | 169 | if (HasAESNI() && HasSSE41()) |
428 | 169 | { |
429 | | // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end |
430 | | // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2. |
431 | 169 | Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk); |
432 | 169 | if (!IsForwardTransformation()) |
433 | 21 | Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds); |
434 | | |
435 | 169 | return; |
436 | 169 | } |
437 | 0 | #endif |
438 | | |
439 | | #if CRYPTOPP_POWER8_AES_AVAILABLE |
440 | | if (HasAES()) |
441 | | { |
442 | | // We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256. |
443 | | // The IBM docs on AES suck. Intel's docs on AESNI put IBM to shame. |
444 | | Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, Se); |
445 | | return; |
446 | | } |
447 | | #endif |
448 | | |
449 | 0 | GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen); |
450 | 0 | const word32 *rc = rcon; |
451 | 0 | word32 temp; |
452 | |
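 | | // C/C++ fallback key schedule (FIPS 197, Section 5.2): each pass expands one |
 | | // key-length block of round keys; the last word is rotated, run through the |
 | | // S-box and XORed with the next round constant before being folded into the |
 | | // preceding words. |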
453 | 0 | while (true) |
454 | 0 | { |
455 | 0 | temp = rk[keyLen/4-1]; |
456 | 0 | word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ |
457 | 0 | (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)]; |
458 | 0 | rk[keyLen/4] = rk[0] ^ x ^ *(rc++); |
459 | 0 | rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4]; |
460 | 0 | rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1]; |
461 | 0 | rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2]; |
462 | |
463 | 0 | if (rk + keyLen/4 + 4 == m_key.end()) |
464 | 0 | break; |
465 | | |
466 | 0 | if (keyLen == 24) |
467 | 0 | { |
468 | 0 | rk[10] = rk[ 4] ^ rk[ 9]; |
469 | 0 | rk[11] = rk[ 5] ^ rk[10]; |
470 | 0 | } |
471 | 0 | else if (keyLen == 32) |
472 | 0 | { |
473 | 0 | temp = rk[11]; |
474 | 0 | rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)]; |
475 | 0 | rk[13] = rk[ 5] ^ rk[12]; |
476 | 0 | rk[14] = rk[ 6] ^ rk[13]; |
477 | 0 | rk[15] = rk[ 7] ^ rk[14]; |
478 | 0 | } |
479 | 0 | rk += keyLen/4; |
480 | 0 | } |
481 | |
482 | 0 | rk = m_key; |
483 | |
484 | 0 | if (IsForwardTransformation()) |
485 | 0 | { |
486 | 0 | if (!s_TeFilled) |
487 | 0 | FillEncTable(); |
488 | |
489 | 0 | ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16); |
490 | 0 | ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16); |
491 | 0 | } |
492 | 0 | else |
493 | 0 | { |
494 | 0 | if (!s_TdFilled) |
495 | 0 | FillDecTable(); |
496 | |
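 | | // For decryption the schedule is converted to the equivalent-inverse-cipher |
 | | // form: the round keys are swapped end-for-end and InvMixColumns is applied to |
 | | // every round key except the first and last, so the middle rounds can use Td. |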
497 | 0 | #define InverseMixColumn(x) \ |
498 | 0 | TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \ |
499 | 0 | TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)]) |
500 | |
501 | 0 | unsigned int i, j; |
502 | 0 | for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4) |
503 | 0 | { |
504 | 0 | temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp; |
505 | 0 | temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp; |
506 | 0 | temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp; |
507 | 0 | temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp; |
508 | 0 | } |
509 | |
510 | 0 | rk[i+0] = InverseMixColumn(rk[i+0]); |
511 | 0 | rk[i+1] = InverseMixColumn(rk[i+1]); |
512 | 0 | rk[i+2] = InverseMixColumn(rk[i+2]); |
513 | 0 | rk[i+3] = InverseMixColumn(rk[i+3]); |
514 | |
515 | 0 | temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp; |
516 | 0 | temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp; |
517 | 0 | temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp; |
518 | 0 | temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp; |
519 | 0 | } |
520 | |
521 | 0 | #if CRYPTOPP_AESNI_AVAILABLE |
522 | 0 | if (HasAESNI()) |
523 | 0 | ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16); |
524 | 0 | #endif |
525 | | #if CRYPTOPP_ARM_AES_AVAILABLE |
526 | | if (HasAES()) |
527 | | ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16); |
528 | | #endif |
529 | 0 | } |
530 | | |
531 | | void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const |
532 | 209 | { |
533 | 209 | #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE |
534 | | # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) |
535 | | if (HasSSE2()) |
536 | | # else |
537 | 209 | if (HasAESNI()) |
538 | 209 | # endif |
539 | 209 | { |
540 | 209 | (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); |
541 | 209 | return; |
542 | 209 | } |
543 | 0 | #endif |
544 | | |
545 | | #if (CRYPTOPP_ARM_AES_AVAILABLE) |
546 | | if (HasAES()) |
547 | | { |
548 | | (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); |
549 | | return; |
550 | | } |
551 | | #endif |
552 | | |
553 | | #if (CRYPTOGAMS_ARM_AES) |
554 | | if (HasARMv7()) |
555 | | { |
556 | | CRYPTOGAMS_encrypt(inBlock, xorBlock, outBlock, m_key.begin()); |
557 | | return; |
558 | | } |
559 | | #endif |
560 | | |
561 | | #if (CRYPTOPP_POWER8_AES_AVAILABLE) |
562 | | if (HasAES()) |
563 | | { |
564 | | (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); |
565 | | return; |
566 | | } |
567 | | #endif |
568 | | |
569 | 0 | typedef BlockGetAndPut<word32, NativeByteOrder> Block; |
570 | |
571 | 0 | word32 s0, s1, s2, s3, t0, t1, t2, t3; |
572 | 0 | Block::Get(inBlock)(s0)(s1)(s2)(s3); |
573 | |
574 | 0 | const word32 *rk = m_key; |
575 | 0 | s0 ^= rk[0]; |
576 | 0 | s1 ^= rk[1]; |
577 | 0 | s2 ^= rk[2]; |
578 | 0 | s3 ^= rk[3]; |
579 | 0 | t0 = rk[4]; |
580 | 0 | t1 = rk[5]; |
581 | 0 | t2 = rk[6]; |
582 | 0 | t3 = rk[7]; |
583 | 0 | rk += 8; |
584 | | |
585 | | // timing attack countermeasure. see comments at top for more details. |
586 | | // also see http://github.com/weidai11/cryptopp/issues/146 |
587 | 0 | const int cacheLineSize = GetCacheLineSize(); |
588 | 0 | unsigned int i; |
589 | 0 | volatile word32 _u = 0; |
590 | 0 | word32 u = _u; |
591 | 0 | #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS) |
592 | 0 | for (i=0; i<2048; i+=cacheLineSize) |
593 | | #else |
594 | | for (i=0; i<1024; i+=cacheLineSize) |
595 | | #endif |
596 | 0 | u &= *(const word32 *)(const void *)(((const byte *)Te)+i); |
597 | 0 | u &= Te[255]; |
598 | 0 | s0 |= u; s1 |= u; s2 |= u; s3 |= u; |
599 | |
600 | 0 | QUARTER_ROUND_FE(s3, t0, t1, t2, t3) |
601 | 0 | QUARTER_ROUND_FE(s2, t3, t0, t1, t2) |
602 | 0 | QUARTER_ROUND_FE(s1, t2, t3, t0, t1) |
603 | 0 | QUARTER_ROUND_FE(s0, t1, t2, t3, t0) |
604 | | |
605 | | // Nr - 2 full rounds: |
606 | 0 | unsigned int r = m_rounds/2 - 1; |
607 | 0 | do |
608 | 0 | { |
609 | 0 | s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; |
610 | |
611 | 0 | QUARTER_ROUND_E(t3, s0, s1, s2, s3) |
612 | 0 | QUARTER_ROUND_E(t2, s3, s0, s1, s2) |
613 | 0 | QUARTER_ROUND_E(t1, s2, s3, s0, s1) |
614 | 0 | QUARTER_ROUND_E(t0, s1, s2, s3, s0) |
615 | |
616 | 0 | t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; |
617 | |
618 | 0 | QUARTER_ROUND_E(s3, t0, t1, t2, t3) |
619 | 0 | QUARTER_ROUND_E(s2, t3, t0, t1, t2) |
620 | 0 | QUARTER_ROUND_E(s1, t2, t3, t0, t1) |
621 | 0 | QUARTER_ROUND_E(s0, t1, t2, t3, t0) |
622 | |
623 | 0 | rk += 8; |
624 | 0 | } while (--r); |
625 | |
626 | 0 | word32 tbw[4]; |
627 | 0 | byte *const tempBlock = (byte *)tbw; |
628 | |
629 | 0 | QUARTER_ROUND_LE(t2, 15, 2, 5, 8) |
630 | 0 | QUARTER_ROUND_LE(t1, 11, 14, 1, 4) |
631 | 0 | QUARTER_ROUND_LE(t0, 7, 10, 13, 0) |
632 | 0 | QUARTER_ROUND_LE(t3, 3, 6, 9, 12) |
633 | |
634 | 0 | Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]); |
635 | 0 | } |
636 | | |
637 | | void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const |
638 | 6 | { |
639 | 6 | #if CRYPTOPP_AESNI_AVAILABLE |
640 | 6 | if (HasAESNI()) |
641 | 6 | { |
642 | 6 | (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); |
643 | 6 | return; |
644 | 6 | } |
645 | 0 | #endif |
646 | | |
647 | | #if (CRYPTOPP_ARM_AES_AVAILABLE) |
648 | | if (HasAES()) |
649 | | { |
650 | | (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); |
651 | | return; |
652 | | } |
653 | | #endif |
654 | | |
655 | | #if (CRYPTOGAMS_ARM_AES) |
656 | | if (HasARMv7()) |
657 | | { |
658 | | CRYPTOGAMS_decrypt(inBlock, xorBlock, outBlock, m_key.begin()); |
659 | | return; |
660 | | } |
661 | | #endif |
662 | | |
663 | | #if (CRYPTOPP_POWER8_AES_AVAILABLE) |
664 | | if (HasAES()) |
665 | | { |
666 | | (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); |
667 | | return; |
668 | | } |
669 | | #endif |
670 | | |
671 | 0 | typedef BlockGetAndPut<word32, NativeByteOrder> Block; |
672 | |
673 | 0 | word32 s0, s1, s2, s3, t0, t1, t2, t3; |
674 | 0 | Block::Get(inBlock)(s0)(s1)(s2)(s3); |
675 | |
676 | 0 | const word32 *rk = m_key; |
677 | 0 | s0 ^= rk[0]; |
678 | 0 | s1 ^= rk[1]; |
679 | 0 | s2 ^= rk[2]; |
680 | 0 | s3 ^= rk[3]; |
681 | 0 | t0 = rk[4]; |
682 | 0 | t1 = rk[5]; |
683 | 0 | t2 = rk[6]; |
684 | 0 | t3 = rk[7]; |
685 | 0 | rk += 8; |
686 | | |
687 | | // timing attack countermeasure. see comments at top for more details. |
688 | | // also see http://github.com/weidai11/cryptopp/issues/146 |
689 | 0 | const int cacheLineSize = GetCacheLineSize(); |
690 | 0 | unsigned int i; |
691 | 0 | volatile word32 _u = 0; |
692 | 0 | word32 u = _u; |
693 | 0 | #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS) |
694 | 0 | for (i=0; i<2048; i+=cacheLineSize) |
695 | | #else |
696 | | for (i=0; i<1024; i+=cacheLineSize) |
697 | | #endif |
698 | 0 | u &= *(const word32 *)(const void *)(((const byte *)Td)+i); |
699 | 0 | u &= Td[255]; |
700 | 0 | s0 |= u; s1 |= u; s2 |= u; s3 |= u; |
701 | |
702 | 0 | QUARTER_ROUND_FD(s3, t2, t1, t0, t3) |
703 | 0 | QUARTER_ROUND_FD(s2, t1, t0, t3, t2) |
704 | 0 | QUARTER_ROUND_FD(s1, t0, t3, t2, t1) |
705 | 0 | QUARTER_ROUND_FD(s0, t3, t2, t1, t0) |
706 | | |
707 | | // Nr - 2 full rounds: |
708 | 0 | unsigned int r = m_rounds/2 - 1; |
709 | 0 | do |
710 | 0 | { |
711 | 0 | s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; |
712 | |
713 | 0 | QUARTER_ROUND_D(t3, s2, s1, s0, s3) |
714 | 0 | QUARTER_ROUND_D(t2, s1, s0, s3, s2) |
715 | 0 | QUARTER_ROUND_D(t1, s0, s3, s2, s1) |
716 | 0 | QUARTER_ROUND_D(t0, s3, s2, s1, s0) |
717 | |
718 | 0 | t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; |
719 | |
|
720 | 0 | QUARTER_ROUND_D(s3, t2, t1, t0, t3) |
721 | 0 | QUARTER_ROUND_D(s2, t1, t0, t3, t2) |
722 | 0 | QUARTER_ROUND_D(s1, t0, t3, t2, t1) |
723 | 0 | QUARTER_ROUND_D(s0, t3, t2, t1, t0) |
724 | |
725 | 0 | rk += 8; |
726 | 0 | } while (--r); |
727 | |
728 | | #if !(defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)) |
729 | | // timing attack countermeasure. see comments at top for more details |
730 | | // If CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined, |
731 | | // QUARTER_ROUND_LD will use Td, which is already preloaded. |
732 | | u = _u; |
733 | | for (i=0; i<256; i+=cacheLineSize) |
734 | | u &= *(const word32 *)(const void *)(Sd+i); |
735 | | u &= *(const word32 *)(const void *)(Sd+252); |
736 | | t0 |= u; t1 |= u; t2 |= u; t3 |= u; |
737 | | #endif |
738 | |
739 | 0 | word32 tbw[4]; |
740 | 0 | byte *const tempBlock = (byte *)tbw; |
741 | |
742 | 0 | QUARTER_ROUND_LD(t2, 7, 2, 13, 8) |
743 | 0 | QUARTER_ROUND_LD(t1, 3, 14, 9, 4) |
744 | 0 | QUARTER_ROUND_LD(t0, 15, 10, 5, 0) |
745 | 0 | QUARTER_ROUND_LD(t3, 11, 6, 1, 12) |
746 | |
747 | 0 | Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]); |
748 | 0 | } |
749 | | |
750 | | // ************************* Assembly Code ************************************ |
751 | | |
752 | | #if CRYPTOPP_MSC_VERSION |
753 | | # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code |
754 | | #endif |
755 | | |
756 | | #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM |
757 | | |
758 | | #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) |
759 | | |
760 | | CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k) |
761 | | { |
762 | | CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k); |
763 | | |
764 | | #if CRYPTOPP_BOOL_X86 |
765 | | |
766 | | #define L_REG esp |
767 | | #define L_INDEX(i) (L_REG+768+i) |
768 | | #define L_INXORBLOCKS L_INBLOCKS+4 |
769 | | #define L_OUTXORBLOCKS L_INBLOCKS+8 |
770 | | #define L_OUTBLOCKS L_INBLOCKS+12 |
771 | | #define L_INCREMENTS L_INDEX(16*15) |
772 | | #define L_SP L_INDEX(16*16) |
773 | | #define L_LENGTH L_INDEX(16*16+4) |
774 | | #define L_KEYS_BEGIN L_INDEX(16*16+8) |
775 | | |
776 | | #define MOVD movd |
777 | | #define MM(i) mm##i |
778 | | |
779 | | #define MXOR(a,b,c) \ |
780 | | AS2( movzx esi, b)\ |
781 | | AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ |
782 | | AS2( pxor MM(a), mm7)\ |
783 | | |
784 | | #define MMOV(a,b,c) \ |
785 | | AS2( movzx esi, b)\ |
786 | | AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ |
787 | | |
788 | | #else |
789 | | |
790 | | #define L_REG r8 |
791 | | #define L_INDEX(i) (L_REG+i) |
792 | | #define L_INXORBLOCKS L_INBLOCKS+8 |
793 | | #define L_OUTXORBLOCKS L_INBLOCKS+16 |
794 | | #define L_OUTBLOCKS L_INBLOCKS+24 |
795 | | #define L_INCREMENTS L_INDEX(16*16) |
796 | | #define L_LENGTH L_INDEX(16*18+8) |
797 | | #define L_KEYS_BEGIN L_INDEX(16*19) |
798 | | |
799 | | #define MOVD mov |
800 | | #define MM_0 r9d |
801 | | #define MM_1 r12d |
802 | | #ifdef __GNUC__ |
803 | | #define MM_2 r11d |
804 | | #else |
805 | | #define MM_2 r10d |
806 | | #endif |
807 | | #define MM(i) MM_##i |
808 | | |
809 | | #define MXOR(a,b,c) \ |
810 | | AS2( movzx esi, b)\ |
811 | | AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ |
812 | | |
813 | | #define MMOV(a,b,c) \ |
814 | | AS2( movzx esi, b)\ |
815 | | AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ |
816 | | |
817 | | #endif |
818 | | |
819 | | #define L_SUBKEYS L_INDEX(0) |
820 | | #define L_SAVED_X L_SUBKEYS |
821 | | #define L_KEY12 L_INDEX(16*12) |
822 | | #define L_LASTROUND L_INDEX(16*13) |
823 | | #define L_INBLOCKS L_INDEX(16*14) |
824 | | #define MAP0TO4(i) (ASM_MOD(i+3,4)+1) |
825 | | |
826 | | #define XOR(a,b,c) \ |
827 | | AS2( movzx esi, b)\ |
828 | | AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ |
829 | | |
830 | | #define MOV(a,b,c) \ |
831 | | AS2( movzx esi, b)\ |
832 | | AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ |
833 | | |
834 | | #ifdef CRYPTOPP_GENERATE_X64_MASM |
835 | | ALIGN 8 |
836 | | Rijndael_Enc_AdvancedProcessBlocks PROC FRAME |
837 | | rex_push_reg rsi |
838 | | push_reg rdi |
839 | | push_reg rbx |
840 | | push_reg r12 |
841 | | .endprolog |
842 | | mov L_REG, rcx |
843 | | mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA |
844 | | mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA] |
845 | | #elif defined(__GNUC__) |
846 | | __asm__ __volatile__ |
847 | | ( |
848 | | INTEL_NOPREFIX |
849 | | #if CRYPTOPP_BOOL_X64 |
850 | | AS2( mov L_REG, rcx) |
851 | | #endif |
852 | | AS_PUSH_IF86(bx) |
853 | | AS_PUSH_IF86(bp) |
854 | | AS2( mov AS_REG_7, WORD_REG(si)) |
855 | | #else |
856 | | AS_PUSH_IF86(si) |
857 | | AS_PUSH_IF86(di) |
858 | | AS_PUSH_IF86(bx) |
859 | | AS_PUSH_IF86(bp) |
860 | | AS2( lea AS_REG_7, [Te]) |
861 | | AS2( mov edi, [g_cacheLineSize]) |
862 | | #endif |
863 | | |
864 | | #if CRYPTOPP_BOOL_X86 |
865 | | AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP |
866 | | AS2( lea esp, [ecx-768]) |
867 | | #endif |
868 | | |
869 | | // copy subkeys to stack |
870 | | AS2( mov WORD_REG(si), [L_KEYS_BEGIN]) |
871 | | AS2( mov WORD_REG(ax), 16) |
872 | | AS2( and WORD_REG(ax), WORD_REG(si)) |
873 | | AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter) |
874 | | AS2( movdqa [L_KEY12], xmm3) |
875 | | AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16]) |
876 | | AS2( sub WORD_REG(ax), WORD_REG(si)) |
877 | | ASL(0) |
878 | | AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)]) |
879 | | AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0) |
880 | | AS2( add WORD_REG(si), 16) |
881 | | AS2( cmp WORD_REG(si), 16*12) |
882 | | ATT_NOPREFIX |
883 | | ASJ( jl, 0, b) |
884 | | INTEL_NOPREFIX |
885 | | |
886 | | // read subkeys 0, 1 and last |
887 | | AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey |
888 | | AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0 |
889 | | AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3 |
890 | | AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7 |
891 | | AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11 |
892 | | AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15 |
893 | | |
894 | | // load table into cache |
895 | | AS2( xor WORD_REG(ax), WORD_REG(ax)) |
896 | | ASL(9) |
897 | | AS2( mov esi, [AS_REG_7+WORD_REG(ax)]) |
898 | | AS2( add WORD_REG(ax), WORD_REG(di)) |
899 | | AS2( mov esi, [AS_REG_7+WORD_REG(ax)]) |
900 | | AS2( add WORD_REG(ax), WORD_REG(di)) |
901 | | AS2( mov esi, [AS_REG_7+WORD_REG(ax)]) |
902 | | AS2( add WORD_REG(ax), WORD_REG(di)) |
903 | | AS2( mov esi, [AS_REG_7+WORD_REG(ax)]) |
904 | | AS2( add WORD_REG(ax), WORD_REG(di)) |
905 | | AS2( cmp WORD_REG(ax), 2048) |
906 | | ATT_NOPREFIX |
907 | | ASJ( jl, 9, b) |
908 | | INTEL_NOPREFIX |
909 | | AS1( lfence) |
910 | | |
911 | | AS2( test DWORD PTR [L_LENGTH], 1) |
912 | | ATT_NOPREFIX |
913 | | ASJ( jz, 8, f) |
914 | | INTEL_NOPREFIX |
915 | | |
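 | | // Counter-mode caching: successive CTR inputs differ only in the low counter |
 | | // bytes, so the first round and most of the second are computed once here and |
 | | // saved in L_SAVED_X; the per-block path only redoes the work that depends on |
 | | // the incrementing counter byte. |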
916 | | // counter mode one-time setup |
917 | | AS2( mov WORD_REG(si), [L_INBLOCKS]) |
918 | | AS2( movdqu xmm2, [WORD_REG(si)]) // counter |
919 | | AS2( pxor xmm2, xmm1) |
920 | | AS2( psrldq xmm1, 14) |
921 | | AS2( movd eax, xmm1) |
922 | | AS2( mov al, BYTE PTR [WORD_REG(si)+15]) |
923 | | AS2( MOVD MM(2), eax) |
924 | | #if CRYPTOPP_BOOL_X86 |
925 | | AS2( mov eax, 1) |
926 | | AS2( movd mm3, eax) |
927 | | #endif |
928 | | |
929 | | // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx |
930 | | AS2( movd eax, xmm2) |
931 | | AS2( psrldq xmm2, 4) |
932 | | AS2( movd edi, xmm2) |
933 | | AS2( psrldq xmm2, 4) |
934 | | MXOR( 1, al, 0) // 0 |
935 | | XOR( edx, ah, 1) // 1 |
936 | | AS2( shr eax, 16) |
937 | | XOR( ecx, al, 2) // 2 |
938 | | XOR( ebx, ah, 3) // 3 |
939 | | AS2( mov eax, edi) |
940 | | AS2( movd edi, xmm2) |
941 | | AS2( psrldq xmm2, 4) |
942 | | XOR( ebx, al, 0) // 4 |
943 | | MXOR( 1, ah, 1) // 5 |
944 | | AS2( shr eax, 16) |
945 | | XOR( edx, al, 2) // 6 |
946 | | XOR( ecx, ah, 3) // 7 |
947 | | AS2( mov eax, edi) |
948 | | AS2( movd edi, xmm2) |
949 | | XOR( ecx, al, 0) // 8 |
950 | | XOR( ebx, ah, 1) // 9 |
951 | | AS2( shr eax, 16) |
952 | | MXOR( 1, al, 2) // 10 |
953 | | XOR( edx, ah, 3) // 11 |
954 | | AS2( mov eax, edi) |
955 | | XOR( edx, al, 0) // 12 |
956 | | XOR( ecx, ah, 1) // 13 |
957 | | AS2( shr eax, 16) |
958 | | XOR( ebx, al, 2) // 14 |
959 | | AS2( psrldq xmm2, 3) |
960 | | |
961 | | // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0 |
962 | | AS2( mov eax, [L_KEY12+0*4]) |
963 | | AS2( mov edi, [L_KEY12+2*4]) |
964 | | AS2( MOVD MM(0), [L_KEY12+3*4]) |
965 | | MXOR( 0, cl, 3) /* 11 */ |
966 | | XOR( edi, bl, 3) /* 7 */ |
967 | | MXOR( 0, bh, 2) /* 6 */ |
968 | | AS2( shr ebx, 16) /* 4,5 */ |
969 | | XOR( eax, bl, 1) /* 5 */ |
970 | | MOV( ebx, bh, 0) /* 4 */ |
971 | | AS2( xor ebx, [L_KEY12+1*4]) |
972 | | XOR( eax, ch, 2) /* 10 */ |
973 | | AS2( shr ecx, 16) /* 8,9 */ |
974 | | XOR( eax, dl, 3) /* 15 */ |
975 | | XOR( ebx, dh, 2) /* 14 */ |
976 | | AS2( shr edx, 16) /* 12,13 */ |
977 | | XOR( edi, ch, 0) /* 8 */ |
978 | | XOR( ebx, cl, 1) /* 9 */ |
979 | | XOR( edi, dl, 1) /* 13 */ |
980 | | MXOR( 0, dh, 0) /* 12 */ |
981 | | |
982 | | AS2( movd ecx, xmm2) |
983 | | AS2( MOVD edx, MM(1)) |
984 | | AS2( MOVD [L_SAVED_X+3*4], MM(0)) |
985 | | AS2( mov [L_SAVED_X+0*4], eax) |
986 | | AS2( mov [L_SAVED_X+1*4], ebx) |
987 | | AS2( mov [L_SAVED_X+2*4], edi) |
988 | | ATT_NOPREFIX |
989 | | ASJ( jmp, 5, f) |
990 | | INTEL_NOPREFIX |
991 | | ASL(3) |
992 | | // non-counter mode per-block setup |
993 | | AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3 |
994 | | AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7 |
995 | | AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11 |
996 | | AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15 |
997 | | ASL(8) |
998 | | AS2( mov WORD_REG(ax), [L_INBLOCKS]) |
999 | | AS2( movdqu xmm2, [WORD_REG(ax)]) |
1000 | | AS2( mov WORD_REG(si), [L_INXORBLOCKS]) |
1001 | | AS2( movdqu xmm5, [WORD_REG(si)]) |
1002 | | AS2( pxor xmm2, xmm1) |
1003 | | AS2( pxor xmm2, xmm5) |
1004 | | |
1005 | | // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx |
1006 | | AS2( movd eax, xmm2) |
1007 | | AS2( psrldq xmm2, 4) |
1008 | | AS2( movd edi, xmm2) |
1009 | | AS2( psrldq xmm2, 4) |
1010 | | MXOR( 1, al, 0) // 0 |
1011 | | XOR( edx, ah, 1) // 1 |
1012 | | AS2( shr eax, 16) |
1013 | | XOR( ecx, al, 2) // 2 |
1014 | | XOR( ebx, ah, 3) // 3 |
1015 | | AS2( mov eax, edi) |
1016 | | AS2( movd edi, xmm2) |
1017 | | AS2( psrldq xmm2, 4) |
1018 | | XOR( ebx, al, 0) // 4 |
1019 | | MXOR( 1, ah, 1) // 5 |
1020 | | AS2( shr eax, 16) |
1021 | | XOR( edx, al, 2) // 6 |
1022 | | XOR( ecx, ah, 3) // 7 |
1023 | | AS2( mov eax, edi) |
1024 | | AS2( movd edi, xmm2) |
1025 | | XOR( ecx, al, 0) // 8 |
1026 | | XOR( ebx, ah, 1) // 9 |
1027 | | AS2( shr eax, 16) |
1028 | | MXOR( 1, al, 2) // 10 |
1029 | | XOR( edx, ah, 3) // 11 |
1030 | | AS2( mov eax, edi) |
1031 | | XOR( edx, al, 0) // 12 |
1032 | | XOR( ecx, ah, 1) // 13 |
1033 | | AS2( shr eax, 16) |
1034 | | XOR( ebx, al, 2) // 14 |
1035 | | MXOR( 1, ah, 3) // 15 |
1036 | | AS2( MOVD eax, MM(1)) |
1037 | | |
1038 | | AS2( add L_REG, [L_KEYS_BEGIN]) |
1039 | | AS2( add L_REG, 4*16) |
1040 | | ATT_NOPREFIX |
1041 | | ASJ( jmp, 2, f) |
1042 | | INTEL_NOPREFIX |
1043 | | ASL(1) |
1044 | | // counter-mode per-block setup |
1045 | | AS2( MOVD ecx, MM(2)) |
1046 | | AS2( MOVD edx, MM(1)) |
1047 | | AS2( mov eax, [L_SAVED_X+0*4]) |
1048 | | AS2( mov ebx, [L_SAVED_X+1*4]) |
1049 | | AS2( xor cl, ch) |
1050 | | AS2( and WORD_REG(cx), 255) |
1051 | | ASL(5) |
1052 | | #if CRYPTOPP_BOOL_X86 |
1053 | | AS2( paddb MM(2), mm3) |
1054 | | #else |
1055 | | AS2( add MM(2), 1) |
1056 | | #endif |
1057 | | // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx |
1058 | | AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3]) |
1059 | | XOR( ebx, dl, 3) |
1060 | | MOV( ecx, dh, 2) |
1061 | | AS2( shr edx, 16) |
1062 | | AS2( xor ecx, [L_SAVED_X+2*4]) |
1063 | | XOR( eax, dh, 0) |
1064 | | MOV( edx, dl, 1) |
1065 | | AS2( xor edx, [L_SAVED_X+3*4]) |
1066 | | |
1067 | | AS2( add L_REG, [L_KEYS_BEGIN]) |
1068 | | AS2( add L_REG, 3*16) |
1069 | | ATT_NOPREFIX |
1070 | | ASJ( jmp, 4, f) |
1071 | | INTEL_NOPREFIX |
1072 | | |
1073 | | // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15) |
1074 | | // out: eax, ebx, edi, mm0 |
1075 | | #define ROUND() \ |
1076 | | MXOR( 0, cl, 3) /* 11 */\ |
1077 | | AS2( mov cl, al) /* 8,9,10,3 */\ |
1078 | | XOR( edi, ah, 2) /* 2 */\ |
1079 | | AS2( shr eax, 16) /* 0,1 */\ |
1080 | | XOR( edi, bl, 3) /* 7 */\ |
1081 | | MXOR( 0, bh, 2) /* 6 */\ |
1082 | | AS2( shr ebx, 16) /* 4,5 */\ |
1083 | | MXOR( 0, al, 1) /* 1 */\ |
1084 | | MOV( eax, ah, 0) /* 0 */\ |
1085 | | XOR( eax, bl, 1) /* 5 */\ |
1086 | | MOV( ebx, bh, 0) /* 4 */\ |
1087 | | XOR( eax, ch, 2) /* 10 */\ |
1088 | | XOR( ebx, cl, 3) /* 3 */\ |
1089 | | AS2( shr ecx, 16) /* 8,9 */\ |
1090 | | XOR( eax, dl, 3) /* 15 */\ |
1091 | | XOR( ebx, dh, 2) /* 14 */\ |
1092 | | AS2( shr edx, 16) /* 12,13 */\ |
1093 | | XOR( edi, ch, 0) /* 8 */\ |
1094 | | XOR( ebx, cl, 1) /* 9 */\ |
1095 | | XOR( edi, dl, 1) /* 13 */\ |
1096 | | MXOR( 0, dh, 0) /* 12 */\ |
1097 | | |
1098 | | ASL(2) // 2-round loop |
1099 | | AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4]) |
1100 | | AS2( mov edi, [L_SUBKEYS-4*16+2*4]) |
1101 | | ROUND() |
1102 | | AS2( mov ecx, edi) |
1103 | | AS2( xor eax, [L_SUBKEYS-4*16+0*4]) |
1104 | | AS2( xor ebx, [L_SUBKEYS-4*16+1*4]) |
1105 | | AS2( MOVD edx, MM(0)) |
1106 | | |
1107 | | ASL(4) |
1108 | | AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4]) |
1109 | | AS2( mov edi, [L_SUBKEYS-4*16+6*4]) |
1110 | | ROUND() |
1111 | | AS2( mov ecx, edi) |
1112 | | AS2( xor eax, [L_SUBKEYS-4*16+4*4]) |
1113 | | AS2( xor ebx, [L_SUBKEYS-4*16+5*4]) |
1114 | | AS2( MOVD edx, MM(0)) |
1115 | | |
1116 | | AS2( add L_REG, 32) |
1117 | | AS2( test L_REG, 255) |
1118 | | ATT_NOPREFIX |
1119 | | ASJ( jnz, 2, b) |
1120 | | INTEL_NOPREFIX |
1121 | | AS2( sub L_REG, 16*16) |
1122 | | |
1123 | | #define LAST(a, b, c) \ |
1124 | | AS2( movzx esi, a )\ |
1125 | | AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\ |
1126 | | AS2( movzx esi, b )\ |
1127 | | AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\ |
1128 | | AS2( mov WORD PTR [L_LASTROUND+c], di )\ |
1129 | | |
1130 | | // last round |
1131 | | LAST(ch, dl, 2) |
1132 | | LAST(dh, al, 6) |
1133 | | AS2( shr edx, 16) |
1134 | | LAST(ah, bl, 10) |
1135 | | AS2( shr eax, 16) |
1136 | | LAST(bh, cl, 14) |
1137 | | AS2( shr ebx, 16) |
1138 | | LAST(dh, al, 12) |
1139 | | AS2( shr ecx, 16) |
1140 | | LAST(ah, bl, 0) |
1141 | | LAST(bh, cl, 4) |
1142 | | LAST(ch, dl, 8) |
1143 | | |
1144 | | AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS]) |
1145 | | AS2( mov WORD_REG(bx), [L_OUTBLOCKS]) |
1146 | | |
1147 | | AS2( mov WORD_REG(cx), [L_LENGTH]) |
1148 | | AS2( sub WORD_REG(cx), 16) |
1149 | | |
1150 | | AS2( movdqu xmm2, [WORD_REG(ax)]) |
1151 | | AS2( pxor xmm2, xmm4) |
1152 | | |
1153 | | #if CRYPTOPP_BOOL_X86 |
1154 | | AS2( movdqa xmm0, [L_INCREMENTS]) |
1155 | | AS2( paddd xmm0, [L_INBLOCKS]) |
1156 | | AS2( movdqa [L_INBLOCKS], xmm0) |
1157 | | #else |
1158 | | AS2( movdqa xmm0, [L_INCREMENTS+16]) |
1159 | | AS2( paddq xmm0, [L_INBLOCKS+16]) |
1160 | | AS2( movdqa [L_INBLOCKS+16], xmm0) |
1161 | | #endif |
1162 | | |
1163 | | AS2( pxor xmm2, [L_LASTROUND]) |
1164 | | AS2( movdqu [WORD_REG(bx)], xmm2) |
1165 | | |
1166 | | ATT_NOPREFIX |
1167 | | ASJ( jle, 7, f) |
1168 | | INTEL_NOPREFIX |
1169 | | AS2( mov [L_LENGTH], WORD_REG(cx)) |
1170 | | AS2( test WORD_REG(cx), 1) |
1171 | | ATT_NOPREFIX |
1172 | | ASJ( jnz, 1, b) |
1173 | | INTEL_NOPREFIX |
1174 | | #if CRYPTOPP_BOOL_X64 |
1175 | | AS2( movdqa xmm0, [L_INCREMENTS]) |
1176 | | AS2( paddq xmm0, [L_INBLOCKS]) |
1177 | | AS2( movdqa [L_INBLOCKS], xmm0) |
1178 | | #endif |
1179 | | ATT_NOPREFIX |
1180 | | ASJ( jmp, 3, b) |
1181 | | INTEL_NOPREFIX |
1182 | | |
1183 | | ASL(7) |
1184 | | // erase keys on stack |
1185 | | AS2( xorps xmm0, xmm0) |
1186 | | AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16]) |
1187 | | AS2( movaps [WORD_REG(ax)-7*16], xmm0) |
1188 | | AS2( movaps [WORD_REG(ax)-6*16], xmm0) |
1189 | | AS2( movaps [WORD_REG(ax)-5*16], xmm0) |
1190 | | AS2( movaps [WORD_REG(ax)-4*16], xmm0) |
1191 | | AS2( movaps [WORD_REG(ax)-3*16], xmm0) |
1192 | | AS2( movaps [WORD_REG(ax)-2*16], xmm0) |
1193 | | AS2( movaps [WORD_REG(ax)-1*16], xmm0) |
1194 | | AS2( movaps [WORD_REG(ax)+0*16], xmm0) |
1195 | | AS2( movaps [WORD_REG(ax)+1*16], xmm0) |
1196 | | AS2( movaps [WORD_REG(ax)+2*16], xmm0) |
1197 | | AS2( movaps [WORD_REG(ax)+3*16], xmm0) |
1198 | | AS2( movaps [WORD_REG(ax)+4*16], xmm0) |
1199 | | AS2( movaps [WORD_REG(ax)+5*16], xmm0) |
1200 | | AS2( movaps [WORD_REG(ax)+6*16], xmm0) |
1201 | | #if CRYPTOPP_BOOL_X86 |
1202 | | AS2( mov esp, [L_SP]) |
1203 | | AS1( emms) |
1204 | | #endif |
1205 | | AS_POP_IF86(bp) |
1206 | | AS_POP_IF86(bx) |
1207 | | #if defined(CRYPTOPP_MSC_VERSION) && CRYPTOPP_BOOL_X86 |
1208 | | AS_POP_IF86(di) |
1209 | | AS_POP_IF86(si) |
1210 | | AS1(ret) |
1211 | | #endif |
1212 | | #ifdef CRYPTOPP_GENERATE_X64_MASM |
1213 | | pop r12 |
1214 | | pop rbx |
1215 | | pop rdi |
1216 | | pop rsi |
1217 | | ret |
1218 | | Rijndael_Enc_AdvancedProcessBlocks ENDP |
1219 | | #endif |
1220 | | #ifdef __GNUC__ |
1221 | | ATT_PREFIX |
1222 | | : |
1223 | | : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize) |
1224 | | : "memory", "cc", "%eax" |
1225 | | #if CRYPTOPP_BOOL_X64 |
1226 | | , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12" |
1227 | | #endif |
1228 | | ); |
1229 | | #endif |
1230 | | } |
1231 | | |
1232 | | #endif |
1233 | | |
1234 | | #ifndef CRYPTOPP_GENERATE_X64_MASM |
1235 | | |
1236 | | #ifdef CRYPTOPP_X64_MASM_AVAILABLE |
1237 | | extern "C" { |
1238 | | void Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k); |
1239 | | } |
1240 | | #endif |
1241 | | |
1242 | | #if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS |
1243 | | size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const |
1244 | 493 | { |
1245 | 493 | #if CRYPTOPP_AESNI_AVAILABLE |
1246 | 493 | if (HasAESNI()) |
1247 | 493 | return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1248 | 0 | #endif |
1249 | | #if CRYPTOPP_ARM_AES_AVAILABLE |
1250 | | if (HasAES()) |
1251 | | return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1252 | | #endif |
1253 | | #if CRYPTOPP_POWER8_AES_AVAILABLE |
1254 | | if (HasAES()) |
1255 | | return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1256 | | #endif |
1257 | | |
1258 | | #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) |
1259 | | if (HasSSE2()) |
1260 | | { |
1261 | | if (length < BLOCKSIZE) |
1262 | | return length; |
1263 | | |
1264 | | static const byte *zeros = (const byte*)(Te+256); |
1265 | | m_aliasBlock.SetMark(m_aliasBlock.size()); |
1266 | | byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data()); |
1267 | | |
1268 | | // round up to nearest 256 byte boundary |
1269 | | space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize; |
1270 | | while (AliasedWithTable(space, space + sizeof(Locals))) |
1271 | | { |
1272 | | space += 256; |
1273 | | CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize)); |
1274 | | } |
1275 | | |
1276 | | size_t increment = BLOCKSIZE; |
1277 | | if (flags & BT_ReverseDirection) |
1278 | | { |
1279 | | CRYPTOPP_ASSERT(length % BLOCKSIZE == 0); |
1280 | | inBlocks += length - BLOCKSIZE; |
1281 | | xorBlocks += length - BLOCKSIZE; |
1282 | | outBlocks += length - BLOCKSIZE; |
1283 | | increment = 0-increment; |
1284 | | } |
1285 | | |
1286 | | Locals &locals = *(Locals *)(void *)space; |
1287 | | |
1288 | | locals.inBlocks = inBlocks; |
1289 | | locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros; |
1290 | | locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks; |
1291 | | locals.outBlocks = outBlocks; |
1292 | | |
1293 | | locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment; |
1294 | | locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0; |
1295 | | locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment; |
1296 | | locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment; |
1297 | | |
1298 | | locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter); |
1299 | | int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2); |
1300 | | locals.keysBegin = (12-keysToCopy)*16; |
1301 | | |
1302 | | Rijndael_Enc_AdvancedProcessBlocks_SSE2(&locals, m_key); |
1303 | | |
1304 | | return length % BLOCKSIZE; |
1305 | | } |
1306 | | #endif |
1307 | | |
1308 | 0 | return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); |
1309 | 493 | } |
1310 | | |
1311 | | size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const |
1312 | 24 | { |
1313 | 24 | #if CRYPTOPP_AESNI_AVAILABLE |
1314 | 24 | if (HasAESNI()) |
1315 | 24 | return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1316 | 0 | #endif |
1317 | | #if CRYPTOPP_ARM_AES_AVAILABLE |
1318 | | if (HasAES()) |
1319 | | return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1320 | | #endif |
1321 | | #if CRYPTOPP_POWER8_AES_AVAILABLE |
1322 | | if (HasAES()) |
1323 | | return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1324 | | #endif |
1325 | | |
1326 | 0 | return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); |
1327 | 24 | } |
1328 | | #endif // CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS |
1329 | | |
1330 | | NAMESPACE_END |
1331 | | |
1332 | | #endif |
1333 | | #endif |