Coverage Report

Created: 2024-11-21 07:03

/src/cryptopp/rijndael.cpp
Line
Count
Source
1
// rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2
// and Wei Dai from Paulo Barreto's Rijndael implementation
3
// The original code and all modifications are in the public domain.
4
5
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6
7
/*
8
July 2018: Added support for ARMv7 AES instructions via Cryptogams ASM.
9
           See the head notes in aes_armv4.S for copyright and license.
10
*/
11
12
/*
13
September 2017: Added support for Power8 AES instructions via compiler intrinsics.
14
*/
15
16
/*
17
July 2017: Added support for ARMv8 AES instructions via compiler intrinsics.
18
*/
19
20
/*
21
July 2010: Added support for AES-NI instructions via compiler intrinsics.
22
*/
23
24
/*
25
Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
26
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
27
and Peter Schwabe in their paper "New AES software speed records". The round
28
function was also modified to include a trick similar to one in Brian Gladman's
29
x86 assembly code, doing an 8-bit register move to minimize the number of
30
register spills. Also switched to compressed tables and copying round keys to
31
the stack.
32
33
The C++ implementation uses compressed tables if
34
CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined.
35
It is defined on x86 platforms by default but on no others.
36
*/
37
38
/*
39
July 2006: Defense against timing attacks was added by Wei Dai.
40
41
The code now uses smaller tables in the first and last rounds,
42
and preloads them into L1 cache before usage (by loading at least
43
one element in each cache line).
44
45
We try to delay subsequent accesses to each table (used in the first
46
and last rounds) until all of the table has been preloaded. Hopefully
47
the compiler isn't smart enough to optimize that code away.
48
49
After preloading the table, we also try not to access any memory location
50
other than the table and the stack, in order to prevent table entries from
51
being unloaded from L1 cache, until that round is finished.
52
(Some popular CPUs have 2-way associative caches.)
53
*/
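This preloading strategy is what ProcessAndXorBlock implements further down. As a stand-alone illustration only (a minimal sketch, not part of rijndael.cpp; the helper name and parameters are assumptions), touching one word per cache line pulls the whole table into L1 before any key-dependent lookup, as long as the reads cannot be optimized away:

// Illustrative sketch: read one word from every cache line of 'table' and fold
// the reads into a value the optimizer has to keep.
static word32 PreloadIntoL1(const word32 *table, size_t bytes, size_t cacheLineSize)
{
  volatile word32 sink = 0;
  word32 u = sink;
  for (size_t i = 0; i < bytes; i += cacheLineSize)
    u &= *(const word32 *)(const void *)((const byte *)table + i);
  sink = u;   // keep the loads observable
  return u;   // the real code ANDs a value like this into live state so it is not dead
}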
54
55
// This is the original introductory comment:
56
57
/**
58
 * version 3.0 (December 2000)
59
 *
60
 * Optimised ANSI C code for the Rijndael cipher (now AES)
61
 *
62
 * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
63
 * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
64
 * author Paulo Barreto <paulo.barreto@terra.com.br>
65
 *
66
 * This code is hereby placed in the public domain.
67
 *
68
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
69
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
70
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
71
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
72
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
73
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
74
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
75
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
76
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
77
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
78
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79
 */
80
81
#include "pch.h"
82
#include "config.h"
83
84
#ifndef CRYPTOPP_IMPORTS
85
#ifndef CRYPTOPP_GENERATE_X64_MASM
86
87
#include "rijndael.h"
88
#include "misc.h"
89
#include "cpu.h"
90
91
// VS2017 and global optimization bug. Also see
92
// https://github.com/weidai11/cryptopp/issues/649
93
#if (CRYPTOPP_MSC_VERSION >= 1910) && (CRYPTOPP_MSC_VERSION <= 1916)
94
# ifndef CRYPTOPP_DEBUG
95
#  pragma optimize("", off)
96
#  pragma optimize("ts", on)
97
# endif
98
#endif
99
100
NAMESPACE_BEGIN(CryptoPP)
101
102
// Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
103
#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE))
104
# define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
105
#endif
106
107
// Clang intrinsic casts
108
#define M128I_CAST(x) ((__m128i *)(void *)(x))
109
#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
110
111
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
112
# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
113
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
114
using namespace rdtable;
115
# else
116
static word64 Te[256];
117
# endif
118
static word64 Td[256];
119
#else // Not CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
120
# if defined(CRYPTOPP_X64_MASM_AVAILABLE)
121
// Unused; avoids linker error on Microsoft X64 non-AESNI platforms
122
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
123
# endif
124
CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
125
CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
126
#endif // CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
127
128
static volatile bool s_TeFilled = false, s_TdFilled = false;
129
130
ANONYMOUS_NAMESPACE_BEGIN
131
132
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
133
134
// Determine whether the range between begin and end overlaps
135
//   with the same 4k block offsets as the Te table. Logically,
136
//   the code is trying to create the condition:
137
//
138
// Two separate memory pages:
139
//
140
//  +-----+   +-----+
141
//  |XXXXX|   |YYYYY|
142
//  |XXXXX|   |YYYYY|
143
//  |     |   |     |
144
//  |     |   |     |
145
//  +-----+   +-----+
146
//  Te Table   Locals
147
//
148
// Have a logical cache view of (X and Y may be inverted):
149
//
150
// +-----+
151
// |XXXXX|
152
// |XXXXX|
153
// |YYYYY|
154
// |YYYYY|
155
// +-----+
156
//
157
static inline bool AliasedWithTable(const byte *begin, const byte *end)
158
0
{
159
0
  ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
160
0
  ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
161
0
  if (t1 > t0)
162
0
    return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
163
0
  else
164
0
    return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
165
0
}
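A worked trace of the page-offset test above, with hypothetical numbers (Te assumed to start at page offset 0x400 and span 2064 bytes, the 256+2 word64 entries of the ASM build; a candidate Locals block assumed to start at page offset 0x500):

// Hypothetical values, for illustration only.
ptrdiff_t t0 = 0x400, t1 = (0x400 + 2064) % 4096;  // Te covers page offsets [0x400, 0xC10)
ptrdiff_t s0 = 0x500;                              // begin of the candidate block, mod 4096
bool aliased = (s0 >= t0 && s0 < t1);              // true, so the caller moves 'space' up 256 bytes and retries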
166
167
struct Locals
168
{
169
  word32 subkeys[4*12], workspace[8];
170
  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
171
  byte *outBlocks;
172
  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
173
  size_t regSpill, lengthAndCounterFlag, keysBegin;
174
};
175
176
const size_t s_aliasPageSize = 4096;
177
const size_t s_aliasBlockSize = 256;
178
const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
179
180
#endif  // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
181
182
ANONYMOUS_NAMESPACE_END
183
184
// ************************* Portable Code ************************************
185
186
#define QUARTER_ROUND(L, T, t, a, b, c, d)  \
187
0
  a ^= L(T, 3, byte(t)); t >>= 8;\
188
0
  b ^= L(T, 2, byte(t)); t >>= 8;\
189
0
  c ^= L(T, 1, byte(t)); t >>= 8;\
190
0
  d ^= L(T, 0, t);
191
192
#define QUARTER_ROUND_LE(t, a, b, c, d) \
193
0
  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
194
0
  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
195
0
  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
196
0
  tempBlock[d] = ((byte *)(Te+t))[1];
197
198
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
199
  #define QUARTER_ROUND_LD(t, a, b, c, d) \
200
0
    tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
201
0
    tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
202
0
    tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
203
0
    tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
204
#else
205
  #define QUARTER_ROUND_LD(t, a, b, c, d) \
206
    tempBlock[a] = Sd[byte(t)]; t >>= 8;\
207
    tempBlock[b] = Sd[byte(t)]; t >>= 8;\
208
    tempBlock[c] = Sd[byte(t)]; t >>= 8;\
209
    tempBlock[d] = Sd[t];
210
#endif
211
212
0
#define QUARTER_ROUND_E(t, a, b, c, d)    QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
213
0
#define QUARTER_ROUND_D(t, a, b, c, d)    QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
214
215
#if (CRYPTOPP_LITTLE_ENDIAN)
216
0
  #define QUARTER_ROUND_FE(t, a, b, c, d)   QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
217
0
  #define QUARTER_ROUND_FD(t, a, b, c, d)   QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
218
  #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
219
0
    #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
220
0
    #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
221
  #else
222
    #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
223
    #define TL_M(T, i, x) T[i*256 + x]
224
  #endif
225
#else
226
  #define QUARTER_ROUND_FE(t, a, b, c, d)   QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
227
  #define QUARTER_ROUND_FD(t, a, b, c, d)   QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
228
  #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
229
    #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
230
    #define TL_M      TL_F
231
  #else
232
    #define TL_F(T, i, x) rotrFixed(T[x], i*8)
233
    #define TL_M(T, i, x) T[i*256 + x]
234
  #endif
235
#endif
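For reference, QUARTER_ROUND with TL_M is the compressed-table version of the classic four-table AES round. A minimal sketch of the uncompressed form under the usual big-endian word convention (Te0..Te3 stand for the four rotated 256-entry tables; none of these names exist in this file):

// Illustrative only: one output column of a full encryption round, T-table style.
static inline word32 TTableColumn(const word32 Te0[256], const word32 Te1[256],
                                  const word32 Te2[256], const word32 Te3[256],
                                  word32 s0, word32 s1, word32 s2, word32 s3, word32 rk)
{
  return Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^
         Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk;
}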
236
237
238
0
#define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
239
0
#define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
240
0
#define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
241
242
0
#define f3(x)   (f2(x) ^ x)
243
0
#define f9(x)   (f8(x) ^ x)
244
0
#define fb(x)   (f8(x) ^ f2(x) ^ x)
245
0
#define fd(x)   (f8(x) ^ f4(x) ^ x)
246
0
#define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
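The f2/f4/f8 macros above multiply by 2, 4 and 8 in GF(2^8) with the AES reduction polynomial 0x11b; they rely on the result later being truncated to a byte. A quick stand-alone check against the familiar FIPS-197 values (a sketch assuming C++11 constexpr, not part of rijndael.cpp):

// Multiply by 2 in GF(2^8), with an explicit byte mask for the stand-alone check.
constexpr unsigned int xtime(unsigned int x) { return ((x << 1) ^ (((x >> 7) & 1) * 0x11b)) & 0xff; }
static_assert(xtime(0x57) == 0xae, "0x57 * 2");
static_assert((xtime(0x57) ^ 0x57) == 0xf9, "0x57 * 3, which is what f3 computes");
static_assert(xtime(0x80) == 0x1b, "the high bit wraps through the 0x11b reduction");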
247
248
unsigned int Rijndael::Base::OptimalDataAlignment() const
249
183
{
250
183
#if (CRYPTOPP_AESNI_AVAILABLE)
251
183
  if (HasAESNI())
252
183
    return 16;  // load __m128i
253
0
#endif
254
#if (CRYPTOPP_ARM_AES_AVAILABLE)
255
  if (HasAES())
256
    return 4;  // load uint32x4_t
257
#endif
258
#if (CRYPTOGAMS_ARM_AES)
259
  // Must use 1 here for Cryptogams AES. Also see
260
  // https://github.com/weidai11/cryptopp/issues/683
261
  if (HasARMv7())
262
    return 1;
263
#endif
264
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
265
  if (HasAES())
266
    return 16;  // load uint32x4_p
267
#endif
268
0
  return BlockTransformation::OptimalDataAlignment();
269
183
}
270
271
void Rijndael::Base::FillEncTable()
272
0
{
273
0
  for (int i=0; i<256; i++)
274
0
  {
275
0
    byte x = Se[i];
276
0
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
277
0
    word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
278
0
    Te[i] = word64(y | f3(x))<<32 | y;
279
#else
280
    word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
281
    for (int j=0; j<4; j++)
282
    {
283
      Te[i+j*256] = y;
284
      y = rotrConstant<8>(y);
285
    }
286
#endif
287
0
  }
288
#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
289
  Te[256] = Te[257] = 0;
290
#endif
291
0
  s_TeFilled = true;
292
0
}
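FillEncTable works because MixColumns applied to a column whose only nonzero byte is s yields the four bytes 2s, s, s, 3s (in an order fixed by the byte's row), so one packed word per S-box output, plus byte rotations, covers all four byte positions. A short worked trace using the file's own f2/f3 definitions:

// Worked example (illustrative): MixColumns of the column (s, 0, 0, 0) in GF(2^8):
//   b0 = 2*s ^ 3*0 ^ 1*0 ^ 1*0 = f2(s)
//   b1 = 1*s ^ 2*0 ^ 3*0 ^ 1*0 = s
//   b2 = 1*s ^ 1*0 ^ 2*0 ^ 3*0 = s
//   b3 = 3*s ^ 1*0 ^ 1*0 ^ 2*0 = f3(s)
// which, up to the byte order the round macros expect, is the word packed per entry:
//   y = f3(s) | word32(s)<<8 | word32(s)<<16 | word32(f2(s))<<24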
293
294
void Rijndael::Base::FillDecTable()
295
0
{
296
0
  for (int i=0; i<256; i++)
297
0
  {
298
0
    byte x = Sd[i];
299
0
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
300
0
    word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
301
0
    Td[i] = word64(y | fb(x))<<32 | y | x;
302
#else
303
    word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
304
    for (int j=0; j<4; j++)
305
    {
306
      Td[i+j*256] = y;
307
      y = rotrConstant<8>(y);
308
    }
309
#endif
310
0
  }
311
0
  s_TdFilled = true;
312
0
}
313
314
#if (CRYPTOPP_AESNI_AVAILABLE)
315
extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
316
extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
317
318
extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
319
        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
320
extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
321
        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
322
#endif
323
324
#if (CRYPTOPP_ARM_AES_AVAILABLE)
325
extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
326
        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
327
extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
328
        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
329
#endif
330
331
#if (CRYPTOGAMS_ARM_AES)
332
extern "C" int cryptogams_AES_set_encrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
333
extern "C" int cryptogams_AES_set_decrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
334
extern "C" void cryptogams_AES_encrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
335
extern "C" void cryptogams_AES_decrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
336
#endif
337
338
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
339
extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen,
340
        word32* rk, const byte* Se);
341
342
extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
343
        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
344
extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
345
        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
346
#endif
347
348
#if (CRYPTOGAMS_ARM_AES)
349
int CRYPTOGAMS_set_encrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
350
{
351
  return cryptogams_AES_set_encrypt_key(userKey, bitLen, rkey);
352
}
353
int CRYPTOGAMS_set_decrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
354
{
355
  return cryptogams_AES_set_decrypt_key(userKey, bitLen, rkey);
356
}
357
void CRYPTOGAMS_encrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
358
{
359
  cryptogams_AES_encrypt_block(inBlock, outBlock, rkey);
360
  if (xorBlock)
361
    xorbuf (outBlock, xorBlock, 16);
362
}
363
void CRYPTOGAMS_decrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
364
{
365
  cryptogams_AES_decrypt_block(inBlock, outBlock, rkey);
366
  if (xorBlock)
367
    xorbuf (outBlock, xorBlock, 16);
368
}
369
#endif
370
371
std::string Rijndael::Base::AlgorithmProvider() const
372
0
{
373
0
#if (CRYPTOPP_AESNI_AVAILABLE)
374
0
  if (HasAESNI())
375
0
    return "AESNI";
376
0
#endif
377
#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
378
  if (HasSSE2())
379
    return "SSE2";
380
#endif
381
#if (CRYPTOPP_ARM_AES_AVAILABLE)
382
  if (HasAES())
383
    return "ARMv8";
384
#endif
385
#if (CRYPTOGAMS_ARM_AES)
386
  if (HasARMv7())
387
    return "ARMv7";
388
#endif
389
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
390
  if (HasAES())
391
    return "Power8";
392
#endif
393
0
  return "C++";
394
0
}
395
396
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
397
169
{
398
169
  AssertValidKeyLength(keyLen);
399
400
#if (CRYPTOGAMS_ARM_AES)
401
  if (HasARMv7())
402
  {
403
    m_rounds = keyLen/4 + 6;
404
    m_key.New(4*(14+1)+4);
405
406
    if (IsForwardTransformation())
407
      CRYPTOGAMS_set_encrypt_key(userKey, keyLen*8, m_key.begin());
408
    else
409
      CRYPTOGAMS_set_decrypt_key(userKey, keyLen*8, m_key.begin());
410
    return;
411
  }
412
#endif
413
414
169
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
415
169
  m_aliasBlock.New(s_sizeToAllocate);
416
  // The alias block is only used on IA-32 when unaligned data access is in effect.
417
  // Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused.
418
169
  m_aliasBlock.SetMark(0);
419
169
#endif
420
421
169
  m_rounds = keyLen/4 + 6;
422
169
  m_key.New(4*(m_rounds+1));
423
169
  word32 *rk = m_key;
424
425
169
#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(CRYPTOPP_MSC_VERSION) || CRYPTOPP_MSC_VERSION >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
426
  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
427
169
  if (HasAESNI() && HasSSE41())
428
169
  {
429
    // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
430
    //  Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
431
169
    Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
432
169
    if (!IsForwardTransformation())
433
21
      Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
434
435
169
    return;
436
169
  }
437
0
#endif
438
439
#if CRYPTOPP_POWER8_AES_AVAILABLE
440
  if (HasAES())
441
  {
442
    // We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256.
443
    // The IBM docs on AES suck. Intel's docs on AESNI put IBM to shame.
444
    Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, Se);
445
    return;
446
  }
447
#endif
448
449
0
  GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
450
0
  const word32 *rc = rcon;
451
0
  word32 temp;
452
453
0
  while (true)
454
0
  {
455
0
    temp  = rk[keyLen/4-1];
456
0
    word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
457
0
          (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
458
0
    rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
459
0
    rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
460
0
    rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
461
0
    rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
462
463
0
    if (rk + keyLen/4 + 4 == m_key.end())
464
0
      break;
465
466
0
    if (keyLen == 24)
467
0
    {
468
0
      rk[10] = rk[ 4] ^ rk[ 9];
469
0
      rk[11] = rk[ 5] ^ rk[10];
470
0
    }
471
0
    else if (keyLen == 32)
472
0
    {
473
0
        temp = rk[11];
474
0
        rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
475
0
        rk[13] = rk[ 5] ^ rk[12];
476
0
        rk[14] = rk[ 6] ^ rk[13];
477
0
        rk[15] = rk[ 7] ^ rk[14];
478
0
    }
479
0
    rk += keyLen/4;
480
0
  }
481
482
0
  rk = m_key;
483
484
0
  if (IsForwardTransformation())
485
0
  {
486
0
    if (!s_TeFilled)
487
0
      FillEncTable();
488
489
0
    ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
490
0
    ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
491
0
  }
492
0
  else
493
0
  {
494
0
    if (!s_TdFilled)
495
0
      FillDecTable();
496
497
0
    #define InverseMixColumn(x) \
498
0
      TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
499
0
      TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
500
501
0
    unsigned int i, j;
502
0
    for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
503
0
    {
504
0
      temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
505
0
      temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
506
0
      temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
507
0
      temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
508
0
    }
509
510
0
    rk[i+0] = InverseMixColumn(rk[i+0]);
511
0
    rk[i+1] = InverseMixColumn(rk[i+1]);
512
0
    rk[i+2] = InverseMixColumn(rk[i+2]);
513
0
    rk[i+3] = InverseMixColumn(rk[i+3]);
514
515
0
    temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
516
0
    temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
517
0
    temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
518
0
    temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
519
0
  }
520
521
0
#if CRYPTOPP_AESNI_AVAILABLE
522
0
  if (HasAESNI())
523
0
    ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
524
0
#endif
525
#if CRYPTOPP_ARM_AES_AVAILABLE
526
  if (HasAES())
527
    ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
528
#endif
529
0
}
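The while (true) loop above is the standard FIPS-197 key expansion: the previous word is rotated, pushed through the S-box and XORed with a round constant at the start of each group. A worked trace against the well-known FIPS-197 Appendix A.1 AES-128 vector, matching the loop for keyLen == 16 (rcon[0] is the round constant 0x01000000):

// Key 2b7e1516 28aed2a6 abf71588 09cf4f3c (FIPS-197, Appendix A.1):
//   temp  = rk[3]               = 0x09cf4f3c
//   x     = Se[cf]<<24 ^ Se[4f]<<16 ^ Se[3c]<<8 ^ Se[09]   (RotWord + SubWord)
//         = 0x8a84eb01
//   rk[4] = rk[0] ^ x ^ rcon[0] = 0x2b7e1516 ^ 0x8a84eb01 ^ 0x01000000
//         = 0xa0fafe17
// which is w[4] in the standard's expansion table.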
530
531
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
532
209
{
533
209
#if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
534
# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
535
  if (HasSSE2())
536
# else
537
209
  if (HasAESNI())
538
209
# endif
539
209
  {
540
209
    (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
541
209
    return;
542
209
  }
543
0
#endif
544
545
#if (CRYPTOPP_ARM_AES_AVAILABLE)
546
  if (HasAES())
547
  {
548
    (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
549
    return;
550
  }
551
#endif
552
553
#if (CRYPTOGAMS_ARM_AES)
554
  if (HasARMv7())
555
  {
556
    CRYPTOGAMS_encrypt(inBlock, xorBlock, outBlock, m_key.begin());
557
    return;
558
  }
559
#endif
560
561
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
562
  if (HasAES())
563
  {
564
    (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
565
    return;
566
  }
567
#endif
568
569
0
  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
570
571
0
  word32 s0, s1, s2, s3, t0, t1, t2, t3;
572
0
  Block::Get(inBlock)(s0)(s1)(s2)(s3);
573
574
0
  const word32 *rk = m_key;
575
0
  s0 ^= rk[0];
576
0
  s1 ^= rk[1];
577
0
  s2 ^= rk[2];
578
0
  s3 ^= rk[3];
579
0
  t0 = rk[4];
580
0
  t1 = rk[5];
581
0
  t2 = rk[6];
582
0
  t3 = rk[7];
583
0
  rk += 8;
584
585
  // timing attack countermeasure. see comments at top for more details.
586
  // also see http://github.com/weidai11/cryptopp/issues/146
587
0
  const int cacheLineSize = GetCacheLineSize();
588
0
  unsigned int i;
589
0
  volatile word32 _u = 0;
590
0
  word32 u = _u;
591
0
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
592
0
  for (i=0; i<2048; i+=cacheLineSize)
593
#else
594
  for (i=0; i<1024; i+=cacheLineSize)
595
#endif
596
0
    u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
597
0
  u &= Te[255];
598
0
  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
599
600
0
  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
601
0
  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
602
0
  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
603
0
  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
604
605
  // Nr - 2 full rounds:
606
0
  unsigned int r = m_rounds/2 - 1;
607
0
  do
608
0
  {
609
0
    s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
610
611
0
    QUARTER_ROUND_E(t3, s0, s1, s2, s3)
612
0
    QUARTER_ROUND_E(t2, s3, s0, s1, s2)
613
0
    QUARTER_ROUND_E(t1, s2, s3, s0, s1)
614
0
    QUARTER_ROUND_E(t0, s1, s2, s3, s0)
615
616
0
    t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
617
618
0
    QUARTER_ROUND_E(s3, t0, t1, t2, t3)
619
0
    QUARTER_ROUND_E(s2, t3, t0, t1, t2)
620
0
    QUARTER_ROUND_E(s1, t2, t3, t0, t1)
621
0
    QUARTER_ROUND_E(s0, t1, t2, t3, t0)
622
623
0
    rk += 8;
624
0
  } while (--r);
625
626
0
  word32 tbw[4];
627
0
  byte *const tempBlock = (byte *)tbw;
628
629
0
  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
630
0
  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
631
0
  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
632
0
  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
633
634
0
  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
635
0
}
636
637
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
638
6
{
639
6
#if CRYPTOPP_AESNI_AVAILABLE
640
6
  if (HasAESNI())
641
6
  {
642
6
    (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
643
6
    return;
644
6
  }
645
0
#endif
646
647
#if (CRYPTOPP_ARM_AES_AVAILABLE)
648
  if (HasAES())
649
  {
650
    (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
651
    return;
652
  }
653
#endif
654
655
#if (CRYPTOGAMS_ARM_AES)
656
  if (HasARMv7())
657
  {
658
    CRYPTOGAMS_decrypt(inBlock, xorBlock, outBlock, m_key.begin());
659
    return;
660
  }
661
#endif
662
663
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
664
  if (HasAES())
665
  {
666
    (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
667
    return;
668
  }
669
#endif
670
671
0
  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
672
673
0
  word32 s0, s1, s2, s3, t0, t1, t2, t3;
674
0
  Block::Get(inBlock)(s0)(s1)(s2)(s3);
675
676
0
  const word32 *rk = m_key;
677
0
  s0 ^= rk[0];
678
0
  s1 ^= rk[1];
679
0
  s2 ^= rk[2];
680
0
  s3 ^= rk[3];
681
0
  t0 = rk[4];
682
0
  t1 = rk[5];
683
0
  t2 = rk[6];
684
0
  t3 = rk[7];
685
0
  rk += 8;
686
687
  // timing attack countermeasure. see comments at top for more details.
688
  // also see http://github.com/weidai11/cryptopp/issues/146
689
0
  const int cacheLineSize = GetCacheLineSize();
690
0
  unsigned int i;
691
0
  volatile word32 _u = 0;
692
0
  word32 u = _u;
693
0
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
694
0
  for (i=0; i<2048; i+=cacheLineSize)
695
#else
696
  for (i=0; i<1024; i+=cacheLineSize)
697
#endif
698
0
    u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
699
0
  u &= Td[255];
700
0
  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
701
702
0
  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
703
0
  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
704
0
  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
705
0
  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
706
707
  // Nr - 2 full rounds:
708
0
  unsigned int r = m_rounds/2 - 1;
709
0
  do
710
0
  {
711
0
    s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
712
713
0
    QUARTER_ROUND_D(t3, s2, s1, s0, s3)
714
0
    QUARTER_ROUND_D(t2, s1, s0, s3, s2)
715
0
    QUARTER_ROUND_D(t1, s0, s3, s2, s1)
716
0
    QUARTER_ROUND_D(t0, s3, s2, s1, s0)
717
718
0
    t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
719
720
0
    QUARTER_ROUND_D(s3, t2, t1, t0, t3)
721
0
    QUARTER_ROUND_D(s2, t1, t0, t3, t2)
722
0
    QUARTER_ROUND_D(s1, t0, t3, t2, t1)
723
0
    QUARTER_ROUND_D(s0, t3, t2, t1, t0)
724
725
0
    rk += 8;
726
0
  } while (--r);
727
728
#if !(defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
729
  // timing attack countermeasure. see comments at top for more details
730
  // If CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined,
731
  // QUARTER_ROUND_LD will use Td, which is already preloaded.
732
  u = _u;
733
  for (i=0; i<256; i+=cacheLineSize)
734
    u &= *(const word32 *)(const void *)(Sd+i);
735
  u &= *(const word32 *)(const void *)(Sd+252);
736
  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
737
#endif
738
739
0
  word32 tbw[4];
740
0
  byte *const tempBlock = (byte *)tbw;
741
742
0
  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
743
0
  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
744
0
  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
745
0
  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
746
747
0
  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
748
0
}
749
750
// ************************* Assembly Code ************************************
751
752
#if CRYPTOPP_MSC_VERSION
753
# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
754
#endif
755
756
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
757
758
#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
759
760
CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k)
761
{
762
  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
763
764
#if CRYPTOPP_BOOL_X86
765
766
#define L_REG     esp
767
#define L_INDEX(i)    (L_REG+768+i)
768
#define L_INXORBLOCKS L_INBLOCKS+4
769
#define L_OUTXORBLOCKS  L_INBLOCKS+8
770
#define L_OUTBLOCKS   L_INBLOCKS+12
771
#define L_INCREMENTS  L_INDEX(16*15)
772
#define L_SP      L_INDEX(16*16)
773
#define L_LENGTH    L_INDEX(16*16+4)
774
#define L_KEYS_BEGIN  L_INDEX(16*16+8)
775
776
#define MOVD      movd
777
#define MM(i)     mm##i
778
779
#define MXOR(a,b,c) \
780
  AS2(  movzx esi, b)\
781
  AS2(  movd  mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
782
  AS2(  pxor  MM(a), mm7)\
783
784
#define MMOV(a,b,c) \
785
  AS2(  movzx esi, b)\
786
  AS2(  movd  MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
787
788
#else
789
790
#define L_REG     r8
791
#define L_INDEX(i)    (L_REG+i)
792
#define L_INXORBLOCKS L_INBLOCKS+8
793
#define L_OUTXORBLOCKS  L_INBLOCKS+16
794
#define L_OUTBLOCKS   L_INBLOCKS+24
795
#define L_INCREMENTS  L_INDEX(16*16)
796
#define L_LENGTH    L_INDEX(16*18+8)
797
#define L_KEYS_BEGIN  L_INDEX(16*19)
798
799
#define MOVD      mov
800
#define MM_0      r9d
801
#define MM_1      r12d
802
#ifdef __GNUC__
803
#define MM_2      r11d
804
#else
805
#define MM_2      r10d
806
#endif
807
#define MM(i)     MM_##i
808
809
#define MXOR(a,b,c) \
810
  AS2(  movzx esi, b)\
811
  AS2(  xor   MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
812
813
#define MMOV(a,b,c) \
814
  AS2(  movzx esi, b)\
815
  AS2(  mov   MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
816
817
#endif
818
819
#define L_SUBKEYS   L_INDEX(0)
820
#define L_SAVED_X   L_SUBKEYS
821
#define L_KEY12     L_INDEX(16*12)
822
#define L_LASTROUND   L_INDEX(16*13)
823
#define L_INBLOCKS    L_INDEX(16*14)
824
#define MAP0TO4(i)    (ASM_MOD(i+3,4)+1)
825
826
#define XOR(a,b,c)  \
827
  AS2(  movzx esi, b)\
828
  AS2(  xor   a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
829
830
#define MOV(a,b,c)  \
831
  AS2(  movzx esi, b)\
832
  AS2(  mov   a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
833
834
#ifdef CRYPTOPP_GENERATE_X64_MASM
835
    ALIGN   8
836
  Rijndael_Enc_AdvancedProcessBlocks  PROC FRAME
837
    rex_push_reg rsi
838
    push_reg rdi
839
    push_reg rbx
840
    push_reg r12
841
    .endprolog
842
    mov L_REG, rcx
843
    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
844
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
845
#elif defined(__GNUC__)
846
  __asm__ __volatile__
847
  (
848
  INTEL_NOPREFIX
849
  #if CRYPTOPP_BOOL_X64
850
  AS2(  mov   L_REG, rcx)
851
  #endif
852
  AS_PUSH_IF86(bx)
853
  AS_PUSH_IF86(bp)
854
  AS2(  mov   AS_REG_7, WORD_REG(si))
855
#else
856
  AS_PUSH_IF86(si)
857
  AS_PUSH_IF86(di)
858
  AS_PUSH_IF86(bx)
859
  AS_PUSH_IF86(bp)
860
  AS2(  lea   AS_REG_7, [Te])
861
  AS2(  mov   edi, [g_cacheLineSize])
862
#endif
863
864
#if CRYPTOPP_BOOL_X86
865
  AS2(  mov   [ecx+16*12+16*4], esp)  // save esp to L_SP
866
  AS2(  lea   esp, [ecx-768])
867
#endif
868
869
  // copy subkeys to stack
870
  AS2(  mov   WORD_REG(si), [L_KEYS_BEGIN])
871
  AS2(  mov   WORD_REG(ax), 16)
872
  AS2(  and   WORD_REG(ax), WORD_REG(si))
873
  AS2(  movdqa  xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
874
  AS2(  movdqa  [L_KEY12], xmm3)
875
  AS2(  lea   WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
876
  AS2(  sub   WORD_REG(ax), WORD_REG(si))
877
  ASL(0)
878
  AS2(  movdqa  xmm0, [WORD_REG(ax)+WORD_REG(si)])
879
  AS2(  movdqa  XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
880
  AS2(  add   WORD_REG(si), 16)
881
  AS2(  cmp   WORD_REG(si), 16*12)
882
  ATT_NOPREFIX
883
  ASJ(  jl,   0, b)
884
  INTEL_NOPREFIX
885
886
  // read subkeys 0, 1 and last
887
  AS2(  movdqa  xmm4, [WORD_REG(ax)+WORD_REG(si)])  // last subkey
888
  AS2(  movdqa  xmm1, [WORD_REG(dx)])     // subkey 0
889
  AS2(  MOVD  MM(1), [WORD_REG(dx)+4*4])    // 0,1,2,3
890
  AS2(  mov   ebx, [WORD_REG(dx)+5*4])    // 4,5,6,7
891
  AS2(  mov   ecx, [WORD_REG(dx)+6*4])    // 8,9,10,11
892
  AS2(  mov   edx, [WORD_REG(dx)+7*4])    // 12,13,14,15
893
894
  // load table into cache
895
  AS2(  xor   WORD_REG(ax), WORD_REG(ax))
896
  ASL(9)
897
  AS2(  mov   esi, [AS_REG_7+WORD_REG(ax)])
898
  AS2(  add   WORD_REG(ax), WORD_REG(di))
899
  AS2(  mov   esi, [AS_REG_7+WORD_REG(ax)])
900
  AS2(  add   WORD_REG(ax), WORD_REG(di))
901
  AS2(  mov   esi, [AS_REG_7+WORD_REG(ax)])
902
  AS2(  add   WORD_REG(ax), WORD_REG(di))
903
  AS2(  mov   esi, [AS_REG_7+WORD_REG(ax)])
904
  AS2(  add   WORD_REG(ax), WORD_REG(di))
905
  AS2(  cmp   WORD_REG(ax), 2048)
906
  ATT_NOPREFIX
907
  ASJ(  jl,   9, b)
908
  INTEL_NOPREFIX
909
  AS1(  lfence)
910
911
  AS2(  test  DWORD PTR [L_LENGTH], 1)
912
  ATT_NOPREFIX
913
  ASJ(  jz,   8, f)
914
  INTEL_NOPREFIX
915
916
  // counter mode one-time setup
917
  AS2(  mov   WORD_REG(si), [L_INBLOCKS])
918
  AS2(  movdqu  xmm2, [WORD_REG(si)]) // counter
919
  AS2(  pxor  xmm2, xmm1)
920
  AS2(  psrldq  xmm1, 14)
921
  AS2(  movd  eax, xmm1)
922
  AS2(  mov   al, BYTE PTR [WORD_REG(si)+15])
923
  AS2(  MOVD  MM(2), eax)
924
#if CRYPTOPP_BOOL_X86
925
  AS2(  mov   eax, 1)
926
  AS2(  movd  mm3, eax)
927
#endif
928
929
  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
930
  AS2(  movd  eax, xmm2)
931
  AS2(  psrldq  xmm2, 4)
932
  AS2(  movd  edi, xmm2)
933
  AS2(  psrldq  xmm2, 4)
934
    MXOR(   1, al, 0)   // 0
935
    XOR(    edx, ah, 1)   // 1
936
  AS2(  shr   eax, 16)
937
    XOR(    ecx, al, 2)   // 2
938
    XOR(    ebx, ah, 3)   // 3
939
  AS2(  mov   eax, edi)
940
  AS2(  movd  edi, xmm2)
941
  AS2(  psrldq  xmm2, 4)
942
    XOR(    ebx, al, 0)   // 4
943
    MXOR(   1, ah, 1)   // 5
944
  AS2(  shr   eax, 16)
945
    XOR(    edx, al, 2)   // 6
946
    XOR(    ecx, ah, 3)   // 7
947
  AS2(  mov   eax, edi)
948
  AS2(  movd  edi, xmm2)
949
    XOR(    ecx, al, 0)   // 8
950
    XOR(    ebx, ah, 1)   // 9
951
  AS2(  shr   eax, 16)
952
    MXOR(   1, al, 2)   // 10
953
    XOR(    edx, ah, 3)   // 11
954
  AS2(  mov   eax, edi)
955
    XOR(    edx, al, 0)   // 12
956
    XOR(    ecx, ah, 1)   // 13
957
  AS2(  shr   eax, 16)
958
    XOR(    ebx, al, 2)   // 14
959
  AS2(  psrldq  xmm2, 3)
960
961
  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
962
  AS2(  mov   eax, [L_KEY12+0*4])
963
  AS2(  mov   edi, [L_KEY12+2*4])
964
  AS2(  MOVD  MM(0), [L_KEY12+3*4])
965
    MXOR( 0, cl, 3) /* 11 */
966
    XOR(  edi, bl, 3) /* 7 */
967
    MXOR( 0, bh, 2) /* 6 */
968
  AS2(  shr ebx, 16)  /* 4,5 */
969
    XOR(  eax, bl, 1) /* 5 */
970
    MOV(  ebx, bh, 0) /* 4 */
971
  AS2(  xor   ebx, [L_KEY12+1*4])
972
    XOR(  eax, ch, 2) /* 10 */
973
  AS2(  shr ecx, 16)  /* 8,9 */
974
    XOR(  eax, dl, 3) /* 15 */
975
    XOR(  ebx, dh, 2) /* 14 */
976
  AS2(  shr edx, 16)  /* 12,13 */
977
    XOR(  edi, ch, 0) /* 8 */
978
    XOR(  ebx, cl, 1) /* 9 */
979
    XOR(  edi, dl, 1) /* 13 */
980
    MXOR( 0, dh, 0) /* 12 */
981
982
  AS2(  movd  ecx, xmm2)
983
  AS2(  MOVD  edx, MM(1))
984
  AS2(  MOVD  [L_SAVED_X+3*4], MM(0))
985
  AS2(  mov   [L_SAVED_X+0*4], eax)
986
  AS2(  mov   [L_SAVED_X+1*4], ebx)
987
  AS2(  mov   [L_SAVED_X+2*4], edi)
988
  ATT_NOPREFIX
989
  ASJ(  jmp,  5, f)
990
  INTEL_NOPREFIX
991
  ASL(3)
992
  // non-counter mode per-block setup
993
  AS2(  MOVD  MM(1), [L_KEY12+0*4]) // 0,1,2,3
994
  AS2(  mov   ebx, [L_KEY12+1*4])   // 4,5,6,7
995
  AS2(  mov   ecx, [L_KEY12+2*4])   // 8,9,10,11
996
  AS2(  mov   edx, [L_KEY12+3*4])   // 12,13,14,15
997
  ASL(8)
998
  AS2(  mov   WORD_REG(ax), [L_INBLOCKS])
999
  AS2(  movdqu  xmm2, [WORD_REG(ax)])
1000
  AS2(  mov   WORD_REG(si), [L_INXORBLOCKS])
1001
  AS2(  movdqu  xmm5, [WORD_REG(si)])
1002
  AS2(  pxor  xmm2, xmm1)
1003
  AS2(  pxor  xmm2, xmm5)
1004
1005
  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
1006
  AS2(  movd  eax, xmm2)
1007
  AS2(  psrldq  xmm2, 4)
1008
  AS2(  movd  edi, xmm2)
1009
  AS2(  psrldq  xmm2, 4)
1010
    MXOR(   1, al, 0)   // 0
1011
    XOR(    edx, ah, 1)   // 1
1012
  AS2(  shr   eax, 16)
1013
    XOR(    ecx, al, 2)   // 2
1014
    XOR(    ebx, ah, 3)   // 3
1015
  AS2(  mov   eax, edi)
1016
  AS2(  movd  edi, xmm2)
1017
  AS2(  psrldq  xmm2, 4)
1018
    XOR(    ebx, al, 0)   // 4
1019
    MXOR(   1, ah, 1)   // 5
1020
  AS2(  shr   eax, 16)
1021
    XOR(    edx, al, 2)   // 6
1022
    XOR(    ecx, ah, 3)   // 7
1023
  AS2(  mov   eax, edi)
1024
  AS2(  movd  edi, xmm2)
1025
    XOR(    ecx, al, 0)   // 8
1026
    XOR(    ebx, ah, 1)   // 9
1027
  AS2(  shr   eax, 16)
1028
    MXOR(   1, al, 2)   // 10
1029
    XOR(    edx, ah, 3)   // 11
1030
  AS2(  mov   eax, edi)
1031
    XOR(    edx, al, 0)   // 12
1032
    XOR(    ecx, ah, 1)   // 13
1033
  AS2(  shr   eax, 16)
1034
    XOR(    ebx, al, 2)   // 14
1035
    MXOR(   1, ah, 3)   // 15
1036
  AS2(  MOVD  eax, MM(1))
1037
1038
  AS2(  add   L_REG, [L_KEYS_BEGIN])
1039
  AS2(  add   L_REG, 4*16)
1040
  ATT_NOPREFIX
1041
  ASJ(  jmp,  2, f)
1042
  INTEL_NOPREFIX
1043
  ASL(1)
1044
  // counter-mode per-block setup
1045
  AS2(  MOVD  ecx, MM(2))
1046
  AS2(  MOVD  edx, MM(1))
1047
  AS2(  mov   eax, [L_SAVED_X+0*4])
1048
  AS2(  mov   ebx, [L_SAVED_X+1*4])
1049
  AS2(  xor   cl, ch)
1050
  AS2(  and   WORD_REG(cx), 255)
1051
  ASL(5)
1052
#if CRYPTOPP_BOOL_X86
1053
  AS2(  paddb MM(2), mm3)
1054
#else
1055
  AS2(  add   MM(2), 1)
1056
#endif
1057
  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
1058
  AS2(  xor   edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
1059
    XOR(    ebx, dl, 3)
1060
    MOV(    ecx, dh, 2)
1061
  AS2(  shr   edx, 16)
1062
  AS2(  xor   ecx, [L_SAVED_X+2*4])
1063
    XOR(    eax, dh, 0)
1064
    MOV(    edx, dl, 1)
1065
  AS2(  xor   edx, [L_SAVED_X+3*4])
1066
1067
  AS2(  add   L_REG, [L_KEYS_BEGIN])
1068
  AS2(  add   L_REG, 3*16)
1069
  ATT_NOPREFIX
1070
  ASJ(  jmp,  4, f)
1071
  INTEL_NOPREFIX
1072
1073
// in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
1074
// out: eax, ebx, edi, mm0
1075
#define ROUND()   \
1076
    MXOR( 0, cl, 3) /* 11 */\
1077
  AS2(  mov cl, al)   /* 8,9,10,3 */\
1078
    XOR(  edi, ah, 2) /* 2 */\
1079
  AS2(  shr eax, 16)  /* 0,1 */\
1080
    XOR(  edi, bl, 3) /* 7 */\
1081
    MXOR( 0, bh, 2) /* 6 */\
1082
  AS2(  shr ebx, 16)  /* 4,5 */\
1083
    MXOR( 0, al, 1) /* 1 */\
1084
    MOV(  eax, ah, 0) /* 0 */\
1085
    XOR(  eax, bl, 1) /* 5 */\
1086
    MOV(  ebx, bh, 0) /* 4 */\
1087
    XOR(  eax, ch, 2) /* 10 */\
1088
    XOR(  ebx, cl, 3) /* 3 */\
1089
  AS2(  shr ecx, 16)  /* 8,9 */\
1090
    XOR(  eax, dl, 3) /* 15 */\
1091
    XOR(  ebx, dh, 2) /* 14 */\
1092
  AS2(  shr edx, 16)  /* 12,13 */\
1093
    XOR(  edi, ch, 0) /* 8 */\
1094
    XOR(  ebx, cl, 1) /* 9 */\
1095
    XOR(  edi, dl, 1) /* 13 */\
1096
    MXOR( 0, dh, 0) /* 12 */\
1097
1098
  ASL(2)  // 2-round loop
1099
  AS2(  MOVD  MM(0), [L_SUBKEYS-4*16+3*4])
1100
  AS2(  mov   edi, [L_SUBKEYS-4*16+2*4])
1101
  ROUND()
1102
  AS2(  mov   ecx, edi)
1103
  AS2(  xor   eax, [L_SUBKEYS-4*16+0*4])
1104
  AS2(  xor   ebx, [L_SUBKEYS-4*16+1*4])
1105
  AS2(  MOVD  edx, MM(0))
1106
1107
  ASL(4)
1108
  AS2(  MOVD  MM(0), [L_SUBKEYS-4*16+7*4])
1109
  AS2(  mov   edi, [L_SUBKEYS-4*16+6*4])
1110
  ROUND()
1111
  AS2(  mov   ecx, edi)
1112
  AS2(  xor   eax, [L_SUBKEYS-4*16+4*4])
1113
  AS2(  xor   ebx, [L_SUBKEYS-4*16+5*4])
1114
  AS2(  MOVD  edx, MM(0))
1115
1116
  AS2(  add   L_REG, 32)
1117
  AS2(  test  L_REG, 255)
1118
  ATT_NOPREFIX
1119
  ASJ(  jnz,  2, b)
1120
  INTEL_NOPREFIX
1121
  AS2(  sub   L_REG, 16*16)
1122
1123
#define LAST(a, b, c)                       \
1124
  AS2(  movzx esi, a                      )\
1125
  AS2(  movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
1126
  AS2(  movzx esi, b                      )\
1127
  AS2(  xor   edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0]  )\
1128
  AS2(  mov   WORD PTR [L_LASTROUND+c], di          )\
1129
1130
  // last round
1131
  LAST(ch, dl, 2)
1132
  LAST(dh, al, 6)
1133
  AS2(  shr   edx, 16)
1134
  LAST(ah, bl, 10)
1135
  AS2(  shr   eax, 16)
1136
  LAST(bh, cl, 14)
1137
  AS2(  shr   ebx, 16)
1138
  LAST(dh, al, 12)
1139
  AS2(  shr   ecx, 16)
1140
  LAST(ah, bl, 0)
1141
  LAST(bh, cl, 4)
1142
  LAST(ch, dl, 8)
1143
1144
  AS2(  mov   WORD_REG(ax), [L_OUTXORBLOCKS])
1145
  AS2(  mov   WORD_REG(bx), [L_OUTBLOCKS])
1146
1147
  AS2(  mov   WORD_REG(cx), [L_LENGTH])
1148
  AS2(  sub   WORD_REG(cx), 16)
1149
1150
  AS2(  movdqu  xmm2, [WORD_REG(ax)])
1151
  AS2(  pxor  xmm2, xmm4)
1152
1153
#if CRYPTOPP_BOOL_X86
1154
  AS2(  movdqa  xmm0, [L_INCREMENTS])
1155
  AS2(  paddd xmm0, [L_INBLOCKS])
1156
  AS2(  movdqa  [L_INBLOCKS], xmm0)
1157
#else
1158
  AS2(  movdqa  xmm0, [L_INCREMENTS+16])
1159
  AS2(  paddq xmm0, [L_INBLOCKS+16])
1160
  AS2(  movdqa  [L_INBLOCKS+16], xmm0)
1161
#endif
1162
1163
  AS2(  pxor  xmm2, [L_LASTROUND])
1164
  AS2(  movdqu  [WORD_REG(bx)], xmm2)
1165
1166
  ATT_NOPREFIX
1167
  ASJ(  jle,  7, f)
1168
  INTEL_NOPREFIX
1169
  AS2(  mov   [L_LENGTH], WORD_REG(cx))
1170
  AS2(  test  WORD_REG(cx), 1)
1171
  ATT_NOPREFIX
1172
  ASJ(  jnz,  1, b)
1173
  INTEL_NOPREFIX
1174
#if CRYPTOPP_BOOL_X64
1175
  AS2(  movdqa  xmm0, [L_INCREMENTS])
1176
  AS2(  paddq xmm0, [L_INBLOCKS])
1177
  AS2(  movdqa  [L_INBLOCKS], xmm0)
1178
#endif
1179
  ATT_NOPREFIX
1180
  ASJ(  jmp,  3, b)
1181
  INTEL_NOPREFIX
1182
1183
  ASL(7)
1184
  // erase keys on stack
1185
  AS2(  xorps xmm0, xmm0)
1186
  AS2(  lea   WORD_REG(ax), [L_SUBKEYS+7*16])
1187
  AS2(  movaps  [WORD_REG(ax)-7*16], xmm0)
1188
  AS2(  movaps  [WORD_REG(ax)-6*16], xmm0)
1189
  AS2(  movaps  [WORD_REG(ax)-5*16], xmm0)
1190
  AS2(  movaps  [WORD_REG(ax)-4*16], xmm0)
1191
  AS2(  movaps  [WORD_REG(ax)-3*16], xmm0)
1192
  AS2(  movaps  [WORD_REG(ax)-2*16], xmm0)
1193
  AS2(  movaps  [WORD_REG(ax)-1*16], xmm0)
1194
  AS2(  movaps  [WORD_REG(ax)+0*16], xmm0)
1195
  AS2(  movaps  [WORD_REG(ax)+1*16], xmm0)
1196
  AS2(  movaps  [WORD_REG(ax)+2*16], xmm0)
1197
  AS2(  movaps  [WORD_REG(ax)+3*16], xmm0)
1198
  AS2(  movaps  [WORD_REG(ax)+4*16], xmm0)
1199
  AS2(  movaps  [WORD_REG(ax)+5*16], xmm0)
1200
  AS2(  movaps  [WORD_REG(ax)+6*16], xmm0)
1201
#if CRYPTOPP_BOOL_X86
1202
  AS2(  mov   esp, [L_SP])
1203
  AS1(  emms)
1204
#endif
1205
  AS_POP_IF86(bp)
1206
  AS_POP_IF86(bx)
1207
#if defined(CRYPTOPP_MSC_VERSION) && CRYPTOPP_BOOL_X86
1208
  AS_POP_IF86(di)
1209
  AS_POP_IF86(si)
1210
  AS1(ret)
1211
#endif
1212
#ifdef CRYPTOPP_GENERATE_X64_MASM
1213
  pop r12
1214
  pop rbx
1215
  pop rdi
1216
  pop rsi
1217
  ret
1218
  Rijndael_Enc_AdvancedProcessBlocks ENDP
1219
#endif
1220
#ifdef __GNUC__
1221
  ATT_PREFIX
1222
  :
1223
  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1224
  : "memory", "cc", "%eax"
1225
  #if CRYPTOPP_BOOL_X64
1226
    , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1227
  #endif
1228
  );
1229
#endif
1230
}
1231
1232
#endif
1233
1234
#ifndef CRYPTOPP_GENERATE_X64_MASM
1235
1236
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
1237
extern "C" {
1238
void Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k);
1239
}
1240
#endif
1241
1242
#if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1243
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1244
493
{
1245
493
#if CRYPTOPP_AESNI_AVAILABLE
1246
493
  if (HasAESNI())
1247
493
    return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1248
0
#endif
1249
#if CRYPTOPP_ARM_AES_AVAILABLE
1250
  if (HasAES())
1251
    return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1252
#endif
1253
#if CRYPTOPP_POWER8_AES_AVAILABLE
1254
  if (HasAES())
1255
    return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1256
#endif
1257
1258
#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1259
  if (HasSSE2())
1260
  {
1261
    if (length < BLOCKSIZE)
1262
      return length;
1263
1264
    static const byte *zeros = (const byte*)(Te+256);
1265
    m_aliasBlock.SetMark(m_aliasBlock.size());
1266
    byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());
1267
1268
    // round up to the nearest 256-byte boundary
1269
    space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
1270
    while (AliasedWithTable(space, space + sizeof(Locals)))
1271
    {
1272
      space += 256;
1273
      CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
1274
    }
1275
1276
    size_t increment = BLOCKSIZE;
1277
    if (flags & BT_ReverseDirection)
1278
    {
1279
      CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
1280
      inBlocks += length - BLOCKSIZE;
1281
      xorBlocks += length - BLOCKSIZE;
1282
      outBlocks += length - BLOCKSIZE;
1283
      increment = 0-increment;
1284
    }
1285
1286
    Locals &locals = *(Locals *)(void *)space;
1287
1288
    locals.inBlocks = inBlocks;
1289
    locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1290
    locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1291
    locals.outBlocks = outBlocks;
1292
1293
    locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1294
    locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1295
    locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1296
    locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1297
1298
    locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1299
    int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1300
    locals.keysBegin = (12-keysToCopy)*16;
1301
1302
    Rijndael_Enc_AdvancedProcessBlocks_SSE2(&locals, m_key);
1303
1304
    return length % BLOCKSIZE;
1305
  }
1306
#endif
1307
1308
0
  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1309
493
}
1310
1311
size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1312
24
{
1313
24
#if CRYPTOPP_AESNI_AVAILABLE
1314
24
  if (HasAESNI())
1315
24
    return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1316
0
#endif
1317
#if CRYPTOPP_ARM_AES_AVAILABLE
1318
  if (HasAES())
1319
    return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1320
#endif
1321
#if CRYPTOPP_POWER8_AES_AVAILABLE
1322
  if (HasAES())
1323
    return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1324
#endif
1325
1326
0
  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1327
24
}
1328
#endif  // CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1329
1330
NAMESPACE_END
1331
1332
#endif
1333
#endif