Coverage Report

Created: 2020-03-26 13:53

/src/botan/src/lib/block/aes/aes_vperm/aes_vperm.cpp
Line | Count | Source
   1 |   | /*
   2 |   |  * AES using vector permutes (SSSE3, NEON)
   3 |   |  * (C) 2010,2016,2019 Jack Lloyd
   4 |   |  *
   5 |   |  * Based on public domain x86-64 assembly written by Mike Hamburg,
   6 |   |  * described in "Accelerating AES with Vector Permute Instructions"
   7 |   |  * (CHES 2009). His original code is available at
   8 |   |  * https://crypto.stanford.edu/vpaes/
   9 |   |  *
  10 |   |  * Botan is released under the Simplified BSD License (see license.txt)
  11 |   |  */
  12 |   |
  13 |   | #include <botan/aes.h>
  14 |   | #include <botan/internal/ct_utils.h>
  15 |   | #include <botan/internal/simd_32.h>
  16 |   |
  17 |   | #if defined(BOTAN_SIMD_USE_SSE2)
  18 |   |   #include <tmmintrin.h>
  19 |   | #endif
  20 |   |
  21 |   | namespace Botan {
  22 |   |
  23 |   | namespace {
  24 |   |
  25 |   | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) shuffle(SIMD_4x32 a, SIMD_4x32 b)
  26 | 0 |    {
  27 | 0 | #if defined(BOTAN_SIMD_USE_SSE2)
  28 | 0 |    return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw()));
  29 |   | #elif defined(BOTAN_SIMD_USE_NEON)
  30 |   |    const uint8x16_t tbl = vreinterpretq_u8_u32(a.raw());
  31 |   |    const uint8x16_t idx = vreinterpretq_u8_u32(b.raw());
  32 |   |
  33 |   | #if defined(BOTAN_TARGET_ARCH_IS_ARM32)
  34 |   |    const uint8x8x2_t tbl2 = { vget_low_u8(tbl), vget_high_u8(tbl) };
  35 |   |
  36 |   |    return SIMD_4x32(vreinterpretq_u32_u8(
  37 |   |                        vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx)),
  38 |   |                                    vtbl2_u8(tbl2, vget_high_u8(idx)))));
  39 |   |
  40 |   | #else
  41 |   |    return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(tbl, idx)));
  42 |   | #endif
  43 |   |
  44 |   | #elif defined(BOTAN_SIMD_USE_ALTIVEC)
  45 |   |
  46 |   |    const auto zero = vec_splat_s8(0x00);
  47 |   |    const auto mask = vec_cmplt((__vector signed char)b.raw(), zero);
  48 |   |    const auto r = vec_perm((__vector signed char)a.raw(), (__vector signed char)a.raw(), (__vector unsigned char)b.raw());
  49 |   |    return SIMD_4x32((__vector unsigned int)vec_sel(r, zero, mask));
  50 |   |
  51 |   | #else
  52 |   |    #error "No shuffle implementation available"
  53 |   | #endif
  54 |   |    }
  55 |   |
  56 |   | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) alignr8(SIMD_4x32 a, SIMD_4x32 b)
  57 | 0 |    {
  58 | 0 | #if defined(BOTAN_SIMD_USE_SSE2)
  59 | 0 |    return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8));
  60 |   | #elif defined(BOTAN_SIMD_USE_NEON)
  61 |   |    return SIMD_4x32(vextq_u32(b.raw(), a.raw(), 2));
  62 |   | #elif defined(BOTAN_SIMD_USE_ALTIVEC)
  63 |   |    const __vector unsigned char mask = {8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23};
  64 |   |    return SIMD_4x32(vec_perm(b.raw(), a.raw(), mask));
  65 |   | #else
  66 |   |    #error "No alignr8 implementation available"
  67 |   | #endif
  68 |   |    }
  69 |   |
  70 |   | const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
  71 |   | const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);
  72 |   |
  73 |   | const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
  74 |   | const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);
  75 |   |
  76 |   | const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
  77 |   | const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
  78 |   | const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
  79 |   | const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);
  80 |   |
  81 |   | const SIMD_4x32 sboud = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
  82 |   | const SIMD_4x32 sbotd = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);
  83 |   |
  84 |   | const SIMD_4x32 mc_forward[4] = {
  85 |   |    SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
  86 |   |    SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
  87 |   |    SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
  88 |   |    SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09)
  89 |   | };
  90 |   |
  91 |   | const SIMD_4x32 vperm_sr[4] = {
  92 |   |    SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C),
  93 |   |    SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C),
  94 |   |    SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C),
  95 |   |    SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
  96 |   | };
  97 |   |
  98 |   | const SIMD_4x32 rcon[10] = {
  99 |   |    SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
 100 |   |    SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
 101 |   |    SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
 102 |   |    SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
 103 |   |    SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
 104 |   |    SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
 105 |   |    SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
 106 |   |    SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
 107 |   |    SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
 108 |   |    SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
 109 |   | };
 110 |   |
 111 |   | const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
 112 |   | const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);
 113 |   |
 114 |   | const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
 115 |   | const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);
 116 |   |
 117 |   | const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
 118 |   | const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);
 119 |   |
 120 |   | const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004);
 121 |   | const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B);
 122 |   |
 123 |   | const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13);
 124 |   | const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D);
 125 |   |
 126 |   | const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
 127 |   | const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);
 128 |   |
 129 |   | const SIMD_4x32 mcx[4] = {
 130 |   |    SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
 131 |   |    SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
 132 |   |    SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
 133 |   |    SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
 134 |   | };
 135 |   |
 136 |   | const SIMD_4x32 mc_backward[4] = {
 137 |   |    SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
 138 |   |    SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
 139 |   |    SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407),
 140 |   |    SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
 141 |   | };
 142 |   |
 143 |   | const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
 144 |   |
 145 |   | inline SIMD_4x32 low_nibs(SIMD_4x32 x)
 146 | 0 |    {
 147 | 0 |    return lo_nibs_mask & x;
 148 | 0 |    }
 149 |   |
 150 |   | inline SIMD_4x32 high_nibs(SIMD_4x32 x)
 151 | 0 |    {
 152 | 0 |    return (x.shr<4>() & lo_nibs_mask);
 153 | 0 |    }
 154 |   |
 155 |   | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K)
 156 | 0 |    {
 157 | 0 |    return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K;
 158 | 0 |    }
 159 |   |
 160 |   | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
 161 | 0 |    {
 162 | 0 |    const SIMD_4x32 Bh = high_nibs(B);
 163 | 0 |    SIMD_4x32 Bl = low_nibs(B);
 164 | 0 |    const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
 165 | 0 |    Bl ^= Bh;
 166 | 0 |
 167 | 0 |    const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
 168 | 0 |    const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
 169 | 0 |
 170 | 0 |    const SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K;
 171 | 0 |    const SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);
 172 | 0 |
 173 | 0 |    return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
 174 | 0 |    }
 175 |   |
 176 |   | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
 177 | 0 |    {
 178 | 0 |    const SIMD_4x32 Bh = high_nibs(B);
 179 | 0 |    SIMD_4x32 Bl = low_nibs(B);
 180 | 0 |    const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
 181 | 0 |    Bl ^= Bh;
 182 | 0 |
 183 | 0 |    const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
 184 | 0 |    const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
 185 | 0 |
 186 | 0 |    return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, vperm_sr[r % 4]);
 187 | 0 |    }
 188 |   |
 189 |   | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K)
 190 | 0 |    {
 191 | 0 |    return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K;
 192 | 0 |    }
 193 |   |
 194 |   | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
 195 | 0 |    {
 196 | 0 |    const SIMD_4x32 Bh = high_nibs(B);
 197 | 0 |    B = low_nibs(B);
 198 | 0 |    const SIMD_4x32 t2 = shuffle(k_inv2, B);
 199 | 0 |
 200 | 0 |    B ^= Bh;
 201 | 0 |
 202 | 0 |    const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
 203 | 0 |    const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
 204 | 0 |
 205 | 0 |    const SIMD_4x32 mc = mcx[(r-1)%4];
 206 | 0 |
 207 | 0 |    const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K;
 208 | 0 |    const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6);
 209 | 0 |    const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6);
 210 | 0 |    return shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6);
 211 | 0 |    }
 212 |   |
 213 |   | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
 214 | 0 |    {
 215 | 0 |    const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;
 216 | 0 |
 217 | 0 |    const SIMD_4x32 Bh = high_nibs(B);
 218 | 0 |    B = low_nibs(B);
 219 | 0 |    const SIMD_4x32 t2 = shuffle(k_inv2, B);
 220 | 0 |
 221 | 0 |    B ^= Bh;
 222 | 0 |
 223 | 0 |    const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
 224 | 0 |    const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
 225 | 0 |
 226 | 0 |    const SIMD_4x32 x = shuffle(sboud, t5) ^ shuffle(sbotd, t6) ^ K;
 227 | 0 |    return shuffle(x, vperm_sr[which_sr]);
 228 | 0 |    }
 229 |   |
 230 |   | void BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
 231 |   |    vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
 232 |   |                         const SIMD_4x32 K[], size_t rounds)
 233 | 0 |    {
 234 | 0 |    CT::poison(in, blocks * 16);
 235 | 0 |
 236 | 0 |    const size_t blocks2 = blocks - (blocks % 2);
 237 | 0 |
 238 | 0 |    for(size_t i = 0; i != blocks2; i += 2)
 239 | 0 |       {
 240 | 0 |       SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
 241 | 0 |       SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);
 242 | 0 |
 243 | 0 |       B0 = aes_enc_first_round(B0, K[0]);
 244 | 0 |       B1 = aes_enc_first_round(B1, K[0]);
 245 | 0 |
 246 | 0 |       for(size_t r = 1; r != rounds; ++r)
 247 | 0 |          {
 248 | 0 |          B0 = aes_enc_round(B0, K[r], r);
 249 | 0 |          B1 = aes_enc_round(B1, K[r], r);
 250 | 0 |          }
 251 | 0 |
 252 | 0 |       B0 = aes_enc_last_round(B0, K[rounds], rounds);
 253 | 0 |       B1 = aes_enc_last_round(B1, K[rounds], rounds);
 254 | 0 |
 255 | 0 |       B0.store_le(out + i*16);
 256 | 0 |       B1.store_le(out + (i+1)*16);
 257 | 0 |       }
 258 | 0 |
 259 | 0 |    for(size_t i = blocks2; i < blocks; ++i)
 260 | 0 |       {
 261 | 0 |       SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ???
 262 | 0 |
 263 | 0 |       B = aes_enc_first_round(B, K[0]);
 264 | 0 |
 265 | 0 |       for(size_t r = 1; r != rounds; ++r)
 266 | 0 |          {
 267 | 0 |          B = aes_enc_round(B, K[r], r);
 268 | 0 |          }
 269 | 0 |
 270 | 0 |       B = aes_enc_last_round(B, K[rounds], rounds);
 271 | 0 |       B.store_le(out + i*16);
 272 | 0 |       }
 273 | 0 |
 274 | 0 |    CT::unpoison(in,  blocks * 16);
 275 | 0 |    CT::unpoison(out, blocks * 16);
 276 | 0 |    }
 277 |   |
 278 |   | void BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
 279 |   |    vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
 280 |   |                         const SIMD_4x32 K[], size_t rounds)
 281 | 0 |    {
 282 | 0 |    CT::poison(in, blocks * 16);
 283 | 0 |
 284 | 0 |    const size_t blocks2 = blocks - (blocks % 2);
 285 | 0 |
 286 | 0 |    for(size_t i = 0; i != blocks2; i += 2)
 287 | 0 |       {
 288 | 0 |       SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
 289 | 0 |       SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);
 290 | 0 |
 291 | 0 |       B0 = aes_dec_first_round(B0, K[0]);
 292 | 0 |       B1 = aes_dec_first_round(B1, K[0]);
 293 | 0 |
 294 | 0 |       for(size_t r = 1; r != rounds; ++r)
 295 | 0 |          {
 296 | 0 |          B0 = aes_dec_round(B0, K[r], r);
 297 | 0 |          B1 = aes_dec_round(B1, K[r], r);
 298 | 0 |          }
 299 | 0 |
 300 | 0 |       B0 = aes_dec_last_round(B0, K[rounds], rounds);
 301 | 0 |       B1 = aes_dec_last_round(B1, K[rounds], rounds);
 302 | 0 |
 303 | 0 |       B0.store_le(out + i*16);
 304 | 0 |       B1.store_le(out + (i+1)*16);
 305 | 0 |       }
 306 | 0 |
 307 | 0 |    for(size_t i = blocks2; i < blocks; ++i)
 308 | 0 |       {
 309 | 0 |       SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ???
 310 | 0 |
 311 | 0 |       B = aes_dec_first_round(B, K[0]);
 312 | 0 |
 313 | 0 |       for(size_t r = 1; r != rounds; ++r)
 314 | 0 |          {
 315 | 0 |          B = aes_dec_round(B, K[r], r);
 316 | 0 |          }
 317 | 0 |
 318 | 0 |       B = aes_dec_last_round(B, K[rounds], rounds);
 319 | 0 |       B.store_le(out + i*16);
 320 | 0 |       }
 321 | 0 |
 322 | 0 |    CT::unpoison(in,  blocks * 16);
 323 | 0 |    CT::unpoison(out, blocks * 16);
 324 | 0 |    }
 325 |   |
 326 |   | }
 327 |   |
 328 |   | void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 329 | 0 |    {
 330 | 0 |    const SIMD_4x32 K[11] = {
 331 | 0 |       SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
 332 | 0 |       SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
 333 | 0 |       SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
 334 | 0 |       SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]),
 335 | 0 |    };
 336 | 0 |
 337 | 0 |    return vperm_encrypt_blocks(in, out, blocks, K, 10);
 338 | 0 |    }
 339 |   |
 340 |   | void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 341 | 0 |    {
 342 | 0 |    const SIMD_4x32 K[11] = {
 343 | 0 |       SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
 344 | 0 |       SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
 345 | 0 |       SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
 346 | 0 |       SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]),
 347 | 0 |    };
 348 | 0 |
 349 | 0 |    return vperm_decrypt_blocks(in, out, blocks, K, 10);
 350 | 0 |    }
 351 |   |
 352 |   | void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 353 | 0 |    {
 354 | 0 |    const SIMD_4x32 K[13] = {
 355 | 0 |       SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
 356 | 0 |       SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
 357 | 0 |       SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
 358 | 0 |       SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]),
 359 | 0 |       SIMD_4x32(&m_EK[4*12]),
 360 | 0 |    };
 361 | 0 |
 362 | 0 |    return vperm_encrypt_blocks(in, out, blocks, K, 12);
 363 | 0 |    }
 364 |   |
 365 |   | void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 366 | 0 |    {
 367 | 0 |    const SIMD_4x32 K[13] = {
 368 | 0 |       SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
 369 | 0 |       SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
 370 | 0 |       SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
 371 | 0 |       SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]),
 372 | 0 |       SIMD_4x32(&m_DK[4*12]),
 373 | 0 |    };
 374 | 0 |
 375 | 0 |    return vperm_decrypt_blocks(in, out, blocks, K, 12);
 376 | 0 |    }
 377 |   |
 378 |   | void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 379 | 0 |    {
 380 | 0 |    const SIMD_4x32 K[15] = {
 381 | 0 |       SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
 382 | 0 |       SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
 383 | 0 |       SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
 384 | 0 |       SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]),
 385 | 0 |       SIMD_4x32(&m_EK[4*12]), SIMD_4x32(&m_EK[4*13]), SIMD_4x32(&m_EK[4*14]),
 386 | 0 |    };
 387 | 0 |
 388 | 0 |    return vperm_encrypt_blocks(in, out, blocks, K, 14);
 389 | 0 |    }
 390 |   |
 391 |   | void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 392 | 0 |    {
 393 | 0 |    const SIMD_4x32 K[15] = {
 394 | 0 |       SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
 395 | 0 |       SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
 396 | 0 |       SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
 397 | 0 |       SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]),
 398 | 0 |       SIMD_4x32(&m_DK[4*12]), SIMD_4x32(&m_DK[4*13]), SIMD_4x32(&m_DK[4*14]),
 399 | 0 |    };
 400 | 0 |
 401 | 0 |    return vperm_decrypt_blocks(in, out, blocks, K, 14);
 402 | 0 |    }
 403 |   |
 404 |   | namespace {
 405 |   |
 406 |   | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
 407 |   |    aes_schedule_transform(SIMD_4x32 input,
 408 |   |                           SIMD_4x32 table_1,
 409 |   |                           SIMD_4x32 table_2)
 410 | 0 |    {
 411 | 0 |    return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input));
 412 | 0 |    }
 413 |   |
 414 |   | SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no)
 415 | 0 |    {
 416 | 0 |    const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
 417 | 0 |
 418 | 0 |    SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
 419 | 0 |    SIMD_4x32 t2 = t;
 420 | 0 |    t = shuffle(t, mc_forward0);
 421 | 0 |    t2 = t ^ t2 ^ shuffle(t, mc_forward0);
 422 | 0 |    return shuffle(t2, vperm_sr[round_no % 4]);
 423 | 0 |    }
 424 |   |
 425 |   | SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no)
 426 | 0 |    {
 427 | 0 |    const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
 428 | 0 |
 429 | 0 |    const SIMD_4x32 dsk[8] = {
 430 | 0 |       SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334),
 431 | 0 |       SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC),
 432 | 0 |       SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A),
 433 | 0 |       SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C),
 434 | 0 |       SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9),
 435 | 0 |       SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D),
 436 | 0 |       SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3),
 437 | 0 |       SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4),
 438 | 0 |    };
 439 | 0 |
 440 | 0 |    SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]);
 441 | 0 |    SIMD_4x32 output = shuffle(t, mc_forward0);
 442 | 0 |
 443 | 0 |    t = aes_schedule_transform(t, dsk[2], dsk[3]);
 444 | 0 |    output = shuffle(t ^ output, mc_forward0);
 445 | 0 |
 446 | 0 |    t = aes_schedule_transform(t, dsk[4], dsk[5]);
 447 | 0 |    output = shuffle(t ^ output, mc_forward0);
 448 | 0 |
 449 | 0 |    t = aes_schedule_transform(t, dsk[6], dsk[7]);
 450 | 0 |    output = shuffle(t ^ output, mc_forward0);
 451 | 0 |
 452 | 0 |    return shuffle(output, vperm_sr[round_no % 4]);
 453 | 0 |    }
 454 |   |
 455 |   | SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no)
 456 | 0 |    {
 457 | 0 |    const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
 458 | 0 |    const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
 459 | 0 |
 460 | 0 |    k = shuffle(k, vperm_sr[round_no % 4]);
 461 | 0 |    k ^= SIMD_4x32::splat_u8(0x5B);
 462 | 0 |    return aes_schedule_transform(k, out_tr1, out_tr2);
 463 | 0 |    }
 464 |   |
 465 |   | SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_last_dec(SIMD_4x32 k)
 466 | 0 |    {
 467 | 0 |    const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
 468 | 0 |    const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
 469 | 0 |
 470 | 0 |    k ^= SIMD_4x32::splat_u8(0x5B);
 471 | 0 |    return aes_schedule_transform(k, deskew1, deskew2);
 472 | 0 |    }
 473 |   |
 474 |   | SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
 475 | 0 |    {
 476 | 0 |    SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
 477 | 0 |    smeared ^= smeared.shift_elems_left<2>();
 478 | 0 |    smeared ^= SIMD_4x32::splat_u8(0x5B);
 479 | 0 |
 480 | 0 |    const SIMD_4x32 Bh = high_nibs(input1);
 481 | 0 |    SIMD_4x32 Bl = low_nibs(input1);
 482 | 0 |
 483 | 0 |    const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
 484 | 0 |
 485 | 0 |    Bl ^= Bh;
 486 | 0 |
 487 | 0 |    SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
 488 | 0 |    SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
 489 | 0 |
 490 | 0 |    return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
 491 | 0 |    }
 492 |   |
 493 |   | SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2)
 494 | 0 |    {
 495 | 0 |    // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3)));
 496 | 0 |    const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D);
 497 | 0 |    return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc);
 498 | 0 |    }
 499 |   |
 500 |   | SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y)
 501 | 0 |    {
 502 | 0 |    const SIMD_4x32 shuffle3332 =
 503 | 0 |       SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
 504 | 0 |    const SIMD_4x32 shuffle2000 =
 505 | 0 |       SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);
 506 | 0 |
 507 | 0 |    const SIMD_4x32 zero_top_half(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
 508 | 0 |    y &= zero_top_half;
 509 | 0 |    return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
 510 | 0 |    }
 511 |   |
 512 |   | }
 513 |   |
 514 |   | void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t)
 515 | 0 |    {
 516 | 0 |    m_EK.resize(11*4);
 517 | 0 |    m_DK.resize(11*4);
 518 | 0 |
 519 | 0 |    SIMD_4x32 key = SIMD_4x32::load_le(keyb);
 520 | 0 |
 521 | 0 |    shuffle(key, vperm_sr[2]).store_le(&m_DK[4*10]);
 522 | 0 |
 523 | 0 |    key = aes_schedule_transform(key, k_ipt1, k_ipt2);
 524 | 0 |    key.store_le(&m_EK[0]);
 525 | 0 |
 526 | 0 |    for(size_t i = 1; i != 10; ++i)
 527 | 0 |       {
 528 | 0 |       key = aes_schedule_round(rcon[i-1], key, key);
 529 | 0 |
 530 | 0 |       aes_schedule_mangle(key, (12-i) % 4).store_le(&m_EK[4*i]);
 531 | 0 |
 532 | 0 |       aes_schedule_mangle_dec(key, (10-i)%4).store_le(&m_DK[4*(10-i)]);
 533 | 0 |       }
 534 | 0 |
 535 | 0 |    key = aes_schedule_round(rcon[9], key, key);
 536 | 0 |    aes_schedule_mangle_last(key, 2).store_le(&m_EK[4*10]);
 537 | 0 |    aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
 538 | 0 |    }
 539 |   |
 540 |   | void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t)
 541 | 0 |    {
 542 | 0 |    m_EK.resize(13*4);
 543 | 0 |    m_DK.resize(13*4);
 544 | 0 |
 545 | 0 |    SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
 546 | 0 |    SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8);
 547 | 0 |
 548 | 0 |    shuffle(key1, vperm_sr[0]).store_le(&m_DK[12*4]);
 549 | 0 |
 550 | 0 |    key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
 551 | 0 |    key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
 552 | 0 |
 553 | 0 |    key1.store_le(&m_EK[0]);
 554 | 0 |
 555 | 0 |    for(size_t i = 0; i != 4; ++i)
 556 | 0 |       {
 557 | 0 |       // key2 with 8 high bytes masked off
 558 | 0 |       SIMD_4x32 t = key2;
 559 | 0 |       key2 = aes_schedule_round(rcon[2*i], key2, key1);
 560 | 0 |       const SIMD_4x32 key2t = alignr8(key2, t);
 561 | 0 |       aes_schedule_mangle(key2t, (i+3)%4).store_le(&m_EK[4*(3*i+1)]);
 562 | 0 |       aes_schedule_mangle_dec(key2t, (i+3)%4).store_le(&m_DK[4*(11-3*i)]);
 563 | 0 |
 564 | 0 |       t = aes_schedule_192_smear(key2, t);
 565 | 0 |
 566 | 0 |       aes_schedule_mangle(t, (i+2)%4).store_le(&m_EK[4*(3*i+2)]);
 567 | 0 |       aes_schedule_mangle_dec(t, (i+2)%4).store_le(&m_DK[4*(10-3*i)]);
 568 | 0 |
 569 | 0 |       key2 = aes_schedule_round(rcon[2*i+1], t, key2);
 570 | 0 |
 571 | 0 |       if(i == 3)
 572 | 0 |          {
 573 | 0 |          aes_schedule_mangle_last(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]);
 574 | 0 |          aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4*(9-3*i)]);
 575 | 0 |          }
 576 | 0 |       else
 577 | 0 |          {
 578 | 0 |          aes_schedule_mangle(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]);
 579 | 0 |          aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(9-3*i)]);
 580 | 0 |          }
 581 | 0 |
 582 | 0 |       key1 = key2;
 583 | 0 |       key2 = aes_schedule_192_smear(key2, t);
 584 | 0 |       }
 585 | 0 |    }
 586 |   |
 587 |   | void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t)
 588 | 0 |    {
 589 | 0 |    m_EK.resize(15*4);
 590 | 0 |    m_DK.resize(15*4);
 591 | 0 |
 592 | 0 |    SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
 593 | 0 |    SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16);
 594 | 0 |
 595 | 0 |    shuffle(key1, vperm_sr[2]).store_le(&m_DK[4*14]);
 596 | 0 |
 597 | 0 |    key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
 598 | 0 |    key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
 599 | 0 |
 600 | 0 |    key1.store_le(&m_EK[0]);
 601 | 0 |    aes_schedule_mangle(key2, 3).store_le(&m_EK[4]);
 602 | 0 |
 603 | 0 |    aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4*13]);
 604 | 0 |
 605 | 0 |    const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);
 606 | 0 |
 607 | 0 |    for(size_t i = 2; i != 14; i += 2)
 608 | 0 |       {
 609 | 0 |       const SIMD_4x32 k_t = key2;
 610 | 0 |       key1 = key2 = aes_schedule_round(rcon[(i/2)-1], key2, key1);
 611 | 0 |
 612 | 0 |       aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]);
 613 | 0 |       aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]);
 614 | 0 |
 615 | 0 |       key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t);
 616 | 0 |
 617 | 0 |       aes_schedule_mangle(key2, (i-1)%4).store_le(&m_EK[4*(i+1)]);
 618 | 0 |       aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]);
 619 | 0 |       }
 620 | 0 |
 621 | 0 |    key2 = aes_schedule_round(rcon[6], key2, key1);
 622 | 0 |
 623 | 0 |    aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4*14]);
 624 | 0 |    aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
 625 | 0 |    }
 626 |   |
 627 |   | }
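
Every instrumented line above reports an execution count of 0: the vperm AES implementation was not exercised by this coverage run. For reference, below is a minimal sketch of how these code paths are normally reached, through Botan's public BlockCipher interface. It is illustrative only; whether the vperm backend (as opposed to the base implementation or hardware AES) is actually selected depends on the modules enabled at build time and on runtime CPU feature detection, and the key and block values shown are arbitrary.

#include <botan/block_cipher.h>

#include <iostream>
#include <vector>

int main()
   {
   // Ask the factory for AES-128; the backend (base, vperm, hardware AES, ...)
   // is chosen at runtime from the enabled modules and CPU feature detection.
   auto aes = Botan::BlockCipher::create_or_throw("AES-128");

   const std::vector<uint8_t> key(16, 0xAB); // arbitrary 16-byte test key
   std::vector<uint8_t> block(16, 0x00);     // a single 16-byte block

   aes->set_key(key);
   aes->encrypt(block); // dispatches to AES_128::encrypt_n, possibly the vperm path
   aes->decrypt(block); // dispatches to AES_128::decrypt_n

   // Reports which implementation was picked (e.g. "vperm" when this file is used)
   std::cout << "AES-128 provider: " << aes->provider() << "\n";
   return 0;
   }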