Coverage Report

Created: 2025-04-11 06:34

/src/botan/src/lib/block/aes/aes_vperm/aes_vperm.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* AES using vector permutes (SSSE3, NEON)
3
* (C) 2010,2016,2019 Jack Lloyd
4
*
5
* Based on public domain x86-64 assembly written by Mike Hamburg,
6
* described in "Accelerating AES with Vector Permute Instructions"
7
* (CHES 2009). His original code is available at
8
* https://crypto.stanford.edu/vpaes/
9
*
10
* Botan is released under the Simplified BSD License (see license.txt)
11
*/
12
13
#include <botan/internal/aes.h>
14
15
#include <botan/internal/ct_utils.h>
16
#include <botan/internal/isa_extn.h>
17
#include <botan/internal/simd_4x32.h>
18
#include <botan/internal/target_info.h>
19
#include <bit>
20
21
namespace Botan {
22
23
namespace {
24
25
0
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 shuffle(SIMD_4x32 tbl, SIMD_4x32 idx) {
26
0
   if constexpr(std::endian::native == std::endian::little) {
27
0
      return SIMD_4x32::byte_shuffle(tbl, idx);
28
   } else {
29
      return SIMD_4x32::byte_shuffle(tbl.bswap(), idx.bswap()).bswap();
30
   }
31
0
}
32
33
0
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 masked_shuffle(SIMD_4x32 tbl, SIMD_4x32 idx) {
34
0
   if constexpr(std::endian::native == std::endian::little) {
35
0
      return SIMD_4x32::masked_byte_shuffle(tbl, idx);
36
   } else {
37
      return SIMD_4x32::masked_byte_shuffle(tbl.bswap(), idx.bswap()).bswap();
38
   }
39
0
}
40
41
const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
42
const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);
43
44
const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
45
const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);
46
47
const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
48
const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
49
const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
50
const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);
51
52
const SIMD_4x32 sboud = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
53
const SIMD_4x32 sbotd = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);
54
55
const SIMD_4x32 mc_forward[4] = {SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
56
                                 SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
57
                                 SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
58
                                 SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09)};
59
60
const SIMD_4x32 vperm_sr[4] = {
61
   SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C),
62
   SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C),
63
   SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C),
64
   SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
65
};
66
67
const SIMD_4x32 rcon[10] = {
68
   SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
69
   SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
70
   SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
71
   SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
72
   SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
73
   SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
74
   SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
75
   SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
76
   SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
77
   SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
78
};
79
80
const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
81
const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);
82
83
const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
84
const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);
85
86
const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
87
const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);
88
89
const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004);
90
const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B);
91
92
const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13);
93
const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D);
94
95
const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
96
const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);
97
98
const SIMD_4x32 mcx[4] = {
99
   SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
100
   SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
101
   SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
102
   SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
103
};
104
105
const SIMD_4x32 mc_backward[4] = {
106
   SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
107
   SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
108
   SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407),
109
   SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
110
};
111
112
const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
113
114
0
inline SIMD_4x32 low_nibs(SIMD_4x32 x) {
115
0
   return lo_nibs_mask & x;
116
0
}
117
118
0
inline SIMD_4x32 high_nibs(SIMD_4x32 x) {
119
0
   return (x.shr<4>() & lo_nibs_mask);
120
0
}
121
122
0
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K) {
123
0
   return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K;
124
0
}
125
126
0
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
127
0
   const SIMD_4x32 Bh = high_nibs(B);
128
0
   SIMD_4x32 Bl = low_nibs(B);
129
0
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
130
0
   Bl ^= Bh;
131
132
0
   const SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
133
0
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
134
135
0
   const SIMD_4x32 t7 = masked_shuffle(sb1t, t6) ^ masked_shuffle(sb1u, t5) ^ K;
136
0
   const SIMD_4x32 t8 = masked_shuffle(sb2t, t6) ^ masked_shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);
137
138
0
   return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
139
0
}
140
141
0
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
142
0
   const SIMD_4x32 Bh = high_nibs(B);
143
0
   SIMD_4x32 Bl = low_nibs(B);
144
0
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
145
0
   Bl ^= Bh;
146
147
0
   const SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
148
0
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
149
150
0
   return shuffle(masked_shuffle(sbou, t5) ^ masked_shuffle(sbot, t6) ^ K, vperm_sr[r % 4]);
151
0
}
152
153
0
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K) {
154
0
   return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K;
155
0
}
156
157
0
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
158
0
   const SIMD_4x32 Bh = high_nibs(B);
159
0
   B = low_nibs(B);
160
0
   const SIMD_4x32 t2 = shuffle(k_inv2, B);
161
162
0
   B ^= Bh;
163
164
0
   const SIMD_4x32 t5 = B ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
165
0
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
166
167
0
   const SIMD_4x32 mc = mcx[(r - 1) % 4];
168
169
0
   const SIMD_4x32 t8 = masked_shuffle(sb9t, t6) ^ masked_shuffle(sb9u, t5) ^ K;
170
0
   const SIMD_4x32 t9 = shuffle(t8, mc) ^ masked_shuffle(sbdu, t5) ^ masked_shuffle(sbdt, t6);
171
0
   const SIMD_4x32 t12 = shuffle(t9, mc) ^ masked_shuffle(sbbu, t5) ^ masked_shuffle(sbbt, t6);
172
0
   return shuffle(t12, mc) ^ masked_shuffle(sbeu, t5) ^ masked_shuffle(sbet, t6);
173
0
}
174
175
0
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
176
0
   const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;
177
178
0
   const SIMD_4x32 Bh = high_nibs(B);
179
0
   B = low_nibs(B);
180
0
   const SIMD_4x32 t2 = shuffle(k_inv2, B);
181
182
0
   B ^= Bh;
183
184
0
   const SIMD_4x32 t5 = B ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
185
0
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
186
187
0
   const SIMD_4x32 x = masked_shuffle(sboud, t5) ^ masked_shuffle(sbotd, t6) ^ K;
188
0
   return shuffle(x, vperm_sr[which_sr]);
189
0
}
190
191
void BOTAN_FN_ISA_SIMD_4X32
192
0
vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, const SIMD_4x32 K[], size_t rounds) {
193
0
   CT::poison(in, blocks * 16);
194
195
0
   const size_t blocks2 = blocks - (blocks % 2);
196
197
0
   for(size_t i = 0; i != blocks2; i += 2) {
198
0
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + i * 16);
199
0
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i + 1) * 16);
200
201
0
      B0 = aes_enc_first_round(B0, K[0]);
202
0
      B1 = aes_enc_first_round(B1, K[0]);
203
204
0
      for(size_t r = 1; r != rounds; ++r) {
205
0
         B0 = aes_enc_round(B0, K[r], r);
206
0
         B1 = aes_enc_round(B1, K[r], r);
207
0
      }
208
209
0
      B0 = aes_enc_last_round(B0, K[rounds], rounds);
210
0
      B1 = aes_enc_last_round(B1, K[rounds], rounds);
211
212
0
      B0.store_le(out + i * 16);
213
0
      B1.store_le(out + (i + 1) * 16);
214
0
   }
215
216
0
   for(size_t i = blocks2; i < blocks; ++i) {
217
0
      SIMD_4x32 B = SIMD_4x32::load_le(in + i * 16);  // tail: handle the final block when the count is odd
218
219
0
      B = aes_enc_first_round(B, K[0]);
220
221
0
      for(size_t r = 1; r != rounds; ++r) {
222
0
         B = aes_enc_round(B, K[r], r);
223
0
      }
224
225
0
      B = aes_enc_last_round(B, K[rounds], rounds);
226
0
      B.store_le(out + i * 16);
227
0
   }
228
229
0
   CT::unpoison(in, blocks * 16);
230
0
   CT::unpoison(out, blocks * 16);
231
0
}
232
233
void BOTAN_FN_ISA_SIMD_4X32
234
0
vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, const SIMD_4x32 K[], size_t rounds) {
235
0
   CT::poison(in, blocks * 16);
236
237
0
   const size_t blocks2 = blocks - (blocks % 2);
238
239
0
   for(size_t i = 0; i != blocks2; i += 2) {
240
0
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + i * 16);
241
0
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i + 1) * 16);
242
243
0
      B0 = aes_dec_first_round(B0, K[0]);
244
0
      B1 = aes_dec_first_round(B1, K[0]);
245
246
0
      for(size_t r = 1; r != rounds; ++r) {
247
0
         B0 = aes_dec_round(B0, K[r], r);
248
0
         B1 = aes_dec_round(B1, K[r], r);
249
0
      }
250
251
0
      B0 = aes_dec_last_round(B0, K[rounds], rounds);
252
0
      B1 = aes_dec_last_round(B1, K[rounds], rounds);
253
254
0
      B0.store_le(out + i * 16);
255
0
      B1.store_le(out + (i + 1) * 16);
256
0
   }
257
258
0
   for(size_t i = blocks2; i < blocks; ++i) {
259
0
      SIMD_4x32 B = SIMD_4x32::load_le(in + i * 16);  // tail: handle the final block when the count is odd
260
261
0
      B = aes_dec_first_round(B, K[0]);
262
263
0
      for(size_t r = 1; r != rounds; ++r) {
264
0
         B = aes_dec_round(B, K[r], r);
265
0
      }
266
267
0
      B = aes_dec_last_round(B, K[rounds], rounds);
268
0
      B.store_le(out + i * 16);
269
0
   }
270
271
0
   CT::unpoison(in, blocks * 16);
272
0
   CT::unpoison(out, blocks * 16);
273
0
}
274
275
}  // namespace
276
277
0
void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
278
0
   const SIMD_4x32 K[11] = {
279
0
      SIMD_4x32::load_le(&m_EK[4 * 0]),
280
0
      SIMD_4x32::load_le(&m_EK[4 * 1]),
281
0
      SIMD_4x32::load_le(&m_EK[4 * 2]),
282
0
      SIMD_4x32::load_le(&m_EK[4 * 3]),
283
0
      SIMD_4x32::load_le(&m_EK[4 * 4]),
284
0
      SIMD_4x32::load_le(&m_EK[4 * 5]),
285
0
      SIMD_4x32::load_le(&m_EK[4 * 6]),
286
0
      SIMD_4x32::load_le(&m_EK[4 * 7]),
287
0
      SIMD_4x32::load_le(&m_EK[4 * 8]),
288
0
      SIMD_4x32::load_le(&m_EK[4 * 9]),
289
0
      SIMD_4x32::load_le(&m_EK[4 * 10]),
290
0
   };
291
292
0
   return vperm_encrypt_blocks(in, out, blocks, K, 10);
293
0
}
294
295
0
void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
296
0
   const SIMD_4x32 K[11] = {
297
0
      SIMD_4x32::load_le(&m_DK[4 * 0]),
298
0
      SIMD_4x32::load_le(&m_DK[4 * 1]),
299
0
      SIMD_4x32::load_le(&m_DK[4 * 2]),
300
0
      SIMD_4x32::load_le(&m_DK[4 * 3]),
301
0
      SIMD_4x32::load_le(&m_DK[4 * 4]),
302
0
      SIMD_4x32::load_le(&m_DK[4 * 5]),
303
0
      SIMD_4x32::load_le(&m_DK[4 * 6]),
304
0
      SIMD_4x32::load_le(&m_DK[4 * 7]),
305
0
      SIMD_4x32::load_le(&m_DK[4 * 8]),
306
0
      SIMD_4x32::load_le(&m_DK[4 * 9]),
307
0
      SIMD_4x32::load_le(&m_DK[4 * 10]),
308
0
   };
309
310
0
   return vperm_decrypt_blocks(in, out, blocks, K, 10);
311
0
}
312
313
0
void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
314
0
   const SIMD_4x32 K[13] = {
315
0
      SIMD_4x32::load_le(&m_EK[4 * 0]),
316
0
      SIMD_4x32::load_le(&m_EK[4 * 1]),
317
0
      SIMD_4x32::load_le(&m_EK[4 * 2]),
318
0
      SIMD_4x32::load_le(&m_EK[4 * 3]),
319
0
      SIMD_4x32::load_le(&m_EK[4 * 4]),
320
0
      SIMD_4x32::load_le(&m_EK[4 * 5]),
321
0
      SIMD_4x32::load_le(&m_EK[4 * 6]),
322
0
      SIMD_4x32::load_le(&m_EK[4 * 7]),
323
0
      SIMD_4x32::load_le(&m_EK[4 * 8]),
324
0
      SIMD_4x32::load_le(&m_EK[4 * 9]),
325
0
      SIMD_4x32::load_le(&m_EK[4 * 10]),
326
0
      SIMD_4x32::load_le(&m_EK[4 * 11]),
327
0
      SIMD_4x32::load_le(&m_EK[4 * 12]),
328
0
   };
329
330
0
   return vperm_encrypt_blocks(in, out, blocks, K, 12);
331
0
}
332
333
0
void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
334
0
   const SIMD_4x32 K[13] = {
335
0
      SIMD_4x32::load_le(&m_DK[4 * 0]),
336
0
      SIMD_4x32::load_le(&m_DK[4 * 1]),
337
0
      SIMD_4x32::load_le(&m_DK[4 * 2]),
338
0
      SIMD_4x32::load_le(&m_DK[4 * 3]),
339
0
      SIMD_4x32::load_le(&m_DK[4 * 4]),
340
0
      SIMD_4x32::load_le(&m_DK[4 * 5]),
341
0
      SIMD_4x32::load_le(&m_DK[4 * 6]),
342
0
      SIMD_4x32::load_le(&m_DK[4 * 7]),
343
0
      SIMD_4x32::load_le(&m_DK[4 * 8]),
344
0
      SIMD_4x32::load_le(&m_DK[4 * 9]),
345
0
      SIMD_4x32::load_le(&m_DK[4 * 10]),
346
0
      SIMD_4x32::load_le(&m_DK[4 * 11]),
347
0
      SIMD_4x32::load_le(&m_DK[4 * 12]),
348
0
   };
349
350
0
   return vperm_decrypt_blocks(in, out, blocks, K, 12);
351
0
}
352
353
0
void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
354
0
   const SIMD_4x32 K[15] = {
355
0
      SIMD_4x32::load_le(&m_EK[4 * 0]),
356
0
      SIMD_4x32::load_le(&m_EK[4 * 1]),
357
0
      SIMD_4x32::load_le(&m_EK[4 * 2]),
358
0
      SIMD_4x32::load_le(&m_EK[4 * 3]),
359
0
      SIMD_4x32::load_le(&m_EK[4 * 4]),
360
0
      SIMD_4x32::load_le(&m_EK[4 * 5]),
361
0
      SIMD_4x32::load_le(&m_EK[4 * 6]),
362
0
      SIMD_4x32::load_le(&m_EK[4 * 7]),
363
0
      SIMD_4x32::load_le(&m_EK[4 * 8]),
364
0
      SIMD_4x32::load_le(&m_EK[4 * 9]),
365
0
      SIMD_4x32::load_le(&m_EK[4 * 10]),
366
0
      SIMD_4x32::load_le(&m_EK[4 * 11]),
367
0
      SIMD_4x32::load_le(&m_EK[4 * 12]),
368
0
      SIMD_4x32::load_le(&m_EK[4 * 13]),
369
0
      SIMD_4x32::load_le(&m_EK[4 * 14]),
370
0
   };
371
372
0
   return vperm_encrypt_blocks(in, out, blocks, K, 14);
373
0
}
374
375
0
void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
376
0
   const SIMD_4x32 K[15] = {
377
0
      SIMD_4x32::load_le(&m_DK[4 * 0]),
378
0
      SIMD_4x32::load_le(&m_DK[4 * 1]),
379
0
      SIMD_4x32::load_le(&m_DK[4 * 2]),
380
0
      SIMD_4x32::load_le(&m_DK[4 * 3]),
381
0
      SIMD_4x32::load_le(&m_DK[4 * 4]),
382
0
      SIMD_4x32::load_le(&m_DK[4 * 5]),
383
0
      SIMD_4x32::load_le(&m_DK[4 * 6]),
384
0
      SIMD_4x32::load_le(&m_DK[4 * 7]),
385
0
      SIMD_4x32::load_le(&m_DK[4 * 8]),
386
0
      SIMD_4x32::load_le(&m_DK[4 * 9]),
387
0
      SIMD_4x32::load_le(&m_DK[4 * 10]),
388
0
      SIMD_4x32::load_le(&m_DK[4 * 11]),
389
0
      SIMD_4x32::load_le(&m_DK[4 * 12]),
390
0
      SIMD_4x32::load_le(&m_DK[4 * 13]),
391
0
      SIMD_4x32::load_le(&m_DK[4 * 14]),
392
0
   };
393
394
0
   return vperm_decrypt_blocks(in, out, blocks, K, 14);
395
0
}
396
397
namespace {
398
399
0
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_transform(SIMD_4x32 input, SIMD_4x32 table_1, SIMD_4x32 table_2) {
400
0
   return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input));
401
0
}
402
403
0
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no) {
404
0
   const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
405
406
0
   SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
407
0
   SIMD_4x32 t2 = t;
408
0
   t = shuffle(t, mc_forward0);
409
0
   t2 = t ^ t2 ^ shuffle(t, mc_forward0);
410
0
   return shuffle(t2, vperm_sr[round_no % 4]);
411
0
}
412
413
0
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no) {
414
0
   const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
415
416
0
   const SIMD_4x32 dsk[8] = {
417
0
      SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334),
418
0
      SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC),
419
0
      SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A),
420
0
      SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C),
421
0
      SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9),
422
0
      SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D),
423
0
      SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3),
424
0
      SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4),
425
0
   };
426
427
0
   SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]);
428
0
   SIMD_4x32 output = shuffle(t, mc_forward0);
429
430
0
   t = aes_schedule_transform(t, dsk[2], dsk[3]);
431
0
   output = shuffle(t ^ output, mc_forward0);
432
433
0
   t = aes_schedule_transform(t, dsk[4], dsk[5]);
434
0
   output = shuffle(t ^ output, mc_forward0);
435
436
0
   t = aes_schedule_transform(t, dsk[6], dsk[7]);
437
0
   output = shuffle(t ^ output, mc_forward0);
438
439
0
   return shuffle(output, vperm_sr[round_no % 4]);
440
0
}
441
442
0
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) {
443
0
   const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
444
0
   const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
445
446
0
   k = shuffle(k, vperm_sr[round_no % 4]);
447
0
   k ^= SIMD_4x32::splat_u8(0x5B);
448
0
   return aes_schedule_transform(k, out_tr1, out_tr2);
449
0
}
450
451
0
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_last_dec(SIMD_4x32 k) {
452
0
   const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
453
0
   const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
454
455
0
   k ^= SIMD_4x32::splat_u8(0x5B);
456
0
   return aes_schedule_transform(k, deskew1, deskew2);
457
0
}
458
459
0
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2) {
460
0
   SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
461
0
   smeared ^= smeared.shift_elems_left<2>();
462
0
   smeared ^= SIMD_4x32::splat_u8(0x5B);
463
464
0
   const SIMD_4x32 Bh = high_nibs(input1);
465
0
   SIMD_4x32 Bl = low_nibs(input1);
466
467
0
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
468
469
0
   Bl ^= Bh;
470
471
0
   SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
472
0
   SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
473
474
0
   return smeared ^ masked_shuffle(sb1u, t5) ^ masked_shuffle(sb1t, t6);
475
0
}
476
477
0
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2) {
478
   // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3)));
479
0
   const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D);
480
0
   return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc);
481
0
}
482
483
0
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y) {
484
0
   const SIMD_4x32 shuffle3332 = SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
485
0
   const SIMD_4x32 shuffle2000 = SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);
486
487
0
   const SIMD_4x32 zero_top_half(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
488
0
   y &= zero_top_half;
489
0
   return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
490
0
}
491
492
}  // namespace
493
494
0
void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
495
0
   m_EK.resize(11 * 4);
496
0
   m_DK.resize(11 * 4);
497
498
0
   SIMD_4x32 key = SIMD_4x32::load_le(keyb);
499
500
0
   shuffle(key, vperm_sr[2]).store_le(&m_DK[4 * 10]);
501
502
0
   key = aes_schedule_transform(key, k_ipt1, k_ipt2);
503
0
   key.store_le(&m_EK[0]);
504
505
0
   for(size_t i = 1; i != 10; ++i) {
506
0
      key = aes_schedule_round(rcon[i - 1], key, key);
507
508
0
      aes_schedule_mangle(key, (12 - i) % 4).store_le(&m_EK[4 * i]);
509
510
0
      aes_schedule_mangle_dec(key, (10 - i) % 4).store_le(&m_DK[4 * (10 - i)]);
511
0
   }
512
513
0
   key = aes_schedule_round(rcon[9], key, key);
514
0
   aes_schedule_mangle_last(key, 2).store_le(&m_EK[4 * 10]);
515
0
   aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
516
0
}
517
518
0
void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
519
0
   m_EK.resize(13 * 4);
520
0
   m_DK.resize(13 * 4);
521
522
0
   SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
523
0
   SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8);
524
525
0
   shuffle(key1, vperm_sr[0]).store_le(&m_DK[12 * 4]);
526
527
0
   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
528
0
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
529
530
0
   key1.store_le(&m_EK[0]);
531
532
0
   for(size_t i = 0; i != 4; ++i) {
533
      // key2 with 8 high bytes masked off
534
0
      SIMD_4x32 t = key2;
535
0
      key2 = aes_schedule_round(rcon[2 * i], key2, key1);
536
0
      const auto key2t = SIMD_4x32::alignr8(key2, t);
537
538
0
      aes_schedule_mangle(key2t, (i + 3) % 4).store_le(&m_EK[4 * (3 * i + 1)]);
539
0
      aes_schedule_mangle_dec(key2t, (i + 3) % 4).store_le(&m_DK[4 * (11 - 3 * i)]);
540
541
0
      t = aes_schedule_192_smear(key2, t);
542
543
0
      aes_schedule_mangle(t, (i + 2) % 4).store_le(&m_EK[4 * (3 * i + 2)]);
544
0
      aes_schedule_mangle_dec(t, (i + 2) % 4).store_le(&m_DK[4 * (10 - 3 * i)]);
545
546
0
      key2 = aes_schedule_round(rcon[2 * i + 1], t, key2);
547
548
0
      if(i == 3) {
549
0
         aes_schedule_mangle_last(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
550
0
         aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4 * (9 - 3 * i)]);
551
0
      } else {
552
0
         aes_schedule_mangle(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
553
0
         aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (9 - 3 * i)]);
554
0
      }
555
556
0
      key1 = key2;
557
0
      key2 = aes_schedule_192_smear(key2, t);
558
0
   }
559
0
}
560
561
0
void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
562
0
   m_EK.resize(15 * 4);
563
0
   m_DK.resize(15 * 4);
564
565
0
   SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
566
0
   SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16);
567
568
0
   shuffle(key1, vperm_sr[2]).store_le(&m_DK[4 * 14]);
569
570
0
   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
571
0
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
572
573
0
   key1.store_le(&m_EK[0]);
574
0
   aes_schedule_mangle(key2, 3).store_le(&m_EK[4]);
575
576
0
   aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4 * 13]);
577
578
0
   const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);
579
580
0
   for(size_t i = 2; i != 14; i += 2) {
581
0
      const SIMD_4x32 k_t = key2;
582
0
      key1 = key2 = aes_schedule_round(rcon[(i / 2) - 1], key2, key1);
583
584
0
      aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4 * i]);
585
0
      aes_schedule_mangle_dec(key2, (i + 2) % 4).store_le(&m_DK[4 * (14 - i)]);
586
587
0
      key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t);
588
589
0
      aes_schedule_mangle(key2, (i - 1) % 4).store_le(&m_EK[4 * (i + 1)]);
590
0
      aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (13 - i)]);
591
0
   }
592
593
0
   key2 = aes_schedule_round(rcon[6], key2, key1);
594
595
0
   aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4 * 14]);
596
0
   aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
597
0
}
598
599
}  // namespace Botan