Coverage Report

Created: 2022-01-14 08:07

/src/botan/src/lib/block/aes/aes_ni/aes_ni.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* AES using AES-NI instructions
3
* (C) 2009,2012 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/internal/aes.h>
9
#include <botan/internal/loadstor.h>
10
#include <wmmintrin.h>
11
12
namespace Botan {
13
14
namespace {
15
16
/*
* One AES-128 style key expansion step.
*
* key is the previous 128-bit round key. key_with_rcon is the value the
* caller obtained from _mm_aeskeygenassist_si128; only its top 32-bit lane
* is used, broadcast to all four lanes. Returns the next round key.
*/
BOTAN_FUNC_ISA("ssse3")
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   const __m128i assist = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));

   // Three rounds of "shift left by one 32-bit lane and XOR in". After all
   // three, each lane holds the XOR of itself and every lower lane of key.
   __m128i folded = key;
   for(int step = 0; step != 3; ++step)
      folded = _mm_xor_si128(folded, _mm_slli_si128(folded, 4));

   return _mm_xor_si128(folded, assist);
   }
25
26
/*
* One AES-192 key expansion step.
*
* K1 and K2 carry the running key state between calls and are updated in
* place (K2 only when last is false). key2_with_rcon is the caller's
* _mm_aeskeygenassist_si128 result; its 32-bit lane 1 is broadcast and used.
*
* Always writes four words to out[0..3]. Unless last is true, it also
* writes out[4] and out[5].
*/
BOTAN_FUNC_ISA("ssse3")
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
   {
   // Read both halves up front, before anything is written back
   const __m128i prev_lo = *K1;
   const __m128i prev_hi = *K2;

   const __m128i assist = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));

   // Same three shift-and-XOR steps as the AES-128 expansion
   __m128i lo = prev_lo;
   for(int step = 0; step != 3; ++step)
      lo = _mm_xor_si128(lo, _mm_slli_si128(lo, 4));
   lo = _mm_xor_si128(lo, assist);

   *K1 = lo;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), lo);

   if(!last)
      {
      // The upper part gets one shift-and-XOR step, then the top lane of
      // the freshly computed lower part is XORed into every lane
      __m128i hi = _mm_xor_si128(prev_hi, _mm_slli_si128(prev_hi, 4));
      hi = _mm_xor_si128(hi, _mm_shuffle_epi32(lo, _MM_SHUFFLE(3,3,3,3)));

      *K2 = hi;
      out[4] = _mm_cvtsi128_si32(hi);
      out[5] = _mm_cvtsi128_si32(_mm_srli_si128(hi, 4));
      }
   }
52
53
/*
54
* The second half of the AES-256 key expansion (other half same as AES-128)
55
*/
56
BOTAN_FUNC_ISA("ssse3,aes")
57
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
58
3.46k
   {
59
3.46k
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
60
3.46k
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
61
62
3.46k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
63
3.46k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64
3.46k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65
3.46k
   return _mm_xor_si128(key, key_with_rcon);
66
3.46k
   }
67
68
}
69
70
/*
* Apply _mm_aesenc_si128 with round key K to each of the four blocks.
* Expects variables named B0, B1, B2 and B3 to be in scope where the macro
* is used and updates them in place. K is expanded four times, so it
* should be a plain value without side effects.
*/
#define AES_ENC_4_ROUNDS(K)                     \
71
45.3k
   do                                           \
72
45.3k
      {                                         \
73
45.3k
      B0 = _mm_aesenc_si128(B0, K);             \
74
45.3k
      B1 = _mm_aesenc_si128(B1, K);             \
75
45.3k
      B2 = _mm_aesenc_si128(B2, K);             \
76
45.3k
      B3 = _mm_aesenc_si128(B3, K);             \
77
45.3k
      } while(0)
78
79
/*
* Apply _mm_aesenclast_si128 (the final encryption round) with round key K
* to each of the four blocks. Expects variables named B0, B1, B2 and B3 to
* be in scope where the macro is used and updates them in place.
*/
#define AES_ENC_4_LAST_ROUNDS(K)                \
80
3.63k
   do                                           \
81
3.63k
      {                                         \
82
3.63k
      B0 = _mm_aesenclast_si128(B0, K);         \
83
3.63k
      B1 = _mm_aesenclast_si128(B1, K);         \
84
3.63k
      B2 = _mm_aesenclast_si128(B2, K);         \
85
3.63k
      B3 = _mm_aesenclast_si128(B3, K);         \
86
3.63k
      } while(0)
87
88
/*
* Apply _mm_aesdec_si128 with round key K to each of the four blocks.
* Expects variables named B0, B1, B2 and B3 to be in scope where the macro
* is used and updates them in place. K is expanded four times, so it
* should be a plain value without side effects.
*/
#define AES_DEC_4_ROUNDS(K)                     \
89
15.3k
   do                                           \
90
15.3k
      {                                         \
91
15.3k
      B0 = _mm_aesdec_si128(B0, K);             \
92
15.3k
      B1 = _mm_aesdec_si128(B1, K);             \
93
15.3k
      B2 = _mm_aesdec_si128(B2, K);             \
94
15.3k
      B3 = _mm_aesdec_si128(B3, K);             \
95
15.3k
      } while(0)
96
97
/*
* Apply _mm_aesdeclast_si128 (the final decryption round) with round key K
* to each of the four blocks. Expects variables named B0, B1, B2 and B3 to
* be in scope where the macro is used and updates them in place.
*/
#define AES_DEC_4_LAST_ROUNDS(K)                \
98
1.26k
   do                                           \
99
1.26k
      {                                         \
100
1.26k
      B0 = _mm_aesdeclast_si128(B0, K);         \
101
1.26k
      B1 = _mm_aesdeclast_si128(B1, K);         \
102
1.26k
      B2 = _mm_aesdeclast_si128(B2, K);         \
103
1.26k
      B3 = _mm_aesdeclast_si128(B3, K);         \
104
1.26k
      } while(0)
105
106
/*
107
* AES-128 Encryption
108
*/
109
BOTAN_FUNC_ISA("ssse3,aes")
110
void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
111
2.49k
   {
112
2.49k
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
113
2.49k
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
114
115
2.49k
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
116
117
2.49k
   const __m128i K0  = _mm_loadu_si128(key_mm);
118
2.49k
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
119
2.49k
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
120
2.49k
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
121
2.49k
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
122
2.49k
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
123
2.49k
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
124
2.49k
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
125
2.49k
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
126
2.49k
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
127
2.49k
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
128
129
2.95k
   while(blocks >= 4)
130
464
      {
131
464
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
132
464
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
133
464
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
134
464
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
135
136
464
      B0 = _mm_xor_si128(B0, K0);
137
464
      B1 = _mm_xor_si128(B1, K0);
138
464
      B2 = _mm_xor_si128(B2, K0);
139
464
      B3 = _mm_xor_si128(B3, K0);
140
141
464
      AES_ENC_4_ROUNDS(K1);
142
464
      AES_ENC_4_ROUNDS(K2);
143
464
      AES_ENC_4_ROUNDS(K3);
144
464
      AES_ENC_4_ROUNDS(K4);
145
464
      AES_ENC_4_ROUNDS(K5);
146
464
      AES_ENC_4_ROUNDS(K6);
147
464
      AES_ENC_4_ROUNDS(K7);
148
464
      AES_ENC_4_ROUNDS(K8);
149
464
      AES_ENC_4_ROUNDS(K9);
150
464
      AES_ENC_4_LAST_ROUNDS(K10);
151
152
464
      _mm_storeu_si128(out_mm + 0, B0);
153
464
      _mm_storeu_si128(out_mm + 1, B1);
154
464
      _mm_storeu_si128(out_mm + 2, B2);
155
464
      _mm_storeu_si128(out_mm + 3, B3);
156
157
464
      blocks -= 4;
158
464
      in_mm += 4;
159
464
      out_mm += 4;
160
464
      }
161
162
4.87k
   for(size_t i = 0; i != blocks; ++i)
163
2.37k
      {
164
2.37k
      __m128i B = _mm_loadu_si128(in_mm + i);
165
166
2.37k
      B = _mm_xor_si128(B, K0);
167
168
2.37k
      B = _mm_aesenc_si128(B, K1);
169
2.37k
      B = _mm_aesenc_si128(B, K2);
170
2.37k
      B = _mm_aesenc_si128(B, K3);
171
2.37k
      B = _mm_aesenc_si128(B, K4);
172
2.37k
      B = _mm_aesenc_si128(B, K5);
173
2.37k
      B = _mm_aesenc_si128(B, K6);
174
2.37k
      B = _mm_aesenc_si128(B, K7);
175
2.37k
      B = _mm_aesenc_si128(B, K8);
176
2.37k
      B = _mm_aesenc_si128(B, K9);
177
2.37k
      B = _mm_aesenclast_si128(B, K10);
178
179
2.37k
      _mm_storeu_si128(out_mm + i, B);
180
2.37k
      }
181
2.49k
   }
182
183
/*
184
* AES-128 Decryption
185
*/
186
BOTAN_FUNC_ISA("ssse3,aes")
187
void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
188
80
   {
189
80
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
190
80
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
191
192
80
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
193
194
80
   const __m128i K0  = _mm_loadu_si128(key_mm);
195
80
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
196
80
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
197
80
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
198
80
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
199
80
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
200
80
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
201
80
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
202
80
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
203
80
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
204
80
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
205
206
363
   while(blocks >= 4)
207
283
      {
208
283
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
209
283
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
210
283
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
211
283
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
212
213
283
      B0 = _mm_xor_si128(B0, K0);
214
283
      B1 = _mm_xor_si128(B1, K0);
215
283
      B2 = _mm_xor_si128(B2, K0);
216
283
      B3 = _mm_xor_si128(B3, K0);
217
218
283
      AES_DEC_4_ROUNDS(K1);
219
283
      AES_DEC_4_ROUNDS(K2);
220
283
      AES_DEC_4_ROUNDS(K3);
221
283
      AES_DEC_4_ROUNDS(K4);
222
283
      AES_DEC_4_ROUNDS(K5);
223
283
      AES_DEC_4_ROUNDS(K6);
224
283
      AES_DEC_4_ROUNDS(K7);
225
283
      AES_DEC_4_ROUNDS(K8);
226
283
      AES_DEC_4_ROUNDS(K9);
227
283
      AES_DEC_4_LAST_ROUNDS(K10);
228
229
283
      _mm_storeu_si128(out_mm + 0, B0);
230
283
      _mm_storeu_si128(out_mm + 1, B1);
231
283
      _mm_storeu_si128(out_mm + 2, B2);
232
283
      _mm_storeu_si128(out_mm + 3, B3);
233
234
283
      blocks -= 4;
235
283
      in_mm += 4;
236
283
      out_mm += 4;
237
283
      }
238
239
114
   for(size_t i = 0; i != blocks; ++i)
240
34
      {
241
34
      __m128i B = _mm_loadu_si128(in_mm + i);
242
243
34
      B = _mm_xor_si128(B, K0);
244
245
34
      B = _mm_aesdec_si128(B, K1);
246
34
      B = _mm_aesdec_si128(B, K2);
247
34
      B = _mm_aesdec_si128(B, K3);
248
34
      B = _mm_aesdec_si128(B, K4);
249
34
      B = _mm_aesdec_si128(B, K5);
250
34
      B = _mm_aesdec_si128(B, K6);
251
34
      B = _mm_aesdec_si128(B, K7);
252
34
      B = _mm_aesdec_si128(B, K8);
253
34
      B = _mm_aesdec_si128(B, K9);
254
34
      B = _mm_aesdeclast_si128(B, K10);
255
256
34
      _mm_storeu_si128(out_mm + i, B);
257
34
      }
258
80
   }
259
260
/*
261
* AES-128 Key Schedule
262
*/
263
BOTAN_FUNC_ISA("ssse3,aes")
264
void AES_128::aesni_key_schedule(const uint8_t key[], size_t)
265
219
   {
266
219
   m_EK.resize(44);
267
219
   m_DK.resize(44);
268
269
219
   #define AES_128_key_exp(K, RCON) \
270
2.19k
      aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
271
272
219
   const __m128i K0  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
273
219
   const __m128i K1  = AES_128_key_exp(K0, 0x01);
274
219
   const __m128i K2  = AES_128_key_exp(K1, 0x02);
275
219
   const __m128i K3  = AES_128_key_exp(K2, 0x04);
276
219
   const __m128i K4  = AES_128_key_exp(K3, 0x08);
277
219
   const __m128i K5  = AES_128_key_exp(K4, 0x10);
278
219
   const __m128i K6  = AES_128_key_exp(K5, 0x20);
279
219
   const __m128i K7  = AES_128_key_exp(K6, 0x40);
280
219
   const __m128i K8  = AES_128_key_exp(K7, 0x80);
281
219
   const __m128i K9  = AES_128_key_exp(K8, 0x1B);
282
219
   const __m128i K10 = AES_128_key_exp(K9, 0x36);
283
284
219
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
285
219
   _mm_storeu_si128(EK_mm     , K0);
286
219
   _mm_storeu_si128(EK_mm +  1, K1);
287
219
   _mm_storeu_si128(EK_mm +  2, K2);
288
219
   _mm_storeu_si128(EK_mm +  3, K3);
289
219
   _mm_storeu_si128(EK_mm +  4, K4);
290
219
   _mm_storeu_si128(EK_mm +  5, K5);
291
219
   _mm_storeu_si128(EK_mm +  6, K6);
292
219
   _mm_storeu_si128(EK_mm +  7, K7);
293
219
   _mm_storeu_si128(EK_mm +  8, K8);
294
219
   _mm_storeu_si128(EK_mm +  9, K9);
295
219
   _mm_storeu_si128(EK_mm + 10, K10);
296
297
   // Now generate decryption keys
298
299
219
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
300
219
   _mm_storeu_si128(DK_mm     , K10);
301
219
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K9));
302
219
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K8));
303
219
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K7));
304
219
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K6));
305
219
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K5));
306
219
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K4));
307
219
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K3));
308
219
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K2));
309
219
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K1));
310
219
   _mm_storeu_si128(DK_mm + 10, K0);
311
219
   }
312
313
/*
314
* AES-192 Encryption
315
*/
316
BOTAN_FUNC_ISA("ssse3,aes")
317
void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
318
0
   {
319
0
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
320
0
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
321
322
0
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
323
324
0
   const __m128i K0  = _mm_loadu_si128(key_mm);
325
0
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
326
0
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
327
0
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
328
0
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
329
0
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
330
0
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
331
0
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
332
0
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
333
0
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
334
0
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
335
0
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
336
0
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
337
338
0
   while(blocks >= 4)
339
0
      {
340
0
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
341
0
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
342
0
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
343
0
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
344
345
0
      B0 = _mm_xor_si128(B0, K0);
346
0
      B1 = _mm_xor_si128(B1, K0);
347
0
      B2 = _mm_xor_si128(B2, K0);
348
0
      B3 = _mm_xor_si128(B3, K0);
349
350
0
      AES_ENC_4_ROUNDS(K1);
351
0
      AES_ENC_4_ROUNDS(K2);
352
0
      AES_ENC_4_ROUNDS(K3);
353
0
      AES_ENC_4_ROUNDS(K4);
354
0
      AES_ENC_4_ROUNDS(K5);
355
0
      AES_ENC_4_ROUNDS(K6);
356
0
      AES_ENC_4_ROUNDS(K7);
357
0
      AES_ENC_4_ROUNDS(K8);
358
0
      AES_ENC_4_ROUNDS(K9);
359
0
      AES_ENC_4_ROUNDS(K10);
360
0
      AES_ENC_4_ROUNDS(K11);
361
0
      AES_ENC_4_LAST_ROUNDS(K12);
362
363
0
      _mm_storeu_si128(out_mm + 0, B0);
364
0
      _mm_storeu_si128(out_mm + 1, B1);
365
0
      _mm_storeu_si128(out_mm + 2, B2);
366
0
      _mm_storeu_si128(out_mm + 3, B3);
367
368
0
      blocks -= 4;
369
0
      in_mm += 4;
370
0
      out_mm += 4;
371
0
      }
372
373
0
   for(size_t i = 0; i != blocks; ++i)
374
0
      {
375
0
      __m128i B = _mm_loadu_si128(in_mm + i);
376
377
0
      B = _mm_xor_si128(B, K0);
378
379
0
      B = _mm_aesenc_si128(B, K1);
380
0
      B = _mm_aesenc_si128(B, K2);
381
0
      B = _mm_aesenc_si128(B, K3);
382
0
      B = _mm_aesenc_si128(B, K4);
383
0
      B = _mm_aesenc_si128(B, K5);
384
0
      B = _mm_aesenc_si128(B, K6);
385
0
      B = _mm_aesenc_si128(B, K7);
386
0
      B = _mm_aesenc_si128(B, K8);
387
0
      B = _mm_aesenc_si128(B, K9);
388
0
      B = _mm_aesenc_si128(B, K10);
389
0
      B = _mm_aesenc_si128(B, K11);
390
0
      B = _mm_aesenclast_si128(B, K12);
391
392
0
      _mm_storeu_si128(out_mm + i, B);
393
0
      }
394
0
   }
395
396
/*
397
* AES-192 Decryption
398
*/
399
BOTAN_FUNC_ISA("ssse3,aes")
400
void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
401
0
   {
402
0
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
403
0
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
404
405
0
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
406
407
0
   const __m128i K0  = _mm_loadu_si128(key_mm);
408
0
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
409
0
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
410
0
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
411
0
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
412
0
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
413
0
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
414
0
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
415
0
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
416
0
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
417
0
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
418
0
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
419
0
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
420
421
0
   while(blocks >= 4)
422
0
      {
423
0
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
424
0
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
425
0
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
426
0
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
427
428
0
      B0 = _mm_xor_si128(B0, K0);
429
0
      B1 = _mm_xor_si128(B1, K0);
430
0
      B2 = _mm_xor_si128(B2, K0);
431
0
      B3 = _mm_xor_si128(B3, K0);
432
433
0
      AES_DEC_4_ROUNDS(K1);
434
0
      AES_DEC_4_ROUNDS(K2);
435
0
      AES_DEC_4_ROUNDS(K3);
436
0
      AES_DEC_4_ROUNDS(K4);
437
0
      AES_DEC_4_ROUNDS(K5);
438
0
      AES_DEC_4_ROUNDS(K6);
439
0
      AES_DEC_4_ROUNDS(K7);
440
0
      AES_DEC_4_ROUNDS(K8);
441
0
      AES_DEC_4_ROUNDS(K9);
442
0
      AES_DEC_4_ROUNDS(K10);
443
0
      AES_DEC_4_ROUNDS(K11);
444
0
      AES_DEC_4_LAST_ROUNDS(K12);
445
446
0
      _mm_storeu_si128(out_mm + 0, B0);
447
0
      _mm_storeu_si128(out_mm + 1, B1);
448
0
      _mm_storeu_si128(out_mm + 2, B2);
449
0
      _mm_storeu_si128(out_mm + 3, B3);
450
451
0
      blocks -= 4;
452
0
      in_mm += 4;
453
0
      out_mm += 4;
454
0
      }
455
456
0
   for(size_t i = 0; i != blocks; ++i)
457
0
      {
458
0
      __m128i B = _mm_loadu_si128(in_mm + i);
459
460
0
      B = _mm_xor_si128(B, K0);
461
462
0
      B = _mm_aesdec_si128(B, K1);
463
0
      B = _mm_aesdec_si128(B, K2);
464
0
      B = _mm_aesdec_si128(B, K3);
465
0
      B = _mm_aesdec_si128(B, K4);
466
0
      B = _mm_aesdec_si128(B, K5);
467
0
      B = _mm_aesdec_si128(B, K6);
468
0
      B = _mm_aesdec_si128(B, K7);
469
0
      B = _mm_aesdec_si128(B, K8);
470
0
      B = _mm_aesdec_si128(B, K9);
471
0
      B = _mm_aesdec_si128(B, K10);
472
0
      B = _mm_aesdec_si128(B, K11);
473
0
      B = _mm_aesdeclast_si128(B, K12);
474
475
0
      _mm_storeu_si128(out_mm + i, B);
476
0
      }
477
0
   }
478
479
/*
480
* AES-192 Key Schedule
481
*/
482
BOTAN_FUNC_ISA("ssse3,aes")
483
void AES_192::aesni_key_schedule(const uint8_t key[], size_t)
484
0
   {
485
0
   m_EK.resize(52);
486
0
   m_DK.resize(52);
487
488
0
   __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
489
0
   __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
490
0
   K1 = _mm_srli_si128(K1, 8);
491
492
0
   load_le(m_EK.data(), key, 6);
493
494
0
   #define AES_192_key_exp(RCON, EK_OFF)                         \
495
0
     aes_192_key_expansion(&K0, &K1,                             \
496
0
                           _mm_aeskeygenassist_si128(K1, RCON),  \
497
0
                           &m_EK[EK_OFF], EK_OFF == 48)
498
499
0
   AES_192_key_exp(0x01, 6);
500
0
   AES_192_key_exp(0x02, 12);
501
0
   AES_192_key_exp(0x04, 18);
502
0
   AES_192_key_exp(0x08, 24);
503
0
   AES_192_key_exp(0x10, 30);
504
0
   AES_192_key_exp(0x20, 36);
505
0
   AES_192_key_exp(0x40, 42);
506
0
   AES_192_key_exp(0x80, 48);
507
508
0
   #undef AES_192_key_exp
509
510
   // Now generate decryption keys
511
0
   const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
512
513
0
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
514
0
   _mm_storeu_si128(DK_mm     , _mm_loadu_si128(EK_mm + 12));
515
0
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
516
0
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
517
0
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
518
0
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
519
0
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
520
0
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
521
0
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
522
0
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
523
0
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
524
0
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
525
0
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
526
0
   _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
527
0
   }
528
529
/*
530
* AES-256 Encryption
531
*/
532
BOTAN_FUNC_ISA("ssse3,aes")
533
void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
534
2.62k
   {
535
2.62k
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
536
2.62k
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
537
538
2.62k
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
539
540
2.62k
   const __m128i K0  = _mm_loadu_si128(key_mm);
541
2.62k
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
542
2.62k
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
543
2.62k
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
544
2.62k
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
545
2.62k
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
546
2.62k
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
547
2.62k
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
548
2.62k
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
549
2.62k
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
550
2.62k
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
551
2.62k
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
552
2.62k
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
553
2.62k
   const __m128i K13 = _mm_loadu_si128(key_mm + 13);
554
2.62k
   const __m128i K14 = _mm_loadu_si128(key_mm + 14);
555
556
5.78k
   while(blocks >= 4)
557
3.16k
      {
558
3.16k
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
559
3.16k
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
560
3.16k
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
561
3.16k
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
562
563
3.16k
      B0 = _mm_xor_si128(B0, K0);
564
3.16k
      B1 = _mm_xor_si128(B1, K0);
565
3.16k
      B2 = _mm_xor_si128(B2, K0);
566
3.16k
      B3 = _mm_xor_si128(B3, K0);
567
568
3.16k
      AES_ENC_4_ROUNDS(K1);
569
3.16k
      AES_ENC_4_ROUNDS(K2);
570
3.16k
      AES_ENC_4_ROUNDS(K3);
571
3.16k
      AES_ENC_4_ROUNDS(K4);
572
3.16k
      AES_ENC_4_ROUNDS(K5);
573
3.16k
      AES_ENC_4_ROUNDS(K6);
574
3.16k
      AES_ENC_4_ROUNDS(K7);
575
3.16k
      AES_ENC_4_ROUNDS(K8);
576
3.16k
      AES_ENC_4_ROUNDS(K9);
577
3.16k
      AES_ENC_4_ROUNDS(K10);
578
3.16k
      AES_ENC_4_ROUNDS(K11);
579
3.16k
      AES_ENC_4_ROUNDS(K12);
580
3.16k
      AES_ENC_4_ROUNDS(K13);
581
3.16k
      AES_ENC_4_LAST_ROUNDS(K14);
582
583
3.16k
      _mm_storeu_si128(out_mm + 0, B0);
584
3.16k
      _mm_storeu_si128(out_mm + 1, B1);
585
3.16k
      _mm_storeu_si128(out_mm + 2, B2);
586
3.16k
      _mm_storeu_si128(out_mm + 3, B3);
587
588
3.16k
      blocks -= 4;
589
3.16k
      in_mm += 4;
590
3.16k
      out_mm += 4;
591
3.16k
      }
592
593
4.45k
   for(size_t i = 0; i != blocks; ++i)
594
1.82k
      {
595
1.82k
      __m128i B = _mm_loadu_si128(in_mm + i);
596
597
1.82k
      B = _mm_xor_si128(B, K0);
598
599
1.82k
      B = _mm_aesenc_si128(B, K1);
600
1.82k
      B = _mm_aesenc_si128(B, K2);
601
1.82k
      B = _mm_aesenc_si128(B, K3);
602
1.82k
      B = _mm_aesenc_si128(B, K4);
603
1.82k
      B = _mm_aesenc_si128(B, K5);
604
1.82k
      B = _mm_aesenc_si128(B, K6);
605
1.82k
      B = _mm_aesenc_si128(B, K7);
606
1.82k
      B = _mm_aesenc_si128(B, K8);
607
1.82k
      B = _mm_aesenc_si128(B, K9);
608
1.82k
      B = _mm_aesenc_si128(B, K10);
609
1.82k
      B = _mm_aesenc_si128(B, K11);
610
1.82k
      B = _mm_aesenc_si128(B, K12);
611
1.82k
      B = _mm_aesenc_si128(B, K13);
612
1.82k
      B = _mm_aesenclast_si128(B, K14);
613
614
1.82k
      _mm_storeu_si128(out_mm + i, B);
615
1.82k
      }
616
2.62k
   }
617
618
/*
619
* AES-256 Decryption
620
*/
621
BOTAN_FUNC_ISA("ssse3,aes")
622
void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
623
284
   {
624
284
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
625
284
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
626
627
284
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
628
629
284
   const __m128i K0  = _mm_loadu_si128(key_mm);
630
284
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
631
284
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
632
284
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
633
284
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
634
284
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
635
284
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
636
284
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
637
284
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
638
284
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
639
284
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
640
284
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
641
284
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
642
284
   const __m128i K13 = _mm_loadu_si128(key_mm + 13);
643
284
   const __m128i K14 = _mm_loadu_si128(key_mm + 14);
644
645
1.26k
   while(blocks >= 4)
646
981
      {
647
981
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
648
981
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
649
981
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
650
981
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
651
652
981
      B0 = _mm_xor_si128(B0, K0);
653
981
      B1 = _mm_xor_si128(B1, K0);
654
981
      B2 = _mm_xor_si128(B2, K0);
655
981
      B3 = _mm_xor_si128(B3, K0);
656
657
981
      AES_DEC_4_ROUNDS(K1);
658
981
      AES_DEC_4_ROUNDS(K2);
659
981
      AES_DEC_4_ROUNDS(K3);
660
981
      AES_DEC_4_ROUNDS(K4);
661
981
      AES_DEC_4_ROUNDS(K5);
662
981
      AES_DEC_4_ROUNDS(K6);
663
981
      AES_DEC_4_ROUNDS(K7);
664
981
      AES_DEC_4_ROUNDS(K8);
665
981
      AES_DEC_4_ROUNDS(K9);
666
981
      AES_DEC_4_ROUNDS(K10);
667
981
      AES_DEC_4_ROUNDS(K11);
668
981
      AES_DEC_4_ROUNDS(K12);
669
981
      AES_DEC_4_ROUNDS(K13);
670
981
      AES_DEC_4_LAST_ROUNDS(K14);
671
672
981
      _mm_storeu_si128(out_mm + 0, B0);
673
981
      _mm_storeu_si128(out_mm + 1, B1);
674
981
      _mm_storeu_si128(out_mm + 2, B2);
675
981
      _mm_storeu_si128(out_mm + 3, B3);
676
677
981
      blocks -= 4;
678
981
      in_mm += 4;
679
981
      out_mm += 4;
680
981
      }
681
682
406
   for(size_t i = 0; i != blocks; ++i)
683
122
      {
684
122
      __m128i B = _mm_loadu_si128(in_mm + i);
685
686
122
      B = _mm_xor_si128(B, K0);
687
688
122
      B = _mm_aesdec_si128(B, K1);
689
122
      B = _mm_aesdec_si128(B, K2);
690
122
      B = _mm_aesdec_si128(B, K3);
691
122
      B = _mm_aesdec_si128(B, K4);
692
122
      B = _mm_aesdec_si128(B, K5);
693
122
      B = _mm_aesdec_si128(B, K6);
694
122
      B = _mm_aesdec_si128(B, K7);
695
122
      B = _mm_aesdec_si128(B, K8);
696
122
      B = _mm_aesdec_si128(B, K9);
697
122
      B = _mm_aesdec_si128(B, K10);
698
122
      B = _mm_aesdec_si128(B, K11);
699
122
      B = _mm_aesdec_si128(B, K12);
700
122
      B = _mm_aesdec_si128(B, K13);
701
122
      B = _mm_aesdeclast_si128(B, K14);
702
703
122
      _mm_storeu_si128(out_mm + i, B);
704
122
      }
705
284
   }
706
707
/*
708
* AES-256 Key Schedule
709
*/
710
BOTAN_FUNC_ISA("ssse3,aes")
711
void AES_256::aesni_key_schedule(const uint8_t key[], size_t)
712
578
   {
713
578
   m_EK.resize(60);
714
578
   m_DK.resize(60);
715
716
578
   const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
717
578
   const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
718
719
578
   const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
720
578
   const __m128i K3 = aes_256_key_expansion(K1, K2);
721
722
578
   const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
723
578
   const __m128i K5 = aes_256_key_expansion(K3, K4);
724
725
578
   const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
726
578
   const __m128i K7 = aes_256_key_expansion(K5, K6);
727
728
578
   const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
729
578
   const __m128i K9 = aes_256_key_expansion(K7, K8);
730
731
578
   const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
732
578
   const __m128i K11 = aes_256_key_expansion(K9, K10);
733
734
578
   const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
735
578
   const __m128i K13 = aes_256_key_expansion(K11, K12);
736
737
578
   const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
738
739
578
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
740
578
   _mm_storeu_si128(EK_mm     , K0);
741
578
   _mm_storeu_si128(EK_mm +  1, K1);
742
578
   _mm_storeu_si128(EK_mm +  2, K2);
743
578
   _mm_storeu_si128(EK_mm +  3, K3);
744
578
   _mm_storeu_si128(EK_mm +  4, K4);
745
578
   _mm_storeu_si128(EK_mm +  5, K5);
746
578
   _mm_storeu_si128(EK_mm +  6, K6);
747
578
   _mm_storeu_si128(EK_mm +  7, K7);
748
578
   _mm_storeu_si128(EK_mm +  8, K8);
749
578
   _mm_storeu_si128(EK_mm +  9, K9);
750
578
   _mm_storeu_si128(EK_mm + 10, K10);
751
578
   _mm_storeu_si128(EK_mm + 11, K11);
752
578
   _mm_storeu_si128(EK_mm + 12, K12);
753
578
   _mm_storeu_si128(EK_mm + 13, K13);
754
578
   _mm_storeu_si128(EK_mm + 14, K14);
755
756
   // Now generate decryption keys
757
578
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
758
578
   _mm_storeu_si128(DK_mm     , K14);
759
578
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K13));
760
578
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K12));
761
578
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K11));
762
578
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K10));
763
578
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K9));
764
578
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K8));
765
578
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K7));
766
578
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K6));
767
578
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K5));
768
578
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
769
578
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
770
578
   _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
771
578
   _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
772
578
   _mm_storeu_si128(DK_mm + 14, K0);
773
578
   }
774
775
#undef AES_ENC_4_ROUNDS
776
#undef AES_ENC_4_LAST_ROUNDS
777
#undef AES_DEC_4_ROUNDS
778
#undef AES_DEC_4_LAST_ROUNDS
779
780
}