Coverage Report

Created: 2021-05-04 09:02

/src/botan/src/lib/block/aes/aes_ni/aes_ni.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* AES using AES-NI instructions
3
* (C) 2009,2012 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/internal/aes.h>
9
#include <botan/internal/loadstor.h>
10
#include <wmmintrin.h>
11
12
namespace Botan {
13
14
namespace {
15
16
/*
* Combine a 128-bit key word group (key) with a keygenassist result
* (key_with_rcon) to produce the next 128-bit group of the schedule.
* Only 32-bit word 3 of key_with_rcon contributes to the result.
*/
BOTAN_FUNC_ISA("ssse3")
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   // Replicate word 3 of the keygenassist result into all four lanes
   const __m128i assist = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));

   // Three shift-by-one-word-and-XOR steps; afterwards word i holds the
   // XOR of the original words 0..i
   for(int step = 0; step != 3; ++step)
      {
      key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
      }

   return _mm_xor_si128(key, assist);
   }
25
26
/*
* One step of the AES-192 key expansion.
*
* *K1 is always updated and its new value is written to out[0..3].
* Unless last is true, *K2 is updated as well and its two low words are
* written to out[4] and out[5]. Only 32-bit word 1 of key2_with_rcon is
* used.
*/
BOTAN_FUNC_ISA("ssse3")
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
   {
   __m128i lo = *K1;
   __m128i hi = *K2;

   // Replicate word 1 of the keygenassist result into all four lanes
   const __m128i assist = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));

   // After the three steps word i of lo is the XOR of its original words 0..i
   for(int step = 0; step != 3; ++step)
      {
      lo = _mm_xor_si128(lo, _mm_slli_si128(lo, 4));
      }
   lo = _mm_xor_si128(lo, assist);

   *K1 = lo;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), lo);

   if(!last)
      {
      // Extend the chain into the two remaining words, seeded from the
      // top word of the freshly computed lo
      hi = _mm_xor_si128(hi, _mm_slli_si128(hi, 4));
      hi = _mm_xor_si128(hi, _mm_shuffle_epi32(lo, _MM_SHUFFLE(3,3,3,3)));

      *K2 = hi;
      out[4] = _mm_cvtsi128_si32(hi);
      out[5] = _mm_cvtsi128_si32(_mm_srli_si128(hi, 4));
      }
   }
52
53
/*
54
* The second half of the AES-256 key expansion (other half same as AES-128)
55
*/
56
BOTAN_FUNC_ISA("ssse3,aes")
57
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
58
7.09k
   {
59
7.09k
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
60
7.09k
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
61
62
7.09k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
63
7.09k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64
7.09k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65
7.09k
   return _mm_xor_si128(key, key_with_rcon);
66
7.09k
   }
67
68
}
69
70
/*
* Apply _mm_aesenc_si128 with round key K to each of the four block
* variables B0..B3, which must be in scope where the macro is expanded.
*/
#define AES_ENC_4_ROUNDS(K)            \
   do {                                \
      B0 = _mm_aesenc_si128(B0, K);    \
      B1 = _mm_aesenc_si128(B1, K);    \
      B2 = _mm_aesenc_si128(B2, K);    \
      B3 = _mm_aesenc_si128(B3, K);    \
   } while(0)
78
79
/*
* Apply _mm_aesenclast_si128 with round key K to each of the four block
* variables B0..B3, which must be in scope where the macro is expanded.
*/
#define AES_ENC_4_LAST_ROUNDS(K)           \
   do {                                    \
      B0 = _mm_aesenclast_si128(B0, K);    \
      B1 = _mm_aesenclast_si128(B1, K);    \
      B2 = _mm_aesenclast_si128(B2, K);    \
      B3 = _mm_aesenclast_si128(B3, K);    \
   } while(0)
87
88
/*
* Apply _mm_aesdec_si128 with round key K to each of the four block
* variables B0..B3, which must be in scope where the macro is expanded.
*/
#define AES_DEC_4_ROUNDS(K)            \
   do {                                \
      B0 = _mm_aesdec_si128(B0, K);    \
      B1 = _mm_aesdec_si128(B1, K);    \
      B2 = _mm_aesdec_si128(B2, K);    \
      B3 = _mm_aesdec_si128(B3, K);    \
   } while(0)
96
97
/*
* Apply _mm_aesdeclast_si128 with round key K to each of the four block
* variables B0..B3, which must be in scope where the macro is expanded.
*/
#define AES_DEC_4_LAST_ROUNDS(K)           \
   do {                                    \
      B0 = _mm_aesdeclast_si128(B0, K);    \
      B1 = _mm_aesdeclast_si128(B1, K);    \
      B2 = _mm_aesdeclast_si128(B2, K);    \
      B3 = _mm_aesdeclast_si128(B3, K);    \
   } while(0)
105
106
/*
107
* AES-128 Encryption
108
*/
109
BOTAN_FUNC_ISA("ssse3,aes")
110
void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
111
701
   {
112
701
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
113
701
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
114
115
701
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
116
117
701
   const __m128i K0  = _mm_loadu_si128(key_mm);
118
701
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
119
701
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
120
701
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
121
701
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
122
701
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
123
701
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
124
701
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
125
701
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
126
701
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
127
701
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
128
129
805
   while(blocks >= 4)
130
104
      {
131
104
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
132
104
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
133
104
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
134
104
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
135
136
104
      B0 = _mm_xor_si128(B0, K0);
137
104
      B1 = _mm_xor_si128(B1, K0);
138
104
      B2 = _mm_xor_si128(B2, K0);
139
104
      B3 = _mm_xor_si128(B3, K0);
140
141
104
      AES_ENC_4_ROUNDS(K1);
142
104
      AES_ENC_4_ROUNDS(K2);
143
104
      AES_ENC_4_ROUNDS(K3);
144
104
      AES_ENC_4_ROUNDS(K4);
145
104
      AES_ENC_4_ROUNDS(K5);
146
104
      AES_ENC_4_ROUNDS(K6);
147
104
      AES_ENC_4_ROUNDS(K7);
148
104
      AES_ENC_4_ROUNDS(K8);
149
104
      AES_ENC_4_ROUNDS(K9);
150
104
      AES_ENC_4_LAST_ROUNDS(K10);
151
152
104
      _mm_storeu_si128(out_mm + 0, B0);
153
104
      _mm_storeu_si128(out_mm + 1, B1);
154
104
      _mm_storeu_si128(out_mm + 2, B2);
155
104
      _mm_storeu_si128(out_mm + 3, B3);
156
157
104
      blocks -= 4;
158
104
      in_mm += 4;
159
104
      out_mm += 4;
160
104
      }
161
162
1.37k
   for(size_t i = 0; i != blocks; ++i)
163
675
      {
164
675
      __m128i B = _mm_loadu_si128(in_mm + i);
165
166
675
      B = _mm_xor_si128(B, K0);
167
168
675
      B = _mm_aesenc_si128(B, K1);
169
675
      B = _mm_aesenc_si128(B, K2);
170
675
      B = _mm_aesenc_si128(B, K3);
171
675
      B = _mm_aesenc_si128(B, K4);
172
675
      B = _mm_aesenc_si128(B, K5);
173
675
      B = _mm_aesenc_si128(B, K6);
174
675
      B = _mm_aesenc_si128(B, K7);
175
675
      B = _mm_aesenc_si128(B, K8);
176
675
      B = _mm_aesenc_si128(B, K9);
177
675
      B = _mm_aesenclast_si128(B, K10);
178
179
675
      _mm_storeu_si128(out_mm + i, B);
180
675
      }
181
701
   }
182
183
/*
184
* AES-128 Decryption
185
*/
186
BOTAN_FUNC_ISA("ssse3,aes")
187
void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
188
99
   {
189
99
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
190
99
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
191
192
99
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
193
194
99
   const __m128i K0  = _mm_loadu_si128(key_mm);
195
99
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
196
99
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
197
99
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
198
99
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
199
99
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
200
99
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
201
99
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
202
99
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
203
99
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
204
99
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
205
206
453
   while(blocks >= 4)
207
354
      {
208
354
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
209
354
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
210
354
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
211
354
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
212
213
354
      B0 = _mm_xor_si128(B0, K0);
214
354
      B1 = _mm_xor_si128(B1, K0);
215
354
      B2 = _mm_xor_si128(B2, K0);
216
354
      B3 = _mm_xor_si128(B3, K0);
217
218
354
      AES_DEC_4_ROUNDS(K1);
219
354
      AES_DEC_4_ROUNDS(K2);
220
354
      AES_DEC_4_ROUNDS(K3);
221
354
      AES_DEC_4_ROUNDS(K4);
222
354
      AES_DEC_4_ROUNDS(K5);
223
354
      AES_DEC_4_ROUNDS(K6);
224
354
      AES_DEC_4_ROUNDS(K7);
225
354
      AES_DEC_4_ROUNDS(K8);
226
354
      AES_DEC_4_ROUNDS(K9);
227
354
      AES_DEC_4_LAST_ROUNDS(K10);
228
229
354
      _mm_storeu_si128(out_mm + 0, B0);
230
354
      _mm_storeu_si128(out_mm + 1, B1);
231
354
      _mm_storeu_si128(out_mm + 2, B2);
232
354
      _mm_storeu_si128(out_mm + 3, B3);
233
234
354
      blocks -= 4;
235
354
      in_mm += 4;
236
354
      out_mm += 4;
237
354
      }
238
239
140
   for(size_t i = 0; i != blocks; ++i)
240
41
      {
241
41
      __m128i B = _mm_loadu_si128(in_mm + i);
242
243
41
      B = _mm_xor_si128(B, K0);
244
245
41
      B = _mm_aesdec_si128(B, K1);
246
41
      B = _mm_aesdec_si128(B, K2);
247
41
      B = _mm_aesdec_si128(B, K3);
248
41
      B = _mm_aesdec_si128(B, K4);
249
41
      B = _mm_aesdec_si128(B, K5);
250
41
      B = _mm_aesdec_si128(B, K6);
251
41
      B = _mm_aesdec_si128(B, K7);
252
41
      B = _mm_aesdec_si128(B, K8);
253
41
      B = _mm_aesdec_si128(B, K9);
254
41
      B = _mm_aesdeclast_si128(B, K10);
255
256
41
      _mm_storeu_si128(out_mm + i, B);
257
41
      }
258
99
   }
259
260
/*
261
* AES-128 Key Schedule
262
*/
263
BOTAN_FUNC_ISA("ssse3,aes")
264
void AES_128::aesni_key_schedule(const uint8_t key[], size_t)
265
204
   {
266
204
   m_EK.resize(44);
267
204
   m_DK.resize(44);
268
269
204
   #define AES_128_key_exp(K, RCON) \
270
2.04k
      aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
271
272
204
   const __m128i K0  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
273
204
   const __m128i K1  = AES_128_key_exp(K0, 0x01);
274
204
   const __m128i K2  = AES_128_key_exp(K1, 0x02);
275
204
   const __m128i K3  = AES_128_key_exp(K2, 0x04);
276
204
   const __m128i K4  = AES_128_key_exp(K3, 0x08);
277
204
   const __m128i K5  = AES_128_key_exp(K4, 0x10);
278
204
   const __m128i K6  = AES_128_key_exp(K5, 0x20);
279
204
   const __m128i K7  = AES_128_key_exp(K6, 0x40);
280
204
   const __m128i K8  = AES_128_key_exp(K7, 0x80);
281
204
   const __m128i K9  = AES_128_key_exp(K8, 0x1B);
282
204
   const __m128i K10 = AES_128_key_exp(K9, 0x36);
283
284
204
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
285
204
   _mm_storeu_si128(EK_mm     , K0);
286
204
   _mm_storeu_si128(EK_mm +  1, K1);
287
204
   _mm_storeu_si128(EK_mm +  2, K2);
288
204
   _mm_storeu_si128(EK_mm +  3, K3);
289
204
   _mm_storeu_si128(EK_mm +  4, K4);
290
204
   _mm_storeu_si128(EK_mm +  5, K5);
291
204
   _mm_storeu_si128(EK_mm +  6, K6);
292
204
   _mm_storeu_si128(EK_mm +  7, K7);
293
204
   _mm_storeu_si128(EK_mm +  8, K8);
294
204
   _mm_storeu_si128(EK_mm +  9, K9);
295
204
   _mm_storeu_si128(EK_mm + 10, K10);
296
297
   // Now generate decryption keys
298
299
204
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
300
204
   _mm_storeu_si128(DK_mm     , K10);
301
204
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K9));
302
204
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K8));
303
204
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K7));
304
204
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K6));
305
204
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K5));
306
204
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K4));
307
204
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K3));
308
204
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K2));
309
204
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K1));
310
204
   _mm_storeu_si128(DK_mm + 10, K0);
311
204
   }
312
313
/*
314
* AES-192 Encryption
315
*/
316
BOTAN_FUNC_ISA("ssse3,aes")
317
void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
318
0
   {
319
0
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
320
0
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
321
322
0
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
323
324
0
   const __m128i K0  = _mm_loadu_si128(key_mm);
325
0
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
326
0
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
327
0
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
328
0
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
329
0
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
330
0
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
331
0
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
332
0
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
333
0
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
334
0
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
335
0
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
336
0
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
337
338
0
   while(blocks >= 4)
339
0
      {
340
0
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
341
0
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
342
0
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
343
0
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
344
345
0
      B0 = _mm_xor_si128(B0, K0);
346
0
      B1 = _mm_xor_si128(B1, K0);
347
0
      B2 = _mm_xor_si128(B2, K0);
348
0
      B3 = _mm_xor_si128(B3, K0);
349
350
0
      AES_ENC_4_ROUNDS(K1);
351
0
      AES_ENC_4_ROUNDS(K2);
352
0
      AES_ENC_4_ROUNDS(K3);
353
0
      AES_ENC_4_ROUNDS(K4);
354
0
      AES_ENC_4_ROUNDS(K5);
355
0
      AES_ENC_4_ROUNDS(K6);
356
0
      AES_ENC_4_ROUNDS(K7);
357
0
      AES_ENC_4_ROUNDS(K8);
358
0
      AES_ENC_4_ROUNDS(K9);
359
0
      AES_ENC_4_ROUNDS(K10);
360
0
      AES_ENC_4_ROUNDS(K11);
361
0
      AES_ENC_4_LAST_ROUNDS(K12);
362
363
0
      _mm_storeu_si128(out_mm + 0, B0);
364
0
      _mm_storeu_si128(out_mm + 1, B1);
365
0
      _mm_storeu_si128(out_mm + 2, B2);
366
0
      _mm_storeu_si128(out_mm + 3, B3);
367
368
0
      blocks -= 4;
369
0
      in_mm += 4;
370
0
      out_mm += 4;
371
0
      }
372
373
0
   for(size_t i = 0; i != blocks; ++i)
374
0
      {
375
0
      __m128i B = _mm_loadu_si128(in_mm + i);
376
377
0
      B = _mm_xor_si128(B, K0);
378
379
0
      B = _mm_aesenc_si128(B, K1);
380
0
      B = _mm_aesenc_si128(B, K2);
381
0
      B = _mm_aesenc_si128(B, K3);
382
0
      B = _mm_aesenc_si128(B, K4);
383
0
      B = _mm_aesenc_si128(B, K5);
384
0
      B = _mm_aesenc_si128(B, K6);
385
0
      B = _mm_aesenc_si128(B, K7);
386
0
      B = _mm_aesenc_si128(B, K8);
387
0
      B = _mm_aesenc_si128(B, K9);
388
0
      B = _mm_aesenc_si128(B, K10);
389
0
      B = _mm_aesenc_si128(B, K11);
390
0
      B = _mm_aesenclast_si128(B, K12);
391
392
0
      _mm_storeu_si128(out_mm + i, B);
393
0
      }
394
0
   }
395
396
/*
397
* AES-192 Decryption
398
*/
399
BOTAN_FUNC_ISA("ssse3,aes")
400
void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
401
0
   {
402
0
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
403
0
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
404
405
0
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
406
407
0
   const __m128i K0  = _mm_loadu_si128(key_mm);
408
0
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
409
0
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
410
0
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
411
0
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
412
0
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
413
0
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
414
0
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
415
0
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
416
0
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
417
0
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
418
0
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
419
0
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
420
421
0
   while(blocks >= 4)
422
0
      {
423
0
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
424
0
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
425
0
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
426
0
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
427
428
0
      B0 = _mm_xor_si128(B0, K0);
429
0
      B1 = _mm_xor_si128(B1, K0);
430
0
      B2 = _mm_xor_si128(B2, K0);
431
0
      B3 = _mm_xor_si128(B3, K0);
432
433
0
      AES_DEC_4_ROUNDS(K1);
434
0
      AES_DEC_4_ROUNDS(K2);
435
0
      AES_DEC_4_ROUNDS(K3);
436
0
      AES_DEC_4_ROUNDS(K4);
437
0
      AES_DEC_4_ROUNDS(K5);
438
0
      AES_DEC_4_ROUNDS(K6);
439
0
      AES_DEC_4_ROUNDS(K7);
440
0
      AES_DEC_4_ROUNDS(K8);
441
0
      AES_DEC_4_ROUNDS(K9);
442
0
      AES_DEC_4_ROUNDS(K10);
443
0
      AES_DEC_4_ROUNDS(K11);
444
0
      AES_DEC_4_LAST_ROUNDS(K12);
445
446
0
      _mm_storeu_si128(out_mm + 0, B0);
447
0
      _mm_storeu_si128(out_mm + 1, B1);
448
0
      _mm_storeu_si128(out_mm + 2, B2);
449
0
      _mm_storeu_si128(out_mm + 3, B3);
450
451
0
      blocks -= 4;
452
0
      in_mm += 4;
453
0
      out_mm += 4;
454
0
      }
455
456
0
   for(size_t i = 0; i != blocks; ++i)
457
0
      {
458
0
      __m128i B = _mm_loadu_si128(in_mm + i);
459
460
0
      B = _mm_xor_si128(B, K0);
461
462
0
      B = _mm_aesdec_si128(B, K1);
463
0
      B = _mm_aesdec_si128(B, K2);
464
0
      B = _mm_aesdec_si128(B, K3);
465
0
      B = _mm_aesdec_si128(B, K4);
466
0
      B = _mm_aesdec_si128(B, K5);
467
0
      B = _mm_aesdec_si128(B, K6);
468
0
      B = _mm_aesdec_si128(B, K7);
469
0
      B = _mm_aesdec_si128(B, K8);
470
0
      B = _mm_aesdec_si128(B, K9);
471
0
      B = _mm_aesdec_si128(B, K10);
472
0
      B = _mm_aesdec_si128(B, K11);
473
0
      B = _mm_aesdeclast_si128(B, K12);
474
475
0
      _mm_storeu_si128(out_mm + i, B);
476
0
      }
477
0
   }
478
479
/*
480
* AES-192 Key Schedule
481
*/
482
BOTAN_FUNC_ISA("ssse3,aes")
483
void AES_192::aesni_key_schedule(const uint8_t key[], size_t)
484
0
   {
485
0
   m_EK.resize(52);
486
0
   m_DK.resize(52);
487
488
0
   __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
489
0
   __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
490
0
   K1 = _mm_srli_si128(K1, 8);
491
492
0
   load_le(m_EK.data(), key, 6);
493
494
0
   #define AES_192_key_exp(RCON, EK_OFF)                         \
495
0
     aes_192_key_expansion(&K0, &K1,                             \
496
0
                           _mm_aeskeygenassist_si128(K1, RCON),  \
497
0
                           &m_EK[EK_OFF], EK_OFF == 48)
498
499
0
   AES_192_key_exp(0x01, 6);
500
0
   AES_192_key_exp(0x02, 12);
501
0
   AES_192_key_exp(0x04, 18);
502
0
   AES_192_key_exp(0x08, 24);
503
0
   AES_192_key_exp(0x10, 30);
504
0
   AES_192_key_exp(0x20, 36);
505
0
   AES_192_key_exp(0x40, 42);
506
0
   AES_192_key_exp(0x80, 48);
507
508
0
   #undef AES_192_key_exp
509
510
   // Now generate decryption keys
511
0
   const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
512
513
0
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
514
0
   _mm_storeu_si128(DK_mm     , _mm_loadu_si128(EK_mm + 12));
515
0
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
516
0
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
517
0
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
518
0
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
519
0
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
520
0
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
521
0
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
522
0
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
523
0
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
524
0
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
525
0
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
526
0
   _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
527
0
   }
528
529
/*
530
* AES-256 Encryption
531
*/
532
BOTAN_FUNC_ISA("ssse3,aes")
533
void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
534
5.25k
   {
535
5.25k
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
536
5.25k
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
537
538
5.25k
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
539
540
5.25k
   const __m128i K0  = _mm_loadu_si128(key_mm);
541
5.25k
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
542
5.25k
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
543
5.25k
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
544
5.25k
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
545
5.25k
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
546
5.25k
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
547
5.25k
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
548
5.25k
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
549
5.25k
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
550
5.25k
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
551
5.25k
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
552
5.25k
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
553
5.25k
   const __m128i K13 = _mm_loadu_si128(key_mm + 13);
554
5.25k
   const __m128i K14 = _mm_loadu_si128(key_mm + 14);
555
556
9.82k
   while(blocks >= 4)
557
4.56k
      {
558
4.56k
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
559
4.56k
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
560
4.56k
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
561
4.56k
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
562
563
4.56k
      B0 = _mm_xor_si128(B0, K0);
564
4.56k
      B1 = _mm_xor_si128(B1, K0);
565
4.56k
      B2 = _mm_xor_si128(B2, K0);
566
4.56k
      B3 = _mm_xor_si128(B3, K0);
567
568
4.56k
      AES_ENC_4_ROUNDS(K1);
569
4.56k
      AES_ENC_4_ROUNDS(K2);
570
4.56k
      AES_ENC_4_ROUNDS(K3);
571
4.56k
      AES_ENC_4_ROUNDS(K4);
572
4.56k
      AES_ENC_4_ROUNDS(K5);
573
4.56k
      AES_ENC_4_ROUNDS(K6);
574
4.56k
      AES_ENC_4_ROUNDS(K7);
575
4.56k
      AES_ENC_4_ROUNDS(K8);
576
4.56k
      AES_ENC_4_ROUNDS(K9);
577
4.56k
      AES_ENC_4_ROUNDS(K10);
578
4.56k
      AES_ENC_4_ROUNDS(K11);
579
4.56k
      AES_ENC_4_ROUNDS(K12);
580
4.56k
      AES_ENC_4_ROUNDS(K13);
581
4.56k
      AES_ENC_4_LAST_ROUNDS(K14);
582
583
4.56k
      _mm_storeu_si128(out_mm + 0, B0);
584
4.56k
      _mm_storeu_si128(out_mm + 1, B1);
585
4.56k
      _mm_storeu_si128(out_mm + 2, B2);
586
4.56k
      _mm_storeu_si128(out_mm + 3, B3);
587
588
4.56k
      blocks -= 4;
589
4.56k
      in_mm += 4;
590
4.56k
      out_mm += 4;
591
4.56k
      }
592
593
9.37k
   for(size_t i = 0; i != blocks; ++i)
594
4.11k
      {
595
4.11k
      __m128i B = _mm_loadu_si128(in_mm + i);
596
597
4.11k
      B = _mm_xor_si128(B, K0);
598
599
4.11k
      B = _mm_aesenc_si128(B, K1);
600
4.11k
      B = _mm_aesenc_si128(B, K2);
601
4.11k
      B = _mm_aesenc_si128(B, K3);
602
4.11k
      B = _mm_aesenc_si128(B, K4);
603
4.11k
      B = _mm_aesenc_si128(B, K5);
604
4.11k
      B = _mm_aesenc_si128(B, K6);
605
4.11k
      B = _mm_aesenc_si128(B, K7);
606
4.11k
      B = _mm_aesenc_si128(B, K8);
607
4.11k
      B = _mm_aesenc_si128(B, K9);
608
4.11k
      B = _mm_aesenc_si128(B, K10);
609
4.11k
      B = _mm_aesenc_si128(B, K11);
610
4.11k
      B = _mm_aesenc_si128(B, K12);
611
4.11k
      B = _mm_aesenc_si128(B, K13);
612
4.11k
      B = _mm_aesenclast_si128(B, K14);
613
614
4.11k
      _mm_storeu_si128(out_mm + i, B);
615
4.11k
      }
616
5.25k
   }
617
618
/*
619
* AES-256 Decryption
620
*/
621
BOTAN_FUNC_ISA("ssse3,aes")
622
void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
623
191
   {
624
191
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
625
191
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
626
627
191
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
628
629
191
   const __m128i K0  = _mm_loadu_si128(key_mm);
630
191
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
631
191
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
632
191
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
633
191
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
634
191
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
635
191
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
636
191
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
637
191
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
638
191
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
639
191
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
640
191
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
641
191
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
642
191
   const __m128i K13 = _mm_loadu_si128(key_mm + 13);
643
191
   const __m128i K14 = _mm_loadu_si128(key_mm + 14);
644
645
851
   while(blocks >= 4)
646
660
      {
647
660
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
648
660
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
649
660
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
650
660
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
651
652
660
      B0 = _mm_xor_si128(B0, K0);
653
660
      B1 = _mm_xor_si128(B1, K0);
654
660
      B2 = _mm_xor_si128(B2, K0);
655
660
      B3 = _mm_xor_si128(B3, K0);
656
657
660
      AES_DEC_4_ROUNDS(K1);
658
660
      AES_DEC_4_ROUNDS(K2);
659
660
      AES_DEC_4_ROUNDS(K3);
660
660
      AES_DEC_4_ROUNDS(K4);
661
660
      AES_DEC_4_ROUNDS(K5);
662
660
      AES_DEC_4_ROUNDS(K6);
663
660
      AES_DEC_4_ROUNDS(K7);
664
660
      AES_DEC_4_ROUNDS(K8);
665
660
      AES_DEC_4_ROUNDS(K9);
666
660
      AES_DEC_4_ROUNDS(K10);
667
660
      AES_DEC_4_ROUNDS(K11);
668
660
      AES_DEC_4_ROUNDS(K12);
669
660
      AES_DEC_4_ROUNDS(K13);
670
660
      AES_DEC_4_LAST_ROUNDS(K14);
671
672
660
      _mm_storeu_si128(out_mm + 0, B0);
673
660
      _mm_storeu_si128(out_mm + 1, B1);
674
660
      _mm_storeu_si128(out_mm + 2, B2);
675
660
      _mm_storeu_si128(out_mm + 3, B3);
676
677
660
      blocks -= 4;
678
660
      in_mm += 4;
679
660
      out_mm += 4;
680
660
      }
681
682
283
   for(size_t i = 0; i != blocks; ++i)
683
92
      {
684
92
      __m128i B = _mm_loadu_si128(in_mm + i);
685
686
92
      B = _mm_xor_si128(B, K0);
687
688
92
      B = _mm_aesdec_si128(B, K1);
689
92
      B = _mm_aesdec_si128(B, K2);
690
92
      B = _mm_aesdec_si128(B, K3);
691
92
      B = _mm_aesdec_si128(B, K4);
692
92
      B = _mm_aesdec_si128(B, K5);
693
92
      B = _mm_aesdec_si128(B, K6);
694
92
      B = _mm_aesdec_si128(B, K7);
695
92
      B = _mm_aesdec_si128(B, K8);
696
92
      B = _mm_aesdec_si128(B, K9);
697
92
      B = _mm_aesdec_si128(B, K10);
698
92
      B = _mm_aesdec_si128(B, K11);
699
92
      B = _mm_aesdec_si128(B, K12);
700
92
      B = _mm_aesdec_si128(B, K13);
701
92
      B = _mm_aesdeclast_si128(B, K14);
702
703
92
      _mm_storeu_si128(out_mm + i, B);
704
92
      }
705
191
   }
706
707
/*
708
* AES-256 Key Schedule
709
*/
710
BOTAN_FUNC_ISA("ssse3,aes")
711
void AES_256::aesni_key_schedule(const uint8_t key[], size_t)
712
1.18k
   {
713
1.18k
   m_EK.resize(60);
714
1.18k
   m_DK.resize(60);
715
716
1.18k
   const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
717
1.18k
   const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
718
719
1.18k
   const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
720
1.18k
   const __m128i K3 = aes_256_key_expansion(K1, K2);
721
722
1.18k
   const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
723
1.18k
   const __m128i K5 = aes_256_key_expansion(K3, K4);
724
725
1.18k
   const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
726
1.18k
   const __m128i K7 = aes_256_key_expansion(K5, K6);
727
728
1.18k
   const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
729
1.18k
   const __m128i K9 = aes_256_key_expansion(K7, K8);
730
731
1.18k
   const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
732
1.18k
   const __m128i K11 = aes_256_key_expansion(K9, K10);
733
734
1.18k
   const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
735
1.18k
   const __m128i K13 = aes_256_key_expansion(K11, K12);
736
737
1.18k
   const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
738
739
1.18k
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
740
1.18k
   _mm_storeu_si128(EK_mm     , K0);
741
1.18k
   _mm_storeu_si128(EK_mm +  1, K1);
742
1.18k
   _mm_storeu_si128(EK_mm +  2, K2);
743
1.18k
   _mm_storeu_si128(EK_mm +  3, K3);
744
1.18k
   _mm_storeu_si128(EK_mm +  4, K4);
745
1.18k
   _mm_storeu_si128(EK_mm +  5, K5);
746
1.18k
   _mm_storeu_si128(EK_mm +  6, K6);
747
1.18k
   _mm_storeu_si128(EK_mm +  7, K7);
748
1.18k
   _mm_storeu_si128(EK_mm +  8, K8);
749
1.18k
   _mm_storeu_si128(EK_mm +  9, K9);
750
1.18k
   _mm_storeu_si128(EK_mm + 10, K10);
751
1.18k
   _mm_storeu_si128(EK_mm + 11, K11);
752
1.18k
   _mm_storeu_si128(EK_mm + 12, K12);
753
1.18k
   _mm_storeu_si128(EK_mm + 13, K13);
754
1.18k
   _mm_storeu_si128(EK_mm + 14, K14);
755
756
   // Now generate decryption keys
757
1.18k
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
758
1.18k
   _mm_storeu_si128(DK_mm     , K14);
759
1.18k
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K13));
760
1.18k
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K12));
761
1.18k
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K11));
762
1.18k
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K10));
763
1.18k
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K9));
764
1.18k
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K8));
765
1.18k
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K7));
766
1.18k
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K6));
767
1.18k
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K5));
768
1.18k
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
769
1.18k
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
770
1.18k
   _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
771
1.18k
   _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
772
1.18k
   _mm_storeu_si128(DK_mm + 14, K0);
773
1.18k
   }
774
775
#undef AES_ENC_4_ROUNDS
776
#undef AES_ENC_4_LAST_ROUNDS
777
#undef AES_DEC_4_ROUNDS
778
#undef AES_DEC_4_LAST_ROUNDS
779
780
}