Coverage Report

Created: 2020-02-14 15:38

/src/botan/src/lib/block/aes/aes_ni/aes_ni.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* AES using AES-NI instructions
3
* (C) 2009,2012 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/aes.h>
9
#include <botan/loadstor.h>
10
#include <wmmintrin.h>
11
12
namespace Botan {
13
14
namespace {
15
16
/*
* Compute the next 128-bit round key.
*
* key           - the round key being expanded
* key_with_rcon - result of _mm_aeskeygenassist_si128; only its top
*                 32-bit lane (index 3) contributes to the output
*/
BOTAN_FUNC_ISA("ssse3")
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   // Three shift-by-one-word/XOR passes leave word i holding w0 ^ ... ^ wi
   for(size_t pass = 0; pass != 3; ++pass)
      {
      key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
      }

   // Broadcast lane 3 of the assist value and fold it into all four words
   const __m128i assist = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));
   return _mm_xor_si128(key, assist);
   }
25
26
/*
* One step of the AES-192 key expansion.
*
* K1, K2         - in/out expansion state; *K1 is always updated, *K2 only
*                  when `last` is false
* key2_with_rcon - result of _mm_aeskeygenassist_si128; 32-bit lane 1 is used
* out            - receives four words at out[0..3], plus out[4] and out[5]
*                  when `last` is false
* last           - true for the final step, which emits only four words
*/
BOTAN_FUNC_ISA("ssse3")
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
   {
   // Both state words are read before either one is written back
   __m128i lo = *K1;
   __m128i hi = *K2;

   for(size_t pass = 0; pass != 3; ++pass)
      {
      lo = _mm_xor_si128(lo, _mm_slli_si128(lo, 4));
      }
   lo = _mm_xor_si128(lo, _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1)));

   *K1 = lo;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), lo);

   if(!last)
      {
      // The upper part is chained off the top word of the freshly derived lower part
      hi = _mm_xor_si128(hi, _mm_slli_si128(hi, 4));
      hi = _mm_xor_si128(hi, _mm_shuffle_epi32(lo, _MM_SHUFFLE(3,3,3,3)));

      *K2 = hi;
      out[4] = _mm_cvtsi128_si32(hi);
      out[5] = _mm_cvtsi128_si32(_mm_srli_si128(hi, 4));
      }
   }
52
53
/*
54
* The second half of the AES-256 key expansion (other half same as AES-128)
55
*/
56
BOTAN_FUNC_ISA("ssse3,aes")
57
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
58
6.07k
   {
59
6.07k
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
60
6.07k
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
61
6.07k
62
6.07k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
63
6.07k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64
6.07k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65
6.07k
   return _mm_xor_si128(key, key_with_rcon);
66
6.07k
   }
67
68
}
69
70
/*
* Apply _mm_aesenc_si128 with round key K to each of B0, B1, B2 and B3.
* Those four variables must be in scope where the macro is expanded.
*/
#define AES_ENC_4_ROUNDS(K)            \
   do {                                \
      B0 = _mm_aesenc_si128(B0, K);    \
      B1 = _mm_aesenc_si128(B1, K);    \
      B2 = _mm_aesenc_si128(B2, K);    \
      B3 = _mm_aesenc_si128(B3, K);    \
   } while(0)
78
79
/*
* Apply _mm_aesenclast_si128 with round key K to each of B0, B1, B2 and B3.
* Those four variables must be in scope where the macro is expanded.
*/
#define AES_ENC_4_LAST_ROUNDS(K)           \
   do {                                    \
      B0 = _mm_aesenclast_si128(B0, K);    \
      B1 = _mm_aesenclast_si128(B1, K);    \
      B2 = _mm_aesenclast_si128(B2, K);    \
      B3 = _mm_aesenclast_si128(B3, K);    \
   } while(0)
87
88
/*
* Apply _mm_aesdec_si128 with round key K to each of B0, B1, B2 and B3.
* Those four variables must be in scope where the macro is expanded.
*/
#define AES_DEC_4_ROUNDS(K)            \
   do {                                \
      B0 = _mm_aesdec_si128(B0, K);    \
      B1 = _mm_aesdec_si128(B1, K);    \
      B2 = _mm_aesdec_si128(B2, K);    \
      B3 = _mm_aesdec_si128(B3, K);    \
   } while(0)
96
97
/*
* Apply _mm_aesdeclast_si128 with round key K to each of B0, B1, B2 and B3.
* Those four variables must be in scope where the macro is expanded.
*/
#define AES_DEC_4_LAST_ROUNDS(K)           \
   do {                                    \
      B0 = _mm_aesdeclast_si128(B0, K);    \
      B1 = _mm_aesdeclast_si128(B1, K);    \
      B2 = _mm_aesdeclast_si128(B2, K);    \
      B3 = _mm_aesdeclast_si128(B3, K);    \
   } while(0)
105
106
/*
107
* AES-128 Encryption
108
*/
109
BOTAN_FUNC_ISA("ssse3,aes")
110
void AES_128::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
111
11.4k
   {
112
11.4k
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
113
11.4k
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
114
11.4k
115
11.4k
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
116
11.4k
117
11.4k
   const __m128i K0  = _mm_loadu_si128(key_mm);
118
11.4k
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
119
11.4k
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
120
11.4k
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
121
11.4k
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
122
11.4k
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
123
11.4k
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
124
11.4k
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
125
11.4k
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
126
11.4k
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
127
11.4k
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
128
11.4k
129
12.8k
   while(blocks >= 4)
130
1.47k
      {
131
1.47k
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
132
1.47k
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
133
1.47k
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
134
1.47k
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
135
1.47k
136
1.47k
      B0 = _mm_xor_si128(B0, K0);
137
1.47k
      B1 = _mm_xor_si128(B1, K0);
138
1.47k
      B2 = _mm_xor_si128(B2, K0);
139
1.47k
      B3 = _mm_xor_si128(B3, K0);
140
1.47k
141
1.47k
      AES_ENC_4_ROUNDS(K1);
142
1.47k
      AES_ENC_4_ROUNDS(K2);
143
1.47k
      AES_ENC_4_ROUNDS(K3);
144
1.47k
      AES_ENC_4_ROUNDS(K4);
145
1.47k
      AES_ENC_4_ROUNDS(K5);
146
1.47k
      AES_ENC_4_ROUNDS(K6);
147
1.47k
      AES_ENC_4_ROUNDS(K7);
148
1.47k
      AES_ENC_4_ROUNDS(K8);
149
1.47k
      AES_ENC_4_ROUNDS(K9);
150
1.47k
      AES_ENC_4_LAST_ROUNDS(K10);
151
1.47k
152
1.47k
      _mm_storeu_si128(out_mm + 0, B0);
153
1.47k
      _mm_storeu_si128(out_mm + 1, B1);
154
1.47k
      _mm_storeu_si128(out_mm + 2, B2);
155
1.47k
      _mm_storeu_si128(out_mm + 3, B3);
156
1.47k
157
1.47k
      blocks -= 4;
158
1.47k
      in_mm += 4;
159
1.47k
      out_mm += 4;
160
1.47k
      }
161
11.4k
162
22.4k
   for(size_t i = 0; i != blocks; ++i)
163
11.0k
      {
164
11.0k
      __m128i B = _mm_loadu_si128(in_mm + i);
165
11.0k
166
11.0k
      B = _mm_xor_si128(B, K0);
167
11.0k
168
11.0k
      B = _mm_aesenc_si128(B, K1);
169
11.0k
      B = _mm_aesenc_si128(B, K2);
170
11.0k
      B = _mm_aesenc_si128(B, K3);
171
11.0k
      B = _mm_aesenc_si128(B, K4);
172
11.0k
      B = _mm_aesenc_si128(B, K5);
173
11.0k
      B = _mm_aesenc_si128(B, K6);
174
11.0k
      B = _mm_aesenc_si128(B, K7);
175
11.0k
      B = _mm_aesenc_si128(B, K8);
176
11.0k
      B = _mm_aesenc_si128(B, K9);
177
11.0k
      B = _mm_aesenclast_si128(B, K10);
178
11.0k
179
11.0k
      _mm_storeu_si128(out_mm + i, B);
180
11.0k
      }
181
11.4k
   }
182
183
/*
184
* AES-128 Decryption
185
*/
186
BOTAN_FUNC_ISA("ssse3,aes")
187
void AES_128::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
188
774
   {
189
774
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
190
774
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
191
774
192
774
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
193
774
194
774
   const __m128i K0  = _mm_loadu_si128(key_mm);
195
774
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
196
774
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
197
774
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
198
774
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
199
774
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
200
774
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
201
774
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
202
774
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
203
774
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
204
774
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
205
774
206
3.56k
   while(blocks >= 4)
207
2.79k
      {
208
2.79k
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
209
2.79k
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
210
2.79k
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
211
2.79k
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
212
2.79k
213
2.79k
      B0 = _mm_xor_si128(B0, K0);
214
2.79k
      B1 = _mm_xor_si128(B1, K0);
215
2.79k
      B2 = _mm_xor_si128(B2, K0);
216
2.79k
      B3 = _mm_xor_si128(B3, K0);
217
2.79k
218
2.79k
      AES_DEC_4_ROUNDS(K1);
219
2.79k
      AES_DEC_4_ROUNDS(K2);
220
2.79k
      AES_DEC_4_ROUNDS(K3);
221
2.79k
      AES_DEC_4_ROUNDS(K4);
222
2.79k
      AES_DEC_4_ROUNDS(K5);
223
2.79k
      AES_DEC_4_ROUNDS(K6);
224
2.79k
      AES_DEC_4_ROUNDS(K7);
225
2.79k
      AES_DEC_4_ROUNDS(K8);
226
2.79k
      AES_DEC_4_ROUNDS(K9);
227
2.79k
      AES_DEC_4_LAST_ROUNDS(K10);
228
2.79k
229
2.79k
      _mm_storeu_si128(out_mm + 0, B0);
230
2.79k
      _mm_storeu_si128(out_mm + 1, B1);
231
2.79k
      _mm_storeu_si128(out_mm + 2, B2);
232
2.79k
      _mm_storeu_si128(out_mm + 3, B3);
233
2.79k
234
2.79k
      blocks -= 4;
235
2.79k
      in_mm += 4;
236
2.79k
      out_mm += 4;
237
2.79k
      }
238
774
239
947
   for(size_t i = 0; i != blocks; ++i)
240
173
      {
241
173
      __m128i B = _mm_loadu_si128(in_mm + i);
242
173
243
173
      B = _mm_xor_si128(B, K0);
244
173
245
173
      B = _mm_aesdec_si128(B, K1);
246
173
      B = _mm_aesdec_si128(B, K2);
247
173
      B = _mm_aesdec_si128(B, K3);
248
173
      B = _mm_aesdec_si128(B, K4);
249
173
      B = _mm_aesdec_si128(B, K5);
250
173
      B = _mm_aesdec_si128(B, K6);
251
173
      B = _mm_aesdec_si128(B, K7);
252
173
      B = _mm_aesdec_si128(B, K8);
253
173
      B = _mm_aesdec_si128(B, K9);
254
173
      B = _mm_aesdeclast_si128(B, K10);
255
173
256
173
      _mm_storeu_si128(out_mm + i, B);
257
173
      }
258
774
   }
259
260
/*
261
* AES-128 Key Schedule
262
*/
263
BOTAN_FUNC_ISA("ssse3,aes")
264
void AES_128::aesni_key_schedule(const uint8_t key[], size_t)
265
618
   {
266
618
   m_EK.resize(44);
267
618
   m_DK.resize(44);
268
618
269
618
   #define AES_128_key_exp(K, RCON) \
270
6.18k
      aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
271
618
272
618
   const __m128i K0  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
273
618
   const __m128i K1  = AES_128_key_exp(K0, 0x01);
274
618
   const __m128i K2  = AES_128_key_exp(K1, 0x02);
275
618
   const __m128i K3  = AES_128_key_exp(K2, 0x04);
276
618
   const __m128i K4  = AES_128_key_exp(K3, 0x08);
277
618
   const __m128i K5  = AES_128_key_exp(K4, 0x10);
278
618
   const __m128i K6  = AES_128_key_exp(K5, 0x20);
279
618
   const __m128i K7  = AES_128_key_exp(K6, 0x40);
280
618
   const __m128i K8  = AES_128_key_exp(K7, 0x80);
281
618
   const __m128i K9  = AES_128_key_exp(K8, 0x1B);
282
618
   const __m128i K10 = AES_128_key_exp(K9, 0x36);
283
618
284
618
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
285
618
   _mm_storeu_si128(EK_mm     , K0);
286
618
   _mm_storeu_si128(EK_mm +  1, K1);
287
618
   _mm_storeu_si128(EK_mm +  2, K2);
288
618
   _mm_storeu_si128(EK_mm +  3, K3);
289
618
   _mm_storeu_si128(EK_mm +  4, K4);
290
618
   _mm_storeu_si128(EK_mm +  5, K5);
291
618
   _mm_storeu_si128(EK_mm +  6, K6);
292
618
   _mm_storeu_si128(EK_mm +  7, K7);
293
618
   _mm_storeu_si128(EK_mm +  8, K8);
294
618
   _mm_storeu_si128(EK_mm +  9, K9);
295
618
   _mm_storeu_si128(EK_mm + 10, K10);
296
618
297
618
   // Now generate decryption keys
298
618
299
618
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
300
618
   _mm_storeu_si128(DK_mm     , K10);
301
618
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K9));
302
618
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K8));
303
618
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K7));
304
618
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K6));
305
618
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K5));
306
618
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K4));
307
618
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K3));
308
618
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K2));
309
618
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K1));
310
618
   _mm_storeu_si128(DK_mm + 10, K0);
311
618
   }
312
313
/*
314
* AES-192 Encryption
315
*/
316
BOTAN_FUNC_ISA("ssse3,aes")
317
void AES_192::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
318
0
   {
319
0
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
320
0
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
321
0
322
0
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
323
0
324
0
   const __m128i K0  = _mm_loadu_si128(key_mm);
325
0
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
326
0
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
327
0
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
328
0
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
329
0
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
330
0
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
331
0
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
332
0
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
333
0
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
334
0
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
335
0
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
336
0
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
337
0
338
0
   while(blocks >= 4)
339
0
      {
340
0
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
341
0
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
342
0
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
343
0
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
344
0
345
0
      B0 = _mm_xor_si128(B0, K0);
346
0
      B1 = _mm_xor_si128(B1, K0);
347
0
      B2 = _mm_xor_si128(B2, K0);
348
0
      B3 = _mm_xor_si128(B3, K0);
349
0
350
0
      AES_ENC_4_ROUNDS(K1);
351
0
      AES_ENC_4_ROUNDS(K2);
352
0
      AES_ENC_4_ROUNDS(K3);
353
0
      AES_ENC_4_ROUNDS(K4);
354
0
      AES_ENC_4_ROUNDS(K5);
355
0
      AES_ENC_4_ROUNDS(K6);
356
0
      AES_ENC_4_ROUNDS(K7);
357
0
      AES_ENC_4_ROUNDS(K8);
358
0
      AES_ENC_4_ROUNDS(K9);
359
0
      AES_ENC_4_ROUNDS(K10);
360
0
      AES_ENC_4_ROUNDS(K11);
361
0
      AES_ENC_4_LAST_ROUNDS(K12);
362
0
363
0
      _mm_storeu_si128(out_mm + 0, B0);
364
0
      _mm_storeu_si128(out_mm + 1, B1);
365
0
      _mm_storeu_si128(out_mm + 2, B2);
366
0
      _mm_storeu_si128(out_mm + 3, B3);
367
0
368
0
      blocks -= 4;
369
0
      in_mm += 4;
370
0
      out_mm += 4;
371
0
      }
372
0
373
0
   for(size_t i = 0; i != blocks; ++i)
374
0
      {
375
0
      __m128i B = _mm_loadu_si128(in_mm + i);
376
0
377
0
      B = _mm_xor_si128(B, K0);
378
0
379
0
      B = _mm_aesenc_si128(B, K1);
380
0
      B = _mm_aesenc_si128(B, K2);
381
0
      B = _mm_aesenc_si128(B, K3);
382
0
      B = _mm_aesenc_si128(B, K4);
383
0
      B = _mm_aesenc_si128(B, K5);
384
0
      B = _mm_aesenc_si128(B, K6);
385
0
      B = _mm_aesenc_si128(B, K7);
386
0
      B = _mm_aesenc_si128(B, K8);
387
0
      B = _mm_aesenc_si128(B, K9);
388
0
      B = _mm_aesenc_si128(B, K10);
389
0
      B = _mm_aesenc_si128(B, K11);
390
0
      B = _mm_aesenclast_si128(B, K12);
391
0
392
0
      _mm_storeu_si128(out_mm + i, B);
393
0
      }
394
0
   }
395
396
/*
397
* AES-192 Decryption
398
*/
399
BOTAN_FUNC_ISA("ssse3,aes")
400
void AES_192::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
401
0
   {
402
0
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
403
0
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
404
0
405
0
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
406
0
407
0
   const __m128i K0  = _mm_loadu_si128(key_mm);
408
0
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
409
0
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
410
0
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
411
0
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
412
0
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
413
0
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
414
0
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
415
0
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
416
0
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
417
0
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
418
0
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
419
0
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
420
0
421
0
   while(blocks >= 4)
422
0
      {
423
0
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
424
0
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
425
0
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
426
0
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
427
0
428
0
      B0 = _mm_xor_si128(B0, K0);
429
0
      B1 = _mm_xor_si128(B1, K0);
430
0
      B2 = _mm_xor_si128(B2, K0);
431
0
      B3 = _mm_xor_si128(B3, K0);
432
0
433
0
      AES_DEC_4_ROUNDS(K1);
434
0
      AES_DEC_4_ROUNDS(K2);
435
0
      AES_DEC_4_ROUNDS(K3);
436
0
      AES_DEC_4_ROUNDS(K4);
437
0
      AES_DEC_4_ROUNDS(K5);
438
0
      AES_DEC_4_ROUNDS(K6);
439
0
      AES_DEC_4_ROUNDS(K7);
440
0
      AES_DEC_4_ROUNDS(K8);
441
0
      AES_DEC_4_ROUNDS(K9);
442
0
      AES_DEC_4_ROUNDS(K10);
443
0
      AES_DEC_4_ROUNDS(K11);
444
0
      AES_DEC_4_LAST_ROUNDS(K12);
445
0
446
0
      _mm_storeu_si128(out_mm + 0, B0);
447
0
      _mm_storeu_si128(out_mm + 1, B1);
448
0
      _mm_storeu_si128(out_mm + 2, B2);
449
0
      _mm_storeu_si128(out_mm + 3, B3);
450
0
451
0
      blocks -= 4;
452
0
      in_mm += 4;
453
0
      out_mm += 4;
454
0
      }
455
0
456
0
   for(size_t i = 0; i != blocks; ++i)
457
0
      {
458
0
      __m128i B = _mm_loadu_si128(in_mm + i);
459
0
460
0
      B = _mm_xor_si128(B, K0);
461
0
462
0
      B = _mm_aesdec_si128(B, K1);
463
0
      B = _mm_aesdec_si128(B, K2);
464
0
      B = _mm_aesdec_si128(B, K3);
465
0
      B = _mm_aesdec_si128(B, K4);
466
0
      B = _mm_aesdec_si128(B, K5);
467
0
      B = _mm_aesdec_si128(B, K6);
468
0
      B = _mm_aesdec_si128(B, K7);
469
0
      B = _mm_aesdec_si128(B, K8);
470
0
      B = _mm_aesdec_si128(B, K9);
471
0
      B = _mm_aesdec_si128(B, K10);
472
0
      B = _mm_aesdec_si128(B, K11);
473
0
      B = _mm_aesdeclast_si128(B, K12);
474
0
475
0
      _mm_storeu_si128(out_mm + i, B);
476
0
      }
477
0
   }
478
479
/*
480
* AES-192 Key Schedule
481
*/
482
BOTAN_FUNC_ISA("ssse3,aes")
483
void AES_192::aesni_key_schedule(const uint8_t key[], size_t)
484
0
   {
485
0
   m_EK.resize(52);
486
0
   m_DK.resize(52);
487
0
488
0
   __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
489
0
   __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
490
0
   K1 = _mm_srli_si128(K1, 8);
491
0
492
0
   load_le(m_EK.data(), key, 6);
493
0
494
0
   #define AES_192_key_exp(RCON, EK_OFF)                         \
495
0
     aes_192_key_expansion(&K0, &K1,                             \
496
0
                           _mm_aeskeygenassist_si128(K1, RCON),  \
497
0
                           &m_EK[EK_OFF], EK_OFF == 48)
498
0
499
0
   AES_192_key_exp(0x01, 6);
500
0
   AES_192_key_exp(0x02, 12);
501
0
   AES_192_key_exp(0x04, 18);
502
0
   AES_192_key_exp(0x08, 24);
503
0
   AES_192_key_exp(0x10, 30);
504
0
   AES_192_key_exp(0x20, 36);
505
0
   AES_192_key_exp(0x40, 42);
506
0
   AES_192_key_exp(0x80, 48);
507
0
508
0
   #undef AES_192_key_exp
509
0
510
0
   // Now generate decryption keys
511
0
   const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
512
0
513
0
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
514
0
   _mm_storeu_si128(DK_mm     , _mm_loadu_si128(EK_mm + 12));
515
0
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
516
0
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
517
0
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
518
0
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
519
0
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
520
0
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
521
0
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
522
0
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
523
0
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
524
0
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
525
0
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
526
0
   _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
527
0
   }
528
529
/*
530
* AES-256 Encryption
531
*/
532
BOTAN_FUNC_ISA("ssse3,aes")
533
void AES_256::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
534
8.96k
   {
535
8.96k
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
536
8.96k
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
537
8.96k
538
8.96k
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
539
8.96k
540
8.96k
   const __m128i K0  = _mm_loadu_si128(key_mm);
541
8.96k
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
542
8.96k
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
543
8.96k
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
544
8.96k
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
545
8.96k
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
546
8.96k
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
547
8.96k
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
548
8.96k
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
549
8.96k
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
550
8.96k
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
551
8.96k
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
552
8.96k
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
553
8.96k
   const __m128i K13 = _mm_loadu_si128(key_mm + 13);
554
8.96k
   const __m128i K14 = _mm_loadu_si128(key_mm + 14);
555
8.96k
556
13.9k
   while(blocks >= 4)
557
4.97k
      {
558
4.97k
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
559
4.97k
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
560
4.97k
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
561
4.97k
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
562
4.97k
563
4.97k
      B0 = _mm_xor_si128(B0, K0);
564
4.97k
      B1 = _mm_xor_si128(B1, K0);
565
4.97k
      B2 = _mm_xor_si128(B2, K0);
566
4.97k
      B3 = _mm_xor_si128(B3, K0);
567
4.97k
568
4.97k
      AES_ENC_4_ROUNDS(K1);
569
4.97k
      AES_ENC_4_ROUNDS(K2);
570
4.97k
      AES_ENC_4_ROUNDS(K3);
571
4.97k
      AES_ENC_4_ROUNDS(K4);
572
4.97k
      AES_ENC_4_ROUNDS(K5);
573
4.97k
      AES_ENC_4_ROUNDS(K6);
574
4.97k
      AES_ENC_4_ROUNDS(K7);
575
4.97k
      AES_ENC_4_ROUNDS(K8);
576
4.97k
      AES_ENC_4_ROUNDS(K9);
577
4.97k
      AES_ENC_4_ROUNDS(K10);
578
4.97k
      AES_ENC_4_ROUNDS(K11);
579
4.97k
      AES_ENC_4_ROUNDS(K12);
580
4.97k
      AES_ENC_4_ROUNDS(K13);
581
4.97k
      AES_ENC_4_LAST_ROUNDS(K14);
582
4.97k
583
4.97k
      _mm_storeu_si128(out_mm + 0, B0);
584
4.97k
      _mm_storeu_si128(out_mm + 1, B1);
585
4.97k
      _mm_storeu_si128(out_mm + 2, B2);
586
4.97k
      _mm_storeu_si128(out_mm + 3, B3);
587
4.97k
588
4.97k
      blocks -= 4;
589
4.97k
      in_mm += 4;
590
4.97k
      out_mm += 4;
591
4.97k
      }
592
8.96k
593
16.6k
   for(size_t i = 0; i != blocks; ++i)
594
7.71k
      {
595
7.71k
      __m128i B = _mm_loadu_si128(in_mm + i);
596
7.71k
597
7.71k
      B = _mm_xor_si128(B, K0);
598
7.71k
599
7.71k
      B = _mm_aesenc_si128(B, K1);
600
7.71k
      B = _mm_aesenc_si128(B, K2);
601
7.71k
      B = _mm_aesenc_si128(B, K3);
602
7.71k
      B = _mm_aesenc_si128(B, K4);
603
7.71k
      B = _mm_aesenc_si128(B, K5);
604
7.71k
      B = _mm_aesenc_si128(B, K6);
605
7.71k
      B = _mm_aesenc_si128(B, K7);
606
7.71k
      B = _mm_aesenc_si128(B, K8);
607
7.71k
      B = _mm_aesenc_si128(B, K9);
608
7.71k
      B = _mm_aesenc_si128(B, K10);
609
7.71k
      B = _mm_aesenc_si128(B, K11);
610
7.71k
      B = _mm_aesenc_si128(B, K12);
611
7.71k
      B = _mm_aesenc_si128(B, K13);
612
7.71k
      B = _mm_aesenclast_si128(B, K14);
613
7.71k
614
7.71k
      _mm_storeu_si128(out_mm + i, B);
615
7.71k
      }
616
8.96k
   }
617
618
/*
619
* AES-256 Decryption
620
*/
621
BOTAN_FUNC_ISA("ssse3,aes")
622
void AES_256::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
623
707
   {
624
707
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
625
707
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
626
707
627
707
   const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
628
707
629
707
   const __m128i K0  = _mm_loadu_si128(key_mm);
630
707
   const __m128i K1  = _mm_loadu_si128(key_mm + 1);
631
707
   const __m128i K2  = _mm_loadu_si128(key_mm + 2);
632
707
   const __m128i K3  = _mm_loadu_si128(key_mm + 3);
633
707
   const __m128i K4  = _mm_loadu_si128(key_mm + 4);
634
707
   const __m128i K5  = _mm_loadu_si128(key_mm + 5);
635
707
   const __m128i K6  = _mm_loadu_si128(key_mm + 6);
636
707
   const __m128i K7  = _mm_loadu_si128(key_mm + 7);
637
707
   const __m128i K8  = _mm_loadu_si128(key_mm + 8);
638
707
   const __m128i K9  = _mm_loadu_si128(key_mm + 9);
639
707
   const __m128i K10 = _mm_loadu_si128(key_mm + 10);
640
707
   const __m128i K11 = _mm_loadu_si128(key_mm + 11);
641
707
   const __m128i K12 = _mm_loadu_si128(key_mm + 12);
642
707
   const __m128i K13 = _mm_loadu_si128(key_mm + 13);
643
707
   const __m128i K14 = _mm_loadu_si128(key_mm + 14);
644
707
645
3.20k
   while(blocks >= 4)
646
2.49k
      {
647
2.49k
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
648
2.49k
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
649
2.49k
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
650
2.49k
      __m128i B3 = _mm_loadu_si128(in_mm + 3);
651
2.49k
652
2.49k
      B0 = _mm_xor_si128(B0, K0);
653
2.49k
      B1 = _mm_xor_si128(B1, K0);
654
2.49k
      B2 = _mm_xor_si128(B2, K0);
655
2.49k
      B3 = _mm_xor_si128(B3, K0);
656
2.49k
657
2.49k
      AES_DEC_4_ROUNDS(K1);
658
2.49k
      AES_DEC_4_ROUNDS(K2);
659
2.49k
      AES_DEC_4_ROUNDS(K3);
660
2.49k
      AES_DEC_4_ROUNDS(K4);
661
2.49k
      AES_DEC_4_ROUNDS(K5);
662
2.49k
      AES_DEC_4_ROUNDS(K6);
663
2.49k
      AES_DEC_4_ROUNDS(K7);
664
2.49k
      AES_DEC_4_ROUNDS(K8);
665
2.49k
      AES_DEC_4_ROUNDS(K9);
666
2.49k
      AES_DEC_4_ROUNDS(K10);
667
2.49k
      AES_DEC_4_ROUNDS(K11);
668
2.49k
      AES_DEC_4_ROUNDS(K12);
669
2.49k
      AES_DEC_4_ROUNDS(K13);
670
2.49k
      AES_DEC_4_LAST_ROUNDS(K14);
671
2.49k
672
2.49k
      _mm_storeu_si128(out_mm + 0, B0);
673
2.49k
      _mm_storeu_si128(out_mm + 1, B1);
674
2.49k
      _mm_storeu_si128(out_mm + 2, B2);
675
2.49k
      _mm_storeu_si128(out_mm + 3, B3);
676
2.49k
677
2.49k
      blocks -= 4;
678
2.49k
      in_mm += 4;
679
2.49k
      out_mm += 4;
680
2.49k
      }
681
707
682
886
   for(size_t i = 0; i != blocks; ++i)
683
179
      {
684
179
      __m128i B = _mm_loadu_si128(in_mm + i);
685
179
686
179
      B = _mm_xor_si128(B, K0);
687
179
688
179
      B = _mm_aesdec_si128(B, K1);
689
179
      B = _mm_aesdec_si128(B, K2);
690
179
      B = _mm_aesdec_si128(B, K3);
691
179
      B = _mm_aesdec_si128(B, K4);
692
179
      B = _mm_aesdec_si128(B, K5);
693
179
      B = _mm_aesdec_si128(B, K6);
694
179
      B = _mm_aesdec_si128(B, K7);
695
179
      B = _mm_aesdec_si128(B, K8);
696
179
      B = _mm_aesdec_si128(B, K9);
697
179
      B = _mm_aesdec_si128(B, K10);
698
179
      B = _mm_aesdec_si128(B, K11);
699
179
      B = _mm_aesdec_si128(B, K12);
700
179
      B = _mm_aesdec_si128(B, K13);
701
179
      B = _mm_aesdeclast_si128(B, K14);
702
179
703
179
      _mm_storeu_si128(out_mm + i, B);
704
179
      }
705
707
   }
706
707
/*
708
* AES-256 Key Schedule
709
*/
710
BOTAN_FUNC_ISA("ssse3,aes")
711
void AES_256::aesni_key_schedule(const uint8_t key[], size_t)
712
1.01k
   {
713
1.01k
   m_EK.resize(60);
714
1.01k
   m_DK.resize(60);
715
1.01k
716
1.01k
   const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
717
1.01k
   const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
718
1.01k
719
1.01k
   const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
720
1.01k
   const __m128i K3 = aes_256_key_expansion(K1, K2);
721
1.01k
722
1.01k
   const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
723
1.01k
   const __m128i K5 = aes_256_key_expansion(K3, K4);
724
1.01k
725
1.01k
   const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
726
1.01k
   const __m128i K7 = aes_256_key_expansion(K5, K6);
727
1.01k
728
1.01k
   const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
729
1.01k
   const __m128i K9 = aes_256_key_expansion(K7, K8);
730
1.01k
731
1.01k
   const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
732
1.01k
   const __m128i K11 = aes_256_key_expansion(K9, K10);
733
1.01k
734
1.01k
   const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
735
1.01k
   const __m128i K13 = aes_256_key_expansion(K11, K12);
736
1.01k
737
1.01k
   const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
738
1.01k
739
1.01k
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
740
1.01k
   _mm_storeu_si128(EK_mm     , K0);
741
1.01k
   _mm_storeu_si128(EK_mm +  1, K1);
742
1.01k
   _mm_storeu_si128(EK_mm +  2, K2);
743
1.01k
   _mm_storeu_si128(EK_mm +  3, K3);
744
1.01k
   _mm_storeu_si128(EK_mm +  4, K4);
745
1.01k
   _mm_storeu_si128(EK_mm +  5, K5);
746
1.01k
   _mm_storeu_si128(EK_mm +  6, K6);
747
1.01k
   _mm_storeu_si128(EK_mm +  7, K7);
748
1.01k
   _mm_storeu_si128(EK_mm +  8, K8);
749
1.01k
   _mm_storeu_si128(EK_mm +  9, K9);
750
1.01k
   _mm_storeu_si128(EK_mm + 10, K10);
751
1.01k
   _mm_storeu_si128(EK_mm + 11, K11);
752
1.01k
   _mm_storeu_si128(EK_mm + 12, K12);
753
1.01k
   _mm_storeu_si128(EK_mm + 13, K13);
754
1.01k
   _mm_storeu_si128(EK_mm + 14, K14);
755
1.01k
756
1.01k
   // Now generate decryption keys
757
1.01k
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
758
1.01k
   _mm_storeu_si128(DK_mm     , K14);
759
1.01k
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K13));
760
1.01k
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K12));
761
1.01k
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K11));
762
1.01k
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K10));
763
1.01k
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K9));
764
1.01k
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K8));
765
1.01k
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K7));
766
1.01k
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K6));
767
1.01k
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K5));
768
1.01k
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
769
1.01k
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
770
1.01k
   _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
771
1.01k
   _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
772
1.01k
   _mm_storeu_si128(DK_mm + 14, K0);
773
1.01k
   }
774
775
#undef AES_ENC_4_ROUNDS
776
#undef AES_ENC_4_LAST_ROUNDS
777
#undef AES_DEC_4_ROUNDS
778
#undef AES_DEC_4_LAST_ROUNDS
779
780
}