Coverage Report

Created: 2022-06-23 06:44

/src/botan/src/lib/block/aes/aes_ni/aes_ni.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* AES using AES-NI instructions
3
* (C) 2009,2012 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/internal/aes.h>
9
#include <botan/internal/loadstor.h>
10
#include <botan/internal/simd_32.h>
11
#include <wmmintrin.h>
12
13
namespace Botan {
14
15
namespace {
16
17
/*
* One step of the AES-128 key expansion.
*
* key is the previous round key; key_with_rcon is the result of
* _mm_aeskeygenassist_si128 for the key word feeding this step.
* Returns the next round key.
*/
BOTAN_FUNC_ISA("ssse3")
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   // Replicate 32-bit lane 3 of the keygenassist output into all four lanes
   const __m128i assist = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));

   // Three xor-with-left-shifted-self steps (4 bytes each): afterwards every
   // lane holds the xor of itself and all lower lanes of the original key
   for(size_t step = 0; step != 3; ++step)
      {
      key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
      }

   return _mm_xor_si128(key, assist);
   }
26
27
/*
* One step of the AES-192 key expansion.
*
* K1 and K2 carry the running key state between calls and are updated in
* place. key2_with_rcon is the _mm_aeskeygenassist_si128 result computed
* from *K2. Four words are always written to out[0..3]; unless last is
* set, two further words are written to out[4..5].
*/
BOTAN_FUNC_ISA("ssse3")
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
   {
   __m128i lo = *K1;
   __m128i hi = *K2;

   // Replicate 32-bit lane 1 of the keygenassist output into all four lanes
   const __m128i assist = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));

   // Prefix-xor across the four lanes of the low half, then mix in the assist word
   for(size_t step = 0; step != 3; ++step)
      {
      lo = _mm_xor_si128(lo, _mm_slli_si128(lo, 4));
      }
   lo = _mm_xor_si128(lo, assist);

   *K1 = lo;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), lo);

   if(!last)
      {
      // Extend the two remaining words, chaining from the top lane of lo
      hi = _mm_xor_si128(hi, _mm_slli_si128(hi, 4));
      hi = _mm_xor_si128(hi, _mm_shuffle_epi32(lo, _MM_SHUFFLE(3,3,3,3)));

      *K2 = hi;
      out[4] = static_cast<uint32_t>(_mm_cvtsi128_si32(hi));
      out[5] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(hi, 4)));
      }
   }
53
54
/*
55
* The second half of the AES-256 key expansion (other half same as AES-128)
56
*/
57
BOTAN_FUNC_ISA("ssse3,aes")
58
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
59
3.82k
   {
60
3.82k
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
61
3.82k
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
62
63
3.82k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64
3.82k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65
3.82k
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
66
3.82k
   return _mm_xor_si128(key, key_with_rcon);
67
3.82k
   }
68
69
/*
* XOR the round key K into each of the four block states
*/
BOTAN_FORCE_INLINE void keyxor(
   SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3)
   {
   B0 ^= K;  B1 ^= K;
   B2 ^= K;  B3 ^= K;
   }
77
78
/*
* Apply one AESENC round with round key K to the single block state B
*/
BOTAN_FUNC_ISA("aes")
BOTAN_FORCE_INLINE void aesenc(SIMD_4x32 K, SIMD_4x32& B)
   {
   const auto state = B.raw();
   const auto round_key = K.raw();
   B = SIMD_4x32(_mm_aesenc_si128(state, round_key));
   }
83
84
/*
* Apply one AESENC round with round key K to four block states
*/
BOTAN_FUNC_ISA("aes")
BOTAN_FORCE_INLINE void aesenc(
   SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3)
   {
   // Each state is independent; reuse the single-block overload
   aesenc(K, B0);
   aesenc(K, B1);
   aesenc(K, B2);
   aesenc(K, B3);
   }
93
94
/*
* Apply the final encryption round (AESENCLAST) with round key K to B
*/
BOTAN_FUNC_ISA("aes")
BOTAN_FORCE_INLINE void aesenclast(SIMD_4x32 K, SIMD_4x32& B)
   {
   const auto state = B.raw();
   const auto round_key = K.raw();
   B = SIMD_4x32(_mm_aesenclast_si128(state, round_key));
   }
99
100
/*
* Apply the final encryption round (AESENCLAST) with round key K to four
* block states
*/
BOTAN_FUNC_ISA("aes")
BOTAN_FORCE_INLINE void aesenclast(
   SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3)
   {
   // Each state is independent; reuse the single-block overload
   aesenclast(K, B0);
   aesenclast(K, B1);
   aesenclast(K, B2);
   aesenclast(K, B3);
   }
109
110
/*
* Apply one AESDEC round with round key K to the single block state B
*/
BOTAN_FUNC_ISA("aes")
BOTAN_FORCE_INLINE void aesdec(SIMD_4x32 K, SIMD_4x32& B)
   {
   const auto state = B.raw();
   const auto round_key = K.raw();
   B = SIMD_4x32(_mm_aesdec_si128(state, round_key));
   }
115
116
/*
* Apply one AESDEC round with round key K to four block states
*/
BOTAN_FUNC_ISA("aes")
BOTAN_FORCE_INLINE void aesdec(
   SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3)
   {
   // Each state is independent; reuse the single-block overload
   aesdec(K, B0);
   aesdec(K, B1);
   aesdec(K, B2);
   aesdec(K, B3);
   }
125
126
/*
* Apply the final decryption round (AESDECLAST) with round key K to B
*/
BOTAN_FUNC_ISA("aes")
BOTAN_FORCE_INLINE void aesdeclast(SIMD_4x32 K, SIMD_4x32& B)
   {
   const auto state = B.raw();
   const auto round_key = K.raw();
   B = SIMD_4x32(_mm_aesdeclast_si128(state, round_key));
   }
131
132
/*
* Apply the final decryption round (AESDECLAST) with round key K to four
* block states
*/
BOTAN_FUNC_ISA("aes")
BOTAN_FORCE_INLINE void aesdeclast(
   SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3)
   {
   // Each state is independent; reuse the single-block overload
   aesdeclast(K, B0);
   aesdeclast(K, B1);
   aesdeclast(K, B2);
   aesdeclast(K, B3);
   }
141
142
}
143
144
/*
145
* AES-128 Encryption
146
*/
147
BOTAN_FUNC_ISA("ssse3,aes")
148
void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
149
2.99k
   {
150
2.99k
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4*0]);
151
2.99k
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4*1]);
152
2.99k
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4*2]);
153
2.99k
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4*3]);
154
2.99k
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4*4]);
155
2.99k
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4*5]);
156
2.99k
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4*6]);
157
2.99k
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4*7]);
158
2.99k
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4*8]);
159
2.99k
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4*9]);
160
2.99k
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4*10]);
161
162
3.36k
   while(blocks >= 4)
163
372
      {
164
372
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0);
165
372
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1);
166
372
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2);
167
372
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3);
168
169
372
      keyxor(K0, B0, B1, B2, B3);
170
372
      aesenc(K1, B0, B1, B2, B3);
171
372
      aesenc(K2, B0, B1, B2, B3);
172
372
      aesenc(K3, B0, B1, B2, B3);
173
372
      aesenc(K4, B0, B1, B2, B3);
174
372
      aesenc(K5, B0, B1, B2, B3);
175
372
      aesenc(K6, B0, B1, B2, B3);
176
372
      aesenc(K7, B0, B1, B2, B3);
177
372
      aesenc(K8, B0, B1, B2, B3);
178
372
      aesenc(K9, B0, B1, B2, B3);
179
372
      aesenclast(K10, B0, B1, B2, B3);
180
181
372
      B0.store_le(out + 16*0);
182
372
      B1.store_le(out + 16*1);
183
372
      B2.store_le(out + 16*2);
184
372
      B3.store_le(out + 16*3);
185
186
372
      blocks -= 4;
187
372
      in += 4*16;
188
372
      out += 4*16;
189
372
      }
190
191
5.89k
   for(size_t i = 0; i != blocks; ++i)
192
2.90k
      {
193
2.90k
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i);
194
195
2.90k
      B0 ^= K0;
196
2.90k
      aesenc(K1, B0);
197
2.90k
      aesenc(K2, B0);
198
2.90k
      aesenc(K3, B0);
199
2.90k
      aesenc(K4, B0);
200
2.90k
      aesenc(K5, B0);
201
2.90k
      aesenc(K6, B0);
202
2.90k
      aesenc(K7, B0);
203
2.90k
      aesenc(K8, B0);
204
2.90k
      aesenc(K9, B0);
205
2.90k
      aesenclast(K10, B0);
206
207
2.90k
      B0.store_le(out + 16*i);
208
2.90k
      }
209
2.99k
   }
210
211
/*
212
* AES-128 Decryption
213
*/
214
BOTAN_FUNC_ISA("ssse3,aes")
215
void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
216
134
   {
217
134
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4*0]);
218
134
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4*1]);
219
134
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4*2]);
220
134
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4*3]);
221
134
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4*4]);
222
134
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4*5]);
223
134
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4*6]);
224
134
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4*7]);
225
134
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4*8]);
226
134
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4*9]);
227
134
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4*10]);
228
229
607
   while(blocks >= 4)
230
473
      {
231
473
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0);
232
473
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1);
233
473
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2);
234
473
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3);
235
236
473
      keyxor(K0, B0, B1, B2, B3);
237
473
      aesdec(K1, B0, B1, B2, B3);
238
473
      aesdec(K2, B0, B1, B2, B3);
239
473
      aesdec(K3, B0, B1, B2, B3);
240
473
      aesdec(K4, B0, B1, B2, B3);
241
473
      aesdec(K5, B0, B1, B2, B3);
242
473
      aesdec(K6, B0, B1, B2, B3);
243
473
      aesdec(K7, B0, B1, B2, B3);
244
473
      aesdec(K8, B0, B1, B2, B3);
245
473
      aesdec(K9, B0, B1, B2, B3);
246
473
      aesdeclast(K10, B0, B1, B2, B3);
247
248
473
      B0.store_le(out + 16*0);
249
473
      B1.store_le(out + 16*1);
250
473
      B2.store_le(out + 16*2);
251
473
      B3.store_le(out + 16*3);
252
253
473
      blocks -= 4;
254
473
      in += 4*16;
255
473
      out += 4*16;
256
473
      }
257
258
164
   for(size_t i = 0; i != blocks; ++i)
259
30
      {
260
30
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i);
261
262
30
      B0 ^= K0;
263
30
      aesdec(K1, B0);
264
30
      aesdec(K2, B0);
265
30
      aesdec(K3, B0);
266
30
      aesdec(K4, B0);
267
30
      aesdec(K5, B0);
268
30
      aesdec(K6, B0);
269
30
      aesdec(K7, B0);
270
30
      aesdec(K8, B0);
271
30
      aesdec(K9, B0);
272
30
      aesdeclast(K10, B0);
273
274
30
      B0.store_le(out + 16*i);
275
30
      }
276
134
   }
277
278
/*
279
* AES-128 Key Schedule
280
*/
281
BOTAN_FUNC_ISA("ssse3,aes")
282
void AES_128::aesni_key_schedule(const uint8_t key[], size_t /*length*/)
283
141
   {
284
141
   m_EK.resize(44);
285
141
   m_DK.resize(44);
286
287
141
   #define AES_128_key_exp(K, RCON) \
288
1.41k
      aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
289
290
141
   const __m128i K0  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
291
141
   const __m128i K1  = AES_128_key_exp(K0, 0x01);
292
141
   const __m128i K2  = AES_128_key_exp(K1, 0x02);
293
141
   const __m128i K3  = AES_128_key_exp(K2, 0x04);
294
141
   const __m128i K4  = AES_128_key_exp(K3, 0x08);
295
141
   const __m128i K5  = AES_128_key_exp(K4, 0x10);
296
141
   const __m128i K6  = AES_128_key_exp(K5, 0x20);
297
141
   const __m128i K7  = AES_128_key_exp(K6, 0x40);
298
141
   const __m128i K8  = AES_128_key_exp(K7, 0x80);
299
141
   const __m128i K9  = AES_128_key_exp(K8, 0x1B);
300
141
   const __m128i K10 = AES_128_key_exp(K9, 0x36);
301
302
141
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
303
141
   _mm_storeu_si128(EK_mm     , K0);
304
141
   _mm_storeu_si128(EK_mm +  1, K1);
305
141
   _mm_storeu_si128(EK_mm +  2, K2);
306
141
   _mm_storeu_si128(EK_mm +  3, K3);
307
141
   _mm_storeu_si128(EK_mm +  4, K4);
308
141
   _mm_storeu_si128(EK_mm +  5, K5);
309
141
   _mm_storeu_si128(EK_mm +  6, K6);
310
141
   _mm_storeu_si128(EK_mm +  7, K7);
311
141
   _mm_storeu_si128(EK_mm +  8, K8);
312
141
   _mm_storeu_si128(EK_mm +  9, K9);
313
141
   _mm_storeu_si128(EK_mm + 10, K10);
314
315
   // Now generate decryption keys
316
317
141
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
318
141
   _mm_storeu_si128(DK_mm     , K10);
319
141
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K9));
320
141
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K8));
321
141
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K7));
322
141
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K6));
323
141
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K5));
324
141
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K4));
325
141
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K3));
326
141
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K2));
327
141
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K1));
328
141
   _mm_storeu_si128(DK_mm + 10, K0);
329
141
   }
330
331
/*
332
* AES-192 Encryption
333
*/
334
BOTAN_FUNC_ISA("ssse3,aes")
335
void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
336
0
   {
337
0
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4*0]);
338
0
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4*1]);
339
0
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4*2]);
340
0
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4*3]);
341
0
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4*4]);
342
0
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4*5]);
343
0
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4*6]);
344
0
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4*7]);
345
0
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4*8]);
346
0
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4*9]);
347
0
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4*10]);
348
0
   const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4*11]);
349
0
   const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4*12]);
350
351
0
   while(blocks >= 4)
352
0
      {
353
0
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0);
354
0
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1);
355
0
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2);
356
0
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3);
357
358
0
      keyxor(K0, B0, B1, B2, B3);
359
0
      aesenc(K1, B0, B1, B2, B3);
360
0
      aesenc(K2, B0, B1, B2, B3);
361
0
      aesenc(K3, B0, B1, B2, B3);
362
0
      aesenc(K4, B0, B1, B2, B3);
363
0
      aesenc(K5, B0, B1, B2, B3);
364
0
      aesenc(K6, B0, B1, B2, B3);
365
0
      aesenc(K7, B0, B1, B2, B3);
366
0
      aesenc(K8, B0, B1, B2, B3);
367
0
      aesenc(K9, B0, B1, B2, B3);
368
0
      aesenc(K10, B0, B1, B2, B3);
369
0
      aesenc(K11, B0, B1, B2, B3);
370
0
      aesenclast(K12, B0, B1, B2, B3);
371
372
0
      B0.store_le(out + 16*0);
373
0
      B1.store_le(out + 16*1);
374
0
      B2.store_le(out + 16*2);
375
0
      B3.store_le(out + 16*3);
376
377
0
      blocks -= 4;
378
0
      in += 4*16;
379
0
      out += 4*16;
380
0
      }
381
382
0
   for(size_t i = 0; i != blocks; ++i)
383
0
      {
384
0
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i);
385
386
0
      B0 ^= K0;
387
388
0
      aesenc(K1, B0);
389
0
      aesenc(K2, B0);
390
0
      aesenc(K3, B0);
391
0
      aesenc(K4, B0);
392
0
      aesenc(K5, B0);
393
0
      aesenc(K6, B0);
394
0
      aesenc(K7, B0);
395
0
      aesenc(K8, B0);
396
0
      aesenc(K9, B0);
397
0
      aesenc(K10, B0);
398
0
      aesenc(K11, B0);
399
0
      aesenclast(K12, B0);
400
401
0
      B0.store_le(out + 16*i);
402
0
      }
403
0
   }
404
405
/*
406
* AES-192 Decryption
407
*/
408
BOTAN_FUNC_ISA("ssse3,aes")
409
void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
410
0
   {
411
0
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4*0]);
412
0
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4*1]);
413
0
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4*2]);
414
0
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4*3]);
415
0
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4*4]);
416
0
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4*5]);
417
0
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4*6]);
418
0
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4*7]);
419
0
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4*8]);
420
0
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4*9]);
421
0
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4*10]);
422
0
   const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4*11]);
423
0
   const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4*12]);
424
425
0
   while(blocks >= 4)
426
0
      {
427
0
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0);
428
0
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1);
429
0
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2);
430
0
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3);
431
432
0
      keyxor(K0, B0, B1, B2, B3);
433
0
      aesdec(K1, B0, B1, B2, B3);
434
0
      aesdec(K2, B0, B1, B2, B3);
435
0
      aesdec(K3, B0, B1, B2, B3);
436
0
      aesdec(K4, B0, B1, B2, B3);
437
0
      aesdec(K5, B0, B1, B2, B3);
438
0
      aesdec(K6, B0, B1, B2, B3);
439
0
      aesdec(K7, B0, B1, B2, B3);
440
0
      aesdec(K8, B0, B1, B2, B3);
441
0
      aesdec(K9, B0, B1, B2, B3);
442
0
      aesdec(K10, B0, B1, B2, B3);
443
0
      aesdec(K11, B0, B1, B2, B3);
444
0
      aesdeclast(K12, B0, B1, B2, B3);
445
446
0
      B0.store_le(out + 16*0);
447
0
      B1.store_le(out + 16*1);
448
0
      B2.store_le(out + 16*2);
449
0
      B3.store_le(out + 16*3);
450
451
0
      blocks -= 4;
452
0
      in += 4*16;
453
0
      out += 4*16;
454
0
      }
455
456
0
   for(size_t i = 0; i != blocks; ++i)
457
0
      {
458
0
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i);
459
460
0
      B0 ^= K0;
461
462
0
      aesdec(K1, B0);
463
0
      aesdec(K2, B0);
464
0
      aesdec(K3, B0);
465
0
      aesdec(K4, B0);
466
0
      aesdec(K5, B0);
467
0
      aesdec(K6, B0);
468
0
      aesdec(K7, B0);
469
0
      aesdec(K8, B0);
470
0
      aesdec(K9, B0);
471
0
      aesdec(K10, B0);
472
0
      aesdec(K11, B0);
473
0
      aesdeclast(K12, B0);
474
475
0
      B0.store_le(out + 16*i);
476
0
      }
477
0
   }
478
479
/*
480
* AES-192 Key Schedule
481
*/
482
BOTAN_FUNC_ISA("ssse3,aes")
483
void AES_192::aesni_key_schedule(const uint8_t key[], size_t /*length*/)
484
0
   {
485
0
   m_EK.resize(52);
486
0
   m_DK.resize(52);
487
488
0
   __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
489
0
   __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
490
0
   K1 = _mm_srli_si128(K1, 8);
491
492
0
   load_le(m_EK.data(), key, 6);
493
494
0
   #define AES_192_key_exp(RCON, EK_OFF)                         \
495
0
     aes_192_key_expansion(&K0, &K1,                             \
496
0
                           _mm_aeskeygenassist_si128(K1, RCON),  \
497
0
                           &m_EK[EK_OFF], EK_OFF == 48)
498
499
0
   AES_192_key_exp(0x01, 6);
500
0
   AES_192_key_exp(0x02, 12);
501
0
   AES_192_key_exp(0x04, 18);
502
0
   AES_192_key_exp(0x08, 24);
503
0
   AES_192_key_exp(0x10, 30);
504
0
   AES_192_key_exp(0x20, 36);
505
0
   AES_192_key_exp(0x40, 42);
506
0
   AES_192_key_exp(0x80, 48);
507
508
0
   #undef AES_192_key_exp
509
510
   // Now generate decryption keys
511
0
   const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
512
513
0
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
514
0
   _mm_storeu_si128(DK_mm     , _mm_loadu_si128(EK_mm + 12));
515
0
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
516
0
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
517
0
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
518
0
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
519
0
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
520
0
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
521
0
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
522
0
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
523
0
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
524
0
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
525
0
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
526
0
   _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
527
0
   }
528
529
/*
530
* AES-256 Encryption
531
*/
532
BOTAN_FUNC_ISA("ssse3,aes")
533
void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
534
2.48k
   {
535
2.48k
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4*0]);
536
2.48k
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4*1]);
537
2.48k
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4*2]);
538
2.48k
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4*3]);
539
2.48k
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4*4]);
540
2.48k
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4*5]);
541
2.48k
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4*6]);
542
2.48k
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4*7]);
543
2.48k
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4*8]);
544
2.48k
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4*9]);
545
2.48k
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4*10]);
546
2.48k
   const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4*11]);
547
2.48k
   const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4*12]);
548
2.48k
   const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_EK[4*13]);
549
2.48k
   const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_EK[4*14]);
550
551
5.27k
   while(blocks >= 4)
552
2.78k
      {
553
2.78k
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0);
554
2.78k
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1);
555
2.78k
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2);
556
2.78k
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3);
557
558
2.78k
      keyxor(K0, B0, B1, B2, B3);
559
2.78k
      aesenc(K1, B0, B1, B2, B3);
560
2.78k
      aesenc(K2, B0, B1, B2, B3);
561
2.78k
      aesenc(K3, B0, B1, B2, B3);
562
2.78k
      aesenc(K4, B0, B1, B2, B3);
563
2.78k
      aesenc(K5, B0, B1, B2, B3);
564
2.78k
      aesenc(K6, B0, B1, B2, B3);
565
2.78k
      aesenc(K7, B0, B1, B2, B3);
566
2.78k
      aesenc(K8, B0, B1, B2, B3);
567
2.78k
      aesenc(K9, B0, B1, B2, B3);
568
2.78k
      aesenc(K10, B0, B1, B2, B3);
569
2.78k
      aesenc(K11, B0, B1, B2, B3);
570
2.78k
      aesenc(K12, B0, B1, B2, B3);
571
2.78k
      aesenc(K13, B0, B1, B2, B3);
572
2.78k
      aesenclast(K14, B0, B1, B2, B3);
573
574
2.78k
      B0.store_le(out + 16*0);
575
2.78k
      B1.store_le(out + 16*1);
576
2.78k
      B2.store_le(out + 16*2);
577
2.78k
      B3.store_le(out + 16*3);
578
579
2.78k
      blocks -= 4;
580
2.78k
      in += 4*16;
581
2.78k
      out += 4*16;
582
2.78k
      }
583
584
4.27k
   for(size_t i = 0; i != blocks; ++i)
585
1.78k
      {
586
1.78k
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i);
587
588
1.78k
      B0 ^= K0;
589
590
1.78k
      aesenc(K1, B0);
591
1.78k
      aesenc(K2, B0);
592
1.78k
      aesenc(K3, B0);
593
1.78k
      aesenc(K4, B0);
594
1.78k
      aesenc(K5, B0);
595
1.78k
      aesenc(K6, B0);
596
1.78k
      aesenc(K7, B0);
597
1.78k
      aesenc(K8, B0);
598
1.78k
      aesenc(K9, B0);
599
1.78k
      aesenc(K10, B0);
600
1.78k
      aesenc(K11, B0);
601
1.78k
      aesenc(K12, B0);
602
1.78k
      aesenc(K13, B0);
603
1.78k
      aesenclast(K14, B0);
604
605
1.78k
      B0.store_le(out + 16*i);
606
1.78k
      }
607
2.48k
   }
608
609
/*
610
* AES-256 Decryption
611
*/
612
BOTAN_FUNC_ISA("ssse3,aes")
613
void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
614
322
   {
615
322
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4*0]);
616
322
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4*1]);
617
322
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4*2]);
618
322
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4*3]);
619
322
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4*4]);
620
322
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4*5]);
621
322
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4*6]);
622
322
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4*7]);
623
322
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4*8]);
624
322
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4*9]);
625
322
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4*10]);
626
322
   const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4*11]);
627
322
   const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4*12]);
628
322
   const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_DK[4*13]);
629
322
   const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_DK[4*14]);
630
631
1.42k
   while(blocks >= 4)
632
1.10k
      {
633
1.10k
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0);
634
1.10k
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1);
635
1.10k
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2);
636
1.10k
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3);
637
638
1.10k
      keyxor(K0, B0, B1, B2, B3);
639
1.10k
      aesdec(K1, B0, B1, B2, B3);
640
1.10k
      aesdec(K2, B0, B1, B2, B3);
641
1.10k
      aesdec(K3, B0, B1, B2, B3);
642
1.10k
      aesdec(K4, B0, B1, B2, B3);
643
1.10k
      aesdec(K5, B0, B1, B2, B3);
644
1.10k
      aesdec(K6, B0, B1, B2, B3);
645
1.10k
      aesdec(K7, B0, B1, B2, B3);
646
1.10k
      aesdec(K8, B0, B1, B2, B3);
647
1.10k
      aesdec(K9, B0, B1, B2, B3);
648
1.10k
      aesdec(K10, B0, B1, B2, B3);
649
1.10k
      aesdec(K11, B0, B1, B2, B3);
650
1.10k
      aesdec(K12, B0, B1, B2, B3);
651
1.10k
      aesdec(K13, B0, B1, B2, B3);
652
1.10k
      aesdeclast(K14, B0, B1, B2, B3);
653
654
1.10k
      B0.store_le(out + 16*0);
655
1.10k
      B1.store_le(out + 16*1);
656
1.10k
      B2.store_le(out + 16*2);
657
1.10k
      B3.store_le(out + 16*3);
658
659
1.10k
      blocks -= 4;
660
1.10k
      in += 4*16;
661
1.10k
      out += 4*16;
662
1.10k
      }
663
664
440
   for(size_t i = 0; i != blocks; ++i)
665
118
      {
666
118
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i);
667
668
118
      B0 ^= K0;
669
670
118
      aesdec(K1, B0);
671
118
      aesdec(K2, B0);
672
118
      aesdec(K3, B0);
673
118
      aesdec(K4, B0);
674
118
      aesdec(K5, B0);
675
118
      aesdec(K6, B0);
676
118
      aesdec(K7, B0);
677
118
      aesdec(K8, B0);
678
118
      aesdec(K9, B0);
679
118
      aesdec(K10, B0);
680
118
      aesdec(K11, B0);
681
118
      aesdec(K12, B0);
682
118
      aesdec(K13, B0);
683
118
      aesdeclast(K14, B0);
684
685
118
      B0.store_le(out + 16*i);
686
118
      }
687
322
   }
688
689
/*
690
* AES-256 Key Schedule
691
*/
692
BOTAN_FUNC_ISA("ssse3,aes")
693
void AES_256::aesni_key_schedule(const uint8_t key[], size_t /*length*/)
694
637
   {
695
637
   m_EK.resize(60);
696
637
   m_DK.resize(60);
697
698
637
   const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
699
637
   const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
700
701
637
   const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
702
637
   const __m128i K3 = aes_256_key_expansion(K1, K2);
703
704
637
   const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
705
637
   const __m128i K5 = aes_256_key_expansion(K3, K4);
706
707
637
   const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
708
637
   const __m128i K7 = aes_256_key_expansion(K5, K6);
709
710
637
   const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
711
637
   const __m128i K9 = aes_256_key_expansion(K7, K8);
712
713
637
   const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
714
637
   const __m128i K11 = aes_256_key_expansion(K9, K10);
715
716
637
   const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
717
637
   const __m128i K13 = aes_256_key_expansion(K11, K12);
718
719
637
   const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
720
721
637
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
722
637
   _mm_storeu_si128(EK_mm     , K0);
723
637
   _mm_storeu_si128(EK_mm +  1, K1);
724
637
   _mm_storeu_si128(EK_mm +  2, K2);
725
637
   _mm_storeu_si128(EK_mm +  3, K3);
726
637
   _mm_storeu_si128(EK_mm +  4, K4);
727
637
   _mm_storeu_si128(EK_mm +  5, K5);
728
637
   _mm_storeu_si128(EK_mm +  6, K6);
729
637
   _mm_storeu_si128(EK_mm +  7, K7);
730
637
   _mm_storeu_si128(EK_mm +  8, K8);
731
637
   _mm_storeu_si128(EK_mm +  9, K9);
732
637
   _mm_storeu_si128(EK_mm + 10, K10);
733
637
   _mm_storeu_si128(EK_mm + 11, K11);
734
637
   _mm_storeu_si128(EK_mm + 12, K12);
735
637
   _mm_storeu_si128(EK_mm + 13, K13);
736
637
   _mm_storeu_si128(EK_mm + 14, K14);
737
738
   // Now generate decryption keys
739
637
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
740
637
   _mm_storeu_si128(DK_mm     , K14);
741
637
   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K13));
742
637
   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K12));
743
637
   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K11));
744
637
   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K10));
745
637
   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K9));
746
637
   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K8));
747
637
   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K7));
748
637
   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K6));
749
637
   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K5));
750
637
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
751
637
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
752
637
   _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
753
637
   _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
754
637
   _mm_storeu_si128(DK_mm + 14, K0);
755
637
   }
756
757
}