/src/botan/src/lib/block/aes/aes_ni/aes_ni.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * AES using AES-NI instructions |
3 | | * (C) 2009,2012 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/aes.h> |
9 | | #include <botan/loadstor.h> |
10 | | #include <wmmintrin.h> |
11 | | |
12 | | namespace Botan { |
13 | | |
14 | | namespace { |
15 | | |
16 | | BOTAN_FUNC_ISA("ssse3") |
17 | | __m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon) |
18 | 16.8k | { |
19 | 16.8k | key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3)); |
20 | 16.8k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
21 | 16.8k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
22 | 16.8k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
23 | 16.8k | return _mm_xor_si128(key, key_with_rcon); |
24 | 16.8k | } |
25 | | |
26 | | BOTAN_FUNC_ISA("ssse3") |
27 | | void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon, |
28 | | uint32_t out[], bool last) |
29 | 0 | { |
30 | 0 | __m128i key1 = *K1; |
31 | 0 | __m128i key2 = *K2; |
32 | 0 |
|
33 | 0 | key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1)); |
34 | 0 | key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); |
35 | 0 | key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); |
36 | 0 | key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); |
37 | 0 | key1 = _mm_xor_si128(key1, key2_with_rcon); |
38 | 0 |
|
39 | 0 | *K1 = key1; |
40 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1); |
41 | 0 |
|
42 | 0 | if(last) |
43 | 0 | return; |
44 | 0 | |
45 | 0 | key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4)); |
46 | 0 | key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3))); |
47 | 0 |
|
48 | 0 | *K2 = key2; |
49 | 0 | out[4] = _mm_cvtsi128_si32(key2); |
50 | 0 | out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4)); |
51 | 0 | } |
52 | | |
53 | | /* |
54 | | * The second half of the AES-256 key expansion (other half same as AES-128) |
55 | | */ |
56 | | BOTAN_FUNC_ISA("ssse3,aes") |
57 | | __m128i aes_256_key_expansion(__m128i key, __m128i key2) |
58 | 8.66k | { |
59 | 8.66k | __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00); |
60 | 8.66k | key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2)); |
61 | 8.66k | |
62 | 8.66k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
63 | 8.66k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
64 | 8.66k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
65 | 8.66k | return _mm_xor_si128(key, key_with_rcon); |
66 | 8.66k | } |
67 | | |
68 | | } |
69 | | |
/*
* Four-block round helpers. Each one expects variables named B0, B1, B2
* and B3 to be in scope at the point of expansion and updates all four in
* place with round key K. The do/while(0) wrapper makes each expansion a
* single statement.
*/

// One full encryption round on B0..B3
#define AES_ENC_4_ROUNDS(K)                 \
   do {                                     \
      B0 = _mm_aesenc_si128(B0, K);         \
      B1 = _mm_aesenc_si128(B1, K);         \
      B2 = _mm_aesenc_si128(B2, K);         \
      B3 = _mm_aesenc_si128(B3, K);         \
   } while(0)

// Final encryption round on B0..B3
#define AES_ENC_4_LAST_ROUNDS(K)            \
   do {                                     \
      B0 = _mm_aesenclast_si128(B0, K);     \
      B1 = _mm_aesenclast_si128(B1, K);     \
      B2 = _mm_aesenclast_si128(B2, K);     \
      B3 = _mm_aesenclast_si128(B3, K);     \
   } while(0)

// One full decryption round on B0..B3
#define AES_DEC_4_ROUNDS(K)                 \
   do {                                     \
      B0 = _mm_aesdec_si128(B0, K);         \
      B1 = _mm_aesdec_si128(B1, K);         \
      B2 = _mm_aesdec_si128(B2, K);         \
      B3 = _mm_aesdec_si128(B3, K);         \
   } while(0)

// Final decryption round on B0..B3
#define AES_DEC_4_LAST_ROUNDS(K)            \
   do {                                     \
      B0 = _mm_aesdeclast_si128(B0, K);     \
      B1 = _mm_aesdeclast_si128(B1, K);     \
      B2 = _mm_aesdeclast_si128(B2, K);     \
      B3 = _mm_aesdeclast_si128(B3, K);     \
   } while(0)
105 | | |
106 | | /* |
107 | | * AES-128 Encryption |
108 | | */ |
109 | | BOTAN_FUNC_ISA("ssse3,aes") |
110 | | void AES_128::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
111 | 9.13k | { |
112 | 9.13k | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
113 | 9.13k | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
114 | 9.13k | |
115 | 9.13k | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data()); |
116 | 9.13k | |
117 | 9.13k | const __m128i K0 = _mm_loadu_si128(key_mm); |
118 | 9.13k | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
119 | 9.13k | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
120 | 9.13k | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
121 | 9.13k | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
122 | 9.13k | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
123 | 9.13k | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
124 | 9.13k | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
125 | 9.13k | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
126 | 9.13k | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
127 | 9.13k | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
128 | 9.13k | |
129 | 10.8k | while(blocks >= 4) |
130 | 1.73k | { |
131 | 1.73k | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
132 | 1.73k | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
133 | 1.73k | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
134 | 1.73k | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
135 | 1.73k | |
136 | 1.73k | B0 = _mm_xor_si128(B0, K0); |
137 | 1.73k | B1 = _mm_xor_si128(B1, K0); |
138 | 1.73k | B2 = _mm_xor_si128(B2, K0); |
139 | 1.73k | B3 = _mm_xor_si128(B3, K0); |
140 | 1.73k | |
141 | 1.73k | AES_ENC_4_ROUNDS(K1); |
142 | 1.73k | AES_ENC_4_ROUNDS(K2); |
143 | 1.73k | AES_ENC_4_ROUNDS(K3); |
144 | 1.73k | AES_ENC_4_ROUNDS(K4); |
145 | 1.73k | AES_ENC_4_ROUNDS(K5); |
146 | 1.73k | AES_ENC_4_ROUNDS(K6); |
147 | 1.73k | AES_ENC_4_ROUNDS(K7); |
148 | 1.73k | AES_ENC_4_ROUNDS(K8); |
149 | 1.73k | AES_ENC_4_ROUNDS(K9); |
150 | 1.73k | AES_ENC_4_LAST_ROUNDS(K10); |
151 | 1.73k | |
152 | 1.73k | _mm_storeu_si128(out_mm + 0, B0); |
153 | 1.73k | _mm_storeu_si128(out_mm + 1, B1); |
154 | 1.73k | _mm_storeu_si128(out_mm + 2, B2); |
155 | 1.73k | _mm_storeu_si128(out_mm + 3, B3); |
156 | 1.73k | |
157 | 1.73k | blocks -= 4; |
158 | 1.73k | in_mm += 4; |
159 | 1.73k | out_mm += 4; |
160 | 1.73k | } |
161 | 9.13k | |
162 | 17.8k | for(size_t i = 0; i != blocks; ++i) |
163 | 8.69k | { |
164 | 8.69k | __m128i B = _mm_loadu_si128(in_mm + i); |
165 | 8.69k | |
166 | 8.69k | B = _mm_xor_si128(B, K0); |
167 | 8.69k | |
168 | 8.69k | B = _mm_aesenc_si128(B, K1); |
169 | 8.69k | B = _mm_aesenc_si128(B, K2); |
170 | 8.69k | B = _mm_aesenc_si128(B, K3); |
171 | 8.69k | B = _mm_aesenc_si128(B, K4); |
172 | 8.69k | B = _mm_aesenc_si128(B, K5); |
173 | 8.69k | B = _mm_aesenc_si128(B, K6); |
174 | 8.69k | B = _mm_aesenc_si128(B, K7); |
175 | 8.69k | B = _mm_aesenc_si128(B, K8); |
176 | 8.69k | B = _mm_aesenc_si128(B, K9); |
177 | 8.69k | B = _mm_aesenclast_si128(B, K10); |
178 | 8.69k | |
179 | 8.69k | _mm_storeu_si128(out_mm + i, B); |
180 | 8.69k | } |
181 | 9.13k | } |
182 | | |
183 | | /* |
184 | | * AES-128 Decryption |
185 | | */ |
186 | | BOTAN_FUNC_ISA("ssse3,aes") |
187 | | void AES_128::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
188 | 578 | { |
189 | 578 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
190 | 578 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
191 | 578 | |
192 | 578 | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data()); |
193 | 578 | |
194 | 578 | const __m128i K0 = _mm_loadu_si128(key_mm); |
195 | 578 | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
196 | 578 | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
197 | 578 | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
198 | 578 | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
199 | 578 | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
200 | 578 | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
201 | 578 | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
202 | 578 | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
203 | 578 | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
204 | 578 | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
205 | 578 | |
206 | 2.62k | while(blocks >= 4) |
207 | 2.04k | { |
208 | 2.04k | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
209 | 2.04k | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
210 | 2.04k | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
211 | 2.04k | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
212 | 2.04k | |
213 | 2.04k | B0 = _mm_xor_si128(B0, K0); |
214 | 2.04k | B1 = _mm_xor_si128(B1, K0); |
215 | 2.04k | B2 = _mm_xor_si128(B2, K0); |
216 | 2.04k | B3 = _mm_xor_si128(B3, K0); |
217 | 2.04k | |
218 | 2.04k | AES_DEC_4_ROUNDS(K1); |
219 | 2.04k | AES_DEC_4_ROUNDS(K2); |
220 | 2.04k | AES_DEC_4_ROUNDS(K3); |
221 | 2.04k | AES_DEC_4_ROUNDS(K4); |
222 | 2.04k | AES_DEC_4_ROUNDS(K5); |
223 | 2.04k | AES_DEC_4_ROUNDS(K6); |
224 | 2.04k | AES_DEC_4_ROUNDS(K7); |
225 | 2.04k | AES_DEC_4_ROUNDS(K8); |
226 | 2.04k | AES_DEC_4_ROUNDS(K9); |
227 | 2.04k | AES_DEC_4_LAST_ROUNDS(K10); |
228 | 2.04k | |
229 | 2.04k | _mm_storeu_si128(out_mm + 0, B0); |
230 | 2.04k | _mm_storeu_si128(out_mm + 1, B1); |
231 | 2.04k | _mm_storeu_si128(out_mm + 2, B2); |
232 | 2.04k | _mm_storeu_si128(out_mm + 3, B3); |
233 | 2.04k | |
234 | 2.04k | blocks -= 4; |
235 | 2.04k | in_mm += 4; |
236 | 2.04k | out_mm += 4; |
237 | 2.04k | } |
238 | 578 | |
239 | 734 | for(size_t i = 0; i != blocks; ++i) |
240 | 156 | { |
241 | 156 | __m128i B = _mm_loadu_si128(in_mm + i); |
242 | 156 | |
243 | 156 | B = _mm_xor_si128(B, K0); |
244 | 156 | |
245 | 156 | B = _mm_aesdec_si128(B, K1); |
246 | 156 | B = _mm_aesdec_si128(B, K2); |
247 | 156 | B = _mm_aesdec_si128(B, K3); |
248 | 156 | B = _mm_aesdec_si128(B, K4); |
249 | 156 | B = _mm_aesdec_si128(B, K5); |
250 | 156 | B = _mm_aesdec_si128(B, K6); |
251 | 156 | B = _mm_aesdec_si128(B, K7); |
252 | 156 | B = _mm_aesdec_si128(B, K8); |
253 | 156 | B = _mm_aesdec_si128(B, K9); |
254 | 156 | B = _mm_aesdeclast_si128(B, K10); |
255 | 156 | |
256 | 156 | _mm_storeu_si128(out_mm + i, B); |
257 | 156 | } |
258 | 578 | } |
259 | | |
260 | | /* |
261 | | * AES-128 Key Schedule |
262 | | */ |
263 | | BOTAN_FUNC_ISA("ssse3,aes") |
264 | | void AES_128::aesni_key_schedule(const uint8_t key[], size_t) |
265 | 674 | { |
266 | 674 | m_EK.resize(44); |
267 | 674 | m_DK.resize(44); |
268 | 674 | |
269 | 674 | #define AES_128_key_exp(K, RCON) \ |
270 | 6.74k | aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON)) |
271 | 674 | |
272 | 674 | const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); |
273 | 674 | const __m128i K1 = AES_128_key_exp(K0, 0x01); |
274 | 674 | const __m128i K2 = AES_128_key_exp(K1, 0x02); |
275 | 674 | const __m128i K3 = AES_128_key_exp(K2, 0x04); |
276 | 674 | const __m128i K4 = AES_128_key_exp(K3, 0x08); |
277 | 674 | const __m128i K5 = AES_128_key_exp(K4, 0x10); |
278 | 674 | const __m128i K6 = AES_128_key_exp(K5, 0x20); |
279 | 674 | const __m128i K7 = AES_128_key_exp(K6, 0x40); |
280 | 674 | const __m128i K8 = AES_128_key_exp(K7, 0x80); |
281 | 674 | const __m128i K9 = AES_128_key_exp(K8, 0x1B); |
282 | 674 | const __m128i K10 = AES_128_key_exp(K9, 0x36); |
283 | 674 | |
284 | 674 | __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); |
285 | 674 | _mm_storeu_si128(EK_mm , K0); |
286 | 674 | _mm_storeu_si128(EK_mm + 1, K1); |
287 | 674 | _mm_storeu_si128(EK_mm + 2, K2); |
288 | 674 | _mm_storeu_si128(EK_mm + 3, K3); |
289 | 674 | _mm_storeu_si128(EK_mm + 4, K4); |
290 | 674 | _mm_storeu_si128(EK_mm + 5, K5); |
291 | 674 | _mm_storeu_si128(EK_mm + 6, K6); |
292 | 674 | _mm_storeu_si128(EK_mm + 7, K7); |
293 | 674 | _mm_storeu_si128(EK_mm + 8, K8); |
294 | 674 | _mm_storeu_si128(EK_mm + 9, K9); |
295 | 674 | _mm_storeu_si128(EK_mm + 10, K10); |
296 | 674 | |
297 | 674 | // Now generate decryption keys |
298 | 674 | |
299 | 674 | __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); |
300 | 674 | _mm_storeu_si128(DK_mm , K10); |
301 | 674 | _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9)); |
302 | 674 | _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8)); |
303 | 674 | _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7)); |
304 | 674 | _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6)); |
305 | 674 | _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5)); |
306 | 674 | _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4)); |
307 | 674 | _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3)); |
308 | 674 | _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2)); |
309 | 674 | _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1)); |
310 | 674 | _mm_storeu_si128(DK_mm + 10, K0); |
311 | 674 | } |
312 | | |
313 | | /* |
314 | | * AES-192 Encryption |
315 | | */ |
316 | | BOTAN_FUNC_ISA("ssse3,aes") |
317 | | void AES_192::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
318 | 0 | { |
319 | 0 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
320 | 0 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
321 | 0 |
|
322 | 0 | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data()); |
323 | 0 |
|
324 | 0 | const __m128i K0 = _mm_loadu_si128(key_mm); |
325 | 0 | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
326 | 0 | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
327 | 0 | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
328 | 0 | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
329 | 0 | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
330 | 0 | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
331 | 0 | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
332 | 0 | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
333 | 0 | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
334 | 0 | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
335 | 0 | const __m128i K11 = _mm_loadu_si128(key_mm + 11); |
336 | 0 | const __m128i K12 = _mm_loadu_si128(key_mm + 12); |
337 | 0 |
|
338 | 0 | while(blocks >= 4) |
339 | 0 | { |
340 | 0 | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
341 | 0 | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
342 | 0 | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
343 | 0 | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
344 | 0 |
|
345 | 0 | B0 = _mm_xor_si128(B0, K0); |
346 | 0 | B1 = _mm_xor_si128(B1, K0); |
347 | 0 | B2 = _mm_xor_si128(B2, K0); |
348 | 0 | B3 = _mm_xor_si128(B3, K0); |
349 | 0 |
|
350 | 0 | AES_ENC_4_ROUNDS(K1); |
351 | 0 | AES_ENC_4_ROUNDS(K2); |
352 | 0 | AES_ENC_4_ROUNDS(K3); |
353 | 0 | AES_ENC_4_ROUNDS(K4); |
354 | 0 | AES_ENC_4_ROUNDS(K5); |
355 | 0 | AES_ENC_4_ROUNDS(K6); |
356 | 0 | AES_ENC_4_ROUNDS(K7); |
357 | 0 | AES_ENC_4_ROUNDS(K8); |
358 | 0 | AES_ENC_4_ROUNDS(K9); |
359 | 0 | AES_ENC_4_ROUNDS(K10); |
360 | 0 | AES_ENC_4_ROUNDS(K11); |
361 | 0 | AES_ENC_4_LAST_ROUNDS(K12); |
362 | 0 |
|
363 | 0 | _mm_storeu_si128(out_mm + 0, B0); |
364 | 0 | _mm_storeu_si128(out_mm + 1, B1); |
365 | 0 | _mm_storeu_si128(out_mm + 2, B2); |
366 | 0 | _mm_storeu_si128(out_mm + 3, B3); |
367 | 0 |
|
368 | 0 | blocks -= 4; |
369 | 0 | in_mm += 4; |
370 | 0 | out_mm += 4; |
371 | 0 | } |
372 | 0 |
|
373 | 0 | for(size_t i = 0; i != blocks; ++i) |
374 | 0 | { |
375 | 0 | __m128i B = _mm_loadu_si128(in_mm + i); |
376 | 0 |
|
377 | 0 | B = _mm_xor_si128(B, K0); |
378 | 0 |
|
379 | 0 | B = _mm_aesenc_si128(B, K1); |
380 | 0 | B = _mm_aesenc_si128(B, K2); |
381 | 0 | B = _mm_aesenc_si128(B, K3); |
382 | 0 | B = _mm_aesenc_si128(B, K4); |
383 | 0 | B = _mm_aesenc_si128(B, K5); |
384 | 0 | B = _mm_aesenc_si128(B, K6); |
385 | 0 | B = _mm_aesenc_si128(B, K7); |
386 | 0 | B = _mm_aesenc_si128(B, K8); |
387 | 0 | B = _mm_aesenc_si128(B, K9); |
388 | 0 | B = _mm_aesenc_si128(B, K10); |
389 | 0 | B = _mm_aesenc_si128(B, K11); |
390 | 0 | B = _mm_aesenclast_si128(B, K12); |
391 | 0 |
|
392 | 0 | _mm_storeu_si128(out_mm + i, B); |
393 | 0 | } |
394 | 0 | } |
395 | | |
396 | | /* |
397 | | * AES-192 Decryption |
398 | | */ |
399 | | BOTAN_FUNC_ISA("ssse3,aes") |
400 | | void AES_192::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
401 | 0 | { |
402 | 0 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
403 | 0 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
404 | 0 |
|
405 | 0 | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data()); |
406 | 0 |
|
407 | 0 | const __m128i K0 = _mm_loadu_si128(key_mm); |
408 | 0 | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
409 | 0 | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
410 | 0 | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
411 | 0 | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
412 | 0 | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
413 | 0 | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
414 | 0 | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
415 | 0 | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
416 | 0 | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
417 | 0 | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
418 | 0 | const __m128i K11 = _mm_loadu_si128(key_mm + 11); |
419 | 0 | const __m128i K12 = _mm_loadu_si128(key_mm + 12); |
420 | 0 |
|
421 | 0 | while(blocks >= 4) |
422 | 0 | { |
423 | 0 | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
424 | 0 | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
425 | 0 | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
426 | 0 | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
427 | 0 |
|
428 | 0 | B0 = _mm_xor_si128(B0, K0); |
429 | 0 | B1 = _mm_xor_si128(B1, K0); |
430 | 0 | B2 = _mm_xor_si128(B2, K0); |
431 | 0 | B3 = _mm_xor_si128(B3, K0); |
432 | 0 |
|
433 | 0 | AES_DEC_4_ROUNDS(K1); |
434 | 0 | AES_DEC_4_ROUNDS(K2); |
435 | 0 | AES_DEC_4_ROUNDS(K3); |
436 | 0 | AES_DEC_4_ROUNDS(K4); |
437 | 0 | AES_DEC_4_ROUNDS(K5); |
438 | 0 | AES_DEC_4_ROUNDS(K6); |
439 | 0 | AES_DEC_4_ROUNDS(K7); |
440 | 0 | AES_DEC_4_ROUNDS(K8); |
441 | 0 | AES_DEC_4_ROUNDS(K9); |
442 | 0 | AES_DEC_4_ROUNDS(K10); |
443 | 0 | AES_DEC_4_ROUNDS(K11); |
444 | 0 | AES_DEC_4_LAST_ROUNDS(K12); |
445 | 0 |
|
446 | 0 | _mm_storeu_si128(out_mm + 0, B0); |
447 | 0 | _mm_storeu_si128(out_mm + 1, B1); |
448 | 0 | _mm_storeu_si128(out_mm + 2, B2); |
449 | 0 | _mm_storeu_si128(out_mm + 3, B3); |
450 | 0 |
|
451 | 0 | blocks -= 4; |
452 | 0 | in_mm += 4; |
453 | 0 | out_mm += 4; |
454 | 0 | } |
455 | 0 |
|
456 | 0 | for(size_t i = 0; i != blocks; ++i) |
457 | 0 | { |
458 | 0 | __m128i B = _mm_loadu_si128(in_mm + i); |
459 | 0 |
|
460 | 0 | B = _mm_xor_si128(B, K0); |
461 | 0 |
|
462 | 0 | B = _mm_aesdec_si128(B, K1); |
463 | 0 | B = _mm_aesdec_si128(B, K2); |
464 | 0 | B = _mm_aesdec_si128(B, K3); |
465 | 0 | B = _mm_aesdec_si128(B, K4); |
466 | 0 | B = _mm_aesdec_si128(B, K5); |
467 | 0 | B = _mm_aesdec_si128(B, K6); |
468 | 0 | B = _mm_aesdec_si128(B, K7); |
469 | 0 | B = _mm_aesdec_si128(B, K8); |
470 | 0 | B = _mm_aesdec_si128(B, K9); |
471 | 0 | B = _mm_aesdec_si128(B, K10); |
472 | 0 | B = _mm_aesdec_si128(B, K11); |
473 | 0 | B = _mm_aesdeclast_si128(B, K12); |
474 | 0 |
|
475 | 0 | _mm_storeu_si128(out_mm + i, B); |
476 | 0 | } |
477 | 0 | } |
478 | | |
479 | | /* |
480 | | * AES-192 Key Schedule |
481 | | */ |
482 | | BOTAN_FUNC_ISA("ssse3,aes") |
483 | | void AES_192::aesni_key_schedule(const uint8_t key[], size_t) |
484 | 0 | { |
485 | 0 | m_EK.resize(52); |
486 | 0 | m_DK.resize(52); |
487 | 0 |
|
488 | 0 | __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); |
489 | 0 | __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8)); |
490 | 0 | K1 = _mm_srli_si128(K1, 8); |
491 | 0 |
|
492 | 0 | load_le(m_EK.data(), key, 6); |
493 | 0 |
|
494 | 0 | #define AES_192_key_exp(RCON, EK_OFF) \ |
495 | 0 | aes_192_key_expansion(&K0, &K1, \ |
496 | 0 | _mm_aeskeygenassist_si128(K1, RCON), \ |
497 | 0 | &m_EK[EK_OFF], EK_OFF == 48) |
498 | 0 |
|
499 | 0 | AES_192_key_exp(0x01, 6); |
500 | 0 | AES_192_key_exp(0x02, 12); |
501 | 0 | AES_192_key_exp(0x04, 18); |
502 | 0 | AES_192_key_exp(0x08, 24); |
503 | 0 | AES_192_key_exp(0x10, 30); |
504 | 0 | AES_192_key_exp(0x20, 36); |
505 | 0 | AES_192_key_exp(0x40, 42); |
506 | 0 | AES_192_key_exp(0x80, 48); |
507 | 0 |
|
508 | 0 | #undef AES_192_key_exp |
509 | 0 |
|
510 | 0 | // Now generate decryption keys |
511 | 0 | const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data()); |
512 | 0 |
|
513 | 0 | __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); |
514 | 0 | _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12)); |
515 | 0 | _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11))); |
516 | 0 | _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10))); |
517 | 0 | _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9))); |
518 | 0 | _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8))); |
519 | 0 | _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7))); |
520 | 0 | _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6))); |
521 | 0 | _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5))); |
522 | 0 | _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4))); |
523 | 0 | _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3))); |
524 | 0 | _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2))); |
525 | 0 | _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1))); |
526 | 0 | _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0)); |
527 | 0 | } |
528 | | |
529 | | /* |
530 | | * AES-256 Encryption |
531 | | */ |
532 | | BOTAN_FUNC_ISA("ssse3,aes") |
533 | | void AES_256::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
534 | 10.4k | { |
535 | 10.4k | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
536 | 10.4k | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
537 | 10.4k | |
538 | 10.4k | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data()); |
539 | 10.4k | |
540 | 10.4k | const __m128i K0 = _mm_loadu_si128(key_mm); |
541 | 10.4k | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
542 | 10.4k | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
543 | 10.4k | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
544 | 10.4k | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
545 | 10.4k | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
546 | 10.4k | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
547 | 10.4k | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
548 | 10.4k | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
549 | 10.4k | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
550 | 10.4k | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
551 | 10.4k | const __m128i K11 = _mm_loadu_si128(key_mm + 11); |
552 | 10.4k | const __m128i K12 = _mm_loadu_si128(key_mm + 12); |
553 | 10.4k | const __m128i K13 = _mm_loadu_si128(key_mm + 13); |
554 | 10.4k | const __m128i K14 = _mm_loadu_si128(key_mm + 14); |
555 | 10.4k | |
556 | 18.8k | while(blocks >= 4) |
557 | 8.37k | { |
558 | 8.37k | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
559 | 8.37k | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
560 | 8.37k | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
561 | 8.37k | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
562 | 8.37k | |
563 | 8.37k | B0 = _mm_xor_si128(B0, K0); |
564 | 8.37k | B1 = _mm_xor_si128(B1, K0); |
565 | 8.37k | B2 = _mm_xor_si128(B2, K0); |
566 | 8.37k | B3 = _mm_xor_si128(B3, K0); |
567 | 8.37k | |
568 | 8.37k | AES_ENC_4_ROUNDS(K1); |
569 | 8.37k | AES_ENC_4_ROUNDS(K2); |
570 | 8.37k | AES_ENC_4_ROUNDS(K3); |
571 | 8.37k | AES_ENC_4_ROUNDS(K4); |
572 | 8.37k | AES_ENC_4_ROUNDS(K5); |
573 | 8.37k | AES_ENC_4_ROUNDS(K6); |
574 | 8.37k | AES_ENC_4_ROUNDS(K7); |
575 | 8.37k | AES_ENC_4_ROUNDS(K8); |
576 | 8.37k | AES_ENC_4_ROUNDS(K9); |
577 | 8.37k | AES_ENC_4_ROUNDS(K10); |
578 | 8.37k | AES_ENC_4_ROUNDS(K11); |
579 | 8.37k | AES_ENC_4_ROUNDS(K12); |
580 | 8.37k | AES_ENC_4_ROUNDS(K13); |
581 | 8.37k | AES_ENC_4_LAST_ROUNDS(K14); |
582 | 8.37k | |
583 | 8.37k | _mm_storeu_si128(out_mm + 0, B0); |
584 | 8.37k | _mm_storeu_si128(out_mm + 1, B1); |
585 | 8.37k | _mm_storeu_si128(out_mm + 2, B2); |
586 | 8.37k | _mm_storeu_si128(out_mm + 3, B3); |
587 | 8.37k | |
588 | 8.37k | blocks -= 4; |
589 | 8.37k | in_mm += 4; |
590 | 8.37k | out_mm += 4; |
591 | 8.37k | } |
592 | 10.4k | |
593 | 18.8k | for(size_t i = 0; i != blocks; ++i) |
594 | 8.40k | { |
595 | 8.40k | __m128i B = _mm_loadu_si128(in_mm + i); |
596 | 8.40k | |
597 | 8.40k | B = _mm_xor_si128(B, K0); |
598 | 8.40k | |
599 | 8.40k | B = _mm_aesenc_si128(B, K1); |
600 | 8.40k | B = _mm_aesenc_si128(B, K2); |
601 | 8.40k | B = _mm_aesenc_si128(B, K3); |
602 | 8.40k | B = _mm_aesenc_si128(B, K4); |
603 | 8.40k | B = _mm_aesenc_si128(B, K5); |
604 | 8.40k | B = _mm_aesenc_si128(B, K6); |
605 | 8.40k | B = _mm_aesenc_si128(B, K7); |
606 | 8.40k | B = _mm_aesenc_si128(B, K8); |
607 | 8.40k | B = _mm_aesenc_si128(B, K9); |
608 | 8.40k | B = _mm_aesenc_si128(B, K10); |
609 | 8.40k | B = _mm_aesenc_si128(B, K11); |
610 | 8.40k | B = _mm_aesenc_si128(B, K12); |
611 | 8.40k | B = _mm_aesenc_si128(B, K13); |
612 | 8.40k | B = _mm_aesenclast_si128(B, K14); |
613 | 8.40k | |
614 | 8.40k | _mm_storeu_si128(out_mm + i, B); |
615 | 8.40k | } |
616 | 10.4k | } |
617 | | |
618 | | /* |
619 | | * AES-256 Decryption |
620 | | */ |
621 | | BOTAN_FUNC_ISA("ssse3,aes") |
622 | | void AES_256::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
623 | 944 | { |
624 | 944 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
625 | 944 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
626 | 944 | |
627 | 944 | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data()); |
628 | 944 | |
629 | 944 | const __m128i K0 = _mm_loadu_si128(key_mm); |
630 | 944 | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
631 | 944 | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
632 | 944 | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
633 | 944 | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
634 | 944 | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
635 | 944 | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
636 | 944 | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
637 | 944 | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
638 | 944 | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
639 | 944 | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
640 | 944 | const __m128i K11 = _mm_loadu_si128(key_mm + 11); |
641 | 944 | const __m128i K12 = _mm_loadu_si128(key_mm + 12); |
642 | 944 | const __m128i K13 = _mm_loadu_si128(key_mm + 13); |
643 | 944 | const __m128i K14 = _mm_loadu_si128(key_mm + 14); |
644 | 944 | |
645 | 4.37k | while(blocks >= 4) |
646 | 3.42k | { |
647 | 3.42k | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
648 | 3.42k | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
649 | 3.42k | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
650 | 3.42k | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
651 | 3.42k | |
652 | 3.42k | B0 = _mm_xor_si128(B0, K0); |
653 | 3.42k | B1 = _mm_xor_si128(B1, K0); |
654 | 3.42k | B2 = _mm_xor_si128(B2, K0); |
655 | 3.42k | B3 = _mm_xor_si128(B3, K0); |
656 | 3.42k | |
657 | 3.42k | AES_DEC_4_ROUNDS(K1); |
658 | 3.42k | AES_DEC_4_ROUNDS(K2); |
659 | 3.42k | AES_DEC_4_ROUNDS(K3); |
660 | 3.42k | AES_DEC_4_ROUNDS(K4); |
661 | 3.42k | AES_DEC_4_ROUNDS(K5); |
662 | 3.42k | AES_DEC_4_ROUNDS(K6); |
663 | 3.42k | AES_DEC_4_ROUNDS(K7); |
664 | 3.42k | AES_DEC_4_ROUNDS(K8); |
665 | 3.42k | AES_DEC_4_ROUNDS(K9); |
666 | 3.42k | AES_DEC_4_ROUNDS(K10); |
667 | 3.42k | AES_DEC_4_ROUNDS(K11); |
668 | 3.42k | AES_DEC_4_ROUNDS(K12); |
669 | 3.42k | AES_DEC_4_ROUNDS(K13); |
670 | 3.42k | AES_DEC_4_LAST_ROUNDS(K14); |
671 | 3.42k | |
672 | 3.42k | _mm_storeu_si128(out_mm + 0, B0); |
673 | 3.42k | _mm_storeu_si128(out_mm + 1, B1); |
674 | 3.42k | _mm_storeu_si128(out_mm + 2, B2); |
675 | 3.42k | _mm_storeu_si128(out_mm + 3, B3); |
676 | 3.42k | |
677 | 3.42k | blocks -= 4; |
678 | 3.42k | in_mm += 4; |
679 | 3.42k | out_mm += 4; |
680 | 3.42k | } |
681 | 944 | |
682 | 1.14k | for(size_t i = 0; i != blocks; ++i) |
683 | 198 | { |
684 | 198 | __m128i B = _mm_loadu_si128(in_mm + i); |
685 | 198 | |
686 | 198 | B = _mm_xor_si128(B, K0); |
687 | 198 | |
688 | 198 | B = _mm_aesdec_si128(B, K1); |
689 | 198 | B = _mm_aesdec_si128(B, K2); |
690 | 198 | B = _mm_aesdec_si128(B, K3); |
691 | 198 | B = _mm_aesdec_si128(B, K4); |
692 | 198 | B = _mm_aesdec_si128(B, K5); |
693 | 198 | B = _mm_aesdec_si128(B, K6); |
694 | 198 | B = _mm_aesdec_si128(B, K7); |
695 | 198 | B = _mm_aesdec_si128(B, K8); |
696 | 198 | B = _mm_aesdec_si128(B, K9); |
697 | 198 | B = _mm_aesdec_si128(B, K10); |
698 | 198 | B = _mm_aesdec_si128(B, K11); |
699 | 198 | B = _mm_aesdec_si128(B, K12); |
700 | 198 | B = _mm_aesdec_si128(B, K13); |
701 | 198 | B = _mm_aesdeclast_si128(B, K14); |
702 | 198 | |
703 | 198 | _mm_storeu_si128(out_mm + i, B); |
704 | 198 | } |
705 | 944 | } |
706 | | |
707 | | /* |
708 | | * AES-256 Key Schedule |
709 | | */ |
710 | | BOTAN_FUNC_ISA("ssse3,aes") |
711 | | void AES_256::aesni_key_schedule(const uint8_t key[], size_t) |
712 | 1.44k | { |
713 | 1.44k | m_EK.resize(60); |
714 | 1.44k | m_DK.resize(60); |
715 | 1.44k | |
716 | 1.44k | const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); |
717 | 1.44k | const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16)); |
718 | 1.44k | |
719 | 1.44k | const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01)); |
720 | 1.44k | const __m128i K3 = aes_256_key_expansion(K1, K2); |
721 | 1.44k | |
722 | 1.44k | const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02)); |
723 | 1.44k | const __m128i K5 = aes_256_key_expansion(K3, K4); |
724 | 1.44k | |
725 | 1.44k | const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04)); |
726 | 1.44k | const __m128i K7 = aes_256_key_expansion(K5, K6); |
727 | 1.44k | |
728 | 1.44k | const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08)); |
729 | 1.44k | const __m128i K9 = aes_256_key_expansion(K7, K8); |
730 | 1.44k | |
731 | 1.44k | const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10)); |
732 | 1.44k | const __m128i K11 = aes_256_key_expansion(K9, K10); |
733 | 1.44k | |
734 | 1.44k | const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20)); |
735 | 1.44k | const __m128i K13 = aes_256_key_expansion(K11, K12); |
736 | 1.44k | |
737 | 1.44k | const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40)); |
738 | 1.44k | |
739 | 1.44k | __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); |
740 | 1.44k | _mm_storeu_si128(EK_mm , K0); |
741 | 1.44k | _mm_storeu_si128(EK_mm + 1, K1); |
742 | 1.44k | _mm_storeu_si128(EK_mm + 2, K2); |
743 | 1.44k | _mm_storeu_si128(EK_mm + 3, K3); |
744 | 1.44k | _mm_storeu_si128(EK_mm + 4, K4); |
745 | 1.44k | _mm_storeu_si128(EK_mm + 5, K5); |
746 | 1.44k | _mm_storeu_si128(EK_mm + 6, K6); |
747 | 1.44k | _mm_storeu_si128(EK_mm + 7, K7); |
748 | 1.44k | _mm_storeu_si128(EK_mm + 8, K8); |
749 | 1.44k | _mm_storeu_si128(EK_mm + 9, K9); |
750 | 1.44k | _mm_storeu_si128(EK_mm + 10, K10); |
751 | 1.44k | _mm_storeu_si128(EK_mm + 11, K11); |
752 | 1.44k | _mm_storeu_si128(EK_mm + 12, K12); |
753 | 1.44k | _mm_storeu_si128(EK_mm + 13, K13); |
754 | 1.44k | _mm_storeu_si128(EK_mm + 14, K14); |
755 | 1.44k | |
756 | 1.44k | // Now generate decryption keys |
757 | 1.44k | __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); |
758 | 1.44k | _mm_storeu_si128(DK_mm , K14); |
759 | 1.44k | _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13)); |
760 | 1.44k | _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12)); |
761 | 1.44k | _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11)); |
762 | 1.44k | _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10)); |
763 | 1.44k | _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9)); |
764 | 1.44k | _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8)); |
765 | 1.44k | _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7)); |
766 | 1.44k | _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6)); |
767 | 1.44k | _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5)); |
768 | 1.44k | _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4)); |
769 | 1.44k | _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3)); |
770 | 1.44k | _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2)); |
771 | 1.44k | _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1)); |
772 | 1.44k | _mm_storeu_si128(DK_mm + 14, K0); |
773 | 1.44k | } |
774 | | |
775 | | #undef AES_ENC_4_ROUNDS |
776 | | #undef AES_ENC_4_LAST_ROUNDS |
777 | | #undef AES_DEC_4_ROUNDS |
778 | | #undef AES_DEC_4_LAST_ROUNDS |
779 | | |
780 | | } |