/src/botan/src/lib/block/aes/aes_ni/aes_ni.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * AES using AES-NI instructions |
3 | | * (C) 2009,2012 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/internal/aes.h> |
9 | | #include <botan/internal/loadstor.h> |
10 | | #include <wmmintrin.h> |
11 | | |
12 | | namespace Botan { |
13 | | |
14 | | namespace { |
15 | | |
16 | | BOTAN_FUNC_ISA("ssse3") |
17 | | __m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon) |
18 | 6.23k | { |
19 | 6.23k | key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3)); |
20 | 6.23k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
21 | 6.23k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
22 | 6.23k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
23 | 6.23k | return _mm_xor_si128(key, key_with_rcon); |
24 | 6.23k | } |
25 | | |
26 | | BOTAN_FUNC_ISA("ssse3") |
27 | | void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon, |
28 | | uint32_t out[], bool last) |
29 | 0 | { |
30 | 0 | __m128i key1 = *K1; |
31 | 0 | __m128i key2 = *K2; |
32 | |
|
33 | 0 | key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1)); |
34 | 0 | key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); |
35 | 0 | key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); |
36 | 0 | key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); |
37 | 0 | key1 = _mm_xor_si128(key1, key2_with_rcon); |
38 | |
|
39 | 0 | *K1 = key1; |
40 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1); |
41 | |
|
42 | 0 | if(last) |
43 | 0 | return; |
44 | | |
45 | 0 | key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4)); |
46 | 0 | key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3))); |
47 | |
|
48 | 0 | *K2 = key2; |
49 | 0 | out[4] = _mm_cvtsi128_si32(key2); |
50 | 0 | out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4)); |
51 | 0 | } |
52 | | |
53 | | /* |
54 | | * The second half of the AES-256 key expansion (other half same as AES-128) |
55 | | */ |
56 | | BOTAN_FUNC_ISA("ssse3,aes") |
57 | | __m128i aes_256_key_expansion(__m128i key, __m128i key2) |
58 | 3.46k | { |
59 | 3.46k | __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00); |
60 | 3.46k | key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2)); |
61 | | |
62 | 3.46k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
63 | 3.46k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
64 | 3.46k | key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); |
65 | 3.46k | return _mm_xor_si128(key, key_with_rcon); |
66 | 3.46k | } |
67 | | |
68 | | } |
69 | | |
/*
* Helper macros used by the bulk encrypt/decrypt routines below. Each
* applies one AES round with round key K to the four pipelined block
* registers B0..B3 (which must be in scope at the expansion site).
* The do/while(0) wrapper makes each macro a single statement.
*/

// One middle encryption round for four blocks
#define AES_ENC_4_ROUNDS(K)                \
   do                                      \
      {                                    \
      B0 = _mm_aesenc_si128(B0, K);        \
      B1 = _mm_aesenc_si128(B1, K);        \
      B2 = _mm_aesenc_si128(B2, K);        \
      B3 = _mm_aesenc_si128(B3, K);        \
      } while(0)

// Final encryption round (no MixColumns) for four blocks
#define AES_ENC_4_LAST_ROUNDS(K)           \
   do                                      \
      {                                    \
      B0 = _mm_aesenclast_si128(B0, K);    \
      B1 = _mm_aesenclast_si128(B1, K);    \
      B2 = _mm_aesenclast_si128(B2, K);    \
      B3 = _mm_aesenclast_si128(B3, K);    \
      } while(0)

// One middle decryption round for four blocks
#define AES_DEC_4_ROUNDS(K)                \
   do                                      \
      {                                    \
      B0 = _mm_aesdec_si128(B0, K);        \
      B1 = _mm_aesdec_si128(B1, K);        \
      B2 = _mm_aesdec_si128(B2, K);        \
      B3 = _mm_aesdec_si128(B3, K);        \
      } while(0)

// Final decryption round (no InvMixColumns) for four blocks
#define AES_DEC_4_LAST_ROUNDS(K)           \
   do                                      \
      {                                    \
      B0 = _mm_aesdeclast_si128(B0, K);    \
      B1 = _mm_aesdeclast_si128(B1, K);    \
      B2 = _mm_aesdeclast_si128(B2, K);    \
      B3 = _mm_aesdeclast_si128(B3, K);    \
      } while(0)
105 | | |
106 | | /* |
107 | | * AES-128 Encryption |
108 | | */ |
109 | | BOTAN_FUNC_ISA("ssse3,aes") |
110 | | void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
111 | 2.49k | { |
112 | 2.49k | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
113 | 2.49k | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
114 | | |
115 | 2.49k | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data()); |
116 | | |
117 | 2.49k | const __m128i K0 = _mm_loadu_si128(key_mm); |
118 | 2.49k | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
119 | 2.49k | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
120 | 2.49k | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
121 | 2.49k | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
122 | 2.49k | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
123 | 2.49k | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
124 | 2.49k | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
125 | 2.49k | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
126 | 2.49k | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
127 | 2.49k | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
128 | | |
129 | 2.95k | while(blocks >= 4) |
130 | 464 | { |
131 | 464 | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
132 | 464 | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
133 | 464 | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
134 | 464 | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
135 | | |
136 | 464 | B0 = _mm_xor_si128(B0, K0); |
137 | 464 | B1 = _mm_xor_si128(B1, K0); |
138 | 464 | B2 = _mm_xor_si128(B2, K0); |
139 | 464 | B3 = _mm_xor_si128(B3, K0); |
140 | | |
141 | 464 | AES_ENC_4_ROUNDS(K1); |
142 | 464 | AES_ENC_4_ROUNDS(K2); |
143 | 464 | AES_ENC_4_ROUNDS(K3); |
144 | 464 | AES_ENC_4_ROUNDS(K4); |
145 | 464 | AES_ENC_4_ROUNDS(K5); |
146 | 464 | AES_ENC_4_ROUNDS(K6); |
147 | 464 | AES_ENC_4_ROUNDS(K7); |
148 | 464 | AES_ENC_4_ROUNDS(K8); |
149 | 464 | AES_ENC_4_ROUNDS(K9); |
150 | 464 | AES_ENC_4_LAST_ROUNDS(K10); |
151 | | |
152 | 464 | _mm_storeu_si128(out_mm + 0, B0); |
153 | 464 | _mm_storeu_si128(out_mm + 1, B1); |
154 | 464 | _mm_storeu_si128(out_mm + 2, B2); |
155 | 464 | _mm_storeu_si128(out_mm + 3, B3); |
156 | | |
157 | 464 | blocks -= 4; |
158 | 464 | in_mm += 4; |
159 | 464 | out_mm += 4; |
160 | 464 | } |
161 | | |
162 | 4.87k | for(size_t i = 0; i != blocks; ++i) |
163 | 2.37k | { |
164 | 2.37k | __m128i B = _mm_loadu_si128(in_mm + i); |
165 | | |
166 | 2.37k | B = _mm_xor_si128(B, K0); |
167 | | |
168 | 2.37k | B = _mm_aesenc_si128(B, K1); |
169 | 2.37k | B = _mm_aesenc_si128(B, K2); |
170 | 2.37k | B = _mm_aesenc_si128(B, K3); |
171 | 2.37k | B = _mm_aesenc_si128(B, K4); |
172 | 2.37k | B = _mm_aesenc_si128(B, K5); |
173 | 2.37k | B = _mm_aesenc_si128(B, K6); |
174 | 2.37k | B = _mm_aesenc_si128(B, K7); |
175 | 2.37k | B = _mm_aesenc_si128(B, K8); |
176 | 2.37k | B = _mm_aesenc_si128(B, K9); |
177 | 2.37k | B = _mm_aesenclast_si128(B, K10); |
178 | | |
179 | 2.37k | _mm_storeu_si128(out_mm + i, B); |
180 | 2.37k | } |
181 | 2.49k | } |
182 | | |
183 | | /* |
184 | | * AES-128 Decryption |
185 | | */ |
186 | | BOTAN_FUNC_ISA("ssse3,aes") |
187 | | void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
188 | 80 | { |
189 | 80 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
190 | 80 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
191 | | |
192 | 80 | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data()); |
193 | | |
194 | 80 | const __m128i K0 = _mm_loadu_si128(key_mm); |
195 | 80 | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
196 | 80 | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
197 | 80 | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
198 | 80 | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
199 | 80 | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
200 | 80 | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
201 | 80 | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
202 | 80 | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
203 | 80 | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
204 | 80 | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
205 | | |
206 | 363 | while(blocks >= 4) |
207 | 283 | { |
208 | 283 | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
209 | 283 | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
210 | 283 | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
211 | 283 | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
212 | | |
213 | 283 | B0 = _mm_xor_si128(B0, K0); |
214 | 283 | B1 = _mm_xor_si128(B1, K0); |
215 | 283 | B2 = _mm_xor_si128(B2, K0); |
216 | 283 | B3 = _mm_xor_si128(B3, K0); |
217 | | |
218 | 283 | AES_DEC_4_ROUNDS(K1); |
219 | 283 | AES_DEC_4_ROUNDS(K2); |
220 | 283 | AES_DEC_4_ROUNDS(K3); |
221 | 283 | AES_DEC_4_ROUNDS(K4); |
222 | 283 | AES_DEC_4_ROUNDS(K5); |
223 | 283 | AES_DEC_4_ROUNDS(K6); |
224 | 283 | AES_DEC_4_ROUNDS(K7); |
225 | 283 | AES_DEC_4_ROUNDS(K8); |
226 | 283 | AES_DEC_4_ROUNDS(K9); |
227 | 283 | AES_DEC_4_LAST_ROUNDS(K10); |
228 | | |
229 | 283 | _mm_storeu_si128(out_mm + 0, B0); |
230 | 283 | _mm_storeu_si128(out_mm + 1, B1); |
231 | 283 | _mm_storeu_si128(out_mm + 2, B2); |
232 | 283 | _mm_storeu_si128(out_mm + 3, B3); |
233 | | |
234 | 283 | blocks -= 4; |
235 | 283 | in_mm += 4; |
236 | 283 | out_mm += 4; |
237 | 283 | } |
238 | | |
239 | 114 | for(size_t i = 0; i != blocks; ++i) |
240 | 34 | { |
241 | 34 | __m128i B = _mm_loadu_si128(in_mm + i); |
242 | | |
243 | 34 | B = _mm_xor_si128(B, K0); |
244 | | |
245 | 34 | B = _mm_aesdec_si128(B, K1); |
246 | 34 | B = _mm_aesdec_si128(B, K2); |
247 | 34 | B = _mm_aesdec_si128(B, K3); |
248 | 34 | B = _mm_aesdec_si128(B, K4); |
249 | 34 | B = _mm_aesdec_si128(B, K5); |
250 | 34 | B = _mm_aesdec_si128(B, K6); |
251 | 34 | B = _mm_aesdec_si128(B, K7); |
252 | 34 | B = _mm_aesdec_si128(B, K8); |
253 | 34 | B = _mm_aesdec_si128(B, K9); |
254 | 34 | B = _mm_aesdeclast_si128(B, K10); |
255 | | |
256 | 34 | _mm_storeu_si128(out_mm + i, B); |
257 | 34 | } |
258 | 80 | } |
259 | | |
260 | | /* |
261 | | * AES-128 Key Schedule |
262 | | */ |
263 | | BOTAN_FUNC_ISA("ssse3,aes") |
264 | | void AES_128::aesni_key_schedule(const uint8_t key[], size_t) |
265 | 219 | { |
266 | 219 | m_EK.resize(44); |
267 | 219 | m_DK.resize(44); |
268 | | |
269 | 219 | #define AES_128_key_exp(K, RCON) \ |
270 | 2.19k | aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON)) |
271 | | |
272 | 219 | const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); |
273 | 219 | const __m128i K1 = AES_128_key_exp(K0, 0x01); |
274 | 219 | const __m128i K2 = AES_128_key_exp(K1, 0x02); |
275 | 219 | const __m128i K3 = AES_128_key_exp(K2, 0x04); |
276 | 219 | const __m128i K4 = AES_128_key_exp(K3, 0x08); |
277 | 219 | const __m128i K5 = AES_128_key_exp(K4, 0x10); |
278 | 219 | const __m128i K6 = AES_128_key_exp(K5, 0x20); |
279 | 219 | const __m128i K7 = AES_128_key_exp(K6, 0x40); |
280 | 219 | const __m128i K8 = AES_128_key_exp(K7, 0x80); |
281 | 219 | const __m128i K9 = AES_128_key_exp(K8, 0x1B); |
282 | 219 | const __m128i K10 = AES_128_key_exp(K9, 0x36); |
283 | | |
284 | 219 | __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); |
285 | 219 | _mm_storeu_si128(EK_mm , K0); |
286 | 219 | _mm_storeu_si128(EK_mm + 1, K1); |
287 | 219 | _mm_storeu_si128(EK_mm + 2, K2); |
288 | 219 | _mm_storeu_si128(EK_mm + 3, K3); |
289 | 219 | _mm_storeu_si128(EK_mm + 4, K4); |
290 | 219 | _mm_storeu_si128(EK_mm + 5, K5); |
291 | 219 | _mm_storeu_si128(EK_mm + 6, K6); |
292 | 219 | _mm_storeu_si128(EK_mm + 7, K7); |
293 | 219 | _mm_storeu_si128(EK_mm + 8, K8); |
294 | 219 | _mm_storeu_si128(EK_mm + 9, K9); |
295 | 219 | _mm_storeu_si128(EK_mm + 10, K10); |
296 | | |
297 | | // Now generate decryption keys |
298 | | |
299 | 219 | __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); |
300 | 219 | _mm_storeu_si128(DK_mm , K10); |
301 | 219 | _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9)); |
302 | 219 | _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8)); |
303 | 219 | _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7)); |
304 | 219 | _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6)); |
305 | 219 | _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5)); |
306 | 219 | _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4)); |
307 | 219 | _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3)); |
308 | 219 | _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2)); |
309 | 219 | _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1)); |
310 | 219 | _mm_storeu_si128(DK_mm + 10, K0); |
311 | 219 | } |
312 | | |
313 | | /* |
314 | | * AES-192 Encryption |
315 | | */ |
316 | | BOTAN_FUNC_ISA("ssse3,aes") |
317 | | void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
318 | 0 | { |
319 | 0 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
320 | 0 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
321 | |
|
322 | 0 | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data()); |
323 | |
|
324 | 0 | const __m128i K0 = _mm_loadu_si128(key_mm); |
325 | 0 | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
326 | 0 | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
327 | 0 | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
328 | 0 | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
329 | 0 | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
330 | 0 | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
331 | 0 | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
332 | 0 | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
333 | 0 | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
334 | 0 | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
335 | 0 | const __m128i K11 = _mm_loadu_si128(key_mm + 11); |
336 | 0 | const __m128i K12 = _mm_loadu_si128(key_mm + 12); |
337 | |
|
338 | 0 | while(blocks >= 4) |
339 | 0 | { |
340 | 0 | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
341 | 0 | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
342 | 0 | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
343 | 0 | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
344 | |
|
345 | 0 | B0 = _mm_xor_si128(B0, K0); |
346 | 0 | B1 = _mm_xor_si128(B1, K0); |
347 | 0 | B2 = _mm_xor_si128(B2, K0); |
348 | 0 | B3 = _mm_xor_si128(B3, K0); |
349 | |
|
350 | 0 | AES_ENC_4_ROUNDS(K1); |
351 | 0 | AES_ENC_4_ROUNDS(K2); |
352 | 0 | AES_ENC_4_ROUNDS(K3); |
353 | 0 | AES_ENC_4_ROUNDS(K4); |
354 | 0 | AES_ENC_4_ROUNDS(K5); |
355 | 0 | AES_ENC_4_ROUNDS(K6); |
356 | 0 | AES_ENC_4_ROUNDS(K7); |
357 | 0 | AES_ENC_4_ROUNDS(K8); |
358 | 0 | AES_ENC_4_ROUNDS(K9); |
359 | 0 | AES_ENC_4_ROUNDS(K10); |
360 | 0 | AES_ENC_4_ROUNDS(K11); |
361 | 0 | AES_ENC_4_LAST_ROUNDS(K12); |
362 | |
|
363 | 0 | _mm_storeu_si128(out_mm + 0, B0); |
364 | 0 | _mm_storeu_si128(out_mm + 1, B1); |
365 | 0 | _mm_storeu_si128(out_mm + 2, B2); |
366 | 0 | _mm_storeu_si128(out_mm + 3, B3); |
367 | |
|
368 | 0 | blocks -= 4; |
369 | 0 | in_mm += 4; |
370 | 0 | out_mm += 4; |
371 | 0 | } |
372 | |
|
373 | 0 | for(size_t i = 0; i != blocks; ++i) |
374 | 0 | { |
375 | 0 | __m128i B = _mm_loadu_si128(in_mm + i); |
376 | |
|
377 | 0 | B = _mm_xor_si128(B, K0); |
378 | |
|
379 | 0 | B = _mm_aesenc_si128(B, K1); |
380 | 0 | B = _mm_aesenc_si128(B, K2); |
381 | 0 | B = _mm_aesenc_si128(B, K3); |
382 | 0 | B = _mm_aesenc_si128(B, K4); |
383 | 0 | B = _mm_aesenc_si128(B, K5); |
384 | 0 | B = _mm_aesenc_si128(B, K6); |
385 | 0 | B = _mm_aesenc_si128(B, K7); |
386 | 0 | B = _mm_aesenc_si128(B, K8); |
387 | 0 | B = _mm_aesenc_si128(B, K9); |
388 | 0 | B = _mm_aesenc_si128(B, K10); |
389 | 0 | B = _mm_aesenc_si128(B, K11); |
390 | 0 | B = _mm_aesenclast_si128(B, K12); |
391 | |
|
392 | 0 | _mm_storeu_si128(out_mm + i, B); |
393 | 0 | } |
394 | 0 | } |
395 | | |
396 | | /* |
397 | | * AES-192 Decryption |
398 | | */ |
399 | | BOTAN_FUNC_ISA("ssse3,aes") |
400 | | void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
401 | 0 | { |
402 | 0 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
403 | 0 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
404 | |
|
405 | 0 | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data()); |
406 | |
|
407 | 0 | const __m128i K0 = _mm_loadu_si128(key_mm); |
408 | 0 | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
409 | 0 | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
410 | 0 | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
411 | 0 | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
412 | 0 | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
413 | 0 | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
414 | 0 | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
415 | 0 | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
416 | 0 | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
417 | 0 | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
418 | 0 | const __m128i K11 = _mm_loadu_si128(key_mm + 11); |
419 | 0 | const __m128i K12 = _mm_loadu_si128(key_mm + 12); |
420 | |
|
421 | 0 | while(blocks >= 4) |
422 | 0 | { |
423 | 0 | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
424 | 0 | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
425 | 0 | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
426 | 0 | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
427 | |
|
428 | 0 | B0 = _mm_xor_si128(B0, K0); |
429 | 0 | B1 = _mm_xor_si128(B1, K0); |
430 | 0 | B2 = _mm_xor_si128(B2, K0); |
431 | 0 | B3 = _mm_xor_si128(B3, K0); |
432 | |
|
433 | 0 | AES_DEC_4_ROUNDS(K1); |
434 | 0 | AES_DEC_4_ROUNDS(K2); |
435 | 0 | AES_DEC_4_ROUNDS(K3); |
436 | 0 | AES_DEC_4_ROUNDS(K4); |
437 | 0 | AES_DEC_4_ROUNDS(K5); |
438 | 0 | AES_DEC_4_ROUNDS(K6); |
439 | 0 | AES_DEC_4_ROUNDS(K7); |
440 | 0 | AES_DEC_4_ROUNDS(K8); |
441 | 0 | AES_DEC_4_ROUNDS(K9); |
442 | 0 | AES_DEC_4_ROUNDS(K10); |
443 | 0 | AES_DEC_4_ROUNDS(K11); |
444 | 0 | AES_DEC_4_LAST_ROUNDS(K12); |
445 | |
|
446 | 0 | _mm_storeu_si128(out_mm + 0, B0); |
447 | 0 | _mm_storeu_si128(out_mm + 1, B1); |
448 | 0 | _mm_storeu_si128(out_mm + 2, B2); |
449 | 0 | _mm_storeu_si128(out_mm + 3, B3); |
450 | |
|
451 | 0 | blocks -= 4; |
452 | 0 | in_mm += 4; |
453 | 0 | out_mm += 4; |
454 | 0 | } |
455 | |
|
456 | 0 | for(size_t i = 0; i != blocks; ++i) |
457 | 0 | { |
458 | 0 | __m128i B = _mm_loadu_si128(in_mm + i); |
459 | |
|
460 | 0 | B = _mm_xor_si128(B, K0); |
461 | |
|
462 | 0 | B = _mm_aesdec_si128(B, K1); |
463 | 0 | B = _mm_aesdec_si128(B, K2); |
464 | 0 | B = _mm_aesdec_si128(B, K3); |
465 | 0 | B = _mm_aesdec_si128(B, K4); |
466 | 0 | B = _mm_aesdec_si128(B, K5); |
467 | 0 | B = _mm_aesdec_si128(B, K6); |
468 | 0 | B = _mm_aesdec_si128(B, K7); |
469 | 0 | B = _mm_aesdec_si128(B, K8); |
470 | 0 | B = _mm_aesdec_si128(B, K9); |
471 | 0 | B = _mm_aesdec_si128(B, K10); |
472 | 0 | B = _mm_aesdec_si128(B, K11); |
473 | 0 | B = _mm_aesdeclast_si128(B, K12); |
474 | |
|
475 | 0 | _mm_storeu_si128(out_mm + i, B); |
476 | 0 | } |
477 | 0 | } |
478 | | |
479 | | /* |
480 | | * AES-192 Key Schedule |
481 | | */ |
482 | | BOTAN_FUNC_ISA("ssse3,aes") |
483 | | void AES_192::aesni_key_schedule(const uint8_t key[], size_t) |
484 | 0 | { |
485 | 0 | m_EK.resize(52); |
486 | 0 | m_DK.resize(52); |
487 | |
|
488 | 0 | __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); |
489 | 0 | __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8)); |
490 | 0 | K1 = _mm_srli_si128(K1, 8); |
491 | |
|
492 | 0 | load_le(m_EK.data(), key, 6); |
493 | |
|
494 | 0 | #define AES_192_key_exp(RCON, EK_OFF) \ |
495 | 0 | aes_192_key_expansion(&K0, &K1, \ |
496 | 0 | _mm_aeskeygenassist_si128(K1, RCON), \ |
497 | 0 | &m_EK[EK_OFF], EK_OFF == 48) |
498 | |
|
499 | 0 | AES_192_key_exp(0x01, 6); |
500 | 0 | AES_192_key_exp(0x02, 12); |
501 | 0 | AES_192_key_exp(0x04, 18); |
502 | 0 | AES_192_key_exp(0x08, 24); |
503 | 0 | AES_192_key_exp(0x10, 30); |
504 | 0 | AES_192_key_exp(0x20, 36); |
505 | 0 | AES_192_key_exp(0x40, 42); |
506 | 0 | AES_192_key_exp(0x80, 48); |
507 | |
|
508 | 0 | #undef AES_192_key_exp |
509 | | |
510 | | // Now generate decryption keys |
511 | 0 | const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data()); |
512 | |
|
513 | 0 | __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); |
514 | 0 | _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12)); |
515 | 0 | _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11))); |
516 | 0 | _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10))); |
517 | 0 | _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9))); |
518 | 0 | _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8))); |
519 | 0 | _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7))); |
520 | 0 | _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6))); |
521 | 0 | _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5))); |
522 | 0 | _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4))); |
523 | 0 | _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3))); |
524 | 0 | _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2))); |
525 | 0 | _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1))); |
526 | 0 | _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0)); |
527 | 0 | } |
528 | | |
529 | | /* |
530 | | * AES-256 Encryption |
531 | | */ |
532 | | BOTAN_FUNC_ISA("ssse3,aes") |
533 | | void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
534 | 2.62k | { |
535 | 2.62k | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
536 | 2.62k | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
537 | | |
538 | 2.62k | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data()); |
539 | | |
540 | 2.62k | const __m128i K0 = _mm_loadu_si128(key_mm); |
541 | 2.62k | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
542 | 2.62k | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
543 | 2.62k | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
544 | 2.62k | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
545 | 2.62k | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
546 | 2.62k | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
547 | 2.62k | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
548 | 2.62k | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
549 | 2.62k | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
550 | 2.62k | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
551 | 2.62k | const __m128i K11 = _mm_loadu_si128(key_mm + 11); |
552 | 2.62k | const __m128i K12 = _mm_loadu_si128(key_mm + 12); |
553 | 2.62k | const __m128i K13 = _mm_loadu_si128(key_mm + 13); |
554 | 2.62k | const __m128i K14 = _mm_loadu_si128(key_mm + 14); |
555 | | |
556 | 5.78k | while(blocks >= 4) |
557 | 3.16k | { |
558 | 3.16k | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
559 | 3.16k | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
560 | 3.16k | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
561 | 3.16k | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
562 | | |
563 | 3.16k | B0 = _mm_xor_si128(B0, K0); |
564 | 3.16k | B1 = _mm_xor_si128(B1, K0); |
565 | 3.16k | B2 = _mm_xor_si128(B2, K0); |
566 | 3.16k | B3 = _mm_xor_si128(B3, K0); |
567 | | |
568 | 3.16k | AES_ENC_4_ROUNDS(K1); |
569 | 3.16k | AES_ENC_4_ROUNDS(K2); |
570 | 3.16k | AES_ENC_4_ROUNDS(K3); |
571 | 3.16k | AES_ENC_4_ROUNDS(K4); |
572 | 3.16k | AES_ENC_4_ROUNDS(K5); |
573 | 3.16k | AES_ENC_4_ROUNDS(K6); |
574 | 3.16k | AES_ENC_4_ROUNDS(K7); |
575 | 3.16k | AES_ENC_4_ROUNDS(K8); |
576 | 3.16k | AES_ENC_4_ROUNDS(K9); |
577 | 3.16k | AES_ENC_4_ROUNDS(K10); |
578 | 3.16k | AES_ENC_4_ROUNDS(K11); |
579 | 3.16k | AES_ENC_4_ROUNDS(K12); |
580 | 3.16k | AES_ENC_4_ROUNDS(K13); |
581 | 3.16k | AES_ENC_4_LAST_ROUNDS(K14); |
582 | | |
583 | 3.16k | _mm_storeu_si128(out_mm + 0, B0); |
584 | 3.16k | _mm_storeu_si128(out_mm + 1, B1); |
585 | 3.16k | _mm_storeu_si128(out_mm + 2, B2); |
586 | 3.16k | _mm_storeu_si128(out_mm + 3, B3); |
587 | | |
588 | 3.16k | blocks -= 4; |
589 | 3.16k | in_mm += 4; |
590 | 3.16k | out_mm += 4; |
591 | 3.16k | } |
592 | | |
593 | 4.45k | for(size_t i = 0; i != blocks; ++i) |
594 | 1.82k | { |
595 | 1.82k | __m128i B = _mm_loadu_si128(in_mm + i); |
596 | | |
597 | 1.82k | B = _mm_xor_si128(B, K0); |
598 | | |
599 | 1.82k | B = _mm_aesenc_si128(B, K1); |
600 | 1.82k | B = _mm_aesenc_si128(B, K2); |
601 | 1.82k | B = _mm_aesenc_si128(B, K3); |
602 | 1.82k | B = _mm_aesenc_si128(B, K4); |
603 | 1.82k | B = _mm_aesenc_si128(B, K5); |
604 | 1.82k | B = _mm_aesenc_si128(B, K6); |
605 | 1.82k | B = _mm_aesenc_si128(B, K7); |
606 | 1.82k | B = _mm_aesenc_si128(B, K8); |
607 | 1.82k | B = _mm_aesenc_si128(B, K9); |
608 | 1.82k | B = _mm_aesenc_si128(B, K10); |
609 | 1.82k | B = _mm_aesenc_si128(B, K11); |
610 | 1.82k | B = _mm_aesenc_si128(B, K12); |
611 | 1.82k | B = _mm_aesenc_si128(B, K13); |
612 | 1.82k | B = _mm_aesenclast_si128(B, K14); |
613 | | |
614 | 1.82k | _mm_storeu_si128(out_mm + i, B); |
615 | 1.82k | } |
616 | 2.62k | } |
617 | | |
618 | | /* |
619 | | * AES-256 Decryption |
620 | | */ |
621 | | BOTAN_FUNC_ISA("ssse3,aes") |
622 | | void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
623 | 284 | { |
624 | 284 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
625 | 284 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
626 | | |
627 | 284 | const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data()); |
628 | | |
629 | 284 | const __m128i K0 = _mm_loadu_si128(key_mm); |
630 | 284 | const __m128i K1 = _mm_loadu_si128(key_mm + 1); |
631 | 284 | const __m128i K2 = _mm_loadu_si128(key_mm + 2); |
632 | 284 | const __m128i K3 = _mm_loadu_si128(key_mm + 3); |
633 | 284 | const __m128i K4 = _mm_loadu_si128(key_mm + 4); |
634 | 284 | const __m128i K5 = _mm_loadu_si128(key_mm + 5); |
635 | 284 | const __m128i K6 = _mm_loadu_si128(key_mm + 6); |
636 | 284 | const __m128i K7 = _mm_loadu_si128(key_mm + 7); |
637 | 284 | const __m128i K8 = _mm_loadu_si128(key_mm + 8); |
638 | 284 | const __m128i K9 = _mm_loadu_si128(key_mm + 9); |
639 | 284 | const __m128i K10 = _mm_loadu_si128(key_mm + 10); |
640 | 284 | const __m128i K11 = _mm_loadu_si128(key_mm + 11); |
641 | 284 | const __m128i K12 = _mm_loadu_si128(key_mm + 12); |
642 | 284 | const __m128i K13 = _mm_loadu_si128(key_mm + 13); |
643 | 284 | const __m128i K14 = _mm_loadu_si128(key_mm + 14); |
644 | | |
645 | 1.26k | while(blocks >= 4) |
646 | 981 | { |
647 | 981 | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
648 | 981 | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
649 | 981 | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
650 | 981 | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
651 | | |
652 | 981 | B0 = _mm_xor_si128(B0, K0); |
653 | 981 | B1 = _mm_xor_si128(B1, K0); |
654 | 981 | B2 = _mm_xor_si128(B2, K0); |
655 | 981 | B3 = _mm_xor_si128(B3, K0); |
656 | | |
657 | 981 | AES_DEC_4_ROUNDS(K1); |
658 | 981 | AES_DEC_4_ROUNDS(K2); |
659 | 981 | AES_DEC_4_ROUNDS(K3); |
660 | 981 | AES_DEC_4_ROUNDS(K4); |
661 | 981 | AES_DEC_4_ROUNDS(K5); |
662 | 981 | AES_DEC_4_ROUNDS(K6); |
663 | 981 | AES_DEC_4_ROUNDS(K7); |
664 | 981 | AES_DEC_4_ROUNDS(K8); |
665 | 981 | AES_DEC_4_ROUNDS(K9); |
666 | 981 | AES_DEC_4_ROUNDS(K10); |
667 | 981 | AES_DEC_4_ROUNDS(K11); |
668 | 981 | AES_DEC_4_ROUNDS(K12); |
669 | 981 | AES_DEC_4_ROUNDS(K13); |
670 | 981 | AES_DEC_4_LAST_ROUNDS(K14); |
671 | | |
672 | 981 | _mm_storeu_si128(out_mm + 0, B0); |
673 | 981 | _mm_storeu_si128(out_mm + 1, B1); |
674 | 981 | _mm_storeu_si128(out_mm + 2, B2); |
675 | 981 | _mm_storeu_si128(out_mm + 3, B3); |
676 | | |
677 | 981 | blocks -= 4; |
678 | 981 | in_mm += 4; |
679 | 981 | out_mm += 4; |
680 | 981 | } |
681 | | |
682 | 406 | for(size_t i = 0; i != blocks; ++i) |
683 | 122 | { |
684 | 122 | __m128i B = _mm_loadu_si128(in_mm + i); |
685 | | |
686 | 122 | B = _mm_xor_si128(B, K0); |
687 | | |
688 | 122 | B = _mm_aesdec_si128(B, K1); |
689 | 122 | B = _mm_aesdec_si128(B, K2); |
690 | 122 | B = _mm_aesdec_si128(B, K3); |
691 | 122 | B = _mm_aesdec_si128(B, K4); |
692 | 122 | B = _mm_aesdec_si128(B, K5); |
693 | 122 | B = _mm_aesdec_si128(B, K6); |
694 | 122 | B = _mm_aesdec_si128(B, K7); |
695 | 122 | B = _mm_aesdec_si128(B, K8); |
696 | 122 | B = _mm_aesdec_si128(B, K9); |
697 | 122 | B = _mm_aesdec_si128(B, K10); |
698 | 122 | B = _mm_aesdec_si128(B, K11); |
699 | 122 | B = _mm_aesdec_si128(B, K12); |
700 | 122 | B = _mm_aesdec_si128(B, K13); |
701 | 122 | B = _mm_aesdeclast_si128(B, K14); |
702 | | |
703 | 122 | _mm_storeu_si128(out_mm + i, B); |
704 | 122 | } |
705 | 284 | } |
706 | | |
707 | | /* |
708 | | * AES-256 Key Schedule |
709 | | */ |
710 | | BOTAN_FUNC_ISA("ssse3,aes") |
711 | | void AES_256::aesni_key_schedule(const uint8_t key[], size_t) |
712 | 578 | { |
713 | 578 | m_EK.resize(60); |
714 | 578 | m_DK.resize(60); |
715 | | |
716 | 578 | const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); |
717 | 578 | const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16)); |
718 | | |
719 | 578 | const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01)); |
720 | 578 | const __m128i K3 = aes_256_key_expansion(K1, K2); |
721 | | |
722 | 578 | const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02)); |
723 | 578 | const __m128i K5 = aes_256_key_expansion(K3, K4); |
724 | | |
725 | 578 | const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04)); |
726 | 578 | const __m128i K7 = aes_256_key_expansion(K5, K6); |
727 | | |
728 | 578 | const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08)); |
729 | 578 | const __m128i K9 = aes_256_key_expansion(K7, K8); |
730 | | |
731 | 578 | const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10)); |
732 | 578 | const __m128i K11 = aes_256_key_expansion(K9, K10); |
733 | | |
734 | 578 | const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20)); |
735 | 578 | const __m128i K13 = aes_256_key_expansion(K11, K12); |
736 | | |
737 | 578 | const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40)); |
738 | | |
739 | 578 | __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); |
740 | 578 | _mm_storeu_si128(EK_mm , K0); |
741 | 578 | _mm_storeu_si128(EK_mm + 1, K1); |
742 | 578 | _mm_storeu_si128(EK_mm + 2, K2); |
743 | 578 | _mm_storeu_si128(EK_mm + 3, K3); |
744 | 578 | _mm_storeu_si128(EK_mm + 4, K4); |
745 | 578 | _mm_storeu_si128(EK_mm + 5, K5); |
746 | 578 | _mm_storeu_si128(EK_mm + 6, K6); |
747 | 578 | _mm_storeu_si128(EK_mm + 7, K7); |
748 | 578 | _mm_storeu_si128(EK_mm + 8, K8); |
749 | 578 | _mm_storeu_si128(EK_mm + 9, K9); |
750 | 578 | _mm_storeu_si128(EK_mm + 10, K10); |
751 | 578 | _mm_storeu_si128(EK_mm + 11, K11); |
752 | 578 | _mm_storeu_si128(EK_mm + 12, K12); |
753 | 578 | _mm_storeu_si128(EK_mm + 13, K13); |
754 | 578 | _mm_storeu_si128(EK_mm + 14, K14); |
755 | | |
756 | | // Now generate decryption keys |
757 | 578 | __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); |
758 | 578 | _mm_storeu_si128(DK_mm , K14); |
759 | 578 | _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13)); |
760 | 578 | _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12)); |
761 | 578 | _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11)); |
762 | 578 | _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10)); |
763 | 578 | _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9)); |
764 | 578 | _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8)); |
765 | 578 | _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7)); |
766 | 578 | _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6)); |
767 | 578 | _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5)); |
768 | 578 | _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4)); |
769 | 578 | _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3)); |
770 | 578 | _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2)); |
771 | 578 | _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1)); |
772 | 578 | _mm_storeu_si128(DK_mm + 14, K0); |
773 | 578 | } |
774 | | |
775 | | #undef AES_ENC_4_ROUNDS |
776 | | #undef AES_ENC_4_LAST_ROUNDS |
777 | | #undef AES_DEC_4_ROUNDS |
778 | | #undef AES_DEC_4_LAST_ROUNDS |
779 | | |
780 | | } |