/src/botan/src/lib/block/aes/aes_ni/aes_ni.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * AES using AES-NI instructions |
3 | | * (C) 2009,2012 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/internal/aes.h> |
9 | | #include <botan/internal/loadstor.h> |
10 | | #include <botan/internal/simd_32.h> |
11 | | #include <wmmintrin.h> |
12 | | |
13 | | namespace Botan { |
14 | | |
15 | | namespace { |
16 | | |
/*
* One step of the AES-128 key schedule: derive the next 128-bit round key
* from the previous round key plus the AESKEYGENASSIST output for it.
* key_with_rcon is expected to come from _mm_aeskeygenassist_si128.
*/
BOTAN_FUNC_ISA("ssse3")
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   // Broadcast the high word (rot/sub word + rcon) to all four lanes
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));
   // Three shift+xor steps form the running XOR prefix of the four words
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, key_with_rcon);
   }
26 | | |
/*
* One step of the AES-192 key schedule. Updates the two working registers
* *K1/*K2 in place and writes the newly derived words into out: a full
* 16-byte store of the new K1, plus (unless last) the low two words of the
* new K2 into out[4]/out[5], matching the 6-word AES-192 schedule stride.
*/
BOTAN_FUNC_ISA("ssse3")
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
   {
   __m128i key1 = *K1;
   __m128i key2 = *K2;

   // Broadcast word 1 of the keygenassist result to all lanes
   key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
   // Running XOR prefix of key1's four words
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, key2_with_rcon);

   *K1 = key1;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);

   // The final expansion step does not produce the extra two words
   if(last)
      return;

   key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
   key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));

   *K2 = key2;
   // Only the low 64 bits of key2 belong to this schedule step
   out[4] = _mm_cvtsi128_si32(key2);
   out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
   }
53 | | |
/*
* The second half of the AES-256 key expansion (other half same as AES-128)
*
* Derives the next even-position round key from key (two steps back) and
* key2 (the previous round key). AES-256 applies SubWord without a rotation
* or rcon here, hence the 0x00 immediate and the shuffle of word 2.
*/
BOTAN_FUNC_ISA("ssse3,aes")
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
   {
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));

   // Running XOR prefix of key's four words, then mix in the SubWord result
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, key_with_rcon);
   }
68 | | |
69 | | BOTAN_FORCE_INLINE void keyxor( |
70 | | SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) |
71 | 4.73k | { |
72 | 4.73k | B0 ^= K; |
73 | 4.73k | B1 ^= K; |
74 | 4.73k | B2 ^= K; |
75 | 4.73k | B3 ^= K; |
76 | 4.73k | } |
77 | | |
78 | | BOTAN_FUNC_ISA("aes") |
79 | | BOTAN_FORCE_INLINE void aesenc(SIMD_4x32 K, SIMD_4x32& B) |
80 | 49.3k | { |
81 | 49.3k | B = SIMD_4x32(_mm_aesenc_si128(B.raw(), K.raw())); |
82 | 49.3k | } |
83 | | |
84 | | BOTAN_FUNC_ISA("aes") |
85 | | BOTAN_FORCE_INLINE void aesenc( |
86 | | SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) |
87 | 39.5k | { |
88 | 39.5k | B0 = SIMD_4x32(_mm_aesenc_si128(B0.raw(), K.raw())); |
89 | 39.5k | B1 = SIMD_4x32(_mm_aesenc_si128(B1.raw(), K.raw())); |
90 | 39.5k | B2 = SIMD_4x32(_mm_aesenc_si128(B2.raw(), K.raw())); |
91 | 39.5k | B3 = SIMD_4x32(_mm_aesenc_si128(B3.raw(), K.raw())); |
92 | 39.5k | } |
93 | | |
94 | | BOTAN_FUNC_ISA("aes") |
95 | | BOTAN_FORCE_INLINE void aesenclast(SIMD_4x32 K, SIMD_4x32& B) |
96 | 4.69k | { |
97 | 4.69k | B = SIMD_4x32(_mm_aesenclast_si128(B.raw(), K.raw())); |
98 | 4.69k | } |
99 | | |
100 | | BOTAN_FUNC_ISA("aes") |
101 | | BOTAN_FORCE_INLINE void aesenclast( |
102 | | SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) |
103 | 3.16k | { |
104 | 3.16k | B0 = SIMD_4x32(_mm_aesenclast_si128(B0.raw(), K.raw())); |
105 | 3.16k | B1 = SIMD_4x32(_mm_aesenclast_si128(B1.raw(), K.raw())); |
106 | 3.16k | B2 = SIMD_4x32(_mm_aesenclast_si128(B2.raw(), K.raw())); |
107 | 3.16k | B3 = SIMD_4x32(_mm_aesenclast_si128(B3.raw(), K.raw())); |
108 | 3.16k | } |
109 | | |
110 | | BOTAN_FUNC_ISA("aes") |
111 | | BOTAN_FORCE_INLINE void aesdec(SIMD_4x32 K, SIMD_4x32& B) |
112 | 1.80k | { |
113 | 1.80k | B = SIMD_4x32(_mm_aesdec_si128(B.raw(), K.raw())); |
114 | 1.80k | } |
115 | | |
116 | | BOTAN_FUNC_ISA("aes") |
117 | | BOTAN_FORCE_INLINE void aesdec( |
118 | | SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) |
119 | 18.6k | { |
120 | 18.6k | B0 = SIMD_4x32(_mm_aesdec_si128(B0.raw(), K.raw())); |
121 | 18.6k | B1 = SIMD_4x32(_mm_aesdec_si128(B1.raw(), K.raw())); |
122 | 18.6k | B2 = SIMD_4x32(_mm_aesdec_si128(B2.raw(), K.raw())); |
123 | 18.6k | B3 = SIMD_4x32(_mm_aesdec_si128(B3.raw(), K.raw())); |
124 | 18.6k | } |
125 | | |
126 | | BOTAN_FUNC_ISA("aes") |
127 | | BOTAN_FORCE_INLINE void aesdeclast(SIMD_4x32 K, SIMD_4x32& B) |
128 | 148 | { |
129 | 148 | B = SIMD_4x32(_mm_aesdeclast_si128(B.raw(), K.raw())); |
130 | 148 | } |
131 | | |
132 | | BOTAN_FUNC_ISA("aes") |
133 | | BOTAN_FORCE_INLINE void aesdeclast( |
134 | | SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) |
135 | 1.57k | { |
136 | 1.57k | B0 = SIMD_4x32(_mm_aesdeclast_si128(B0.raw(), K.raw())); |
137 | 1.57k | B1 = SIMD_4x32(_mm_aesdeclast_si128(B1.raw(), K.raw())); |
138 | 1.57k | B2 = SIMD_4x32(_mm_aesdeclast_si128(B2.raw(), K.raw())); |
139 | 1.57k | B3 = SIMD_4x32(_mm_aesdeclast_si128(B3.raw(), K.raw())); |
140 | 1.57k | } |
141 | | |
142 | | } |
143 | | |
144 | | /* |
145 | | * AES-128 Encryption |
146 | | */ |
147 | | BOTAN_FUNC_ISA("ssse3,aes") |
148 | | void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
149 | 2.99k | { |
150 | 2.99k | const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4*0]); |
151 | 2.99k | const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4*1]); |
152 | 2.99k | const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4*2]); |
153 | 2.99k | const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4*3]); |
154 | 2.99k | const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4*4]); |
155 | 2.99k | const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4*5]); |
156 | 2.99k | const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4*6]); |
157 | 2.99k | const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4*7]); |
158 | 2.99k | const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4*8]); |
159 | 2.99k | const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4*9]); |
160 | 2.99k | const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4*10]); |
161 | | |
162 | 3.36k | while(blocks >= 4) |
163 | 372 | { |
164 | 372 | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0); |
165 | 372 | SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1); |
166 | 372 | SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2); |
167 | 372 | SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3); |
168 | | |
169 | 372 | keyxor(K0, B0, B1, B2, B3); |
170 | 372 | aesenc(K1, B0, B1, B2, B3); |
171 | 372 | aesenc(K2, B0, B1, B2, B3); |
172 | 372 | aesenc(K3, B0, B1, B2, B3); |
173 | 372 | aesenc(K4, B0, B1, B2, B3); |
174 | 372 | aesenc(K5, B0, B1, B2, B3); |
175 | 372 | aesenc(K6, B0, B1, B2, B3); |
176 | 372 | aesenc(K7, B0, B1, B2, B3); |
177 | 372 | aesenc(K8, B0, B1, B2, B3); |
178 | 372 | aesenc(K9, B0, B1, B2, B3); |
179 | 372 | aesenclast(K10, B0, B1, B2, B3); |
180 | | |
181 | 372 | B0.store_le(out + 16*0); |
182 | 372 | B1.store_le(out + 16*1); |
183 | 372 | B2.store_le(out + 16*2); |
184 | 372 | B3.store_le(out + 16*3); |
185 | | |
186 | 372 | blocks -= 4; |
187 | 372 | in += 4*16; |
188 | 372 | out += 4*16; |
189 | 372 | } |
190 | | |
191 | 5.89k | for(size_t i = 0; i != blocks; ++i) |
192 | 2.90k | { |
193 | 2.90k | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i); |
194 | | |
195 | 2.90k | B0 ^= K0; |
196 | 2.90k | aesenc(K1, B0); |
197 | 2.90k | aesenc(K2, B0); |
198 | 2.90k | aesenc(K3, B0); |
199 | 2.90k | aesenc(K4, B0); |
200 | 2.90k | aesenc(K5, B0); |
201 | 2.90k | aesenc(K6, B0); |
202 | 2.90k | aesenc(K7, B0); |
203 | 2.90k | aesenc(K8, B0); |
204 | 2.90k | aesenc(K9, B0); |
205 | 2.90k | aesenclast(K10, B0); |
206 | | |
207 | 2.90k | B0.store_le(out + 16*i); |
208 | 2.90k | } |
209 | 2.99k | } |
210 | | |
211 | | /* |
212 | | * AES-128 Decryption |
213 | | */ |
214 | | BOTAN_FUNC_ISA("ssse3,aes") |
215 | | void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
216 | 134 | { |
217 | 134 | const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4*0]); |
218 | 134 | const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4*1]); |
219 | 134 | const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4*2]); |
220 | 134 | const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4*3]); |
221 | 134 | const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4*4]); |
222 | 134 | const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4*5]); |
223 | 134 | const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4*6]); |
224 | 134 | const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4*7]); |
225 | 134 | const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4*8]); |
226 | 134 | const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4*9]); |
227 | 134 | const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4*10]); |
228 | | |
229 | 607 | while(blocks >= 4) |
230 | 473 | { |
231 | 473 | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0); |
232 | 473 | SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1); |
233 | 473 | SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2); |
234 | 473 | SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3); |
235 | | |
236 | 473 | keyxor(K0, B0, B1, B2, B3); |
237 | 473 | aesdec(K1, B0, B1, B2, B3); |
238 | 473 | aesdec(K2, B0, B1, B2, B3); |
239 | 473 | aesdec(K3, B0, B1, B2, B3); |
240 | 473 | aesdec(K4, B0, B1, B2, B3); |
241 | 473 | aesdec(K5, B0, B1, B2, B3); |
242 | 473 | aesdec(K6, B0, B1, B2, B3); |
243 | 473 | aesdec(K7, B0, B1, B2, B3); |
244 | 473 | aesdec(K8, B0, B1, B2, B3); |
245 | 473 | aesdec(K9, B0, B1, B2, B3); |
246 | 473 | aesdeclast(K10, B0, B1, B2, B3); |
247 | | |
248 | 473 | B0.store_le(out + 16*0); |
249 | 473 | B1.store_le(out + 16*1); |
250 | 473 | B2.store_le(out + 16*2); |
251 | 473 | B3.store_le(out + 16*3); |
252 | | |
253 | 473 | blocks -= 4; |
254 | 473 | in += 4*16; |
255 | 473 | out += 4*16; |
256 | 473 | } |
257 | | |
258 | 164 | for(size_t i = 0; i != blocks; ++i) |
259 | 30 | { |
260 | 30 | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i); |
261 | | |
262 | 30 | B0 ^= K0; |
263 | 30 | aesdec(K1, B0); |
264 | 30 | aesdec(K2, B0); |
265 | 30 | aesdec(K3, B0); |
266 | 30 | aesdec(K4, B0); |
267 | 30 | aesdec(K5, B0); |
268 | 30 | aesdec(K6, B0); |
269 | 30 | aesdec(K7, B0); |
270 | 30 | aesdec(K8, B0); |
271 | 30 | aesdec(K9, B0); |
272 | 30 | aesdeclast(K10, B0); |
273 | | |
274 | 30 | B0.store_le(out + 16*i); |
275 | 30 | } |
276 | 134 | } |
277 | | |
278 | | /* |
279 | | * AES-128 Key Schedule |
280 | | */ |
281 | | BOTAN_FUNC_ISA("ssse3,aes") |
282 | | void AES_128::aesni_key_schedule(const uint8_t key[], size_t /*length*/) |
283 | 141 | { |
284 | 141 | m_EK.resize(44); |
285 | 141 | m_DK.resize(44); |
286 | | |
287 | 141 | #define AES_128_key_exp(K, RCON) \ |
288 | 1.41k | aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON)) |
289 | | |
290 | 141 | const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); |
291 | 141 | const __m128i K1 = AES_128_key_exp(K0, 0x01); |
292 | 141 | const __m128i K2 = AES_128_key_exp(K1, 0x02); |
293 | 141 | const __m128i K3 = AES_128_key_exp(K2, 0x04); |
294 | 141 | const __m128i K4 = AES_128_key_exp(K3, 0x08); |
295 | 141 | const __m128i K5 = AES_128_key_exp(K4, 0x10); |
296 | 141 | const __m128i K6 = AES_128_key_exp(K5, 0x20); |
297 | 141 | const __m128i K7 = AES_128_key_exp(K6, 0x40); |
298 | 141 | const __m128i K8 = AES_128_key_exp(K7, 0x80); |
299 | 141 | const __m128i K9 = AES_128_key_exp(K8, 0x1B); |
300 | 141 | const __m128i K10 = AES_128_key_exp(K9, 0x36); |
301 | | |
302 | 141 | __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); |
303 | 141 | _mm_storeu_si128(EK_mm , K0); |
304 | 141 | _mm_storeu_si128(EK_mm + 1, K1); |
305 | 141 | _mm_storeu_si128(EK_mm + 2, K2); |
306 | 141 | _mm_storeu_si128(EK_mm + 3, K3); |
307 | 141 | _mm_storeu_si128(EK_mm + 4, K4); |
308 | 141 | _mm_storeu_si128(EK_mm + 5, K5); |
309 | 141 | _mm_storeu_si128(EK_mm + 6, K6); |
310 | 141 | _mm_storeu_si128(EK_mm + 7, K7); |
311 | 141 | _mm_storeu_si128(EK_mm + 8, K8); |
312 | 141 | _mm_storeu_si128(EK_mm + 9, K9); |
313 | 141 | _mm_storeu_si128(EK_mm + 10, K10); |
314 | | |
315 | | // Now generate decryption keys |
316 | | |
317 | 141 | __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); |
318 | 141 | _mm_storeu_si128(DK_mm , K10); |
319 | 141 | _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9)); |
320 | 141 | _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8)); |
321 | 141 | _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7)); |
322 | 141 | _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6)); |
323 | 141 | _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5)); |
324 | 141 | _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4)); |
325 | 141 | _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3)); |
326 | 141 | _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2)); |
327 | 141 | _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1)); |
328 | 141 | _mm_storeu_si128(DK_mm + 10, K0); |
329 | 141 | } |
330 | | |
331 | | /* |
332 | | * AES-192 Encryption |
333 | | */ |
334 | | BOTAN_FUNC_ISA("ssse3,aes") |
335 | | void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
336 | 0 | { |
337 | 0 | const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4*0]); |
338 | 0 | const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4*1]); |
339 | 0 | const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4*2]); |
340 | 0 | const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4*3]); |
341 | 0 | const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4*4]); |
342 | 0 | const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4*5]); |
343 | 0 | const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4*6]); |
344 | 0 | const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4*7]); |
345 | 0 | const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4*8]); |
346 | 0 | const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4*9]); |
347 | 0 | const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4*10]); |
348 | 0 | const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4*11]); |
349 | 0 | const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4*12]); |
350 | |
|
351 | 0 | while(blocks >= 4) |
352 | 0 | { |
353 | 0 | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0); |
354 | 0 | SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1); |
355 | 0 | SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2); |
356 | 0 | SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3); |
357 | |
|
358 | 0 | keyxor(K0, B0, B1, B2, B3); |
359 | 0 | aesenc(K1, B0, B1, B2, B3); |
360 | 0 | aesenc(K2, B0, B1, B2, B3); |
361 | 0 | aesenc(K3, B0, B1, B2, B3); |
362 | 0 | aesenc(K4, B0, B1, B2, B3); |
363 | 0 | aesenc(K5, B0, B1, B2, B3); |
364 | 0 | aesenc(K6, B0, B1, B2, B3); |
365 | 0 | aesenc(K7, B0, B1, B2, B3); |
366 | 0 | aesenc(K8, B0, B1, B2, B3); |
367 | 0 | aesenc(K9, B0, B1, B2, B3); |
368 | 0 | aesenc(K10, B0, B1, B2, B3); |
369 | 0 | aesenc(K11, B0, B1, B2, B3); |
370 | 0 | aesenclast(K12, B0, B1, B2, B3); |
371 | |
|
372 | 0 | B0.store_le(out + 16*0); |
373 | 0 | B1.store_le(out + 16*1); |
374 | 0 | B2.store_le(out + 16*2); |
375 | 0 | B3.store_le(out + 16*3); |
376 | |
|
377 | 0 | blocks -= 4; |
378 | 0 | in += 4*16; |
379 | 0 | out += 4*16; |
380 | 0 | } |
381 | |
|
382 | 0 | for(size_t i = 0; i != blocks; ++i) |
383 | 0 | { |
384 | 0 | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i); |
385 | |
|
386 | 0 | B0 ^= K0; |
387 | |
|
388 | 0 | aesenc(K1, B0); |
389 | 0 | aesenc(K2, B0); |
390 | 0 | aesenc(K3, B0); |
391 | 0 | aesenc(K4, B0); |
392 | 0 | aesenc(K5, B0); |
393 | 0 | aesenc(K6, B0); |
394 | 0 | aesenc(K7, B0); |
395 | 0 | aesenc(K8, B0); |
396 | 0 | aesenc(K9, B0); |
397 | 0 | aesenc(K10, B0); |
398 | 0 | aesenc(K11, B0); |
399 | 0 | aesenclast(K12, B0); |
400 | |
|
401 | 0 | B0.store_le(out + 16*i); |
402 | 0 | } |
403 | 0 | } |
404 | | |
405 | | /* |
406 | | * AES-192 Decryption |
407 | | */ |
408 | | BOTAN_FUNC_ISA("ssse3,aes") |
409 | | void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
410 | 0 | { |
411 | 0 | const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4*0]); |
412 | 0 | const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4*1]); |
413 | 0 | const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4*2]); |
414 | 0 | const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4*3]); |
415 | 0 | const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4*4]); |
416 | 0 | const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4*5]); |
417 | 0 | const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4*6]); |
418 | 0 | const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4*7]); |
419 | 0 | const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4*8]); |
420 | 0 | const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4*9]); |
421 | 0 | const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4*10]); |
422 | 0 | const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4*11]); |
423 | 0 | const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4*12]); |
424 | |
|
425 | 0 | while(blocks >= 4) |
426 | 0 | { |
427 | 0 | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0); |
428 | 0 | SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1); |
429 | 0 | SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2); |
430 | 0 | SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3); |
431 | |
|
432 | 0 | keyxor(K0, B0, B1, B2, B3); |
433 | 0 | aesdec(K1, B0, B1, B2, B3); |
434 | 0 | aesdec(K2, B0, B1, B2, B3); |
435 | 0 | aesdec(K3, B0, B1, B2, B3); |
436 | 0 | aesdec(K4, B0, B1, B2, B3); |
437 | 0 | aesdec(K5, B0, B1, B2, B3); |
438 | 0 | aesdec(K6, B0, B1, B2, B3); |
439 | 0 | aesdec(K7, B0, B1, B2, B3); |
440 | 0 | aesdec(K8, B0, B1, B2, B3); |
441 | 0 | aesdec(K9, B0, B1, B2, B3); |
442 | 0 | aesdec(K10, B0, B1, B2, B3); |
443 | 0 | aesdec(K11, B0, B1, B2, B3); |
444 | 0 | aesdeclast(K12, B0, B1, B2, B3); |
445 | |
|
446 | 0 | B0.store_le(out + 16*0); |
447 | 0 | B1.store_le(out + 16*1); |
448 | 0 | B2.store_le(out + 16*2); |
449 | 0 | B3.store_le(out + 16*3); |
450 | |
|
451 | 0 | blocks -= 4; |
452 | 0 | in += 4*16; |
453 | 0 | out += 4*16; |
454 | 0 | } |
455 | |
|
456 | 0 | for(size_t i = 0; i != blocks; ++i) |
457 | 0 | { |
458 | 0 | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i); |
459 | |
|
460 | 0 | B0 ^= K0; |
461 | |
|
462 | 0 | aesdec(K1, B0); |
463 | 0 | aesdec(K2, B0); |
464 | 0 | aesdec(K3, B0); |
465 | 0 | aesdec(K4, B0); |
466 | 0 | aesdec(K5, B0); |
467 | 0 | aesdec(K6, B0); |
468 | 0 | aesdec(K7, B0); |
469 | 0 | aesdec(K8, B0); |
470 | 0 | aesdec(K9, B0); |
471 | 0 | aesdec(K10, B0); |
472 | 0 | aesdec(K11, B0); |
473 | 0 | aesdeclast(K12, B0); |
474 | |
|
475 | 0 | B0.store_le(out + 16*i); |
476 | 0 | } |
477 | 0 | } |
478 | | |
479 | | /* |
480 | | * AES-192 Key Schedule |
481 | | */ |
482 | | BOTAN_FUNC_ISA("ssse3,aes") |
483 | | void AES_192::aesni_key_schedule(const uint8_t key[], size_t /*length*/) |
484 | 0 | { |
485 | 0 | m_EK.resize(52); |
486 | 0 | m_DK.resize(52); |
487 | |
|
488 | 0 | __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); |
489 | 0 | __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8)); |
490 | 0 | K1 = _mm_srli_si128(K1, 8); |
491 | |
|
492 | 0 | load_le(m_EK.data(), key, 6); |
493 | |
|
494 | 0 | #define AES_192_key_exp(RCON, EK_OFF) \ |
495 | 0 | aes_192_key_expansion(&K0, &K1, \ |
496 | 0 | _mm_aeskeygenassist_si128(K1, RCON), \ |
497 | 0 | &m_EK[EK_OFF], EK_OFF == 48) |
498 | |
|
499 | 0 | AES_192_key_exp(0x01, 6); |
500 | 0 | AES_192_key_exp(0x02, 12); |
501 | 0 | AES_192_key_exp(0x04, 18); |
502 | 0 | AES_192_key_exp(0x08, 24); |
503 | 0 | AES_192_key_exp(0x10, 30); |
504 | 0 | AES_192_key_exp(0x20, 36); |
505 | 0 | AES_192_key_exp(0x40, 42); |
506 | 0 | AES_192_key_exp(0x80, 48); |
507 | |
|
508 | 0 | #undef AES_192_key_exp |
509 | | |
510 | | // Now generate decryption keys |
511 | 0 | const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data()); |
512 | |
|
513 | 0 | __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); |
514 | 0 | _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12)); |
515 | 0 | _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11))); |
516 | 0 | _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10))); |
517 | 0 | _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9))); |
518 | 0 | _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8))); |
519 | 0 | _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7))); |
520 | 0 | _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6))); |
521 | 0 | _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5))); |
522 | 0 | _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4))); |
523 | 0 | _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3))); |
524 | 0 | _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2))); |
525 | 0 | _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1))); |
526 | 0 | _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0)); |
527 | 0 | } |
528 | | |
529 | | /* |
530 | | * AES-256 Encryption |
531 | | */ |
532 | | BOTAN_FUNC_ISA("ssse3,aes") |
533 | | void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
534 | 2.48k | { |
535 | 2.48k | const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4*0]); |
536 | 2.48k | const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4*1]); |
537 | 2.48k | const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4*2]); |
538 | 2.48k | const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4*3]); |
539 | 2.48k | const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4*4]); |
540 | 2.48k | const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4*5]); |
541 | 2.48k | const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4*6]); |
542 | 2.48k | const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4*7]); |
543 | 2.48k | const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4*8]); |
544 | 2.48k | const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4*9]); |
545 | 2.48k | const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4*10]); |
546 | 2.48k | const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4*11]); |
547 | 2.48k | const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4*12]); |
548 | 2.48k | const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_EK[4*13]); |
549 | 2.48k | const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_EK[4*14]); |
550 | | |
551 | 5.27k | while(blocks >= 4) |
552 | 2.78k | { |
553 | 2.78k | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0); |
554 | 2.78k | SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1); |
555 | 2.78k | SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2); |
556 | 2.78k | SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3); |
557 | | |
558 | 2.78k | keyxor(K0, B0, B1, B2, B3); |
559 | 2.78k | aesenc(K1, B0, B1, B2, B3); |
560 | 2.78k | aesenc(K2, B0, B1, B2, B3); |
561 | 2.78k | aesenc(K3, B0, B1, B2, B3); |
562 | 2.78k | aesenc(K4, B0, B1, B2, B3); |
563 | 2.78k | aesenc(K5, B0, B1, B2, B3); |
564 | 2.78k | aesenc(K6, B0, B1, B2, B3); |
565 | 2.78k | aesenc(K7, B0, B1, B2, B3); |
566 | 2.78k | aesenc(K8, B0, B1, B2, B3); |
567 | 2.78k | aesenc(K9, B0, B1, B2, B3); |
568 | 2.78k | aesenc(K10, B0, B1, B2, B3); |
569 | 2.78k | aesenc(K11, B0, B1, B2, B3); |
570 | 2.78k | aesenc(K12, B0, B1, B2, B3); |
571 | 2.78k | aesenc(K13, B0, B1, B2, B3); |
572 | 2.78k | aesenclast(K14, B0, B1, B2, B3); |
573 | | |
574 | 2.78k | B0.store_le(out + 16*0); |
575 | 2.78k | B1.store_le(out + 16*1); |
576 | 2.78k | B2.store_le(out + 16*2); |
577 | 2.78k | B3.store_le(out + 16*3); |
578 | | |
579 | 2.78k | blocks -= 4; |
580 | 2.78k | in += 4*16; |
581 | 2.78k | out += 4*16; |
582 | 2.78k | } |
583 | | |
584 | 4.27k | for(size_t i = 0; i != blocks; ++i) |
585 | 1.78k | { |
586 | 1.78k | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i); |
587 | | |
588 | 1.78k | B0 ^= K0; |
589 | | |
590 | 1.78k | aesenc(K1, B0); |
591 | 1.78k | aesenc(K2, B0); |
592 | 1.78k | aesenc(K3, B0); |
593 | 1.78k | aesenc(K4, B0); |
594 | 1.78k | aesenc(K5, B0); |
595 | 1.78k | aesenc(K6, B0); |
596 | 1.78k | aesenc(K7, B0); |
597 | 1.78k | aesenc(K8, B0); |
598 | 1.78k | aesenc(K9, B0); |
599 | 1.78k | aesenc(K10, B0); |
600 | 1.78k | aesenc(K11, B0); |
601 | 1.78k | aesenc(K12, B0); |
602 | 1.78k | aesenc(K13, B0); |
603 | 1.78k | aesenclast(K14, B0); |
604 | | |
605 | 1.78k | B0.store_le(out + 16*i); |
606 | 1.78k | } |
607 | 2.48k | } |
608 | | |
609 | | /* |
610 | | * AES-256 Decryption |
611 | | */ |
612 | | BOTAN_FUNC_ISA("ssse3,aes") |
613 | | void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
614 | 322 | { |
615 | 322 | const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4*0]); |
616 | 322 | const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4*1]); |
617 | 322 | const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4*2]); |
618 | 322 | const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4*3]); |
619 | 322 | const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4*4]); |
620 | 322 | const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4*5]); |
621 | 322 | const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4*6]); |
622 | 322 | const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4*7]); |
623 | 322 | const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4*8]); |
624 | 322 | const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4*9]); |
625 | 322 | const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4*10]); |
626 | 322 | const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4*11]); |
627 | 322 | const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4*12]); |
628 | 322 | const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_DK[4*13]); |
629 | 322 | const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_DK[4*14]); |
630 | | |
631 | 1.42k | while(blocks >= 4) |
632 | 1.10k | { |
633 | 1.10k | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*0); |
634 | 1.10k | SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16*1); |
635 | 1.10k | SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16*2); |
636 | 1.10k | SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16*3); |
637 | | |
638 | 1.10k | keyxor(K0, B0, B1, B2, B3); |
639 | 1.10k | aesdec(K1, B0, B1, B2, B3); |
640 | 1.10k | aesdec(K2, B0, B1, B2, B3); |
641 | 1.10k | aesdec(K3, B0, B1, B2, B3); |
642 | 1.10k | aesdec(K4, B0, B1, B2, B3); |
643 | 1.10k | aesdec(K5, B0, B1, B2, B3); |
644 | 1.10k | aesdec(K6, B0, B1, B2, B3); |
645 | 1.10k | aesdec(K7, B0, B1, B2, B3); |
646 | 1.10k | aesdec(K8, B0, B1, B2, B3); |
647 | 1.10k | aesdec(K9, B0, B1, B2, B3); |
648 | 1.10k | aesdec(K10, B0, B1, B2, B3); |
649 | 1.10k | aesdec(K11, B0, B1, B2, B3); |
650 | 1.10k | aesdec(K12, B0, B1, B2, B3); |
651 | 1.10k | aesdec(K13, B0, B1, B2, B3); |
652 | 1.10k | aesdeclast(K14, B0, B1, B2, B3); |
653 | | |
654 | 1.10k | B0.store_le(out + 16*0); |
655 | 1.10k | B1.store_le(out + 16*1); |
656 | 1.10k | B2.store_le(out + 16*2); |
657 | 1.10k | B3.store_le(out + 16*3); |
658 | | |
659 | 1.10k | blocks -= 4; |
660 | 1.10k | in += 4*16; |
661 | 1.10k | out += 4*16; |
662 | 1.10k | } |
663 | | |
664 | 440 | for(size_t i = 0; i != blocks; ++i) |
665 | 118 | { |
666 | 118 | SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16*i); |
667 | | |
668 | 118 | B0 ^= K0; |
669 | | |
670 | 118 | aesdec(K1, B0); |
671 | 118 | aesdec(K2, B0); |
672 | 118 | aesdec(K3, B0); |
673 | 118 | aesdec(K4, B0); |
674 | 118 | aesdec(K5, B0); |
675 | 118 | aesdec(K6, B0); |
676 | 118 | aesdec(K7, B0); |
677 | 118 | aesdec(K8, B0); |
678 | 118 | aesdec(K9, B0); |
679 | 118 | aesdec(K10, B0); |
680 | 118 | aesdec(K11, B0); |
681 | 118 | aesdec(K12, B0); |
682 | 118 | aesdec(K13, B0); |
683 | 118 | aesdeclast(K14, B0); |
684 | | |
685 | 118 | B0.store_le(out + 16*i); |
686 | 118 | } |
687 | 322 | } |
688 | | |
689 | | /* |
690 | | * AES-256 Key Schedule |
691 | | */ |
692 | | BOTAN_FUNC_ISA("ssse3,aes") |
693 | | void AES_256::aesni_key_schedule(const uint8_t key[], size_t /*length*/) |
694 | 637 | { |
695 | 637 | m_EK.resize(60); |
696 | 637 | m_DK.resize(60); |
697 | | |
698 | 637 | const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); |
699 | 637 | const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16)); |
700 | | |
701 | 637 | const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01)); |
702 | 637 | const __m128i K3 = aes_256_key_expansion(K1, K2); |
703 | | |
704 | 637 | const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02)); |
705 | 637 | const __m128i K5 = aes_256_key_expansion(K3, K4); |
706 | | |
707 | 637 | const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04)); |
708 | 637 | const __m128i K7 = aes_256_key_expansion(K5, K6); |
709 | | |
710 | 637 | const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08)); |
711 | 637 | const __m128i K9 = aes_256_key_expansion(K7, K8); |
712 | | |
713 | 637 | const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10)); |
714 | 637 | const __m128i K11 = aes_256_key_expansion(K9, K10); |
715 | | |
716 | 637 | const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20)); |
717 | 637 | const __m128i K13 = aes_256_key_expansion(K11, K12); |
718 | | |
719 | 637 | const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40)); |
720 | | |
721 | 637 | __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); |
722 | 637 | _mm_storeu_si128(EK_mm , K0); |
723 | 637 | _mm_storeu_si128(EK_mm + 1, K1); |
724 | 637 | _mm_storeu_si128(EK_mm + 2, K2); |
725 | 637 | _mm_storeu_si128(EK_mm + 3, K3); |
726 | 637 | _mm_storeu_si128(EK_mm + 4, K4); |
727 | 637 | _mm_storeu_si128(EK_mm + 5, K5); |
728 | 637 | _mm_storeu_si128(EK_mm + 6, K6); |
729 | 637 | _mm_storeu_si128(EK_mm + 7, K7); |
730 | 637 | _mm_storeu_si128(EK_mm + 8, K8); |
731 | 637 | _mm_storeu_si128(EK_mm + 9, K9); |
732 | 637 | _mm_storeu_si128(EK_mm + 10, K10); |
733 | 637 | _mm_storeu_si128(EK_mm + 11, K11); |
734 | 637 | _mm_storeu_si128(EK_mm + 12, K12); |
735 | 637 | _mm_storeu_si128(EK_mm + 13, K13); |
736 | 637 | _mm_storeu_si128(EK_mm + 14, K14); |
737 | | |
738 | | // Now generate decryption keys |
739 | 637 | __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); |
740 | 637 | _mm_storeu_si128(DK_mm , K14); |
741 | 637 | _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13)); |
742 | 637 | _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12)); |
743 | 637 | _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11)); |
744 | 637 | _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10)); |
745 | 637 | _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9)); |
746 | 637 | _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8)); |
747 | 637 | _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7)); |
748 | 637 | _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6)); |
749 | 637 | _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5)); |
750 | 637 | _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4)); |
751 | 637 | _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3)); |
752 | 637 | _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2)); |
753 | 637 | _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1)); |
754 | 637 | _mm_storeu_si128(DK_mm + 14, K0); |
755 | 637 | } |
756 | | |
757 | | } |