/src/SymCrypt/lib/aes-xmm.c
Line | Count | Source |
1 | | // |
2 | | // aes-xmm.c code for AES implementation |
3 | | // |
4 | | // Copyright (c) Microsoft Corporation. Licensed under the MIT license. |
5 | | // |
6 | | // All XMM code for AES operations |
7 | | // Requires compiler support for ssse3, aesni and pclmulqdq |
8 | | // |
9 | | |
10 | | #include "precomp.h" |
11 | | |
12 | | #if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 |
13 | | |
14 | | #include "xtsaes_definitions.h" |
15 | | #include "ghash_definitions.h" |
16 | | |
17 | | VOID |
18 | | SYMCRYPT_CALL |
19 | | SymCryptAes4SboxXmm( _In_reads_(4) PCBYTE pIn, _Out_writes_(4) PBYTE pOut ) |
20 | 0 | { |
21 | 0 | __m128i x; |
22 | 0 | x = _mm_set1_epi32( *(int *) pIn ); |
23 | |
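    // _mm_aeskeygenassist_si128 returns SubWord() of input words 1 and 3 in result words 0 and 2
    // (and RotWord'ed copies XORed with the rcon in words 1 and 3). Since the input word was
    // broadcast to all four lanes above and the rcon is 0, the low 32 bits of the result are the
    // AES S-box applied to the 4 input bytes.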
24 | 0 | x = _mm_aeskeygenassist_si128( x, 0 ); |
25 | | |
26 | | // Could use _mm_storeu_si32( pOut, x ) but it is missing from some headers and _mm_store_ss will be as fast |
27 | 0 | _mm_store_ss( (float *) pOut, _mm_castsi128_ps(x) ); |
28 | 0 | } |
29 | | |
30 | | VOID |
31 | | SYMCRYPT_CALL |
32 | | SymCryptAesCreateDecryptionRoundKeyXmm( |
33 | | _In_reads_(16) PCBYTE pEncryptionRoundKey, |
34 | | _Out_writes_(16) PBYTE pDecryptionRoundKey ) |
35 | 0 | { |
36 | | // |
37 | | // On x86 our key structure is only 4-aligned (the best we can do) so we use unaligned load/stores. |
38 | | // On Amd64 our round keys are aligned, but recent CPUs have fast unaligned load/store if the address is |
39 | | // actually aligned properly. |
40 | | // |
41 | 0 | _mm_storeu_si128( (__m128i *) pDecryptionRoundKey, _mm_aesimc_si128( _mm_loadu_si128( (__m128i *)pEncryptionRoundKey ) ) ); |
42 | 0 | } |
43 | | |
44 | | // |
45 | | // The latency of the AES instructions increased to 8 cycles in Ivy Bridge, |
46 | | // and back to 7 in Haswell. |
47 | | // We use 8-parallel code to expose the maximum parallelism to the CPU. |
48 | | // On x86 it will introduce some register spilling, but the load/stores |
49 | | // should be able to hide behind the AES instruction latencies. |
50 | | // Silvermont x86 CPUs have AES-NI with latency = 8 and throughput = 5, so there |
51 | | // the CPU parallelism is low. |
52 | | // For things like BitLocker that is fine, but other uses, such as GCM and AES_CTR_DRBG, |
53 | | // use odd sizes. |
54 | | // We try to do 5-8 blocks in 8-parallel code, 2-4 blocks in 4-parallel code, and |
55 | | // 1 block in 1-parallel code. |
56 | | // This is a compromise; the big cores can do 8 parallel in about the time of a 4-parallel, |
57 | | // but Silvermont cannot and would pay a big price on small requests if we only use 8-parallel. |
58 | | // Doing only 8-parallel and then 1-parallel would penalize the big cores a lot. |
59 | | // |
60 | | // We used to have 7-parallel code, but common request sizes are not multiples of 7 |
61 | | // blocks so we end up doing a lot of extra work. This is especially expensive on |
62 | | // Silvermont where the extra work isn't hidden in the latencies. |
63 | | // |
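//
// Illustrative sketch (not part of the original file): the 5-8 / 2-4 / 1 block dispatch policy
// described above, written as a self-contained helper. The helper name is hypothetical; the real
// code below simply branches on cbData before invoking the AES_ENCRYPT_8 / AES_ENCRYPT_4 /
// AES_ENCRYPT_1 macros.
//
static SIZE_T SymCryptAesXmmParallelWidthSketch( SIZE_T nBlocks )
{
    // Full multiples of 8 blocks always take the 8-parallel loop; this picks the
    // width used for the 1-7 blocks that remain afterwards.
    if( nBlocks >= 5 )
    {
        return 8;   // 5-8 blocks: 8-parallel code (lanes without data are simply not stored)
    }
    if( nBlocks >= 2 )
    {
        return 4;   // 2-4 blocks: 4-parallel code
    }
    return 1;       // 1 block: 1-parallel code
}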
64 | | |
65 | 0 | #define AES_ENCRYPT_1( pExpandedKey, c0 ) \ |
66 | 0 | { \ |
67 | 0 | const BYTE (*keyPtr)[4][4]; \ |
68 | 0 | const BYTE (*keyLimit)[4][4]; \ |
69 | 0 | __m128i roundkey; \ |
70 | 0 | \ |
71 | 0 | keyPtr = &pExpandedKey->RoundKey[0]; \ |
72 | 0 | keyLimit = pExpandedKey->lastEncRoundKey; \ |
73 | 0 | \ |
74 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
75 | 0 | keyPtr ++; \ |
76 | 0 | \ |
77 | 0 | c0 = _mm_xor_si128( c0, roundkey ); \ |
78 | 0 | \ |
79 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
80 | 0 | keyPtr ++; \ |
81 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); \ |
82 | 0 | \ |
83 | 0 | do \ |
84 | 0 | { \ |
85 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
86 | 0 | keyPtr ++; \ |
87 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); \ |
88 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
89 | 0 | keyPtr ++; \ |
90 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); \ |
91 | 0 | } while( keyPtr < keyLimit ); \ |
92 | 0 | \ |
93 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
94 | 0 | \ |
95 | 0 | c0 = _mm_aesenclast_si128( c0, roundkey ); \ |
96 | 0 | }; |
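// Note on the loop structure above: lastEncRoundKey points at the final round key, so the same
// macro covers AES-128, AES-192 and AES-256. The number of AESENC rounds (9, 11 or 13) is odd in
// every case, so one round before the do/while plus two rounds per iteration leaves keyPtr exactly
// on the last round key for the AESENCLAST. AES_DECRYPT_1 mirrors this structure; the 4- and
// 8-parallel macros below process one round key per loop iteration instead.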
97 | | |
98 | | |
99 | | // Perform AES encryption without the first round key and with a specified last round key |
100 | | // |
101 | | // For algorithms where performance is dominated by a chain of dependent AES rounds (e.g. CBC encryption, CCM, CMAC) |
102 | | // we can gain a reasonable performance uplift by computing (last round key ^ next plaintext block ^ first round key) |
103 | | // off the critical path and using this computed value in place of last round key in AESENCLAST instructions. |
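// The identity that makes the merged key work: _mm_aesenclast_si128 ends by XORing its round-key
// operand into the state, so for any state s and block p
//     _mm_xor_si128( _mm_aesenclast_si128( s, kLast ), _mm_xor_si128( p, k0 ) )
//         == _mm_aesenclast_si128( s, _mm_xor_si128( kLast, _mm_xor_si128( p, k0 ) ) )
// which lets (last round key ^ next plaintext block ^ first round key) be prepared while the
// previous block is still being encrypted.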
104 | 0 | #define AES_ENCRYPT_1_CHAIN( pExpandedKey, cipherState, mergedLastRoundKey ) \ |
105 | 0 | { \ |
106 | 0 | const BYTE (*keyPtr)[4][4]; \ |
107 | 0 | const BYTE (*keyLimit)[4][4]; \ |
108 | 0 | __m128i roundkey; \ |
109 | 0 | \ |
110 | 0 | keyPtr = &pExpandedKey->RoundKey[1]; \ |
111 | 0 | keyLimit = pExpandedKey->lastEncRoundKey; \ |
112 | 0 | \ |
113 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
114 | 0 | keyPtr ++; \ |
115 | 0 | \ |
116 | 0 | cipherState = _mm_aesenc_si128( cipherState, roundkey ); \ |
117 | 0 | \ |
118 | 0 | do \ |
119 | 0 | { \ |
120 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
121 | 0 | keyPtr ++; \ |
122 | 0 | cipherState = _mm_aesenc_si128( cipherState, roundkey ); \ |
123 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
124 | 0 | keyPtr ++; \ |
125 | 0 | cipherState = _mm_aesenc_si128( cipherState, roundkey ); \ |
126 | 0 | } while( keyPtr < keyLimit ); \ |
127 | 0 | \ |
128 | 0 | cipherState = _mm_aesenclast_si128( cipherState, mergedLastRoundKey ); \ |
129 | 0 | }; |
130 | | |
131 | 0 | #define AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \ |
132 | 0 | { \ |
133 | 0 | const BYTE (*keyPtr)[4][4]; \ |
134 | 0 | const BYTE (*keyLimit)[4][4]; \ |
135 | 0 | __m128i roundkey; \ |
136 | 0 | \ |
137 | 0 | keyPtr = &pExpandedKey->RoundKey[0]; \ |
138 | 0 | keyLimit = pExpandedKey->lastEncRoundKey; \ |
139 | 0 | \ |
140 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
141 | 0 | keyPtr ++; \ |
142 | 0 | \ |
143 | 0 | c0 = _mm_xor_si128( c0, roundkey ); \ |
144 | 0 | c1 = _mm_xor_si128( c1, roundkey ); \ |
145 | 0 | c2 = _mm_xor_si128( c2, roundkey ); \ |
146 | 0 | c3 = _mm_xor_si128( c3, roundkey ); \ |
147 | 0 | \ |
148 | 0 | do \ |
149 | 0 | { \ |
150 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
151 | 0 | keyPtr ++; \ |
152 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); \ |
153 | 0 | c1 = _mm_aesenc_si128( c1, roundkey ); \ |
154 | 0 | c2 = _mm_aesenc_si128( c2, roundkey ); \ |
155 | 0 | c3 = _mm_aesenc_si128( c3, roundkey ); \ |
156 | 0 | } while( keyPtr < keyLimit ); \ |
157 | 0 | \ |
158 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
159 | 0 | \ |
160 | 0 | c0 = _mm_aesenclast_si128( c0, roundkey ); \ |
161 | 0 | c1 = _mm_aesenclast_si128( c1, roundkey ); \ |
162 | 0 | c2 = _mm_aesenclast_si128( c2, roundkey ); \ |
163 | 0 | c3 = _mm_aesenclast_si128( c3, roundkey ); \ |
164 | 0 | }; |
165 | | |
166 | 0 | #define AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \ |
167 | 0 | { \ |
168 | 0 | const BYTE (*keyPtr)[4][4]; \ |
169 | 0 | const BYTE (*keyLimit)[4][4]; \ |
170 | 0 | __m128i roundkey; \ |
171 | 0 | \ |
172 | 0 | keyPtr = &pExpandedKey->RoundKey[0]; \ |
173 | 0 | keyLimit = pExpandedKey->lastEncRoundKey; \ |
174 | 0 | \ |
175 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
176 | 0 | keyPtr ++; \ |
177 | 0 | \ |
178 | 0 | c0 = _mm_xor_si128( c0, roundkey ); \ |
179 | 0 | c1 = _mm_xor_si128( c1, roundkey ); \ |
180 | 0 | c2 = _mm_xor_si128( c2, roundkey ); \ |
181 | 0 | c3 = _mm_xor_si128( c3, roundkey ); \ |
182 | 0 | c4 = _mm_xor_si128( c4, roundkey ); \ |
183 | 0 | c5 = _mm_xor_si128( c5, roundkey ); \ |
184 | 0 | c6 = _mm_xor_si128( c6, roundkey ); \ |
185 | 0 | c7 = _mm_xor_si128( c7, roundkey ); \ |
186 | 0 | \ |
187 | 0 | do \ |
188 | 0 | { \ |
189 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
190 | 0 | keyPtr ++; \ |
191 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); \ |
192 | 0 | c1 = _mm_aesenc_si128( c1, roundkey ); \ |
193 | 0 | c2 = _mm_aesenc_si128( c2, roundkey ); \ |
194 | 0 | c3 = _mm_aesenc_si128( c3, roundkey ); \ |
195 | 0 | c4 = _mm_aesenc_si128( c4, roundkey ); \ |
196 | 0 | c5 = _mm_aesenc_si128( c5, roundkey ); \ |
197 | 0 | c6 = _mm_aesenc_si128( c6, roundkey ); \ |
198 | 0 | c7 = _mm_aesenc_si128( c7, roundkey ); \ |
199 | 0 | } while( keyPtr < keyLimit ); \ |
200 | 0 | \ |
201 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
202 | 0 | \ |
203 | 0 | c0 = _mm_aesenclast_si128( c0, roundkey ); \ |
204 | 0 | c1 = _mm_aesenclast_si128( c1, roundkey ); \ |
205 | 0 | c2 = _mm_aesenclast_si128( c2, roundkey ); \ |
206 | 0 | c3 = _mm_aesenclast_si128( c3, roundkey ); \ |
207 | 0 | c4 = _mm_aesenclast_si128( c4, roundkey ); \ |
208 | 0 | c5 = _mm_aesenclast_si128( c5, roundkey ); \ |
209 | 0 | c6 = _mm_aesenclast_si128( c6, roundkey ); \ |
210 | 0 | c7 = _mm_aesenclast_si128( c7, roundkey ); \ |
211 | 0 | }; |
212 | | |
213 | 0 | #define AES_DECRYPT_1( pExpandedKey, c0 ) \ |
214 | 0 | { \ |
215 | 0 | const BYTE (*keyPtr)[4][4]; \ |
216 | 0 | const BYTE (*keyLimit)[4][4]; \ |
217 | 0 | __m128i roundkey; \ |
218 | 0 | \ |
219 | 0 | keyPtr = pExpandedKey->lastEncRoundKey; \ |
220 | 0 | keyLimit = pExpandedKey->lastDecRoundKey; \ |
221 | 0 | \ |
222 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
223 | 0 | keyPtr ++; \ |
224 | 0 | \ |
225 | 0 | c0 = _mm_xor_si128( c0, roundkey ); \ |
226 | 0 | \ |
227 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
228 | 0 | keyPtr ++; \ |
229 | 0 | c0 = _mm_aesdec_si128( c0, roundkey ); \ |
230 | 0 | \ |
231 | 0 | do \ |
232 | 0 | { \ |
233 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
234 | 0 | keyPtr ++; \ |
235 | 0 | c0 = _mm_aesdec_si128( c0, roundkey ); \ |
236 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
237 | 0 | keyPtr ++; \ |
238 | 0 | c0 = _mm_aesdec_si128( c0, roundkey ); \ |
239 | 0 | } while( keyPtr < keyLimit ); \ |
240 | 0 | \ |
241 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
242 | 0 | \ |
243 | 0 | c0 = _mm_aesdeclast_si128( c0, roundkey ); \ |
244 | 0 | }; |
245 | | |
246 | 0 | #define AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \ |
247 | 0 | { \ |
248 | 0 | const BYTE (*keyPtr)[4][4]; \ |
249 | 0 | const BYTE (*keyLimit)[4][4]; \ |
250 | 0 | __m128i roundkey; \ |
251 | 0 | \ |
252 | 0 | keyPtr = pExpandedKey->lastEncRoundKey; \ |
253 | 0 | keyLimit = pExpandedKey->lastDecRoundKey; \ |
254 | 0 | \ |
255 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
256 | 0 | keyPtr ++; \ |
257 | 0 | \ |
258 | 0 | c0 = _mm_xor_si128( c0, roundkey ); \ |
259 | 0 | c1 = _mm_xor_si128( c1, roundkey ); \ |
260 | 0 | c2 = _mm_xor_si128( c2, roundkey ); \ |
261 | 0 | c3 = _mm_xor_si128( c3, roundkey ); \ |
262 | 0 | \ |
263 | 0 | do \ |
264 | 0 | { \ |
265 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
266 | 0 | keyPtr ++; \ |
267 | 0 | c0 = _mm_aesdec_si128( c0, roundkey ); \ |
268 | 0 | c1 = _mm_aesdec_si128( c1, roundkey ); \ |
269 | 0 | c2 = _mm_aesdec_si128( c2, roundkey ); \ |
270 | 0 | c3 = _mm_aesdec_si128( c3, roundkey ); \ |
271 | 0 | } while( keyPtr < keyLimit ); \ |
272 | 0 | \ |
273 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
274 | 0 | \ |
275 | 0 | c0 = _mm_aesdeclast_si128( c0, roundkey ); \ |
276 | 0 | c1 = _mm_aesdeclast_si128( c1, roundkey ); \ |
277 | 0 | c2 = _mm_aesdeclast_si128( c2, roundkey ); \ |
278 | 0 | c3 = _mm_aesdeclast_si128( c3, roundkey ); \ |
279 | 0 | }; |
280 | | |
281 | 0 | #define AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \ |
282 | 0 | { \ |
283 | 0 | const BYTE (*keyPtr)[4][4]; \ |
284 | 0 | const BYTE (*keyLimit)[4][4]; \ |
285 | 0 | __m128i roundkey; \ |
286 | 0 | \ |
287 | 0 | keyPtr = pExpandedKey->lastEncRoundKey; \ |
288 | 0 | keyLimit = pExpandedKey->lastDecRoundKey; \ |
289 | 0 | \ |
290 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
291 | 0 | keyPtr ++; \ |
292 | 0 | \ |
293 | 0 | c0 = _mm_xor_si128( c0, roundkey ); \ |
294 | 0 | c1 = _mm_xor_si128( c1, roundkey ); \ |
295 | 0 | c2 = _mm_xor_si128( c2, roundkey ); \ |
296 | 0 | c3 = _mm_xor_si128( c3, roundkey ); \ |
297 | 0 | c4 = _mm_xor_si128( c4, roundkey ); \ |
298 | 0 | c5 = _mm_xor_si128( c5, roundkey ); \ |
299 | 0 | c6 = _mm_xor_si128( c6, roundkey ); \ |
300 | 0 | c7 = _mm_xor_si128( c7, roundkey ); \ |
301 | 0 | \ |
302 | 0 | do \ |
303 | 0 | { \ |
304 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
305 | 0 | keyPtr ++; \ |
306 | 0 | c0 = _mm_aesdec_si128( c0, roundkey ); \ |
307 | 0 | c1 = _mm_aesdec_si128( c1, roundkey ); \ |
308 | 0 | c2 = _mm_aesdec_si128( c2, roundkey ); \ |
309 | 0 | c3 = _mm_aesdec_si128( c3, roundkey ); \ |
310 | 0 | c4 = _mm_aesdec_si128( c4, roundkey ); \ |
311 | 0 | c5 = _mm_aesdec_si128( c5, roundkey ); \ |
312 | 0 | c6 = _mm_aesdec_si128( c6, roundkey ); \ |
313 | 0 | c7 = _mm_aesdec_si128( c7, roundkey ); \ |
314 | 0 | } while( keyPtr < keyLimit ); \ |
315 | 0 | \ |
316 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
317 | 0 | \ |
318 | 0 | c0 = _mm_aesdeclast_si128( c0, roundkey ); \ |
319 | 0 | c1 = _mm_aesdeclast_si128( c1, roundkey ); \ |
320 | 0 | c2 = _mm_aesdeclast_si128( c2, roundkey ); \ |
321 | 0 | c3 = _mm_aesdeclast_si128( c3, roundkey ); \ |
322 | 0 | c4 = _mm_aesdeclast_si128( c4, roundkey ); \ |
323 | 0 | c5 = _mm_aesdeclast_si128( c5, roundkey ); \ |
324 | 0 | c6 = _mm_aesdeclast_si128( c6, roundkey ); \ |
325 | 0 | c7 = _mm_aesdeclast_si128( c7, roundkey ); \ |
326 | 0 | }; |
327 | | |
328 | | |
329 | | // |
330 | | // The EncryptXmm code is tested through the CFB mode encryption, which has no further optimizations. |
331 | | // |
332 | | VOID |
333 | | SYMCRYPT_CALL |
334 | | SymCryptAesEncryptXmm( |
335 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
336 | | _In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PCBYTE pbSrc, |
337 | | _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbDst ) |
338 | 0 | { |
339 | 0 | __m128i c; |
340 | |
341 | 0 | c = _mm_loadu_si128( ( __m128i * ) pbSrc); |
342 | |
343 | 0 | AES_ENCRYPT_1( pExpandedKey, c ); |
344 | |
345 | 0 | _mm_storeu_si128( (__m128i *) pbDst, c ); |
346 | 0 | } |
347 | | |
348 | | // |
349 | | // The DecryptXmm code is tested through the EcbDecrypt calls, which have no further optimizations. |
350 | | // |
351 | | VOID |
352 | | SYMCRYPT_CALL |
353 | | SymCryptAesDecryptXmm( |
354 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
355 | | _In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PCBYTE pbSrc, |
356 | | _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbDst ) |
357 | 0 | { |
358 | 0 | __m128i c; |
359 | |
360 | 0 | c = _mm_loadu_si128( ( __m128i * ) pbSrc); |
361 | |
362 | 0 | AES_DECRYPT_1( pExpandedKey, c ); |
363 | |
364 | 0 | _mm_storeu_si128( (__m128i *) pbDst, c ); |
365 | 0 | } |
366 | | |
367 | | // Disable warnings and VC++ runtime checks for use of uninitialized values (by design) |
368 | | #pragma warning(push) |
369 | | #pragma warning( disable: 6001 4701 ) |
370 | | #pragma runtime_checks( "u", off ) |
371 | | VOID |
372 | | SYMCRYPT_CALL |
373 | | SymCryptAesEcbEncryptXmm( |
374 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
375 | | _In_reads_( cbData ) PCBYTE pbSrc, |
376 | | _Out_writes_( cbData ) PBYTE pbDst, |
377 | | SIZE_T cbData ) |
378 | 0 | { |
379 | 0 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; |
380 | |
381 | 0 | while( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE ) |
382 | 0 | { |
383 | 0 | c0 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 )); |
384 | 0 | c1 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 )); |
385 | 0 | c2 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 )); |
386 | 0 | c3 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 )); |
387 | 0 | c4 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 )); |
388 | 0 | c5 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 )); |
389 | 0 | c6 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 )); |
390 | 0 | c7 = _mm_loadu_si128( ( __m128i * ) (pbSrc +112 )); |
391 | |
392 | 0 | AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ); |
393 | |
394 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0 ), c0 ); |
395 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 16 ), c1 ); |
396 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 32 ), c2 ); |
397 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 48 ), c3 ); |
398 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 64 ), c4 ); |
399 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 80 ), c5 ); |
400 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 96 ), c6 ); |
401 | 0 | _mm_storeu_si128( (__m128i *) (pbDst +112 ), c7 ); |
402 | |
403 | 0 | pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
404 | 0 | pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
405 | 0 | cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE; |
406 | 0 | } |
407 | |
408 | 0 | if( cbData < 16 ) |
409 | 0 | { |
410 | 0 | return; |
411 | 0 | } |
412 | | |
413 | 0 | c0 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 )); |
414 | 0 | if( cbData >= 32 ) |
415 | 0 | { |
416 | 0 | c1 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 )); |
417 | 0 | if( cbData >= 48 ) |
418 | 0 | { |
419 | 0 | c2 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 )); |
420 | 0 | if( cbData >= 64 ) |
421 | 0 | { |
422 | 0 | c3 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 )); |
423 | 0 | if( cbData >= 80 ) |
424 | 0 | { |
425 | 0 | c4 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 )); |
426 | 0 | if( cbData >= 96 ) |
427 | 0 | { |
428 | 0 | c5 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 )); |
429 | 0 | if( cbData >= 112 ) |
430 | 0 | { |
431 | 0 | c6 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 )); |
432 | 0 | } |
433 | 0 | } |
434 | 0 | } |
435 | 0 | } |
436 | 0 | } |
437 | 0 | } |
438 | |
439 | 0 | if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE ) |
440 | 0 | { |
441 | 0 | AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ); |
442 | 0 | } |
443 | 0 | else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE ) |
444 | 0 | { |
445 | 0 | AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 ); |
446 | 0 | } |
447 | 0 | else |
448 | 0 | { |
449 | 0 | AES_ENCRYPT_1( pExpandedKey, c0 ); |
450 | 0 | } |
451 | |
452 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0 ), c0 ); |
453 | 0 | if( cbData >= 32 ) |
454 | 0 | { |
455 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 16 ), c1 ); |
456 | 0 | if( cbData >= 48 ) |
457 | 0 | { |
458 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 32 ), c2 ); |
459 | 0 | if( cbData >= 64 ) |
460 | 0 | { |
461 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 48 ), c3 ); |
462 | 0 | if( cbData >= 80 ) |
463 | 0 | { |
464 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 64 ), c4 ); |
465 | 0 | if( cbData >= 96 ) |
466 | 0 | { |
467 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 80 ), c5 ); |
468 | 0 | if( cbData >= 112 ) |
469 | 0 | { |
470 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 96 ), c6 ); |
471 | 0 | } |
472 | 0 | } |
473 | 0 | } |
474 | 0 | } |
475 | 0 | } |
476 | 0 | } |
477 | 0 | } |
478 | | #pragma runtime_checks( "u", restore ) |
479 | | #pragma warning( pop ) |
480 | | |
481 | | |
482 | | |
483 | | VOID |
484 | | SYMCRYPT_CALL |
485 | | SymCryptAesCbcEncryptXmm( |
486 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
487 | | _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, |
488 | | _In_reads_( cbData ) PCBYTE pbSrc, |
489 | | _Out_writes_( cbData ) PBYTE pbDst, |
490 | | SIZE_T cbData ) |
491 | 0 | { |
492 | 0 | __m128i c = _mm_loadu_si128( (__m128i *) pbChainingValue ); |
493 | 0 | __m128i rk0 = _mm_loadu_si128( (__m128i *) &pExpandedKey->RoundKey[0] ); |
494 | 0 | __m128i rkLast = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey ); |
495 | 0 | __m128i d; |
496 | |
497 | 0 | if (cbData < SYMCRYPT_AES_BLOCK_SIZE) |
498 | 0 | return; |
499 | | |
500 | | // This algorithm is dominated by a chain of dependent AES rounds, so we want to avoid XOR |
501 | | // instructions on the critical path where possible. |
502 | | // We can compute (last round key ^ next plaintext block ^ first round key) off the critical |
503 | | // path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in |
504 | | // the main loop. |
505 | 0 | d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), rk0 ); |
506 | 0 | c = _mm_xor_si128( c, d ); |
507 | 0 | pbSrc += SYMCRYPT_AES_BLOCK_SIZE; |
508 | 0 | cbData -= SYMCRYPT_AES_BLOCK_SIZE; |
509 | |
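    // Explanatory note on the loop below: at the top of each iteration c already holds
    // (chaining value ^ plaintext ^ rk0) for the block being encrypted. Each iteration loads the
    // following block as d = (plaintext ^ rk0), folds (d ^ rkLast) into the AESENCLAST of the
    // chained encryption, and recovers the finished ciphertext as c ^ d; that leaves c in the same
    // (chaining value ^ plaintext ^ rk0) form for the next iteration, so no XOR sits on the
    // critical path. The final block is completed after the loop with the unmerged last round key.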
510 | 0 | while( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) |
511 | 0 | { |
512 | 0 | d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), rk0 ); |
513 | 0 | AES_ENCRYPT_1_CHAIN( pExpandedKey, c, _mm_xor_si128(d, rkLast ) ); |
514 | 0 | _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128(c, d) ); |
515 | |
516 | 0 | pbSrc += SYMCRYPT_AES_BLOCK_SIZE; |
517 | 0 | pbDst += SYMCRYPT_AES_BLOCK_SIZE; |
518 | 0 | cbData -= SYMCRYPT_AES_BLOCK_SIZE; |
519 | 0 | } |
520 | 0 | AES_ENCRYPT_1_CHAIN( pExpandedKey, c, rkLast ); |
521 | 0 | _mm_storeu_si128( (__m128i *) pbDst, c ); |
522 | 0 | _mm_storeu_si128( (__m128i *) pbChainingValue, c ); |
523 | 0 | } |
524 | | |
525 | | // Disable warnings and VC++ runtime checks for use of uninitialized values (by design) |
526 | | #pragma warning(push) |
527 | | #pragma warning( disable: 6001 4701 ) |
528 | | #pragma runtime_checks( "u", off ) |
529 | | VOID |
530 | | SYMCRYPT_CALL |
531 | | SymCryptAesCbcDecryptXmm( |
532 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
533 | | _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, |
534 | | _In_reads_( cbData ) PCBYTE pbSrc, |
535 | | _Out_writes_( cbData ) PBYTE pbDst, |
536 | | SIZE_T cbData ) |
537 | 0 | { |
538 | 0 | __m128i chain; |
539 | 0 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; |
540 | 0 | __m128i d0, d1, d2, d3, d4, d5, d6, d7; |
541 | |
542 | 0 | if( cbData < SYMCRYPT_AES_BLOCK_SIZE ) |
543 | 0 | { |
544 | 0 | return; |
545 | 0 | } |
546 | | |
547 | 0 | chain = _mm_loadu_si128( (__m128i *) pbChainingValue ); |
548 | | |
549 | | // |
550 | | // First we do all multiples of 8 blocks |
551 | | // |
552 | |
553 | 0 | while( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE ) |
554 | 0 | { |
555 | 0 | d0 = c0 = _mm_loadu_si128( (__m128i *) (pbSrc + 0 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
556 | 0 | d1 = c1 = _mm_loadu_si128( (__m128i *) (pbSrc + 1 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
557 | 0 | d2 = c2 = _mm_loadu_si128( (__m128i *) (pbSrc + 2 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
558 | 0 | d3 = c3 = _mm_loadu_si128( (__m128i *) (pbSrc + 3 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
559 | 0 | d4 = c4 = _mm_loadu_si128( (__m128i *) (pbSrc + 4 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
560 | 0 | d5 = c5 = _mm_loadu_si128( (__m128i *) (pbSrc + 5 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
561 | 0 | d6 = c6 = _mm_loadu_si128( (__m128i *) (pbSrc + 6 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
562 | 0 | d7 = c7 = _mm_loadu_si128( (__m128i *) (pbSrc + 7 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
563 | |
564 | 0 | AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ); |
565 | |
566 | 0 | c0 = _mm_xor_si128( c0, chain ); |
567 | 0 | c1 = _mm_xor_si128( c1, d0 ); |
568 | 0 | c2 = _mm_xor_si128( c2, d1 ); |
569 | 0 | c3 = _mm_xor_si128( c3, d2 ); |
570 | 0 | c4 = _mm_xor_si128( c4, d3 ); |
571 | 0 | c5 = _mm_xor_si128( c5, d4 ); |
572 | 0 | c6 = _mm_xor_si128( c6, d5 ); |
573 | 0 | c7 = _mm_xor_si128( c7, d6 ); |
574 | 0 | chain = d7; |
575 | |
576 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0 * SYMCRYPT_AES_BLOCK_SIZE ), c0 ); |
577 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 1 * SYMCRYPT_AES_BLOCK_SIZE ), c1 ); |
578 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 2 * SYMCRYPT_AES_BLOCK_SIZE ), c2 ); |
579 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 3 * SYMCRYPT_AES_BLOCK_SIZE ), c3 ); |
580 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 4 * SYMCRYPT_AES_BLOCK_SIZE ), c4 ); |
581 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 5 * SYMCRYPT_AES_BLOCK_SIZE ), c5 ); |
582 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 6 * SYMCRYPT_AES_BLOCK_SIZE ), c6 ); |
583 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 7 * SYMCRYPT_AES_BLOCK_SIZE ), c7 ); |
584 | |
585 | 0 | pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
586 | 0 | pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
587 | 0 | cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE; |
588 | 0 | } |
589 | |
590 | 0 | if( cbData >= 16 ) |
591 | 0 | { |
592 | | // |
593 | | // There is remaining work to be done |
594 | | // |
595 | 0 | d0 = c0 = _mm_loadu_si128( (__m128i *) (pbSrc + 0 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
596 | 0 | if( cbData >= 32 ) |
597 | 0 | { |
598 | 0 | d1 = c1 = _mm_loadu_si128( (__m128i *) (pbSrc + 1 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
599 | 0 | if( cbData >= 48 ) |
600 | 0 | { |
601 | 0 | d2 = c2 = _mm_loadu_si128( (__m128i *) (pbSrc + 2 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
602 | 0 | if( cbData >= 64 ) |
603 | 0 | { |
604 | 0 | d3 = c3 = _mm_loadu_si128( (__m128i *) (pbSrc + 3 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
605 | 0 | if( cbData >= 80 ) |
606 | 0 | { |
607 | 0 | d4 = c4 = _mm_loadu_si128( (__m128i *) (pbSrc + 4 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
608 | 0 | if( cbData >= 96 ) |
609 | 0 | { |
610 | 0 | d5 = c5 = _mm_loadu_si128( (__m128i *) (pbSrc + 5 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
611 | 0 | if( cbData >= 112 ) |
612 | 0 | { |
613 | 0 | d6 = c6 = _mm_loadu_si128( (__m128i *) (pbSrc + 6 * SYMCRYPT_AES_BLOCK_SIZE ) ); |
614 | 0 | } |
615 | 0 | } |
616 | 0 | } |
617 | 0 | } |
618 | 0 | } |
619 | 0 | } |
620 | | |
621 | | // |
622 | | // Decrypt 1, 4, or 8 blocks in AES-CBC mode. This might decrypt uninitialized registers, |
623 | | // but those will not be used when we store the results. |
624 | | // |
625 | 0 | if( cbData > 4 * SYMCRYPT_AES_BLOCK_SIZE ) |
626 | 0 | { |
627 | 0 | AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ); |
628 | 0 | c0 = _mm_xor_si128( c0, chain ); |
629 | 0 | c1 = _mm_xor_si128( c1, d0 ); |
630 | 0 | c2 = _mm_xor_si128( c2, d1 ); |
631 | 0 | c3 = _mm_xor_si128( c3, d2 ); |
632 | 0 | c4 = _mm_xor_si128( c4, d3 ); |
633 | 0 | c5 = _mm_xor_si128( c5, d4 ); |
634 | 0 | c6 = _mm_xor_si128( c6, d5 ); |
635 | 0 | } |
636 | 0 | else if( cbData > SYMCRYPT_AES_BLOCK_SIZE ) |
637 | 0 | { |
638 | 0 | AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 ); |
639 | 0 | c0 = _mm_xor_si128( c0, chain ); |
640 | 0 | c1 = _mm_xor_si128( c1, d0 ); |
641 | 0 | c2 = _mm_xor_si128( c2, d1 ); |
642 | 0 | c3 = _mm_xor_si128( c3, d2 ); |
643 | 0 | } else |
644 | 0 | { |
645 | 0 | AES_DECRYPT_1( pExpandedKey, c0 ); |
646 | 0 | c0 = _mm_xor_si128( c0, chain ); |
647 | 0 | } |
648 | |
649 | 0 | chain = _mm_loadu_si128( (__m128i *) (pbSrc + cbData - SYMCRYPT_AES_BLOCK_SIZE ) ); |
650 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0 * SYMCRYPT_AES_BLOCK_SIZE ), c0 ); |
651 | 0 | if( cbData >= 32 ) |
652 | 0 | { |
653 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 1 * SYMCRYPT_AES_BLOCK_SIZE ), c1 ); |
654 | 0 | if( cbData >= 48 ) |
655 | 0 | { |
656 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 2 * SYMCRYPT_AES_BLOCK_SIZE ), c2 ); |
657 | 0 | if( cbData >= 64 ) |
658 | 0 | { |
659 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 3 * SYMCRYPT_AES_BLOCK_SIZE ), c3 ); |
660 | 0 | if( cbData >= 80 ) |
661 | 0 | { |
662 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 4 * SYMCRYPT_AES_BLOCK_SIZE ), c4 ); |
663 | 0 | if( cbData >= 96 ) |
664 | 0 | { |
665 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 5 * SYMCRYPT_AES_BLOCK_SIZE ), c5 ); |
666 | 0 | if( cbData >= 112 ) |
667 | 0 | { |
668 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 6 * SYMCRYPT_AES_BLOCK_SIZE ), c6 ); |
669 | 0 | } |
670 | 0 | } |
671 | 0 | } |
672 | 0 | } |
673 | 0 | } |
674 | 0 | } |
675 | 0 | } |
676 | |
677 | 0 | _mm_storeu_si128( (__m128i *) pbChainingValue, chain ); |
678 | |
679 | 0 | return; |
680 | 0 | } |
681 | | #pragma runtime_checks( "u", restore ) |
682 | | #pragma warning( pop ) |
683 | | |
684 | | VOID |
685 | | SYMCRYPT_CALL |
686 | | SymCryptAesCbcMacXmm( |
687 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
688 | | _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, |
689 | | _In_reads_( cbData ) PCBYTE pbData, |
690 | | SIZE_T cbData ) |
691 | 0 | { |
692 | 0 | __m128i c = _mm_loadu_si128( (__m128i *) pbChainingValue ); |
693 | 0 | __m128i rk0 = _mm_loadu_si128( (__m128i *) &pExpandedKey->RoundKey[0] ); |
694 | 0 | __m128i rkLast = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey ); |
695 | 0 | __m128i d, rk0AndLast; |
696 | |
697 | 0 | if (cbData < SYMCRYPT_AES_BLOCK_SIZE) |
698 | 0 | return; |
699 | | |
700 | | // This algorithm is dominated by a chain of dependent AES rounds, so we want to avoid XOR |
701 | | // instructions on the critical path where possible. |
702 | | // We can compute (last round key ^ next plaintext block ^ first round key) off the critical |
703 | | // path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in |
704 | | // the main loop. |
705 | 0 | d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbData ), rk0 ); |
706 | 0 | c = _mm_xor_si128( c, d ); |
707 | 0 | pbData += SYMCRYPT_AES_BLOCK_SIZE; |
708 | 0 | cbData -= SYMCRYPT_AES_BLOCK_SIZE; |
709 | | |
710 | | // As we don't compute ciphertext here, we only need to XOR rk0 and rkLast once |
711 | 0 | rk0AndLast = _mm_xor_si128( rk0, rkLast ); |
712 | |
713 | 0 | while( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) |
714 | 0 | { |
715 | 0 | d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbData ), rk0AndLast ); |
716 | 0 | AES_ENCRYPT_1_CHAIN( pExpandedKey, c, d ); |
717 | |
718 | 0 | pbData += SYMCRYPT_AES_BLOCK_SIZE; |
719 | 0 | cbData -= SYMCRYPT_AES_BLOCK_SIZE; |
720 | 0 | } |
721 | 0 | AES_ENCRYPT_1_CHAIN( pExpandedKey, c, rkLast ); |
722 | 0 | _mm_storeu_si128( (__m128i *) pbChainingValue, c ); |
723 | 0 | } |
724 | | |
725 | | |
726 | | #pragma warning(push) |
727 | | #pragma warning( disable:4701 ) // "Use of uninitialized variable" |
728 | | #pragma runtime_checks( "u", off ) |
729 | | |
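// The CTR-mode implementations are generated by textually including aes-pattern.c twice; the
// macro bindings around each #include choose the generated function name and bind
// MM_ADD_EPIXX / MM_SUB_EPIXX to 64-bit or 32-bit lane arithmetic, yielding
// SymCryptAesCtrMsb64Xmm and SymCryptAesCtrMsb32Xmm respectively.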
730 | | #define SYMCRYPT_AesCtrMsbXxXmm SymCryptAesCtrMsb64Xmm |
731 | 0 | #define MM_ADD_EPIXX _mm_add_epi64 |
732 | 0 | #define MM_SUB_EPIXX _mm_sub_epi64 |
733 | | |
734 | | #include "aes-pattern.c" |
735 | | |
736 | | #undef MM_SUB_EPIXX |
737 | | #undef MM_ADD_EPIXX |
738 | | #undef SYMCRYPT_AesCtrMsbXxXmm |
739 | | |
740 | | #define SYMCRYPT_AesCtrMsbXxXmm SymCryptAesCtrMsb32Xmm |
741 | 0 | #define MM_ADD_EPIXX _mm_add_epi32 |
742 | 0 | #define MM_SUB_EPIXX _mm_sub_epi32 |
743 | | |
744 | | #include "aes-pattern.c" |
745 | | |
746 | | #undef MM_SUB_EPIXX |
747 | | #undef MM_ADD_EPIXX |
748 | | #undef SYMCRYPT_AesCtrMsbXxXmm |
749 | | |
750 | | #pragma runtime_checks( "u", restore ) |
751 | | #pragma warning(pop) |
752 | | |
753 | | /* |
754 | | if( cbData >= 16 ) |
755 | | { |
756 | | if( cbData >= 32 ) |
757 | | { |
758 | | if( cbData >= 48 ) |
759 | | { |
760 | | if( cbData >= 64 ) |
761 | | { |
762 | | if( cbData >= 80 ) |
763 | | { |
764 | | if( cbData >= 96 ) |
765 | | { |
766 | | if( cbData >= 112 ) |
767 | | { |
768 | | } |
769 | | } |
770 | | } |
771 | | } |
772 | | } |
773 | | } |
774 | | } |
775 | | */ |
776 | | |
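// The XTS functions below step the tweak with the XTS_MUL_ALPHA / XTS_MUL_ALPHA4 /
// XTS_MUL_ALPHA_Scalar macros from xtsaes_definitions.h. For reference, this is a plain-C sketch
// of what a single multiplication by alpha (i.e. by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1,
// hence the 0x87 constant in XTS_ALPHA_MASK) is assumed to compute on a little-endian 128-bit
// tweak; the helper name is hypothetical and the macros implement it with SSE/scalar instructions.
static VOID XtsMulAlphaSketch( UINT64* pLow, UINT64* pHigh )
{
    UINT64 carry = *pHigh >> 63;                    // bit 127 of the tweak, shifted out below
    *pHigh = (*pHigh << 1) | (*pLow >> 63);         // shift the 128-bit tweak left by one bit
    *pLow  = (*pLow << 1) ^ (carry * 0x87);         // fold the carry back in with the XTS polynomial
}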
777 | | VOID |
778 | | SYMCRYPT_CALL |
779 | | SymCryptXtsAesEncryptDataUnitXmm( |
780 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
781 | | _In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbTweakBlock, |
782 | | _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 ) PBYTE pbScratch, |
783 | | _In_reads_( cbData ) PCBYTE pbSrc, |
784 | | _Out_writes_( cbData ) PBYTE pbDst, |
785 | | SIZE_T cbData ) |
786 | 0 | { |
787 | 0 | __m128i t0; |
788 | 0 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; |
789 | 0 | __m128i roundkey, firstRoundKey, lastRoundKey; |
790 | 0 | __m128i XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 ); |
791 | 0 | SYMCRYPT_GF128_ELEMENT* tweakBuffer = (SYMCRYPT_GF128_ELEMENT*) pbScratch; |
792 | |
793 | 0 | const BYTE (*keyPtr)[4][4]; |
794 | 0 | const BYTE (*keyLimit)[4][4] = pExpandedKey->lastEncRoundKey; |
795 | 0 | UINT64 lastTweakLow, lastTweakHigh; |
796 | 0 | int aesEncryptXtsLoop; |
797 | |
798 | 0 | SIZE_T cbDataMain; // number of bytes to handle in the main loop |
799 | 0 | SIZE_T cbDataTail; // number of bytes to handle in the tail loop |
800 | |
801 | 0 | SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE); |
802 | | |
803 | | // To simplify logic and unusual size processing, we handle all |
804 | | // data not a multiple of 8 blocks in the tail loop |
805 | 0 | cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1); |
806 | | // Additionally, so that ciphertext stealing logic does not rely on |
807 | | // reading back from the destination buffer, when we have a non-zero |
808 | | // tail, we ensure that we handle at least 1 whole block in the tail |
809 | | // |
810 | | // Note that our caller has ensured we have at least 1 whole block |
811 | | // to process; this is checked in debug builds |
812 | | // This means that cbDataTail is in [1,15] at this point iff there are |
813 | | // at least 8 whole blocks to process; so the below does not cause |
814 | | // cbDataTail or cbDataMain to exceed cbData |
815 | 0 | cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0; |
816 | 0 | cbDataMain = cbData - cbDataTail; |
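    // Worked example: for cbData = 400 bytes (25 blocks), cbDataTail = 400 % 128 = 16, so
    // cbDataMain = 384 and the tail handles exactly one block. For cbData = 392 bytes (24 blocks
    // plus 8 trailing bytes for ciphertext stealing), cbDataTail starts at 8, is bumped by 128 to
    // 136, and cbDataMain = 256, so the tail handles 8 whole blocks plus the partial block.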
817 | |
818 | 0 | SYMCRYPT_ASSERT(cbDataMain <= cbData); |
819 | 0 | SYMCRYPT_ASSERT(cbDataTail <= cbData); |
820 | 0 | SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0); |
821 | |
822 | 0 | c0 = _mm_loadu_si128( (__m128i *) pbTweakBlock ); |
823 | 0 | XTS_MUL_ALPHA( c0, c1 ); |
824 | 0 | XTS_MUL_ALPHA( c1, c2 ); |
825 | 0 | XTS_MUL_ALPHA( c2, c3 ); |
826 | |
827 | 0 | XTS_MUL_ALPHA4( c0, c4 ); |
828 | 0 | XTS_MUL_ALPHA ( c4, c5 ); |
829 | 0 | XTS_MUL_ALPHA ( c5, c6 ); |
830 | 0 | XTS_MUL_ALPHA ( c6, c7 ); |
831 | |
832 | 0 | tweakBuffer[0].m128i = c0; |
833 | 0 | tweakBuffer[1].m128i = c1; |
834 | 0 | tweakBuffer[2].m128i = c2; |
835 | 0 | tweakBuffer[3].m128i = c3; |
836 | 0 | tweakBuffer[4].m128i = c4; |
837 | 0 | tweakBuffer[5].m128i = c5; |
838 | 0 | tweakBuffer[6].m128i = c6; |
839 | 0 | tweakBuffer[7].m128i = c7; |
840 | 0 | lastTweakLow = tweakBuffer[7].ull[0]; |
841 | 0 | lastTweakHigh = tweakBuffer[7].ull[1]; |
842 | |
843 | 0 | firstRoundKey = _mm_loadu_si128( (__m128i *) &pExpandedKey->RoundKey[0] ); |
844 | 0 | lastRoundKey = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey ); |
845 | |
846 | 0 | while( cbDataMain > 0 ) |
847 | 0 | { |
848 | | // At loop entry, tweakBuffer[0-7] are tweakValues for the next 8 blocks |
849 | 0 | c0 = _mm_xor_si128( tweakBuffer[0].m128i, firstRoundKey ); |
850 | 0 | c1 = _mm_xor_si128( tweakBuffer[1].m128i, firstRoundKey ); |
851 | 0 | c2 = _mm_xor_si128( tweakBuffer[2].m128i, firstRoundKey ); |
852 | 0 | c3 = _mm_xor_si128( tweakBuffer[3].m128i, firstRoundKey ); |
853 | 0 | c4 = _mm_xor_si128( tweakBuffer[4].m128i, firstRoundKey ); |
854 | 0 | c5 = _mm_xor_si128( tweakBuffer[5].m128i, firstRoundKey ); |
855 | 0 | c6 = _mm_xor_si128( tweakBuffer[6].m128i, firstRoundKey ); |
856 | 0 | c7 = _mm_xor_si128( tweakBuffer[7].m128i, firstRoundKey ); |
857 | |
858 | 0 | c0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ); |
859 | 0 | c1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ); |
860 | 0 | c2 = _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ); |
861 | 0 | c3 = _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ); |
862 | 0 | c4 = _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ); |
863 | 0 | c5 = _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ); |
864 | 0 | c6 = _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ); |
865 | 0 | c7 = _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc + 112) ) ); |
866 | |
867 | 0 | keyPtr = &pExpandedKey->RoundKey[1]; |
868 | | |
869 | | // Do 8 full rounds (AES-128|AES-192|AES-256) with stitched XTS (performed in scalar registers) |
870 | 0 | for( aesEncryptXtsLoop = 0; aesEncryptXtsLoop < 8; aesEncryptXtsLoop++ ) |
871 | 0 | { |
872 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); |
873 | 0 | keyPtr ++; |
874 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); |
875 | 0 | c1 = _mm_aesenc_si128( c1, roundkey ); |
876 | 0 | c2 = _mm_aesenc_si128( c2, roundkey ); |
877 | 0 | c3 = _mm_aesenc_si128( c3, roundkey ); |
878 | 0 | c4 = _mm_aesenc_si128( c4, roundkey ); |
879 | 0 | c5 = _mm_aesenc_si128( c5, roundkey ); |
880 | 0 | c6 = _mm_aesenc_si128( c6, roundkey ); |
881 | 0 | c7 = _mm_aesenc_si128( c7, roundkey ); |
882 | | |
883 | | // Prepare tweakBuffer[8-15] with tweak^lastRoundKey |
884 | 0 | tweakBuffer[ 8+aesEncryptXtsLoop ].m128i = _mm_xor_si128( tweakBuffer[ aesEncryptXtsLoop ].m128i, lastRoundKey ); |
885 | | // Prepare tweakBuffer[0-7] with tweaks for next 8 blocks |
886 | 0 | XTS_MUL_ALPHA_Scalar( lastTweakLow, lastTweakHigh ); |
887 | 0 | tweakBuffer[ aesEncryptXtsLoop ].ull[0] = lastTweakLow; |
888 | 0 | tweakBuffer[ aesEncryptXtsLoop ].ull[1] = lastTweakHigh; |
889 | 0 | } |
890 | |
891 | 0 | do |
892 | 0 | { |
893 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); |
894 | 0 | keyPtr ++; |
895 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); |
896 | 0 | c1 = _mm_aesenc_si128( c1, roundkey ); |
897 | 0 | c2 = _mm_aesenc_si128( c2, roundkey ); |
898 | 0 | c3 = _mm_aesenc_si128( c3, roundkey ); |
899 | 0 | c4 = _mm_aesenc_si128( c4, roundkey ); |
900 | 0 | c5 = _mm_aesenc_si128( c5, roundkey ); |
901 | 0 | c6 = _mm_aesenc_si128( c6, roundkey ); |
902 | 0 | c7 = _mm_aesenc_si128( c7, roundkey ); |
903 | 0 | } while( keyPtr < keyLimit ); |
904 | |
905 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_aesenclast_si128( c0, tweakBuffer[ 8].m128i ) ); |
906 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_aesenclast_si128( c1, tweakBuffer[ 9].m128i ) ); |
907 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_aesenclast_si128( c2, tweakBuffer[10].m128i ) ); |
908 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_aesenclast_si128( c3, tweakBuffer[11].m128i ) ); |
909 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_aesenclast_si128( c4, tweakBuffer[12].m128i ) ); |
910 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_aesenclast_si128( c5, tweakBuffer[13].m128i ) ); |
911 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_aesenclast_si128( c6, tweakBuffer[14].m128i ) ); |
912 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 112), _mm_aesenclast_si128( c7, tweakBuffer[15].m128i ) ); |
913 | |
914 | 0 | pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
915 | 0 | pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
916 | 0 | cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE; |
917 | 0 | } |
918 | |
919 | 0 | if( cbDataTail == 0 ) |
920 | 0 | { |
921 | 0 | return; // <-- expected case; early return here |
922 | 0 | } |
923 | | |
924 | | // Rare case: the data unit length is not a multiple of 128 bytes, so handle the tail one block at a time |
925 | 0 | t0 = tweakBuffer[0].m128i; |
926 | |
927 | 0 | while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE ) |
928 | 0 | { |
929 | 0 | c0 = _mm_xor_si128( _mm_loadu_si128( ( __m128i * ) pbSrc ), t0 ); |
930 | 0 | pbSrc += SYMCRYPT_AES_BLOCK_SIZE; |
931 | 0 | AES_ENCRYPT_1( pExpandedKey, c0 ); |
932 | 0 | _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) ); |
933 | 0 | pbDst += SYMCRYPT_AES_BLOCK_SIZE; |
934 | 0 | XTS_MUL_ALPHA( t0, t0 ); |
935 | 0 | cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE; |
936 | 0 | } |
937 | | |
938 | 0 | if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE ) |
939 | 0 | { |
940 | | // Ciphertext stealing encryption |
941 | | // |
942 | | // +--------------+ |
943 | | // | | |
944 | | // | V |
945 | | // +-----------------+ | +-----+-----------+ |
946 | | // | P_m-1 | | | P_m |++++CP+++++| |
947 | | // +-----------------+ | +-----+-----------+ |
948 | | // | | | |
949 | | // enc_m-1 | enc_m |
950 | | // | | | |
951 | | // V | V |
952 | | // +-----+-----------+ | +-----------------+ |
953 | | // | C_m |++++CP+++++|--+ | C_m-1 | |
954 | | // +-----+-----------+ +-----------------+ |
955 | | // | / |
956 | | // +---------------- / --+ |
957 | | // / | |
958 | | // | V |
959 | | // +-----------------+ | +-----+ |
960 | | // | C_m-1 |<-+ | C_m | |
961 | | // +-----------------+ +-----+ |
962 | | |
963 | | // Encrypt penultimate plaintext block into tweakBuffer[0] |
964 | 0 | c0 = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), t0 ); |
965 | 0 | AES_ENCRYPT_1( pExpandedKey, c0 ); |
966 | 0 | tweakBuffer[0].m128i = _mm_xor_si128( c0, t0 ); |
967 | |
968 | 0 | cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE; |
969 | | |
970 | | // Copy tweakBuffer[0] to tweakBuffer[1] |
971 | 0 | tweakBuffer[1].m128i = tweakBuffer[0].m128i; |
972 | | // Copy final plaintext bytes to prefix of tweakBuffer[0] - we must read before writing to support in-place encryption |
973 | 0 | memcpy( &tweakBuffer[0].ul[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail ); |
974 | | // Copy prefix of tweakBuffer[1] to the right place in the destination buffer |
975 | 0 | memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tweakBuffer[1].ul[0], cbDataTail ); |
976 | | |
977 | | // Do final tweak update |
978 | 0 | XTS_MUL_ALPHA( t0, t0 ); |
979 | | |
980 | | // Load updated tweakBuffer[0] into c0 |
981 | 0 | c0 = tweakBuffer[0].m128i; |
982 | 0 | } else { |
983 | | // Just load final plaintext block into c0 |
984 | 0 | c0 = _mm_loadu_si128( (__m128i*) pbSrc ); |
985 | 0 | } |
986 | | |
987 | | // Final full block encryption |
988 | 0 | c0 = _mm_xor_si128( c0, t0 ); |
989 | 0 | AES_ENCRYPT_1( pExpandedKey, c0 ); |
990 | 0 | _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) ); |
991 | 0 | } |
992 | | |
993 | | VOID |
994 | | SYMCRYPT_CALL |
995 | | SymCryptXtsAesDecryptDataUnitXmm( |
996 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
997 | | _In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbTweakBlock, |
998 | | _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 ) PBYTE pbScratch, |
999 | | _In_reads_( cbData ) PCBYTE pbSrc, |
1000 | | _Out_writes_( cbData ) PBYTE pbDst, |
1001 | | SIZE_T cbData ) |
1002 | 0 | { |
1003 | 0 | __m128i t0; |
1004 | 0 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; |
1005 | 0 | __m128i roundkey, firstRoundKey, lastRoundKey; |
1006 | 0 | __m128i XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 ); |
1007 | 0 | SYMCRYPT_GF128_ELEMENT* tweakBuffer = (SYMCRYPT_GF128_ELEMENT*) pbScratch; |
1008 | |
1009 | 0 | const BYTE (*keyPtr)[4][4]; |
1010 | 0 | const BYTE (*keyLimit)[4][4] = pExpandedKey->lastDecRoundKey; |
1011 | 0 | UINT64 lastTweakLow, lastTweakHigh; |
1012 | 0 | int aesDecryptXtsLoop; |
1013 | |
1014 | 0 | SIZE_T cbDataMain; // number of bytes to handle in the main loop |
1015 | 0 | SIZE_T cbDataTail; // number of bytes to handle in the tail loop |
1016 | | |
1017 | 0 | SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE); |
1018 | | |
1019 | | // To simplify logic and unusual size processing, we handle all |
1020 | | // data not a multiple of 8 blocks in the tail loop |
1021 | 0 | cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1); |
1022 | | // Additionally, so that ciphertext stealing logic does not rely on |
1023 | | // reading back from the destination buffer, when we have a non-zero |
1024 | | // tail, we ensure that we handle at least 1 whole block in the tail |
1025 | | // |
1026 | | // Note that our caller has ensured we have at least 1 whole block |
1027 | | // to process; this is checked in debug builds |
1028 | | // This means that cbDataTail is in [1,15] at this point iff there are |
1029 | | // at least 8 whole blocks to process; so the below does not cause |
1030 | | // cbDataTail or cbDataMain to exceed cbData |
1031 | 0 | cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0; |
1032 | 0 | cbDataMain = cbData - cbDataTail; |
1033 | |
1034 | 0 | SYMCRYPT_ASSERT(cbDataMain <= cbData); |
1035 | 0 | SYMCRYPT_ASSERT(cbDataTail <= cbData); |
1036 | 0 | SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0); |
1037 | |
1038 | 0 | c0 = _mm_loadu_si128( (__m128i *) pbTweakBlock ); |
1039 | 0 | XTS_MUL_ALPHA( c0, c1 ); |
1040 | 0 | XTS_MUL_ALPHA( c1, c2 ); |
1041 | 0 | XTS_MUL_ALPHA( c2, c3 ); |
1042 | |
1043 | 0 | XTS_MUL_ALPHA4( c0, c4 ); |
1044 | 0 | XTS_MUL_ALPHA ( c4, c5 ); |
1045 | 0 | XTS_MUL_ALPHA ( c5, c6 ); |
1046 | 0 | XTS_MUL_ALPHA ( c6, c7 ); |
1047 | |
1048 | 0 | tweakBuffer[0].m128i = c0; |
1049 | 0 | tweakBuffer[1].m128i = c1; |
1050 | 0 | tweakBuffer[2].m128i = c2; |
1051 | 0 | tweakBuffer[3].m128i = c3; |
1052 | 0 | tweakBuffer[4].m128i = c4; |
1053 | 0 | tweakBuffer[5].m128i = c5; |
1054 | 0 | tweakBuffer[6].m128i = c6; |
1055 | 0 | tweakBuffer[7].m128i = c7; |
1056 | 0 | lastTweakLow = tweakBuffer[7].ull[0]; |
1057 | 0 | lastTweakHigh = tweakBuffer[7].ull[1]; |
1058 | |
1059 | 0 | firstRoundKey = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey ); |
1060 | 0 | lastRoundKey = _mm_loadu_si128( (__m128i *) pExpandedKey->lastDecRoundKey ); |
1061 | |
1062 | 0 | while( cbDataMain > 0 ) |
1063 | 0 | { |
1064 | | // At loop entry, tweakBuffer[0-7] are tweakValues for the next 8 blocks |
1065 | 0 | c0 = _mm_xor_si128( tweakBuffer[0].m128i, firstRoundKey ); |
1066 | 0 | c1 = _mm_xor_si128( tweakBuffer[1].m128i, firstRoundKey ); |
1067 | 0 | c2 = _mm_xor_si128( tweakBuffer[2].m128i, firstRoundKey ); |
1068 | 0 | c3 = _mm_xor_si128( tweakBuffer[3].m128i, firstRoundKey ); |
1069 | 0 | c4 = _mm_xor_si128( tweakBuffer[4].m128i, firstRoundKey ); |
1070 | 0 | c5 = _mm_xor_si128( tweakBuffer[5].m128i, firstRoundKey ); |
1071 | 0 | c6 = _mm_xor_si128( tweakBuffer[6].m128i, firstRoundKey ); |
1072 | 0 | c7 = _mm_xor_si128( tweakBuffer[7].m128i, firstRoundKey ); |
1073 | |
1074 | 0 | c0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ); |
1075 | 0 | c1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ); |
1076 | 0 | c2 = _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ); |
1077 | 0 | c3 = _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ); |
1078 | 0 | c4 = _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ); |
1079 | 0 | c5 = _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ); |
1080 | 0 | c6 = _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ); |
1081 | 0 | c7 = _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc + 112) ) ); |
1082 | |
1083 | 0 | keyPtr = pExpandedKey->lastEncRoundKey + 1; |
1084 | | |
1085 | | // Do 8 full rounds (AES-128|AES-192|AES-256) with stitched XTS (performed in scalar registers) |
1086 | 0 | for( aesDecryptXtsLoop = 0; aesDecryptXtsLoop < 8; aesDecryptXtsLoop++ ) |
1087 | 0 | { |
1088 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); |
1089 | 0 | keyPtr ++; |
1090 | 0 | c0 = _mm_aesdec_si128( c0, roundkey ); |
1091 | 0 | c1 = _mm_aesdec_si128( c1, roundkey ); |
1092 | 0 | c2 = _mm_aesdec_si128( c2, roundkey ); |
1093 | 0 | c3 = _mm_aesdec_si128( c3, roundkey ); |
1094 | 0 | c4 = _mm_aesdec_si128( c4, roundkey ); |
1095 | 0 | c5 = _mm_aesdec_si128( c5, roundkey ); |
1096 | 0 | c6 = _mm_aesdec_si128( c6, roundkey ); |
1097 | 0 | c7 = _mm_aesdec_si128( c7, roundkey ); |
1098 | | |
1099 | | // Prepare tweakBuffer[8-15] with tweak^lastRoundKey |
1100 | 0 | tweakBuffer[ 8+aesDecryptXtsLoop ].m128i = _mm_xor_si128( tweakBuffer[ aesDecryptXtsLoop ].m128i, lastRoundKey ); |
1101 | | // Prepare tweakBuffer[0-7] with tweaks for next 8 blocks |
1102 | 0 | XTS_MUL_ALPHA_Scalar( lastTweakLow, lastTweakHigh ); |
1103 | 0 | tweakBuffer[ aesDecryptXtsLoop ].ull[0] = lastTweakLow; |
1104 | 0 | tweakBuffer[ aesDecryptXtsLoop ].ull[1] = lastTweakHigh; |
1105 | 0 | } |
1106 | |
1107 | 0 | do |
1108 | 0 | { |
1109 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); |
1110 | 0 | keyPtr ++; |
1111 | 0 | c0 = _mm_aesdec_si128( c0, roundkey ); |
1112 | 0 | c1 = _mm_aesdec_si128( c1, roundkey ); |
1113 | 0 | c2 = _mm_aesdec_si128( c2, roundkey ); |
1114 | 0 | c3 = _mm_aesdec_si128( c3, roundkey ); |
1115 | 0 | c4 = _mm_aesdec_si128( c4, roundkey ); |
1116 | 0 | c5 = _mm_aesdec_si128( c5, roundkey ); |
1117 | 0 | c6 = _mm_aesdec_si128( c6, roundkey ); |
1118 | 0 | c7 = _mm_aesdec_si128( c7, roundkey ); |
1119 | 0 | } while( keyPtr < keyLimit ); |
1120 | |
1121 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_aesdeclast_si128( c0, tweakBuffer[ 8].m128i ) ); |
1122 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_aesdeclast_si128( c1, tweakBuffer[ 9].m128i ) ); |
1123 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_aesdeclast_si128( c2, tweakBuffer[10].m128i ) ); |
1124 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_aesdeclast_si128( c3, tweakBuffer[11].m128i ) ); |
1125 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_aesdeclast_si128( c4, tweakBuffer[12].m128i ) ); |
1126 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_aesdeclast_si128( c5, tweakBuffer[13].m128i ) ); |
1127 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_aesdeclast_si128( c6, tweakBuffer[14].m128i ) ); |
1128 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 112), _mm_aesdeclast_si128( c7, tweakBuffer[15].m128i ) ); |
1129 | |
1130 | 0 | pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
1131 | 0 | pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
1132 | 0 | cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE; |
1133 | 0 | } |
1134 | |
1135 | 0 | if( cbDataTail == 0 ) |
1136 | 0 | { |
1137 | 0 | return; // <-- expected case; early return here |
1138 | 0 | } |
1139 | | |
1140 | | // Rare case: the data unit length is not a multiple of 128 bytes, so handle the tail one block at a time |
1141 | 0 | t0 = tweakBuffer[0].m128i; |
1142 | |
1143 | 0 | while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE ) |
1144 | 0 | { |
1145 | 0 | c0 = _mm_xor_si128( _mm_loadu_si128( ( __m128i * ) pbSrc ), t0 ); |
1146 | 0 | pbSrc += SYMCRYPT_AES_BLOCK_SIZE; |
1147 | 0 | AES_DECRYPT_1( pExpandedKey, c0 ); |
1148 | 0 | _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) ); |
1149 | 0 | pbDst += SYMCRYPT_AES_BLOCK_SIZE; |
1150 | 0 | c7 = t0; |
1151 | 0 | XTS_MUL_ALPHA( t0, t0 ); |
1152 | 0 | cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE; |
1153 | 0 | } |
1154 | | |
1155 | 0 | if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE ) |
1156 | 0 | { |
1157 | | // Ciphertext stealing decryption |
1158 | | // |
1159 | | // +--------------+ |
1160 | | // | | |
1161 | | // | V |
1162 | | // +-----------------+ | +-----+-----------+ |
1163 | | // | C_m-1 | | | C_m |++++CP+++++| |
1164 | | // +-----------------+ | +-----+-----------+ |
1165 | | // | | | |
1166 | | // dec_m | dec_m-1 |
1167 | | // | | | |
1168 | | // V | V |
1169 | | // +-----+-----------+ | +-----------------+ |
1170 | | // | P_m |++++CP+++++|--+ | P_m-1 | |
1171 | | // +-----+-----------+ +-----------------+ |
1172 | | // | / |
1173 | | // +---------------- / --+ |
1174 | | // / | |
1175 | | // | V |
1176 | | // +-----------------+ | +-----+ |
1177 | | // | P_m-1 |<-+ | P_m | |
1178 | | // +-----------------+ +-----+ |
1179 | | |
1180 | | // Do final tweak update into c1 |
1181 | | // Penultimate tweak is in t0, ready for final decryption |
1182 | 0 | XTS_MUL_ALPHA( t0, c1 ); |
1183 | | |
1184 | | // Decrypt penultimate ciphertext block into tweakBuffer[0] |
1185 | 0 | c0 = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), c1 ); |
1186 | 0 | AES_DECRYPT_1( pExpandedKey, c0 ); |
1187 | 0 | tweakBuffer[0].m128i = _mm_xor_si128( c0, c1 ); |
1188 | |
1189 | 0 | cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE; |
1190 | | |
1191 | | // Copy tweakBuffer[0] to tweakBuffer[1] |
1192 | 0 | tweakBuffer[1].m128i = tweakBuffer[0].m128i; |
1193 | | // Copy final ciphertext bytes to prefix of tweakBuffer[0] - we must read before writing to support in-place decryption |
1194 | 0 | memcpy( &tweakBuffer[0].ul[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail ); |
1195 | | // Copy prefix of tweakBuffer[1] to the right place in the destination buffer |
1196 | 0 | memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tweakBuffer[1].ul[0], cbDataTail ); |
1197 | | |
1198 | | // Load updated tweakBuffer[0] into c0 |
1199 | 0 | c0 = tweakBuffer[0].m128i; |
1200 | 0 | } else { |
1201 | | // Just load final ciphertext block into c0 |
1202 | 0 | c0 = _mm_loadu_si128( (__m128i*) pbSrc ); |
1203 | 0 | } |
1204 | | |
1205 | | // Final full block decryption |
1206 | 0 | c0 = _mm_xor_si128( c0, t0 ); |
1207 | 0 | AES_DECRYPT_1( pExpandedKey, c0 ); |
1208 | 0 | _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) ); |
1209 | 0 | } |
1210 | | |
1211 | 0 | #define AES_FULLROUND_4_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \ |
1212 | 0 | { \ |
1213 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
1214 | 0 | keyPtr ++; \ |
1215 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); \ |
1216 | 0 | c1 = _mm_aesenc_si128( c1, roundkey ); \ |
1217 | 0 | c2 = _mm_aesenc_si128( c2, roundkey ); \ |
1218 | 0 | c3 = _mm_aesenc_si128( c3, roundkey ); \ |
1219 | 0 | \ |
1220 | 0 | r0 = _mm_loadu_si128( (__m128i *) gHashPointer ); \ |
1221 | 0 | r0 = _mm_shuffle_epi8( r0, byteReverseOrder ); \ |
1222 | 0 | gHashPointer += 16; \ |
1223 | 0 | \ |
1224 | 0 | t1 = _mm_loadu_si128( (__m128i *) &GHASH_H_POWER(gHashExpandedKeyTable, todo) ); \ |
1225 | 0 | t0 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \ |
1226 | 0 | t1 = _mm_clmulepi64_si128( r0, t1, 0x11 ); \ |
1227 | 0 | \ |
1228 | 0 | resl = _mm_xor_si128( resl, t0 ); \ |
1229 | 0 | resh = _mm_xor_si128( resh, t1 ); \ |
1230 | 0 | \ |
1231 | 0 | t0 = _mm_srli_si128( r0, 8 ); \ |
1232 | 0 | r0 = _mm_xor_si128( r0, t0 ); \ |
1233 | 0 | t1 = _mm_loadu_si128( (__m128i *) &GHASH_Hx_POWER(gHashExpandedKeyTable, todo) ); \ |
1234 | 0 | t1 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \ |
1235 | 0 | \ |
1236 | 0 | resm = _mm_xor_si128( resm, t1 ); \ |
1237 | 0 | todo --; \ |
1238 | 0 | }; |
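// In the stitched GHASH step above, each data block r0 is folded into three accumulators using a
// Karatsuba-style split of the carry-less product r0 * H^todo:
//     resl ^= lo(r0) * lo(H)           ( _mm_clmulepi64_si128( r0, t1, 0x00 ) )
//     resh ^= hi(r0) * hi(H)           ( _mm_clmulepi64_si128( r0, t1, 0x11 ) )
//     resm ^= (lo(r0) ^ hi(r0)) * Hx   ( Hx from GHASH_Hx_POWER, assumed to hold lo(H) ^ hi(H) )
// so the full 256-bit product and the GF(2^128) reduction only need to be reconstructed once per
// batch (presumably by the helpers in ghash_definitions.h) rather than once per processed block.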
1239 | | |
1240 | 0 | #define AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, gHashPointer, ghashRounds, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \ |
1241 | 0 | { \ |
1242 | 0 | const BYTE (*keyPtr)[4][4]; \ |
1243 | 0 | const BYTE (*keyLimit)[4][4]; \ |
1244 | 0 | __m128i roundkey; \ |
1245 | 0 | __m128i t0, t1; \ |
1246 | 0 | __m128i r0; \ |
1247 | 0 | SIZE_T aesEncryptGhashLoop; \ |
1248 | 0 | \ |
1249 | 0 | keyPtr = &pExpandedKey->RoundKey[0]; \ |
1250 | 0 | keyLimit = pExpandedKey->lastEncRoundKey; \ |
1251 | 0 | \ |
1252 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
1253 | 0 | keyPtr ++; \ |
1254 | 0 | c0 = _mm_xor_si128( c0, roundkey ); \ |
1255 | 0 | c1 = _mm_xor_si128( c1, roundkey ); \ |
1256 | 0 | c2 = _mm_xor_si128( c2, roundkey ); \ |
1257 | 0 | c3 = _mm_xor_si128( c3, roundkey ); \ |
1258 | 0 | \ |
1259 | 0 | /* Do ghashRounds full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \ |
1260 | 0 | for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < ghashRounds; aesEncryptGhashLoop++ ) \ |
1261 | 0 | { \ |
1262 | 0 | AES_FULLROUND_4_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \ |
1263 | 0 | } \ |
1264 | 0 | \ |
1265 | 0 | do \ |
1266 | 0 | { \ |
1267 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
1268 | 0 | keyPtr ++; \ |
1269 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); \ |
1270 | 0 | c1 = _mm_aesenc_si128( c1, roundkey ); \ |
1271 | 0 | c2 = _mm_aesenc_si128( c2, roundkey ); \ |
1272 | 0 | c3 = _mm_aesenc_si128( c3, roundkey ); \ |
1273 | 0 | } while( keyPtr < keyLimit ); \ |
1274 | 0 | \ |
1275 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
1276 | 0 | \ |
1277 | 0 | c0 = _mm_aesenclast_si128( c0, roundkey ); \ |
1278 | 0 | c1 = _mm_aesenclast_si128( c1, roundkey ); \ |
1279 | 0 | c2 = _mm_aesenclast_si128( c2, roundkey ); \ |
1280 | 0 | c3 = _mm_aesenclast_si128( c3, roundkey ); \ |
1281 | 0 | }; |
1282 | | |
1283 | 0 | #define AES_FULLROUND_8_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \ |
1284 | 0 | { \ |
1285 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
1286 | 0 | keyPtr ++; \ |
1287 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); \ |
1288 | 0 | c1 = _mm_aesenc_si128( c1, roundkey ); \ |
1289 | 0 | c2 = _mm_aesenc_si128( c2, roundkey ); \ |
1290 | 0 | c3 = _mm_aesenc_si128( c3, roundkey ); \ |
1291 | 0 | c4 = _mm_aesenc_si128( c4, roundkey ); \ |
1292 | 0 | c5 = _mm_aesenc_si128( c5, roundkey ); \ |
1293 | 0 | c6 = _mm_aesenc_si128( c6, roundkey ); \ |
1294 | 0 | c7 = _mm_aesenc_si128( c7, roundkey ); \ |
1295 | 0 | \ |
1296 | 0 | r0 = _mm_loadu_si128( (__m128i *) gHashPointer ); \ |
1297 | 0 | r0 = _mm_shuffle_epi8( r0, byteReverseOrder ); \ |
1298 | 0 | gHashPointer += 16; \ |
1299 | 0 | \ |
1300 | 0 | t1 = _mm_loadu_si128( (__m128i *) &GHASH_H_POWER(gHashExpandedKeyTable, todo) ); \ |
1301 | 0 | t0 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \ |
1302 | 0 | t1 = _mm_clmulepi64_si128( r0, t1, 0x11 ); \ |
1303 | 0 | \ |
1304 | 0 | resl = _mm_xor_si128( resl, t0 ); \ |
1305 | 0 | resh = _mm_xor_si128( resh, t1 ); \ |
1306 | 0 | \ |
1307 | 0 | t0 = _mm_srli_si128( r0, 8 ); \ |
1308 | 0 | r0 = _mm_xor_si128( r0, t0 ); \ |
1309 | 0 | t1 = _mm_loadu_si128( (__m128i *) &GHASH_Hx_POWER(gHashExpandedKeyTable, todo) ); \ |
1310 | 0 | t1 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \ |
1311 | 0 | \ |
1312 | 0 | resm = _mm_xor_si128( resm, t1 ); \ |
1313 | 0 | todo --; \ |
1314 | 0 | }; |
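// Same stitching as AES_FULLROUND_4_GHASH_1 above: one AES round applied to eight counter
// blocks, interleaved with the three carry-less multiplications for one GHASH block.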
1315 | | |
1316 | 0 | #define AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, gHashPointer, ghashRounds, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \ |
1317 | 0 | { \ |
1318 | 0 | const BYTE (*keyPtr)[4][4]; \ |
1319 | 0 | const BYTE (*keyLimit)[4][4]; \ |
1320 | 0 | __m128i roundkey; \ |
1321 | 0 | __m128i t0, t1; \ |
1322 | 0 | __m128i r0; \ |
1323 | 0 | SIZE_T aesEncryptGhashLoop; \ |
1324 | 0 | \ |
1325 | 0 | keyPtr = &pExpandedKey->RoundKey[0]; \ |
1326 | 0 | keyLimit = pExpandedKey->lastEncRoundKey; \ |
1327 | 0 | \ |
1328 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
1329 | 0 | keyPtr ++; \ |
1330 | 0 | c0 = _mm_xor_si128( c0, roundkey ); \ |
1331 | 0 | c1 = _mm_xor_si128( c1, roundkey ); \ |
1332 | 0 | c2 = _mm_xor_si128( c2, roundkey ); \ |
1333 | 0 | c3 = _mm_xor_si128( c3, roundkey ); \ |
1334 | 0 | c4 = _mm_xor_si128( c4, roundkey ); \ |
1335 | 0 | c5 = _mm_xor_si128( c5, roundkey ); \ |
1336 | 0 | c6 = _mm_xor_si128( c6, roundkey ); \ |
1337 | 0 | c7 = _mm_xor_si128( c7, roundkey ); \ |
1338 | 0 | \ |
1339 | 0 | /* Do ghashRounds full AES rounds (AES-128|AES-192|AES-256), each stitched with one GHASH block */ \ |
1340 | 0 | for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < ghashRounds; aesEncryptGhashLoop++ ) \ |
1341 | 0 | { \ |
1342 | 0 | AES_FULLROUND_8_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \ |
1343 | 0 | } \ |
1344 | 0 | \ |
1345 | 0 | do \ |
1346 | 0 | { \ |
1347 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
1348 | 0 | keyPtr ++; \ |
1349 | 0 | c0 = _mm_aesenc_si128( c0, roundkey ); \ |
1350 | 0 | c1 = _mm_aesenc_si128( c1, roundkey ); \ |
1351 | 0 | c2 = _mm_aesenc_si128( c2, roundkey ); \ |
1352 | 0 | c3 = _mm_aesenc_si128( c3, roundkey ); \ |
1353 | 0 | c4 = _mm_aesenc_si128( c4, roundkey ); \ |
1354 | 0 | c5 = _mm_aesenc_si128( c5, roundkey ); \ |
1355 | 0 | c6 = _mm_aesenc_si128( c6, roundkey ); \ |
1356 | 0 | c7 = _mm_aesenc_si128( c7, roundkey ); \ |
1357 | 0 | } while( keyPtr < keyLimit ); \ |
1358 | 0 | \ |
1359 | 0 | roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \ |
1360 | 0 | \ |
1361 | 0 | c0 = _mm_aesenclast_si128( c0, roundkey ); \ |
1362 | 0 | c1 = _mm_aesenclast_si128( c1, roundkey ); \ |
1363 | 0 | c2 = _mm_aesenclast_si128( c2, roundkey ); \ |
1364 | 0 | c3 = _mm_aesenclast_si128( c3, roundkey ); \ |
1365 | 0 | c4 = _mm_aesenclast_si128( c4, roundkey ); \ |
1366 | 0 | c5 = _mm_aesenclast_si128( c5, roundkey ); \ |
1367 | 0 | c6 = _mm_aesenclast_si128( c6, roundkey ); \ |
1368 | 0 | c7 = _mm_aesenclast_si128( c7, roundkey ); \ |
1369 | 0 | }; |
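// For both stitched macros, ghashRounds is at most 8 in the calls below; this appears to be
// required so that, even for AES-128 (one whitening key, nine full rounds, one final round),
// at least one full round remains for the do/while loop after the stitched for loop. Each
// stitched round also consumes one precomputed power via GHASH_H_POWER( ..., todo ) and
// decrements todo, so the caller appears to need todo >= ghashRounds on entry.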
1370 | | |
1371 | | // This call is functionally identical to: |
1372 | | // SymCryptAesCtrMsb64Xmm( pExpandedKey, |
1373 | | // pbChainingValue, |
1374 | | // pbSrc, |
1375 | | // pbDst, |
1376 | | // cbData ); |
1377 | | // SymCryptGHashAppendDataPclmulqdq( expandedKeyTable, |
1378 | | // pState, |
1379 | | // pbDst, |
1380 | | // cbData ); |
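// GHASH is computed over the ciphertext, so pbGhashSrc below tracks pbDst and the GHASH
// work for each 8-block batch is stitched into the AES-CTR rounds of the next batch; the
// first batch is encrypted un-stitched (plain AES_ENCRYPT_8) and the final batch is hashed
// in the tail code.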
1381 | | VOID |
1382 | | SYMCRYPT_CALL |
1383 | | SymCryptAesGcmEncryptStitchedXmm( |
1384 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
1385 | | _In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, |
1386 | | _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable, |
1387 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
1388 | | _In_reads_( cbData ) PCBYTE pbSrc, |
1389 | | _Out_writes_( cbData ) PBYTE pbDst, |
1390 | | SIZE_T cbData ) |
1391 | 0 | { |
1392 | 0 | __m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue ); |
1393 | |
|
1394 | 0 | __m128i BYTE_REVERSE_ORDER = _mm_set_epi8( |
1395 | 0 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); |
1396 | 0 | __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 ); |
1397 | |
|
1398 | 0 | __m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 ); |
1399 | 0 | __m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 ); |
1400 | 0 | __m128i chainIncrement8 = _mm_set_epi32( 0, 0, 0, 8 ); |
1401 | |
|
1402 | 0 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; |
1403 | 0 | __m128i r0, r1; |
1404 | |
|
1405 | 0 | __m128i state; |
1406 | 0 | __m128i a0, a1, a2; |
1407 | 0 | SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE; |
1408 | 0 | SIZE_T todo; |
1409 | 0 | PCBYTE pbGhashSrc = pbDst; |
1410 | |
|
1411 | 0 | SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is a multiple of the block size |
1412 | |
|
1413 | 0 | chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER ); |
1414 | 0 | state = _mm_loadu_si128( (__m128i *) pState ); |
1415 | |
|
1416 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ); |
1417 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 ); |
1418 | | |
1419 | | // Do 8 blocks of CTR, either for the tail (if the total is fewer than 8 blocks) or to encrypt the first 8 blocks |
1420 | 0 | c0 = chain; |
1421 | 0 | c1 = _mm_add_epi32( chain, chainIncrement1 ); |
1422 | 0 | c2 = _mm_add_epi32( chain, chainIncrement2 ); |
1423 | 0 | c3 = _mm_add_epi32( c1, chainIncrement2 ); |
1424 | 0 | c4 = _mm_add_epi32( c2, chainIncrement2 ); |
1425 | 0 | c5 = _mm_add_epi32( c3, chainIncrement2 ); |
1426 | 0 | c6 = _mm_add_epi32( c4, chainIncrement2 ); |
1427 | 0 | c7 = _mm_add_epi32( c5, chainIncrement2 ); |
1428 | |
|
1429 | 0 | c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER ); |
1430 | 0 | c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER ); |
1431 | 0 | c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER ); |
1432 | 0 | c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER ); |
1433 | 0 | c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER ); |
1434 | 0 | c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER ); |
1435 | 0 | c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER ); |
1436 | 0 | c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER ); |
1437 | |
|
1438 | 0 | AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ); |
1439 | |
|
1440 | 0 | if( nBlocks >= 8 ) |
1441 | 0 | { |
1442 | | // Encrypt first 8 blocks - update chain |
1443 | 0 | chain = _mm_add_epi32( chain, chainIncrement8 ); |
1444 | |
|
1445 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) ); |
1446 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) ); |
1447 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ) ); |
1448 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ) ); |
1449 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ) ); |
1450 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ) ); |
1451 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ) ); |
1452 | 0 | _mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112) ) ) ); |
1453 | |
|
1454 | 0 | pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
1455 | 0 | pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
1456 | |
|
1457 | 0 | while( nBlocks >= 16 ) |
1458 | 0 | { |
1459 | | // In this loop we always have 8 blocks to encrypt, and the previous 8 blocks are already encrypted and ready for GHASH |
1460 | 0 | c0 = chain; |
1461 | 0 | c1 = _mm_add_epi32( chain, chainIncrement1 ); |
1462 | 0 | c2 = _mm_add_epi32( chain, chainIncrement2 ); |
1463 | 0 | c3 = _mm_add_epi32( c1, chainIncrement2 ); |
1464 | 0 | c4 = _mm_add_epi32( c2, chainIncrement2 ); |
1465 | 0 | c5 = _mm_add_epi32( c3, chainIncrement2 ); |
1466 | 0 | c6 = _mm_add_epi32( c4, chainIncrement2 ); |
1467 | 0 | c7 = _mm_add_epi32( c5, chainIncrement2 ); |
1468 | 0 | chain = _mm_add_epi32( c6, chainIncrement2 ); |
1469 | |
|
1470 | 0 | c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER ); |
1471 | 0 | c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER ); |
1472 | 0 | c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER ); |
1473 | 0 | c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER ); |
1474 | 0 | c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER ); |
1475 | 0 | c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER ); |
1476 | 0 | c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER ); |
1477 | 0 | c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER ); |
1478 | |
|
1479 | 0 | AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 ); |
1480 | |
|
1481 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) ); |
1482 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) ); |
1483 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ) ); |
1484 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ) ); |
1485 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ) ); |
1486 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ) ); |
1487 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ) ); |
1488 | 0 | _mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112) ) ) ); |
1489 | |
|
1490 | 0 | pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
1491 | 0 | pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
1492 | 0 | nBlocks -= 8; |
1493 | |
|
1494 | 0 | if( todo == 0 ) |
1495 | 0 | { |
1496 | 0 | CLMUL_3_POST( a0, a1, a2 ); |
1497 | 0 | MODREDUCE( vMultiplicationConstant, a0, a1, a2, state ); |
1498 | |
|
1499 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ); |
1500 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 ); |
1501 | 0 | } |
1502 | 0 | } |
1503 | | |
1504 | | // We now have at least 8 blocks of encrypted data to GHASH and at most 7 blocks left to encrypt |
1505 | | // Do 8 blocks of GHASH in parallel with generating 0, 4, or 8 AES-CTR blocks for tail encryption |
1506 | 0 | nBlocks -= 8; |
1507 | 0 | if (nBlocks > 0) |
1508 | 0 | { |
1509 | 0 | c0 = chain; |
1510 | 0 | c1 = _mm_add_epi32( chain, chainIncrement1 ); |
1511 | 0 | c2 = _mm_add_epi32( chain, chainIncrement2 ); |
1512 | 0 | c3 = _mm_add_epi32( c1, chainIncrement2 ); |
1513 | 0 | c4 = _mm_add_epi32( c2, chainIncrement2 ); |
1514 | |
|
1515 | 0 | c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER ); |
1516 | 0 | c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER ); |
1517 | 0 | c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER ); |
1518 | 0 | c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER ); |
1519 | |
|
1520 | 0 | if (nBlocks > 4) |
1521 | 0 | { |
1522 | | // Encrypt 8 AES-CTR blocks for the tail (at most 7 are used) in parallel with 8 blocks of GHASH |
1523 | 0 | c5 = _mm_add_epi32( c4, chainIncrement1 ); |
1524 | 0 | c6 = _mm_add_epi32( c4, chainIncrement2 ); |
1525 | |
|
1526 | 0 | c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER ); |
1527 | 0 | c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER ); |
1528 | 0 | c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER ); |
1529 | |
|
1530 | 0 | AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 ); |
1531 | 0 | } |
1532 | 0 | else |
1533 | 0 | { |
1534 | | // Encrypt 4 AES-CTR blocks for the tail in parallel with 8 blocks of GHASH |
1535 | 0 | AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 ); |
1536 | 0 | } |
1537 | |
|
1538 | 0 | if( todo == 0 ) |
1539 | 0 | { |
1540 | 0 | CLMUL_3_POST( a0, a1, a2 ); |
1541 | 0 | MODREDUCE( vMultiplicationConstant, a0, a1, a2, state ); |
1542 | |
|
1543 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ); |
1544 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 ); |
1545 | 0 | } |
1546 | 0 | } |
1547 | 0 | else |
1548 | 0 | { |
1549 | | // Just do the final 8 blocks of GHASH |
1550 | 0 | for( todo=8; todo>0; todo-- ) |
1551 | 0 | { |
1552 | 0 | r0 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) (pbGhashSrc + 0) ), BYTE_REVERSE_ORDER ); |
1553 | 0 | pbGhashSrc += SYMCRYPT_AES_BLOCK_SIZE; |
1554 | |
|
1555 | 0 | CLMUL_ACC_3( r0, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 ); |
1556 | 0 | } |
1557 | |
|
1558 | 0 | CLMUL_3_POST( a0, a1, a2 ); |
1559 | 0 | MODREDUCE( vMultiplicationConstant, a0, a1, a2, state ); |
1560 | 0 | } |
1561 | 0 | } |
1562 | |
|
1563 | 0 | if( nBlocks > 0 ) |
1564 | 0 | { |
1565 | | // Encrypt 1-7 blocks with pre-generated AES-CTR blocks and GHASH the results |
1566 | 0 | while( nBlocks >= 2 ) |
1567 | 0 | { |
1568 | 0 | chain = _mm_add_epi32( chain, chainIncrement2 ); |
1569 | |
|
1570 | 0 | r0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ); |
1571 | 0 | r1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ); |
1572 | |
|
1573 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0), r0 ); |
1574 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 16), r1 ); |
1575 | |
|
1576 | 0 | r0 = _mm_shuffle_epi8( r0, BYTE_REVERSE_ORDER ); |
1577 | 0 | r1 = _mm_shuffle_epi8( r1, BYTE_REVERSE_ORDER ); |
1578 | |
|
1579 | 0 | CLMUL_ACC_3( r0, GHASH_H_POWER(expandedKeyTable, todo - 0), GHASH_Hx_POWER(expandedKeyTable, todo - 0), a0, a1, a2 ); |
1580 | 0 | CLMUL_ACC_3( r1, GHASH_H_POWER(expandedKeyTable, todo - 1), GHASH_Hx_POWER(expandedKeyTable, todo - 1), a0, a1, a2 ); |
1581 | |
|
1582 | 0 | pbDst += 2*SYMCRYPT_AES_BLOCK_SIZE; |
1583 | 0 | pbSrc += 2*SYMCRYPT_AES_BLOCK_SIZE; |
1584 | 0 | todo -= 2; |
1585 | 0 | nBlocks -= 2; |
1586 | 0 | c0 = c2; |
1587 | 0 | c1 = c3; |
1588 | 0 | c2 = c4; |
1589 | 0 | c3 = c5; |
1590 | 0 | c4 = c6; |
1591 | 0 | } |
1592 | |
|
1593 | 0 | if( nBlocks > 0 ) |
1594 | 0 | { |
1595 | 0 | chain = _mm_add_epi32( chain, chainIncrement1 ); |
1596 | |
|
1597 | 0 | r0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ); |
1598 | |
|
1599 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0), r0 ); |
1600 | |
|
1601 | 0 | r0 = _mm_shuffle_epi8( r0, BYTE_REVERSE_ORDER ); |
1602 | |
|
1603 | 0 | CLMUL_ACC_3( r0, GHASH_H_POWER(expandedKeyTable, 1), GHASH_Hx_POWER(expandedKeyTable, 1), a0, a1, a2 ); |
1604 | 0 | } |
1605 | |
|
1606 | 0 | CLMUL_3_POST( a0, a1, a2 ); |
1607 | 0 | MODREDUCE( vMultiplicationConstant, a0, a1, a2, state ); |
1608 | 0 | } |
1609 | |
|
1610 | 0 | chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER ); |
1611 | 0 | _mm_storeu_si128( (__m128i *) pbChainingValue, chain ); |
1612 | 0 | _mm_storeu_si128( (__m128i *) pState, state ); |
1613 | 0 | } |
1614 | | |
1615 | | #pragma warning(push) |
1616 | | #pragma warning( disable:4701 ) |
1617 | | #pragma runtime_checks( "u", off ) |
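// These pragmas are presumably needed because, on the short tail paths below, c4..c7 can be
// read (by AES_GCM_ENCRYPT_8 or by the register-shuffling tail loop) without ever having
// been written. The corresponding keystream blocks are never stored to pbDst, so the reads
// are harmless, but they would trip compiler warning C4701 and the uninitialized-variable
// runtime check (/RTCu).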
1618 | | // This call is functionally identical to: |
1619 | | // SymCryptGHashAppendDataPclmulqdq( expandedKeyTable, |
1620 | | // pState, |
1621 | | // pbSrc, |
1622 | | // cbData ); |
1623 | | // SymCryptAesCtrMsb64Xmm( pExpandedKey, |
1624 | | // pbChainingValue, |
1625 | | // pbSrc, |
1626 | | // pbDst, |
1627 | | // cbData ); |
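// For decryption, GHASH is computed over the ciphertext input, so pbGhashSrc tracks pbSrc
// and each 8-block batch is hashed in the same stitched pass that generates its keystream;
// no one-batch lag is needed here.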
1628 | | VOID |
1629 | | SYMCRYPT_CALL |
1630 | | SymCryptAesGcmDecryptStitchedXmm( |
1631 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
1632 | | _In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, |
1633 | | _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable, |
1634 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
1635 | | _In_reads_( cbData ) PCBYTE pbSrc, |
1636 | | _Out_writes_( cbData ) PBYTE pbDst, |
1637 | | SIZE_T cbData ) |
1638 | 0 | { |
1639 | 0 | __m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue ); |
1640 | |
|
1641 | 0 | __m128i BYTE_REVERSE_ORDER = _mm_set_epi8( |
1642 | 0 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); |
1643 | 0 | __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 ); |
1644 | |
|
1645 | 0 | __m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 ); |
1646 | 0 | __m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 ); |
1647 | |
|
1648 | 0 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; |
1649 | |
|
1650 | 0 | __m128i state; |
1651 | 0 | __m128i a0, a1, a2; |
1652 | 0 | SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE; |
1653 | 0 | SIZE_T todo = 0; |
1654 | 0 | PCBYTE pbGhashSrc = pbSrc; |
1655 | |
|
1656 | 0 | SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is a multiple of the block size |
1657 | |
|
1658 | 0 | chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER ); |
1659 | 0 | state = _mm_loadu_si128( (__m128i *) pState ); |
1660 | |
|
1661 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ); |
1662 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 ); |
1663 | |
|
1664 | 0 | while( nBlocks >= 8 ) |
1665 | 0 | { |
1666 | | // In this loop we always have 8 blocks to decrypt and GHASH |
1667 | 0 | c0 = chain; |
1668 | 0 | c1 = _mm_add_epi32( chain, chainIncrement1 ); |
1669 | 0 | c2 = _mm_add_epi32( chain, chainIncrement2 ); |
1670 | 0 | c3 = _mm_add_epi32( c1, chainIncrement2 ); |
1671 | 0 | c4 = _mm_add_epi32( c2, chainIncrement2 ); |
1672 | 0 | c5 = _mm_add_epi32( c3, chainIncrement2 ); |
1673 | 0 | c6 = _mm_add_epi32( c4, chainIncrement2 ); |
1674 | 0 | c7 = _mm_add_epi32( c5, chainIncrement2 ); |
1675 | 0 | chain = _mm_add_epi32( c6, chainIncrement2 ); |
1676 | |
|
1677 | 0 | c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER ); |
1678 | 0 | c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER ); |
1679 | 0 | c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER ); |
1680 | 0 | c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER ); |
1681 | 0 | c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER ); |
1682 | 0 | c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER ); |
1683 | 0 | c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER ); |
1684 | 0 | c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER ); |
1685 | |
|
1686 | 0 | AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 ); |
1687 | |
|
1688 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) ); |
1689 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) ); |
1690 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ) ); |
1691 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ) ); |
1692 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ) ); |
1693 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ) ); |
1694 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ) ); |
1695 | 0 | _mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112) ) ) ); |
1696 | |
|
1697 | 0 | pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
1698 | 0 | pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE; |
1699 | 0 | nBlocks -= 8; |
1700 | |
|
1701 | 0 | if ( todo == 0 ) |
1702 | 0 | { |
1703 | 0 | CLMUL_3_POST( a0, a1, a2 ); |
1704 | 0 | MODREDUCE( vMultiplicationConstant, a0, a1, a2, state ); |
1705 | |
|
1706 | 0 | if ( nBlocks > 0 ) |
1707 | 0 | { |
1708 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ); |
1709 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 ); |
1710 | 0 | } |
1711 | 0 | } |
1712 | 0 | } |
1713 | |
|
1714 | 0 | if( nBlocks > 0 ) |
1715 | 0 | { |
1716 | | // We have 1-7 blocks to GHASH and decrypt |
1717 | | // Do the exact number of GHASH blocks we need in parallel with generating either 4 or 8 blocks of AES-CTR |
1718 | 0 | c0 = chain; |
1719 | 0 | c1 = _mm_add_epi32( chain, chainIncrement1 ); |
1720 | 0 | c2 = _mm_add_epi32( chain, chainIncrement2 ); |
1721 | 0 | c3 = _mm_add_epi32( c1, chainIncrement2 ); |
1722 | 0 | c4 = _mm_add_epi32( c2, chainIncrement2 ); |
1723 | |
|
1724 | 0 | c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER ); |
1725 | 0 | c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER ); |
1726 | 0 | c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER ); |
1727 | 0 | c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER ); |
1728 | |
|
1729 | 0 | if( nBlocks > 4 ) |
1730 | 0 | { |
1731 | 0 | c5 = _mm_add_epi32( c4, chainIncrement1 ); |
1732 | 0 | c6 = _mm_add_epi32( c4, chainIncrement2 ); |
1733 | |
|
1734 | 0 | c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER ); |
1735 | 0 | c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER ); |
1736 | 0 | c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER ); |
1737 | |
|
1738 | 0 | AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, nBlocks, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 ); |
1739 | 0 | } else { |
1740 | 0 | AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pbGhashSrc, nBlocks, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 ); |
1741 | 0 | } |
1742 | |
|
1743 | 0 | CLMUL_3_POST( a0, a1, a2 ); |
1744 | 0 | MODREDUCE( vMultiplicationConstant, a0, a1, a2, state ); |
1745 | | |
1746 | | // Decrypt 1-7 blocks with pre-generated AES-CTR blocks |
1747 | 0 | while( nBlocks >= 2 ) |
1748 | 0 | { |
1749 | 0 | chain = _mm_add_epi32( chain, chainIncrement2 ); |
1750 | |
|
1751 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) ); |
1752 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) ); |
1753 | |
|
1754 | 0 | pbDst += 2*SYMCRYPT_AES_BLOCK_SIZE; |
1755 | 0 | pbSrc += 2*SYMCRYPT_AES_BLOCK_SIZE; |
1756 | 0 | nBlocks -= 2; |
1757 | 0 | c0 = c2; |
1758 | 0 | c1 = c3; |
1759 | 0 | c2 = c4; |
1760 | 0 | c3 = c5; |
1761 | 0 | c4 = c6; |
1762 | 0 | } |
1763 | |
|
1764 | 0 | if( nBlocks > 0 ) |
1765 | 0 | { |
1766 | 0 | chain = _mm_add_epi32( chain, chainIncrement1 ); |
1767 | |
|
1768 | 0 | _mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) ); |
1769 | 0 | } |
1770 | 0 | } |
1771 | |
|
1772 | 0 | chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER ); |
1773 | 0 | _mm_storeu_si128( (__m128i *) pbChainingValue, chain ); |
1774 | 0 | _mm_storeu_si128( (__m128i *) pState, state ); |
1775 | 0 | } |
1776 | | #pragma runtime_checks( "u", restore ) |
1777 | | #pragma warning(pop) |
1778 | | |
1779 | | #endif // CPU_X86 | CPU_AMD64 |