/src/SymCrypt/lib/aes-ymm.c
Line | Count | Source |
1 | | // |
2 | | // aes-ymm.c code for AES implementation |
3 | | // |
4 | | // Copyright (c) Microsoft Corporation. Licensed under the MIT license. |
5 | | // |
6 | | // All YMM code for AES operations |
7 | | // Requires compiler support for aesni, pclmulqdq, avx2, vaes and vpclmulqdq |
8 | | // |
9 | | |
10 | | #include "precomp.h" |
11 | | |
12 | | #if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 |
13 | | |
14 | | #include "xtsaes_definitions.h" |
15 | | #include "ghash_definitions.h" |
16 | | |
17 | 0 | #define AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \ |
18 | 0 | { \ |
19 | 0 | const BYTE (*keyPtr)[4][4]; \ |
20 | 0 | const BYTE (*keyLimit)[4][4]; \ |
21 | 0 | __m256i roundkeys; \ |
22 | 0 | \ |
23 | 0 | keyPtr = pExpandedKey->RoundKey; \ |
24 | 0 | keyLimit = pExpandedKey->lastEncRoundKey; \ |
25 | 0 | \ |
26 | 0 | /* _mm256_broadcastsi128_si256 requires AVX2 */ \ |
27 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
28 | 0 | keyPtr ++; \ |
29 | 0 | \ |
30 | 0 | /* _mm256_xor_si256 requires AVX2 */ \ |
31 | 0 | c0 = _mm256_xor_si256( c0, roundkeys ); \ |
32 | 0 | c1 = _mm256_xor_si256( c1, roundkeys ); \ |
33 | 0 | c2 = _mm256_xor_si256( c2, roundkeys ); \ |
34 | 0 | c3 = _mm256_xor_si256( c3, roundkeys ); \ |
35 | 0 | c4 = _mm256_xor_si256( c4, roundkeys ); \ |
36 | 0 | c5 = _mm256_xor_si256( c5, roundkeys ); \ |
37 | 0 | c6 = _mm256_xor_si256( c6, roundkeys ); \ |
38 | 0 | c7 = _mm256_xor_si256( c7, roundkeys ); \ |
39 | 0 | \ |
40 | 0 | do \ |
41 | 0 | { \ |
42 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
43 | 0 | keyPtr ++; \ |
44 | 0 | c0 = _mm256_aesenc_epi128( c0, roundkeys ); \ |
45 | 0 | c1 = _mm256_aesenc_epi128( c1, roundkeys ); \ |
46 | 0 | c2 = _mm256_aesenc_epi128( c2, roundkeys ); \ |
47 | 0 | c3 = _mm256_aesenc_epi128( c3, roundkeys ); \ |
48 | 0 | c4 = _mm256_aesenc_epi128( c4, roundkeys ); \ |
49 | 0 | c5 = _mm256_aesenc_epi128( c5, roundkeys ); \ |
50 | 0 | c6 = _mm256_aesenc_epi128( c6, roundkeys ); \ |
51 | 0 | c7 = _mm256_aesenc_epi128( c7, roundkeys ); \ |
52 | 0 | } while( keyPtr < keyLimit ); \ |
53 | 0 | \ |
54 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
55 | 0 | \ |
56 | 0 | c0 = _mm256_aesenclast_epi128( c0, roundkeys ); \ |
57 | 0 | c1 = _mm256_aesenclast_epi128( c1, roundkeys ); \ |
58 | 0 | c2 = _mm256_aesenclast_epi128( c2, roundkeys ); \ |
59 | 0 | c3 = _mm256_aesenclast_epi128( c3, roundkeys ); \ |
60 | 0 | c4 = _mm256_aesenclast_epi128( c4, roundkeys ); \ |
61 | 0 | c5 = _mm256_aesenclast_epi128( c5, roundkeys ); \ |
62 | 0 | c6 = _mm256_aesenclast_epi128( c6, roundkeys ); \ |
63 | 0 | c7 = _mm256_aesenclast_epi128( c7, roundkeys ); \ |
64 | 0 | }; |
65 | | |
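| | //
| | // AES_DECRYPT_YMM_2048: decryption counterpart of the macro above. It walks
| | // the decryption round keys between lastEncRoundKey and lastDecRoundKey
| | // using _mm256_aesdec_epi128 and _mm256_aesdeclast_epi128.
| | //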
66 | 0 | #define AES_DECRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \ |
67 | 0 | { \ |
68 | 0 | const BYTE (*keyPtr)[4][4]; \ |
69 | 0 | const BYTE (*keyLimit)[4][4]; \ |
70 | 0 | __m256i roundkeys; \ |
71 | 0 | \ |
72 | 0 | keyPtr = pExpandedKey->lastEncRoundKey; \ |
73 | 0 | keyLimit = pExpandedKey->lastDecRoundKey; \ |
74 | 0 | \ |
75 | 0 | /* _mm256_broadcastsi128_si256 requires AVX2 */ \ |
76 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
77 | 0 | keyPtr ++; \ |
78 | 0 | \ |
79 | 0 | /* _mm256_xor_si256 requires AVX2 */ \ |
80 | 0 | c0 = _mm256_xor_si256( c0, roundkeys ); \ |
81 | 0 | c1 = _mm256_xor_si256( c1, roundkeys ); \ |
82 | 0 | c2 = _mm256_xor_si256( c2, roundkeys ); \ |
83 | 0 | c3 = _mm256_xor_si256( c3, roundkeys ); \ |
84 | 0 | c4 = _mm256_xor_si256( c4, roundkeys ); \ |
85 | 0 | c5 = _mm256_xor_si256( c5, roundkeys ); \ |
86 | 0 | c6 = _mm256_xor_si256( c6, roundkeys ); \ |
87 | 0 | c7 = _mm256_xor_si256( c7, roundkeys ); \ |
88 | 0 | \ |
89 | 0 | do \ |
90 | 0 | { \ |
91 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
92 | 0 | keyPtr ++; \ |
93 | 0 | c0 = _mm256_aesdec_epi128( c0, roundkeys ); \ |
94 | 0 | c1 = _mm256_aesdec_epi128( c1, roundkeys ); \ |
95 | 0 | c2 = _mm256_aesdec_epi128( c2, roundkeys ); \ |
96 | 0 | c3 = _mm256_aesdec_epi128( c3, roundkeys ); \ |
97 | 0 | c4 = _mm256_aesdec_epi128( c4, roundkeys ); \ |
98 | 0 | c5 = _mm256_aesdec_epi128( c5, roundkeys ); \ |
99 | 0 | c6 = _mm256_aesdec_epi128( c6, roundkeys ); \ |
100 | 0 | c7 = _mm256_aesdec_epi128( c7, roundkeys ); \ |
101 | 0 | } while( keyPtr < keyLimit ); \ |
102 | 0 | \ |
103 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
104 | 0 | \ |
105 | 0 | c0 = _mm256_aesdeclast_epi128( c0, roundkeys ); \ |
106 | 0 | c1 = _mm256_aesdeclast_epi128( c1, roundkeys ); \ |
107 | 0 | c2 = _mm256_aesdeclast_epi128( c2, roundkeys ); \ |
108 | 0 | c3 = _mm256_aesdeclast_epi128( c3, roundkeys ); \ |
109 | 0 | c4 = _mm256_aesdeclast_epi128( c4, roundkeys ); \ |
110 | 0 | c5 = _mm256_aesdeclast_epi128( c5, roundkeys ); \ |
111 | 0 | c6 = _mm256_aesdeclast_epi128( c6, roundkeys ); \ |
112 | 0 | c7 = _mm256_aesdeclast_epi128( c7, roundkeys ); \ |
113 | 0 | }; |
114 | | |
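| | //
| | // XTS-AES encryption of one data unit. The main loop processes 256 bytes
| | // (16 blocks) per iteration using 8 YMM registers, with the 16 tweaks held
| | // in T0..T7 (two per register); any remainder that is not a multiple of
| | // 16 blocks is handled by the Xmm implementation.
| | //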
115 | | VOID |
116 | | SYMCRYPT_CALL |
117 | | SymCryptXtsAesEncryptDataUnitYmm_2048( |
118 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
119 | | _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbTweakBlock, |
120 | | _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 ) PBYTE pbScratch, |
121 | | _In_reads_( cbData ) PCBYTE pbSrc, |
122 | | _Out_writes_( cbData ) PBYTE pbDst, |
123 | | SIZE_T cbData ) |
124 | 0 | { |
125 | 0 | __m128i t0, t1, t2, t3, t4, t5, t6, t7; |
126 | 0 | __m256i c0, c1, c2, c3, c4, c5, c6, c7; |
127 | 0 | __m128i XTS_ALPHA_MASK; |
128 | 0 | __m256i XTS_ALPHA_MULTIPLIER_Ymm; |
129 | | |
130 | | // Load tweaks into big T |
131 | 0 | __m256i T0, T1, T2, T3, T4, T5, T6, T7; |
132 | |
133 | 0 | SIZE_T cbDataMain; // number of bytes to handle in the main loop |
134 | 0 | SIZE_T cbDataTail; // number of bytes to handle in the tail loop |
135 | | |
136 | | // To simplify the logic and the handling of unusual sizes, we process
137 | | // all data that is not a multiple of 16 blocks in the tail loop
138 | 0 | cbDataTail = cbData & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1); |
139 | | // Additionally, so that ciphertext stealing logic does not rely on |
140 | | // reading back from the destination buffer, when we have a non-zero |
141 | | // tail, we ensure that we handle at least 1 whole block in the tail |
142 | 0 | cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (16*SYMCRYPT_AES_BLOCK_SIZE) : 0; |
143 | 0 | cbDataMain = cbData - cbDataTail; |
144 | |
145 | 0 | SYMCRYPT_ASSERT(cbDataMain <= cbData); |
146 | 0 | SYMCRYPT_ASSERT(cbDataTail <= cbData); |
147 | 0 | SYMCRYPT_ASSERT((cbDataMain & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0); |
148 | |
149 | 0 | if( cbDataMain == 0 ) |
150 | 0 | { |
151 | 0 | SymCryptXtsAesEncryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail ); |
152 | 0 | return; |
153 | 0 | } |
154 | | |
155 | 0 | t0 = _mm_loadu_si128( (__m128i *) pbTweakBlock ); |
156 | 0 | XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 ); |
157 | 0 | XTS_ALPHA_MULTIPLIER_Ymm = _mm256_set_epi64x( 0, 0x87, 0, 0x87 ); |
158 | | |
159 | | // Interleave the two multiplication chains so we do not stall on a single dependency chain.
160 | 0 | XTS_MUL_ALPHA4( t0, t4 ); |
161 | 0 | XTS_MUL_ALPHA ( t0, t1 ); |
162 | 0 | XTS_MUL_ALPHA ( t4, t5 ); |
163 | 0 | XTS_MUL_ALPHA ( t1, t2 ); |
164 | 0 | XTS_MUL_ALPHA ( t5, t6 ); |
165 | 0 | XTS_MUL_ALPHA ( t2, t3 ); |
166 | 0 | XTS_MUL_ALPHA ( t6, t7 ); |
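| | // Pack the 8 Xmm tweaks into T0..T3 (two tweaks per Ymm register) and
| | // derive T4..T7 by multiplying by alpha^8, giving 16 consecutive tweaks.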
167 | |
168 | 0 | T0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), t1, 1 ); // AVX |
169 | 0 | T1 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), t3, 1 ); |
170 | 0 | T2 = _mm256_insertf128_si256( _mm256_castsi128_si256( t4 ), t5, 1 ); |
171 | 0 | T3 = _mm256_insertf128_si256( _mm256_castsi128_si256( t6 ), t7, 1 ); |
172 | 0 | XTS_MUL_ALPHA8_YMM(T0, T4); |
173 | 0 | XTS_MUL_ALPHA8_YMM(T1, T5); |
174 | 0 | XTS_MUL_ALPHA8_YMM(T2, T6); |
175 | 0 | XTS_MUL_ALPHA8_YMM(T3, T7); |
176 | |
177 | 0 | for(;;) |
178 | 0 | { |
179 | 0 | c0 = _mm256_xor_si256( T0, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 0 ) ) ); |
180 | 0 | c1 = _mm256_xor_si256( T1, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 2*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
181 | 0 | c2 = _mm256_xor_si256( T2, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 4*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
182 | 0 | c3 = _mm256_xor_si256( T3, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 6*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
183 | 0 | c4 = _mm256_xor_si256( T4, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 8*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
184 | 0 | c5 = _mm256_xor_si256( T5, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 10*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
185 | 0 | c6 = _mm256_xor_si256( T6, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 12*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
186 | 0 | c7 = _mm256_xor_si256( T7, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 14*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
187 | |
188 | 0 | pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
189 | |
190 | 0 | AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ); |
191 | |
192 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 0 ), _mm256_xor_si256( c0, T0 ) ); |
193 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 2*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c1, T1 ) ); |
194 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 4*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c2, T2 ) ); |
195 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 6*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c3, T3 ) ); |
196 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 8*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c4, T4 ) ); |
197 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 10*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c5, T5 ) ); |
198 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 12*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c6, T6 ) ); |
199 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 14*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c7, T7 ) ); |
200 | |
201 | 0 | pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
202 | |
203 | 0 | cbDataMain -= 16 * SYMCRYPT_AES_BLOCK_SIZE; |
204 | 0 | if( cbDataMain < 16 * SYMCRYPT_AES_BLOCK_SIZE ) |
205 | 0 | { |
206 | 0 | break; |
207 | 0 | } |
208 | | |
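| | // Advance all 16 tweaks by alpha^16 for the next 16-block set.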
209 | 0 | XTS_MUL_ALPHA16_YMM(T0, T0); |
210 | 0 | XTS_MUL_ALPHA16_YMM(T1, T1); |
211 | 0 | XTS_MUL_ALPHA16_YMM(T2, T2); |
212 | 0 | XTS_MUL_ALPHA16_YMM(T3, T3); |
213 | 0 | XTS_MUL_ALPHA16_YMM(T4, T4); |
214 | 0 | XTS_MUL_ALPHA16_YMM(T5, T5); |
215 | 0 | XTS_MUL_ALPHA16_YMM(T6, T6); |
216 | 0 | XTS_MUL_ALPHA16_YMM(T7, T7); |
217 | 0 | } |
218 | | |
219 | | // We won't do another 16-block set so we don't update the tweak blocks |
220 | |
221 | 0 | if( cbDataTail > 0 ) |
222 | 0 | { |
223 | | // |
224 | | // This is a rare case: the data unit length is not a multiple of 256 bytes. |
225 | | // We handle the remaining data in the Xmm implementation.
226 | | // Fix up the tweak block first.
227 | | // |
228 | 0 | t7 = _mm256_extracti128_si256 ( T7, 1 /* Highest 128 bits */ ); // AVX2 |
229 | 0 | _mm256_zeroupper(); |
230 | 0 | XTS_MUL_ALPHA( t7, t0 ); |
231 | 0 | _mm_storeu_si128( (__m128i *) pbTweakBlock, t0 ); |
232 | |
233 | 0 | SymCryptXtsAesEncryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail ); |
234 | 0 | } |
235 | 0 | else { |
236 | 0 | _mm256_zeroupper(); |
237 | 0 | } |
238 | 0 | } |
239 | | |
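| | //
| | // XTS-AES decryption of one data unit; same structure as the encryption
| | // routine above, using the AES decryption round keys.
| | //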
240 | | VOID |
241 | | SYMCRYPT_CALL |
242 | | SymCryptXtsAesDecryptDataUnitYmm_2048( |
243 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
244 | | _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbTweakBlock, |
245 | | _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 ) PBYTE pbScratch, |
246 | | _In_reads_( cbData ) PCBYTE pbSrc, |
247 | | _Out_writes_( cbData ) PBYTE pbDst, |
248 | | SIZE_T cbData ) |
249 | 0 | { |
250 | 0 | __m128i t0, t1, t2, t3, t4, t5, t6, t7; |
251 | 0 | __m256i c0, c1, c2, c3, c4, c5, c6, c7; |
252 | 0 | __m128i XTS_ALPHA_MASK; |
253 | 0 | __m256i XTS_ALPHA_MULTIPLIER_Ymm; |
254 | | |
255 | | // Load tweaks into big T |
256 | 0 | __m256i T0, T1, T2, T3, T4, T5, T6, T7; |
257 | |
258 | 0 | SIZE_T cbDataMain; // number of bytes to handle in the main loop |
259 | 0 | SIZE_T cbDataTail; // number of bytes to handle in the tail loop |
260 | | |
261 | | // To simplify the logic and the handling of unusual sizes, we process
262 | | // all data that is not a multiple of 16 blocks in the tail loop
263 | 0 | cbDataTail = cbData & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1); |
264 | | // Additionally, so that ciphertext stealing logic does not rely on |
265 | | // reading back from the destination buffer, when we have a non-zero |
266 | | // tail, we ensure that we handle at least 1 whole block in the tail |
267 | 0 | cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (16*SYMCRYPT_AES_BLOCK_SIZE) : 0; |
268 | 0 | cbDataMain = cbData - cbDataTail; |
269 | |
270 | 0 | SYMCRYPT_ASSERT(cbDataMain <= cbData); |
271 | 0 | SYMCRYPT_ASSERT(cbDataTail <= cbData); |
272 | 0 | SYMCRYPT_ASSERT((cbDataMain & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0); |
273 | |
274 | 0 | if( cbDataMain == 0 ) |
275 | 0 | { |
276 | 0 | SymCryptXtsAesDecryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail ); |
277 | 0 | return; |
278 | 0 | } |
279 | | |
280 | 0 | t0 = _mm_loadu_si128( (__m128i *) pbTweakBlock ); |
281 | 0 | XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 ); |
282 | 0 | XTS_ALPHA_MULTIPLIER_Ymm = _mm256_set_epi64x( 0, 0x87, 0, 0x87 ); |
283 | | |
284 | | // Interleave the two multiplication chains so we do not stall on a single dependency chain.
285 | 0 | XTS_MUL_ALPHA4( t0, t4 ); |
286 | 0 | XTS_MUL_ALPHA ( t0, t1 ); |
287 | 0 | XTS_MUL_ALPHA ( t4, t5 ); |
288 | 0 | XTS_MUL_ALPHA ( t1, t2 ); |
289 | 0 | XTS_MUL_ALPHA ( t5, t6 ); |
290 | 0 | XTS_MUL_ALPHA ( t2, t3 ); |
291 | 0 | XTS_MUL_ALPHA ( t6, t7 ); |
292 | |
293 | 0 | T0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), t1, 1); // AVX |
294 | 0 | T1 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), t3, 1); |
295 | 0 | T2 = _mm256_insertf128_si256( _mm256_castsi128_si256( t4 ), t5, 1); |
296 | 0 | T3 = _mm256_insertf128_si256( _mm256_castsi128_si256( t6 ), t7, 1); |
297 | 0 | XTS_MUL_ALPHA8_YMM(T0, T4); |
298 | 0 | XTS_MUL_ALPHA8_YMM(T1, T5); |
299 | 0 | XTS_MUL_ALPHA8_YMM(T2, T6); |
300 | 0 | XTS_MUL_ALPHA8_YMM(T3, T7); |
301 | |
302 | 0 | for(;;) |
303 | 0 | { |
304 | 0 | c0 = _mm256_xor_si256( T0, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 0 ) ) ); |
305 | 0 | c1 = _mm256_xor_si256( T1, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 2*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
306 | 0 | c2 = _mm256_xor_si256( T2, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 4*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
307 | 0 | c3 = _mm256_xor_si256( T3, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 6*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
308 | 0 | c4 = _mm256_xor_si256( T4, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 8*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
309 | 0 | c5 = _mm256_xor_si256( T5, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 10*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
310 | 0 | c6 = _mm256_xor_si256( T6, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 12*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
311 | 0 | c7 = _mm256_xor_si256( T7, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 14*SYMCRYPT_AES_BLOCK_SIZE ) ) ); |
312 | |
313 | 0 | pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
314 | |
315 | 0 | AES_DECRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ); |
316 | |
317 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 0 ), _mm256_xor_si256( c0, T0 ) ); |
318 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 2*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c1, T1 ) ); |
319 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 4*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c2, T2 ) ); |
320 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 6*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c3, T3 ) ); |
321 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 8*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c4, T4 ) ); |
322 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 10*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c5, T5 ) ); |
323 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 12*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c6, T6 ) ); |
324 | 0 | _mm256_storeu_si256( ( __m256i * ) ( pbDst + 14*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c7, T7 ) ); |
325 | |
326 | 0 | pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
327 | |
328 | 0 | cbDataMain -= 16 * SYMCRYPT_AES_BLOCK_SIZE; |
329 | 0 | if( cbDataMain < 16 * SYMCRYPT_AES_BLOCK_SIZE ) |
330 | 0 | { |
331 | 0 | break; |
332 | 0 | } |
333 | | |
334 | 0 | XTS_MUL_ALPHA16_YMM(T0, T0); |
335 | 0 | XTS_MUL_ALPHA16_YMM(T1, T1); |
336 | 0 | XTS_MUL_ALPHA16_YMM(T2, T2); |
337 | 0 | XTS_MUL_ALPHA16_YMM(T3, T3); |
338 | 0 | XTS_MUL_ALPHA16_YMM(T4, T4); |
339 | 0 | XTS_MUL_ALPHA16_YMM(T5, T5); |
340 | 0 | XTS_MUL_ALPHA16_YMM(T6, T6); |
341 | 0 | XTS_MUL_ALPHA16_YMM(T7, T7); |
342 | 0 | } |
343 | | |
344 | | // We won't do another 16-block set so we don't update the tweak blocks |
345 | |
346 | 0 | if( cbDataTail > 0 ) |
347 | 0 | { |
348 | | // |
349 | | // This is a rare case: the data unit length is not a multiple of 256 bytes. |
350 | | // We handle the remaining data in the Xmm implementation.
351 | | // Fix up the tweak block first.
352 | | // |
353 | 0 | t7 = _mm256_extracti128_si256 ( T7, 1 /* Highest 128 bits */ ); // AVX2 |
354 | 0 | _mm256_zeroupper(); |
355 | 0 | XTS_MUL_ALPHA( t7, t0 ); |
356 | 0 | _mm_storeu_si128( (__m128i *) pbTweakBlock, t0 ); |
357 | |
358 | 0 | SymCryptXtsAesDecryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail ); |
359 | 0 | } |
360 | 0 | else { |
361 | 0 | _mm256_zeroupper(); |
362 | 0 | } |
363 | 0 | } |
364 | | |
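| | //
| | // AES_FULLROUND_16_GHASH_2_Ymm: one AES round on 16 blocks (8 YMM registers)
| | // stitched with the GHASH accumulation of 2 blocks. It reads 32 bytes from
| | // gHashPointer, byte-reverses them, multiplies by the matching H powers with
| | // VPCLMULQDQ, and XORs the partial products into the Karatsuba accumulators
| | // resl / resm / resh.
| | //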
365 | 0 | #define AES_FULLROUND_16_GHASH_2_Ymm( roundkeys, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \ |
366 | 0 | { \ |
367 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
368 | 0 | keyPtr ++; \ |
369 | 0 | c0 = _mm256_aesenc_epi128( c0, roundkeys ); \ |
370 | 0 | c1 = _mm256_aesenc_epi128( c1, roundkeys ); \ |
371 | 0 | c2 = _mm256_aesenc_epi128( c2, roundkeys ); \ |
372 | 0 | c3 = _mm256_aesenc_epi128( c3, roundkeys ); \ |
373 | 0 | c4 = _mm256_aesenc_epi128( c4, roundkeys ); \ |
374 | 0 | c5 = _mm256_aesenc_epi128( c5, roundkeys ); \ |
375 | 0 | c6 = _mm256_aesenc_epi128( c6, roundkeys ); \ |
376 | 0 | c7 = _mm256_aesenc_epi128( c7, roundkeys ); \ |
377 | 0 | \ |
378 | 0 | r0 = _mm256_loadu_si256( (__m256i *) gHashPointer ); \ |
379 | 0 | r0 = _mm256_shuffle_epi8( r0, byteReverseOrder ); \ |
380 | 0 | gHashPointer += 32; \ |
381 | 0 | \ |
382 | 0 | t1 = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(gHashExpandedKeyTable, todo) ); \ |
383 | 0 | t0 = _mm256_clmulepi64_epi128( r0, t1, 0x00 ); \ |
384 | 0 | t1 = _mm256_clmulepi64_epi128( r0, t1, 0x11 ); \ |
385 | 0 | \ |
386 | 0 | resl = _mm256_xor_si256( resl, t0 ); \ |
387 | 0 | resh = _mm256_xor_si256( resh, t1 ); \ |
388 | 0 | \ |
389 | 0 | t0 = _mm256_srli_si256( r0, 8 ); \ |
390 | 0 | r0 = _mm256_xor_si256( r0, t0 ); \ |
391 | 0 | t1 = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(gHashExpandedKeyTable, todo) ); \ |
392 | 0 | t1 = _mm256_clmulepi64_epi128( r0, t1, 0x00 ); \ |
393 | 0 | \ |
394 | 0 | resm = _mm256_xor_si256( resm, t1 ); \ |
395 | 0 | todo -= 2; \ |
396 | 0 | }; |
397 | | |
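| | //
| | // AES_GCM_ENCRYPT_16_Ymm: encrypts 16 counter blocks while folding 16
| | // previously processed blocks into the GHASH accumulators. The first 8 AES
| | // rounds are stitched with GHASH work (2 blocks per round); the remaining
| | // rounds run in the unstitched do/while loop that follows.
| | //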
398 | 0 | #define AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \ |
399 | 0 | { \ |
400 | 0 | const BYTE (*keyPtr)[4][4]; \ |
401 | 0 | const BYTE (*keyLimit)[4][4]; \ |
402 | 0 | __m256i roundkeys; \ |
403 | 0 | __m256i t0, t1; \ |
404 | 0 | __m256i r0; \ |
405 | 0 | int aesEncryptGhashLoop; \ |
406 | 0 | \ |
407 | 0 | keyPtr = pExpandedKey->RoundKey; \ |
408 | 0 | keyLimit = pExpandedKey->lastEncRoundKey; \ |
409 | 0 | \ |
410 | 0 | /* _mm256_broadcastsi128_si256 requires AVX2 */ \ |
411 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
412 | 0 | keyPtr ++; \ |
413 | 0 | \ |
414 | 0 | /* _mm256_xor_si256 requires AVX2 */ \ |
415 | 0 | c0 = _mm256_xor_si256( c0, roundkeys ); \ |
416 | 0 | c1 = _mm256_xor_si256( c1, roundkeys ); \ |
417 | 0 | c2 = _mm256_xor_si256( c2, roundkeys ); \ |
418 | 0 | c3 = _mm256_xor_si256( c3, roundkeys ); \ |
419 | 0 | c4 = _mm256_xor_si256( c4, roundkeys ); \ |
420 | 0 | c5 = _mm256_xor_si256( c5, roundkeys ); \ |
421 | 0 | c6 = _mm256_xor_si256( c6, roundkeys ); \ |
422 | 0 | c7 = _mm256_xor_si256( c7, roundkeys ); \ |
423 | 0 | \ |
424 | 0 | /* Do 8(x2) full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \ |
425 | 0 | for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < 4; aesEncryptGhashLoop++ ) \ |
426 | 0 | { \ |
427 | 0 | AES_FULLROUND_16_GHASH_2_Ymm( roundkeys, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \ |
428 | 0 | AES_FULLROUND_16_GHASH_2_Ymm( roundkeys, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \ |
429 | 0 | } \ |
430 | 0 | \ |
431 | 0 | do \ |
432 | 0 | { \ |
433 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
434 | 0 | keyPtr ++; \ |
435 | 0 | c0 = _mm256_aesenc_epi128( c0, roundkeys ); \ |
436 | 0 | c1 = _mm256_aesenc_epi128( c1, roundkeys ); \ |
437 | 0 | c2 = _mm256_aesenc_epi128( c2, roundkeys ); \ |
438 | 0 | c3 = _mm256_aesenc_epi128( c3, roundkeys ); \ |
439 | 0 | c4 = _mm256_aesenc_epi128( c4, roundkeys ); \ |
440 | 0 | c5 = _mm256_aesenc_epi128( c5, roundkeys ); \ |
441 | 0 | c6 = _mm256_aesenc_epi128( c6, roundkeys ); \ |
442 | 0 | c7 = _mm256_aesenc_epi128( c7, roundkeys ); \ |
443 | 0 | } while( keyPtr < keyLimit ); \ |
444 | 0 | \ |
445 | 0 | roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \ |
446 | 0 | \ |
447 | 0 | c0 = _mm256_aesenclast_epi128( c0, roundkeys ); \ |
448 | 0 | c1 = _mm256_aesenclast_epi128( c1, roundkeys ); \ |
449 | 0 | c2 = _mm256_aesenclast_epi128( c2, roundkeys ); \ |
450 | 0 | c3 = _mm256_aesenclast_epi128( c3, roundkeys ); \ |
451 | 0 | c4 = _mm256_aesenclast_epi128( c4, roundkeys ); \ |
452 | 0 | c5 = _mm256_aesenclast_epi128( c5, roundkeys ); \ |
453 | 0 | c6 = _mm256_aesenclast_epi128( c6, roundkeys ); \ |
454 | 0 | c7 = _mm256_aesenclast_epi128( c7, roundkeys ); \ |
455 | 0 | }; |
456 | | |
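| | //
| | // Stitched AES-GCM encryption, 256 bytes (16 blocks) per main-loop pass.
| | // GHASH operates on the ciphertext, so it runs one 16-block chunk behind
| | // the CTR encryption: the first chunk is encrypted before the loop and the
| | // last chunk is hashed after it.
| | //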
457 | | VOID |
458 | | SYMCRYPT_CALL |
459 | | SymCryptAesGcmEncryptStitchedYmm_2048( |
460 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
461 | | _In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, |
462 | | _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable, |
463 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
464 | | _In_reads_( cbData ) PCBYTE pbSrc, |
465 | | _Out_writes_( cbData ) PBYTE pbDst, |
466 | | SIZE_T cbData ) |
467 | 0 | { |
468 | 0 | __m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue ); |
469 | |
470 | 0 | __m128i BYTE_REVERSE_ORDER_xmm = _mm_set_epi8( |
471 | 0 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); |
472 | 0 | __m256i BYTE_REVERSE_ORDER = _mm256_set_epi64x( 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f ); |
473 | 0 | __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 ); |
474 | |
475 | 0 | __m256i chainIncrementUpper1 = _mm256_set_epi64x( 0, 1, 0, 0 ); |
476 | 0 | __m256i chainIncrement2 = _mm256_set_epi64x( 0, 2, 0, 2 ); |
477 | 0 | __m256i chainIncrement4 = _mm256_set_epi64x( 0, 4, 0, 4 ); |
478 | 0 | __m256i chainIncrement16 = _mm256_set_epi64x( 0, 16, 0, 16 ); |
479 | |
480 | 0 | __m256i ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7; |
481 | 0 | __m256i c0, c1, c2, c3, c4, c5, c6, c7; |
482 | 0 | __m256i r0, r1, r2, r3, r4, r5, r6, r7; |
483 | 0 | __m256i Hi, Hix; |
484 | |
485 | 0 | __m128i state; |
486 | 0 | __m128i a0_xmm, a1_xmm, a2_xmm; |
487 | 0 | __m256i a0, a1, a2; |
488 | 0 | SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE; |
489 | 0 | SIZE_T todo; |
490 | 0 | PCBYTE pbGhashSrc = pbDst; |
491 | |
492 | 0 | SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size |
493 | 0 | SYMCRYPT_ASSERT( nBlocks >= GCM_YMM_MINBLOCKS ); |
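| | // todo is the number of blocks that can be hashed against the precomputed
| | // H powers before the accumulators must be combined and reduced; it is
| | // kept a multiple of GCM_YMM_MINBLOCKS.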
494 | |
495 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1); |
496 | 0 | chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm ); |
497 | |
498 | 0 | state = _mm_loadu_si128( (__m128i *) pState ); |
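| | // Each Ymm counter register holds two consecutive counter blocks:
| | // chainIncrementUpper1 bumps only the upper 128-bit lane, so ctr0 holds
| | // counters (n, n+1) and ctr0..ctr7 together cover n..n+15.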
499 | 0 | ctr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( chain ), chain, 1); // AVX |
500 | 0 | ctr0 = _mm256_add_epi32( ctr0, chainIncrementUpper1 ); |
501 | 0 | ctr1 = _mm256_add_epi32( ctr0, chainIncrement2 ); |
502 | 0 | ctr2 = _mm256_add_epi32( ctr0, chainIncrement4 ); |
503 | 0 | ctr3 = _mm256_add_epi32( ctr1, chainIncrement4 ); |
504 | 0 | ctr4 = _mm256_add_epi32( ctr2, chainIncrement4 ); |
505 | 0 | ctr5 = _mm256_add_epi32( ctr3, chainIncrement4 ); |
506 | 0 | ctr6 = _mm256_add_epi32( ctr4, chainIncrement4 ); |
507 | 0 | ctr7 = _mm256_add_epi32( ctr5, chainIncrement4 ); |
508 | |
509 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm ); |
510 | 0 | a0 = a1 = a2 = _mm256_setzero_si256(); |
511 | |
512 | 0 | c0 = _mm256_shuffle_epi8( ctr0, BYTE_REVERSE_ORDER ); |
513 | 0 | c1 = _mm256_shuffle_epi8( ctr1, BYTE_REVERSE_ORDER ); |
514 | 0 | c2 = _mm256_shuffle_epi8( ctr2, BYTE_REVERSE_ORDER ); |
515 | 0 | c3 = _mm256_shuffle_epi8( ctr3, BYTE_REVERSE_ORDER ); |
516 | 0 | c4 = _mm256_shuffle_epi8( ctr4, BYTE_REVERSE_ORDER ); |
517 | 0 | c5 = _mm256_shuffle_epi8( ctr5, BYTE_REVERSE_ORDER ); |
518 | 0 | c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER ); |
519 | 0 | c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER ); |
520 | |
521 | 0 | ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 ); |
522 | 0 | ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 ); |
523 | 0 | ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 ); |
524 | 0 | ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 ); |
525 | 0 | ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 ); |
526 | 0 | ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 ); |
527 | 0 | ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 ); |
528 | 0 | ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 ); |
529 | |
530 | 0 | AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ); |
531 | |
532 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 0), _mm256_xor_si256( c0, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 0) ) ) ); |
533 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 32), _mm256_xor_si256( c1, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 32) ) ) ); |
534 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 64), _mm256_xor_si256( c2, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 64) ) ) ); |
535 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 96), _mm256_xor_si256( c3, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 96) ) ) ); |
536 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +128), _mm256_xor_si256( c4, _mm256_loadu_si256( ( __m256i * ) (pbSrc +128) ) ) ); |
537 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +160), _mm256_xor_si256( c5, _mm256_loadu_si256( ( __m256i * ) (pbSrc +160) ) ) ); |
538 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +192), _mm256_xor_si256( c6, _mm256_loadu_si256( ( __m256i * ) (pbSrc +192) ) ) ); |
539 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +224), _mm256_xor_si256( c7, _mm256_loadu_si256( ( __m256i * ) (pbSrc +224) ) ) ); |
540 | |
541 | 0 | pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
542 | 0 | pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
543 | |
544 | 0 | while( nBlocks >= 2*GCM_YMM_MINBLOCKS ) |
545 | 0 | { |
546 | 0 | c0 = _mm256_shuffle_epi8( ctr0, BYTE_REVERSE_ORDER ); |
547 | 0 | c1 = _mm256_shuffle_epi8( ctr1, BYTE_REVERSE_ORDER ); |
548 | 0 | c2 = _mm256_shuffle_epi8( ctr2, BYTE_REVERSE_ORDER ); |
549 | 0 | c3 = _mm256_shuffle_epi8( ctr3, BYTE_REVERSE_ORDER ); |
550 | 0 | c4 = _mm256_shuffle_epi8( ctr4, BYTE_REVERSE_ORDER ); |
551 | 0 | c5 = _mm256_shuffle_epi8( ctr5, BYTE_REVERSE_ORDER ); |
552 | 0 | c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER ); |
553 | 0 | c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER ); |
554 | |
555 | 0 | ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 ); |
556 | 0 | ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 ); |
557 | 0 | ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 ); |
558 | 0 | ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 ); |
559 | 0 | ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 ); |
560 | 0 | ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 ); |
561 | 0 | ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 ); |
562 | 0 | ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 ); |
563 | |
564 | 0 | AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 ); |
565 | |
566 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 0), _mm256_xor_si256( c0, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 0) ) ) ); |
567 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 32), _mm256_xor_si256( c1, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 32) ) ) ); |
568 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 64), _mm256_xor_si256( c2, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 64) ) ) ); |
569 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 96), _mm256_xor_si256( c3, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 96) ) ) ); |
570 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +128), _mm256_xor_si256( c4, _mm256_loadu_si256( ( __m256i * ) (pbSrc +128) ) ) ); |
571 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +160), _mm256_xor_si256( c5, _mm256_loadu_si256( ( __m256i * ) (pbSrc +160) ) ) ); |
572 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +192), _mm256_xor_si256( c6, _mm256_loadu_si256( ( __m256i * ) (pbSrc +192) ) ) ); |
573 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +224), _mm256_xor_si256( c7, _mm256_loadu_si256( ( __m256i * ) (pbSrc +224) ) ) ); |
574 | |
575 | 0 | pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
576 | 0 | pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
577 | 0 | nBlocks -= 16; |
578 | |
579 | 0 | if ( todo == 0 ) |
580 | 0 | { |
581 | 0 | a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 0 /* Lowest 128 bits */ )); |
582 | 0 | a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 0 /* Lowest 128 bits */ )); |
583 | 0 | a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 0 /* Lowest 128 bits */ )); |
584 | |
585 | 0 | a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 1 /* Highest 128 bits */ )); |
586 | 0 | a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 1 /* Highest 128 bits */ )); |
587 | 0 | a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 1 /* Highest 128 bits */ )); |
588 | 0 | CLMUL_3_POST( a0_xmm, a1_xmm, a2_xmm ); |
589 | 0 | MODREDUCE( vMultiplicationConstant, a0_xmm, a1_xmm, a2_xmm, state ); |
590 | |
591 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1); |
592 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm ); |
593 | 0 | a0 = a1 = a2 = _mm256_setzero_si256(); |
594 | 0 | } |
595 | 0 | } |
596 | |
597 | 0 | r0 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 0) ), BYTE_REVERSE_ORDER ); |
598 | 0 | r1 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 32) ), BYTE_REVERSE_ORDER ); |
599 | 0 | r2 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 64) ), BYTE_REVERSE_ORDER ); |
600 | 0 | r3 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 96) ), BYTE_REVERSE_ORDER ); |
601 | 0 | r4 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +128) ), BYTE_REVERSE_ORDER ); |
602 | 0 | r5 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +160) ), BYTE_REVERSE_ORDER ); |
603 | 0 | r6 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +192) ), BYTE_REVERSE_ORDER ); |
604 | 0 | r7 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +224) ), BYTE_REVERSE_ORDER ); |
605 | |
606 | 0 | Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 0) ); |
607 | 0 | Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 0) ); |
608 | 0 | CLMUL_ACC_3_Ymm( r0, Hi, Hix, a0, a1, a2 ); |
609 | 0 | Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 2) ); |
610 | 0 | Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 2) ); |
611 | 0 | CLMUL_ACC_3_Ymm( r1, Hi, Hix, a0, a1, a2 ); |
612 | 0 | Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 4) ); |
613 | 0 | Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 4) ); |
614 | 0 | CLMUL_ACC_3_Ymm( r2, Hi, Hix, a0, a1, a2 ); |
615 | 0 | Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 6) ); |
616 | 0 | Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 6) ); |
617 | 0 | CLMUL_ACC_3_Ymm( r3, Hi, Hix, a0, a1, a2 ); |
618 | 0 | Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 8) ); |
619 | 0 | Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 8) ); |
620 | 0 | CLMUL_ACC_3_Ymm( r4, Hi, Hix, a0, a1, a2 ); |
621 | 0 | Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo -10) ); |
622 | 0 | Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo -10) ); |
623 | 0 | CLMUL_ACC_3_Ymm( r5, Hi, Hix, a0, a1, a2 ); |
624 | 0 | Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo -12) ); |
625 | 0 | Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo -12) ); |
626 | 0 | CLMUL_ACC_3_Ymm( r6, Hi, Hix, a0, a1, a2 ); |
627 | 0 | Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo -14) ); |
628 | 0 | Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo -14) ); |
629 | 0 | CLMUL_ACC_3_Ymm( r7, Hi, Hix, a0, a1, a2 ); |
630 | |
631 | 0 | a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 0 /* Lowest 128 bits */ )); |
632 | 0 | a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 0 /* Lowest 128 bits */ )); |
633 | 0 | a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 0 /* Lowest 128 bits */ )); |
634 | |
635 | 0 | a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 1 /* Highest 128 bits */ )); |
636 | 0 | a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 1 /* Highest 128 bits */ )); |
637 | 0 | a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 1 /* Highest 128 bits */ )); |
638 | 0 | CLMUL_3_POST( a0_xmm, a1_xmm, a2_xmm ); |
639 | 0 | MODREDUCE( vMultiplicationConstant, a0_xmm, a1_xmm, a2_xmm, state ); |
640 | |
641 | 0 | chain = _mm256_extracti128_si256 ( ctr0, 0 /* Lowest 128 bits */ ); |
642 | 0 | _mm256_zeroupper(); |
643 | |
644 | 0 | chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm ); |
645 | 0 | _mm_storeu_si128((__m128i *) pbChainingValue, chain ); |
646 | 0 | _mm_storeu_si128((__m128i *) pState, state ); |
647 | |
648 | 0 | cbData &= ( GCM_YMM_MINBLOCKS*SYMCRYPT_AES_BLOCK_SIZE ) - 1; |
649 | 0 | SYMCRYPT_ASSERT( cbData == (nBlocks-16)*SYMCRYPT_AES_BLOCK_SIZE ); |
650 | 0 | if ( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) |
651 | 0 | { |
652 | 0 | SymCryptAesGcmEncryptStitchedXmm( pExpandedKey, pbChainingValue, expandedKeyTable, pState, pbSrc, pbDst, cbData); |
653 | 0 | } |
654 | 0 | } |
655 | | |
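| | //
| | // Stitched AES-GCM decryption. GHASH is computed over the ciphertext input
| | // (pbGhashSrc = pbSrc), so no GHASH work needs to be deferred past the loop;
| | // every full 16-block chunk is handled in the single stitched loop.
| | //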
656 | | VOID |
657 | | SYMCRYPT_CALL |
658 | | SymCryptAesGcmDecryptStitchedYmm_2048( |
659 | | _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
660 | | _In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, |
661 | | _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable, |
662 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
663 | | _In_reads_( cbData ) PCBYTE pbSrc, |
664 | | _Out_writes_( cbData ) PBYTE pbDst, |
665 | | SIZE_T cbData ) |
666 | 0 | { |
667 | 0 | __m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue ); |
668 | |
669 | 0 | __m128i BYTE_REVERSE_ORDER_xmm = _mm_set_epi8( |
670 | 0 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); |
671 | 0 | __m256i BYTE_REVERSE_ORDER = _mm256_set_epi64x( 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f ); |
672 | 0 | __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 ); |
673 | |
674 | 0 | __m256i chainIncrementUpper1 = _mm256_set_epi64x( 0, 1, 0, 0 ); |
675 | 0 | __m256i chainIncrement2 = _mm256_set_epi64x( 0, 2, 0, 2 ); |
676 | 0 | __m256i chainIncrement4 = _mm256_set_epi64x( 0, 4, 0, 4 ); |
677 | 0 | __m256i chainIncrement16 = _mm256_set_epi64x( 0, 16, 0, 16 ); |
678 | |
679 | 0 | __m256i ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7; |
680 | 0 | __m256i c0, c1, c2, c3, c4, c5, c6, c7; |
681 | |
682 | 0 | __m128i state; |
683 | 0 | __m128i a0_xmm, a1_xmm, a2_xmm; |
684 | 0 | __m256i a0, a1, a2; |
685 | 0 | SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE; |
686 | 0 | SIZE_T todo; |
687 | 0 | PCBYTE pbGhashSrc = pbSrc; |
688 | |
689 | 0 | SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size |
690 | 0 | SYMCRYPT_ASSERT( nBlocks >= GCM_YMM_MINBLOCKS ); |
691 | |
692 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1); |
693 | 0 | chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm ); |
694 | |
695 | 0 | state = _mm_loadu_si128( (__m128i *) pState ); |
696 | 0 | ctr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( chain ), chain, 1); // AVX |
697 | 0 | ctr0 = _mm256_add_epi32( ctr0, chainIncrementUpper1 ); |
698 | 0 | ctr1 = _mm256_add_epi32( ctr0, chainIncrement2 ); |
699 | 0 | ctr2 = _mm256_add_epi32( ctr0, chainIncrement4 ); |
700 | 0 | ctr3 = _mm256_add_epi32( ctr1, chainIncrement4 ); |
701 | 0 | ctr4 = _mm256_add_epi32( ctr2, chainIncrement4 ); |
702 | 0 | ctr5 = _mm256_add_epi32( ctr3, chainIncrement4 ); |
703 | 0 | ctr6 = _mm256_add_epi32( ctr4, chainIncrement4 ); |
704 | 0 | ctr7 = _mm256_add_epi32( ctr5, chainIncrement4 ); |
705 | |
706 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm ); |
707 | 0 | a0 = a1 = a2 = _mm256_setzero_si256(); |
708 | |
709 | 0 | while( nBlocks >= GCM_YMM_MINBLOCKS ) |
710 | 0 | { |
711 | 0 | c0 = _mm256_shuffle_epi8( ctr0, BYTE_REVERSE_ORDER ); |
712 | 0 | c1 = _mm256_shuffle_epi8( ctr1, BYTE_REVERSE_ORDER ); |
713 | 0 | c2 = _mm256_shuffle_epi8( ctr2, BYTE_REVERSE_ORDER ); |
714 | 0 | c3 = _mm256_shuffle_epi8( ctr3, BYTE_REVERSE_ORDER ); |
715 | 0 | c4 = _mm256_shuffle_epi8( ctr4, BYTE_REVERSE_ORDER ); |
716 | 0 | c5 = _mm256_shuffle_epi8( ctr5, BYTE_REVERSE_ORDER ); |
717 | 0 | c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER ); |
718 | 0 | c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER ); |
719 | |
720 | 0 | ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 ); |
721 | 0 | ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 ); |
722 | 0 | ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 ); |
723 | 0 | ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 ); |
724 | 0 | ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 ); |
725 | 0 | ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 ); |
726 | 0 | ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 ); |
727 | 0 | ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 ); |
728 | |
729 | 0 | AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 ); |
730 | |
731 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 0), _mm256_xor_si256( c0, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 0) ) ) ); |
732 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 32), _mm256_xor_si256( c1, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 32) ) ) ); |
733 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 64), _mm256_xor_si256( c2, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 64) ) ) ); |
734 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst + 96), _mm256_xor_si256( c3, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 96) ) ) ); |
735 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +128), _mm256_xor_si256( c4, _mm256_loadu_si256( ( __m256i * ) (pbSrc +128) ) ) ); |
736 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +160), _mm256_xor_si256( c5, _mm256_loadu_si256( ( __m256i * ) (pbSrc +160) ) ) ); |
737 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +192), _mm256_xor_si256( c6, _mm256_loadu_si256( ( __m256i * ) (pbSrc +192) ) ) ); |
738 | 0 | _mm256_storeu_si256( (__m256i *) (pbDst +224), _mm256_xor_si256( c7, _mm256_loadu_si256( ( __m256i * ) (pbSrc +224) ) ) ); |
739 | |
740 | 0 | pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
741 | 0 | pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE; |
742 | 0 | nBlocks -= 16; |
743 | |
744 | 0 | if ( todo == 0 ) |
745 | 0 | { |
746 | 0 | a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 0 /* Lowest 128 bits */ )); |
747 | 0 | a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 0 /* Lowest 128 bits */ )); |
748 | 0 | a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 0 /* Lowest 128 bits */ )); |
749 | |
|
750 | 0 | a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 1 /* Highest 128 bits */ )); |
751 | 0 | a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 1 /* Highest 128 bits */ )); |
752 | 0 | a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 1 /* Highest 128 bits */ )); |
753 | 0 | CLMUL_3_POST( a0_xmm, a1_xmm, a2_xmm ); |
754 | 0 | MODREDUCE( vMultiplicationConstant, a0_xmm, a1_xmm, a2_xmm, state ); |
755 | |
756 | 0 | if ( nBlocks > 0 ) |
757 | 0 | { |
758 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1); |
759 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm ); |
760 | 0 | a0 = a1 = a2 = _mm256_setzero_si256(); |
761 | 0 | } |
762 | 0 | } |
763 | 0 | } |
764 | |
765 | 0 | chain = _mm256_extracti128_si256 ( ctr0, 0 /* Lowest 128 bits */ ); |
766 | 0 | _mm256_zeroupper(); |
767 | |
768 | 0 | chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm ); |
769 | 0 | _mm_storeu_si128((__m128i *) pbChainingValue, chain ); |
770 | 0 | _mm_storeu_si128((__m128i *) pState, state ); |
771 | |
772 | 0 | cbData &= ( GCM_YMM_MINBLOCKS*SYMCRYPT_AES_BLOCK_SIZE ) - 1; |
773 | 0 | SYMCRYPT_ASSERT( cbData == nBlocks*SYMCRYPT_AES_BLOCK_SIZE ); |
774 | 0 | if ( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) |
775 | 0 | { |
776 | 0 | SymCryptAesGcmDecryptStitchedXmm( pExpandedKey, pbChainingValue, expandedKeyTable, pState, pbSrc, pbDst, cbData); |
777 | 0 | } |
778 | 0 | } |
779 | | |
780 | | #endif // CPU_X86 | CPU_AMD64 |