/src/SymCrypt/lib/aes-pattern.c
//
// aes-pattern.c
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// This file contains "pattern" code for AES-related functions. It's not intended to be compiled
// directly; rather it is included by other aes-*.c files which define the macros used here.
//

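// As a hypothetical sketch (the authoritative macro sets live in the including
// aes-*.c files), an including file might instantiate the 64-bit-counter NEON
// variant along these lines:
//
//   #define SYMCRYPT_AesCtrMsbXxNeon  SymCryptAesCtrMsb64Neon
//   #define VADDQ_UXX                 vaddq_u64
//   #define VSUBQ_UXX                 vsubq_u64
//   #include "aes-pattern.c"
//
// The XMM code below is instantiated the same way to produce
// SymCryptAesCtrMsb64Xmm and SymCryptAesCtrMsb32Xmm, with MM_ADD_EPIXX and
// MM_SUB_EPIXX mapped to element adds/subtracts of the matching width.
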
#if SYMCRYPT_CPU_ARM64

VOID
SYMCRYPT_CALL
SYMCRYPT_AesCtrMsbXxNeon(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbChainingValue,
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
    _Out_writes_( cbData )                      PBYTE                       pbDst,
                                                SIZE_T                      cbData )
{
    __n128 chain = *(__n128 *)pbChainingValue;
    const __n128 * pSrc = (const __n128 *) pbSrc;
    __n128 * pDst = (__n128 *) pbDst;

    const __n128 chainIncrement1 = SYMCRYPT_SET_N128_U64( 0, 1 );
    const __n128 chainIncrement2 = SYMCRYPT_SET_N128_U64( 0, 2 );
    const __n128 chainIncrement8 = SYMCRYPT_SET_N128_U64( 0, 8 );

    __n128 ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
    __n128 c0, c1, c2, c3, c4, c5, c6, c7;

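    // Round cbData down to a whole number of AES blocks; SYMCRYPT_AES_BLOCK_SIZE
    // is 16, a power of two, so the mask below just clears the low four bits and
    // any partial trailing block is ignored.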
    cbData &= ~(SYMCRYPT_AES_BLOCK_SIZE - 1);

    // Our chain variable is in integer format, not the MSB-first format loaded from memory.
    ctr0 = vrev64q_u8( chain );
    ctr1 = VADDQ_UXX( ctr0, chainIncrement1 );
    ctr2 = VADDQ_UXX( ctr0, chainIncrement2 );
    ctr3 = VADDQ_UXX( ctr1, chainIncrement2 );
    ctr4 = VADDQ_UXX( ctr2, chainIncrement2 );
    ctr5 = VADDQ_UXX( ctr3, chainIncrement2 );
    ctr6 = VADDQ_UXX( ctr4, chainIncrement2 );
    ctr7 = VADDQ_UXX( ctr5, chainIncrement2 );
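
    // ctr0-ctr7 now hold counter+0 ... counter+7 in byte-reversed (integer) form.
    // vrev64q_u8 reverses the bytes within each 64-bit lane, and the increment
    // constants from SYMCRYPT_SET_N128_U64 are laid out so that VADDQ_UXX (whose
    // element width is chosen by the including file) only advances the counter
    // portion of the value.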

    /*
     while cbData >= 5 * block
        generate 8 blocks of key stream
        if cbData < 8 * block
            break
        process 8 blocks
     if cbData >= 5 * block
        process 5-7 blocks
        done
     if cbData >= 2 * block
        generate 4 blocks of key stream
        process 2-4 blocks
        done
     if cbData == 1 block
        generate 1 block of key stream
        process block
    */
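    // The 5-block threshold lets the tail below reuse the keystream computed by
    // the final loop iteration: the loop always generates 8 blocks of keystream
    // and breaks once fewer than 8 blocks remain, so 5-7 leftover blocks are
    // consumed from c0-c7 without running a second, smaller AES kernel.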
    while( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
    {
        c0 = vrev64q_u8( ctr0 );
        c1 = vrev64q_u8( ctr1 );
        c2 = vrev64q_u8( ctr2 );
        c3 = vrev64q_u8( ctr3 );
        c4 = vrev64q_u8( ctr4 );
        c5 = vrev64q_u8( ctr5 );
        c6 = vrev64q_u8( ctr6 );
        c7 = vrev64q_u8( ctr7 );

        ctr0 = VADDQ_UXX( ctr0, chainIncrement8 );
        ctr1 = VADDQ_UXX( ctr1, chainIncrement8 );
        ctr2 = VADDQ_UXX( ctr2, chainIncrement8 );
        ctr3 = VADDQ_UXX( ctr3, chainIncrement8 );
        ctr4 = VADDQ_UXX( ctr4, chainIncrement8 );
        ctr5 = VADDQ_UXX( ctr5, chainIncrement8 );
        ctr6 = VADDQ_UXX( ctr6, chainIncrement8 );
        ctr7 = VADDQ_UXX( ctr7, chainIncrement8 );

        AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );

        if( cbData < 8 * SYMCRYPT_AES_BLOCK_SIZE )
        {
            break;
        }

        pDst[0] = veorq_u64( pSrc[0], c0 );
        pDst[1] = veorq_u64( pSrc[1], c1 );
        pDst[2] = veorq_u64( pSrc[2], c2 );
        pDst[3] = veorq_u64( pSrc[3], c3 );
        pDst[4] = veorq_u64( pSrc[4], c4 );
        pDst[5] = veorq_u64( pSrc[5], c5 );
        pDst[6] = veorq_u64( pSrc[6], c6 );
        pDst[7] = veorq_u64( pSrc[7], c7 );

        pDst += 8;
        pSrc += 8;
        cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
    }

    //
    // At this point we have one of the two following cases:
    // - cbData >= 5 * 16 and we have 8 blocks of key stream in c0-c7. ctr0-ctr7 is set to (c0+8)-(c7+8)
    // - cbData < 5 * 16 and we have no blocks of key stream, and ctr0-ctr7 set to the next 8 counters to use
    //

    if( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) // skip the tail handling if the request was a multiple of 8 blocks
    {
        if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
        {
            //
            // We already have the key stream
            //
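            // The final loop iteration advanced ctr0-ctr7 by 8 before breaking, so
            // subtracting chainIncrement8 from ctrN recovers initialCounter + N;
            // chain ends up as the counter value following the last block consumed.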
            pDst[0] = veorq_u64( pSrc[0], c0 );
            pDst[1] = veorq_u64( pSrc[1], c1 );
            pDst[2] = veorq_u64( pSrc[2], c2 );
            pDst[3] = veorq_u64( pSrc[3], c3 );
            pDst[4] = veorq_u64( pSrc[4], c4 );
            chain = VSUBQ_UXX( ctr5, chainIncrement8 );

            if( cbData >= 96 )  // 6 * SYMCRYPT_AES_BLOCK_SIZE
            {
                chain = VSUBQ_UXX( ctr6, chainIncrement8 );
                pDst[5] = veorq_u64( pSrc[5], c5 );
                if( cbData >= 112 )  // 7 * SYMCRYPT_AES_BLOCK_SIZE
                {
                    chain = VSUBQ_UXX( ctr7, chainIncrement8 );
                    pDst[6] = veorq_u64( pSrc[6], c6 );
                }
            }
        }
        else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
        {
            // Produce 4 blocks of key stream

            chain = ctr2; // chain is only incremented by 2 for now
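            // At least 2 blocks are guaranteed here; chain is bumped to ctr3/ctr4
            // below as a 3rd and 4th block are confirmed.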

            c0 = vrev64q_u8( ctr0 );
            c1 = vrev64q_u8( ctr1 );
            c2 = vrev64q_u8( ctr2 );
            c3 = vrev64q_u8( ctr3 );

            AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );

            pDst[0] = veorq_u64( pSrc[0], c0 );
            pDst[1] = veorq_u64( pSrc[1], c1 );
            if( cbData >= 48 )  // 3 * SYMCRYPT_AES_BLOCK_SIZE
            {
                chain = ctr3;
                pDst[2] = veorq_u64( pSrc[2], c2 );
                if( cbData >= 64 )  // 4 * SYMCRYPT_AES_BLOCK_SIZE
                {
                    chain = ctr4;
                    pDst[3] = veorq_u64( pSrc[3], c3 );
                }
            }
        }
        else
        {
            // Exactly 1 block to process
            chain = ctr1;

            c0 = vrev64q_u8( ctr0 );

            AES_ENCRYPT_1( pExpandedKey, c0 );
            pDst[0] = veorq_u64( pSrc[0], c0 );
        }
    }
    else
    {
        chain = ctr0;
    }

    chain = vrev64q_u8( chain );
    *(__n128 *)pbChainingValue = chain;
}

#endif // SYMCRYPT_CPU_ARM64

#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64

VOID
SYMCRYPT_CALL
SYMCRYPT_AesCtrMsbXxXmm(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbChainingValue,
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
    _Out_writes_( cbData )                      PBYTE                       pbDst,
                                                SIZE_T                      cbData )
{
    __m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );

    __m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
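
    // _mm_set_epi8 lists its arguments from the most significant byte down to the
    // least significant, so this mask maps result byte i to source byte 15-i:
    // shuffling with it reverses all 16 bytes, converting between the MSB-first
    // chaining value in memory and the integer form used for counter arithmetic.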

    __m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 );
    __m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 );
    __m128i chainIncrement3 = _mm_set_epi32( 0, 0, 0, 3 );
    //__m128i chainIncrement8 = _mm_set_epi32( 0, 0, 0, 8 );

    __m128i c0, c1, c2, c3, c4, c5, c6, c7;

    cbData &= ~(SYMCRYPT_AES_BLOCK_SIZE - 1);

    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );

    /*
     while cbData >= 5 * block
        generate 8 blocks of key stream
        if cbData < 8 * block
            break
        process 8 blocks
     if cbData >= 5 * block
        process 5-7 blocks
        done
     if cbData >= 2 * block
        generate 4 blocks of key stream
        process 2-4 blocks
        done
     if cbData == 1 block
        generate 1 block of key stream
        process block
    */
    while( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
    {
        c0 = chain;
        c1 = MM_ADD_EPIXX( chain, chainIncrement1 );
        c2 = MM_ADD_EPIXX( chain, chainIncrement2 );
        c3 = MM_ADD_EPIXX( c1, chainIncrement2 );
        c4 = MM_ADD_EPIXX( c2, chainIncrement2 );
        c5 = MM_ADD_EPIXX( c3, chainIncrement2 );
        c6 = MM_ADD_EPIXX( c4, chainIncrement2 );
        c7 = MM_ADD_EPIXX( c5, chainIncrement2 );
        chain = MM_ADD_EPIXX( c6, chainIncrement2 );

        c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
        c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
        c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
        c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
        c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
        c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
        c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
        c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );

        AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );

        if( cbData < 8 * SYMCRYPT_AES_BLOCK_SIZE )
        {
            break;
        }

        _mm_storeu_si128( (__m128i *) (pbDst +   0), _mm_xor_si128( c0, _mm_loadu_si128( (__m128i *) (pbSrc +   0) ) ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  16), _mm_xor_si128( c1, _mm_loadu_si128( (__m128i *) (pbSrc +  16) ) ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  32), _mm_xor_si128( c2, _mm_loadu_si128( (__m128i *) (pbSrc +  32) ) ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  48), _mm_xor_si128( c3, _mm_loadu_si128( (__m128i *) (pbSrc +  48) ) ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  64), _mm_xor_si128( c4, _mm_loadu_si128( (__m128i *) (pbSrc +  64) ) ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  80), _mm_xor_si128( c5, _mm_loadu_si128( (__m128i *) (pbSrc +  80) ) ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  96), _mm_xor_si128( c6, _mm_loadu_si128( (__m128i *) (pbSrc +  96) ) ) );
        _mm_storeu_si128( (__m128i *) (pbDst + 112), _mm_xor_si128( c7, _mm_loadu_si128( (__m128i *) (pbSrc + 112) ) ) );
        pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
    }

    //
    // At this point we have one of the two following cases:
    // - cbData >= 5 * 16 and we have 8 blocks of key stream in c0-c7. chain is set to c7 + 1
    // - cbData < 5 * 16 and we have no blocks of key stream, with chain the next value to use
    //

    if( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) // skip the tail handling if the request was a multiple of 8 blocks
    {
        if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
        {
            //
            // We already have the key stream
            //
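            // After the final loop iteration, chain is initialCounter + 8; subtracting
            // chainIncrement3 yields initialCounter + 5, and one further increment is
            // applied below for each additional block that is consumed.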
            _mm_storeu_si128( (__m128i *) (pbDst +  0), _mm_xor_si128( c0, _mm_loadu_si128( (__m128i *) (pbSrc +  0) ) ) );
            _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( (__m128i *) (pbSrc + 16) ) ) );
            _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( (__m128i *) (pbSrc + 32) ) ) );
            _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( (__m128i *) (pbSrc + 48) ) ) );
            _mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( (__m128i *) (pbSrc + 64) ) ) );
            chain = MM_SUB_EPIXX( chain, chainIncrement3 );

            if( cbData >= 96 )  // 6 * SYMCRYPT_AES_BLOCK_SIZE
            {
                chain = MM_ADD_EPIXX( chain, chainIncrement1 );
                _mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( (__m128i *) (pbSrc + 80) ) ) );
                if( cbData >= 112 )  // 7 * SYMCRYPT_AES_BLOCK_SIZE
                {
                    chain = MM_ADD_EPIXX( chain, chainIncrement1 );
                    _mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( (__m128i *) (pbSrc + 96) ) ) );
                }
            }
        }
        else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
        {
            // Produce 4 blocks of key stream

            c0 = chain;
            c1 = MM_ADD_EPIXX( chain, chainIncrement1 );
            c2 = MM_ADD_EPIXX( chain, chainIncrement2 );
            c3 = MM_ADD_EPIXX( c1, chainIncrement2 );
            chain = c2; // chain is only incremented by 2 for now

            c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
            c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
            c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
            c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );

            AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );

            _mm_storeu_si128( (__m128i *) (pbDst +  0), _mm_xor_si128( c0, _mm_loadu_si128( (__m128i *) (pbSrc +  0) ) ) );
            _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( (__m128i *) (pbSrc + 16) ) ) );
            if( cbData >= 48 )  // 3 * SYMCRYPT_AES_BLOCK_SIZE
            {
                chain = MM_ADD_EPIXX( chain, chainIncrement1 );
                _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( (__m128i *) (pbSrc + 32) ) ) );
                if( cbData >= 64 )  // 4 * SYMCRYPT_AES_BLOCK_SIZE
                {
                    chain = MM_ADD_EPIXX( chain, chainIncrement1 );
                    _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( (__m128i *) (pbSrc + 48) ) ) );
                }
            }
        }
        else
        {
            // Exactly 1 block to process
            c0 = chain;
            chain = MM_ADD_EPIXX( chain, chainIncrement1 );

            c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );

            AES_ENCRYPT_1( pExpandedKey, c0 );
            _mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( (__m128i *) (pbSrc + 0) ) ) );
        }
    }

    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
    _mm_storeu_si128( (__m128i *) pbChainingValue, chain );
}

#endif // SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64