/src/SymCrypt/lib/aes-key.c
Line | Count | Source (jump to first uncovered line) |
1 | | // |
2 | | // aes.c code for AES implementation |
3 | | // |
4 | | // Copyright (c) Microsoft Corporation. Licensed under the MIT license. |
5 | | // |
6 | | // The actual encryption and decryption routines here are not nearly as fast as the |
7 | | // assembler ones. They are used on platforms that don't have assembler implementations |
8 | | // and for various testing purposes. |
9 | | // |
10 | | // This code derives from the original fast AES code that Niels Ferguson wrote |
11 | | // for BitLocker in Windows Vista. |
12 | | // The C code is derived from the AES that was already in the RSA32 library, |
13 | | // the assembler code was created new at that time. |
14 | | // |
15 | | |
16 | | |
17 | | #include "precomp.h" |
18 | | |
19 | | |
20 | | /////////////////////////////////////////////////////////////////////////////// |
21 | | // Key expansion uses two functions, a 4-byte S-box lookup and one |
22 | | // to create a decryption round key from an encryption round key. |
23 | | // These are the C implementations of these functions |
24 | | // |
25 | | |
26 | | |
27 | | static BYTE g_SymCryptAesRoundConstant[11] = |
28 | | { |
29 | | 0, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, |
30 | | }; |
31 | | |
32 | | SYMCRYPT_NOINLINE |
33 | | SYMCRYPT_ERROR |
34 | | SYMCRYPT_CALL |
35 | | SymCryptAesExpandKeyInternal( |
36 | | _Out_ PSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
37 | | _In_reads_(cbKey) PCBYTE pbKey, |
38 | | SIZE_T cbKey, |
39 | | BOOLEAN fCreateDecryptionKeys ) |
40 | 0 | { |
41 | 0 | UINT32 nRounds; |
42 | 0 | BYTE * p; |
43 | 0 | BYTE * q; |
44 | 0 | UINT32 i; |
45 | 0 | UINT32 t; |
46 | |
|
47 | 0 | BOOL UseSimd = FALSE; |
48 | 0 | SYMCRYPT_ERROR status = SYMCRYPT_NO_ERROR; |
49 | |
|
50 | | #if SYMCRYPT_CPU_X86 |
51 | | SYMCRYPT_EXTENDED_SAVE_DATA SaveData; |
52 | | |
53 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_AESNI_CODE ) ) |
54 | | { |
55 | | if( SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR ) |
56 | | { |
57 | | UseSimd = TRUE; |
58 | | } |
59 | | } |
60 | | #elif SYMCRYPT_CPU_AMD64 |
61 | 0 | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_AESNI_CODE ) ) |
62 | 0 | { |
63 | 0 | UseSimd = TRUE; |
64 | 0 | } |
65 | | #elif SYMCRYPT_CPU_ARM64 |
66 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_AES ) ) |
67 | | { |
68 | | UseSimd = TRUE; |
69 | | } |
70 | | #endif |
71 | |
|
72 | 0 | SYMCRYPT_SET_MAGIC( pExpandedKey ); |
73 | | |
74 | | // |
75 | | // Separate code for each key size, this is significantly faster. |
76 | | // We have a number of applications that do frequent key expansions. |
77 | | // |
78 | 0 | switch( cbKey ) |
79 | 0 | { |
80 | 0 | case 16: |
81 | 0 | nRounds = 10; |
82 | 0 | pExpandedKey->lastEncRoundKey = &pExpandedKey->RoundKey[nRounds]; |
83 | 0 | pExpandedKey->lastDecRoundKey = &pExpandedKey->RoundKey[2*nRounds]; |
84 | |
|
85 | 0 | memcpy( &pExpandedKey->RoundKey[0], pbKey, 16 ); |
86 | |
|
87 | 0 | p = (BYTE *)&pExpandedKey->RoundKey[1]; |
88 | |
|
89 | 0 | for( i=1; i<=nRounds; i++ ) |
90 | 0 | { |
91 | 0 | SymCryptAes4Sbox( &p[-4], p, UseSimd ); |
92 | 0 | t = ROR32(SYMCRYPT_LOAD_LSBFIRST32(p), 8) ^ SYMCRYPT_LOAD_LSBFIRST32(p - 16) ^ g_SymCryptAesRoundConstant[i]; |
93 | 0 | SYMCRYPT_STORE_LSBFIRST32( p, t ); // this is a macro that re-evaluates its arguments |
94 | |
|
95 | 0 | *(UINT32 *)(p+4) = *(UINT32 *) p ^ *(UINT32 *)(p - 12); |
96 | 0 | *(UINT32 *)(p+8) = *(UINT32 *)(p+4) ^ *(UINT32 *)(p - 8); |
97 | 0 | *(UINT32 *)(p+12) = *(UINT32 *)(p+8) ^ *(UINT32 *)(p - 4); |
98 | |
|
99 | 0 | p += 16; |
100 | 0 | } |
101 | |
|
102 | 0 | break; |
103 | | |
104 | 0 | case 24: |
105 | 0 | nRounds = 12; |
106 | 0 | pExpandedKey->lastEncRoundKey = &pExpandedKey->RoundKey[nRounds]; |
107 | 0 | pExpandedKey->lastDecRoundKey = &pExpandedKey->RoundKey[2*nRounds]; |
108 | |
|
109 | 0 | memcpy( &pExpandedKey->RoundKey[0], pbKey, 24 ); |
110 | |
|
111 | 0 | p = (BYTE *)&pExpandedKey->RoundKey[0] + 24; |
112 | | |
113 | | // |
114 | | // We have 12 rounds, 13 round keys, and 13*16 = 208 bytes of encrytion key to generate. |
115 | | // We have 24 already, so we need 184 more. |
116 | | // Each iteration produces 24 bytes, so we need to loop 8 times. |
117 | | // |
118 | 0 | for( i=1; i<=8; i++ ) |
119 | 0 | { |
120 | 0 | SymCryptAes4Sbox( &p[-4], p, UseSimd ); |
121 | 0 | t = ROR32(SYMCRYPT_LOAD_LSBFIRST32(p), 8) ^ SYMCRYPT_LOAD_LSBFIRST32(p - 24) ^ g_SymCryptAesRoundConstant[i]; |
122 | 0 | SYMCRYPT_STORE_LSBFIRST32( p, t ); |
123 | |
|
124 | 0 | *(UINT32 *)(p+4) = *(UINT32 *) p ^ *(UINT32 *)(p - 20); |
125 | 0 | *(UINT32 *)(p+8) = *(UINT32 *)(p+ 4) ^ *(UINT32 *)(p - 16); |
126 | 0 | *(UINT32 *)(p+12) = *(UINT32 *)(p+ 8) ^ *(UINT32 *)(p - 12); |
127 | 0 | *(UINT32 *)(p+16) = *(UINT32 *)(p+12) ^ *(UINT32 *)(p - 8); |
128 | 0 | *(UINT32 *)(p+20) = *(UINT32 *)(p+16) ^ *(UINT32 *)(p - 4); |
129 | |
|
130 | 0 | p += 24; |
131 | 0 | } |
132 | |
|
133 | 0 | break; |
134 | | |
135 | 0 | case 32: |
136 | 0 | nRounds = 14; |
137 | 0 | pExpandedKey->lastEncRoundKey = &pExpandedKey->RoundKey[nRounds]; |
138 | 0 | pExpandedKey->lastDecRoundKey = &pExpandedKey->RoundKey[2*nRounds]; |
139 | |
|
140 | 0 | memcpy( &pExpandedKey->RoundKey[0], pbKey, 32 ); |
141 | |
|
142 | 0 | p = (BYTE *)&pExpandedKey->RoundKey[0] + 32; |
143 | | |
144 | | // |
145 | | // We have 14 rounds, 15 round keys, and 15*16 = 240 bytes of encrytion key to generate. |
146 | | // We have 32 already, so we need 208 more. |
147 | | // Each iteration produces 32 bytes, so we need to loop 6.5 times. |
148 | | // |
149 | 0 | for( i=1; i<=6; i++ ) |
150 | 0 | { |
151 | 0 | SymCryptAes4Sbox( &p[-4], p, UseSimd ); |
152 | 0 | t = ROR32(SYMCRYPT_LOAD_LSBFIRST32(p), 8) ^ SYMCRYPT_LOAD_LSBFIRST32(p - 32) ^ g_SymCryptAesRoundConstant[i]; |
153 | 0 | SYMCRYPT_STORE_LSBFIRST32( p, t ); |
154 | |
|
155 | 0 | *(UINT32 *)(p+4) = *(UINT32 *) p ^ *(UINT32 *)(p - 28); |
156 | 0 | *(UINT32 *)(p+8) = *(UINT32 *)(p + 4) ^ *(UINT32 *)(p - 24); |
157 | 0 | *(UINT32 *)(p+12) = *(UINT32 *)(p + 8) ^ *(UINT32 *)(p - 20); |
158 | |
|
159 | 0 | SymCryptAes4Sbox( &p[12], &p[16], UseSimd ); |
160 | 0 | *(UINT32 *)(p+16) = *(UINT32 *)(p + 16) ^ *(UINT32 *)(p - 16); |
161 | |
|
162 | 0 | *(UINT32 *)(p+20) = *(UINT32 *)(p + 16) ^ *(UINT32 *)(p - 12); |
163 | 0 | *(UINT32 *)(p+24) = *(UINT32 *)(p + 20) ^ *(UINT32 *)(p - 8); |
164 | 0 | *(UINT32 *)(p+28) = *(UINT32 *)(p + 24) ^ *(UINT32 *)(p - 4); |
165 | |
|
166 | 0 | p += 32; |
167 | 0 | } |
168 | | |
169 | | // We looped 6 times, so here is the half-loop |
170 | |
|
171 | 0 | SymCryptAes4Sbox( &p[-4], p, UseSimd ); |
172 | 0 | t = ROR32(SYMCRYPT_LOAD_LSBFIRST32(p), 8) ^ SYMCRYPT_LOAD_LSBFIRST32(p - 32) ^ g_SymCryptAesRoundConstant[i]; |
173 | 0 | SYMCRYPT_STORE_LSBFIRST32( p, t ); |
174 | |
|
175 | 0 | *(UINT32 *)(p+4) = *(UINT32 *) p ^ *(UINT32 *)(p - 28); |
176 | 0 | *(UINT32 *)(p+8) = *(UINT32 *)(p + 4) ^ *(UINT32 *)(p - 24); |
177 | 0 | *(UINT32 *)(p+12) = *(UINT32 *)(p + 8) ^ *(UINT32 *)(p - 20); |
178 | |
|
179 | 0 | break; |
180 | | |
181 | 0 | default: |
182 | 0 | status = SYMCRYPT_WRONG_KEY_SIZE; |
183 | 0 | goto cleanup; |
184 | 0 | } |
185 | | |
186 | | |
187 | 0 | if( fCreateDecryptionKeys ) |
188 | 0 | { |
189 | 0 | p = &pExpandedKey->RoundKey[0][0][0]; |
190 | 0 | q = (PBYTE)(pExpandedKey->lastDecRoundKey); |
191 | | |
192 | | // The first encryption round key is the last decryption round key |
193 | 0 | memcpy( q, p, SYMCRYPT_AES_BLOCK_SIZE ); |
194 | 0 | p += 16; |
195 | 0 | q -= 16; |
196 | |
|
197 | 0 | while( p < (PBYTE) pExpandedKey->lastEncRoundKey ) |
198 | 0 | { |
199 | 0 | SymCryptAesCreateDecryptionRoundKey( p, q, UseSimd ); |
200 | 0 | q -= 16; |
201 | 0 | p += 16; |
202 | 0 | } |
203 | 0 | } |
204 | |
|
205 | 0 | cleanup: |
206 | |
|
207 | | #if SYMCRYPT_CPU_X86 |
208 | | if( UseSimd ) |
209 | | { |
210 | | SymCryptRestoreXmm( &SaveData ); |
211 | | } |
212 | | #endif |
213 | |
|
214 | 0 | return status; |
215 | 0 | } |
216 | | |
217 | | SYMCRYPT_ERROR |
218 | | SYMCRYPT_CALL |
219 | | SymCryptAesExpandKey( |
220 | | _Out_ PSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
221 | | _In_reads_(cbKey) PCBYTE pbKey, |
222 | | SIZE_T cbKey ) |
223 | | |
224 | 0 | { |
225 | 0 | return SymCryptAesExpandKeyInternal( pExpandedKey, pbKey, cbKey, TRUE ); |
226 | 0 | } |
227 | | |
228 | | SYMCRYPT_ERROR |
229 | | SYMCRYPT_CALL |
230 | | SymCryptAesExpandKeyEncryptOnly( |
231 | | _Out_ PSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, |
232 | | _In_reads_(cbKey) PCBYTE pbKey, |
233 | | SIZE_T cbKey ) |
234 | 0 | { |
235 | 0 | return SymCryptAesExpandKeyInternal( pExpandedKey, pbKey, cbKey, FALSE ); |
236 | 0 | } |
237 | | |
238 | | VOID |
239 | | SYMCRYPT_CALL |
240 | | SymCryptAesKeyCopy( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pSrc, |
241 | | _Out_ PSYMCRYPT_AES_EXPANDED_KEY pDst ) |
242 | 0 | { |
243 | 0 | SYMCRYPT_CHECK_MAGIC( pSrc ); |
244 | |
|
245 | 0 | *pDst = *pSrc; |
246 | 0 | pDst->lastEncRoundKey = &pDst->RoundKey[0] + (pSrc->lastEncRoundKey - &pSrc->RoundKey[0]); |
247 | 0 | pDst->lastDecRoundKey = &pDst->RoundKey[0] + (pSrc->lastDecRoundKey - &pSrc->RoundKey[0]); |
248 | |
|
249 | 0 | SYMCRYPT_SET_MAGIC( pDst ); |
250 | 0 | } |
251 | | |
252 | | // |
253 | | // Self test code |
254 | | // |
255 | | |
256 | | |
257 | | const BYTE SymCryptAesNistTestVector128Ciphertext[16] = { |
258 | | 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, |
259 | | 0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a, |
260 | | }; |
261 | | |
262 | | |
263 | | |
264 | | /**************************************************************** |
265 | | * OLD CODE |
266 | | * |
267 | | * Old code to generate the AES tables dynamically. |
268 | | * Kept for future reference. |
269 | | * |
270 | | |
271 | | |
272 | | // |
273 | | // Prototype; on some platforms this function is in assembler. |
274 | | // |
275 | | VOID |
276 | | SYMCRYPT_CALL |
277 | | SymCryptAesCreateRotatedTables( BYTE MatrixMult[4][256][4] ); |
278 | | |
279 | | VOID |
280 | | SYMCRYPT_CALL |
281 | | SymCryptAesCreateRotatedTables( _Inout_ BYTE MatrixMult[4][256][4] ) |
282 | | { |
283 | | int i,j,k; |
284 | | |
285 | | // |
286 | | // We do this byte-by-byte, which is easiest. |
287 | | // It would be faster to use UINT32 operations, |
288 | | // but that is endian-specific, and therefore platform-specific. |
289 | | // Endian-agnostic UINT32-based code would be a lot more complicated. |
290 | | // All this is extremely easy to do in assembler, which we do on those |
291 | | // platforms that have assembler implementations. |
292 | | // |
293 | | for( j=1; j<4; j++ ) { |
294 | | for( i=0; i<256; i++ ) { |
295 | | for( k=0; k<4; k++ ) { |
296 | | MatrixMult[j][i][k] = MatrixMult[0][i][(k-j)&3]; |
297 | | } |
298 | | } |
299 | | } |
300 | | } |
301 | | |
302 | | |
303 | | |
304 | | // |
305 | | // SymCryptAesInitMatrixMultiplyTable |
306 | | // |
307 | | // Initialize a matrix multiplication table. |
308 | | // Each matrix multiplication table consists of 4 tables of 256 entries of 4 bytes each. |
309 | | // The four tables are rotated copies of each other. |
310 | | // This function generates the first of those four tables from the init |
311 | | // value. |
312 | | // |
313 | | // After this call: |
314 | | // At index i the table contains the four bytes |
315 | | // i * init[0], i * init[1], i * init[2], i * init[3] |
316 | | // where multiplication is in GF(2^8). |
317 | | // |
318 | | // We do not do a GF(2^8) multiplication for each entry, but rather use the |
319 | | // relationship (a xor b) * init[.] = a * init[.] xor b * init[.] |
320 | | // And only compute i*init[.] for i = 1,2,4,8,...,128. This can be done |
321 | | // using repeated multiplication by x in the finite field. |
322 | | // |
323 | | // It is safe to call this function on two separate threads for the same table. |
324 | | // All invocations will write the same data to the table, and within a thread each entry is written |
325 | | // before it is read. Doing parallel initializations of the same table can be very inefficient |
326 | | // as multiple cores will be fighting over the cache lines, but the result will be correct. |
327 | | // We use this property to initialize the tables lazily. |
328 | | // |
329 | | static |
330 | | VOID |
331 | | SYMCRYPT_CALL |
332 | | SymCryptAesInitMatrixMultiplyTable( _Out_ SYMCRYPT_ALIGN BYTE MatrixMult[256][4], |
333 | | _In_ SYMCRYPT_ALIGN BYTE init[4] |
334 | | ) |
335 | | { |
336 | | int i,j; |
337 | | SYMCRYPT_ALIGN BYTE initCopy[4]; |
338 | | UINT32 initCopyAsUint32; |
339 | | |
340 | | // |
341 | | // We copy the init value so that we can modify it without worrying about multi-threading |
342 | | // issues. |
343 | | // |
344 | | *(UINT32 *)initCopy = *(UINT32 *)init; |
345 | | |
346 | | *(UINT32 *)MatrixMult[0] = 0; |
347 | | for( i=1; i<256; i<<=1 ) |
348 | | { |
349 | | initCopyAsUint32 = *(UINT32 *)initCopy; |
350 | | for( j=0; j<i; j++ ) |
351 | | { |
352 | | *(UINT32 *)MatrixMult[i+j] = *(UINT32 *)MatrixMult[j] ^ initCopyAsUint32; |
353 | | } |
354 | | for( j=0; j<4; j++ ) |
355 | | { |
356 | | initCopy[j] = MULT_BY_X( initCopy[j] ); |
357 | | } |
358 | | } |
359 | | } |
360 | | |
361 | | |
362 | | // |
363 | | // SymCryptAesInitialize |
364 | | // |
365 | | // Initialize the static tables for the AES implementation. |
366 | | // This function is called by the key expansion function if it finds the |
367 | | // tables not initialized. |
368 | | // |
369 | | // This leads to an interesting case where multiple threads running on multiple |
370 | | // CPUs run this initialization code at the same time. |
371 | | // This code is carefully structured to allow that. When global data is written it is |
372 | | // always with the final value, and we never read uninitialized global data. |
373 | | // Thus, even if two CPUs run this code at the same time, they will both initialize each |
374 | | // memory location to the same correct value and the end result will be correct. |
375 | | // (Performance will suffer due to the fact that cache lines will be bounced back and forth |
376 | | // between the two CPUs, but that is not a significant concern as this code is used only once.) |
377 | | // |
378 | | // At the end of the initialization the flag is set to indicate that further |
379 | | // key expansion invocations do not need to re-run the initialization. |
380 | | // We use memory barriers to keep this multi-thread safe. |
381 | | // |
382 | | static |
383 | | VOID |
384 | | SYMCRYPT_CALL |
385 | | SymCryptAesInitialize(void) |
386 | | { |
387 | | int i,j; |
388 | | BYTE S; |
389 | | BYTE Stimes2; |
390 | | |
391 | | // |
392 | | // We force alignment of these arrays as we sometimes treat them as a UINT32 |
393 | | // |
394 | | SYMCRYPT_ALIGN BYTE InvMatrixEntry[4] = {0xe, 0x9, 0xd, 0xb}; |
395 | | SYMCRYPT_ALIGN BYTE MatrixEntry[4] = {2, 1, 1, 3}; |
396 | | SYMCRYPT_ALIGN BYTE MatrixScratch[256][4]; |
397 | | |
398 | | // Generate the forward MDS multiplication table in the scratch space |
399 | | SymCryptAesInitMatrixMultiplyTable( MatrixScratch, MatrixEntry ); |
400 | | |
401 | | // Initialize first table of SymCryptAesInvMatrixMult |
402 | | SymCryptAesInitMatrixMultiplyTable( SymCryptAesInvMatrixMult[0], InvMatrixEntry ); |
403 | | |
404 | | // |
405 | | // Build the InvSbox table and the first table of SymCryptAesSboxMatrixMult and |
406 | | // SymCryptAesInvSboxMatrixMult |
407 | | // |
408 | | for( i=0; i<256; i++ ) { |
409 | | S = SymCryptAesSbox[i]; |
410 | | SymCryptAesInvSbox[S] = (BYTE) i; |
411 | | *(UINT32 *)SymCryptAesSboxMatrixMult[0][i] = *(UINT32 *)MatrixScratch[S]; |
412 | | *(UINT32 *)SymCryptAesInvSboxMatrixMult[0][S] = *(UINT32 *)SymCryptAesInvMatrixMult[0][i]; |
413 | | } |
414 | | |
415 | | // |
416 | | // Now we generate the byte rotations of the tables |
417 | | // |
418 | | SymCryptAesCreateRotatedTables( SymCryptAesSboxMatrixMult ); |
419 | | SymCryptAesCreateRotatedTables( SymCryptAesInvSboxMatrixMult ); |
420 | | SymCryptAesCreateRotatedTables( SymCryptAesInvMatrixMult ); |
421 | | |
422 | | // |
423 | | // This is a memory barrier. It ensures that all the memory writes we do before the barrier |
424 | | // are globally visible to other CPUs before the memory writes we do after the fence. |
425 | | // In this particular case, it ensures that every CPU sees the completed tables before |
426 | | // it sees the flag as set. |
427 | | // |
428 | | MemoryBarrier(); |
429 | | |
430 | | // |
431 | | // Set the flag to signal that the tables are initialized. |
432 | | // |
433 | | SymCryptAesTablesInitialized = TRUE; |
434 | | } |
435 | | |
436 | | |
437 | | */ |