/src/SymCrypt/lib/sha256.c
Line | Count | Source
1 | | // |
2 | | // Sha256.c |
3 | | // |
4 | | // Copyright (c) Microsoft Corporation. Licensed under the MIT license. |
5 | | // |
6 | | |
7 | | // |
8 | | // This module contains the routines to implement SHA2-256 from FIPS 180-2 |
9 | | // |
10 | | // This revised implementation is based on the older one in RSA32LIB by Scott Field from 2001 |
11 | | // |
12 | | |
13 | | #include "precomp.h" |
14 | | |
15 | | // |
16 | | // See the symcrypt.h file for documentation on what the various functions do. |
17 | | // |
18 | | |
19 | | const SYMCRYPT_HASH SymCryptSha256Algorithm_default = { |
20 | | &SymCryptSha256Init, |
21 | | &SymCryptSha256Append, |
22 | | &SymCryptSha256Result, |
23 | | &SymCryptSha256AppendBlocks, |
24 | | &SymCryptSha256StateCopy, |
25 | | sizeof( SYMCRYPT_SHA256_STATE ), |
26 | | SYMCRYPT_SHA256_RESULT_SIZE, |
27 | | SYMCRYPT_SHA256_INPUT_BLOCK_SIZE, |
28 | | SYMCRYPT_FIELD_OFFSET( SYMCRYPT_SHA256_STATE, chain ), |
29 | | SYMCRYPT_FIELD_SIZE( SYMCRYPT_SHA256_STATE, chain ), |
30 | | }; |
31 | | |
32 | | const PCSYMCRYPT_HASH SymCryptSha256Algorithm = &SymCryptSha256Algorithm_default; |
33 | | |
34 | | // |
35 | | // SHA-256 uses 64 magic constants of 32 bits each. These are |
36 | | // referred to as K^{256}_i for i=0...63 by FIPS 180-2. |
37 | | // This array is also used by the parallel SHA256 implementation |
38 | | // For performance we align to 256 bytes, which gives optimal cache alignment. |
39 | | // |
40 | | SYMCRYPT_ALIGN_AT( 256 ) const UINT32 SymCryptSha256K[64] = { |
41 | | 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, |
42 | | 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, |
43 | | 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, |
44 | | 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, |
45 | | 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, |
46 | | 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, |
47 | | 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, |
48 | | 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, |
49 | | 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, |
50 | | 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, |
51 | | 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, |
52 | | 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, |
53 | | 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, |
54 | | 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, |
55 | | 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, |
56 | | 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL |
57 | | }; |
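//
// (These constants are the first 32 bits of the fractional parts of the cube
// roots of the first 64 primes, per FIPS 180-2 Sec. 4.2.2. Illustrative
// spot-check of the first entry, assuming <math.h> is available:
//      (UINT32)( ( cbrt( 2.0 ) - 1.0 ) * 4294967296.0 ) == 0x428a2f98
// )
//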
58 | | |
59 | | |
60 | | // |
61 | | // Initial state |
62 | | // |
63 | | static const UINT32 sha256InitialState[8] = { |
64 | | 0x6a09e667UL, |
65 | | 0xbb67ae85UL, |
66 | | 0x3c6ef372UL, |
67 | | 0xa54ff53aUL, |
68 | | 0x510e527fUL, |
69 | | 0x9b05688cUL, |
70 | | 0x1f83d9abUL, |
71 | | 0x5be0cd19UL, |
72 | | }; |
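//
// (These are the first 32 bits of the fractional parts of the square roots
// of the first eight primes 2..19, per FIPS 180-2 Sec. 5.3.2. Illustrative
// spot-check, assuming <math.h> is available:
//      (UINT32)( ( sqrt( 2.0 ) - 1.0 ) * 4294967296.0 ) == 0x6a09e667
// )
//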
73 | | |
74 | | // |
75 | | // SymCryptSha256 |
76 | | // |
77 | | #define ALG SHA256 |
78 | | #define Alg Sha256 |
79 | | #include "hash_pattern.c" |
80 | | #undef ALG |
81 | | #undef Alg |
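//
// hash_pattern.c instantiates the boilerplate shared by all hashes; the
// one-shot SymCryptSha256() called by the self-test below comes from that
// pattern. A minimal usage sketch of the streaming API defined in this file
// (pbMessage/cbMessage are hypothetical caller-supplied data):
//
//      SYMCRYPT_SHA256_STATE state;
//      BYTE digest[SYMCRYPT_SHA256_RESULT_SIZE];
//
//      SymCryptSha256Init( &state );
//      SymCryptSha256Append( &state, pbMessage, cbMessage );
//      SymCryptSha256Result( &state, digest );    // wipes & re-initializes state
//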
82 | | |
83 | | |
84 | | |
85 | | // |
86 | | // SymCryptSha256Init |
87 | | // |
88 | | SYMCRYPT_NOINLINE |
89 | | VOID |
90 | | SYMCRYPT_CALL |
91 | | SymCryptSha256Init( _Out_ PSYMCRYPT_SHA256_STATE pState ) |
92 | 387 | { |
93 | 387 | SYMCRYPT_SET_MAGIC( pState ); |
94 | | |
95 | 387 | pState->dataLengthL = 0; |
96 | | //pState->dataLengthH = 0; // not used |
97 | 387 | pState->bytesInBuffer = 0; |
98 | | |
99 | 387 | memcpy( &pState->chain.H[0], &sha256InitialState[0], sizeof( sha256InitialState ) ); |
100 | | |
101 | | // |
102 | | // There is no need to initialize the buffer part of the state as that will be |
103 | | // filled before it is used. |
104 | | // |
105 | 387 | } |
106 | | |
107 | | |
108 | | // |
109 | | // SymCryptSha256Append |
110 | | // |
111 | | SYMCRYPT_NOINLINE |
112 | | VOID |
113 | | SYMCRYPT_CALL |
114 | | SymCryptSha256Append( |
115 | | _Inout_ PSYMCRYPT_SHA256_STATE pState, |
116 | | _In_reads_( cbData ) PCBYTE pbData, |
117 | | SIZE_T cbData ) |
118 | 48.0k | { |
119 | 48.0k | UINT32 bytesInBuffer; |
120 | 48.0k | UINT32 freeInBuffer; |
121 | 48.0k | SIZE_T tmp; |
122 | | |
123 | 48.0k | SYMCRYPT_CHECK_MAGIC( pState ); |
124 | | |
125 | 48.0k | pState->dataLengthL += cbData; // dataLengthH is not used... |
126 | | |
127 | 48.0k | bytesInBuffer = pState->bytesInBuffer; |
128 | | |
129 | | // |
130 | | // If previous data in buffer, buffer new input and transform if possible. |
131 | | // |
132 | 48.0k | if( bytesInBuffer > 0 ) |
133 | 41.9k | { |
134 | 41.9k | SYMCRYPT_ASSERT( SYMCRYPT_SHA256_INPUT_BLOCK_SIZE > bytesInBuffer ); |
135 | | |
136 | 41.9k | freeInBuffer = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer; |
137 | 41.9k | if( cbData < freeInBuffer ) |
138 | 37.9k | { |
139 | | // |
140 | | // All the data will fit in the buffer. |
141 | | // We don't do anything here. |
142 | | // As cbData < inputBlockSize the bulk data processing is skipped, |
143 | | // and the data will be copied to the buffer at the end |
144 | | // of this code. |
145 | 37.9k | } else { |
146 | | // |
147 | | // Enough data to fill the whole buffer & process it |
148 | | // |
149 | 3.99k | memcpy(&pState->buffer[bytesInBuffer], pbData, freeInBuffer); |
150 | 3.99k | pbData += freeInBuffer; |
151 | 3.99k | cbData -= freeInBuffer; |
152 | 3.99k | SymCryptSha256AppendBlocks( &pState->chain, &pState->buffer[0], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE, &tmp ); |
153 | | |
154 | 3.99k | bytesInBuffer = 0; |
155 | 3.99k | } |
156 | 41.9k | } |
157 | | |
158 | | // |
159 | | // Internal buffer is empty; process all remaining whole blocks in the input |
160 | | // |
161 | 48.0k | if( cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ) |
162 | 4.73k | { |
163 | 4.73k | SymCryptSha256AppendBlocks( &pState->chain, pbData, cbData, &tmp ); |
164 | 4.73k | SYMCRYPT_ASSERT( tmp < SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); |
165 | 4.73k | pbData += cbData - tmp; |
166 | 4.73k | cbData = tmp; |
167 | 4.73k | } |
168 | | |
169 | 48.0k | SYMCRYPT_ASSERT( cbData < SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); |
170 | | |
171 | | // |
172 | | // Buffer remaining input if necessary. |
173 | | // |
174 | 48.0k | if( cbData > 0 ) |
175 | 15.9k | { |
176 | 15.9k | memcpy( &pState->buffer[bytesInBuffer], pbData, cbData ); |
177 | 15.9k | bytesInBuffer += (UINT32) cbData; |
178 | 15.9k | } |
179 | | |
180 | 48.0k | pState->bytesInBuffer = bytesInBuffer; |
181 | 48.0k | } |
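//
// Worked example of the buffering logic above (illustrative): appending 100
// bytes to a state with bytesInBuffer == 20 first copies 44 bytes to complete
// the 64-byte block and processes it; the remaining 56 bytes (56 < 64) skip
// the bulk path and are buffered, leaving bytesInBuffer == 56.
//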
182 | | |
183 | | |
184 | | // |
185 | | // SymCryptSha256Result |
186 | | // |
187 | | SYMCRYPT_NOINLINE |
188 | | VOID |
189 | | SYMCRYPT_CALL |
190 | | SymCryptSha256Result( |
191 | | _Inout_ PSYMCRYPT_SHA256_STATE pState, |
192 | | _Out_writes_( SYMCRYPT_SHA256_RESULT_SIZE ) PBYTE pbResult ) |
193 | 11.5k | { |
194 | | // |
195 | | // We don't use the common padding code as that is slower, and SHA-256 is very frequently used in |
196 | | // performance-sensitive areas. |
197 | | // |
198 | 11.5k | UINT32 bytesInBuffer; |
199 | 11.5k | SIZE_T tmp; |
200 | | |
201 | 11.5k | SYMCRYPT_CHECK_MAGIC( pState ); |
202 | | |
203 | 11.5k | bytesInBuffer = pState->bytesInBuffer; |
204 | | |
205 | | // |
206 | | // The buffer is never completely full, so we can always put the first |
207 | | // padding byte in. |
208 | | // |
209 | 11.5k | pState->buffer[bytesInBuffer++] = 0x80; |
210 | | |
211 | 11.5k | if( bytesInBuffer > 64-8 ) { |
212 | | // |
213 | | // No room for the rest of the padding. Pad with zeroes & process block |
214 | | // bytesInBuffer is at most 64, so we do not have an integer underflow |
215 | | // |
216 | 736 | SymCryptWipe( &pState->buffer[bytesInBuffer], 64-bytesInBuffer ); |
217 | 736 | SymCryptSha256AppendBlocks( &pState->chain, pState->buffer, 64, &tmp ); |
218 | 736 | bytesInBuffer = 0; |
219 | 736 | } |
220 | | |
221 | | // |
222 | | // Set rest of padding |
223 | | // At this point bytesInBuffer <= 64-8, so we don't have an underflow |
224 | | // We wipe to the end of the buffer as it is 16-aligned, |
225 | | // and it is faster to wipe to an aligned point |
226 | | // |
227 | 11.5k | SymCryptWipe( &pState->buffer[bytesInBuffer], 64-bytesInBuffer ); |
228 | 11.5k | SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[64-8], pState->dataLengthL * 8 ); |
229 | | |
230 | | // |
231 | | // Process the final block |
232 | | // |
233 | 11.5k | SymCryptSha256AppendBlocks( &pState->chain, pState->buffer, 64, &tmp ); |
234 | | |
235 | | // |
236 | | // Write the output in the correct byte order |
237 | | // |
238 | 11.5k | SymCryptUint32ToMsbFirst( &pState->chain.H[0], pbResult, 8 ); |
239 | | |
240 | | // |
241 | | // Wipe & re-initialize |
242 | | // We have to wipe the whole state because the Init call |
243 | | // might be optimized away by a smart compiler. |
244 | | // |
245 | 11.5k | SymCryptWipeKnownSize( pState, sizeof( *pState ) ); |
246 | | |
247 | 11.5k | memcpy( &pState->chain.H[0], &sha256InitialState[0], sizeof( sha256InitialState ) ); |
248 | 11.5k | SYMCRYPT_SET_MAGIC( pState ); |
249 | 11.5k | } |
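//
// Illustrative padding layout: after hashing the 3-byte message "abc",
// bytesInBuffer == 3 and the single final block processed above is
//
//      61 62 63 80 00 00 .. 00 | 00 00 00 00 00 00 00 18
//
// i.e. the message, the 0x80 marker, zeroes, and the bit length
// (3 * 8 = 24 = 0x18) stored MSB-first in the last 8 bytes.
//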
250 | | |
251 | | |
252 | | VOID |
253 | | SYMCRYPT_CALL |
254 | | SymCryptSha256StateExport( |
255 | | _In_ PCSYMCRYPT_SHA256_STATE pState, |
256 | | _Out_writes_bytes_( SYMCRYPT_SHA256_STATE_EXPORT_SIZE ) PBYTE pbBlob ) |
257 | 0 | { |
258 | 0 | SYMCRYPT_ALIGN SYMCRYPT_SHA256_STATE_EXPORT_BLOB blob; // local copy to have proper alignment. |
259 | 0 | C_ASSERT( sizeof( blob ) == SYMCRYPT_SHA256_STATE_EXPORT_SIZE ); |
260 | |
261 | 0 | SYMCRYPT_CHECK_MAGIC( pState ); |
262 | |
263 | 0 | SymCryptWipeKnownSize( &blob, sizeof( blob ) ); // wipe to avoid any data leakage |
264 | |
265 | 0 | blob.header.magic = SYMCRYPT_BLOB_MAGIC; |
266 | 0 | blob.header.size = SYMCRYPT_SHA256_STATE_EXPORT_SIZE; |
267 | 0 | blob.header.type = SymCryptBlobTypeSha256State; |
268 | | |
269 | | // |
270 | | // Copy the relevant data. Buffer will be 0-padded. |
271 | | // |
272 | |
273 | 0 | SymCryptUint32ToMsbFirst( &pState->chain.H[0], &blob.chain[0], 8 ); |
274 | 0 | blob.dataLength = pState->dataLengthL; |
275 | 0 | memcpy( &blob.buffer[0], &pState->buffer[0], blob.dataLength & 0x3f ); |
276 | |
277 | 0 | SYMCRYPT_ASSERT( (PCBYTE) &blob + sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ) == (PCBYTE) &blob.trailer ); |
278 | 0 | SymCryptMarvin32( SymCryptMarvin32DefaultSeed, (PCBYTE) &blob, sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ), &blob.trailer.checksum[0] ); |
279 | |
280 | 0 | memcpy( pbBlob, &blob, sizeof( blob ) ); |
281 | | |
282 | | //cleanup: |
283 | 0 | SymCryptWipeKnownSize( &blob, sizeof( blob ) ); |
284 | 0 | return; |
285 | 0 | } |
286 | | |
287 | | SYMCRYPT_ERROR |
288 | | SYMCRYPT_CALL |
289 | | SymCryptSha256StateImport( |
290 | | _Out_ PSYMCRYPT_SHA256_STATE pState, |
291 | | _In_reads_bytes_( SYMCRYPT_SHA256_STATE_EXPORT_SIZE) PCBYTE pbBlob ) |
292 | 0 | { |
293 | 0 | SYMCRYPT_ERROR scError = SYMCRYPT_NO_ERROR; |
294 | 0 | SYMCRYPT_ALIGN SYMCRYPT_SHA256_STATE_EXPORT_BLOB blob; // local copy to have proper alignment. |
295 | 0 | BYTE checksum[8]; |
296 | |
297 | 0 | C_ASSERT( sizeof( blob ) == SYMCRYPT_SHA256_STATE_EXPORT_SIZE ); |
298 | 0 | memcpy( &blob, pbBlob, sizeof( blob ) ); |
299 | |
300 | 0 | if( blob.header.magic != SYMCRYPT_BLOB_MAGIC || |
301 | 0 | blob.header.size != SYMCRYPT_SHA256_STATE_EXPORT_SIZE || |
302 | 0 | blob.header.type != SymCryptBlobTypeSha256State ) |
303 | 0 | { |
304 | 0 | scError = SYMCRYPT_INVALID_BLOB; |
305 | 0 | goto cleanup; |
306 | 0 | } |
307 | | |
308 | 0 | SymCryptMarvin32( SymCryptMarvin32DefaultSeed, (PCBYTE) &blob, sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ), checksum ); |
309 | 0 | if( memcmp( checksum, &blob.trailer.checksum[0], 8 ) != 0 ) |
310 | 0 | { |
311 | 0 | scError = SYMCRYPT_INVALID_BLOB; |
312 | 0 | goto cleanup; |
313 | 0 | } |
314 | | |
315 | 0 | SymCryptMsbFirstToUint32( &blob.chain[0], &pState->chain.H[0], 8 ); |
316 | 0 | pState->dataLengthL = blob.dataLength; |
317 | 0 | pState->bytesInBuffer = blob.dataLength & 0x3f; |
318 | 0 | memcpy( &pState->buffer[0], &blob.buffer[0], pState->bytesInBuffer ); |
319 | |
320 | 0 | SYMCRYPT_SET_MAGIC( pState ); |
321 | |
322 | 0 | cleanup: |
323 | 0 | SymCryptWipeKnownSize( &blob, sizeof(blob) ); |
324 | 0 | return scError; |
325 | 0 | } |
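//
// Export/import round-trip sketch (illustrative; state and state2 are
// hypothetical):
//
//      BYTE blob[SYMCRYPT_SHA256_STATE_EXPORT_SIZE];
//      SymCryptSha256StateExport( &state, blob );
//      if( SymCryptSha256StateImport( &state2, blob ) != SYMCRYPT_NO_ERROR )
//      {
//          // reject the blob; header fields or Marvin32 checksum did not verify
//      }
//
// On success, state2 continues the hash exactly where state left off.
//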
326 | | |
327 | | |
328 | | |
329 | | // |
330 | | // Simple test vector for FIPS module testing |
331 | | // |
332 | | |
333 | | const BYTE SymCryptSha256KATAnswer[ 32 ] = { |
334 | | 0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea, |
335 | | 0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23, |
336 | | 0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c, |
337 | | 0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad, |
338 | | } ; |
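// (This is the FIPS 180-2 example digest: SHA-256 of the 3-byte message "abc".)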
339 | | |
340 | | VOID |
341 | | SYMCRYPT_CALL |
342 | | SymCryptSha256Selftest(void) |
343 | 0 | { |
344 | 0 | BYTE result[SYMCRYPT_SHA256_RESULT_SIZE]; |
345 | |
346 | 0 | SymCryptSha256( SymCryptTestMsg3, sizeof( SymCryptTestMsg3 ), result ); |
347 | |
348 | 0 | SymCryptInjectError( result, sizeof( result ) ); |
349 | |
350 | 0 | if( memcmp( result, SymCryptSha256KATAnswer, sizeof( result ) ) != 0 ) { |
351 | 0 | SymCryptFatal( 'SH25' ); |
352 | 0 | } |
353 | 0 | } |
354 | | |
355 | | |
356 | | |
357 | | // |
358 | | // Below are multiple implementations of the SymCryptSha256AppendBlocks function, |
359 | | // with a compile-time switch that selects which one to use. |
360 | | // We keep the multiple implementations here for future reference; |
361 | | // as CPU architectures evolve we might want to switch to one of the |
362 | | // other implementations. |
363 | | // All implementations here have been tested, but some lack production hardening. |
364 | | // |
365 | | |
366 | | // |
367 | | // Enable frame pointer omission to free up an extra register on X86. |
368 | | // |
369 | | #if SYMCRYPT_CPU_X86 && SYMCRYPT_MS_VC |
370 | | #pragma optimize( "y", on ) |
371 | | #endif |
372 | | |
373 | | // |
374 | | // For documentation on these function see FIPS 180-2 |
375 | | // |
376 | | // MAJ and CH are the functions Maj and Ch from the standard. |
377 | | // CSIGMA0 and CSIGMA1 are the capital sigma functions. |
378 | | // LSIGMA0 and LSIGMA1 are the lowercase sigma functions. |
379 | | // |
380 | | // The canonical definitions of the MAJ and CH functions are: |
381 | | //#define MAJ( x, y, z ) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) |
382 | | //#define CH( x, y, z ) (((x) & (y)) ^ ((~(x)) & (z))) |
383 | | // We use optimized versions defined below |
384 | | // |
385 | 28.4M | #define MAJ( x, y, z ) ((((z) | (y)) & (x) ) | ((z) & (y))) |
386 | 28.4M | #define CH( x, y, z ) ((((z) ^ (y)) & (x)) ^ (z)) |
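//
// The optimized forms are equivalent to the canonical ones. All the operators
// involved are bitwise, so checking the 8 single-bit input combinations
// covers every bit position. A minimal self-contained check (illustrative
// sketch, not part of the library):
//
//      for( UINT32 x = 0; x < 2; x++ )
//      for( UINT32 y = 0; y < 2; y++ )
//      for( UINT32 z = 0; z < 2; z++ )
//      {
//          SYMCRYPT_ASSERT( MAJ( x, y, z ) == ((x & y) ^ (x & z) ^ (y & z)) );
//          SYMCRYPT_ASSERT( CH( x, y, z )  == ((x & y) ^ (~x & z)) );
//      }
//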
387 | | |
388 | | // |
389 | | // The four Sigma functions |
390 | | // |
391 | | |
392 | | // |
393 | | // We have two versions of the rotate-and-xor functions. |
394 | | // One is just a macro that does the rotations and xors; |
395 | | // this works well on ARM. |
396 | | // For Intel/AMD we have one that derives each rotated value |
397 | | // from the previous intermediate result. This removes one |
398 | | // register copy from the code stream. |
399 | | // |
400 | | // |
401 | | // In practice, our compiler doesn't take advantage of the |
402 | | // reduction in the number of operations required, and inserts a |
403 | | // bunch of extra register copies anyway. |
404 | | // It actually hurts on AMD64. |
405 | | // |
406 | | // This should be re-tuned for every release to get the best overall |
407 | | // SHA-256 performance. |
408 | | // At the moment we get an improvement from 19.76 c/B to 19.40 c/B on a Core 2 core. |
409 | | // We should probably tune this to the Atom CPU. |
410 | | // |
411 | | #if SYMCRYPT_CPU_X86 |
412 | | #define USE_CSIGMA0_MULTIROT 1 |
413 | | #define USE_CSIGMA1_MULTIROT 0 |
414 | | #define USE_LSIGMA0_MULTIROT 0 |
415 | | #define USE_LSIGMA1_MULTIROT 0 |
416 | | |
417 | | #else |
418 | | // |
419 | | // On ARM we have no reason to believe this helps at all. |
420 | | // on AMD64 it slows our code down. |
421 | | // |
422 | | #define USE_CSIGMA0_MULTIROT 0 |
423 | | #define USE_CSIGMA1_MULTIROT 0 |
424 | | #define USE_LSIGMA0_MULTIROT 0 |
425 | | #define USE_LSIGMA1_MULTIROT 0 |
426 | | #endif |
427 | | |
428 | | #if USE_CSIGMA0_MULTIROT |
429 | | FORCEINLINE |
430 | | UINT32 |
431 | | CSIGMA0( UINT32 x ) |
432 | | { |
433 | | UINT32 res; |
434 | | x = ROR32( x, 2 ); |
435 | | res = x; |
436 | | x = ROR32( x, 11 ); |
437 | | res ^= x; |
438 | | x = ROR32( x, 9 ); |
439 | | res ^= x; |
440 | | return res; |
441 | | } |
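// (The chained rotations reproduce the macro below because rotation amounts
// accumulate: ROR32(x,2), then by 11 more giving ROR32(x,13), then by 9 more
// giving ROR32(x,22). The same pattern holds for the other MULTIROT variants.)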
442 | | #else |
443 | 28.4M | #define CSIGMA0( x ) (ROR32((x), 2) ^ ROR32((x), 13) ^ ROR32((x), 22)) |
444 | | #endif |
445 | | |
446 | | #if USE_CSIGMA1_MULTIROT |
447 | | FORCEINLINE |
448 | | UINT32 |
449 | | CSIGMA1( UINT32 x ) |
450 | | { |
451 | | UINT32 res; |
452 | | x = ROR32( x, 6 ); |
453 | | res = x; |
454 | | x = ROR32( x, 5 ); |
455 | | res ^= x; |
456 | | x = ROR32( x, 14 ); |
457 | | res ^= x; |
458 | | return res; |
459 | | } |
460 | | #else |
461 | 28.4M | #define CSIGMA1( x ) (ROR32((x), 6) ^ ROR32((x), 11) ^ ROR32((x), 25)) |
462 | | #endif |
463 | | |
464 | | #if USE_LSIGMA0_MULTIROT |
465 | | FORCEINLINE |
466 | | UINT32 |
467 | | LSIGMA0( UINT32 x ) |
468 | | { |
469 | | UINT32 res; |
470 | | res = x >> 3; |
471 | | x = ROR32( x, 7 ); |
472 | | res ^= x; |
473 | | x = ROR32( x, 11 ); |
474 | | res ^= x; |
475 | | return res; |
476 | | } |
477 | | #else |
478 | 21.3M | #define LSIGMA0( x ) (ROR32((x), 7) ^ ROR32((x), 18) ^ ((x)>> 3)) |
479 | | #endif |
480 | | |
481 | | #if USE_LSIGMA1_MULTIROT |
482 | | FORCEINLINE |
483 | | UINT32 |
484 | | LSIGMA1( UINT32 x ) |
485 | | { |
486 | | UINT32 res; |
487 | | res = x >> 10; |
488 | | x = ROR32( x, 17 ); |
489 | | res ^= x; |
490 | | x = ROR32( x, 2 ); |
491 | | res ^= x; |
492 | | return res; |
493 | | } |
494 | | #else |
495 | 21.3M | #define LSIGMA1( x ) (ROR32((x), 17) ^ ROR32((x), 19) ^ ((x)>>10)) |
496 | | #endif |
497 | | |
498 | | |
499 | | // |
500 | | // The values a-h are stored in an array called ah. |
501 | | // We have unrolled the loop 16 times. This makes both the indices into |
502 | | // the ah array constant, and it makes the message addressing constant. |
503 | | // This provides a significant speed improvement, at the cost of making |
504 | | // the main loop about 4 kB in code. |
505 | | // |
506 | | // The earlier implementation had the loop unrolled 8 times, and was |
507 | | // around 10 cycles/byte slower. If loading the code from disk takes |
508 | | // 100 cycles/byte, then we break even once we have hashed 20 kB. |
509 | | // This is a worthwhile tradeoff as all code is code-signed with SHA-256. |
510 | | // |
511 | | |
512 | | // |
513 | | // Core round macro |
514 | | // |
515 | | // r16 is the round number mod 16, r is the round number. |
516 | | // r16 is a separate macro argument because it is always a compile-time constant |
517 | | // which allows much better optimisations of the memory accesses. |
518 | | // |
519 | | // ah[ r16 &7] = h |
520 | | // ah[(r16+1)&7] = g; |
521 | | // ah[(r16+2)&7] = f; |
522 | | // ah[(r16+3)&7] = e; |
523 | | // ah[(r16+4)&7] = d; |
524 | | // ah[(r16+5)&7] = c; |
525 | | // ah[(r16+6)&7] = b; |
526 | | // ah[(r16+7)&7] = a; |
527 | | // |
528 | | // After that incrementing the round number will automatically map a->b, b->c, etc. |
529 | | // |
530 | | // The core round, after the message word has been computed for this round and put in Wt. |
531 | | // r16 is the round number modulo 16. (Static after loop unrolling) |
532 | | // r is the round number (dynamic, which is why we don't use (r&0xf) for r16) |
533 | | // In more readable form this macro does the following: |
534 | | // h += CSIGMA( e ) + CH( e, f, g ) + K[round] + W[round]; |
535 | | // d += h; |
536 | | // h += CSIGMA( a ) + MAJ( a, b, c ); |
537 | | // |
538 | 28.4M | #define CROUND( r16, r ) {;\ |
539 | 28.4M | ah[ r16 &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + SymCryptSha256K[r] + Wt;\ |
540 | 28.4M | ah[(r16+4)&7] += ah[r16 &7];\ |
541 | 28.4M | ah[ r16 &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\ |
542 | 28.4M | } |
543 | | |
544 | | // |
545 | | // Initial round that reads the message. |
546 | | // r is the round number 0..15 |
547 | | // |
548 | 7.11M | #define IROUND( r ) {\ |
549 | 7.11M | Wt = SYMCRYPT_LOAD_MSBFIRST32( &pbData[ 4*r ] );\ |
550 | 7.11M | W[r] = Wt; \ |
551 | 7.11M | CROUND(r,r);\ |
552 | 7.11M | } |
553 | | |
554 | | // |
555 | | // Subsequent rounds. |
556 | | // r16 is the round number mod 16. rb is the round number minus r16. |
557 | | // |
558 | 21.3M | #define FROUND(r16, rb) { \ |
559 | 21.3M | Wt = LSIGMA1( W[(r16-2) & 15] ) + W[(r16-7) & 15] + \ |
560 | 21.3M | LSIGMA0( W[(r16-15) & 15]) + W[r16 & 15]; \ |
561 | 21.3M | W[r16] = Wt; \ |
562 | 21.3M | CROUND( r16, r16+rb ); \ |
563 | 21.3M | } |
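//
// FROUND computes the FIPS 180-2 message schedule recurrence
//      W_t = LSIGMA1( W_{t-2} ) + W_{t-7} + LSIGMA0( W_{t-15} ) + W_{t-16}
// in a 16-word circular buffer: W[r16] holds W_{t-16} on entry and is
// overwritten with W_t.
//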
564 | | |
565 | | // |
566 | | // UINT32 implementation 1 |
567 | | // |
568 | | VOID |
569 | | SYMCRYPT_CALL |
570 | | SymCryptSha256AppendBlocks_ul1( |
571 | | _Inout_ SYMCRYPT_SHA256_CHAINING_STATE * pChain, |
572 | | _In_reads_( cbData ) PCBYTE pbData, |
573 | | SIZE_T cbData, |
574 | | _Out_ SIZE_T * pcbRemaining ) |
575 | 21.4k | { |
576 | 21.4k | SYMCRYPT_ALIGN UINT32 W[16]; |
577 | 21.4k | SYMCRYPT_ALIGN UINT32 ah[8]; |
578 | 21.4k | int round; |
579 | 21.4k | UINT32 Wt; |
580 | | |
581 | 465k | while( cbData >= 64 ) |
582 | 444k | { |
583 | 444k | ah[7] = pChain->H[0]; |
584 | 444k | ah[6] = pChain->H[1]; |
585 | 444k | ah[5] = pChain->H[2]; |
586 | 444k | ah[4] = pChain->H[3]; |
587 | 444k | ah[3] = pChain->H[4]; |
588 | 444k | ah[2] = pChain->H[5]; |
589 | 444k | ah[1] = pChain->H[6]; |
590 | 444k | ah[0] = pChain->H[7]; |
591 | | |
592 | | // |
593 | | // initial rounds 0 to 15 |
594 | | // |
595 | | |
596 | 444k | IROUND( 0 ); |
597 | 444k | IROUND( 1 ); |
598 | 444k | IROUND( 2 ); |
599 | 444k | IROUND( 3 ); |
600 | 444k | IROUND( 4 ); |
601 | 444k | IROUND( 5 ); |
602 | 444k | IROUND( 6 ); |
603 | 444k | IROUND( 7 ); |
604 | 444k | IROUND( 8 ); |
605 | 444k | IROUND( 9 ); |
606 | 444k | IROUND( 10 ); |
607 | 444k | IROUND( 11 ); |
608 | 444k | IROUND( 12 ); |
609 | 444k | IROUND( 13 ); |
610 | 444k | IROUND( 14 ); |
611 | 444k | IROUND( 15 ); |
612 | | |
613 | | |
614 | | // |
615 | | // rounds 16 to 63. |
616 | | // |
617 | 1.77M | for( round=16; round<64; round += 16 ) |
618 | 1.33M | { |
619 | 1.33M | FROUND( 0, round ); |
620 | 1.33M | FROUND( 1, round ); |
621 | 1.33M | FROUND( 2, round ); |
622 | 1.33M | FROUND( 3, round ); |
623 | 1.33M | FROUND( 4, round ); |
624 | 1.33M | FROUND( 5, round ); |
625 | 1.33M | FROUND( 6, round ); |
626 | 1.33M | FROUND( 7, round ); |
627 | 1.33M | FROUND( 8, round ); |
628 | 1.33M | FROUND( 9, round ); |
629 | 1.33M | FROUND( 10, round ); |
630 | 1.33M | FROUND( 11, round ); |
631 | 1.33M | FROUND( 12, round ); |
632 | 1.33M | FROUND( 13, round ); |
633 | 1.33M | FROUND( 14, round ); |
634 | 1.33M | FROUND( 15, round ); |
635 | 1.33M | } |
636 | | |
637 | 444k | pChain->H[0] = ah[7] + pChain->H[0]; |
638 | 444k | pChain->H[1] = ah[6] + pChain->H[1]; |
639 | 444k | pChain->H[2] = ah[5] + pChain->H[2]; |
640 | 444k | pChain->H[3] = ah[4] + pChain->H[3]; |
641 | 444k | pChain->H[4] = ah[3] + pChain->H[4]; |
642 | 444k | pChain->H[5] = ah[2] + pChain->H[5]; |
643 | 444k | pChain->H[6] = ah[1] + pChain->H[6]; |
644 | 444k | pChain->H[7] = ah[0] + pChain->H[7]; |
645 | | |
646 | 444k | pbData += 64; |
647 | 444k | cbData -= 64; |
648 | | |
649 | 444k | } |
650 | | |
651 | 21.4k | *pcbRemaining = cbData; |
652 | | |
653 | | // |
654 | | // Wipe the variables. |
655 | | // |
656 | 21.4k | SymCryptWipeKnownSize( ah, sizeof( ah ) ); |
657 | 21.4k | SymCryptWipeKnownSize( W, sizeof( W ) ); |
658 | 21.4k | SYMCRYPT_FORCE_WRITE32( &Wt, 0 ); |
659 | 21.4k | } |
660 | | |
661 | | VOID |
662 | | SYMCRYPT_CALL |
663 | | SymCryptSha256AppendBlocks_ul2( |
664 | | _Inout_ SYMCRYPT_SHA256_CHAINING_STATE * pChain, |
665 | | _In_reads_( cbData ) PCBYTE pbData, |
666 | | SIZE_T cbData, |
667 | | _Out_ SIZE_T * pcbRemaining ) |
668 | 0 | { |
669 | | // |
670 | | // Different arrangement of the code, currently 25 c/B vs 20 c/B for the version above. |
671 | | // On Atom: 50 c/B vs 41 c/B for the one above. |
672 | | // |
673 | 0 | SYMCRYPT_ALIGN UINT32 buf[4 + 8 + 64]; // chaining state concatenated with the expanded input block |
674 | 0 | UINT32 * W = &buf[4 + 8]; |
675 | 0 | UINT32 * ha = &buf[4]; // initial state words, in order h, g, ..., b, a |
676 | 0 | UINT32 A, B, C, D, T; |
677 | 0 | int r; |
678 | |
679 | 0 | ha[7] = pChain->H[0]; buf[3] = ha[7]; |
680 | 0 | ha[6] = pChain->H[1]; buf[2] = ha[6]; |
681 | 0 | ha[5] = pChain->H[2]; buf[1] = ha[5]; |
682 | 0 | ha[4] = pChain->H[3]; buf[0] = ha[4]; |
683 | 0 | ha[3] = pChain->H[4]; |
684 | 0 | ha[2] = pChain->H[5]; |
685 | 0 | ha[1] = pChain->H[6]; |
686 | 0 | ha[0] = pChain->H[7]; |
687 | |
688 | 0 | while( cbData >= 64 ) |
689 | 0 | { |
690 | | // |
691 | | // Capture the input into W[0..15] |
692 | | // |
693 | 0 | for( r=0; r<16; r++ ) |
694 | 0 | { |
695 | 0 | W[r] = SYMCRYPT_LOAD_MSBFIRST32( &pbData[ 4*r ] ); |
696 | 0 | } |
697 | | |
698 | | // |
699 | | // Expand the message |
700 | | // |
701 | 0 | A = W[15]; |
702 | 0 | B = W[14]; |
703 | 0 | D = W[0]; |
704 | 0 | for( r=16; r<64; r+= 2 ) |
705 | 0 | { |
706 | | // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16] |
707 | | |
708 | | // |
709 | | // Macro for one word of message expansion. |
710 | | // Invariant: |
711 | | // on entry: a = W[r-1], b = W[r-2], d = W[r-16] |
712 | | // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15] |
713 | | // |
714 | 0 | #define EXPAND( a, b, c, d, r ) \ |
715 | 0 | c = W[r-15]; \ |
716 | 0 | b = d + LSIGMA1( b ) + W[r-7] + LSIGMA0( c ); \ |
717 | 0 | W[r] = b; \ |
718 | 0 |
719 | 0 | EXPAND( A, B, C, D, r ); |
720 | 0 | EXPAND( B, A, D, C, (r+1)); |
721 | |
722 | 0 | #undef EXPAND |
723 | 0 | } |
724 | |
725 | 0 | A = ha[7]; |
726 | 0 | B = ha[6]; |
727 | 0 | C = ha[5]; |
728 | 0 | D = ha[4]; |
729 | |
730 | 0 | for( r=0; r<64; r += 4 ) |
731 | 0 | { |
732 | | // |
733 | | // Loop invariant: |
734 | | // A, B, C, and D are the a,b,c,d values of the current state. |
735 | | // W[r] is the next expanded message word to be processed. |
736 | | // W[r-8 .. r-5] contain the current state words h, g, f, e. |
737 | | // |
738 | | |
739 | | // |
740 | | // Macro to compute one round |
741 | | // |
742 | 0 | #define DO_ROUND( a, b, c, d, t, r ) \ |
743 | 0 | t = W[r] + CSIGMA1( W[r-5] ) + W[r-8] + CH( W[r-5], W[r-6], W[r-7] ) + SymCryptSha256K[r]; \ |
744 | 0 | W[r-4] = t + d; \ |
745 | 0 | d = t + CSIGMA0( a ) + MAJ( c, b, a ); |
746 | |
747 | 0 | DO_ROUND( A, B, C, D, T, r ); |
748 | 0 | DO_ROUND( D, A, B, C, T, (r+1) ); |
749 | 0 | DO_ROUND( C, D, A, B, T, (r+2) ); |
750 | 0 | DO_ROUND( B, C, D, A, T, (r+3) ); |
751 | 0 | #undef DO_ROUND |
752 | 0 | } |
753 | |
|
754 | 0 | buf[3] = ha[7] = buf[3] + A; |
755 | 0 | buf[2] = ha[6] = buf[2] + B; |
756 | 0 | buf[1] = ha[5] = buf[1] + C; |
757 | 0 | buf[0] = ha[4] = buf[0] + D; |
758 | 0 | ha[3] += W[r-5]; |
759 | 0 | ha[2] += W[r-6]; |
760 | 0 | ha[1] += W[r-7]; |
761 | 0 | ha[0] += W[r-8]; |
762 | |
763 | 0 | pbData += 64; |
764 | 0 | cbData -= 64; |
765 | 0 | } |
766 | |
767 | 0 | pChain->H[0] = ha[7]; |
768 | 0 | pChain->H[1] = ha[6]; |
769 | 0 | pChain->H[2] = ha[5]; |
770 | 0 | pChain->H[3] = ha[4]; |
771 | 0 | pChain->H[4] = ha[3]; |
772 | 0 | pChain->H[5] = ha[2]; |
773 | 0 | pChain->H[6] = ha[1]; |
774 | 0 | pChain->H[7] = ha[0]; |
775 | |
776 | 0 | *pcbRemaining = cbData; |
777 | |
778 | 0 | SymCryptWipeKnownSize( buf, sizeof( buf ) ); |
779 | 0 | SYMCRYPT_FORCE_WRITE32( &A, 0 ); |
780 | 0 | SYMCRYPT_FORCE_WRITE32( &B, 0 ); |
781 | 0 | SYMCRYPT_FORCE_WRITE32( &D, 0 ); |
782 | 0 | SYMCRYPT_FORCE_WRITE32( &T, 0 ); |
783 | 0 | } |
784 | | |
785 | | #undef CROUND |
786 | | #undef IROUND |
787 | | #undef FROUND |
788 | | |
789 | | #if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 |
790 | | |
791 | | // |
792 | | // Don't omit frame pointer for XMM code; it isn't as register-starved |
793 | | // |
794 | | #if SYMCRYPT_CPU_X86 && SYMCRYPT_MS_VC |
795 | | #pragma optimize( "y", off ) |
796 | | #endif |
797 | | |
798 | | // |
799 | | // Code that uses the XMM registers. |
800 | | // This code is currently unused. It was written in case it would provide better performance, but |
801 | | // it did not. We are retaining it in case it might be useful in a future CPU generation. |
802 | | // |
803 | | #if 0 |
804 | | |
805 | | #define MAJXMM( x, y, z ) _mm_or_si128( _mm_and_si128( _mm_or_si128( z, y ), x ), _mm_and_si128( z, y )) |
806 | | #define CHXMM( x, y, z ) _mm_xor_si128( _mm_and_si128( _mm_xor_si128( z, y ), x ), z ) |
807 | | |
808 | | #define CSIGMA0XMM( x ) \ |
809 | | _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \ |
810 | | _mm_slli_epi32(x,30) , _mm_srli_epi32(x, 2) ),\ |
811 | | _mm_slli_epi32(x,19) ), _mm_srli_epi32(x, 13) ),\ |
812 | | _mm_slli_epi32(x,10) ), _mm_srli_epi32(x, 22) ) |
813 | | #define CSIGMA1XMM( x ) \ |
814 | | _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \ |
815 | | _mm_slli_epi32(x,26) , _mm_srli_epi32(x, 6) ),\ |
816 | | _mm_slli_epi32(x,21) ), _mm_srli_epi32(x, 11) ),\ |
817 | | _mm_slli_epi32(x,7) ), _mm_srli_epi32(x, 25) ) |
818 | | #define LSIGMA0XMM( x ) \ |
819 | | _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \ |
820 | | _mm_slli_epi32(x,25) , _mm_srli_epi32(x, 7) ),\ |
821 | | _mm_slli_epi32(x,14) ), _mm_srli_epi32(x, 18) ),\ |
822 | | _mm_srli_epi32(x, 3) ) |
823 | | #define LSIGMA1XMM( x ) \ |
824 | | _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \ |
825 | | _mm_slli_epi32(x,15) , _mm_srli_epi32(x, 17) ),\ |
826 | | _mm_slli_epi32(x,13) ), _mm_srli_epi32(x, 19) ),\ |
827 | | _mm_srli_epi32(x,10) ) |
828 | | |
829 | | VOID |
830 | | SYMCRYPT_CALL |
831 | | SymCryptSha256AppendBlocks_xmm1( |
832 | | _Inout_ SYMCRYPT_SHA256_CHAINING_STATE * pChain, |
833 | | _In_reads_( cbData ) PCBYTE pbData, |
834 | | SIZE_T cbData, |
835 | | _Out_ SIZE_T * pcbRemaining ) |
836 | | { |
837 | | // |
838 | | // Implementation that has one value in each XMM register. |
839 | | // This is significantly slower than the _ul1 implementation |
840 | | // but can be extended to compute 4 hash blocks in parallel. |
841 | | // |
842 | | SYMCRYPT_ALIGN __m128i buf[4 + 8 + 64]; // chaining state concatenated with the expanded input block |
843 | | __m128i * W = &buf[4 + 8]; |
844 | | __m128i * ha = &buf[4]; // initial state words, in order h, g, ..., b, a |
845 | | __m128i A, B, C, D, T; |
846 | | int r; |
847 | | |
848 | | // |
849 | | // For 1-input only; set the input buffer to zero so that we have known values in every byte |
850 | | // |
851 | | //SymCryptWipeKnownSize( buf, sizeof( buf ) ); |
852 | | |
853 | | // |
854 | | // Copy the chaining state into the start of the buffer, order = h,g,f,e,d,c,b,a |
855 | | // |
856 | | ha[7] = _mm_insert_epi32(ha[7], pChain->H[0], 0); |
857 | | ha[6] = _mm_insert_epi32(ha[6], pChain->H[1], 0); |
858 | | ha[5] = _mm_insert_epi32(ha[5], pChain->H[2], 0); |
859 | | ha[4] = _mm_insert_epi32(ha[4], pChain->H[3], 0); |
860 | | ha[3] = _mm_insert_epi32(ha[3], pChain->H[4], 0); |
861 | | ha[2] = _mm_insert_epi32(ha[2], pChain->H[5], 0); |
862 | | ha[1] = _mm_insert_epi32(ha[1], pChain->H[6], 0); |
863 | | ha[0] = _mm_insert_epi32(ha[0], pChain->H[7], 0); |
864 | | |
865 | | buf[0] = ha[4]; |
866 | | buf[1] = ha[5]; |
867 | | buf[2] = ha[6]; |
868 | | buf[3] = ha[7]; |
869 | | |
870 | | while( cbData >= 64 ) |
871 | | { |
872 | | |
873 | | // |
874 | | // Capture the input into W[0..15] |
875 | | // |
876 | | for( r=0; r<16; r++ ) |
877 | | { |
878 | | W[r] = _mm_insert_epi32(W[r], SYMCRYPT_LOAD_MSBFIRST32( &pbData[ 4*r ] ), 0); |
879 | | } |
880 | | |
881 | | // |
882 | | // Expand the message |
883 | | // |
884 | | A = W[15]; |
885 | | B = W[14]; |
886 | | D = W[0]; |
887 | | for( r=16; r<64; r+= 2 ) |
888 | | { |
889 | | // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16] |
890 | | |
891 | | // |
892 | | // Macro for one word of message expansion. |
893 | | // Invariant: |
894 | | // on entry: a = W[r-1], b = W[r-2], d = W[r-16] |
895 | | // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15] |
896 | | // |
897 | | #define EXPAND( a, b, c, d, r ) \ |
898 | | c = W[r-15]; \ |
899 | | b = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( d, LSIGMA1XMM( b ) ), W[r-7] ), LSIGMA0XMM( c ) ); \ |
900 | | W[r] = b; \ |
901 | | |
902 | | EXPAND( A, B, C, D, r ); |
903 | | EXPAND( B, A, D, C, (r+1)); |
904 | | |
905 | | #undef EXPAND |
906 | | } |
907 | | |
908 | | A = ha[7]; |
909 | | B = ha[6]; |
910 | | C = ha[5]; |
911 | | D = ha[4]; |
912 | | |
913 | | for( r=0; r<64; r += 4 ) |
914 | | { |
915 | | // |
916 | | // Loop invariant: |
917 | | // A, B, C, and D are the a,b,c,d values of the current state. |
918 | | // W[r] is the next expanded message word to be processed. |
919 | | // W[r-8 .. r-5] contain the current state words h, g, f, e. |
920 | | // |
921 | | |
922 | | // |
923 | | // Macro to compute one round |
924 | | // |
925 | | #define DO_ROUND( a, b, c, d, t, r ) \ |
926 | | t = W[r]; \ |
927 | | t = _mm_add_epi32( t, CSIGMA1XMM( W[r-5] ) ); \ |
928 | | t = _mm_add_epi32( t, W[r-8] ); \ |
929 | | t = _mm_add_epi32( t, CHXMM( W[r-5], W[r-6], W[r-7] ) ); \ |
930 | | t = _mm_add_epi32( t, _mm_cvtsi32_si128( SymCryptSha256K[r] ) ); \ |
931 | | W[r-4] = _mm_add_epi32( t, d ); \ |
932 | | d = _mm_add_epi32( t, CSIGMA0XMM( a ) ); \ |
933 | | d = _mm_add_epi32( d, MAJXMM( c, b, a ) ); |
934 | | |
935 | | DO_ROUND( A, B, C, D, T, r ); |
936 | | DO_ROUND( D, A, B, C, T, (r+1) ); |
937 | | DO_ROUND( C, D, A, B, T, (r+2) ); |
938 | | DO_ROUND( B, C, D, A, T, (r+3) ); |
939 | | #undef DO_ROUND |
940 | | } |
941 | | |
942 | | buf[3] = ha[7] = _mm_add_epi32( buf[3], A ); |
943 | | buf[2] = ha[6] = _mm_add_epi32( buf[2], B ); |
944 | | buf[1] = ha[5] = _mm_add_epi32( buf[1], C ); |
945 | | buf[0] = ha[4] = _mm_add_epi32( buf[0], D ); |
946 | | ha[3] = _mm_add_epi32( ha[3], W[r-5] ); |
947 | | ha[2] = _mm_add_epi32( ha[2], W[r-6] ); |
948 | | ha[1] = _mm_add_epi32( ha[1], W[r-7] ); |
949 | | ha[0] = _mm_add_epi32( ha[0], W[r-8] ); |
950 | | |
951 | | pbData += 64; |
952 | | cbData -= 64; |
953 | | } |
954 | | |
955 | | // |
956 | | // Copy the chaining state back into the hash structure |
957 | | // |
958 | | pChain->H[0] = _mm_extract_epi32(ha[7], 0); |
959 | | pChain->H[1] = _mm_extract_epi32(ha[6], 0); |
960 | | pChain->H[2] = _mm_extract_epi32(ha[5], 0); |
961 | | pChain->H[3] = _mm_extract_epi32(ha[4], 0); |
962 | | pChain->H[4] = _mm_extract_epi32(ha[3], 0); |
963 | | pChain->H[5] = _mm_extract_epi32(ha[2], 0); |
964 | | pChain->H[6] = _mm_extract_epi32(ha[1], 0); |
965 | | pChain->H[7] = _mm_extract_epi32(ha[0], 0); |
966 | | |
967 | | *pcbRemaining = cbData; |
968 | | |
969 | | SymCryptWipeKnownSize( buf, sizeof( buf ) ); |
970 | | SymCryptWipeKnownSize( &A, sizeof( A ) ); |
971 | | SymCryptWipeKnownSize( &B, sizeof( B ) ); |
972 | | SymCryptWipeKnownSize( &C, sizeof( C ) ); |
973 | | SymCryptWipeKnownSize( &D, sizeof( D ) ); |
974 | | SymCryptWipeKnownSize( &T, sizeof( T ) ); |
975 | | } |
976 | | |
977 | | |
978 | | // |
979 | | // XMM implementation 2 |
980 | | // We use the XMM registers to compute part of the message schedule. |
981 | | // The load, BSWAP, and part of the message schedule recursion are done in XMM registers. |
982 | | // The rest of the work is done using integers. |
983 | | // |
984 | | // Core2: 0.1 c/B slower than the _ul1 |
985 | | // Atom: 1.0 c/B slower than _ul1 (42.34 vs 41.39 c/B) |
986 | | // |
987 | | VOID |
988 | | SYMCRYPT_CALL |
989 | | SymCryptSha256AppendBlocks_xmm2( |
990 | | _Inout_ SYMCRYPT_SHA256_CHAINING_STATE * pChain, |
991 | | _In_reads_( cbData ) PCBYTE pbData, |
992 | | SIZE_T cbData, |
993 | | _Out_ SIZE_T * pcbRemaining ) |
994 | | { |
995 | | SYMCRYPT_ALIGN union { UINT32 ul[16]; __m128i xmm[4]; } W; |
996 | | SYMCRYPT_ALIGN UINT32 ah[8]; |
997 | | int round; |
998 | | UINT32 Wt; |
999 | | const __m128i BYTE_REVERSE_32 = _mm_set_epi8( 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 ); |
1000 | | |
1001 | | ah[7] = pChain->H[0]; |
1002 | | ah[6] = pChain->H[1]; |
1003 | | ah[5] = pChain->H[2]; |
1004 | | ah[4] = pChain->H[3]; |
1005 | | ah[3] = pChain->H[4]; |
1006 | | ah[2] = pChain->H[5]; |
1007 | | ah[1] = pChain->H[6]; |
1008 | | ah[0] = pChain->H[7]; |
1009 | | |
1010 | | #define CROUND( r16, r ) {;\ |
1011 | | ah[ r16 &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + SymCryptSha256K[r] + Wt;\ |
1012 | | ah[(r16+4)&7] += ah[r16 &7];\ |
1013 | | ah[ r16 &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\ |
1014 | | } |
1015 | | |
1016 | | |
1017 | | // |
1018 | | // Initial round that reads the message. |
1019 | | // r is the round number 0..15 |
1020 | | // |
1021 | | // Wt = LOAD_MSBFIRST32( &pbData[ 4*r ] );\ |
1022 | | // W.ul[r] = Wt; \ |
1023 | | |
1024 | | #define IROUND( r ) {\ |
1025 | | Wt = W.ul[r];\ |
1026 | | CROUND(r,r);\ |
1027 | | } |
1028 | | |
1029 | | // |
1030 | | // Subsequent rounds. |
1031 | | // r16 is the round number mod 16. rb is the round number minus r16. |
1032 | | // |
1033 | | #define FROUND(r16, rb) { \ |
1034 | | Wt = W.ul[r16];\ |
1035 | | CROUND( r16, r16+rb ); \ |
1036 | | } |
1037 | | |
1038 | | |
1039 | | while( cbData >= 64 ) |
1040 | | { |
1041 | | // |
1042 | | // The code is faster if we directly access the W.ul array, rather than the W.xmm alias. |
1043 | | // I think the compiler gets more confused if you use the W.xmm values. |
1044 | | // We retain them in the union to ensure alignment |
1045 | | // |
1046 | | _mm_store_si128( (__m128i *)&W.ul[ 0], _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *)&pbData[ 0 ] ), BYTE_REVERSE_32 )); |
1047 | | _mm_store_si128( (__m128i *)&W.ul[ 4], _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *)&pbData[ 16 ] ), BYTE_REVERSE_32 )); |
1048 | | _mm_store_si128( (__m128i *)&W.ul[ 8], _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *)&pbData[ 32 ] ), BYTE_REVERSE_32 )); |
1049 | | _mm_store_si128( (__m128i *)&W.ul[12], _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *)&pbData[ 48 ] ), BYTE_REVERSE_32 )); |
1050 | | |
1051 | | // |
1052 | | // initial rounds 0 to 15 |
1053 | | // |
1054 | | |
1055 | | IROUND( 0 ); |
1056 | | IROUND( 1 ); |
1057 | | IROUND( 2 ); |
1058 | | IROUND( 3 ); |
1059 | | IROUND( 4 ); |
1060 | | IROUND( 5 ); |
1061 | | IROUND( 6 ); |
1062 | | IROUND( 7 ); |
1063 | | IROUND( 8 ); |
1064 | | IROUND( 9 ); |
1065 | | IROUND( 10 ); |
1066 | | IROUND( 11 ); |
1067 | | IROUND( 12 ); |
1068 | | IROUND( 13 ); |
1069 | | IROUND( 14 ); |
1070 | | IROUND( 15 ); |
1071 | | |
1072 | | |
1073 | | // |
1074 | | // rounds 16 to 63. |
1075 | | // |
1076 | | for( round=16; round<64; round += 16 ) |
1077 | | { |
1078 | | __m128i Tmp; |
1079 | | |
1080 | | Tmp = _mm_add_epi32( _mm_add_epi32( |
1081 | | LSIGMA0XMM(_mm_loadu_si128( (__m128i *)&W.ul[1] )), |
1082 | | _mm_load_si128( (__m128i *)&W.ul[0] ) ), |
1083 | | _mm_loadu_si128( (__m128i *)&W.ul[9] ) ); |
1084 | | |
1085 | | // |
1086 | | // The final part of the message schedule can be done in XMM registers, but it isn't worth it. |
1087 | | // The rotates in XMM take two shifts and an OR/XOR, vs one instruction in integer registers. |
1088 | | // As the sigma1( W_{t-2} ) recursion component can only be computed 2 at a time |
1089 | | // (because the results of the first two are the inputs to the second two) |
1090 | | // you lose more than you gain by using XMM registers. |
1091 | | // |
1092 | | //Tmp = _mm_add_epi32( Tmp, LSIGMA1XMM( _mm_srli_si128( _mm_load_si128( (__m128i *)&W.ul[12] ), 8 ) ) ); |
1093 | | //Tmp = _mm_add_epi32( Tmp, LSIGMA1XMM( _mm_slli_si128( Tmp, 8 ) ) ); |
1094 | | //_mm_store_si128( (__m128i *)&W.ul[0], Tmp ); |
1095 | | // |
1096 | | |
1097 | | _mm_store_si128( (__m128i *)&W.ul[0], Tmp ); |
1098 | | W.ul[0] += LSIGMA1( W.ul[14] ); |
1099 | | W.ul[1] += LSIGMA1( W.ul[15] ); |
1100 | | W.ul[2] += LSIGMA1( W.ul[0] ); |
1101 | | W.ul[3] += LSIGMA1( W.ul[1] ); |
1102 | | |
1103 | | FROUND( 0, round ); |
1104 | | FROUND( 1, round ); |
1105 | | FROUND( 2, round ); |
1106 | | FROUND( 3, round ); |
1107 | | |
1108 | | Tmp = _mm_add_epi32( _mm_add_epi32( |
1109 | | LSIGMA0XMM(_mm_loadu_si128( (__m128i *)&W.ul[5] )), |
1110 | | _mm_load_si128( (__m128i *)&W.ul[4] ) ), |
1111 | | _mm_alignr_epi8( _mm_load_si128( (__m128i *)&W.ul[0] ), _mm_load_si128( (__m128i *)&W.ul[12] ), 4) ); |
1112 | | |
1113 | | _mm_store_si128( (__m128i *)&W.ul[4], Tmp ); |
1114 | | |
1115 | | W.ul[4] += LSIGMA1( W.ul[2] ); |
1116 | | W.ul[5] += LSIGMA1( W.ul[3] ); |
1117 | | W.ul[6] += LSIGMA1( W.ul[4] ); |
1118 | | W.ul[7] += LSIGMA1( W.ul[5] ); |
1119 | | |
1120 | | FROUND( 4, round ); |
1121 | | FROUND( 5, round ); |
1122 | | FROUND( 6, round ); |
1123 | | FROUND( 7, round ); |
1124 | | |
1125 | | Tmp = _mm_add_epi32( _mm_add_epi32( |
1126 | | LSIGMA0XMM(_mm_loadu_si128( (__m128i *)&W.ul[9] )), |
1127 | | _mm_load_si128( (__m128i *)&W.ul[8] ) ), |
1128 | | _mm_loadu_si128( (__m128i *)&W.ul[1] ) ); |
1129 | | |
1130 | | _mm_store_si128( (__m128i *)&W.ul[8], Tmp ); |
1131 | | W.ul[ 8] += LSIGMA1( W.ul[6] ); |
1132 | | W.ul[ 9] += LSIGMA1( W.ul[7] ); |
1133 | | W.ul[10] += LSIGMA1( W.ul[8] ); |
1134 | | W.ul[11] += LSIGMA1( W.ul[9] ); |
1135 | | |
1136 | | FROUND( 8, round ); |
1137 | | FROUND( 9, round ); |
1138 | | FROUND( 10, round ); |
1139 | | FROUND( 11, round ); |
1140 | | |
1141 | | |
1142 | | Tmp = _mm_add_epi32( _mm_add_epi32( |
1143 | | LSIGMA0XMM( _mm_alignr_epi8( _mm_load_si128( (__m128i *)&W.ul[0] ), _mm_load_si128( (__m128i *)&W.ul[12] ), 4) ), |
1144 | | _mm_load_si128( (__m128i *)&W.ul[12] ) ), |
1145 | | _mm_loadu_si128( (__m128i *)&W.ul[5] ) ); |
1146 | | |
1147 | | _mm_store_si128( (__m128i *)&W.ul[12], Tmp ); |
1148 | | W.ul[12] += LSIGMA1( W.ul[10] ); |
1149 | | W.ul[13] += LSIGMA1( W.ul[11] ); |
1150 | | W.ul[14] += LSIGMA1( W.ul[12] ); |
1151 | | W.ul[15] += LSIGMA1( W.ul[13] ); |
1152 | | |
1153 | | FROUND( 12, round ); |
1154 | | FROUND( 13, round ); |
1155 | | FROUND( 14, round ); |
1156 | | FROUND( 15, round ); |
1157 | | } |
1158 | | |
1159 | | pChain->H[0] = ah[7] = ah[7] + pChain->H[0]; |
1160 | | pChain->H[1] = ah[6] = ah[6] + pChain->H[1]; |
1161 | | pChain->H[2] = ah[5] = ah[5] + pChain->H[2]; |
1162 | | pChain->H[3] = ah[4] = ah[4] + pChain->H[3]; |
1163 | | pChain->H[4] = ah[3] = ah[3] + pChain->H[4]; |
1164 | | pChain->H[5] = ah[2] = ah[2] + pChain->H[5]; |
1165 | | pChain->H[6] = ah[1] = ah[1] + pChain->H[6]; |
1166 | | pChain->H[7] = ah[0] = ah[0] + pChain->H[7]; |
1167 | | |
1168 | | pbData += 64; |
1169 | | cbData -= 64; |
1170 | | |
1171 | | } |
1172 | | |
1173 | | *pcbRemaining = cbData; |
1174 | | |
1175 | | // |
1176 | | // Wipe the variables. |
1177 | | // |
1178 | | SymCryptWipeKnownSize( ah, sizeof( ah ) ); |
1179 | | SymCryptWipeKnownSize( &W, sizeof( W ) ); |
1180 | | SYMCRYPT_FORCE_WRITE32( &Wt, 0 ); |
1181 | | |
1182 | | #undef IROUND |
1183 | | #undef FROUND |
1184 | | #undef CROUND |
1185 | | } |
1186 | | |
1187 | | #endif |
1188 | | |
1189 | | // |
1190 | | // SHA-NI Implementation |
1191 | | // |
1192 | | |
1193 | | #if SYMCRYPT_MS_VC |
1194 | | // Intrinsic definitions included here |
1195 | | // until the header is updated. |
1196 | | // ******************************* |
1197 | | // ******************************* |
1198 | | // ******************************* |
1199 | | extern __m128i _mm_sha256rnds2_epu32(__m128i, __m128i, __m128i); |
1200 | | extern __m128i _mm_sha256msg1_epu32(__m128i, __m128i); |
1201 | | extern __m128i _mm_sha256msg2_epu32(__m128i, __m128i); |
1202 | | // ******************************* |
1203 | | // ******************************* |
1204 | | // ******************************* |
1205 | | #endif |
1206 | | |
1207 | | // For the SHA-NI implementation we will utilize 128-bit XMM registers. Each |
1208 | | // XMM state will be denoted as (R_3, R_2, R_1, R_0), where each R_i |
1209 | | // is a 32-bit word and R_i refers to bits [32*i : (32*i + 31)] of the |
1210 | | // 128-bit XMM state. |
1211 | | // |
1212 | | // The following macro updates the state variables A,B,C,...,H of the SHA algorithms |
1213 | | // for 4 rounds using: |
1214 | | // - The current round number t with 0<=t<= 63 and t a multiple of 4. |
1215 | | // - A current message XMM state _MSG which consists of 4 32-bit words |
1216 | | // ( W_(t+3), W_(t+2), W_(t+1), W_(t+0) ). |
1217 | | // - Two XMM states _ABEF and _CDGH which contain the variables |
1218 | | // ( A, B, E, F ) and ( C, D, G, H ) respectively. |
1219 | | |
1220 | | #define SHANI_UPDATE_STATE( _round, _MSG, _ABEF, _CDGH ) \ |
1221 | 0 | _MSG = _mm_add_epi32( _MSG, *(__m128i *)&SymCryptSha256K[_round] ); /* Add the K_t constants to the W_t's */ \ |
1222 | 0 | _CDGH = _mm_sha256rnds2_epu32( _CDGH, _ABEF, _MSG ); /* 2 rounds using SHA-NI */ \ |
1223 | 0 | _MSG = _mm_shuffle_epi32( _MSG, 0x0e ); /* Move words 2 & 3 to positions 0 & 1 */ \ |
1224 | 0 | _ABEF = _mm_sha256rnds2_epu32( _ABEF, _CDGH, _MSG ); /* 2 rounds using SHA-NI */ |
1225 | | |
1226 | | // For the SHA message schedule (i.e. to create words W_16 to W_63) we use 4 XMM states / accumulators. |
1227 | | // Each accumulator holds 4 words. |
1228 | | // |
1229 | | // The final result for each word will be of the form W_t = X_t + Y_t, where |
1230 | | // X_t = W_(t-16) + \sigma_0(W_(t-15)) and |
1231 | | // Y_t = W_(t- 7) + \sigma_1(W_(t- 2)) |
1232 | | // |
1233 | | // The X_t's are calculated by the _mm_sha256msg1_epu32 intrinsic. |
1234 | | // The \sigma_1(W_(t-2)) part of the Y_t's by the _mm_sha256msg2_epu32 intrinsic. |
1235 | | // |
1236 | | // Remarks: |
1237 | | // - Calculation of the first four X_t's (i.e. 16<=t<=19) can start from round 4 (since 19-15 = 4). |
1238 | | // - Calculation of the first four Y_t's can start from round 12 (since 19-7=12 and W_(19-7) is calculated |
1239 | | // in the intrinsic call). |
1240 | | // - Due to the W_(t-7) term, producing the Y_t's needs special shifting via the _mm_alignr_epi8 intrinsic and |
1241 | | // adding the correct accumulator into the temporary variable MTEMP. |
1242 | | // |
1243 | | // For rounds 16 - 51 we execute the following macro in a loop. For all the other rounds we |
1244 | | // use specific code. |
1245 | | // |
1246 | | // The loop invariant to be satisfied at the beginning of iteration i (corresponding to rounds |
1247 | | // (16+4*i) to (19+4*i) ) is the following: |
1248 | | // _MSG_0 = ( W_(19 + 4*i), W_(18 + 4*i), W_(17 + 4*i), W_(16 + 4*i) ) |
1249 | | // _MSG_1 = ( X_(23 + 4*i), X_(22 + 4*i), X_(21 + 4*i), X_(20 + 4*i) ) |
1250 | | // _MSG_2 = ( X_(27 + 4*i), X_(26 + 4*i), X_(25 + 4*i), X_(24 + 4*i) ) |
1251 | | // _MSG_3 = ( W_(15 + 4*i), W_(14 + 4*i), W_(13 + 4*i), W_(12 + 4*i) ) |
1252 | | // |
1253 | | #define SHANI_MESSAGE_SCHEDULE( _MSG_0, _MSG_1, _MSG_2, _MSG_3, _MTEMP ) \ |
1254 | 0 | _MTEMP = _mm_alignr_epi8( _MSG_0, _MSG_3, 4); /* _MTEMP := ( W_(16 + 4*i), W_(15 + 4*i), W_(14 + 4*i), W_(13 + 4*i) ) */ \ |
1255 | 0 | _MSG_1 = _mm_add_epi32( _MSG_1, _MTEMP); /* _MSG_1 := _MSG_1 + ( W_(16 + 4*i), W_(15 + 4*i), W_(14 + 4*i), W_(13 + 4*i) ) */ \ |
1256 | 0 | _MSG_1 = _mm_sha256msg2_epu32( _MSG_1, _MSG_0 ); /* _MSG_1 := ( W_(23 + 4*i), W_(22 + 4*i), W_(21 + 4*i), W_(20 + 4*i) ) */ \ |
1257 | 0 | _MSG_3 = _mm_sha256msg1_epu32( _MSG_3, _MSG_0 ); /* _MSG_3 := ( X_(31+4*i), X_(30+4*i), X_(29+4*i), X_(28+4*i) ) */ |
1258 | | // |
1259 | | // After each iteration the subsequent call rotates the accumulators so that the loop |
1260 | | // invariant is preserved (please verify!): |
1261 | | // -- MSG_0 <---- MSG_1 <--- MSG_2 <--- MSG_3 <-- |
1262 | | // | | |
1263 | | // ---------------------------------------------- |
1264 | | |
1265 | | VOID |
1266 | | SYMCRYPT_CALL |
1267 | | SymCryptSha256AppendBlocks_shani( |
1268 | | _Inout_ SYMCRYPT_SHA256_CHAINING_STATE * pChain, |
1269 | | _In_reads_( cbData ) PCBYTE pbData, |
1270 | | SIZE_T cbData, |
1271 | | _Out_ SIZE_T * pcbRemaining ) |
1272 | 0 | { |
1273 | 0 | const __m128i BYTE_REVERSE_32 = _mm_set_epi8( 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 ); |
1274 | | |
1275 | | // Our chain state is in order A, B, ..., H. |
1276 | | // First load our chaining state |
1277 | 0 | __m128i DCBA = _mm_loadu_si128( (__m128i *)&(pChain->H[0]) ); // (D, C, B, A) |
1278 | 0 | __m128i HGFE = _mm_loadu_si128( (__m128i *)&(pChain->H[4]) ); // (H, G, F, E) |
1279 | 0 | __m128i FEBA = _mm_unpacklo_epi64( DCBA, HGFE ); // (F, E, B, A) |
1280 | 0 | __m128i HGDC = _mm_unpackhi_epi64( DCBA, HGFE ); // (H, G, D, C) |
1281 | 0 | __m128i ABEF = _mm_shuffle_epi32( FEBA, 0x1b ); // (A, B, E, F) |
1282 | 0 | __m128i CDGH = _mm_shuffle_epi32( HGDC, 0x1b ); // (C, D, G, H) |
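// (The unpack/shuffle pairs above convert the linear H[0..7] = A..H layout
// into the (A,B,E,F)/(C,D,G,H) pairing that sha256rnds2 consumes; the mirror
// sequence at the end of this function converts back.)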
1283 | |
1284 | 0 | while( cbData >= 64 ) |
1285 | 0 | { |
1286 | | // Save the current state for the feed-forward later |
1287 | 0 | __m128i ABEF_start = ABEF; |
1288 | 0 | __m128i CDGH_start = CDGH; |
1289 | | |
1290 | | // Current message and temporary state |
1291 | 0 | __m128i MSG; |
1292 | | |
1293 | | // Accumulators |
1294 | 0 | __m128i MSG_0; |
1295 | 0 | __m128i MSG_1; |
1296 | 0 | __m128i MSG_2; |
1297 | 0 | __m128i MSG_3; |
1298 | | |
1299 | | // Rounds 0-3 |
1300 | 0 | MSG = _mm_loadu_si128( (__m128i *)pbData ); // Reversed word - ( M_3, M_2, M_1, M_0 ) |
1301 | 0 | pbData += 16; |
1302 | 0 | MSG = _mm_shuffle_epi8( MSG, BYTE_REVERSE_32 ); // Reverse each word |
1303 | 0 | MSG_0 = MSG; // MSG_0 := ( W_3 = M3, W_2 = M_2, W_1 = M_1, W_0 = M_0 ) |
1304 | |
1305 | 0 | SHANI_UPDATE_STATE( 0, MSG, ABEF, CDGH ); |
1306 | | |
1307 | | // Rounds 4-7 |
1308 | 0 | MSG = _mm_loadu_si128( (__m128i *)pbData ); // Reversed word - ( M_7, M_6, M_5, M_4 ) |
1309 | 0 | pbData += 16; |
1310 | 0 | MSG = _mm_shuffle_epi8( MSG, BYTE_REVERSE_32 ); // Reverse each word |
1311 | 0 | MSG_1 = MSG; // MSG_1 := ( W_7 = M_7, W_6 = M_6, W_5 = M_5, W_4 = M_4 ) |
1312 | |
1313 | 0 | SHANI_UPDATE_STATE( 4, MSG, ABEF, CDGH ); |
1314 | |
1315 | 0 | MSG_0 = _mm_sha256msg1_epu32( MSG_0, MSG_1 ); // MSG_0 := ( X_19, X_18, X_17, X_16 ) = |
1316 | | // ( W_3 + \sigma_0(W_4), ..., W_0 + \sigma_0(W_1) ) |
1317 | | |
1318 | | // Rounds 8-11 |
1319 | 0 | MSG = _mm_loadu_si128( (__m128i *)pbData ); // Reversed word - ( M_11, M_10, M_9, M_8 ) |
1320 | 0 | pbData += 16; |
1321 | 0 | MSG = _mm_shuffle_epi8( MSG, BYTE_REVERSE_32 ); // Reverse each word |
1322 | 0 | MSG_2 = MSG; // MSG_2 := ( W_11 = M_11, W_10 = M_10, W_9 = M_9, W_8 = M_8 ) |
1323 | |
1324 | 0 | SHANI_UPDATE_STATE( 8, MSG, ABEF, CDGH ); |
1325 | |
1326 | 0 | MSG_1 = _mm_sha256msg1_epu32( MSG_1, MSG_2 ); // MSG_1 := ( X_23, X_22, X_21, X_20 ) |
1327 | | |
1328 | | // Rounds 12-15 |
1329 | 0 | MSG = _mm_loadu_si128( (__m128i *)pbData ); // Reversed word - ( M_15, M_14, M_13, M_12 ) |
1330 | 0 | pbData += 16; |
1331 | 0 | MSG = _mm_shuffle_epi8( MSG, BYTE_REVERSE_32 ); // Reverse each word |
1332 | 0 | MSG_3 = MSG; // MSG_3 := ( W_15 = M_15, W_14 = M_14, W_13 = M_13, W_12 = M_12 ) |
1333 | |
|
1334 | 0 | SHANI_UPDATE_STATE( 12, MSG, ABEF, CDGH ); |
1335 | |
|
1336 | 0 | MSG = _mm_alignr_epi8( MSG_3, MSG_2, 4); // MSG := ( W_12, W_11, W_10, W_9 ) |
1337 | 0 | MSG_0 = _mm_add_epi32( MSG_0, MSG); // MSG_0 := MSG_0 + ( W_12, W_11, W_10, W_9 ) |
1338 | 0 | MSG_0 = _mm_sha256msg2_epu32( MSG_0, MSG_3 ); // MSG_0 := ( W_19, W_18, W_17, W_16 ) = |
1339 | | // ( X_19 + W_12 + \sigma_1(W_17)], ..., X_16 + W_9 + \sigma_1(W_14)] ) |
1340 | |
1341 | 0 | MSG_2 = _mm_sha256msg1_epu32( MSG_2, MSG_3 ); // MSG_2 := ( X_27, X_26, X_25, X_24 ) |
1342 | | |
1343 | | |
1344 | | // Rounds 16 - 19 |
1345 | 0 | MSG = MSG_0; |
1346 | 0 | SHANI_UPDATE_STATE( 16, MSG, ABEF, CDGH ); |
1347 | 0 | SHANI_MESSAGE_SCHEDULE( MSG_0, MSG_1, MSG_2, MSG_3, MSG ); |
1348 | | |
1349 | | // Rounds 20 - 23 |
1350 | 0 | MSG = MSG_1; |
1351 | 0 | SHANI_UPDATE_STATE( 20, MSG, ABEF, CDGH ); |
1352 | 0 | SHANI_MESSAGE_SCHEDULE( MSG_1, MSG_2, MSG_3, MSG_0, MSG ); |
1353 | | |
1354 | | // Rounds 24 - 27 |
1355 | 0 | MSG = MSG_2; |
1356 | 0 | SHANI_UPDATE_STATE( 24, MSG, ABEF, CDGH ); |
1357 | 0 | SHANI_MESSAGE_SCHEDULE( MSG_2, MSG_3, MSG_0, MSG_1, MSG ); |
1358 | | |
1359 | | // Rounds 28 - 31 |
1360 | 0 | MSG = MSG_3; |
1361 | 0 | SHANI_UPDATE_STATE( 28, MSG, ABEF, CDGH ); |
1362 | 0 | SHANI_MESSAGE_SCHEDULE( MSG_3, MSG_0, MSG_1, MSG_2, MSG ); |
1363 | | |
1364 | | // Rounds 32 - 35 |
1365 | 0 | MSG = MSG_0; |
1366 | 0 | SHANI_UPDATE_STATE( 32, MSG, ABEF, CDGH ); |
1367 | 0 | SHANI_MESSAGE_SCHEDULE( MSG_0, MSG_1, MSG_2, MSG_3, MSG ); |
1368 | | |
1369 | | // Rounds 36 - 39 |
1370 | 0 | MSG = MSG_1; |
1371 | 0 | SHANI_UPDATE_STATE( 36, MSG, ABEF, CDGH ); |
1372 | 0 | SHANI_MESSAGE_SCHEDULE( MSG_1, MSG_2, MSG_3, MSG_0, MSG ); |
1373 | | |
1374 | | // Rounds 40 - 43 |
1375 | 0 | MSG = MSG_2; |
1376 | 0 | SHANI_UPDATE_STATE( 40, MSG, ABEF, CDGH ); |
1377 | 0 | SHANI_MESSAGE_SCHEDULE( MSG_2, MSG_3, MSG_0, MSG_1, MSG ); |
1378 | | |
1379 | | // Rounds 44 - 47 |
1380 | 0 | MSG = MSG_3; |
1381 | 0 | SHANI_UPDATE_STATE( 44, MSG, ABEF, CDGH ); |
1382 | 0 | SHANI_MESSAGE_SCHEDULE( MSG_3, MSG_0, MSG_1, MSG_2, MSG ); |
1383 | | |
1384 | | // Rounds 48 - 51 |
1385 | 0 | MSG = MSG_0; |
1386 | 0 | SHANI_UPDATE_STATE( 48, MSG, ABEF, CDGH ); |
1387 | 0 | SHANI_MESSAGE_SCHEDULE( MSG_0, MSG_1, MSG_2, MSG_3, MSG ); |
1388 | | |
1389 | | // Rounds 52 - 55 |
1390 | 0 | MSG = MSG_1; // ( W_55, W_54, W_53, W_52 ) |
1391 | 0 | SHANI_UPDATE_STATE( 52, MSG, ABEF, CDGH ); |
1392 | |
1393 | 0 | MSG = _mm_alignr_epi8( MSG_1, MSG_0, 4); // MSG := ( W_52, W_51, W_50, W_49 ) |
1394 | 0 | MSG_2 = _mm_add_epi32( MSG_2, MSG); // MSG_2 := MSG_2 + ( W_52, W_51, W_50, W_49 ) |
1395 | 0 | MSG_2 = _mm_sha256msg2_epu32( MSG_2, MSG_1 ); // Calculate ( W_59, W_58, W_57, W_56 ) |
1396 | | |
1397 | | // Rounds 56 - 59 |
1398 | 0 | MSG = MSG_2; // ( W_59, W_58, W_57, W_56 ) |
1399 | 0 | SHANI_UPDATE_STATE( 56, MSG, ABEF, CDGH ); |
1400 | |
1401 | 0 | MSG = _mm_alignr_epi8( MSG_2, MSG_1, 4); // MSG := ( W_56, W_55, W_54, W_53 ) |
1402 | 0 | MSG_3 = _mm_add_epi32( MSG_3, MSG); // MSG_3 := MSG_3 + ( W_56, W_55, W_54, W_53 ) |
1403 | 0 | MSG_3 = _mm_sha256msg2_epu32( MSG_3, MSG_2 ); // Calculate ( W_63, W_62, W_61, W_60 ) |
1404 | | |
1405 | | // Rounds 60 - 63 |
1406 | 0 | SHANI_UPDATE_STATE( 60, MSG_3, ABEF, CDGH ); |
1407 | | |
1408 | | // Add the feed-forward |
1409 | 0 | ABEF = _mm_add_epi32( ABEF, ABEF_start ); |
1410 | 0 | CDGH = _mm_add_epi32( CDGH, CDGH_start ); |
1411 | |
1412 | 0 | cbData -= 64; |
1413 | 0 | } |
1414 | | |
1415 | | // Unpack the state registers and store them in the state |
1416 | 0 | FEBA = _mm_shuffle_epi32( ABEF, 0x1b ); |
1417 | 0 | HGDC = _mm_shuffle_epi32( CDGH, 0x1b ); |
1418 | 0 | DCBA = _mm_unpacklo_epi64( FEBA, HGDC ); // (D, C, B, A) |
1419 | 0 | HGFE = _mm_unpackhi_epi64( FEBA, HGDC ); // (H, G, F, E) |
1420 | 0 | _mm_storeu_si128 ( (__m128i *)&(pChain->H[0]), DCBA); // (D, C, B, A) |
1421 | 0 | _mm_storeu_si128 ( (__m128i *)&(pChain->H[4]), HGFE); // (H, G, F, E) |
1422 | |
1423 | 0 | *pcbRemaining = cbData; |
1424 | 0 | } |
1425 | | |
1426 | | #undef SHANI_UPDATE_STATE |
1427 | | #undef SHANI_MESSAGE_SCHEDULE |
1428 | | |
1429 | | #endif // SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 |
1430 | | |
1431 | | #if SYMCRYPT_CPU_ARM64 |
1432 | | /* |
1433 | | ARM64 has special SHA-256 instructions |
1434 | | |
1435 | | SHA256H and SHA256H2 implement 4 rounds of SHA-256. The inputs are two registers containing the 256-bit state, |
1436 | | and one register containing 128 bits of expanded message plus the round constants. |
1437 | | These instructions perform the same computation, but SHA256H returns the first half of the 256-bit result, |
1438 | | and SHA256H2 returns the second half of the 256-bit result. |
1439 | | |
1440 | | SHA256H( ABCD, EFGH, W ) |
1441 | | Where the least significant word of the ABCD vector is A. The W vector contains W_i + K_i for the four rounds being computed. |
1442 | | |
1443 | | SHA256SU0 is the message schedule update function. |
1444 | | It takes 2 inputs and produces 1 output. |
1445 | | We describe the vectors for i=0,1,2,3 |
1446 | | Inputs: [W_{t-16+i}], [W_{t-12+i}] |
1447 | | Output: [Sigma0(W_{t-15+i}) + W_{t-16+i}] |
1448 | | |
1449 | | SHA256SU1 is the second message schedule update function |
1450 | | Takes 3 inputs and produces 1 output |
1451 | | Input 1: Output of SHA256SU0: [Sigma0(W_{t-15+i}) + W_{t-16+i}] |
1452 | | Input 2: [W_{t-8+i}] |
1453 | | Input 3: [W_{t-4+i}]; Output: [W_{t+i}] |
1454 | | |
1455 | | */ |
1456 | | |
1457 | | #define vldq(_p) (*(__n128 *)(_p)) |
1458 | | #define vstq(_p, _v) (*(__n128 *)(_p) = (_v) ) |
1459 | | |
1460 | | VOID |
1461 | | SYMCRYPT_CALL |
1462 | | SymCryptSha256AppendBlocks_instr( |
1463 | | _Inout_ SYMCRYPT_SHA256_CHAINING_STATE * pChain, |
1464 | | _In_reads_( cbData ) PCBYTE pbData, |
1465 | | SIZE_T cbData, |
1466 | | _Out_ SIZE_T * pcbRemaining ) |
1467 | | { |
1468 | | // |
1469 | | // Armv8 has 32 Neon registers. We can use a lot of variables. |
1470 | | // 16 for the constants, 4 for the message, 2 for the current state, 2 for the starting state, |
1471 | | // total = 24, which leaves enough room for some temporary values
1472 | | // |
1473 | | __n128 ABCD, ABCDstart; |
1474 | | __n128 EFGH, EFGHstart; |
1475 | | __n128 W0, W1, W2, W3; |
1476 | | __n128 K0, K1, K2, K3, K4, K5, K6, K7, K8, K9, K10, K11, K12, K13, K14, K15; |
1477 | | |
1478 | | __n128 Wr; |
1479 | | __n128 t; |
1480 | | |
1481 | | ABCD = ABCDstart = vldq( &pChain->H[0] ); |
1482 | | EFGH = EFGHstart = vldq( &pChain->H[4] ); |
1483 | | |
1484 | | K0 = vldq( &SymCryptSha256K[ 4 * 0 ] ); |
1485 | | K1 = vldq( &SymCryptSha256K[ 4 * 1 ] ); |
1486 | | K2 = vldq( &SymCryptSha256K[ 4 * 2 ] ); |
1487 | | K3 = vldq( &SymCryptSha256K[ 4 * 3 ] ); |
1488 | | K4 = vldq( &SymCryptSha256K[ 4 * 4 ] ); |
1489 | | K5 = vldq( &SymCryptSha256K[ 4 * 5 ] ); |
1490 | | K6 = vldq( &SymCryptSha256K[ 4 * 6 ] ); |
1491 | | K7 = vldq( &SymCryptSha256K[ 4 * 7 ] ); |
1492 | | K8 = vldq( &SymCryptSha256K[ 4 * 8 ] ); |
1493 | | K9 = vldq( &SymCryptSha256K[ 4 * 9 ] ); |
1494 | | K10 = vldq( &SymCryptSha256K[ 4 * 10 ] ); |
1495 | | K11 = vldq( &SymCryptSha256K[ 4 * 11 ] ); |
1496 | | K12 = vldq( &SymCryptSha256K[ 4 * 12 ] ); |
1497 | | K13 = vldq( &SymCryptSha256K[ 4 * 13 ] ); |
1498 | | K14 = vldq( &SymCryptSha256K[ 4 * 14 ] ); |
1499 | | K15 = vldq( &SymCryptSha256K[ 4 * 15 ] ); |
1500 | | |
1501 | | while( cbData >= 64 ) |
1502 | | { |
1503 | | W0 = vrev32q_u8( vldq( &pbData[ 0] ) ); |
1504 | | W1 = vrev32q_u8( vldq( &pbData[16] ) ); |
1505 | | W2 = vrev32q_u8( vldq( &pbData[32] ) ); |
1506 | | W3 = vrev32q_u8( vldq( &pbData[48] ) ); |
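| | // SHA-256 treats the message as big-endian 32-bit words; vrev32q_u8
| | // byte-swaps each word after the little-endian vector loads above.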
1507 | | |
1508 | | // |
1509 | | // The sha256h/sha256h2 instructions overwrite one of the two state input registers. |
1510 | | // This means we have to make a copy of one of the input states.
1511 | | // |
1512 | | #define ROUNDOP {\ |
1513 | | t = ABCD;\ |
1514 | | ABCD = vsha256hq_u32 ( ABCD, EFGH, Wr );\ |
1515 | | EFGH = vsha256h2q_u32( EFGH, t, Wr );\ |
1516 | | } |
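| | // Each ROUNDOP consumes one Wr vector of four W_t + K_t values and advances
| | // the state by four rounds; the sixteen invocations below cover rounds 0-63.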
1517 | | |
1518 | | Wr = vaddq_u32( W0, K0 ); |
1519 | | ROUNDOP; |
1520 | | Wr = vaddq_u32( W1, K1 ); |
1521 | | ROUNDOP; |
1522 | | Wr = vaddq_u32( W2, K2 ); |
1523 | | ROUNDOP; |
1524 | | Wr = vaddq_u32( W3, K3 ); |
1525 | | ROUNDOP; |
1526 | | |
1527 | | t = vsha256su0q_u32( W0, W1 ); |
1528 | | W0 = vsha256su1q_u32( t, W2, W3 ); |
1529 | | Wr = vaddq_u32( W0, K4 ); |
1530 | | ROUNDOP; |
1531 | | |
1532 | | t = vsha256su0q_u32( W1, W2 ); |
1533 | | W1 = vsha256su1q_u32( t, W3, W0 ); |
1534 | | Wr = vaddq_u32( W1, K5 ); |
1535 | | ROUNDOP; |
1536 | | |
1537 | | t = vsha256su0q_u32( W2, W3 ); |
1538 | | W2 = vsha256su1q_u32( t, W0, W1 ); |
1539 | | Wr = vaddq_u32( W2, K6 ); |
1540 | | ROUNDOP; |
1541 | | |
1542 | | t = vsha256su0q_u32( W3, W0 ); |
1543 | | W3 = vsha256su1q_u32( t, W1, W2 ); |
1544 | | Wr = vaddq_u32( W3, K7 ); |
1545 | | ROUNDOP; |
1546 | | |
1547 | | |
1548 | | t = vsha256su0q_u32( W0, W1 ); |
1549 | | W0 = vsha256su1q_u32( t, W2, W3 ); |
1550 | | Wr = vaddq_u32( W0, K8 ); |
1551 | | ROUNDOP; |
1552 | | |
1553 | | t = vsha256su0q_u32( W1, W2 ); |
1554 | | W1 = vsha256su1q_u32( t, W3, W0 ); |
1555 | | Wr = vaddq_u32( W1, K9 ); |
1556 | | ROUNDOP; |
1557 | | |
1558 | | t = vsha256su0q_u32( W2, W3 ); |
1559 | | W2 = vsha256su1q_u32( t, W0, W1 ); |
1560 | | Wr = vaddq_u32( W2, K10 ); |
1561 | | ROUNDOP; |
1562 | | |
1563 | | t = vsha256su0q_u32( W3, W0 ); |
1564 | | W3 = vsha256su1q_u32( t, W1, W2 ); |
1565 | | Wr = vaddq_u32( W3, K11 ); |
1566 | | ROUNDOP; |
1567 | | |
1568 | | |
1569 | | t = vsha256su0q_u32( W0, W1 ); |
1570 | | W0 = vsha256su1q_u32( t, W2, W3 ); |
1571 | | Wr = vaddq_u32( W0, K12 ); |
1572 | | ROUNDOP; |
1573 | | |
1574 | | t = vsha256su0q_u32( W1, W2 ); |
1575 | | W1 = vsha256su1q_u32( t, W3, W0 ); |
1576 | | Wr = vaddq_u32( W1, K13 ); |
1577 | | ROUNDOP; |
1578 | | |
1579 | | t = vsha256su0q_u32( W2, W3 ); |
1580 | | W2 = vsha256su1q_u32( t, W0, W1 ); |
1581 | | Wr = vaddq_u32( W2, K14 ); |
1582 | | ROUNDOP; |
1583 | | |
1584 | | t = vsha256su0q_u32( W3, W0 ); |
1585 | | W3 = vsha256su1q_u32( t, W1, W2 ); |
1586 | | Wr = vaddq_u32( W3, K15 ); |
1587 | | ROUNDOP; |
1588 | | |
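| | // Add the feed-forward; the sum also becomes the starting state for the
| | // next block, so ABCDstart/EFGHstart are updated in the same step.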
1589 | | ABCDstart = ABCD = vaddq_u32( ABCDstart, ABCD ); |
1590 | | EFGHstart = EFGH = vaddq_u32( EFGHstart, EFGH ); |
1591 | | |
1592 | | pbData += 64; |
1593 | | cbData -= 64; |
1594 | | #undef ROUNDOP |
1595 | | |
1596 | | } |
1597 | | |
1598 | | *pcbRemaining = cbData; |
1599 | | vstq( &pChain->H[0], ABCD ); |
1600 | | vstq( &pChain->H[4], EFGH ); |
1601 | | |
1602 | | // |
1603 | | // All our local variables should be in registers, so there is no way to wipe them.
1604 | | // |
1605 | | } |
1606 | | |
1607 | | #endif |
1608 | | |
1609 | | |
1610 | | |
1611 | | // |
1612 | | // Easy switch between different implementations |
1613 | | // |
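| | // Each vectorized kernel is gated on a CPU-feature check and, on X86/AMD64,
| | // on a successful SymCryptSaveXmm(); if the save fails we fall back to the
| | // portable _ul1 implementation.
| | //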
1614 | | //FORCEINLINE |
1615 | | VOID |
1616 | | SYMCRYPT_CALL |
1617 | | SymCryptSha256AppendBlocks( |
1618 | | _Inout_ SYMCRYPT_SHA256_CHAINING_STATE* pChain, |
1619 | | _In_reads_(cbData) PCBYTE pbData, |
1620 | | SIZE_T cbData, |
1621 | | _Out_ SIZE_T* pcbRemaining) |
1622 | 21.4k | { |
1623 | 21.4k | #if SYMCRYPT_CPU_AMD64 |
1624 | | |
1625 | 21.4k | SYMCRYPT_EXTENDED_SAVE_DATA SaveData; |
1626 | | |
1627 | 21.4k | if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURES_FOR_SHANI_CODE) && |
1628 | 21.4k | SymCryptSaveXmm(&SaveData) == SYMCRYPT_NO_ERROR) |
1629 | 0 | { |
1630 | 0 | SymCryptSha256AppendBlocks_shani(pChain, pbData, cbData, pcbRemaining); |
1631 | |
1632 | 0 | SymCryptRestoreXmm(&SaveData); |
1633 | 0 | } |
1634 | | // Temporarily disabling use of Ymm in SHA2 |
1635 | | // else if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURE_AVX2 | SYMCRYPT_CPU_FEATURE_BMI2) && |
1636 | | // SymCryptSaveYmm(&SaveData) == SYMCRYPT_NO_ERROR) |
1637 | | // { |
1638 | | // //SymCryptSha256AppendBlocks_ul1(pChain, pbData, cbData, pcbRemaining); |
1639 | | // //SymCryptSha256AppendBlocks_ymm_8blocks(pChain, pbData, cbData, pcbRemaining); |
1640 | | // SymCryptSha256AppendBlocks_ymm_avx2_asm(pChain, pbData, cbData, pcbRemaining); |
1641 | | |
1642 | | // SymCryptRestoreYmm(&SaveData); |
1643 | | // } |
1644 | 21.4k | else if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURE_SSSE3 | SYMCRYPT_CPU_FEATURE_BMI2) && |
1645 | 21.4k | SymCryptSaveXmm(&SaveData) == SYMCRYPT_NO_ERROR) |
1646 | 0 | { |
1647 | | //SymCryptSha256AppendBlocks_xmm_4blocks(pChain, pbData, cbData, pcbRemaining); |
1648 | 0 | SymCryptSha256AppendBlocks_xmm_ssse3_asm(pChain, pbData, cbData, pcbRemaining); |
1649 | | |
1650 | 0 | SymCryptRestoreXmm(&SaveData); |
1651 | 0 | } |
1652 | 21.4k | else |
1653 | 21.4k | { |
1654 | 21.4k | SymCryptSha256AppendBlocks_ul1( pChain, pbData, cbData, pcbRemaining ); |
1655 | | //SymCryptSha256AppendBlocks_ul2(pChain, pbData, cbData, pcbRemaining); |
1656 | 21.4k | } |
1657 | | #elif SYMCRYPT_CPU_X86 |
1658 | | SYMCRYPT_EXTENDED_SAVE_DATA SaveData; |
1659 | | |
1660 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_SHANI_CODE | SYMCRYPT_CPU_FEATURE_SAVEXMM_NOFAIL ) && |
1661 | | SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR ) |
1662 | | { |
1663 | | SymCryptSha256AppendBlocks_shani( pChain, pbData, cbData, pcbRemaining ); |
1664 | | SymCryptRestoreXmm( &SaveData ); |
1665 | | } |
1666 | | else if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURE_SSSE3 | SYMCRYPT_CPU_FEATURE_BMI2) |
1667 | | && SymCryptSaveXmm(&SaveData) == SYMCRYPT_NO_ERROR) |
1668 | | { |
1669 | | SymCryptSha256AppendBlocks_xmm_4blocks(pChain, pbData, cbData, pcbRemaining); |
1670 | | SymCryptRestoreXmm(&SaveData); |
1671 | | } |
1672 | | else { |
1673 | | SymCryptSha256AppendBlocks_ul1( pChain, pbData, cbData, pcbRemaining ); |
1674 | | } |
1675 | | #elif SYMCRYPT_CPU_ARM64 |
1676 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_SHA256 ) ) |
1677 | | { |
1678 | | SymCryptSha256AppendBlocks_instr( pChain, pbData, cbData, pcbRemaining ); |
1679 | | } else { |
1680 | | SymCryptSha256AppendBlocks_ul1( pChain, pbData, cbData, pcbRemaining ); |
1681 | | } |
1682 | | #else |
1683 | | SymCryptSha256AppendBlocks_ul1( pChain, pbData, cbData, pcbRemaining ); |
1684 | | #endif |
1685 | | |
1686 | | //SymCryptSha256AppendBlocks_ul2( pChain, pbData, cbData, pcbRemaining ); |
1687 | | //SymCryptSha256AppendBlocks_xmm1( pChain, pbData, cbData, pcbRemaining ); !!! Needs Save/restore logic |
1688 | | //SymCryptSha256AppendBlocks_xmm2( pChain, pbData, cbData, pcbRemaining ); |
1689 | 21.4k | } |
1690 | | |
1691 | | |