/src/SymCrypt/lib/sha512.c
Line | Count | Source |
1 | | // |
2 | | // Sha512.c |
3 | | // |
4 | | // Copyright (c) Microsoft Corporation. Licensed under the MIT license. |
5 | | // |
6 | | |
7 | | // |
8 | | // This module contains the routines to implement SHA2-512 from FIPS 180-2 |
9 | | // |
10 | | |
11 | | |
12 | | #include "precomp.h" |
13 | | |
14 | | // |
15 | | // SHA-512 uses 80 magic constants of 64 bits each. These are |
16 | | // referred to as K^{512}_i for i=0...79 by FIPS 180-2. |
17 | | // We use a static array as that does not pollute the linker name space |
18 | | // For performance we align to the cache line size of 64 bytes |
19 | | // We have one extra value at the end to allow an XMM read from each element |
20 | | // of the array. |
21 | | // |
22 | | SYMCRYPT_ALIGN_AT( 64 ) const UINT64 SymCryptSha512K[81] = { |
23 | | 0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, |
24 | | 0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL, |
25 | | 0x3956c25bf348b538UL, 0x59f111f1b605d019UL, |
26 | | 0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL, |
27 | | 0xd807aa98a3030242UL, 0x12835b0145706fbeUL, |
28 | | 0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL, |
29 | | 0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL, |
30 | | 0x9bdc06a725c71235UL, 0xc19bf174cf692694UL, |
31 | | 0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL, |
32 | | 0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL, |
33 | | 0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL, |
34 | | 0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL, |
35 | | 0x983e5152ee66dfabUL, 0xa831c66d2db43210UL, |
36 | | 0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL, |
37 | | 0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL, |
38 | | 0x06ca6351e003826fUL, 0x142929670a0e6e70UL, |
39 | | 0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL, |
40 | | 0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL, |
41 | | 0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL, |
42 | | 0x81c2c92e47edaee6UL, 0x92722c851482353bUL, |
43 | | 0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL, |
44 | | 0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL, |
45 | | 0xd192e819d6ef5218UL, 0xd69906245565a910UL, |
46 | | 0xf40e35855771202aUL, 0x106aa07032bbd1b8UL, |
47 | | 0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL, |
48 | | 0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL, |
49 | | 0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL, |
50 | | 0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL, |
51 | | 0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL, |
52 | | 0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL, |
53 | | 0x90befffa23631e28UL, 0xa4506cebde82bde9UL, |
54 | | 0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL, |
55 | | 0xca273eceea26619cUL, 0xd186b8c721c0c207UL, |
56 | | 0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL, |
57 | | 0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL, |
58 | | 0x113f9804bef90daeUL, 0x1b710b35131c471bUL, |
59 | | 0x28db77f523047d84UL, 0x32caab7b40c72493UL, |
60 | | 0x3c9ebe0a15c9bebcUL, 0x431d67c49c100d4cUL, |
61 | | 0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL, |
62 | | 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL, |
63 | | }; |
64 | | |
65 | | // |
66 | | // Initial states |
67 | | // |
68 | | const UINT64 SymCryptSha512InitialState[8] = { |
69 | | 0x6a09e667f3bcc908UL, |
70 | | 0xbb67ae8584caa73bUL, |
71 | | 0x3c6ef372fe94f82bUL, |
72 | | 0xa54ff53a5f1d36f1UL, |
73 | | 0x510e527fade682d1UL, |
74 | | 0x9b05688c2b3e6c1fUL, |
75 | | 0x1f83d9abfb41bd6bUL, |
76 | | 0x5be0cd19137e2179UL, |
77 | | }; |
78 | | |
79 | | const UINT64 SymCryptSha384InitialState[8] = { |
80 | | 0xcbbb9d5dc1059ed8UL, |
81 | | 0x629a292a367cd507UL, |
82 | | 0x9159015a3070dd17UL, |
83 | | 0x152fecd8f70e5939UL, |
84 | | 0x67332667ffc00b31UL, |
85 | | 0x8eb44a8768581511UL, |
86 | | 0xdb0c2e0d64f98fa7UL, |
87 | | 0x47b5481dbefa4fa4UL, |
88 | | }; |
89 | | |
90 | | |
91 | | // |
92 | | // Todo: this structure pulls in the SHA-384 code anytime someone uses |
93 | | // SHA-512; it should be split into a separate file. |
94 | | // |
95 | | const SYMCRYPT_HASH SymCryptSha384Algorithm_default = { |
96 | | &SymCryptSha384Init, |
97 | | &SymCryptSha384Append, |
98 | | &SymCryptSha384Result, |
99 | | &SymCryptSha512AppendBlocks, |
100 | | &SymCryptSha384StateCopy, |
101 | | sizeof( SYMCRYPT_SHA384_STATE ), |
102 | | SYMCRYPT_SHA384_RESULT_SIZE, |
103 | | SYMCRYPT_SHA384_INPUT_BLOCK_SIZE, |
104 | | SYMCRYPT_FIELD_OFFSET( SYMCRYPT_SHA384_STATE, chain ), |
105 | | SYMCRYPT_FIELD_SIZE( SYMCRYPT_SHA384_STATE, chain ), |
106 | | }; |
107 | | |
108 | | const SYMCRYPT_HASH SymCryptSha512Algorithm_default = { |
109 | | &SymCryptSha512Init, |
110 | | &SymCryptSha512Append, |
111 | | &SymCryptSha512Result, |
112 | | &SymCryptSha512AppendBlocks, |
113 | | &SymCryptSha512StateCopy, |
114 | | sizeof( SYMCRYPT_SHA512_STATE ), |
115 | | SYMCRYPT_SHA512_RESULT_SIZE, |
116 | | SYMCRYPT_SHA512_INPUT_BLOCK_SIZE, |
117 | | SYMCRYPT_FIELD_OFFSET( SYMCRYPT_SHA512_STATE, chain ), |
118 | | SYMCRYPT_FIELD_SIZE( SYMCRYPT_SHA512_STATE, chain ), |
119 | | }; |
120 | | |
121 | | const PCSYMCRYPT_HASH SymCryptSha384Algorithm = &SymCryptSha384Algorithm_default; |
122 | | const PCSYMCRYPT_HASH SymCryptSha512Algorithm = &SymCryptSha512Algorithm_default; |
123 | | |
124 | | // |
125 | | // SymCryptSha384 |
126 | | // |
127 | | #define ALG SHA384 |
128 | | #define Alg Sha384 |
129 | | #include "hash_pattern.c" |
130 | | #undef ALG |
131 | | #undef Alg |
132 | | |
133 | | // |
134 | | // SymCryptSha512 |
135 | | // |
136 | | #define ALG SHA512 |
137 | | #define Alg Sha512 |
138 | | #include "hash_pattern.c" |
139 | | #undef ALG |
140 | | #undef Alg |
141 | | |
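
The ALG/Alg definitions above parameterize hash_pattern.c, which generates the single-call convenience functions (SymCryptSha384, SymCryptSha512) from the Init/Append/Result primitives in this file. hash_pattern.c itself is not shown here; as a rough sketch of the kind of wrapper it expands to for ALG=SHA512 (details such as SAL annotations and state wiping are assumptions, not the authoritative expansion):

    VOID
    SYMCRYPT_CALL
    SymCryptSha512(
        _In_reads_( cbData )                        PCBYTE  pbData,
                                                    SIZE_T  cbData,
        _Out_writes_( SYMCRYPT_SHA512_RESULT_SIZE ) PBYTE   pbResult )
    {
        SYMCRYPT_SHA512_STATE state;

        SymCryptSha512Init( &state );
        SymCryptSha512Append( &state, pbData, cbData );
        SymCryptSha512Result( &state, pbResult );   // Result also wipes and re-initializes the state
    }
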
142 | | |
143 | | SYMCRYPT_NOINLINE |
144 | | VOID |
145 | | SYMCRYPT_CALL |
146 | | SymCryptSha512Init( _Out_ PSYMCRYPT_SHA512_STATE pState ) |
147 | 416 | { |
148 | 416 | SYMCRYPT_SET_MAGIC( pState ); |
149 | | |
150 | 416 | pState->dataLengthH = 0; |
151 | 416 | pState->dataLengthL = 0; |
152 | 416 | pState->bytesInBuffer = 0; |
153 | | |
154 | 416 | memcpy( &pState->chain.H[0], &SymCryptSha512InitialState[0], sizeof( SymCryptSha512InitialState ) ); |
155 | | |
156 | | // |
157 | | // There is no need to initialize the buffer part of the state as that will be |
158 | | // filled before it is used. |
159 | | // |
160 | 416 | } |
161 | | |
162 | | |
163 | | SYMCRYPT_NOINLINE |
164 | | VOID |
165 | | SYMCRYPT_CALL |
166 | | SymCryptSha384Init( _Out_ PSYMCRYPT_SHA384_STATE pState ) |
167 | 12.4k | { |
168 | 12.4k | SYMCRYPT_SET_MAGIC( pState ); |
169 | | |
170 | 12.4k | pState->dataLengthH = 0; |
171 | 12.4k | pState->dataLengthL = 0; |
172 | 12.4k | pState->bytesInBuffer = 0; |
173 | | |
174 | 12.4k | memcpy( &pState->chain.H[0], &SymCryptSha384InitialState[0], sizeof( SymCryptSha384InitialState ) ); |
175 | | |
176 | | // |
177 | | // There is no need to initialize the buffer part of the state as that will be |
178 | | // filled before it is used. |
179 | | // |
180 | 12.4k | } |
181 | | |
182 | | |
183 | | SYMCRYPT_NOINLINE |
184 | | VOID |
185 | | SYMCRYPT_CALL |
186 | | SymCryptSha512Append( |
187 | | _Inout_ PSYMCRYPT_SHA512_STATE pState, |
188 | | _In_reads_( cbData ) PCBYTE pbData, |
189 | | SIZE_T cbData ) |
190 | 96.5k | { |
191 | 96.5k | UINT32 bytesInBuffer; |
192 | 96.5k | UINT32 freeInBuffer; |
193 | 96.5k | SIZE_T tmp; |
194 | | |
195 | 96.5k | SYMCRYPT_CHECK_MAGIC( pState ); |
196 | | |
197 | 96.5k | pState->dataLengthL += cbData; |
198 | 96.5k | if( pState->dataLengthL < cbData ) { |
199 | 0 | pState->dataLengthH++; |
200 | 0 | } |
201 | | |
202 | 96.5k | bytesInBuffer = pState->bytesInBuffer; |
203 | | |
204 | | // |
205 | | // If previous data in buffer, buffer new input and transform if possible. |
206 | | // |
207 | 96.5k | if( bytesInBuffer > 0 ) |
208 | 81.8k | { |
209 | 81.8k | SYMCRYPT_ASSERT( SYMCRYPT_SHA512_INPUT_BLOCK_SIZE > bytesInBuffer ); |
210 | | |
211 | 81.8k | freeInBuffer = SYMCRYPT_SHA512_INPUT_BLOCK_SIZE - bytesInBuffer; |
212 | 81.8k | if( cbData < freeInBuffer ) |
213 | 71.8k | { |
214 | | // |
215 | | // All the data will fit in the buffer. |
216 | | // We don't do anything here. |
217 | | // As cbData < inputBlockSize the bulk data processing is skipped, |
218 | | // and the data will be copied to the buffer at the end |
219 | | // of this code. |
220 | 71.8k | } else { |
221 | | // |
222 | | // Enough data to fill the whole buffer & process it |
223 | | // |
224 | 10.0k | memcpy(&pState->buffer[bytesInBuffer], pbData, freeInBuffer); |
225 | 10.0k | pbData += freeInBuffer; |
226 | 10.0k | cbData -= freeInBuffer; |
227 | 10.0k | SymCryptSha512AppendBlocks( &pState->chain, &pState->buffer[0], SYMCRYPT_SHA512_INPUT_BLOCK_SIZE, &tmp ); |
228 | | |
229 | 10.0k | bytesInBuffer = 0; |
230 | 10.0k | } |
231 | 81.8k | } |
232 | | |
233 | | // |
234 | | // Internal buffer is empty; process all remaining whole blocks in the input |
235 | | // |
236 | 96.5k | if( cbData >= SYMCRYPT_SHA512_INPUT_BLOCK_SIZE ) |
237 | 9.26k | { |
238 | 9.26k | SymCryptSha512AppendBlocks( &pState->chain, pbData, cbData, &tmp ); |
239 | 9.26k | SYMCRYPT_ASSERT( tmp < SYMCRYPT_SHA512_INPUT_BLOCK_SIZE ); |
240 | 9.26k | pbData += cbData - tmp; |
241 | 9.26k | cbData = tmp; |
242 | 9.26k | } |
243 | | |
244 | 96.5k | SYMCRYPT_ASSERT( cbData < SYMCRYPT_SHA512_INPUT_BLOCK_SIZE ); |
245 | | |
246 | | // |
247 | | // buffer remaining input if necessary. |
248 | | // |
249 | 96.5k | if( cbData > 0 ) |
250 | 43.2k | { |
251 | 43.2k | memcpy( &pState->buffer[bytesInBuffer], pbData, cbData ); |
252 | 43.2k | bytesInBuffer += (UINT32) cbData; |
253 | 43.2k | } |
254 | | |
255 | 96.5k | pState->bytesInBuffer = bytesInBuffer; |
256 | | |
257 | 96.5k | } |
258 | | |
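
The append path above lets callers stream data in arbitrary-sized pieces: partial input is buffered, full blocks are compressed, and the tail is carried over. A minimal usage sketch (pbMsg/cbMsg are illustrative caller-supplied values); splitting the input must give the same digest as a single call:

    BYTE digestOneShot[SYMCRYPT_SHA512_RESULT_SIZE];
    BYTE digestStreamed[SYMCRYPT_SHA512_RESULT_SIZE];
    SYMCRYPT_SHA512_STATE state;

    SymCryptSha512( pbMsg, cbMsg, digestOneShot );                              // one-shot hash

    SymCryptSha512Init( &state );
    SymCryptSha512Append( &state, pbMsg, cbMsg / 2 );                           // first half
    SymCryptSha512Append( &state, pbMsg + cbMsg / 2, cbMsg - cbMsg / 2 );       // remainder
    SymCryptSha512Result( &state, digestStreamed );

    // digestOneShot and digestStreamed are identical
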
259 | | SYMCRYPT_NOINLINE |
260 | | VOID |
261 | | SYMCRYPT_CALL |
262 | | SymCryptSha384Append( |
263 | | _Inout_ PSYMCRYPT_SHA384_STATE pState, |
264 | | _In_reads_( cbData ) PCBYTE pbData, |
265 | | SIZE_T cbData ) |
266 | 25.6k | { |
267 | | |
268 | 25.6k | SymCryptSha512Append( (PSYMCRYPT_SHA512_STATE)pState, pbData, cbData ); |
269 | | |
270 | 25.6k | } |
271 | | |
272 | | |
273 | | SYMCRYPT_NOINLINE |
274 | | VOID |
275 | | SYMCRYPT_CALL |
276 | | SymCryptSha512Result( |
277 | | _Inout_ PSYMCRYPT_SHA512_STATE pState, |
278 | | _Out_writes_( SYMCRYPT_SHA512_RESULT_SIZE ) PBYTE pbResult ) |
279 | 28.7k | { |
280 | 28.7k | UINT32 bytesInBuffer; |
281 | 28.7k | SIZE_T tmp; |
282 | | |
283 | 28.7k | SYMCRYPT_CHECK_MAGIC( pState ); |
284 | | |
285 | 28.7k | bytesInBuffer = pState->bytesInBuffer; |
286 | | |
287 | | // |
288 | | // The buffer is never completely full, so we can always put the first |
289 | | // padding byte in. |
290 | | // |
291 | 28.7k | pState->buffer[bytesInBuffer++] = 0x80; |
292 | | |
293 | 28.7k | if( bytesInBuffer > 128-16 ) { |
294 | | // |
295 | | // No room for the rest of the padding. Pad with zeroes & process block |
296 | | // bytesInBuffer is at most 128, so we do not have an integer underflow |
297 | | // |
298 | 3.10k | SymCryptWipe( &pState->buffer[bytesInBuffer], 128-bytesInBuffer ); |
299 | 3.10k | SymCryptSha512AppendBlocks( &pState->chain, pState->buffer, 128, &tmp ); |
300 | 3.10k | bytesInBuffer = 0; |
301 | 3.10k | } |
302 | | |
303 | | // |
304 | | // Set rest of padding |
305 | | // We wipe to the end of the buffer as it is 16-aligned, |
306 | | // and it is faster to wipe to an aligned point |
307 | | // |
308 | 28.7k | SymCryptWipe( &pState->buffer[bytesInBuffer], 128-bytesInBuffer ); |
309 | 28.7k | SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[128-16], (pState->dataLengthH << 3) + (pState->dataLengthL >> 61) ); |
310 | 28.7k | SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[128- 8], (pState->dataLengthL << 3) ); |
311 | | |
312 | 28.7k | SymCryptSha512AppendBlocks( &pState->chain, pState->buffer, 128, &tmp ); |
313 | | |
314 | 28.7k | SymCryptUint64ToMsbFirst( &pState->chain.H[0], pbResult, 8 ); |
315 | | |
316 | | // |
317 | | // We have to wipe the whole state because the Init call |
318 | | // might be optimized away by a smart compiler. |
319 | | // |
320 | 28.7k | SymCryptWipeKnownSize( pState, sizeof( *pState ) ); |
321 | | |
322 | 28.7k | SYMCRYPT_SET_MAGIC( pState ); |
323 | | |
324 | 28.7k | memcpy( &pState->chain.H[0], &SymCryptSha512InitialState[0], sizeof( SymCryptSha512InitialState ) ); |
325 | 28.7k | } |
326 | | |
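
The padding applied in SymCryptSha512Result follows FIPS 180-2: append 0x80, zero-fill up to the last 16 bytes of a block, then store the total message length in bits as a 128-bit big-endian value. The shifts above convert the byte count, kept in two 64-bit words (dataLengthH:dataLengthL), into that bit count; a small isolated sketch of the same conversion (variable names chosen here for illustration):

    // 128-bit byte count (hi:lo) -> 128-bit bit count (hi:lo), i.e. multiply by 8.
    UINT64 bitLenHi = ( byteLenHi << 3 ) | ( byteLenLo >> 61 );   // top 3 bits of the low word carry into the high word
    UINT64 bitLenLo = ( byteLenLo << 3 );
    // The code above uses + instead of |, which is equivalent because the bit ranges do not overlap.
    // Both words are then stored MSB-first in the final 16 bytes of the last block,
    // exactly as the two SYMCRYPT_STORE_MSBFIRST64 calls do.
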
327 | | SYMCRYPT_NOINLINE |
328 | | VOID |
329 | | SYMCRYPT_CALL |
330 | | SymCryptSha384Result( |
331 | | _Inout_ PSYMCRYPT_SHA384_STATE pState, |
332 | | _Out_writes_( SYMCRYPT_SHA384_RESULT_SIZE ) PBYTE pbResult ) |
333 | 12.2k | { |
334 | | // |
335 | | // For simplicity we re-use SymCryptSha512Result. This is slightly slower, |
336 | | // but SHA-384 isn't used that much. |
337 | | // |
338 | 12.2k | SYMCRYPT_ALIGN BYTE sha512Result[SYMCRYPT_SHA512_RESULT_SIZE]; // Buffer for SHA-512 output |
339 | | |
340 | | // |
341 | | // The SHA-384 result is the first 48 bytes of the SHA-512 result of our state |
342 | | // |
343 | 12.2k | SymCryptSha512Result( (PSYMCRYPT_SHA512_STATE)pState, sha512Result ); |
344 | 12.2k | memcpy( pbResult, sha512Result, SYMCRYPT_SHA384_RESULT_SIZE ); |
345 | | |
346 | | // |
347 | | // The buffer was already wiped by the SymCryptSha512Result function, we |
348 | | // just have to re-initialize for SHA-384 |
349 | | // |
350 | 12.2k | SymCryptSha384Init( pState ); |
351 | | |
352 | 12.2k | SymCryptWipeKnownSize( sha512Result, sizeof( sha512Result ) ); |
353 | 12.2k | } |
354 | | |
355 | | |
356 | | VOID |
357 | | SYMCRYPT_CALL |
358 | | SymCryptSha512StateExportCore( |
359 | | _In_ PCSYMCRYPT_SHA512_STATE pState, |
360 | | _Out_writes_bytes_( SYMCRYPT_SHA512_STATE_EXPORT_SIZE ) PBYTE pbBlob, |
361 | | _In_ UINT32 type ) |
362 | 0 | { |
363 | 0 | SYMCRYPT_ALIGN SYMCRYPT_SHA512_STATE_EXPORT_BLOB blob; // local copy to have proper alignment. |
364 | 0 | C_ASSERT( sizeof( blob ) == SYMCRYPT_SHA512_STATE_EXPORT_SIZE ); |
365 | |
366 | 0 | SYMCRYPT_CHECK_MAGIC( pState ); |
367 | |
368 | 0 | SymCryptWipeKnownSize( &blob, sizeof( blob ) ); // wipe to avoid any data leakage |
369 | |
370 | 0 | blob.header.magic = SYMCRYPT_BLOB_MAGIC; |
371 | 0 | blob.header.size = SYMCRYPT_SHA512_STATE_EXPORT_SIZE; |
372 | 0 | blob.header.type = type; |
373 | | |
374 | | // |
375 | | // Copy the relevant data. Buffer will be 0-padded. |
376 | | // |
377 | |
378 | 0 | SymCryptUint64ToMsbFirst( &pState->chain.H[0], &blob.chain[0], 8 ); |
379 | 0 | blob.dataLengthL = pState->dataLengthL; |
380 | 0 | blob.dataLengthH = pState->dataLengthH; |
381 | 0 | memcpy( &blob.buffer[0], &pState->buffer[0], blob.dataLengthL & 0x7f ); |
382 | |
383 | 0 | SYMCRYPT_ASSERT( (PCBYTE) &blob + sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ) == (PCBYTE) &blob.trailer ); |
384 | 0 | SymCryptMarvin32( SymCryptMarvin32DefaultSeed, (PCBYTE) &blob, sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ), &blob.trailer.checksum[0] ); |
385 | |
386 | 0 | memcpy( pbBlob, &blob, sizeof( blob ) ); |
387 | | |
388 | | //cleanup: |
389 | 0 | SymCryptWipeKnownSize( &blob, sizeof( blob ) ); |
390 | 0 | return; |
391 | 0 | } |
392 | | |
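
The blob type SYMCRYPT_SHA512_STATE_EXPORT_BLOB is defined elsewhere in SymCrypt; from the fields touched in this routine its layout is roughly the following (field order, exact sizes, and any reserved padding are inferred, not the authoritative definition):

    // Inferred sketch only -- see the real blob definition in the SymCrypt internal headers.
    typedef struct _SKETCH_SHA512_EXPORT_BLOB
    {
        SYMCRYPT_BLOB_HEADER    header;         // magic, size, type
        BYTE                    chain[64];      // 8 chaining words, each stored MSB-first
        UINT64                  dataLengthL;    // low 64 bits of the byte count
        UINT64                  dataLengthH;    // high 64 bits of the byte count
        BYTE                    buffer[128];    // partial block; dataLengthL & 0x7f bytes valid, rest zero
        SYMCRYPT_BLOB_TRAILER   trailer;        // Marvin32 checksum over everything before the trailer
    } SKETCH_SHA512_EXPORT_BLOB;
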
393 | | VOID |
394 | | SYMCRYPT_CALL |
395 | | SymCryptSha512StateExport( |
396 | | _In_ PCSYMCRYPT_SHA512_STATE pState, |
397 | | _Out_writes_bytes_( SYMCRYPT_SHA512_STATE_EXPORT_SIZE ) PBYTE pbBlob ) |
398 | 0 | { |
399 | 0 | SymCryptSha512StateExportCore( pState, pbBlob, SymCryptBlobTypeSha512State ); |
400 | 0 | } |
401 | | |
402 | | VOID |
403 | | SYMCRYPT_CALL |
404 | | SymCryptSha384StateExport( |
405 | | _In_ PCSYMCRYPT_SHA384_STATE pState, |
406 | | _Out_writes_bytes_( SYMCRYPT_SHA384_STATE_EXPORT_SIZE ) PBYTE pbBlob ) |
407 | 0 | { |
408 | 0 | SymCryptSha512StateExportCore( (PCSYMCRYPT_SHA512_STATE)pState, pbBlob, SymCryptBlobTypeSha384State ); |
409 | 0 | } |
410 | | |
411 | | |
412 | | SYMCRYPT_ERROR |
413 | | SYMCRYPT_CALL |
414 | | SymCryptSha512StateImportCore( |
415 | | _Out_ PSYMCRYPT_SHA512_STATE pState, |
416 | | _In_reads_bytes_( SYMCRYPT_SHA512_STATE_EXPORT_SIZE) PCBYTE pbBlob, |
417 | | _In_ UINT32 type ) |
418 | 0 | { |
419 | 0 | SYMCRYPT_ERROR scError = SYMCRYPT_NO_ERROR; |
420 | 0 | SYMCRYPT_ALIGN SYMCRYPT_SHA512_STATE_EXPORT_BLOB blob; // local copy to have proper alignment. |
421 | 0 | BYTE checksum[8]; |
422 | |
423 | 0 | C_ASSERT( sizeof( blob ) == SYMCRYPT_SHA512_STATE_EXPORT_SIZE ); |
424 | 0 | memcpy( &blob, pbBlob, sizeof( blob ) ); |
425 | |
426 | 0 | if( blob.header.magic != SYMCRYPT_BLOB_MAGIC || |
427 | 0 | blob.header.size != SYMCRYPT_SHA512_STATE_EXPORT_SIZE || |
428 | 0 | blob.header.type != type ) |
429 | 0 | { |
430 | 0 | scError = SYMCRYPT_INVALID_BLOB; |
431 | 0 | goto cleanup; |
432 | 0 | } |
433 | | |
434 | 0 | SymCryptMarvin32( SymCryptMarvin32DefaultSeed, (PCBYTE) &blob, sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ), checksum ); |
435 | 0 | if( memcmp( checksum, &blob.trailer.checksum[0], 8 ) != 0 ) |
436 | 0 | { |
437 | 0 | scError = SYMCRYPT_INVALID_BLOB; |
438 | 0 | goto cleanup; |
439 | 0 | } |
440 | | |
441 | 0 | SymCryptMsbFirstToUint64( &blob.chain[0], &pState->chain.H[0], 8 ); |
442 | 0 | pState->dataLengthL = blob.dataLengthL; |
443 | 0 | pState->dataLengthH = blob.dataLengthH; |
444 | 0 | pState->bytesInBuffer = blob.dataLengthL & 0x7f; |
445 | 0 | memcpy( &pState->buffer[0], &blob.buffer[0], pState->bytesInBuffer ); |
446 | |
447 | 0 | SYMCRYPT_SET_MAGIC( pState ); |
448 | |
449 | 0 | cleanup: |
450 | 0 | SymCryptWipeKnownSize( &blob, sizeof(blob) ); |
451 | 0 | return scError; |
452 | 0 | } |
453 | | |
454 | | SYMCRYPT_ERROR |
455 | | SYMCRYPT_CALL |
456 | | SymCryptSha512StateImport( |
457 | | _Out_ PSYMCRYPT_SHA512_STATE pState, |
458 | | _In_reads_bytes_( SYMCRYPT_SHA512_STATE_EXPORT_SIZE) PCBYTE pbBlob ) |
459 | 0 | { |
460 | 0 | return SymCryptSha512StateImportCore( pState, pbBlob, SymCryptBlobTypeSha512State ); |
461 | 0 | } |
462 | | |
463 | | SYMCRYPT_ERROR |
464 | | SYMCRYPT_CALL |
465 | | SymCryptSha384StateImport( |
466 | | _Out_ PSYMCRYPT_SHA384_STATE pState, |
467 | | _In_reads_bytes_( SYMCRYPT_SHA384_STATE_EXPORT_SIZE) PCBYTE pbBlob ) |
468 | 0 | { |
469 | 0 | return SymCryptSha512StateImportCore( (PSYMCRYPT_SHA512_STATE)pState, pbBlob, SymCryptBlobTypeSha384State ); |
470 | 0 | } |
471 | | |
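
A usage sketch for the export/import pair: an in-progress hash can be serialized, transported, and resumed, and the final digest equals that of an uninterrupted computation (pbPart1/pbPart2 and their lengths are illustrative):

    BYTE blob[SYMCRYPT_SHA512_STATE_EXPORT_SIZE];
    BYTE digest[SYMCRYPT_SHA512_RESULT_SIZE];
    SYMCRYPT_SHA512_STATE stateA, stateB;

    SymCryptSha512Init( &stateA );
    SymCryptSha512Append( &stateA, pbPart1, cbPart1 );
    SymCryptSha512StateExport( &stateA, blob );                 // serialize mid-hash

    if( SymCryptSha512StateImport( &stateB, blob ) == SYMCRYPT_NO_ERROR )
    {
        SymCryptSha512Append( &stateB, pbPart2, cbPart2 );
        SymCryptSha512Result( &stateB, digest );                // same digest as hashing part1 || part2 in one go
    }
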
472 | | |
473 | | |
474 | | // |
475 | | // A simple test case intended for module testing for |
476 | | // FIPS compliance. |
477 | | // This is the one-block example message from FIPS 180-2 appendix C |
478 | | // |
479 | | |
480 | | const BYTE SymCryptSha512KATAnswer[64] = |
481 | | { |
482 | | 0xdd, 0xaf, 0x35, 0xa1, 0x93, 0x61, 0x7a, 0xba, |
483 | | 0xcc, 0x41, 0x73, 0x49, 0xae, 0x20, 0x41, 0x31, |
484 | | 0x12, 0xe6, 0xfa, 0x4e, 0x89, 0xa9, 0x7e, 0xa2, |
485 | | 0x0a, 0x9e, 0xee, 0xe6, 0x4b, 0x55, 0xd3, 0x9a, |
486 | | 0x21, 0x92, 0x99, 0x2a, 0x27, 0x4f, 0xc1, 0xa8, |
487 | | 0x36, 0xba, 0x3c, 0x23, 0xa3, 0xfe, 0xeb, 0xbd, |
488 | | 0x45, 0x4d, 0x44, 0x23, 0x64, 0x3c, 0xe8, 0x0e, |
489 | | 0x2a, 0x9a, 0xc9, 0x4f, 0xa5, 0x4c, 0xa4, 0x9f, |
490 | | }; |
491 | | |
492 | | VOID |
493 | | SYMCRYPT_CALL |
494 | | SymCryptSha512Selftest(void) |
495 | 0 | { |
496 | 0 | BYTE result[SYMCRYPT_SHA512_RESULT_SIZE]; |
497 | |
498 | 0 | SymCryptSha512( SymCryptTestMsg3, sizeof( SymCryptTestMsg3 ), result ); |
499 | |
500 | 0 | SymCryptInjectError( result, sizeof( result ) ); |
501 | |
502 | 0 | if( memcmp( result, SymCryptSha512KATAnswer, sizeof( result ) ) != 0 ) { |
503 | 0 | SymCryptFatal( 'SH51' ); |
504 | 0 | } |
505 | 0 | } |
506 | | |
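
For reference, SymCryptSha512KATAnswer above is the well-known SHA-512 digest of the three-byte message "abc" from FIPS 180-2 Appendix C, so SymCryptTestMsg3 is presumably that message. A sketch reproducing the known answer with the one-shot API (assuming the "abc" message):

    static const BYTE abc[3] = { 'a', 'b', 'c' };
    BYTE digest[SYMCRYPT_SHA512_RESULT_SIZE];

    SymCryptSha512( abc, sizeof( abc ), digest );
    // digest now equals SymCryptSha512KATAnswer (ddaf35a1...a54ca49f)
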
507 | | // |
508 | | // A simple test case intended for module testing for |
509 | | // FIPS compliance. |
510 | | // This is the one-block example message from FIPS 180-2 appendix D |
511 | | // |
512 | | |
513 | | const BYTE SymCryptSha384KATAnswer[ 48 ] = |
514 | | { |
515 | | 0xcb, 0x00, 0x75, 0x3f, 0x45, 0xa3, 0x5e, 0x8b, |
516 | | 0xb5, 0xa0, 0x3d, 0x69, 0x9a, 0xc6, 0x50, 0x07, |
517 | | 0x27, 0x2c, 0x32, 0xab, 0x0e, 0xde, 0xd1, 0x63, |
518 | | 0x1a, 0x8b, 0x60, 0x5a, 0x43, 0xff, 0x5b, 0xed, |
519 | | 0x80, 0x86, 0x07, 0x2b, 0xa1, 0xe7, 0xcc, 0x23, |
520 | | 0x58, 0xba, 0xec, 0xa1, 0x34, 0xc8, 0x25, 0xa7, |
521 | | }; |
522 | | |
523 | | VOID |
524 | | SYMCRYPT_CALL |
525 | | SymCryptSha384Selftest(void) |
526 | 0 | { |
527 | 0 | BYTE result[SYMCRYPT_SHA384_RESULT_SIZE]; |
528 | |
529 | 0 | SymCryptSha384( SymCryptTestMsg3, sizeof( SymCryptTestMsg3 ), result ); |
530 | |
531 | 0 | SymCryptInjectError( result, sizeof( result ) ); |
532 | |
533 | 0 | if( memcmp( result, SymCryptSha384KATAnswer, sizeof( result ) ) != 0 ) { |
534 | 0 | SymCryptFatal( 'SH38' ); |
535 | 0 | } |
536 | 0 | } |
537 | | |
538 | | // |
539 | | // We keep multiple implementations in this file. |
540 | | // This allows us to switch different platforms to different implementations, whichever |
541 | | // is faster. Even if we don't use a particular implementation in one release, |
542 | | // we keep it around in case it becomes the preferred one for a new CPU release. |
543 | | // (Performance can change a lot with changes in micro-architecture.) |
544 | | // |
545 | | |
546 | | //=================================================================================== |
547 | | // Implementation of compression function using UINT64s |
548 | | // |
549 | | |
550 | | // |
551 | | // For documentation on these function see FIPS 180-2 |
552 | | // |
553 | | // MAJ and CH are the functions Maj and Ch from the standard. |
554 | | // CSIGMA0 and CSIGMA1 are the capital sigma functions. |
555 | | // LSIGMA0 and LSIGMA1 are the lowercase sigma functions. |
556 | | // |
557 | | // The canonical definitions of the MAJ and CH functions are: |
558 | | //#define MAJ( x, y, z ) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) |
559 | | //#define CH( x, y, z ) (((x) & (y)) ^ ((~(x)) & (z))) |
560 | | // We use optimized versions defined below |
561 | | // |
562 | 43.3M | #define MAJ( x, y, z ) ((((z) | (y)) & (x) ) | ((z) & (y))) |
563 | 43.3M | #define CH( x, y, z ) ((((z) ^ (y)) & (x)) ^ (z)) |
564 | | |
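
The optimized MAJ and CH above are bitwise-equivalent to the canonical definitions; because both operate independently on each bit position, checking all eight 1-bit input combinations proves the identity. A standalone check, not part of the library:

    #include <assert.h>
    #include <stdint.h>

    int main( void )
    {
        for( uint64_t x = 0; x < 2; x++ )
        for( uint64_t y = 0; y < 2; y++ )
        for( uint64_t z = 0; z < 2; z++ )
        {
            assert( (((z | y) & x) | (z & y)) == ((x & y) ^ (x & z) ^ (y & z)) );   // MAJ
            assert( (((z ^ y) & x) ^ z)       == ((x & y) ^ ((~x) & z)) );          // CH
        }
        return 0;
    }
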
565 | | // |
566 | | // The four Sigma functions |
567 | | // |
568 | | |
569 | | //#define CSIGMA0( x ) (ROR64((x), 28) ^ ROR64((x), 34) ^ ROR64((x), 39)) |
570 | | //#define CSIGMA1( x ) (ROR64((x), 14) ^ ROR64((x), 18) ^ ROR64((x), 41)) |
571 | | //#define LSIGMA0( x ) (ROR64((x), 1) ^ ROR64((x), 8) ^ ((x)>> 7)) |
572 | | //#define LSIGMA1( x ) (ROR64((x), 19) ^ ROR64((x), 61) ^ ((x)>> 6)) |
573 | | |
574 | 43.3M | #define CSIGMA0( x ) (ROR64((ROR64((x), 6) ^ ROR64((x), 11) ^ (x)), 28)) |
575 | 43.3M | #define CSIGMA1( x ) (ROR64((ROR64((x), 4) ^ ROR64((x), 27) ^ (x)), 14)) |
576 | 34.6M | #define LSIGMA0( x ) (ROR64((x) ^ ROR64((x), 7), 1) ^ ((x)>> 7)) |
577 | 34.6M | #define LSIGMA1( x ) (ROR64((x) ^ ROR64((x), 42), 19) ^ ((x)>> 6)) |
578 | | |
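
These rewritten forms follow from two facts: rotation distributes over XOR, and composing rotations adds the amounts, so e.g. ROR(ROR(x,6) ^ ROR(x,11) ^ x, 28) = ROR(x,34) ^ ROR(x,39) ^ ROR(x,28), which is the canonical CSIGMA0. A standalone spot-check of the CSIGMA0 and LSIGMA0 rewrites, not part of the library:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t ror64( uint64_t x, unsigned n ) { return ( x >> n ) | ( x << ( 64 - n ) ); }

    int main( void )
    {
        uint64_t x = 0x0123456789abcdefULL;
        assert( ror64( ror64( x, 6 ) ^ ror64( x, 11 ) ^ x, 28 ) ==
                ( ror64( x, 28 ) ^ ror64( x, 34 ) ^ ror64( x, 39 ) ) );             // CSIGMA0
        assert( ( ror64( x ^ ror64( x, 7 ), 1 ) ^ ( x >> 7 ) ) ==
                ( ror64( x, 1 ) ^ ror64( x, 8 ) ^ ( x >> 7 ) ) );                   // LSIGMA0
        return 0;
    }
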
579 | | |
580 | | |
581 | | // |
582 | | // The values a-h were stored in an array called ah. |
583 | | // We have unrolled the loop 16 times. This makes both the indices into |
584 | | // the ah array constant, and it makes the message addressing constant. |
585 | | // This provides a significant speed improvement, at the cost of making |
586 | | // the main loop about 4 kB in code. |
587 | | // |
588 | | // Initial round; r16 is the round number mod 16 |
589 | | // ah[ r16 &7] = h |
590 | | // ah[(r16+1)&7] = g; |
591 | | // ah[(r16+2)&7] = f; |
592 | | // ah[(r16+3)&7] = e; |
593 | | // ah[(r16+4)&7] = d; |
594 | | // ah[(r16+5)&7] = c; |
595 | | // ah[(r16+6)&7] = b; |
596 | | // ah[(r16+7)&7] = a; |
597 | | // |
598 | | // Unfortunately, the compiler seems to choke on this, allocating an extra variable for |
599 | | // each of the array indices, with duplicate stores to both locations. |
600 | | // |
601 | | |
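
For exposition, the array-indexed scheme described above corresponds to the following non-unrolled reference loop (a sketch only; the unrolled CROUND/IROUND/FROUND macros below are what is actually compiled, precisely because compilers generated poor code for the indexed form):

    // ah[] starts as ah[7] = a = H[0], ah[6] = b = H[1], ..., ah[0] = h = H[7].
    // In round r, ah[(r+7)&7] plays the role of a, ah[(r+3)&7] is e, ah[r&7] is h, and so on.
    for( r = 0; r < 80; r++ )
    {
        Wt = ( r < 16 ) ? SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8*r ] )
                        : LSIGMA1( W[(r-2) & 15] ) + W[(r-7) & 15] +
                          LSIGMA0( W[(r-15) & 15] ) + W[r & 15];
        W[r & 15] = Wt;
        ah[ r    & 7] += CSIGMA1( ah[(r+3) & 7] ) +
                         CH( ah[(r+3) & 7], ah[(r+2) & 7], ah[(r+1) & 7] ) +
                         SymCryptSha512K[r] + Wt;
        ah[(r+4) & 7] += ah[r & 7];
        ah[ r    & 7] += CSIGMA0( ah[(r+7) & 7] ) +
                         MAJ( ah[(r+7) & 7], ah[(r+6) & 7], ah[(r+5) & 7] );
    }
    // Since 80 is a multiple of 8, the mapping returns to its starting position,
    // and ah[7..0] can be added back into H[0..7] as the feed-forward.
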
602 | | // |
603 | | // The core round, after the message word has been computed for this round and put in Wt. |
604 | | // r16 is the round number modulo 16. (Static after loop unrolling) |
605 | | // r is the round number |
606 | 43.3M | #define CROUND( a, b, c, d, e, f, g, h, r, r16 ) {;\ |
607 | 43.3M | W[r16] = Wt; \ |
608 | 43.3M | h += CSIGMA1(e) + CH(e, f, g) + SymCryptSha512K[r] + Wt;\ |
609 | 43.3M | d += h;\ |
610 | 43.3M | h += CSIGMA0(a) + MAJ(a, b, c);\ |
611 | 43.3M | } |
612 | | |
613 | | // |
614 | | // Initial round that reads the message. |
615 | | // r is the round number 0..15 |
616 | | // |
617 | 8.66M | #define IROUND( a, b, c, d, e, f, g, h, r ) {\ |
618 | 8.66M | Wt = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8*r ] );\ |
619 | 8.66M | CROUND( a, b, c, d, e, f, g, h, r, r);\ |
620 | 8.66M | } |
621 | | // |
622 | | // Subsequent rounds. |
623 | | // r is the round number, r16 is the round number mod 16. |
624 | | // These are separate as typically r is run-time and r16 is compile time constant. |
625 | | // |
626 | 34.6M | #define FROUND( a, b, c, d, e, f, g, h, r, r16 ) { \ |
627 | 34.6M | Wt = LSIGMA1( W[(r16-2) & 15] ) + W[(r16-7) & 15] + \ |
628 | 34.6M | LSIGMA0( W[(r16-15) & 15]) + W[r16 & 15]; \ |
629 | 34.6M | CROUND( a, b, c, d, e, f, g, h, r, r16 ); \ |
630 | 34.6M | } |
631 | | |
632 | | // |
633 | | // This is the core routine that does the actual hard work |
634 | | // This is based on the older one in RSA32LIB by Scott Field from 2001 |
635 | | // |
636 | | VOID |
637 | | SYMCRYPT_CALL |
638 | | SymCryptSha512AppendBlocks_ull( |
639 | | _Inout_ SYMCRYPT_SHA512_CHAINING_STATE * pChain, |
640 | | _In_reads_(cbData) PCBYTE pbData, |
641 | | SIZE_T cbData, |
642 | | _Out_ SIZE_T * pcbRemaining ) |
643 | 52.1k | { |
644 | 52.1k | SYMCRYPT_ALIGN UINT64 W[16]; |
645 | 52.1k | UINT64 A, B, C, D, E, F, G, H; |
646 | 52.1k | int round; |
647 | 52.1k | UINT64 Wt; |
648 | | |
649 | | |
650 | 593k | while( cbData >= 128 ) |
651 | 541k | { |
652 | 541k | A = pChain->H[0]; |
653 | 541k | B = pChain->H[1]; |
654 | 541k | C = pChain->H[2]; |
655 | 541k | D = pChain->H[3]; |
656 | 541k | E = pChain->H[4]; |
657 | 541k | F = pChain->H[5]; |
658 | 541k | G = pChain->H[6]; |
659 | 541k | H = pChain->H[7]; |
660 | | |
661 | | // |
662 | | // initial rounds 1 to 16 |
663 | | // |
664 | | |
665 | 541k | IROUND( A, B, C, D, E, F, G, H, 0 ); |
666 | 541k | IROUND( H, A, B, C, D, E, F, G, 1 ); |
667 | 541k | IROUND( G, H, A, B, C, D, E, F, 2 ); |
668 | 541k | IROUND( F, G, H, A, B, C, D, E, 3 ); |
669 | 541k | IROUND( E, F, G, H, A, B, C, D, 4 ); |
670 | 541k | IROUND( D, E, F, G, H, A, B, C, 5 ); |
671 | 541k | IROUND( C, D, E, F, G, H, A, B, 6 ); |
672 | 541k | IROUND( B, C, D, E, F, G, H, A, 7 ); |
673 | 541k | IROUND( A, B, C, D, E, F, G, H, 8 ); |
674 | 541k | IROUND( H, A, B, C, D, E, F, G, 9 ); |
675 | 541k | IROUND( G, H, A, B, C, D, E, F, 10 ); |
676 | 541k | IROUND( F, G, H, A, B, C, D, E, 11 ); |
677 | 541k | IROUND( E, F, G, H, A, B, C, D, 12 ); |
678 | 541k | IROUND( D, E, F, G, H, A, B, C, 13 ); |
679 | 541k | IROUND( C, D, E, F, G, H, A, B, 14 ); |
680 | 541k | IROUND( B, C, D, E, F, G, H, A, 15 ); |
681 | | |
682 | 2.70M | for( round=16; round<80; round += 16 ) |
683 | 2.16M | { |
684 | 2.16M | FROUND( A, B, C, D, E, F, G, H, round + 0, 0 ); |
685 | 2.16M | FROUND( H, A, B, C, D, E, F, G, round + 1, 1 ); |
686 | 2.16M | FROUND( G, H, A, B, C, D, E, F, round + 2, 2 ); |
687 | 2.16M | FROUND( F, G, H, A, B, C, D, E, round + 3, 3 ); |
688 | 2.16M | FROUND( E, F, G, H, A, B, C, D, round + 4, 4 ); |
689 | 2.16M | FROUND( D, E, F, G, H, A, B, C, round + 5, 5 ); |
690 | 2.16M | FROUND( C, D, E, F, G, H, A, B, round + 6, 6 ); |
691 | 2.16M | FROUND( B, C, D, E, F, G, H, A, round + 7, 7 ); |
692 | 2.16M | FROUND( A, B, C, D, E, F, G, H, round + 8, 8 ); |
693 | 2.16M | FROUND( H, A, B, C, D, E, F, G, round + 9, 9 ); |
694 | 2.16M | FROUND( G, H, A, B, C, D, E, F, round + 10, 10 ); |
695 | 2.16M | FROUND( F, G, H, A, B, C, D, E, round + 11, 11 ); |
696 | 2.16M | FROUND( E, F, G, H, A, B, C, D, round + 12, 12 ); |
697 | 2.16M | FROUND( D, E, F, G, H, A, B, C, round + 13, 13 ); |
698 | 2.16M | FROUND( C, D, E, F, G, H, A, B, round + 14, 14 ); |
699 | 2.16M | FROUND( B, C, D, E, F, G, H, A, round + 15, 15 ); |
700 | 2.16M | } |
701 | | |
702 | 541k | pChain->H[0] = A + pChain->H[0]; |
703 | 541k | pChain->H[1] = B + pChain->H[1]; |
704 | 541k | pChain->H[2] = C + pChain->H[2]; |
705 | 541k | pChain->H[3] = D + pChain->H[3]; |
706 | 541k | pChain->H[4] = E + pChain->H[4]; |
707 | 541k | pChain->H[5] = F + pChain->H[5]; |
708 | 541k | pChain->H[6] = G + pChain->H[6]; |
709 | 541k | pChain->H[7] = H + pChain->H[7]; |
710 | | |
711 | 541k | pbData += 128; |
712 | 541k | cbData -= 128; |
713 | 541k | } |
714 | | |
715 | 52.1k | *pcbRemaining = cbData; |
716 | | |
717 | | // |
718 | | // Wipe the variables; |
719 | | // |
720 | 52.1k | SymCryptWipeKnownSize( W, sizeof( W ) ); |
721 | 52.1k | SYMCRYPT_FORCE_WRITE64( &A, 0 ); |
722 | 52.1k | SYMCRYPT_FORCE_WRITE64( &B, 0 ); |
723 | 52.1k | SYMCRYPT_FORCE_WRITE64( &C, 0 ); |
724 | 52.1k | SYMCRYPT_FORCE_WRITE64( &D, 0 ); |
725 | 52.1k | SYMCRYPT_FORCE_WRITE64( &E, 0 ); |
726 | 52.1k | SYMCRYPT_FORCE_WRITE64( &F, 0 ); |
727 | 52.1k | SYMCRYPT_FORCE_WRITE64( &G, 0 ); |
728 | 52.1k | SYMCRYPT_FORCE_WRITE64( &H, 0 ); |
729 | 52.1k | SYMCRYPT_FORCE_WRITE64( &Wt, 0 ); |
730 | 52.1k | } |
731 | | |
732 | | // |
733 | | // UINT64 based implementation that |
734 | | // first computes the expanded message schedule, and then performs the |
735 | | // actual hash computation. |
736 | | // It tries to use fewer registers; this is probably a good approach for CPUs with only 8 |
737 | | // 64-bit registers, which is what you would have using x86 XMM, but we have XMM code below. |
738 | | // This uses more memory, but might allow better register re-use and thereby |
739 | | // reduce the number of load/stores. |
740 | | // |
741 | | |
742 | | VOID |
743 | | SYMCRYPT_CALL |
744 | | SymCryptSha512AppendBlocks_ull2( |
745 | | _Inout_ SYMCRYPT_SHA512_CHAINING_STATE * pChain, |
746 | | _In_reads_(cbData) PCBYTE pbData, |
747 | | SIZE_T cbData, |
748 | | _Out_ SIZE_T * pcbRemaining ) |
749 | 0 | { |
750 | 0 | SYMCRYPT_ALIGN UINT64 buf[4 + 8 + 80]; // 4 words original chaining state, chaining state, and expanded input block |
751 | 0 | UINT64 * W = &buf[4 + 8]; |
752 | 0 | UINT64 * ha = &buf[4]; // initial state words, in order h, g, ..., b, a |
753 | 0 | UINT64 A, B, C, D, T; |
754 | 0 | int r; |
755 | |
756 | 0 | ha[7] = pChain->H[0]; buf[3] = ha[7]; |
757 | 0 | ha[6] = pChain->H[1]; buf[2] = ha[6]; |
758 | 0 | ha[5] = pChain->H[2]; buf[1] = ha[5]; |
759 | 0 | ha[4] = pChain->H[3]; buf[0] = ha[4]; |
760 | 0 | ha[3] = pChain->H[4]; |
761 | 0 | ha[2] = pChain->H[5]; |
762 | 0 | ha[1] = pChain->H[6]; |
763 | 0 | ha[0] = pChain->H[7]; |
764 | |
765 | 0 | while( cbData >= 128 ) |
766 | 0 | { |
767 | | |
768 | | // |
769 | | // Capture the input into W[0..15] |
770 | | // |
771 | 0 | for( r=0; r<16; r+= 2 ) |
772 | 0 | { |
773 | 0 | W[r ] = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8* r ] ); |
774 | 0 | W[r+1] = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8*(r+1) ] ); |
775 | 0 | } |
776 | | |
777 | | // |
778 | | // Expand the message |
779 | | // |
780 | 0 | A = W[15]; |
781 | 0 | B = W[14]; |
782 | 0 | D = W[0]; |
783 | 0 | for( r=16; r<80; r+= 2 ) |
784 | 0 | { |
785 | | // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16] |
786 | | |
787 | | // |
788 | | // Macro for one word of message expansion. |
789 | | // Invariant: |
790 | | // on entry: a = W[r-1], b = W[r-2], d = W[r-16] |
791 | | // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15] |
792 | | // |
793 | 0 | #define EXPAND( a, b, c, d, r ) \ |
794 | 0 | c = W[r-15]; \ |
795 | 0 | b = d + LSIGMA1( b ) + W[r-7] + LSIGMA0( c ); \ |
796 | 0 | W[r] = b; \ |
797 | 0 | |
798 | 0 | EXPAND( A, B, C, D, r ); |
799 | 0 | EXPAND( B, A, D, C, (r+1)); |
800 | |
801 | 0 | #undef EXPAND |
802 | 0 | } |
803 | |
804 | 0 | A = ha[7]; |
805 | 0 | B = ha[6]; |
806 | 0 | C = ha[5]; |
807 | 0 | D = ha[4]; |
808 | |
809 | 0 | for( r=0; r<80; r += 4 ) |
810 | 0 | { |
811 | | // |
812 | | // Loop invariant: |
813 | | // A, B, C, and D are the a,b,c,d values of the current state. |
814 | | // W[r] is the next expanded message word to be processed. |
815 | | // W[r-8 .. r-5] contain the current state words h, g, f, e. |
816 | | // |
817 | | |
818 | | // |
819 | | // Macro to compute one round |
820 | | // |
821 | 0 | #define DO_ROUND( a, b, c, d, t, r ) \ |
822 | 0 | t = W[r] + CSIGMA1( W[r-5] ) + W[r-8] + CH( W[r-5], W[r-6], W[r-7] ) + SymCryptSha512K[r]; \ |
823 | 0 | W[r-4] = t + d; \ |
824 | 0 | d = t + CSIGMA0( a ) + MAJ( c, b, a ); |
825 | |
826 | 0 | DO_ROUND( A, B, C, D, T, r ); |
827 | 0 | DO_ROUND( D, A, B, C, T, (r+1) ); |
828 | 0 | DO_ROUND( C, D, A, B, T, (r+2) ); |
829 | 0 | DO_ROUND( B, C, D, A, T, (r+3) ); |
830 | 0 | #undef DO_ROUND |
831 | 0 | } |
832 | |
833 | 0 | buf[3] = ha[7] = buf[3] + A; |
834 | 0 | buf[2] = ha[6] = buf[2] + B; |
835 | 0 | buf[1] = ha[5] = buf[1] + C; |
836 | 0 | buf[0] = ha[4] = buf[0] + D; |
837 | 0 | ha[3] += W[r-5]; |
838 | 0 | ha[2] += W[r-6]; |
839 | 0 | ha[1] += W[r-7]; |
840 | 0 | ha[0] += W[r-8]; |
841 | |
842 | 0 | pbData += 128; |
843 | 0 | cbData -= 128; |
844 | 0 | } |
845 | |
846 | 0 | pChain->H[0] = ha[7]; |
847 | 0 | pChain->H[1] = ha[6]; |
848 | 0 | pChain->H[2] = ha[5]; |
849 | 0 | pChain->H[3] = ha[4]; |
850 | 0 | pChain->H[4] = ha[3]; |
851 | 0 | pChain->H[5] = ha[2]; |
852 | 0 | pChain->H[6] = ha[1]; |
853 | 0 | pChain->H[7] = ha[0]; |
854 | |
855 | 0 | *pcbRemaining = cbData; |
856 | | |
857 | | // |
858 | | // Wipe the variables; |
859 | | // |
860 | 0 | SymCryptWipeKnownSize( buf, sizeof( buf ) ); |
861 | 0 | SYMCRYPT_FORCE_WRITE64( &A, 0 ); |
862 | 0 | SYMCRYPT_FORCE_WRITE64( &B, 0 ); |
863 | 0 | SYMCRYPT_FORCE_WRITE64( &C, 0 ); |
864 | 0 | SYMCRYPT_FORCE_WRITE64( &D, 0 ); |
865 | 0 | SYMCRYPT_FORCE_WRITE64( &T, 0 ); |
866 | |
867 | 0 | } |
868 | | |
869 | | // |
870 | | // UINT64 based implementation that |
871 | | // first computes the expanded message schedule, and then performs the |
872 | | // actual hash computation. |
873 | | // This one uses more registers than the previous one. |
874 | | // |
875 | | |
876 | | VOID |
877 | | SYMCRYPT_CALL |
878 | | SymCryptSha512AppendBlocks_ull3( |
879 | | _Inout_ SYMCRYPT_SHA512_CHAINING_STATE * pChain, |
880 | | _In_reads_(cbData) PCBYTE pbData, |
881 | | SIZE_T cbData, |
882 | | _Out_ SIZE_T * pcbRemaining ) |
883 | 0 | { |
884 | 0 | SYMCRYPT_ALIGN UINT64 W[80]; |
885 | 0 | SYMCRYPT_ALIGN UINT64 ha[8]; |
886 | 0 | UINT64 A, B, C, D, E, F, G, H; |
887 | 0 | int r; |
888 | |
889 | 0 | ha[7] = pChain->H[0]; |
890 | 0 | ha[6] = pChain->H[1]; |
891 | 0 | ha[5] = pChain->H[2]; |
892 | 0 | ha[4] = pChain->H[3]; |
893 | 0 | ha[3] = pChain->H[4]; |
894 | 0 | ha[2] = pChain->H[5]; |
895 | 0 | ha[1] = pChain->H[6]; |
896 | 0 | ha[0] = pChain->H[7]; |
897 | |
898 | 0 | while( cbData >= 128 ) |
899 | 0 | { |
900 | | |
901 | | // |
902 | | // Capture the input into W[0..15] |
903 | | // |
904 | 0 | for( r=0; r<16; r+= 2 ) |
905 | 0 | { |
906 | 0 | W[r ] = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8* r ] ); |
907 | 0 | W[r+1] = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8*(r+1) ] ); |
908 | 0 | } |
909 | | |
910 | | // |
911 | | // Expand the message |
912 | | // |
913 | 0 | A = W[15]; |
914 | 0 | B = W[14]; |
915 | 0 | D = W[0]; |
916 | 0 | for( r=16; r<80; r+= 2 ) |
917 | 0 | { |
918 | | // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16] |
919 | | |
920 | | // |
921 | | // Macro for one word of message expansion. |
922 | | // Invariant: |
923 | | // on entry: a = W[r-1], b = W[r-2], d = W[r-16] |
924 | | // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15] |
925 | | // |
926 | 0 | #define EXPAND( a, b, c, d, r ) \ |
927 | 0 | c = W[r-15]; \ |
928 | 0 | b = d + LSIGMA1( b ) + W[r-7] + LSIGMA0( c ); \ |
929 | 0 | W[r] = b; \ |
930 | 0 | |
931 | 0 | EXPAND( A, B, C, D, r ); |
932 | 0 | EXPAND( B, A, D, C, (r+1)); |
933 | |
934 | 0 | #undef EXPAND |
935 | 0 | } |
936 | |
937 | 0 | A = ha[7]; |
938 | 0 | B = ha[6]; |
939 | 0 | C = ha[5]; |
940 | 0 | D = ha[4]; |
941 | 0 | E = ha[3]; |
942 | 0 | F = ha[2]; |
943 | 0 | G = ha[1]; |
944 | 0 | H = ha[0]; |
945 | |
946 | 0 | for( r=0; r<80; r += 8 ) |
947 | 0 | { |
948 | | // |
949 | | // Loop invariant: |
950 | | // A, B, C, and D, E, F, G, H, are the values of the current state. |
951 | | // W[r] is the next expanded message word to be processed. |
952 | | // |
953 | | |
954 | | // |
955 | | // Macro to compute one round |
956 | | // |
957 | 0 | #define DO_ROUND( a, b, c, d, e, f, g, h, r ) \ |
958 | 0 | h += W[r] + CSIGMA1( e ) + CH( e, f, g ) + SymCryptSha512K[r]; \ |
959 | 0 | d += h; \ |
960 | 0 | h += CSIGMA0( a ) + MAJ( c, b, a ); |
961 | |
962 | 0 | DO_ROUND( A, B, C, D, E, F, G, H, (r ) ); |
963 | 0 | DO_ROUND( H, A, B, C, D, E, F, G, (r+1) ); |
964 | 0 | DO_ROUND( G, H, A, B, C, D, E, F, (r+2) ); |
965 | 0 | DO_ROUND( F, G, H, A, B, C, D, E, (r+3) ); |
966 | 0 | DO_ROUND( E, F, G, H, A, B, C, D, (r+4) ); |
967 | 0 | DO_ROUND( D, E, F, G, H, A, B, C, (r+5) ); |
968 | 0 | DO_ROUND( C, D, E, F, G, H, A, B, (r+6) ); |
969 | 0 | DO_ROUND( B, C, D, E, F, G, H, A, (r+7) ); |
970 | 0 | #undef DO_ROUND |
971 | 0 | } |
972 | |
973 | 0 | ha[7] += A; |
974 | 0 | ha[6] += B; |
975 | 0 | ha[5] += C; |
976 | 0 | ha[4] += D; |
977 | 0 | ha[3] += E; |
978 | 0 | ha[2] += F; |
979 | 0 | ha[1] += G; |
980 | 0 | ha[0] += H; |
981 | |
982 | 0 | pbData += 128; |
983 | 0 | cbData -= 128; |
984 | 0 | } |
985 | |
986 | 0 | pChain->H[0] = ha[7]; |
987 | 0 | pChain->H[1] = ha[6]; |
988 | 0 | pChain->H[2] = ha[5]; |
989 | 0 | pChain->H[3] = ha[4]; |
990 | 0 | pChain->H[4] = ha[3]; |
991 | 0 | pChain->H[5] = ha[2]; |
992 | 0 | pChain->H[6] = ha[1]; |
993 | 0 | pChain->H[7] = ha[0]; |
994 | |
|
995 | 0 | *pcbRemaining = cbData; |
996 | | |
997 | | // |
998 | | // Wipe the variables; |
999 | | // |
1000 | 0 | SymCryptWipeKnownSize( W, sizeof( W ) ); |
1001 | 0 | SymCryptWipeKnownSize( ha, sizeof( ha ) ); |
1002 | 0 | SYMCRYPT_FORCE_WRITE64( &A, 0 ); |
1003 | 0 | SYMCRYPT_FORCE_WRITE64( &B, 0 ); |
1004 | 0 | SYMCRYPT_FORCE_WRITE64( &C, 0 ); |
1005 | 0 | SYMCRYPT_FORCE_WRITE64( &D, 0 ); |
1006 | 0 | SYMCRYPT_FORCE_WRITE64( &E, 0 ); |
1007 | 0 | SYMCRYPT_FORCE_WRITE64( &F, 0 ); |
1008 | 0 | SYMCRYPT_FORCE_WRITE64( &G, 0 ); |
1009 | 0 | SYMCRYPT_FORCE_WRITE64( &H, 0 ); |
1010 | 0 | } |
1011 | | |
1012 | | #undef MAJ |
1013 | | #undef CH |
1014 | | #undef CSIGMA0 |
1015 | | #undef CSIGMA1 |
1016 | | #undef LSIGMA0 |
1017 | | #undef LSIGMA1 |
1018 | | #undef CROUND |
1019 | | #undef IROUND |
1020 | | #undef FROUND |
1021 | | |
1022 | | //====================================================================================== |
1023 | | // Implementation using Xmm registers |
1024 | | // |
1025 | | #if SYMCRYPT_CPU_X86 // only on X86; AMD64 is faster when using UINT64s |
1026 | | |
1027 | | #if SYMCRYPT_MS_VC |
1028 | | #ifndef _mm_storeu_si64 |
1029 | | // Workaround missing intrinsic on some versions of MSVC |
1030 | | #define _mm_storeu_si64(p, a) (_mm_storel_epi64((__m128i*)(p), (a))) |
1031 | | #endif |
1032 | | #endif |
1033 | | |
1034 | | #define XMMADD( _a, _b ) _mm_add_epi64((_a), (_b)) |
1035 | | #define XMMAND( _a, _b ) _mm_and_si128((_a), (_b)) |
1036 | | #define XMMOR( _a, _b ) _mm_or_si128((_a), (_b)) |
1037 | | #define XMMROR( _a, _n ) _mm_xor_si128( _mm_slli_epi64( (_a), 64-(_n)), _mm_srli_epi64( (_a), (_n)) ) |
1038 | | #define XMMSHR( _a, _n ) _mm_srli_epi64((_a), (_n)) |
1039 | | #define XMMXOR( _a, _b ) _mm_xor_si128((_a), (_b)) |
1040 | | #define XMMSTORE_UINT64( _a, _addr ) _mm_storeu_si64((_addr), (_a)) |
1041 | | |
1042 | | #define XMMMAJ( x, y, z ) XMMOR( XMMAND( XMMOR( (z), (y)), (x)), XMMAND( (z), (y) ) ) |
1043 | | #define XMMCH( x, y, z ) XMMXOR( XMMAND( XMMXOR( (z), (y) ), (x)), (z)) |
1044 | | #define XMMCSIGMA0( x ) XMMXOR( XMMXOR( XMMROR((x), 28), XMMROR((x), 34)), XMMROR((x), 39)) |
1045 | | #define XMMCSIGMA1( x ) XMMXOR( XMMXOR( XMMROR((x), 14), XMMROR((x), 18)), XMMROR((x), 41)) |
1046 | | #define XMMLSIGMA0( x ) XMMXOR( XMMXOR( XMMROR((x), 1), XMMROR((x), 8)), XMMSHR((x), 7)) |
1047 | | #define XMMLSIGMA1( x ) XMMXOR( XMMXOR( XMMROR((x), 19), XMMROR((x), 61)), XMMSHR((x), 6)) |
1048 | | |
1049 | | // |
1050 | | // Core round takes two arguments: r16 = round number modulo 16, r = round number - r16. |
1051 | | // On entry, Wt must be equal to the sum of the round constant and the expanded message word for this round. |
1052 | | // Only the lower word of each Xmm register is used. |
1053 | | // |
1054 | | #define XMMCROUND( r16, r ) {;\ |
1055 | | ah[r16 & 7] = XMMADD( XMMADD( XMMADD( ah[r16 & 7], XMMCSIGMA1(ah[(r16+3)&7]) ), XMMCH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) ), Wt );\ |
1056 | | ah[(r16+4)&7] = XMMADD( ah[(r16+4)&7], ah[r16 &7] );\ |
1057 | | ah[r16 & 7] = XMMADD( XMMADD( ah[r16 & 7], XMMCSIGMA0(ah[(r16+7)&7])), XMMMAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]) );\ |
1058 | | } |
1059 | | |
1060 | | #pragma warning( disable: 4127 ) // conditional expression is constant |
1061 | | |
1062 | | // |
1063 | | // Initial round; reads data and performs a round. |
1064 | | // Data is read in 128-bit chunks every other round. |
1065 | | // |
1066 | | #define XMMIROUND( r ) {\ |
1067 | | if( (r&1) == 0 ) \ |
1068 | | { \ |
1069 | | Wt = _mm_loadu_si128( (__m128i *)&pbData[ 8*r ] ); \ |
1070 | | Wt = _mm_shuffle_epi8( Wt, BYTE_REVERSE_64 ); \ |
1071 | | W[r/2] = Wt; \ |
1072 | | Wt = XMMADD( Wt, _mm_load_si128( (__m128i *)&SymCryptSha512K[r] ) ); \ |
1073 | | Ws = _mm_srli_si128( Wt, 8 ); \ |
1074 | | } else {\ |
1075 | | Wt = Ws;\ |
1076 | | }\ |
1077 | | XMMCROUND( r, r );\ |
1078 | | } |
1079 | | |
1080 | | // |
1081 | | // Working version of XMMIROUND: |
1082 | | // Wt = XMMFROM_MSBF( &pbData[ 8*r ] );\ |
1083 | | // W[r] = Wt;\ |
1084 | | // Wt = XMMADD( XMMFROM_UINT64(SymCryptSha512K[r]), Wt );\ |
1085 | | // XMMCROUND(r,r);\ |
1086 | | |
1087 | | #define XMMFROUND(r16, rb) { \ |
1088 | | if( (r16 & 1) == 0 ) \ |
1089 | | {\ |
1090 | | Wt = XMMADD( XMMADD( XMMADD( XMMLSIGMA1( W[((r16 - 2)&15)/2] ), \ |
1091 | | _mm_alignr_epi8( W[((r16 - 6)&15)/2], W[((r16 - 7)&15)/2], 8 ) ), \ |
1092 | | XMMLSIGMA0( _mm_alignr_epi8( W[((r16 - 14)&15)/2], W[((r16 - 15)&15)/2], 8 ) ) ), \ |
1093 | | W[((r16 - 16)&15)/2] ); \ |
1094 | | W[r16/2] = Wt;\ |
1095 | | Ws = _mm_load_si128( (__m128i *)&SymCryptSha512K[r16 + rb] );\ |
1096 | | Wt = XMMADD( Ws , Wt );\ |
1097 | | Ws = _mm_srli_si128( Wt, 8 );\ |
1098 | | } else {\ |
1099 | | Wt = Ws;\ |
1100 | | }\ |
1101 | | XMMCROUND( r16, r16+rb ); \ |
1102 | | } |
1103 | | |
1104 | | VOID |
1105 | | SYMCRYPT_CALL |
1106 | | SymCryptSha512AppendBlocks_xmm( |
1107 | | _Inout_ SYMCRYPT_SHA512_CHAINING_STATE * pChain, |
1108 | | _In_reads_(cbData) PCBYTE pbData, |
1109 | | SIZE_T cbData, |
1110 | | _Out_ SIZE_T * pcbRemaining ) |
1111 | | { |
1112 | | SYMCRYPT_ALIGN __m128i W[8]; // message expansion buffer, 8 elements each storing 2 consecutive UINT64s |
1113 | | SYMCRYPT_ALIGN __m128i ah[8]; |
1114 | | SYMCRYPT_ALIGN __m128i feedf[8]; |
1115 | | int round; |
1116 | | __m128i Wt, Ws; |
1117 | | const __m128i BYTE_REVERSE_64 = _mm_set_epi8( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ); |
1118 | | |
1119 | | Wt = _mm_loadu_si128( (__m128i *) &pChain->H[0] ); |
1120 | | feedf[7] = ah[7] = Wt; |
1121 | | feedf[6] = ah[6] = _mm_srli_si128( Wt, 8 ); |
1122 | | Wt = _mm_loadu_si128( (__m128i *) &pChain->H[2] ); |
1123 | | feedf[5] = ah[5] = Wt; |
1124 | | feedf[4] = ah[4] = _mm_srli_si128( Wt, 8 ); |
1125 | | Wt = _mm_loadu_si128( (__m128i *) &pChain->H[4] ); |
1126 | | feedf[3] = ah[3] = Wt; |
1127 | | feedf[2] = ah[2] = _mm_srli_si128( Wt, 8 ); |
1128 | | Wt = _mm_loadu_si128( (__m128i *) &pChain->H[6] ); |
1129 | | feedf[1] = ah[1] = Wt; |
1130 | | feedf[0] = ah[0] = _mm_srli_si128( Wt, 8 ); |
1131 | | |
1132 | | while( cbData >= 128 ) |
1133 | | { |
1134 | | // |
1135 | | // initial rounds 1 to 16 |
1136 | | // |
1137 | | |
1138 | | XMMIROUND( 0 ); |
1139 | | XMMIROUND( 1 ); |
1140 | | XMMIROUND( 2 ); |
1141 | | XMMIROUND( 3 ); |
1142 | | XMMIROUND( 4 ); |
1143 | | XMMIROUND( 5 ); |
1144 | | XMMIROUND( 6 ); |
1145 | | XMMIROUND( 7 ); |
1146 | | XMMIROUND( 8 ); |
1147 | | XMMIROUND( 9 ); |
1148 | | XMMIROUND( 10 ); |
1149 | | XMMIROUND( 11 ); |
1150 | | XMMIROUND( 12 ); |
1151 | | XMMIROUND( 13 ); |
1152 | | XMMIROUND( 14 ); |
1153 | | XMMIROUND( 15 ); |
1154 | | |
1155 | | for( round=16; round<80; round += 16 ) |
1156 | | { |
1157 | | XMMFROUND( 0, round ); |
1158 | | XMMFROUND( 1, round ); |
1159 | | XMMFROUND( 2, round ); |
1160 | | XMMFROUND( 3, round ); |
1161 | | XMMFROUND( 4, round ); |
1162 | | XMMFROUND( 5, round ); |
1163 | | XMMFROUND( 6, round ); |
1164 | | XMMFROUND( 7, round ); |
1165 | | XMMFROUND( 8, round ); |
1166 | | XMMFROUND( 9, round ); |
1167 | | XMMFROUND( 10, round ); |
1168 | | XMMFROUND( 11, round ); |
1169 | | XMMFROUND( 12, round ); |
1170 | | XMMFROUND( 13, round ); |
1171 | | XMMFROUND( 14, round ); |
1172 | | XMMFROUND( 15, round ); |
1173 | | } |
1174 | | |
1175 | | feedf[0] = ah[0] = XMMADD( ah[0], feedf[0] ); |
1176 | | feedf[1] = ah[1] = XMMADD( ah[1], feedf[1] ); |
1177 | | feedf[2] = ah[2] = XMMADD( ah[2], feedf[2] ); |
1178 | | feedf[3] = ah[3] = XMMADD( ah[3], feedf[3] ); |
1179 | | feedf[4] = ah[4] = XMMADD( ah[4], feedf[4] ); |
1180 | | feedf[5] = ah[5] = XMMADD( ah[5], feedf[5] ); |
1181 | | feedf[6] = ah[6] = XMMADD( ah[6], feedf[6] ); |
1182 | | feedf[7] = ah[7] = XMMADD( ah[7], feedf[7] ); |
1183 | | |
1184 | | pbData += 128; |
1185 | | cbData -= 128; |
1186 | | |
1187 | | } |
1188 | | |
1189 | | XMMSTORE_UINT64( ah[7], &(pChain->H[0]) ); |
1190 | | XMMSTORE_UINT64( ah[6], &(pChain->H[1]) ); |
1191 | | XMMSTORE_UINT64( ah[5], &(pChain->H[2]) ); |
1192 | | XMMSTORE_UINT64( ah[4], &(pChain->H[3]) ); |
1193 | | XMMSTORE_UINT64( ah[3], &(pChain->H[4]) ); |
1194 | | XMMSTORE_UINT64( ah[2], &(pChain->H[5]) ); |
1195 | | XMMSTORE_UINT64( ah[1], &(pChain->H[6]) ); |
1196 | | XMMSTORE_UINT64( ah[0], &(pChain->H[7]) ); |
1197 | | |
1198 | | *pcbRemaining = cbData; |
1199 | | |
1200 | | // |
1201 | | // Wipe the variables; |
1202 | | // |
1203 | | SymCryptWipeKnownSize( ah, sizeof( ah ) ); |
1204 | | SymCryptWipeKnownSize( feedf, sizeof( feedf ) ); |
1205 | | SymCryptWipeKnownSize( W, sizeof( W ) ); |
1206 | | SymCryptWipeKnownSize( &Wt, sizeof( Wt )); |
1207 | | SymCryptWipeKnownSize( &Ws, sizeof( Ws )); |
1208 | | } |
1209 | | |
1210 | | #endif |
1211 | | |
1212 | | |
1213 | | |
1214 | | //====================================================================================== |
1215 | | // Implementation using NEON registers |
1216 | | // |
1217 | | #if SYMCRYPT_CPU_ARM |
1218 | | |
1219 | | |
1220 | | #define ROR( _a, _n ) vorr_u64( vshl_n_u64( _a, 64 - _n ), vshr_n_u64( _a, _n ) ) |
1221 | | #define ADD( x, y ) vadd_u64( (x), (y) ) |
1222 | | |
1223 | | #define MAJ( x, y, z ) vorr_u64( vand_u64( vorr_u64( (z), (y)), (x)), vand_u64( (z), (y) ) ) |
1224 | | #define CH( x, y, z ) veor_u64( vand_u64( veor_u64( (z), (y) ), (x)), (z)) |
1225 | | #define CSIGMA0( x ) veor_u64( veor_u64( ROR((x), 28), ROR((x), 34)), ROR((x), 39)) |
1226 | | #define CSIGMA1( x ) veor_u64( veor_u64( ROR((x), 14), ROR((x), 18)), ROR((x), 41)) |
1227 | | #define LSIGMA0( x ) veor_u64( veor_u64( ROR((x), 1), ROR((x), 8)), vshr_n_u64((x), 7)) |
1228 | | #define LSIGMA1( x ) veor_u64( veor_u64( ROR((x), 19), ROR((x), 61)), vshr_n_u64((x), 6)) |
1229 | | |
1230 | | // |
1231 | | // r = round number, r16 = r mod 16 (often a compile-time constant when r is not) |
1232 | | // |
1233 | | #define CROUND( a, b, c, d, e, f, g, h, r, r16 ) {\ |
1234 | | W[r16] = Wt; \ |
1235 | | h = ADD( h, ADD( ADD( ADD( CSIGMA1(e), CH(e, f, g)), *(__n64 *)&SymCryptSha512K[r]), Wt ));\ |
1236 | | d = ADD( d, h );\ |
1237 | | h = ADD( h, ADD( CSIGMA0(a), MAJ(a, b, c)));\ |
1238 | | } |
1239 | | |
1240 | | // |
1241 | | // Initial round that reads the message. |
1242 | | // r is the round number 0..15 |
1243 | | // |
1244 | | #define IROUND( a, b, c, d, e, f, g, h, r ) {\ |
1245 | | Wt = vmov_n_u64( SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8*r ] ) );\ |
1246 | | CROUND( a, b, c, d, e, f, g, h, r, r);\ |
1247 | | } |
1248 | | // |
1249 | | // Subsequent rounds. |
1250 | | // r is the round number, r16 is the round number mod 16. |
1251 | | // These are separate as typically r is run-time and r16 is compile time constant. |
1252 | | // |
1253 | | #define FROUND( a, b, c, d, e, f, g, h, r, r16 ) { \ |
1254 | | Wt = ADD( ADD( LSIGMA1( W[(r16-2) & 15] ), LSIGMA0( W[(r16-15) & 15])) , ADD( W[(r16-7) & 15], W[r16 & 15])); \ |
1255 | | CROUND( a, b, c, d, e, f, g, h, r, r16 ); \ |
1256 | | } |
1257 | | |
1258 | | // |
1259 | | // This is the core routine that does the actual hard work |
1260 | | // This is based on the older one in RSA32LIB by Scott Field from 2001 |
1261 | | // |
1262 | | VOID |
1263 | | SYMCRYPT_CALL |
1264 | | SymCryptSha512AppendBlocks_neon( |
1265 | | _Inout_ SYMCRYPT_SHA512_CHAINING_STATE * pChain, |
1266 | | _In_reads_(cbData) PCBYTE pbData, |
1267 | | SIZE_T cbData, |
1268 | | _Out_ SIZE_T * pcbRemaining ) |
1269 | | { |
1270 | | SYMCRYPT_ALIGN __n64 W[16]; |
1271 | | __n64 A, B, C, D, E, F, G, H; |
1272 | | int round; |
1273 | | __n64 Wt; |
1274 | | __n64 * pH = (__n64 *) &pChain->H[0]; |
1275 | | |
1276 | | A = pH[0]; |
1277 | | B = pH[1]; |
1278 | | C = pH[2]; |
1279 | | D = pH[3]; |
1280 | | E = pH[4]; |
1281 | | F = pH[5]; |
1282 | | G = pH[6]; |
1283 | | H = pH[7]; |
1284 | | |
1285 | | while( cbData >= 128 ) |
1286 | | { |
1287 | | // |
1288 | | // initial rounds 1 to 16 |
1289 | | // |
1290 | | |
1291 | | IROUND( A, B, C, D, E, F, G, H, 0 ); |
1292 | | IROUND( H, A, B, C, D, E, F, G, 1 ); |
1293 | | IROUND( G, H, A, B, C, D, E, F, 2 ); |
1294 | | IROUND( F, G, H, A, B, C, D, E, 3 ); |
1295 | | IROUND( E, F, G, H, A, B, C, D, 4 ); |
1296 | | IROUND( D, E, F, G, H, A, B, C, 5 ); |
1297 | | IROUND( C, D, E, F, G, H, A, B, 6 ); |
1298 | | IROUND( B, C, D, E, F, G, H, A, 7 ); |
1299 | | IROUND( A, B, C, D, E, F, G, H, 8 ); |
1300 | | IROUND( H, A, B, C, D, E, F, G, 9 ); |
1301 | | IROUND( G, H, A, B, C, D, E, F, 10 ); |
1302 | | IROUND( F, G, H, A, B, C, D, E, 11 ); |
1303 | | IROUND( E, F, G, H, A, B, C, D, 12 ); |
1304 | | IROUND( D, E, F, G, H, A, B, C, 13 ); |
1305 | | IROUND( C, D, E, F, G, H, A, B, 14 ); |
1306 | | IROUND( B, C, D, E, F, G, H, A, 15 ); |
1307 | | |
1308 | | for( round=16; round<80; round += 16 ) |
1309 | | { |
1310 | | FROUND( A, B, C, D, E, F, G, H, round + 0, 0 ); |
1311 | | FROUND( H, A, B, C, D, E, F, G, round + 1, 1 ); |
1312 | | FROUND( G, H, A, B, C, D, E, F, round + 2, 2 ); |
1313 | | FROUND( F, G, H, A, B, C, D, E, round + 3, 3 ); |
1314 | | FROUND( E, F, G, H, A, B, C, D, round + 4, 4 ); |
1315 | | FROUND( D, E, F, G, H, A, B, C, round + 5, 5 ); |
1316 | | FROUND( C, D, E, F, G, H, A, B, round + 6, 6 ); |
1317 | | FROUND( B, C, D, E, F, G, H, A, round + 7, 7 ); |
1318 | | FROUND( A, B, C, D, E, F, G, H, round + 8, 8 ); |
1319 | | FROUND( H, A, B, C, D, E, F, G, round + 9, 9 ); |
1320 | | FROUND( G, H, A, B, C, D, E, F, round + 10, 10 ); |
1321 | | FROUND( F, G, H, A, B, C, D, E, round + 11, 11 ); |
1322 | | FROUND( E, F, G, H, A, B, C, D, round + 12, 12 ); |
1323 | | FROUND( D, E, F, G, H, A, B, C, round + 13, 13 ); |
1324 | | FROUND( C, D, E, F, G, H, A, B, round + 14, 14 ); |
1325 | | FROUND( B, C, D, E, F, G, H, A, round + 15, 15 ); |
1326 | | } |
1327 | | |
1328 | | pH[0] = A = ADD( A, pH[0] ); |
1329 | | pH[1] = B = ADD( B, pH[1] ); |
1330 | | pH[2] = C = ADD( C, pH[2] ); |
1331 | | pH[3] = D = ADD( D, pH[3] ); |
1332 | | pH[4] = E = ADD( E, pH[4] ); |
1333 | | pH[5] = F = ADD( F, pH[5] ); |
1334 | | pH[6] = G = ADD( G, pH[6] ); |
1335 | | pH[7] = H = ADD( H, pH[7] ); |
1336 | | |
1337 | | pbData += 128; |
1338 | | cbData -= 128; |
1339 | | } |
1340 | | |
1341 | | *pcbRemaining = cbData; |
1342 | | |
1343 | | // |
1344 | | // Wipe the variables; |
1345 | | // |
1346 | | SymCryptWipeKnownSize( W, sizeof( W ) ); |
1347 | | SymCryptWipeKnownSize( &A, sizeof( A ) ); |
1348 | | SymCryptWipeKnownSize( &B, sizeof( B ) ); |
1349 | | SymCryptWipeKnownSize( &C, sizeof( C ) ); |
1350 | | SymCryptWipeKnownSize( &D, sizeof( D ) ); |
1351 | | SymCryptWipeKnownSize( &E, sizeof( E ) ); |
1352 | | SymCryptWipeKnownSize( &F, sizeof( F ) ); |
1353 | | SymCryptWipeKnownSize( &G, sizeof( G ) ); |
1354 | | SymCryptWipeKnownSize( &H, sizeof( H ) ); |
1355 | | SymCryptWipeKnownSize( &Wt, sizeof( Wt ) ); |
1356 | | } |
1357 | | |
1358 | | #endif |
1359 | | |
1360 | | //====================================================================================== |
1361 | | // |
1362 | | // Switch between different implementations of compression function |
1363 | | // |
1364 | | //FORCEINLINE |
1365 | | VOID |
1366 | | SYMCRYPT_CALL |
1367 | | SymCryptSha512AppendBlocks( |
1368 | | _Inout_ SYMCRYPT_SHA512_CHAINING_STATE * pChain, |
1369 | | _In_reads_( cbData ) PCBYTE pbData, |
1370 | | SIZE_T cbData, |
1371 | | _Out_ SIZE_T * pcbRemaining ) |
1372 | 52.1k | { |
1373 | 52.1k | #if SYMCRYPT_CPU_AMD64 |
1374 | | |
1375 | | // Temporarily disabling use of Ymm in SHA2 |
1376 | | // SYMCRYPT_EXTENDED_SAVE_DATA SaveData; |
1377 | | |
1378 | | // if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURE_AVX512 | SYMCRYPT_CPU_FEATURE_BMI2) && |
1379 | | // SymCryptSaveYmm(&SaveData) == SYMCRYPT_NO_ERROR) |
1380 | | // { |
1381 | | // SymCryptSha512AppendBlocks_ymm_avx512vl_asm(pChain, pbData, cbData, pcbRemaining); |
1382 | | |
1383 | | // SymCryptRestoreYmm(&SaveData); |
1384 | | // } |
1385 | | // else if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURE_AVX2 | SYMCRYPT_CPU_FEATURE_BMI2) && |
1386 | | // SymCryptSaveYmm(&SaveData) == SYMCRYPT_NO_ERROR) |
1387 | | // { |
1388 | | // //SymCryptSha512AppendBlocks_ymm_1block(pChain, pbData, cbData, pcbRemaining); |
1389 | | // //SymCryptSha512AppendBlocks_ymm_2blocks(pChain, pbData, cbData, pcbRemaining); |
1390 | | // //SymCryptSha512AppendBlocks_ymm_4blocks(pChain, pbData, cbData, pcbRemaining); |
1391 | | // SymCryptSha512AppendBlocks_ymm_avx2_asm(pChain, pbData, cbData, pcbRemaining); |
1392 | | |
1393 | | // SymCryptRestoreYmm(&SaveData); |
1394 | | // } |
1395 | | // else |
1396 | 52.1k | { |
1397 | 52.1k | SymCryptSha512AppendBlocks_ull( pChain, pbData, cbData, pcbRemaining ); |
1398 | | //SymCryptSha512AppendBlocks_ull2( pChain, pbData, cbData, pcbRemaining ); |
1399 | | //SymCryptSha512AppendBlocks_ull3( pChain, pbData, cbData, pcbRemaining ); |
1400 | 52.1k | } |
1401 | | |
1402 | | |
1403 | | #elif SYMCRYPT_CPU_ARM |
1404 | | |
1405 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) ) |
1406 | | { |
1407 | | SymCryptSha512AppendBlocks_neon( pChain, pbData, cbData, pcbRemaining ); // Tegra T3: 48 c/B |
1408 | | } else { |
1409 | | SymCryptSha512AppendBlocks_ull( pChain, pbData, cbData, pcbRemaining ); // Tegra T3: 65.34 c/B |
1410 | | //SymCryptSha512AppendBlocks_ull2( pChain, pbData, cbData, pcbRemaining ); // Tegra T3: 77.4 c/B |
1411 | | //SymCryptSha512AppendBlocks_ull3( pChain, pbData, cbData, pcbRemaining ); // Tegra T3: 71.6 c/B |
1412 | | } |
1413 | | |
1414 | | #elif SYMCRYPT_CPU_X86 |
1415 | | |
1416 | | SYMCRYPT_EXTENDED_SAVE_DATA SaveData; |
1417 | | |
1418 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSSE3 ) && SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR ) |
1419 | | { |
1420 | | SymCryptSha512AppendBlocks_xmm( pChain, pbData, cbData, pcbRemaining ); |
1421 | | SymCryptRestoreXmm( &SaveData ); |
1422 | | } else { |
1423 | | SymCryptSha512AppendBlocks_ull( pChain, pbData, cbData, pcbRemaining ); // core2: 36.40 c/B |
1424 | | //SymCryptSha512AppendBlocks_ull2( pChain, pbData, cbData, pcbRemaining ); // core2: 49.09 c/B |
1425 | | //SymCryptSha512AppendBlocks_ull3( pChain, pbData, cbData, pcbRemaining ); // core2: 38.29 c/B |
1426 | | } |
1427 | | |
1428 | | #else |
1429 | | |
1430 | | SymCryptSha512AppendBlocks_ull( pChain, pbData, cbData, pcbRemaining ); // need tuning... |
1431 | | |
1432 | | #endif |
1433 | 52.1k | } |
1434 | | |
1435 | | |