Coverage Report

Created: 2024-11-21 07:03

/src/SymCrypt/lib/sha512.c
Line
Count
Source (jump to first uncovered line)
1
//
2
// Sha512.c
3
//
4
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
5
//
6
7
//
8
// This module contains the routines to implement SHA2-512 from FIPS 180-2
9
//
10
11
12
#include "precomp.h"
13
14
//
15
// SHA-512 uses 80 magic constants of 64 bits each. These are
16
// referred to as K^{512}_i for i=0...79 by FIPS 180-2.
17
// We use a static array as that does not pollute the linker name space
18
// For performance we align to the cache line size of 64 bytes
19
// We have one extra value at the end to allow an XMM read from each element
20
// of the array.
21
//
22
SYMCRYPT_ALIGN_AT( 64 ) const  UINT64 SymCryptSha512K[81] = {
23
    0x428a2f98d728ae22UL, 0x7137449123ef65cdUL,
24
    0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
25
    0x3956c25bf348b538UL, 0x59f111f1b605d019UL,
26
    0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL,
27
    0xd807aa98a3030242UL, 0x12835b0145706fbeUL,
28
    0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL,
29
    0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL,
30
    0x9bdc06a725c71235UL, 0xc19bf174cf692694UL,
31
    0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL,
32
    0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL,
33
    0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL,
34
    0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL,
35
    0x983e5152ee66dfabUL, 0xa831c66d2db43210UL,
36
    0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL,
37
    0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL,
38
    0x06ca6351e003826fUL, 0x142929670a0e6e70UL,
39
    0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL,
40
    0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL,
41
    0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL,
42
    0x81c2c92e47edaee6UL, 0x92722c851482353bUL,
43
    0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL,
44
    0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL,
45
    0xd192e819d6ef5218UL, 0xd69906245565a910UL,
46
    0xf40e35855771202aUL, 0x106aa07032bbd1b8UL,
47
    0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL,
48
    0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL,
49
    0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL,
50
    0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL,
51
    0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL,
52
    0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL,
53
    0x90befffa23631e28UL, 0xa4506cebde82bde9UL,
54
    0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL,
55
    0xca273eceea26619cUL, 0xd186b8c721c0c207UL,
56
    0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL,
57
    0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL,
58
    0x113f9804bef90daeUL, 0x1b710b35131c471bUL,
59
    0x28db77f523047d84UL, 0x32caab7b40c72493UL,
60
    0x3c9ebe0a15c9bebcUL, 0x431d67c49c100d4cUL,
61
    0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL,
62
    0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL,
63
};
64
65
//
66
// Initial states
67
//
68
const UINT64 SymCryptSha512InitialState[8] = {
69
    0x6a09e667f3bcc908UL,
70
    0xbb67ae8584caa73bUL,
71
    0x3c6ef372fe94f82bUL,
72
    0xa54ff53a5f1d36f1UL,
73
    0x510e527fade682d1UL,
74
    0x9b05688c2b3e6c1fUL,
75
    0x1f83d9abfb41bd6bUL,
76
    0x5be0cd19137e2179UL,
77
};
78
79
const UINT64 SymCryptSha384InitialState[8] = {
80
    0xcbbb9d5dc1059ed8UL,
81
    0x629a292a367cd507UL,
82
    0x9159015a3070dd17UL,
83
    0x152fecd8f70e5939UL,
84
    0x67332667ffc00b31UL,
85
    0x8eb44a8768581511UL,
86
    0xdb0c2e0d64f98fa7UL,
87
    0x47b5481dbefa4fa4UL,
88
};
89
90
91
//
92
// Todo: this structure pulls in the SHA-384 code anytime someone uses
93
// SHA-512; should be split into a separate file.
94
//
95
const SYMCRYPT_HASH SymCryptSha384Algorithm_default = {
96
    &SymCryptSha384Init,
97
    &SymCryptSha384Append,
98
    &SymCryptSha384Result,
99
    &SymCryptSha512AppendBlocks,
100
    &SymCryptSha384StateCopy,
101
    sizeof( SYMCRYPT_SHA384_STATE ),
102
    SYMCRYPT_SHA384_RESULT_SIZE,
103
    SYMCRYPT_SHA384_INPUT_BLOCK_SIZE,
104
    SYMCRYPT_FIELD_OFFSET( SYMCRYPT_SHA384_STATE, chain ),
105
    SYMCRYPT_FIELD_SIZE( SYMCRYPT_SHA384_STATE, chain ),
106
};
107
108
const SYMCRYPT_HASH SymCryptSha512Algorithm_default = {
109
    &SymCryptSha512Init,
110
    &SymCryptSha512Append,
111
    &SymCryptSha512Result,
112
    &SymCryptSha512AppendBlocks,
113
    &SymCryptSha512StateCopy,
114
    sizeof( SYMCRYPT_SHA512_STATE ),
115
    SYMCRYPT_SHA512_RESULT_SIZE,
116
    SYMCRYPT_SHA512_INPUT_BLOCK_SIZE,
117
    SYMCRYPT_FIELD_OFFSET( SYMCRYPT_SHA512_STATE, chain ),
118
    SYMCRYPT_FIELD_SIZE( SYMCRYPT_SHA512_STATE, chain ),
119
};
120
121
const PCSYMCRYPT_HASH SymCryptSha384Algorithm = &SymCryptSha384Algorithm_default;
122
const PCSYMCRYPT_HASH SymCryptSha512Algorithm = &SymCryptSha512Algorithm_default;
123
124
//
125
// SymCryptSha384
126
//
127
#define ALG SHA384
128
#define Alg Sha384
129
#include "hash_pattern.c"
130
#undef ALG
131
#undef Alg
132
133
//
134
// SymCryptSha512
135
//
136
#define ALG SHA512
137
#define Alg Sha512
138
#include "hash_pattern.c"
139
#undef ALG
140
#undef Alg
141
142
143
SYMCRYPT_NOINLINE
144
VOID
145
SYMCRYPT_CALL
146
SymCryptSha512Init( _Out_ PSYMCRYPT_SHA512_STATE pState )
147
416
{
148
416
    SYMCRYPT_SET_MAGIC( pState );
149
150
416
    pState->dataLengthH = 0;
151
416
    pState->dataLengthL = 0;
152
416
    pState->bytesInBuffer = 0;
153
154
416
    memcpy( &pState->chain.H[0], &SymCryptSha512InitialState[0], sizeof( SymCryptSha512InitialState ) );
155
156
    //
157
    // There is no need to initialize the buffer part of the state as that will be
158
    // filled before it is used.
159
    //
160
416
}
161
162
163
SYMCRYPT_NOINLINE
164
VOID
165
SYMCRYPT_CALL
166
SymCryptSha384Init( _Out_ PSYMCRYPT_SHA384_STATE pState )
167
12.4k
{
168
12.4k
    SYMCRYPT_SET_MAGIC( pState );
169
170
12.4k
    pState->dataLengthH = 0;
171
12.4k
    pState->dataLengthL = 0;
172
12.4k
    pState->bytesInBuffer = 0;
173
174
12.4k
    memcpy( &pState->chain.H[0], &SymCryptSha384InitialState[0], sizeof( SymCryptSha384InitialState ) );
175
176
    //
177
    // There is no need to initialize the buffer part of the state as that will be
178
    // filled before it is used.
179
    //
180
12.4k
}
181
182
183
SYMCRYPT_NOINLINE
184
VOID
185
SYMCRYPT_CALL
186
SymCryptSha512Append(
187
    _Inout_                 PSYMCRYPT_SHA512_STATE  pState,
188
    _In_reads_( cbData )    PCBYTE                  pbData,
189
                            SIZE_T                  cbData )
190
96.5k
{
191
96.5k
    UINT32 bytesInBuffer;
192
96.5k
    UINT32 freeInBuffer;
193
96.5k
    SIZE_T tmp;
194
195
96.5k
    SYMCRYPT_CHECK_MAGIC( pState );
196
197
96.5k
    pState->dataLengthL += cbData;
198
96.5k
    if( pState->dataLengthL < cbData ) {
199
0
        pState->dataLengthH++;
200
0
    }
201
202
96.5k
    bytesInBuffer = pState->bytesInBuffer;
203
204
    //
205
    // If previous data in buffer, buffer new input and transform if possible.
206
    //
207
96.5k
    if( bytesInBuffer > 0 )
208
81.8k
    {
209
81.8k
        SYMCRYPT_ASSERT( SYMCRYPT_SHA512_INPUT_BLOCK_SIZE > bytesInBuffer );
210
211
81.8k
        freeInBuffer = SYMCRYPT_SHA512_INPUT_BLOCK_SIZE - bytesInBuffer;
212
81.8k
        if( cbData < freeInBuffer )
213
71.8k
        {
214
            //
215
            // All the data will fit in the buffer.
216
            // We don't do anything here.
217
            // As cbData < inputBlockSize the bulk data processing is skipped,
218
            // and the data will be copied to the buffer at the end
219
            // of this code.
220
71.8k
        } else {
221
            //
222
            // Enough data to fill the whole buffer & process it
223
            //
224
10.0k
            memcpy(&pState->buffer[bytesInBuffer], pbData, freeInBuffer);
225
10.0k
            pbData += freeInBuffer;
226
10.0k
            cbData -= freeInBuffer;
227
10.0k
            SymCryptSha512AppendBlocks( &pState->chain, &pState->buffer[0], SYMCRYPT_SHA512_INPUT_BLOCK_SIZE, &tmp );
228
229
10.0k
            bytesInBuffer = 0;
230
10.0k
        }
231
81.8k
    }
232
233
    //
234
    // Internal buffer is empty; process all remaining whole blocks in the input
235
    //
236
96.5k
    if( cbData >= SYMCRYPT_SHA512_INPUT_BLOCK_SIZE )
237
9.26k
    {
238
9.26k
        SymCryptSha512AppendBlocks( &pState->chain, pbData, cbData, &tmp );
239
9.26k
        SYMCRYPT_ASSERT( tmp < SYMCRYPT_SHA512_INPUT_BLOCK_SIZE );
240
9.26k
        pbData += cbData - tmp;
241
9.26k
        cbData = tmp;
242
9.26k
    }
243
244
96.5k
    SYMCRYPT_ASSERT( cbData < SYMCRYPT_SHA512_INPUT_BLOCK_SIZE );
245
246
    //
247
    // buffer remaining input if necessary.
248
    //
249
96.5k
    if( cbData > 0 )
250
43.2k
    {
251
43.2k
        memcpy( &pState->buffer[bytesInBuffer], pbData, cbData );
252
43.2k
        bytesInBuffer += (UINT32) cbData;
253
43.2k
    }
254
255
96.5k
    pState->bytesInBuffer = bytesInBuffer;
256
257
96.5k
}
258
259
SYMCRYPT_NOINLINE
260
VOID
261
SYMCRYPT_CALL
262
SymCryptSha384Append(
263
    _Inout_                 PSYMCRYPT_SHA384_STATE  pState,
264
    _In_reads_( cbData )    PCBYTE                  pbData,
265
                            SIZE_T                  cbData )
266
25.6k
{
267
268
25.6k
    SymCryptSha512Append( (PSYMCRYPT_SHA512_STATE)pState, pbData, cbData );
269
270
25.6k
}
271
272
273
SYMCRYPT_NOINLINE
274
VOID
275
SYMCRYPT_CALL
276
SymCryptSha512Result(
277
    _Inout_                                     PSYMCRYPT_SHA512_STATE  pState,
278
    _Out_writes_( SYMCRYPT_SHA512_RESULT_SIZE ) PBYTE                   pbResult )
279
28.7k
{
280
28.7k
    UINT32 bytesInBuffer;
281
28.7k
    SIZE_T tmp;
282
283
28.7k
    SYMCRYPT_CHECK_MAGIC( pState );
284
285
28.7k
    bytesInBuffer = pState->bytesInBuffer;
286
287
    //
288
    // The buffer is never completely full, so we can always put the first
289
    // padding byte in.
290
    //
291
28.7k
    pState->buffer[bytesInBuffer++] = 0x80;
292
293
28.7k
    if( bytesInBuffer > 128-16 ) {
294
        //
295
        // No room for the rest of the padding. Pad with zeroes & process block
296
        // bytesInBuffer is at most 128, so we do not have an integer underflow
297
        //
298
3.10k
        SymCryptWipe( &pState->buffer[bytesInBuffer], 128-bytesInBuffer );
299
3.10k
        SymCryptSha512AppendBlocks( &pState->chain, pState->buffer, 128, &tmp );
300
3.10k
        bytesInBuffer = 0;
301
3.10k
    }
302
303
    //
304
    // Set rest of padding
305
    // We wipe to the end of the buffer as it is 16-aligned,
306
    // and it is faster to wipe to an aligned point
307
    //
308
28.7k
    SymCryptWipe( &pState->buffer[bytesInBuffer], 128-bytesInBuffer );
309
28.7k
    SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[128-16], (pState->dataLengthH << 3) + (pState->dataLengthL >> 61)  );
310
28.7k
    SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[128- 8], (pState->dataLengthL << 3) );
311
312
28.7k
    SymCryptSha512AppendBlocks( &pState->chain, pState->buffer, 128, &tmp );
313
314
28.7k
    SymCryptUint64ToMsbFirst( &pState->chain.H[0], pbResult, 8 );
315
316
    //
317
    // We have to wipe the whole state because the Init call
318
    // might be optimized away by a smart compiler.
319
    //
320
28.7k
    SymCryptWipeKnownSize( pState, sizeof( *pState ) );
321
322
28.7k
    SYMCRYPT_SET_MAGIC( pState );
323
324
28.7k
    memcpy( &pState->chain.H[0], &SymCryptSha512InitialState[0], sizeof( SymCryptSha512InitialState ) );
325
28.7k
    }
326
327
SYMCRYPT_NOINLINE
328
VOID
329
SYMCRYPT_CALL
330
SymCryptSha384Result(
331
    _Inout_                                     PSYMCRYPT_SHA384_STATE  pState,
332
    _Out_writes_( SYMCRYPT_SHA384_RESULT_SIZE ) PBYTE                   pbResult )
333
12.2k
{
334
    //
335
    // For simplicity we re-use SymCryptSha512Result. This is slightly slower,
336
    // but SHA-384 isn't used that much.
337
    //
338
12.2k
    SYMCRYPT_ALIGN BYTE sha512Result[SYMCRYPT_SHA512_RESULT_SIZE];      // Buffer for SHA-512 output
339
340
    //
341
    // The SHA-384 result is the first 48 bytes of the SHA-512 result of our state
342
    //
343
12.2k
    SymCryptSha512Result( (PSYMCRYPT_SHA512_STATE)pState, sha512Result );
344
12.2k
    memcpy( pbResult, sha512Result, SYMCRYPT_SHA384_RESULT_SIZE );
345
346
    //
347
    // The buffer was already wiped by the SymCryptSha512Result function, we
348
    // just have to re-initialize for SHA-384
349
    //
350
12.2k
    SymCryptSha384Init( pState );
351
352
12.2k
    SymCryptWipeKnownSize( sha512Result, sizeof( sha512Result ) );
353
12.2k
}
354
355
356
VOID
357
SYMCRYPT_CALL
358
SymCryptSha512StateExportCore(
359
    _In_                                                    PCSYMCRYPT_SHA512_STATE pState,
360
    _Out_writes_bytes_( SYMCRYPT_SHA512_STATE_EXPORT_SIZE ) PBYTE                   pbBlob,
361
    _In_                                                    UINT32                  type )
362
0
{
363
0
    SYMCRYPT_ALIGN SYMCRYPT_SHA512_STATE_EXPORT_BLOB blob;           // local copy to have proper alignment.
364
0
    C_ASSERT( sizeof( blob ) == SYMCRYPT_SHA512_STATE_EXPORT_SIZE );
365
366
0
    SYMCRYPT_CHECK_MAGIC( pState );
367
368
0
    SymCryptWipeKnownSize( &blob, sizeof( blob ) ); // wipe to avoid any data leakage
369
370
0
    blob.header.magic = SYMCRYPT_BLOB_MAGIC;
371
0
    blob.header.size = SYMCRYPT_SHA512_STATE_EXPORT_SIZE;
372
0
    blob.header.type = type;
373
374
    //
375
    // Copy the relevant data. Buffer will be 0-padded.
376
    //
377
378
0
    SymCryptUint64ToMsbFirst( &pState->chain.H[0], &blob.chain[0], 8 );
379
0
    blob.dataLengthL = pState->dataLengthL;
380
0
    blob.dataLengthH = pState->dataLengthH;
381
0
    memcpy( &blob.buffer[0], &pState->buffer[0], blob.dataLengthL & 0x7f );
382
383
0
    SYMCRYPT_ASSERT( (PCBYTE) &blob + sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ) == (PCBYTE) &blob.trailer );
384
0
    SymCryptMarvin32( SymCryptMarvin32DefaultSeed, (PCBYTE) &blob, sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ), &blob.trailer.checksum[0] );
385
386
0
    memcpy( pbBlob, &blob, sizeof( blob ) );
387
388
//cleanup:
389
0
    SymCryptWipeKnownSize( &blob, sizeof( blob ) );
390
0
    return;
391
0
}
392
393
VOID
394
SYMCRYPT_CALL
395
SymCryptSha512StateExport(
396
    _In_                                                    PCSYMCRYPT_SHA512_STATE pState,
397
    _Out_writes_bytes_( SYMCRYPT_SHA512_STATE_EXPORT_SIZE ) PBYTE                   pbBlob )
398
0
{
399
0
    SymCryptSha512StateExportCore( pState, pbBlob, SymCryptBlobTypeSha512State );
400
0
}
401
402
VOID
403
SYMCRYPT_CALL
404
SymCryptSha384StateExport(
405
    _In_                                                    PCSYMCRYPT_SHA384_STATE pState,
406
    _Out_writes_bytes_( SYMCRYPT_SHA384_STATE_EXPORT_SIZE ) PBYTE                   pbBlob )
407
0
{
408
0
    SymCryptSha512StateExportCore( (PCSYMCRYPT_SHA512_STATE)pState, pbBlob, SymCryptBlobTypeSha384State );
409
0
}
410
411
412
SYMCRYPT_ERROR
413
SYMCRYPT_CALL
414
SymCryptSha512StateImportCore(
415
    _Out_                                                   PSYMCRYPT_SHA512_STATE  pState,
416
    _In_reads_bytes_( SYMCRYPT_SHA512_STATE_EXPORT_SIZE)    PCBYTE                  pbBlob,
417
    _In_                                                    UINT32                  type )
418
0
{
419
0
    SYMCRYPT_ERROR                                      scError = SYMCRYPT_NO_ERROR;
420
0
    SYMCRYPT_ALIGN SYMCRYPT_SHA512_STATE_EXPORT_BLOB    blob;                       // local copy to have proper alignment.
421
0
    BYTE                                                checksum[8];
422
423
0
    C_ASSERT( sizeof( blob ) == SYMCRYPT_SHA512_STATE_EXPORT_SIZE );
424
0
    memcpy( &blob, pbBlob, sizeof( blob ) );
425
426
0
    if( blob.header.magic != SYMCRYPT_BLOB_MAGIC ||
427
0
        blob.header.size != SYMCRYPT_SHA512_STATE_EXPORT_SIZE ||
428
0
        blob.header.type != type )
429
0
    {
430
0
        scError = SYMCRYPT_INVALID_BLOB;
431
0
        goto cleanup;
432
0
    }
433
434
0
    SymCryptMarvin32( SymCryptMarvin32DefaultSeed, (PCBYTE) &blob, sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ), checksum );
435
0
    if( memcmp( checksum, &blob.trailer.checksum[0], 8 ) != 0 )
436
0
    {
437
0
        scError = SYMCRYPT_INVALID_BLOB;
438
0
        goto cleanup;
439
0
    }
440
441
0
    SymCryptMsbFirstToUint64( &blob.chain[0], &pState->chain.H[0], 8 );
442
0
    pState->dataLengthL = blob.dataLengthL;
443
0
    pState->dataLengthH = blob.dataLengthH;
444
0
    pState->bytesInBuffer = blob.dataLengthL & 0x7f;
445
0
    memcpy( &pState->buffer[0], &blob.buffer[0], pState->bytesInBuffer );
446
447
0
    SYMCRYPT_SET_MAGIC( pState );
448
449
0
cleanup:
450
0
    SymCryptWipeKnownSize( &blob, sizeof(blob) );
451
0
    return scError;
452
0
}
453
454
SYMCRYPT_ERROR
455
SYMCRYPT_CALL
456
SymCryptSha512StateImport(
457
    _Out_                                                   PSYMCRYPT_SHA512_STATE  pState,
458
    _In_reads_bytes_( SYMCRYPT_SHA512_STATE_EXPORT_SIZE)    PCBYTE                  pbBlob )
459
0
{
460
0
    return SymCryptSha512StateImportCore( pState, pbBlob, SymCryptBlobTypeSha512State );
461
0
}
462
463
SYMCRYPT_ERROR
464
SYMCRYPT_CALL
465
SymCryptSha384StateImport(
466
    _Out_                                                   PSYMCRYPT_SHA384_STATE  pState,
467
    _In_reads_bytes_( SYMCRYPT_SHA384_STATE_EXPORT_SIZE)    PCBYTE                  pbBlob )
468
0
{
469
0
    return SymCryptSha512StateImportCore( (PSYMCRYPT_SHA512_STATE)pState, pbBlob, SymCryptBlobTypeSha384State );
470
0
}
471
472
473
474
//
475
// A simple test case intended for module testing for
476
// FIPS compliance.
477
// This is the one-block example message from FIPS 180-2 appendix C
478
//
479
480
const BYTE SymCryptSha512KATAnswer[64] =
481
{
482
    0xdd, 0xaf, 0x35, 0xa1, 0x93, 0x61, 0x7a, 0xba,
483
    0xcc, 0x41, 0x73, 0x49, 0xae, 0x20, 0x41, 0x31,
484
    0x12, 0xe6, 0xfa, 0x4e, 0x89, 0xa9, 0x7e, 0xa2,
485
    0x0a, 0x9e, 0xee, 0xe6, 0x4b, 0x55, 0xd3, 0x9a,
486
    0x21, 0x92, 0x99, 0x2a, 0x27, 0x4f, 0xc1, 0xa8,
487
    0x36, 0xba, 0x3c, 0x23, 0xa3, 0xfe, 0xeb, 0xbd,
488
    0x45, 0x4d, 0x44, 0x23, 0x64, 0x3c, 0xe8, 0x0e,
489
    0x2a, 0x9a, 0xc9, 0x4f, 0xa5, 0x4c, 0xa4, 0x9f,
490
};
491
492
VOID
493
SYMCRYPT_CALL
494
SymCryptSha512Selftest(void)
495
0
{
496
0
    BYTE result[SYMCRYPT_SHA512_RESULT_SIZE];
497
498
0
    SymCryptSha512( SymCryptTestMsg3, sizeof( SymCryptTestMsg3 ), result );
499
500
0
    SymCryptInjectError( result, sizeof( result ) );
501
502
0
    if( memcmp( result, SymCryptSha512KATAnswer, sizeof( result ) ) != 0 ) {
503
0
        SymCryptFatal( 'SH51' );
504
0
    }
505
0
}
506
507
//
508
// A simple test case intended for module testing for
509
// FIPS compliance.
510
// This is the one-block example message from FIPS 180-2 appendix D
511
//
512
513
const BYTE SymCryptSha384KATAnswer[ 48 ] =
514
{
515
    0xcb, 0x00, 0x75, 0x3f, 0x45, 0xa3, 0x5e, 0x8b,
516
    0xb5, 0xa0, 0x3d, 0x69, 0x9a, 0xc6, 0x50, 0x07,
517
    0x27, 0x2c, 0x32, 0xab, 0x0e, 0xde, 0xd1, 0x63,
518
    0x1a, 0x8b, 0x60, 0x5a, 0x43, 0xff, 0x5b, 0xed,
519
    0x80, 0x86, 0x07, 0x2b, 0xa1, 0xe7, 0xcc, 0x23,
520
    0x58, 0xba, 0xec, 0xa1, 0x34, 0xc8, 0x25, 0xa7,
521
};
522
523
VOID
524
SYMCRYPT_CALL
525
SymCryptSha384Selftest(void)
526
0
{
527
0
    BYTE result[SYMCRYPT_SHA384_RESULT_SIZE];
528
529
0
    SymCryptSha384( SymCryptTestMsg3, sizeof( SymCryptTestMsg3 ), result );
530
531
0
    SymCryptInjectError( result, sizeof( result ) );
532
533
0
    if( memcmp( result, SymCryptSha384KATAnswer, sizeof( result ) ) != 0 ) {
534
0
        SymCryptFatal( 'SH38' );
535
0
    }
536
0
}
537
538
//
539
// We keep multiple implementations in this file.
540
// This allows us to switch different platforms to different implementations, whichever
541
// is faster. Even if we don't use a particular implementation in one release,
542
// we keep it around in case it becomes the preferred one for a new CPU release.
543
// (Performance can change a lot with changes in micro-architecture.)
544
//
545
546
//===================================================================================
547
// Implementation of compression function using UINT64s
548
//
549
550
//
551
// For documentation on these functions see FIPS 180-2
552
//
553
// MAJ and CH are the functions Maj and Ch from the standard.
554
// CSIGMA0 and CSIGMA1 are the capital sigma functions.
555
// LSIGMA0 and LSIGMA1 are the lowercase sigma functions.
556
//
557
// The canonical definitions of the MAJ and CH functions are:
558
//#define MAJ( x, y, z )    (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
559
//#define CH( x, y, z )  (((x) & (y)) ^ ((~(x)) & (z)))
560
// We use optimized versions defined below
561
//
562
43.3M
#define MAJ( x, y, z )  ((((z) | (y)) & (x) ) | ((z) & (y)))
563
43.3M
#define CH( x, y, z )  ((((z) ^ (y)) & (x)) ^ (z))
564
565
//
566
// The four Sigma functions
567
//
568
569
//#define CSIGMA0( x )    (ROR64((x), 28) ^ ROR64((x), 34) ^ ROR64((x), 39))
570
//#define CSIGMA1( x )    (ROR64((x), 14) ^ ROR64((x), 18) ^ ROR64((x), 41))
571
//#define LSIGMA0( x )    (ROR64((x),  1) ^ ROR64((x),  8) ^ ((x)>> 7))
572
//#define LSIGMA1( x )    (ROR64((x), 19) ^ ROR64((x), 61) ^ ((x)>> 6))
573
574
43.3M
#define CSIGMA0( x )  (ROR64((ROR64((x), 6) ^ ROR64((x), 11) ^ (x)), 28))
575
43.3M
#define CSIGMA1( x )  (ROR64((ROR64((x), 4) ^ ROR64((x), 27) ^ (x)), 14))
576
34.6M
#define LSIGMA0( x )    (ROR64((x) ^ ROR64((x),  7),  1) ^ ((x)>> 7))
577
34.6M
#define LSIGMA1( x )    (ROR64((x) ^ ROR64((x), 42), 19) ^ ((x)>> 6))
578
579
580
581
//
582
// The values a-h were stored in an array called ah.
583
// We have unrolled the loop 16 times. This makes both the indices into
584
// the ah array constant, and it makes the message addressing constant.
585
// This provides a significant speed improvement, at the cost of making
586
// the main loop about 4 kB in code.
587
//
588
// Initial round; r16 is the round number mod 16
589
// ah[ r16   &7] = h
590
// ah[(r16+1)&7] = g;
591
// ah[(r16+2)&7] = f;
592
// ah[(r16+3)&7] = e;
593
// ah[(r16+4)&7] = d;
594
// ah[(r16+5)&7] = c;
595
// ah[(r16+6)&7] = b;
596
// ah[(r16+7)&7] = a;
597
//
598
// Unfortunately, the compiler seems to choke on this, allocating an extra variable for
599
// each of the array indices, with duplicate stores to both locations.
600
//
601
602
//
603
// The core round, after the message word has been computed for this round and put in Wt.
604
// r16 is the round number modulo 16. (Static after loop unrolling)
605
// r is the round number
606
43.3M
#define CROUND( a, b, c, d, e, f, g, h, r, r16 ) {;\
607
43.3M
    W[r16] = Wt; \
608
43.3M
    h += CSIGMA1(e) + CH(e, f, g) + SymCryptSha512K[r] + Wt;\
609
43.3M
    d += h;\
610
43.3M
    h += CSIGMA0(a) + MAJ(a, b, c);\
611
43.3M
}
612
613
//
614
// Initial round that reads the message.
615
// r is the round number 0..15
616
//
617
8.66M
#define IROUND( a, b, c, d, e, f, g, h, r ) {\
618
8.66M
    Wt = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8*r ] );\
619
8.66M
    CROUND( a, b, c, d, e, f, g, h, r, r);\
620
8.66M
    }
621
//
622
// Subsequent rounds.
623
// r is the round number, r16 is the round number mod 16.
624
// These are separate as typically r is run-time and r16 is compile time constant.
625
//
626
34.6M
#define FROUND( a, b, c, d, e, f, g, h, r, r16 ) {                                      \
627
34.6M
    Wt = LSIGMA1( W[(r16-2) & 15] ) +   W[(r16-7) & 15] +    \
628
34.6M
         LSIGMA0( W[(r16-15) & 15]) +   W[r16 & 15];       \
629
34.6M
    CROUND( a, b, c, d, e, f, g, h, r, r16 ); \
630
34.6M
    }
631
632
//
633
// This is the core routine that does the actual hard work
634
// This is based on the older one in RSA32LIB by Scott Field from 2001
635
//
636
VOID
637
SYMCRYPT_CALL
638
SymCryptSha512AppendBlocks_ull(
639
    _Inout_                 SYMCRYPT_SHA512_CHAINING_STATE  *   pChain,
640
    _In_reads_(cbData)      PCBYTE                              pbData,
641
                            SIZE_T                              cbData,
642
    _Out_                   SIZE_T                            * pcbRemaining )
643
52.1k
{
644
52.1k
    SYMCRYPT_ALIGN UINT64 W[16];
645
52.1k
    UINT64 A, B, C, D, E, F, G, H;
646
52.1k
    int round;
647
52.1k
    UINT64 Wt;
648
649
650
593k
    while( cbData >= 128 )
651
541k
    {
652
541k
        A = pChain->H[0];
653
541k
        B = pChain->H[1];
654
541k
        C = pChain->H[2];
655
541k
        D = pChain->H[3];
656
541k
        E = pChain->H[4];
657
541k
        F = pChain->H[5];
658
541k
        G = pChain->H[6];
659
541k
        H = pChain->H[7];
660
661
        //
662
        // initial rounds 1 to 16
663
        //
664
665
541k
        IROUND( A, B, C, D, E, F, G, H,  0 );
666
541k
        IROUND( H, A, B, C, D, E, F, G,  1 );
667
541k
        IROUND( G, H, A, B, C, D, E, F,  2 );
668
541k
        IROUND( F, G, H, A, B, C, D, E,  3 );
669
541k
        IROUND( E, F, G, H, A, B, C, D,  4 );
670
541k
        IROUND( D, E, F, G, H, A, B, C,  5 );
671
541k
        IROUND( C, D, E, F, G, H, A, B,  6 );
672
541k
        IROUND( B, C, D, E, F, G, H, A,  7 );
673
541k
        IROUND( A, B, C, D, E, F, G, H,  8 );
674
541k
        IROUND( H, A, B, C, D, E, F, G,  9 );
675
541k
        IROUND( G, H, A, B, C, D, E, F, 10 );
676
541k
        IROUND( F, G, H, A, B, C, D, E, 11 );
677
541k
        IROUND( E, F, G, H, A, B, C, D, 12 );
678
541k
        IROUND( D, E, F, G, H, A, B, C, 13 );
679
541k
        IROUND( C, D, E, F, G, H, A, B, 14 );
680
541k
        IROUND( B, C, D, E, F, G, H, A, 15 );
681
682
2.70M
        for( round=16; round<80; round += 16 )
683
2.16M
        {
684
2.16M
            FROUND( A, B, C, D, E, F, G, H, round +  0,  0 );
685
2.16M
            FROUND( H, A, B, C, D, E, F, G, round +  1,  1 );
686
2.16M
            FROUND( G, H, A, B, C, D, E, F, round +  2,  2 );
687
2.16M
            FROUND( F, G, H, A, B, C, D, E, round +  3,  3 );
688
2.16M
            FROUND( E, F, G, H, A, B, C, D, round +  4,  4 );
689
2.16M
            FROUND( D, E, F, G, H, A, B, C, round +  5,  5 );
690
2.16M
            FROUND( C, D, E, F, G, H, A, B, round +  6,  6 );
691
2.16M
            FROUND( B, C, D, E, F, G, H, A, round +  7,  7 );
692
2.16M
            FROUND( A, B, C, D, E, F, G, H, round +  8,  8 );
693
2.16M
            FROUND( H, A, B, C, D, E, F, G, round +  9,  9 );
694
2.16M
            FROUND( G, H, A, B, C, D, E, F, round + 10, 10 );
695
2.16M
            FROUND( F, G, H, A, B, C, D, E, round + 11, 11 );
696
2.16M
            FROUND( E, F, G, H, A, B, C, D, round + 12, 12 );
697
2.16M
            FROUND( D, E, F, G, H, A, B, C, round + 13, 13 );
698
2.16M
            FROUND( C, D, E, F, G, H, A, B, round + 14, 14 );
699
2.16M
            FROUND( B, C, D, E, F, G, H, A, round + 15, 15 );
700
2.16M
        }
701
702
541k
        pChain->H[0] = A + pChain->H[0];
703
541k
        pChain->H[1] = B + pChain->H[1];
704
541k
        pChain->H[2] = C + pChain->H[2];
705
541k
        pChain->H[3] = D + pChain->H[3];
706
541k
        pChain->H[4] = E + pChain->H[4];
707
541k
        pChain->H[5] = F + pChain->H[5];
708
541k
        pChain->H[6] = G + pChain->H[6];
709
541k
        pChain->H[7] = H + pChain->H[7];
710
711
541k
        pbData += 128;
712
541k
        cbData -= 128;
713
541k
    }
714
715
52.1k
    *pcbRemaining = cbData;
716
717
    //
718
    // Wipe the variables;
719
    //
720
52.1k
    SymCryptWipeKnownSize( W, sizeof( W ) );
721
52.1k
    SYMCRYPT_FORCE_WRITE64( &A, 0 );
722
52.1k
    SYMCRYPT_FORCE_WRITE64( &B, 0 );
723
52.1k
    SYMCRYPT_FORCE_WRITE64( &C, 0 );
724
52.1k
    SYMCRYPT_FORCE_WRITE64( &D, 0 );
725
52.1k
    SYMCRYPT_FORCE_WRITE64( &E, 0 );
726
52.1k
    SYMCRYPT_FORCE_WRITE64( &F, 0 );
727
52.1k
    SYMCRYPT_FORCE_WRITE64( &G, 0 );
728
52.1k
    SYMCRYPT_FORCE_WRITE64( &H, 0 );
729
52.1k
    SYMCRYPT_FORCE_WRITE64( &Wt, 0 );
730
52.1k
}
731
732
//
733
// UINT64 based implementation that
734
// first computes the expanded message, and then the
735
// actual hash computation.
736
// It tries to use fewer registers; this is probably a good approach for CPUs with only 8
737
// 64-bit registers; which is what you would use on x86 XMM, but we have XMM code below.
738
// This uses more memory, but might allow better register re-use and thereby
739
// reduce the number of load/stores.
740
//
741
742
VOID
743
SYMCRYPT_CALL
744
SymCryptSha512AppendBlocks_ull2(
745
    _Inout_                 SYMCRYPT_SHA512_CHAINING_STATE  *   pChain,
746
    _In_reads_(cbData)      PCBYTE                              pbData,
747
                            SIZE_T                              cbData,
748
    _Out_                   SIZE_T                            * pcbRemaining )
749
0
{
750
0
    SYMCRYPT_ALIGN UINT64 buf[4 + 8 + 80];    // 4 words original chaining state, chaining state, and expanded input block
751
0
    UINT64 * W = &buf[4 + 8];
752
0
    UINT64 * ha = &buf[4]; // initial state words, in order h, g, ..., b, a
753
0
    UINT64 A, B, C, D, T;
754
0
    int r;
755
756
0
    ha[7] = pChain->H[0]; buf[3] = ha[7];
757
0
    ha[6] = pChain->H[1]; buf[2] = ha[6];
758
0
    ha[5] = pChain->H[2]; buf[1] = ha[5];
759
0
    ha[4] = pChain->H[3]; buf[0] = ha[4];
760
0
    ha[3] = pChain->H[4];
761
0
    ha[2] = pChain->H[5];
762
0
    ha[1] = pChain->H[6];
763
0
    ha[0] = pChain->H[7];
764
765
0
    while( cbData >= 128 )
766
0
    {
767
768
        //
769
        // Capture the input into W[0..15]
770
        //
771
0
        for( r=0; r<16; r+= 2 )
772
0
        {
773
0
            W[r  ] = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8* r    ] );
774
0
            W[r+1] = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8*(r+1) ] );
775
0
        }
776
777
        //
778
        // Expand the message
779
        //
780
0
        A = W[15];
781
0
        B = W[14];
782
0
        D = W[0];
783
0
        for( r=16; r<80; r+= 2 )
784
0
        {
785
            // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16]
786
787
            //
788
            // Macro for one word of message expansion.
789
            // Invariant:
790
            // on entry: a = W[r-1], b = W[r-2], d = W[r-16]
791
            // on exit:  W[r] computed, a = W[r-1], b = W[r], c = W[r-15]
792
            //
793
0
            #define EXPAND( a, b, c, d, r ) \
794
0
                        c = W[r-15]; \
795
0
                        b = d + LSIGMA1( b ) + W[r-7] + LSIGMA0( c ); \
796
0
                        W[r] = b; \
797
0
798
0
            EXPAND( A, B, C, D, r );
799
0
            EXPAND( B, A, D, C, (r+1));
800
801
0
            #undef EXPAND
802
0
        }
803
804
0
        A = ha[7];
805
0
        B = ha[6];
806
0
        C = ha[5];
807
0
        D = ha[4];
808
809
0
        for( r=0; r<80; r += 4 )
810
0
        {
811
            //
812
            // Loop invariant:
813
            // A, B, C, and D are the a,b,c,d values of the current state.
814
            // W[r] is the next expanded message word to be processed.
815
            // W[r-8 .. r-5] contain the current state words h, g, f, e.
816
            //
817
818
            //
819
            // Macro to compute one round
820
            //
821
0
            #define DO_ROUND( a, b, c, d, t, r ) \
822
0
                t = W[r] + CSIGMA1( W[r-5] ) + W[r-8] + CH( W[r-5], W[r-6], W[r-7] ) + SymCryptSha512K[r]; \
823
0
                W[r-4] = t + d; \
824
0
                d = t + CSIGMA0( a ) + MAJ( c, b, a );
825
826
0
            DO_ROUND( A, B, C, D, T, r );
827
0
            DO_ROUND( D, A, B, C, T, (r+1) );
828
0
            DO_ROUND( C, D, A, B, T, (r+2) );
829
0
            DO_ROUND( B, C, D, A, T, (r+3) );
830
0
            #undef DO_ROUND
831
0
        }
832
833
0
        buf[3] = ha[7] = buf[3] + A;
834
0
        buf[2] = ha[6] = buf[2] + B;
835
0
        buf[1] = ha[5] = buf[1] + C;
836
0
        buf[0] = ha[4] = buf[0] + D;
837
0
        ha[3] += W[r-5];
838
0
        ha[2] += W[r-6];
839
0
        ha[1] += W[r-7];
840
0
        ha[0] += W[r-8];
841
842
0
        pbData += 128;
843
0
        cbData -= 128;
844
0
    }
845
846
0
    pChain->H[0] = ha[7];
847
0
    pChain->H[1] = ha[6];
848
0
    pChain->H[2] = ha[5];
849
0
    pChain->H[3] = ha[4];
850
0
    pChain->H[4] = ha[3];
851
0
    pChain->H[5] = ha[2];
852
0
    pChain->H[6] = ha[1];
853
0
    pChain->H[7] = ha[0];
854
855
0
    *pcbRemaining = cbData;
856
857
    //
858
    // Wipe the variables;
859
    //
860
0
    SymCryptWipeKnownSize( buf, sizeof( buf ) );
861
0
    SYMCRYPT_FORCE_WRITE64( &A, 0 );
862
0
    SYMCRYPT_FORCE_WRITE64( &B, 0 );
863
0
    SYMCRYPT_FORCE_WRITE64( &C, 0 );
864
0
    SYMCRYPT_FORCE_WRITE64( &D, 0 );
865
0
    SYMCRYPT_FORCE_WRITE64( &T, 0 );
866
867
0
}
868
869
//
870
// UINT64 based implementation that first computes the complete expanded
871
// message W[0..79] for a block, and then runs the
872
// actual hash computation (80 rounds) over it.
873
// All eight state words are kept in local variables (A..H), so
// this one uses more registers than the previous one.
//
// Only whole 128-byte blocks of pbData are consumed. The chaining state in
// pChain->H is updated in place, and the number of bytes that were left
// unprocessed is stored in *pcbRemaining.
874
//
875
876
VOID
877
SYMCRYPT_CALL
878
SymCryptSha512AppendBlocks_ull3(
879
    _Inout_                 SYMCRYPT_SHA512_CHAINING_STATE  *   pChain,
880
    _In_reads_(cbData)      PCBYTE                              pbData,
881
                            SIZE_T                              cbData,
882
    _Out_                   SIZE_T                            * pcbRemaining )
883
0
{
884
0
    SYMCRYPT_ALIGN UINT64 W[80];
885
0
    SYMCRYPT_ALIGN UINT64 ha[8];
886
0
    UINT64 A, B, C, D, E, F, G, H;
887
0
    int r;
888
889
0
    // Local copy of the chaining state, stored in reverse order: ha[7-i] = H[i].
    ha[7] = pChain->H[0];
890
0
    ha[6] = pChain->H[1];
891
0
    ha[5] = pChain->H[2];
892
0
    ha[4] = pChain->H[3];
893
0
    ha[3] = pChain->H[4];
894
0
    ha[2] = pChain->H[5];
895
0
    ha[1] = pChain->H[6];
896
0
    ha[0] = pChain->H[7];
897
898
0
    // Process one 128-byte block per iteration; a trailing partial block is left alone.
    while( cbData >= 128 )
899
0
    {
900
901
        //
902
        // Capture the input into W[0..15]
        // (two 64-bit words per iteration, 16 words = 128 bytes in total)
903
        //
904
0
        for( r=0; r<16; r+= 2 )
905
0
        {
906
0
            W[r  ] = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8* r    ] );
907
0
            W[r+1] = SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8*(r+1) ] );
908
0
        }
909
910
        //
911
        // Expand the message
        // A, B, C, D are used as scratch here; they are reloaded with the
        // state words before the rounds start.
912
        //
913
0
        // Establish the loop invariant for r = 16.
        A = W[15];
914
0
        B = W[14];
915
0
        D = W[0];
916
0
        for( r=16; r<80; r+= 2 )
917
0
        {
918
            // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16]
919
920
            //
921
            // Macro for one word of message expansion.
922
            // Invariant:
923
            // on entry: a = W[r-1], b = W[r-2], d = W[r-16]
924
            // on exit:  W[r] computed, a = W[r-1], b = W[r], c = W[r-15]
925
            //
926
0
            #define EXPAND( a, b, c, d, r ) \
927
0
                        c = W[r-15]; \
928
0
                        b = d + LSIGMA1( b ) + W[r-7] + LSIGMA0( c ); \
929
0
                        W[r] = b; \
930
0
931
0
            EXPAND( A, B, C, D, r );
932
0
            // Second word: the roles of (A,B) and of (C,D) are swapped so that the
            // exit values of the first EXPAND satisfy the entry invariant for r+1,
            // and the loop invariant holds again for r+2 afterwards.
            EXPAND( B, A, D, C, (r+1));
933
934
0
            #undef EXPAND
935
0
        }
936
937
0
        // Load the working variables from the (reversed) chaining state copy.
        A = ha[7];
938
0
        B = ha[6];
939
0
        C = ha[5];
940
0
        D = ha[4];
941
0
        E = ha[3];
942
0
        F = ha[2];
943
0
        G = ha[1];
944
0
        H = ha[0];
945
946
0
        for( r=0; r<80; r += 8 )
947
0
        {
948
            //
949
            // Loop invariant:
950
            // A, B, C, and D, E, F, G, H, are the values of the current state.
951
            // W[r] is the next expanded message word to be processed.
952
            //
953
954
            //
955
            // Macro to compute one round
            // The macro only modifies its h and d arguments. Instead of moving
            // values between variables after each round, every invocation below
            // passes the variables rotated by one position; after 8 rounds the
            // variables are back in their original roles, hence the step of 8.
956
            //
957
0
            #define DO_ROUND( a, b, c, d, e, f, g, h, r ) \
958
0
                h += W[r] + CSIGMA1( e ) + CH( e, f, g ) + SymCryptSha512K[r]; \
959
0
                d += h; \
960
0
                h += CSIGMA0( a ) + MAJ( c, b, a );
961
962
0
            DO_ROUND( A, B, C, D, E, F, G, H, (r  ) );
963
0
            DO_ROUND( H, A, B, C, D, E, F, G, (r+1) );
964
0
            DO_ROUND( G, H, A, B, C, D, E, F, (r+2) );
965
0
            DO_ROUND( F, G, H, A, B, C, D, E, (r+3) );
966
0
            DO_ROUND( E, F, G, H, A, B, C, D, (r+4) );
967
0
            DO_ROUND( D, E, F, G, H, A, B, C, (r+5) );
968
0
            DO_ROUND( C, D, E, F, G, H, A, B, (r+6) );
969
0
            DO_ROUND( B, C, D, E, F, G, H, A, (r+7) );
970
0
            #undef DO_ROUND
971
0
        }
972
973
0
        // Feed-forward: add the working variables into the chaining state copy.
        ha[7] += A;
974
0
        ha[6] += B;
975
0
        ha[5] += C;
976
0
        ha[4] += D;
977
0
        ha[3] += E;
978
0
        ha[2] += F;
979
0
        ha[1] += G;
980
0
        ha[0] += H;
981
982
0
        pbData += 128;
983
0
        cbData -= 128;
984
0
    }
985
986
0
    // Write the updated chaining state back to the caller's structure.
    pChain->H[0] = ha[7];
987
0
    pChain->H[1] = ha[6];
988
0
    pChain->H[2] = ha[5];
989
0
    pChain->H[3] = ha[4];
990
0
    pChain->H[4] = ha[3];
991
0
    pChain->H[5] = ha[2];
992
0
    pChain->H[6] = ha[1];
993
0
    pChain->H[7] = ha[0];
994
995
0
    // Number of input bytes that were not consumed (less than one full block).
    *pcbRemaining = cbData;
996
997
    //
998
    // Wipe the variables;
    // W, ha and A..H all held values derived from the message and the state.
999
    //
1000
0
    SymCryptWipeKnownSize( W, sizeof( W ) );
1001
0
    SymCryptWipeKnownSize( ha, sizeof( ha ) );
1002
0
    SYMCRYPT_FORCE_WRITE64( &A, 0 );
1003
0
    SYMCRYPT_FORCE_WRITE64( &B, 0 );
1004
0
    SYMCRYPT_FORCE_WRITE64( &C, 0 );
1005
0
    SYMCRYPT_FORCE_WRITE64( &D, 0 );
1006
0
    SYMCRYPT_FORCE_WRITE64( &E, 0 );
1007
0
    SYMCRYPT_FORCE_WRITE64( &F, 0 );
1008
0
    SYMCRYPT_FORCE_WRITE64( &G, 0 );
1009
0
    SYMCRYPT_FORCE_WRITE64( &H, 0 );
1010
0
}
1011
1012
#undef MAJ
1013
#undef CH
1014
#undef CSIGMA0
1015
#undef CSIGMA1
1016
#undef LSIGMA0
1017
#undef LSIGMA1
1018
#undef CROUND
1019
#undef IROUND
1020
#undef FROUND
1021
1022
//======================================================================================
1023
// Implementation using Xmm registers
1024
//
1025
#if SYMCRYPT_CPU_X86 // only on X86; AMD64 is faster when using UINT64s
1026
1027
#if SYMCRYPT_MS_VC
1028
#ifndef _mm_storeu_si64
1029
    // Workaround missing intrinsic on some versions of MSVC
1030
    #define _mm_storeu_si64(p, a) (_mm_storel_epi64((__m128i*)(p), (a)))
1031
#endif
1032
#endif
1033
1034
#define XMMADD( _a, _b ) _mm_add_epi64((_a), (_b))
1035
#define XMMAND( _a, _b ) _mm_and_si128((_a), (_b))
1036
#define XMMOR(  _a, _b ) _mm_or_si128((_a), (_b))
1037
#define XMMROR( _a, _n ) _mm_xor_si128( _mm_slli_epi64( (_a), 64-(_n)), _mm_srli_epi64( (_a), (_n)) )
1038
#define XMMSHR( _a, _n ) _mm_srli_epi64((_a), (_n))
1039
#define XMMXOR( _a, _b ) _mm_xor_si128((_a), (_b))
1040
#define XMMSTORE_UINT64( _a, _addr ) _mm_storeu_si64((_addr), (_a))
1041
1042
#define XMMMAJ( x, y, z )  XMMOR( XMMAND( XMMOR( (z), (y)), (x)), XMMAND( (z), (y) ) )
1043
#define XMMCH(  x, y, z )  XMMXOR( XMMAND( XMMXOR( (z), (y) ), (x)), (z))
1044
#define XMMCSIGMA0( x )    XMMXOR( XMMXOR( XMMROR((x), 28), XMMROR((x), 34)), XMMROR((x), 39))
1045
#define XMMCSIGMA1( x )    XMMXOR( XMMXOR( XMMROR((x), 14), XMMROR((x), 18)), XMMROR((x), 41))
1046
#define XMMLSIGMA0( x )    XMMXOR( XMMXOR( XMMROR((x),  1), XMMROR((x),  8)), XMMSHR((x), 7))
1047
#define XMMLSIGMA1( x )    XMMXOR( XMMXOR( XMMROR((x), 19), XMMROR((x), 61)), XMMSHR((x), 6))
1048
1049
//
1050
// Core round takes two arguments: r16 = round number modulo 16, r = the full round number (r is not used by the macro body).
1051
// On entry, Wt must be equal to the sum of the round constant and the expanded message word for this round.
1052
// Only the lower word of each Xmm register is used.
1053
//
1054
#define XMMCROUND( r16, r ) {;\
1055
    ah[r16 & 7] = XMMADD( XMMADD( XMMADD( ah[r16 & 7], XMMCSIGMA1(ah[(r16+3)&7]) ), XMMCH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) ), Wt );\
1056
    ah[(r16+4)&7] = XMMADD( ah[(r16+4)&7], ah[r16 &7] );\
1057
    ah[r16 & 7] = XMMADD( XMMADD( ah[r16 & 7], XMMCSIGMA0(ah[(r16+7)&7])), XMMMAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]) );\
1058
}
1059
1060
#pragma warning( disable: 4127 )   // conditional expression is constant
1061
1062
//
1063
// Initial round; reads data and performs a round.
1064
// Data is read in 128-bit chunks every other round.
1065
//
1066
#define XMMIROUND( r ) {\
1067
    if( (r&1) == 0 ) \
1068
    { \
1069
        Wt = _mm_loadu_si128( (__m128i *)&pbData[ 8*r ] ); \
1070
        Wt = _mm_shuffle_epi8( Wt, BYTE_REVERSE_64 ); \
1071
        W[r/2] = Wt; \
1072
        Wt = XMMADD( Wt, _mm_load_si128( (__m128i *)&SymCryptSha512K[r] ) ); \
1073
        Ws = _mm_srli_si128( Wt, 8 ); \
1074
    } else {\
1075
        Wt = Ws;\
1076
    }\
1077
    XMMCROUND( r, r );\
1078
}
1079
1080
//
1081
// Working version of XMMIROUND:
1082
//    Wt = XMMFROM_MSBF( &pbData[ 8*r ] );\
1083
//    W[r] = Wt;\
1084
//    Wt = XMMADD( XMMFROM_UINT64(SymCryptSha512K[r]), Wt );\
1085
//    XMMCROUND(r,r);\
1086
1087
#define XMMFROUND(r16, rb) { \
1088
    if( (r16 & 1) == 0 ) \
1089
    {\
1090
        Wt = XMMADD( XMMADD( XMMADD(    XMMLSIGMA1( W[((r16 -  2)&15)/2] ), \
1091
                                        _mm_alignr_epi8( W[((r16 - 6)&15)/2], W[((r16 - 7)&15)/2], 8 ) ), \
1092
                                        XMMLSIGMA0( _mm_alignr_epi8( W[((r16 - 14)&15)/2], W[((r16 - 15)&15)/2], 8 ) ) ), \
1093
                                        W[((r16 - 16)&15)/2] ); \
1094
        W[r16/2] = Wt;\
1095
        Ws = _mm_load_si128( (__m128i *)&SymCryptSha512K[r16 + rb] );\
1096
        Wt = XMMADD( Ws , Wt );\
1097
        Ws = _mm_srli_si128( Wt, 8 );\
1098
    } else {\
1099
        Wt = Ws;\
1100
    }\
1101
    XMMCROUND( r16, r16+rb ); \
1102
}
1103
1104
//
// Block-processing routine built on the XMM* macros above.
// Only whole 128-byte blocks of pbData are consumed. The chaining state in
// pChain->H is updated, and the number of bytes that were left unprocessed
// is stored in *pcbRemaining.
//
// Each state word is kept in the low 64 bits of its own __m128i, in reverse
// order: the low word of ah[7-i] corresponds to H[i].
//
VOID
1105
SYMCRYPT_CALL
1106
SymCryptSha512AppendBlocks_xmm(
1107
    _Inout_                 SYMCRYPT_SHA512_CHAINING_STATE  *   pChain,
1108
    _In_reads_(cbData)      PCBYTE                              pbData,
1109
                            SIZE_T                              cbData,
1110
    _Out_                   SIZE_T                            * pcbRemaining )
1111
{
1112
    SYMCRYPT_ALIGN __m128i W[8];   // message expansion buffer, 8 elements each storing 2 consecutive UINT64s
1113
    SYMCRYPT_ALIGN __m128i ah[8];       // working state, one state word per element (low 64 bits)
1114
    SYMCRYPT_ALIGN __m128i feedf[8];    // state at the start of the current block, for the feed-forward addition
1115
    int round;
1116
    __m128i Wt, Ws;     // Wt: round input (message word + round constant); Ws: value saved by an even round for the following odd round
1117
    // Shuffle control that reverses the byte order within each 64-bit half of a register.
    const __m128i BYTE_REVERSE_64 = _mm_set_epi8( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 );
1118
1119
    // Load the state two words at a time; the byte shift by 8 moves the
    // second word of each pair into the low half of its own register.
    Wt = _mm_loadu_si128( (__m128i *) &pChain->H[0] );
1120
    feedf[7] = ah[7] = Wt;
1121
    feedf[6] = ah[6] = _mm_srli_si128( Wt, 8 );
1122
    Wt = _mm_loadu_si128( (__m128i *) &pChain->H[2] );
1123
    feedf[5] = ah[5] = Wt;
1124
    feedf[4] = ah[4] = _mm_srli_si128( Wt, 8 );
1125
    Wt = _mm_loadu_si128( (__m128i *) &pChain->H[4] );
1126
    feedf[3] = ah[3] = Wt;
1127
    feedf[2] = ah[2] = _mm_srli_si128( Wt, 8 );
1128
    Wt = _mm_loadu_si128( (__m128i *) &pChain->H[6] );
1129
    feedf[1] = ah[1] = Wt;
1130
    feedf[0] = ah[0] = _mm_srli_si128( Wt, 8 );
1131
1132
    // Process one 128-byte block per iteration; a trailing partial block is left alone.
    while( cbData >= 128 )
1133
    {
1134
        //
1135
        // initial rounds 0 to 15
        // Even-numbered rounds load 16 message bytes, byte-swap them, store them
        // in W[] and add two round constants; odd-numbered rounds reuse the
        // upper half saved in Ws.
1136
        //
1137
1138
        XMMIROUND(  0 );
1139
        XMMIROUND(  1 );
1140
        XMMIROUND(  2 );
1141
        XMMIROUND(  3 );
1142
        XMMIROUND(  4 );
1143
        XMMIROUND(  5 );
1144
        XMMIROUND(  6 );
1145
        XMMIROUND(  7 );
1146
        XMMIROUND(  8 );
1147
        XMMIROUND(  9 );
1148
        XMMIROUND( 10 );
1149
        XMMIROUND( 11 );
1150
        XMMIROUND( 12 );
1151
        XMMIROUND( 13 );
1152
        XMMIROUND( 14 );
1153
        XMMIROUND( 15 );
1154
1155
        // Rounds 16 to 79, 16 per iteration. The first macro argument is the
        // round number modulo 16 (a compile-time constant); 'round' is the base
        // that is added to it to index the round-constant table.
        for( round=16; round<80; round += 16 )
1156
        {
1157
            XMMFROUND(  0, round );
1158
            XMMFROUND(  1, round );
1159
            XMMFROUND(  2, round );
1160
            XMMFROUND(  3, round );
1161
            XMMFROUND(  4, round );
1162
            XMMFROUND(  5, round );
1163
            XMMFROUND(  6, round );
1164
            XMMFROUND(  7, round );
1165
            XMMFROUND(  8, round );
1166
            XMMFROUND(  9, round );
1167
            XMMFROUND( 10, round );
1168
            XMMFROUND( 11, round );
1169
            XMMFROUND( 12, round );
1170
            XMMFROUND( 13, round );
1171
            XMMFROUND( 14, round );
1172
            XMMFROUND( 15, round );
1173
        }
1174
1175
        // Feed-forward: add the block-start state to the working state, and
        // refresh feedf[] so it holds the start state for the next block.
        feedf[0] = ah[0] = XMMADD( ah[0], feedf[0] );
1176
        feedf[1] = ah[1] = XMMADD( ah[1], feedf[1] );
1177
        feedf[2] = ah[2] = XMMADD( ah[2], feedf[2] );
1178
        feedf[3] = ah[3] = XMMADD( ah[3], feedf[3] );
1179
        feedf[4] = ah[4] = XMMADD( ah[4], feedf[4] );
1180
        feedf[5] = ah[5] = XMMADD( ah[5], feedf[5] );
1181
        feedf[6] = ah[6] = XMMADD( ah[6], feedf[6] );
1182
        feedf[7] = ah[7] = XMMADD( ah[7], feedf[7] );
1183
1184
        pbData += 128;
1185
        cbData -= 128;
1186
1187
    }
1188
1189
    // Store the low 64 bits of each state register back into the chaining state.
    XMMSTORE_UINT64( ah[7], &(pChain->H[0]) );
1190
    XMMSTORE_UINT64( ah[6], &(pChain->H[1]) );
1191
    XMMSTORE_UINT64( ah[5], &(pChain->H[2]) );
1192
    XMMSTORE_UINT64( ah[4], &(pChain->H[3]) );
1193
    XMMSTORE_UINT64( ah[3], &(pChain->H[4]) );
1194
    XMMSTORE_UINT64( ah[2], &(pChain->H[5]) );
1195
    XMMSTORE_UINT64( ah[1], &(pChain->H[6]) );
1196
    XMMSTORE_UINT64( ah[0], &(pChain->H[7]) );
1197
1198
    // Number of input bytes that were not consumed (less than one full block).
    *pcbRemaining = cbData;
1199
1200
    //
1201
    // Wipe the variables;
    // all of these held values derived from the message and the state.
1202
    //
1203
    SymCryptWipeKnownSize( ah, sizeof( ah ) );
1204
    SymCryptWipeKnownSize( feedf, sizeof( feedf ) );
1205
    SymCryptWipeKnownSize( W, sizeof( W ) );
1206
    SymCryptWipeKnownSize( &Wt, sizeof( Wt ));
1207
    SymCryptWipeKnownSize( &Ws, sizeof( Ws ));
1208
}
1209
1210
#endif
1211
1212
1213
1214
//======================================================================================
1215
// Implementation using NEON registers
1216
//
1217
#if SYMCRYPT_CPU_ARM
1218
1219
1220
#define ROR( _a, _n ) vorr_u64( vshl_n_u64( _a, 64 - _n ), vshr_n_u64( _a, _n ) )
1221
#define ADD( x, y ) vadd_u64( (x), (y) )
1222
1223
#define MAJ( x, y, z )  vorr_u64( vand_u64( vorr_u64( (z), (y)), (x)), vand_u64( (z), (y) ) )
1224
#define CH(  x, y, z )  veor_u64( vand_u64( veor_u64( (z), (y) ), (x)), (z))
1225
#define CSIGMA0( x )    veor_u64( veor_u64( ROR((x), 28), ROR((x), 34)), ROR((x), 39))
1226
#define CSIGMA1( x )    veor_u64( veor_u64( ROR((x), 14), ROR((x), 18)), ROR((x), 41))
1227
#define LSIGMA0( x )    veor_u64( veor_u64( ROR((x),  1), ROR((x),  8)), vshr_n_u64((x), 7))
1228
#define LSIGMA1( x )    veor_u64( veor_u64( ROR((x), 19), ROR((x), 61)), vshr_n_u64((x), 6))
1229
1230
//
1231
// r = round number, r16 = r mod 16 (often a compile-time constant when r is not)
1232
//
1233
#define CROUND( a, b, c, d, e, f, g, h, r, r16 ) {\
1234
    W[r16] = Wt; \
1235
    h = ADD( h, ADD( ADD( ADD( CSIGMA1(e), CH(e, f, g)), *(__n64 *)&SymCryptSha512K[r]),  Wt ));\
1236
    d = ADD( d, h );\
1237
    h = ADD( h, ADD( CSIGMA0(a), MAJ(a, b, c)));\
1238
}
1239
1240
//
1241
// Initial round that reads the message.
1242
// r is the round number 0..15
1243
//
1244
#define IROUND( a, b, c, d, e, f, g, h, r ) {\
1245
    Wt = vmov_n_u64( SYMCRYPT_LOAD_MSBFIRST64( &pbData[ 8*r ] ) );\
1246
    CROUND( a, b, c, d, e, f, g, h, r, r);\
1247
    }
1248
//
1249
// Subsequent rounds.
1250
// r is the round number, r16 is the round number mod 16.
1251
// These are separate as typically r is run-time and r16 is compile time constant.
1252
//
1253
#define FROUND( a, b, c, d, e, f, g, h, r, r16 ) {                                      \
1254
    Wt = ADD( ADD( LSIGMA1( W[(r16-2) & 15] ), LSIGMA0( W[(r16-15) & 15])) , ADD( W[(r16-7) & 15], W[r16 & 15])); \
1255
    CROUND( a, b, c, d, e, f, g, h, r, r16 ); \
1256
    }
1257
1258
//
1259
// This is the core routine that does the actual hard work
1260
// This is based on the older one in RSA32LIB by Scott Field from 2001
//
// Block-processing routine built on the IROUND/FROUND macros above.
// Only whole 128-byte blocks of pbData are consumed. The chaining state in
// pChain->H is updated after every block, and the number of bytes that were
// left unprocessed is stored in *pcbRemaining.
1261
//
1262
VOID
1263
SYMCRYPT_CALL
1264
SymCryptSha512AppendBlocks_neon(
1265
    _Inout_                 SYMCRYPT_SHA512_CHAINING_STATE  *   pChain,
1266
    _In_reads_(cbData)      PCBYTE                              pbData,
1267
                            SIZE_T                              cbData,
1268
    _Out_                   SIZE_T                            * pcbRemaining )
1269
{
1270
    SYMCRYPT_ALIGN __n64 W[16];     // the 16 most recent message words, indexed by round number mod 16
1271
    __n64 A, B, C, D, E, F, G, H;   // working state; A..H correspond to H[0]..H[7]
1272
    int round;
1273
    __n64 Wt;                       // message word for the current round
1274
    // NOTE(review): views the chaining state array as __n64 elements; this
    // presumably relies on H[] being an array of 64-bit words - confirm.
    __n64 * pH = (__n64 *) &pChain->H[0];
1275
1276
    A = pH[0];
1277
    B = pH[1];
1278
    C = pH[2];
1279
    D = pH[3];
1280
    E = pH[4];
1281
    F = pH[5];
1282
    G = pH[6];
1283
    H = pH[7];
1284
1285
    // Process one 128-byte block per iteration; a trailing partial block is left alone.
    while( cbData >= 128 )
1286
    {
1287
        //
1288
        // initial rounds 0 to 15
        // Each of these loads one message word from pbData and stores it in W[].
        // The state variables are passed rotated by one position per round
        // instead of moving values between variables.
1289
        //
1290
1291
        IROUND( A, B, C, D, E, F, G, H,  0 );
1292
        IROUND( H, A, B, C, D, E, F, G,  1 );
1293
        IROUND( G, H, A, B, C, D, E, F,  2 );
1294
        IROUND( F, G, H, A, B, C, D, E,  3 );
1295
        IROUND( E, F, G, H, A, B, C, D,  4 );
1296
        IROUND( D, E, F, G, H, A, B, C,  5 );
1297
        IROUND( C, D, E, F, G, H, A, B,  6 );
1298
        IROUND( B, C, D, E, F, G, H, A,  7 );
1299
        IROUND( A, B, C, D, E, F, G, H,  8 );
1300
        IROUND( H, A, B, C, D, E, F, G,  9 );
1301
        IROUND( G, H, A, B, C, D, E, F, 10 );
1302
        IROUND( F, G, H, A, B, C, D, E, 11 );
1303
        IROUND( E, F, G, H, A, B, C, D, 12 );
1304
        IROUND( D, E, F, G, H, A, B, C, 13 );
1305
        IROUND( C, D, E, F, G, H, A, B, 14 );
1306
        IROUND( B, C, D, E, F, G, H, A, 15 );
1307
1308
        // Rounds 16 to 79, 16 per iteration. The last macro argument is the
        // round number mod 16, which selects the slot in W[]; the argument
        // before it is the full round number, which selects the round constant.
        for( round=16; round<80; round += 16 )
1309
        {
1310
            FROUND( A, B, C, D, E, F, G, H, round +  0,  0 );
1311
            FROUND( H, A, B, C, D, E, F, G, round +  1,  1 );
1312
            FROUND( G, H, A, B, C, D, E, F, round +  2,  2 );
1313
            FROUND( F, G, H, A, B, C, D, E, round +  3,  3 );
1314
            FROUND( E, F, G, H, A, B, C, D, round +  4,  4 );
1315
            FROUND( D, E, F, G, H, A, B, C, round +  5,  5 );
1316
            FROUND( C, D, E, F, G, H, A, B, round +  6,  6 );
1317
            FROUND( B, C, D, E, F, G, H, A, round +  7,  7 );
1318
            FROUND( A, B, C, D, E, F, G, H, round +  8,  8 );
1319
            FROUND( H, A, B, C, D, E, F, G, round +  9,  9 );
1320
            FROUND( G, H, A, B, C, D, E, F, round + 10, 10 );
1321
            FROUND( F, G, H, A, B, C, D, E, round + 11, 11 );
1322
            FROUND( E, F, G, H, A, B, C, D, round + 12, 12 );
1323
            FROUND( D, E, F, G, H, A, B, C, round + 13, 13 );
1324
            FROUND( C, D, E, F, G, H, A, B, round + 14, 14 );
1325
            FROUND( B, C, D, E, F, G, H, A, round + 15, 15 );
1326
        }
1327
1328
        // Feed-forward: pH[] still holds the state from the start of this block.
        // The sum is written both to the chaining state and to the working
        // variables, which then serve as the start state for the next block.
        pH[0] = A = ADD( A, pH[0] );
1329
        pH[1] = B = ADD( B, pH[1] );
1330
        pH[2] = C = ADD( C, pH[2] );
1331
        pH[3] = D = ADD( D, pH[3] );
1332
        pH[4] = E = ADD( E, pH[4] );
1333
        pH[5] = F = ADD( F, pH[5] );
1334
        pH[6] = G = ADD( G, pH[6] );
1335
        pH[7] = H = ADD( H, pH[7] );
1336
1337
        pbData += 128;
1338
        cbData -= 128;
1339
    }
1340
1341
    // Number of input bytes that were not consumed (less than one full block).
    *pcbRemaining = cbData;
1342
1343
    //
1344
    // Wipe the variables;
    // all of these held values derived from the message and the state.
1345
    //
1346
    SymCryptWipeKnownSize( W, sizeof( W ) );
1347
    SymCryptWipeKnownSize( &A, sizeof( A ) );
1348
    SymCryptWipeKnownSize( &B, sizeof( B ) );
1349
    SymCryptWipeKnownSize( &C, sizeof( C ) );
1350
    SymCryptWipeKnownSize( &D, sizeof( D ) );
1351
    SymCryptWipeKnownSize( &E, sizeof( E ) );
1352
    SymCryptWipeKnownSize( &F, sizeof( F ) );
1353
    SymCryptWipeKnownSize( &G, sizeof( G ) );
1354
    SymCryptWipeKnownSize( &H, sizeof( H ) );
1355
    SymCryptWipeKnownSize( &Wt, sizeof( Wt ) );
1356
}
1357
1358
#endif
1359
1360
//======================================================================================
1361
//
1362
// Switch between different implementations of compression function
//
// The implementation is selected per target CPU at compile time, and on ARM
// and X86 additionally by a run-time CPU feature check. All arguments are
// passed through unchanged to the selected implementation.
1363
//
1364
//FORCEINLINE
1365
VOID
1366
SYMCRYPT_CALL
1367
SymCryptSha512AppendBlocks(
1368
    _Inout_                 SYMCRYPT_SHA512_CHAINING_STATE *    pChain,
1369
    _In_reads_( cbData )    PCBYTE                              pbData,
1370
                            SIZE_T                              cbData,
1371
    _Out_                   SIZE_T                            * pcbRemaining )
1372
52.1k
{
1373
52.1k
#if SYMCRYPT_CPU_AMD64
1374
    
1375
    // Temporarily disabling use of Ymm in SHA2
    // (with the Ymm paths commented out, AMD64 always takes the UINT64 path below)
1376
    // SYMCRYPT_EXTENDED_SAVE_DATA SaveData;
1377
1378
    // if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURE_AVX512 | SYMCRYPT_CPU_FEATURE_BMI2) &&
1379
    //     SymCryptSaveYmm(&SaveData) == SYMCRYPT_NO_ERROR)
1380
    // {
1381
    //     SymCryptSha512AppendBlocks_ymm_avx512vl_asm(pChain, pbData, cbData, pcbRemaining);
1382
1383
    //     SymCryptRestoreYmm(&SaveData);
1384
    // }
1385
    // else if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURE_AVX2 | SYMCRYPT_CPU_FEATURE_BMI2) &&
1386
    //     SymCryptSaveYmm(&SaveData) == SYMCRYPT_NO_ERROR)
1387
    // {
1388
    //     //SymCryptSha512AppendBlocks_ymm_1block(pChain, pbData, cbData, pcbRemaining);
1389
    //     //SymCryptSha512AppendBlocks_ymm_2blocks(pChain, pbData, cbData, pcbRemaining);
1390
    //     //SymCryptSha512AppendBlocks_ymm_4blocks(pChain, pbData, cbData, pcbRemaining);
1391
    //     SymCryptSha512AppendBlocks_ymm_avx2_asm(pChain, pbData, cbData, pcbRemaining);
1392
1393
    //     SymCryptRestoreYmm(&SaveData);
1394
    // }
1395
    // else
1396
52.1k
    {
1397
52.1k
        SymCryptSha512AppendBlocks_ull( pChain, pbData, cbData, pcbRemaining );
1398
        //SymCryptSha512AppendBlocks_ull2( pChain, pbData, cbData, pcbRemaining );
1399
        //SymCryptSha512AppendBlocks_ull3( pChain, pbData, cbData, pcbRemaining );
1400
52.1k
    }
1401
1402
1403
#elif SYMCRYPT_CPU_ARM
1404
1405
    // ARM: use the NEON implementation when the CPU reports NEON, otherwise the UINT64 one.
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) )
1406
    {
1407
        SymCryptSha512AppendBlocks_neon( pChain, pbData, cbData, pcbRemaining );      // Tegra T3: 48 c/B
1408
    } else {
1409
        SymCryptSha512AppendBlocks_ull( pChain, pbData, cbData, pcbRemaining );       // Tegra T3: 65.34 c/B
1410
        //SymCryptSha512AppendBlocks_ull2( pChain, pbData, cbData, pcbRemaining );      // Tegra T3: 77.4 c/B
1411
        //SymCryptSha512AppendBlocks_ull3( pChain, pbData, cbData, pcbRemaining );      // Tegra T3: 71.6 c/B
1412
    }
1413
1414
#elif SYMCRYPT_CPU_X86
1415
1416
    SYMCRYPT_EXTENDED_SAVE_DATA SaveData;
1417
1418
    // X86: the Xmm implementation is used only when SSSE3 is present and
    // SymCryptSaveXmm succeeds; SymCryptRestoreXmm is called afterwards.
    // In every other case the UINT64 implementation is used.
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSSE3 ) && SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR )
1419
    {
1420
        SymCryptSha512AppendBlocks_xmm( pChain, pbData, cbData, pcbRemaining );
1421
        SymCryptRestoreXmm( &SaveData );
1422
    } else {
1423
        SymCryptSha512AppendBlocks_ull( pChain, pbData, cbData, pcbRemaining );       // core2: 36.40 c/B
1424
        //SymCryptSha512AppendBlocks_ull2( pChain, pbData, cbData, pcbRemaining );      // core2: 49.09 c/B
1425
        //SymCryptSha512AppendBlocks_ull3( pChain, pbData, cbData, pcbRemaining );      // core2: 38.29 c/B
1426
    }
1427
1428
#else
1429
1430
    // Any other CPU: UINT64 implementation.
    SymCryptSha512AppendBlocks_ull( pChain, pbData, cbData, pcbRemaining );       // need tuning...
1431
1432
#endif
1433
52.1k
}
1434
1435