Coverage Report

Created: 2024-11-21 07:03

/src/SymCrypt/lib/sha256.c
 Count| Source
      | //
      | // Sha256.c
      | //
      | // Copyright (c) Microsoft Corporation. Licensed under the MIT license.
      | //
      |
      | //
      | // This module contains the routines to implement SHA2-256 from FIPS 180-2
      | //
      | // This revised implementation is based on the older one in RSA32LIB by Scott Field from 2001
      | //
      |
      | #include "precomp.h"
      |
      | //
      | // See the symcrypt.h file for documentation on what the various functions do.
      | //
      |
      | const SYMCRYPT_HASH SymCryptSha256Algorithm_default = {
      |     &SymCryptSha256Init,
      |     &SymCryptSha256Append,
      |     &SymCryptSha256Result,
      |     &SymCryptSha256AppendBlocks,
      |     &SymCryptSha256StateCopy,
      |     sizeof( SYMCRYPT_SHA256_STATE ),
      |     SYMCRYPT_SHA256_RESULT_SIZE,
      |     SYMCRYPT_SHA256_INPUT_BLOCK_SIZE,
      |     SYMCRYPT_FIELD_OFFSET( SYMCRYPT_SHA256_STATE, chain ),
      |     SYMCRYPT_FIELD_SIZE( SYMCRYPT_SHA256_STATE, chain ),
      | };
      |
      | const PCSYMCRYPT_HASH SymCryptSha256Algorithm = &SymCryptSha256Algorithm_default;
      |
      | //
      | // SHA-256 uses 64 magic constants of 32 bits each. These are
      | // referred to as K^{256}_i for i=0...63 by FIPS 180-2.
      | // This array is also used by the parallel SHA256 implementation
      | // For performance we align to 256 bytes, which gives optimal cache alignment.
      | //
      | SYMCRYPT_ALIGN_AT( 256 ) const  UINT32 SymCryptSha256K[64] = {
      |     0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
      |     0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
      |     0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
      |     0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
      |     0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
      |     0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
      |     0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
      |     0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
      |     0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
      |     0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
      |     0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
      |     0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
      |     0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
      |     0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
      |     0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
      |     0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
      | };
      |
      |
      | //
      | // Initial state
      | //
      | static const UINT32 sha256InitialState[8] = {
      |     0x6a09e667UL,
      |     0xbb67ae85UL,
      |     0x3c6ef372UL,
      |     0xa54ff53aUL,
      |     0x510e527fUL,
      |     0x9b05688cUL,
      |     0x1f83d9abUL,
      |     0x5be0cd19UL,
      | };
      |
      | //
      | // SymCryptSha256
      | //
      | #define ALG SHA256
      | #define Alg Sha256
      | #include "hash_pattern.c"
      | #undef ALG
      | #undef Alg
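
//
// A minimal usage sketch of the API instantiated above; the function name and
// memcmp check are illustrative, everything else is declared in this file or
// symcrypt.h. Splitting the input across Append calls gives the same digest
// as the one-shot form.
//
static VOID
Sha256UsageSketch( PCBYTE pbMsg, SIZE_T cbMsg, PBYTE pbDigest )
{
    SYMCRYPT_SHA256_STATE state;
    BYTE check[SYMCRYPT_SHA256_RESULT_SIZE];

    SymCryptSha256( pbMsg, cbMsg, pbDigest );       // one-shot, from hash_pattern.c

    SymCryptSha256Init( &state );                   // streaming equivalent
    SymCryptSha256Append( &state, pbMsg, cbMsg / 2 );
    SymCryptSha256Append( &state, pbMsg + cbMsg / 2, cbMsg - cbMsg / 2 );
    SymCryptSha256Result( &state, check );

    SYMCRYPT_ASSERT( memcmp( pbDigest, check, SYMCRYPT_SHA256_RESULT_SIZE ) == 0 );
}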
      |
      |
      |
      | //
      | // SymCryptSha256Init
      | //
      | SYMCRYPT_NOINLINE
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256Init( _Out_ PSYMCRYPT_SHA256_STATE pState )
   387| {
   387|     SYMCRYPT_SET_MAGIC( pState );
      |
   387|     pState->dataLengthL = 0;
      |     //pState->dataLengthH = 0;      // not used
   387|     pState->bytesInBuffer = 0;
      |
   387|     memcpy( &pState->chain.H[0], &sha256InitialState[0], sizeof( sha256InitialState ) );
      |
      |     //
      |     // There is no need to initialize the buffer part of the state as that will be
      |     // filled before it is used.
      |     //
   387| }
      |
      |
      | //
      | // SymCryptSha256Append
      | //
      | SYMCRYPT_NOINLINE
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256Append(
      |     _Inout_                 PSYMCRYPT_SHA256_STATE  pState,
      |     _In_reads_( cbData )    PCBYTE                  pbData,
      |                             SIZE_T                  cbData )
 48.0k| {
 48.0k|     UINT32  bytesInBuffer;
 48.0k|     UINT32  freeInBuffer;
 48.0k|     SIZE_T  tmp;
      |
 48.0k|     SYMCRYPT_CHECK_MAGIC( pState );
      |
 48.0k|     pState->dataLengthL += cbData;      // dataLengthH is not used...
      |
 48.0k|     bytesInBuffer = pState->bytesInBuffer;
      |
      |     //
      |     // If previous data in buffer, buffer new input and transform if possible.
      |     //
 48.0k|     if( bytesInBuffer > 0 )
 41.9k|     {
 41.9k|         SYMCRYPT_ASSERT( SYMCRYPT_SHA256_INPUT_BLOCK_SIZE > bytesInBuffer );
      |
 41.9k|         freeInBuffer = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer;
 41.9k|         if( cbData < freeInBuffer )
 37.9k|         {
      |             //
      |             // All the data will fit in the buffer.
      |             // We don't do anything here.
      |             // As cbData < inputBlockSize the bulk data processing is skipped,
      |             // and the data will be copied to the buffer at the end
      |             // of this code.
      |             //
 37.9k|         } else {
      |             //
      |             // Enough data to fill the whole buffer & process it
      |             //
 3.99k|             memcpy(&pState->buffer[bytesInBuffer], pbData, freeInBuffer);
 3.99k|             pbData += freeInBuffer;
 3.99k|             cbData -= freeInBuffer;
 3.99k|             SymCryptSha256AppendBlocks( &pState->chain, &pState->buffer[0], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE, &tmp );
      |
 3.99k|             bytesInBuffer = 0;
 3.99k|         }
 41.9k|     }
      |
      |     //
      |     // Internal buffer is empty; process all remaining whole blocks in the input
      |     //
 48.0k|     if( cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE )
 4.73k|     {
 4.73k|         SymCryptSha256AppendBlocks( &pState->chain, pbData, cbData, &tmp );
 4.73k|         SYMCRYPT_ASSERT( tmp < SYMCRYPT_SHA256_INPUT_BLOCK_SIZE );
 4.73k|         pbData += cbData - tmp;
 4.73k|         cbData = tmp;
 4.73k|     }
      |
 48.0k|     SYMCRYPT_ASSERT( cbData < SYMCRYPT_SHA256_INPUT_BLOCK_SIZE );
      |
      |     //
      |     // buffer remaining input if necessary.
      |     //
 48.0k|     if( cbData > 0 )
 15.9k|     {
 15.9k|         memcpy( &pState->buffer[bytesInBuffer], pbData, cbData );
 15.9k|         bytesInBuffer += (UINT32) cbData;
 15.9k|     }
      |
 48.0k|     pState->bytesInBuffer = bytesInBuffer;
 48.0k| }
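
//
// A worked example of the buffering above, with the 64-byte input block size:
// starting from an empty buffer, Append( 100 bytes ) hashes one whole block
// and leaves 36 bytes buffered; a following Append( 50 bytes ) first tops the
// buffer up with 28 bytes and hashes it, then buffers the remaining 22 bytes.
//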
      |
      |
      | //
      | // SymCryptSha256Result
      | //
      | SYMCRYPT_NOINLINE
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256Result(
      |     _Inout_                                     PSYMCRYPT_SHA256_STATE  pState,
      |     _Out_writes_( SYMCRYPT_SHA256_RESULT_SIZE ) PBYTE                   pbResult )
 11.5k| {
      |     //
      |     // We don't use the common padding code as that is slower, and SHA-256 is very frequently used in
      |     // performance-sensitive areas.
      |     //
 11.5k|     UINT32 bytesInBuffer;
 11.5k|     SIZE_T tmp;
      |
 11.5k|     SYMCRYPT_CHECK_MAGIC( pState );
      |
 11.5k|     bytesInBuffer = pState->bytesInBuffer;
      |
      |     //
      |     // The buffer is never completely full, so we can always put the first
      |     // padding byte in.
      |     //
 11.5k|     pState->buffer[bytesInBuffer++] = 0x80;
      |
 11.5k|     if( bytesInBuffer > 64-8 ) {
      |         //
      |         // No room for the rest of the padding. Pad with zeroes & process block
      |         // bytesInBuffer is at most 64, so we do not have an integer underflow
      |         //
   736|         SymCryptWipe( &pState->buffer[bytesInBuffer], 64-bytesInBuffer );
   736|         SymCryptSha256AppendBlocks( &pState->chain, pState->buffer, 64, &tmp );
   736|         bytesInBuffer = 0;
   736|     }
      |
      |     //
      |     // Set rest of padding
      |     // At this point bytesInBuffer <= 64-8, so we don't have an underflow
      |     // We wipe to the end of the buffer as it is 16-aligned,
      |     // and it is faster to wipe to an aligned point
      |     //
 11.5k|     SymCryptWipe( &pState->buffer[bytesInBuffer], 64-bytesInBuffer );
 11.5k|     SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[64-8], pState->dataLengthL * 8 );
      |
      |     //
      |     // Process the final block
      |     //
 11.5k|     SymCryptSha256AppendBlocks( &pState->chain, pState->buffer, 64, &tmp );
      |
      |     //
      |     // Write the output in the correct byte order
      |     //
 11.5k|     SymCryptUint32ToMsbFirst( &pState->chain.H[0], pbResult, 8 );
      |
      |     //
      |     // Wipe & re-initialize
      |     // We have to wipe the whole state because the Init call
      |     // might be optimized away by a smart compiler.
      |     //
 11.5k|     SymCryptWipeKnownSize( pState, sizeof( *pState ) );
      |
 11.5k|     memcpy( &pState->chain.H[0], &sha256InitialState[0], sizeof( sha256InitialState ) );
 11.5k|     SYMCRYPT_SET_MAGIC( pState );
 11.5k| }
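
//
// A concrete instance of the padding above for a 3-byte message such as "abc"
// (bytesInBuffer == 3, dataLengthL == 3): the single final block is
//
//     61 62 63 80 00 00 ... 00 00 00 00 00 00 00 00 18
//
// i.e. the message, the 0x80 byte, zeroes up to offset 56, and the bit length
// 3 * 8 = 0x18 stored MSB-first in the last 8 bytes.
//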
      |
      |
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256StateExport(
      |     _In_                                                    PCSYMCRYPT_SHA256_STATE pState,
      |     _Out_writes_bytes_( SYMCRYPT_SHA256_STATE_EXPORT_SIZE ) PBYTE                   pbBlob )
     0| {
     0|     SYMCRYPT_ALIGN SYMCRYPT_SHA256_STATE_EXPORT_BLOB blob;           // local copy to have proper alignment.
     0|     C_ASSERT( sizeof( blob ) == SYMCRYPT_SHA256_STATE_EXPORT_SIZE );
      |
     0|     SYMCRYPT_CHECK_MAGIC( pState );
      |
     0|     SymCryptWipeKnownSize( &blob, sizeof( blob ) ); // wipe to avoid any data leakage
      |
     0|     blob.header.magic = SYMCRYPT_BLOB_MAGIC;
     0|     blob.header.size = SYMCRYPT_SHA256_STATE_EXPORT_SIZE;
     0|     blob.header.type = SymCryptBlobTypeSha256State;
      |
      |     //
      |     // Copy the relevant data. Buffer will be 0-padded.
      |     //
      |
     0|     SymCryptUint32ToMsbFirst( &pState->chain.H[0], &blob.chain[0], 8 );
     0|     blob.dataLength = pState->dataLengthL;
     0|     memcpy( &blob.buffer[0], &pState->buffer[0], blob.dataLength & 0x3f );
      |
     0|     SYMCRYPT_ASSERT( (PCBYTE) &blob + sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ) == (PCBYTE) &blob.trailer );
     0|     SymCryptMarvin32( SymCryptMarvin32DefaultSeed, (PCBYTE) &blob, sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ), &blob.trailer.checksum[0] );
      |
     0|     memcpy( pbBlob, &blob, sizeof( blob ) );
      |
      | //cleanup:
     0|     SymCryptWipeKnownSize( &blob, sizeof( blob ) );
     0|     return;
     0| }
      |
      | SYMCRYPT_ERROR
      | SYMCRYPT_CALL
      | SymCryptSha256StateImport(
      |     _Out_                                                   PSYMCRYPT_SHA256_STATE  pState,
      |     _In_reads_bytes_( SYMCRYPT_SHA256_STATE_EXPORT_SIZE )   PCBYTE                  pbBlob )
     0| {
     0|     SYMCRYPT_ERROR                      scError = SYMCRYPT_NO_ERROR;
     0|     SYMCRYPT_ALIGN SYMCRYPT_SHA256_STATE_EXPORT_BLOB   blob;                       // local copy to have proper alignment.
     0|     BYTE                                checksum[8];
      |
     0|     C_ASSERT( sizeof( blob ) == SYMCRYPT_SHA256_STATE_EXPORT_SIZE );
     0|     memcpy( &blob, pbBlob, sizeof( blob ) );
      |
     0|     if( blob.header.magic != SYMCRYPT_BLOB_MAGIC ||
     0|         blob.header.size != SYMCRYPT_SHA256_STATE_EXPORT_SIZE ||
     0|         blob.header.type != SymCryptBlobTypeSha256State )
     0|     {
     0|         scError = SYMCRYPT_INVALID_BLOB;
     0|         goto cleanup;
     0|     }
      |
     0|     SymCryptMarvin32( SymCryptMarvin32DefaultSeed, (PCBYTE) &blob, sizeof( blob ) - sizeof( SYMCRYPT_BLOB_TRAILER ), checksum );
     0|     if( memcmp( checksum, &blob.trailer.checksum[0], 8 ) != 0 )
     0|     {
     0|         scError = SYMCRYPT_INVALID_BLOB;
     0|         goto cleanup;
     0|     }
      |
     0|     SymCryptMsbFirstToUint32( &blob.chain[0], &pState->chain.H[0], 8 );
     0|     pState->dataLengthL = blob.dataLength;
     0|     pState->bytesInBuffer = blob.dataLength & 0x3f;
     0|     memcpy( &pState->buffer[0], &blob.buffer[0], pState->bytesInBuffer );
      |
     0|     SYMCRYPT_SET_MAGIC( pState );
      |
     0| cleanup:
     0|     SymCryptWipeKnownSize( &blob, sizeof(blob) );
     0|     return scError;
     0| }
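
//
// A round-trip sketch for the two functions above; the function name is
// illustrative, and the blob size is fixed by the C_ASSERTs against the
// blob type.
//
static SYMCRYPT_ERROR
Sha256StateRoundTrip( PSYMCRYPT_SHA256_STATE pState )
{
    BYTE blob[SYMCRYPT_SHA256_STATE_EXPORT_SIZE];

    SymCryptSha256StateExport( pState, blob );          // serialize the mid-hash state
    return SymCryptSha256StateImport( pState, blob );   // SYMCRYPT_NO_ERROR for a valid blob
}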
      |
      |
      |
      | //
      | // Simple test vector for FIPS module testing
      | //
      |
      | const BYTE SymCryptSha256KATAnswer[ 32 ] = {
      |     0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
      |     0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
      |     0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
      |     0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad,
      | };
      |
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256Selftest(void)
     0| {
     0|     BYTE result[SYMCRYPT_SHA256_RESULT_SIZE];
      |
     0|     SymCryptSha256( SymCryptTestMsg3, sizeof( SymCryptTestMsg3 ), result );
      |
     0|     SymCryptInjectError( result, sizeof( result ) );
      |
     0|     if( memcmp( result, SymCryptSha256KATAnswer, sizeof( result ) ) != 0 ) {
     0|         SymCryptFatal( 'SH25' );
     0|     }
     0| }
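
//
// The KAT answer above is the well-known FIPS 180-2 example digest: it is
// SHA-256 of the 3-byte message "abc". Assuming SymCryptTestMsg3 is that
// message, the self-test is equivalent to:
//
//     BYTE result[SYMCRYPT_SHA256_RESULT_SIZE];
//     SymCryptSha256( (PCBYTE) "abc", 3, result );
//     // result now equals SymCryptSha256KATAnswer, unless error injection fired
//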
      |
      |
      |
      | //
      | // Below are multiple implementations of the SymCryptSha256AppendBlocks function,
      | // with a compile-time switch about which one to use.
      | // We keep the multiple implementations here for future reference;
      | // as CPU architectures evolve we might want to switch to one of the
      | // other implementations.
      | // All implementations here have been tested, but some lack production hardening.
      | //
      |
      | //
      | // Enable frame pointer omission to free up an extra register on X86.
      | //
      | #if SYMCRYPT_CPU_X86 && SYMCRYPT_MS_VC
      | #pragma optimize( "y", on )
      | #endif
      |
      | //
      | // For documentation on these functions, see FIPS 180-2.
      | //
      | // MAJ and CH are the functions Maj and Ch from the standard.
      | // CSIGMA0 and CSIGMA1 are the capital sigma functions.
      | // LSIGMA0 and LSIGMA1 are the lowercase sigma functions.
      | //
      | // The canonical definitions of the MAJ and CH functions are:
      | //#define MAJ( x, y, z )    (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
      | //#define CH( x, y, z )  (((x) & (y)) ^ ((~(x)) & (z)))
      | // We use optimized versions defined below.
      | //
      |
 28.4M| #define MAJ( x, y, z )  ((((z) | (y)) & (x) ) | ((z) & (y)))
 28.4M| #define CH( x, y, z )  ((((z) ^ (y)) & (x)) ^ (z))
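
//
// The optimized forms compute the same functions as the canonical ones; since
// MAJ and CH are bitwise-parallel, checking every bit pattern of a single bit
// position covers all 32 bits. A sketch of the check (the function name is
// illustrative):
//
static VOID
Sha256CheckMajCh( void )
{
    UINT32 x, y, z;

    for( x = 0; x < 2; x++ )
    for( y = 0; y < 2; y++ )
    for( z = 0; z < 2; z++ )
    {
        SYMCRYPT_ASSERT( MAJ( x, y, z ) == (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) );
        SYMCRYPT_ASSERT( CH( x, y, z )  == (((x) & (y)) ^ ((~(x)) & (z))) );
    }
}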
      |
      | //
      | // The four Sigma functions
      | //
      |
      | //
      | // We have two versions of the rotate-and-xor functions.
      | // One is just a macro that does the rotations and xors;
      | // this works well on ARM.
      | // For Intel/AMD we have one that derives each rotated value
      | // from the previous intermediate result. This removes one
      | // register copy from the code stream.
      | //
      | // In practice, our compiler doesn't take advantage of the
      | // reduction in the # of operations required, and inserts a
      | // bunch of extra register copies anyway.
      | // It actually hurts on AMD64.
      | //
      | // This should be re-tuned for every release to get the best overall
      | // SHA-256 performance.
      | // At the moment we get an improvement from 19.76 c/B to 19.40 c/B on a Core 2 core.
      | // We should probably tune this to the Atom CPU.
      | //
      | #if SYMCRYPT_CPU_X86
      | #define USE_CSIGMA0_MULTIROT 1
      | #define USE_CSIGMA1_MULTIROT 0
      | #define USE_LSIGMA0_MULTIROT 0
      | #define USE_LSIGMA1_MULTIROT 0
      |
      | #else
      | //
      | // On ARM we have no reason to believe this helps at all.
      | // On AMD64 it slows our code down.
      | //
      | #define USE_CSIGMA0_MULTIROT 0
      | #define USE_CSIGMA1_MULTIROT 0
      | #define USE_LSIGMA0_MULTIROT 0
      | #define USE_LSIGMA1_MULTIROT 0
      | #endif
      |
      | #if USE_CSIGMA0_MULTIROT
      | FORCEINLINE
      | UINT32
      | CSIGMA0( UINT32 x )
      | {
      |     UINT32 res;
      |     x = ROR32( x, 2 );
      |     res = x;
      |     x = ROR32( x, 11 );
      |     res ^= x;
      |     x = ROR32( x, 9 );
      |     res ^= x;
      |     return res;
      | }
      | #else
 28.4M| #define CSIGMA0( x )    (ROR32((x),  2) ^ ROR32((x), 13) ^ ROR32((x), 22))
      | #endif
      |
      | #if USE_CSIGMA1_MULTIROT
      | FORCEINLINE
      | UINT32
      | CSIGMA1( UINT32 x )
      | {
      |     UINT32 res;
      |     x = ROR32( x, 6 );
      |     res = x;
      |     x = ROR32( x, 5 );
      |     res ^= x;
      |     x = ROR32( x, 14 );
      |     res ^= x;
      |     return res;
      | }
      | #else
 28.4M| #define CSIGMA1( x )    (ROR32((x),  6) ^ ROR32((x), 11) ^ ROR32((x), 25))
      | #endif
      |
      | #if USE_LSIGMA0_MULTIROT
      | FORCEINLINE
      | UINT32
      | LSIGMA0( UINT32 x )
      | {
      |     UINT32 res;
      |     res = x >> 3;
      |     x = ROR32( x, 7 );
      |     res ^= x;
      |     x = ROR32( x, 11 );
      |     res ^= x;
      |     return res;
      | }
      | #else
 21.3M| #define LSIGMA0( x )    (ROR32((x),  7) ^ ROR32((x), 18) ^ ((x)>> 3))
      | #endif
      |
      | #if USE_LSIGMA1_MULTIROT
      | FORCEINLINE
      | UINT32
      | LSIGMA1( UINT32 x )
      | {
      |     UINT32 res;
      |     res = x >> 10;
      |     x = ROR32( x, 17 );
      |     res ^= x;
      |     x = ROR32( x, 2 );
      |     res ^= x;
      |     return res;
      | }
      | #else
 21.3M| #define LSIGMA1( x )    (ROR32((x), 17) ^ ROR32((x), 19) ^ ((x)>>10))
      | #endif
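
//
// The multi-rotate variants fold each rotation into the previous intermediate:
// for CSIGMA0 the cumulative rotations are 2, 2+11 = 13, and 13+9 = 22, so both
// variants compute ROR32(x,2) ^ ROR32(x,13) ^ ROR32(x,22). A sketch of the
// equivalence check, with a local reference rotate (names are illustrative):
//
static UINT32
Ror32Ref( UINT32 x, UINT32 n )              // 32-bit rotate right, 0 < n < 32
{
    return (x >> n) | (x << (32 - n));
}

static VOID
Sha256CheckCSigma0( UINT32 x )
{
    UINT32 direct = Ror32Ref( x, 2 ) ^ Ror32Ref( x, 13 ) ^ Ror32Ref( x, 22 );
    UINT32 t = Ror32Ref( x, 2 );            // ROR 2
    UINT32 multi = t;

    t = Ror32Ref( t, 11 );                  // cumulative ROR 13
    multi ^= t;
    t = Ror32Ref( t, 9 );                   // cumulative ROR 22
    multi ^= t;

    SYMCRYPT_ASSERT( direct == multi );
}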
      |
      |
      | //
      | // The values a-h are stored in an array called ah.
      | // We have unrolled the loop 16 times. This makes both the indices into
      | // the ah array constant, and it makes the message addressing constant.
      | // This provides a significant speed improvement, at the cost of making
      | // the main loop about 4 kB in code.
      | //
      | // The earlier implementation had the loop unrolled 8 times, and is
      | // around 10 cycles/byte slower. If loading the code from disk takes
      | // 100 cycles/byte, then we break even once you have hashed 20 kB.
      | // This is a worthwhile tradeoff as all code is codesigned with SHA-256.
      | //
      |
      | //
      | // Core round macro
      | //
      | // r16 is the round number mod 16, r is the round number.
      | // r16 is a separate macro argument because it is always a compile-time constant,
      | // which allows much better optimisations of the memory accesses.
      | //
      | // ah[ r16   &7] = h;
      | // ah[(r16+1)&7] = g;
      | // ah[(r16+2)&7] = f;
      | // ah[(r16+3)&7] = e;
      | // ah[(r16+4)&7] = d;
      | // ah[(r16+5)&7] = c;
      | // ah[(r16+6)&7] = b;
      | // ah[(r16+7)&7] = a;
      | //
      | // After that, incrementing the round number will automatically map a->b, b->c, etc.
      | //
      | // The core round, after the message word has been computed for this round and put in Wt.
      | // r16 is the round number modulo 16. (Static after loop unrolling.)
      | // r is the round number (dynamic, which is why we don't use (r&0xf) for r16).
      | // In more readable form this macro does the following:
      | //      h += CSIGMA1( e ) + CH( e, f, g ) + K[round] + W[round];
      | //      d += h;
      | //      h += CSIGMA0( a ) + MAJ( a, b, c );
      | //
 28.4M| #define CROUND( r16, r ) {;\
 28.4M|     ah[ r16   &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + SymCryptSha256K[r] + Wt;\
 28.4M|     ah[(r16+4)&7] += ah[r16 &7];\
 28.4M|     ah[ r16   &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\
 28.4M| }
      |
      | //
      | // Initial round that reads the message.
      | // r is the round number 0..15
      | //
 7.11M| #define IROUND( r ) {\
 7.11M|     Wt = SYMCRYPT_LOAD_MSBFIRST32( &pbData[ 4*r ] );\
 7.11M|     W[r] = Wt; \
 7.11M|     CROUND(r,r);\
 7.11M|     }
      |
      | //
      | // Subsequent rounds.
      | // r16 is the round number mod 16. rb is the round number minus r16.
      | //
 21.3M| #define FROUND(r16, rb) {                                      \
 21.3M|     Wt = LSIGMA1( W[(r16-2) & 15] ) +   W[(r16-7) & 15] +    \
 21.3M|          LSIGMA0( W[(r16-15) & 15]) +   W[r16 & 15];       \
 21.3M|     W[r16] = Wt; \
 21.3M|     CROUND( r16, r16+rb ); \
 21.3M| }
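
//
// The same round in straight-line form, with explicit a..h in s[0..7]: a
// readable reference for what CROUND computes via its rotating indices. The
// function name is illustrative; the macros are the ones defined above.
//
static VOID
Sha256RoundRef( UINT32 s[8], UINT32 Kt, UINT32 Wt )
{
    UINT32 t1 = s[7] + CSIGMA1( s[4] ) + CH( s[4], s[5], s[6] ) + Kt + Wt;
    UINT32 t2 = CSIGMA0( s[0] ) + MAJ( s[0], s[1], s[2] );

    s[7] = s[6]; s[6] = s[5]; s[5] = s[4];  // h = g; g = f; f = e;
    s[4] = s[3] + t1;                       // e = d + T1;
    s[3] = s[2]; s[2] = s[1]; s[1] = s[0];  // d = c; c = b; b = a;
    s[0] = t1 + t2;                         // a = T1 + T2;
}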
      |
      | //
      | // UINT32 implementation 1
      | //
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256AppendBlocks_ul1(
      |     _Inout_                 SYMCRYPT_SHA256_CHAINING_STATE *    pChain,
      |     _In_reads_( cbData )    PCBYTE                              pbData,
      |                             SIZE_T                              cbData,
      |     _Out_                   SIZE_T                            * pcbRemaining )
 21.4k| {
 21.4k|     SYMCRYPT_ALIGN UINT32 W[16];
 21.4k|     SYMCRYPT_ALIGN UINT32 ah[8];
 21.4k|     int round;
 21.4k|     UINT32 Wt;
      |
  465k|     while( cbData >= 64 )
  444k|     {
  444k|         ah[7] = pChain->H[0];
  444k|         ah[6] = pChain->H[1];
  444k|         ah[5] = pChain->H[2];
  444k|         ah[4] = pChain->H[3];
  444k|         ah[3] = pChain->H[4];
  444k|         ah[2] = pChain->H[5];
  444k|         ah[1] = pChain->H[6];
  444k|         ah[0] = pChain->H[7];
      |
      |         //
      |         // Initial rounds 0 to 15
      |         //
      |
  444k|         IROUND(  0 );
  444k|         IROUND(  1 );
  444k|         IROUND(  2 );
  444k|         IROUND(  3 );
  444k|         IROUND(  4 );
  444k|         IROUND(  5 );
  444k|         IROUND(  6 );
  444k|         IROUND(  7 );
  444k|         IROUND(  8 );
  444k|         IROUND(  9 );
  444k|         IROUND( 10 );
  444k|         IROUND( 11 );
  444k|         IROUND( 12 );
  444k|         IROUND( 13 );
  444k|         IROUND( 14 );
  444k|         IROUND( 15 );
      |
      |
      |         //
      |         // Rounds 16 to 63.
      |         //
 1.77M|         for( round=16; round<64; round += 16 )
 1.33M|         {
 1.33M|             FROUND(  0, round );
 1.33M|             FROUND(  1, round );
 1.33M|             FROUND(  2, round );
 1.33M|             FROUND(  3, round );
 1.33M|             FROUND(  4, round );
 1.33M|             FROUND(  5, round );
 1.33M|             FROUND(  6, round );
 1.33M|             FROUND(  7, round );
 1.33M|             FROUND(  8, round );
 1.33M|             FROUND(  9, round );
 1.33M|             FROUND( 10, round );
 1.33M|             FROUND( 11, round );
 1.33M|             FROUND( 12, round );
 1.33M|             FROUND( 13, round );
 1.33M|             FROUND( 14, round );
 1.33M|             FROUND( 15, round );
 1.33M|         }
      |
  444k|         pChain->H[0] = ah[7] + pChain->H[0];
  444k|         pChain->H[1] = ah[6] + pChain->H[1];
  444k|         pChain->H[2] = ah[5] + pChain->H[2];
  444k|         pChain->H[3] = ah[4] + pChain->H[3];
  444k|         pChain->H[4] = ah[3] + pChain->H[4];
  444k|         pChain->H[5] = ah[2] + pChain->H[5];
  444k|         pChain->H[6] = ah[1] + pChain->H[6];
  444k|         pChain->H[7] = ah[0] + pChain->H[7];
      |
  444k|         pbData += 64;
  444k|         cbData -= 64;
      |
  444k|     }
      |
 21.4k|     *pcbRemaining = cbData;
      |
      |     //
      |     // Wipe the variables
      |     //
 21.4k|     SymCryptWipeKnownSize( ah, sizeof( ah ) );
 21.4k|     SymCryptWipeKnownSize( W, sizeof( W ) );
 21.4k|     SYMCRYPT_FORCE_WRITE32( &Wt, 0 );
 21.4k| }
      |
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256AppendBlocks_ul2(
      |     _Inout_                 SYMCRYPT_SHA256_CHAINING_STATE *    pChain,
      |     _In_reads_( cbData )    PCBYTE                              pbData,
      |                             SIZE_T                              cbData,
      |     _Out_                   SIZE_T                            * pcbRemaining )
     0| {
      |     //
      |     // Different arrangement of the code, currently 25 c/B vs 20 c/B for the version above.
      |     // On Atom: 50 c/B vs 41 c/B for the one above.
      |     //
     0|     SYMCRYPT_ALIGN UINT32 buf[4 + 8 + 64];    // chaining state concatenated with the expanded input block
     0|     UINT32 * W = &buf[4 + 8];
     0|     UINT32 * ha = &buf[4]; // initial state words, in order h, g, ..., b, a
     0|     UINT32 A, B, C, D, T;
     0|     int r;
      |
     0|     ha[7] = pChain->H[0]; buf[3] = ha[7];
     0|     ha[6] = pChain->H[1]; buf[2] = ha[6];
     0|     ha[5] = pChain->H[2]; buf[1] = ha[5];
     0|     ha[4] = pChain->H[3]; buf[0] = ha[4];
     0|     ha[3] = pChain->H[4];
     0|     ha[2] = pChain->H[5];
     0|     ha[1] = pChain->H[6];
     0|     ha[0] = pChain->H[7];
      |
     0|     while( cbData >= 64 )
     0|     {
      |         //
      |         // Capture the input into W[0..15]
      |         //
     0|         for( r=0; r<16; r++ )
     0|         {
     0|             W[r] = SYMCRYPT_LOAD_MSBFIRST32( &pbData[ 4*r ] );
     0|         }
      |
      |         //
      |         // Expand the message
      |         //
     0|         A = W[15];
     0|         B = W[14];
     0|         D = W[0];
     0|         for( r=16; r<64; r+= 2 )
     0|         {
      |             // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16]
      |
      |             //
      |             // Macro for one word of message expansion.
      |             // Invariant:
      |             // on entry: a = W[r-1], b = W[r-2], d = W[r-16]
      |             // on exit:  W[r] computed, a = W[r-1], b = W[r], c = W[r-15]
      |             //
     0|             #define EXPAND( a, b, c, d, r ) \
     0|                         c = W[r-15]; \
     0|                         b =  d + LSIGMA1( b ) + W[r-7] + LSIGMA0( c ); \
     0|                         W[r] = b; \
     0|
     0|             EXPAND( A, B, C, D, r );
     0|             EXPAND( B, A, D, C, (r+1));
      |
     0|             #undef EXPAND
     0|         }
      |
     0|         A = ha[7];
     0|         B = ha[6];
     0|         C = ha[5];
     0|         D = ha[4];
      |
     0|         for( r=0; r<64; r += 4 )
     0|         {
      |             //
      |             // Loop invariant:
      |             // A, B, C, and D are the a,b,c,d values of the current state.
      |             // W[r] is the next expanded message word to be processed.
      |             // W[r-8 .. r-5] contain the current state words h, g, f, e.
      |             //
      |
      |             //
      |             // Macro to compute one round
      |             //
     0|             #define DO_ROUND( a, b, c, d, t, r ) \
     0|                 t = W[r] + CSIGMA1( W[r-5] ) + W[r-8] + CH( W[r-5], W[r-6], W[r-7] ) + SymCryptSha256K[r]; \
     0|                 W[r-4] = t + d; \
     0|                 d = t + CSIGMA0( a ) + MAJ( c, b, a );
      |
     0|             DO_ROUND( A, B, C, D, T, r );
     0|             DO_ROUND( D, A, B, C, T, (r+1) );
     0|             DO_ROUND( C, D, A, B, T, (r+2) );
     0|             DO_ROUND( B, C, D, A, T, (r+3) );
     0|             #undef DO_ROUND
     0|         }
      |
     0|         buf[3] = ha[7] = buf[3] + A;
     0|         buf[2] = ha[6] = buf[2] + B;
     0|         buf[1] = ha[5] = buf[1] + C;
     0|         buf[0] = ha[4] = buf[0] + D;
     0|         ha[3] += W[r-5];
     0|         ha[2] += W[r-6];
     0|         ha[1] += W[r-7];
     0|         ha[0] += W[r-8];
      |
     0|         pbData += 64;
     0|         cbData -= 64;
     0|     }
      |
     0|     pChain->H[0] = ha[7];
     0|     pChain->H[1] = ha[6];
     0|     pChain->H[2] = ha[5];
     0|     pChain->H[3] = ha[4];
     0|     pChain->H[4] = ha[3];
     0|     pChain->H[5] = ha[2];
     0|     pChain->H[6] = ha[1];
     0|     pChain->H[7] = ha[0];
      |
     0|     *pcbRemaining = cbData;
      |
     0|     SymCryptWipeKnownSize( buf, sizeof( buf ) );
     0|     SYMCRYPT_FORCE_WRITE32( &A, 0 );
     0|     SYMCRYPT_FORCE_WRITE32( &B, 0 );
     0|     SYMCRYPT_FORCE_WRITE32( &D, 0 );
     0|     SYMCRYPT_FORCE_WRITE32( &T, 0 );
     0| }
      |
      | #undef CROUND
      | #undef IROUND
      | #undef FROUND
      |
      | #if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
      |
      | //
      | // Don't omit the frame pointer for XMM code; it isn't as register-starved.
      | //
      | #if SYMCRYPT_CPU_X86 && SYMCRYPT_MS_VC
      | #pragma optimize( "y", off )
      | #endif
      |
      | //
      | // Code that uses the XMM registers.
      | // This code is currently unused. It was written in case it would provide better performance, but
      | // it did not. We are retaining it in case it might be useful in a future CPU generation.
      | //
      | #if 0
      |
      | #define MAJXMM( x, y, z ) _mm_or_si128( _mm_and_si128( _mm_or_si128( z, y ), x ), _mm_and_si128( z, y ))
      | #define CHXMM( x, y, z )  _mm_xor_si128( _mm_and_si128( _mm_xor_si128( z, y ), x ), z )
      |
      | #define CSIGMA0XMM( x ) \
      |     _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
      |         _mm_slli_epi32(x,30)  , _mm_srli_epi32(x,  2) ),\
      |         _mm_slli_epi32(x,19) ), _mm_srli_epi32(x, 13) ),\
      |         _mm_slli_epi32(x,10) ), _mm_srli_epi32(x, 22) )
      | #define CSIGMA1XMM( x ) \
      |     _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
      |         _mm_slli_epi32(x,26)  , _mm_srli_epi32(x,  6) ),\
      |         _mm_slli_epi32(x,21) ), _mm_srli_epi32(x, 11) ),\
      |         _mm_slli_epi32(x,7) ), _mm_srli_epi32(x, 25) )
      | #define LSIGMA0XMM( x ) \
      |     _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
      |         _mm_slli_epi32(x,25)  , _mm_srli_epi32(x,  7) ),\
      |         _mm_slli_epi32(x,14) ), _mm_srli_epi32(x, 18) ),\
      |         _mm_srli_epi32(x, 3) )
      | #define LSIGMA1XMM( x ) \
      |     _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
      |         _mm_slli_epi32(x,15)  , _mm_srli_epi32(x, 17) ),\
      |         _mm_slli_epi32(x,13) ), _mm_srli_epi32(x, 19) ),\
      |         _mm_srli_epi32(x,10) )
      |
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256AppendBlocks_xmm1(
      |     _Inout_                 SYMCRYPT_SHA256_CHAINING_STATE *    pChain,
      |     _In_reads_( cbData )    PCBYTE                              pbData,
      |                             SIZE_T                              cbData,
      |     _Out_                   SIZE_T                            * pcbRemaining )
      | {
      |     //
      |     // Implementation that has one value in each XMM register.
      |     // This is significantly slower than the _ul1 implementation
      |     // but can be extended to compute 4 hash blocks in parallel.
      |     //
      |     SYMCRYPT_ALIGN __m128i buf[4 + 8 + 64];    // chaining state concatenated with the expanded input block
      |     __m128i * W = &buf[4 + 8];
      |     __m128i * ha = &buf[4]; // initial state words, in order h, g, ..., b, a
      |     __m128i A, B, C, D, T;
      |     int r;
      |
      |     //
      |     // For 1-input only; set the input buffer to zero so that we have known values in every byte
      |     //
      |     //SymCryptWipeKnownSize( buf, sizeof( buf ) );
      |
      |     //
      |     // Copy the chaining state into the start of the buffer, order = h,g,f,e,d,c,b,a
      |     //
      |     ha[7] = _mm_insert_epi32(ha[7], pChain->H[0], 0);
      |     ha[6] = _mm_insert_epi32(ha[6], pChain->H[1], 0);
      |     ha[5] = _mm_insert_epi32(ha[5], pChain->H[2], 0);
      |     ha[4] = _mm_insert_epi32(ha[4], pChain->H[3], 0);
      |     ha[3] = _mm_insert_epi32(ha[3], pChain->H[4], 0);
      |     ha[2] = _mm_insert_epi32(ha[2], pChain->H[5], 0);
      |     ha[1] = _mm_insert_epi32(ha[1], pChain->H[6], 0);
      |     ha[0] = _mm_insert_epi32(ha[0], pChain->H[7], 0);
      |
      |     buf[0] = ha[4];
      |     buf[1] = ha[5];
      |     buf[2] = ha[6];
      |     buf[3] = ha[7];
      |
      |     while( cbData >= 64 )
      |     {
      |
      |         //
      |         // Capture the input into W[0..15]
      |         //
      |         for( r=0; r<16; r++ )
      |         {
      |             W[r] = _mm_insert_epi32(W[r], SYMCRYPT_LOAD_MSBFIRST32( &pbData[ 4*r ] ), 0);
      |         }
      |
      |         //
      |         // Expand the message
      |         //
      |         A = W[15];
      |         B = W[14];
      |         D = W[0];
      |         for( r=16; r<64; r+= 2 )
      |         {
      |             // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16]
      |
      |             //
      |             // Macro for one word of message expansion.
      |             // Invariant:
      |             // on entry: a = W[r-1], b = W[r-2], d = W[r-16]
      |             // on exit:  W[r] computed, a = W[r-1], b = W[r], c = W[r-15]
      |             //
      |             #define EXPAND( a, b, c, d, r ) \
      |                         c = W[r-15]; \
      |                         b = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( d, LSIGMA1XMM( b ) ), W[r-7] ), LSIGMA0XMM( c ) ); \
      |                         W[r] = b; \
      |
      |             EXPAND( A, B, C, D, r );
      |             EXPAND( B, A, D, C, (r+1));
      |
      |             #undef EXPAND
      |         }
      |
      |         A = ha[7];
      |         B = ha[6];
      |         C = ha[5];
      |         D = ha[4];
      |
      |         for( r=0; r<64; r += 4 )
      |         {
      |             //
      |             // Loop invariant:
      |             // A, B, C, and D are the a,b,c,d values of the current state.
      |             // W[r] is the next expanded message word to be processed.
      |             // W[r-8 .. r-5] contain the current state words h, g, f, e.
      |             //
      |
      |             //
      |             // Macro to compute one round
      |             //
      |             #define DO_ROUND( a, b, c, d, t, r ) \
      |                 t = W[r]; \
      |                 t = _mm_add_epi32( t, CSIGMA1XMM( W[r-5] ) ); \
      |                 t = _mm_add_epi32( t, W[r-8] ); \
      |                 t = _mm_add_epi32( t, CHXMM( W[r-5], W[r-6], W[r-7] ) ); \
      |                 t = _mm_add_epi32( t, _mm_cvtsi32_si128( SymCryptSha256K[r] ) ); \
      |                 W[r-4] = _mm_add_epi32( t, d ); \
      |                 d = _mm_add_epi32( t, CSIGMA0XMM( a ) ); \
      |                 d = _mm_add_epi32( d, MAJXMM( c, b, a ) );
      |
      |             DO_ROUND( A, B, C, D, T, r );
      |             DO_ROUND( D, A, B, C, T, (r+1) );
      |             DO_ROUND( C, D, A, B, T, (r+2) );
      |             DO_ROUND( B, C, D, A, T, (r+3) );
      |             #undef DO_ROUND
      |         }
      |
      |         buf[3] = ha[7] = _mm_add_epi32( buf[3], A );
      |         buf[2] = ha[6] = _mm_add_epi32( buf[2], B );
      |         buf[1] = ha[5] = _mm_add_epi32( buf[1], C );
      |         buf[0] = ha[4] = _mm_add_epi32( buf[0], D );
      |         ha[3] = _mm_add_epi32( ha[3], W[r-5] );
      |         ha[2] = _mm_add_epi32( ha[2], W[r-6] );
      |         ha[1] = _mm_add_epi32( ha[1], W[r-7] );
      |         ha[0] = _mm_add_epi32( ha[0], W[r-8] );
      |
      |         pbData += 64;
      |         cbData -= 64;
      |     }
      |
      |     //
      |     // Copy the chaining state back into the hash structure
      |     //
      |     pChain->H[0] = _mm_extract_epi32(ha[7], 0);
      |     pChain->H[1] = _mm_extract_epi32(ha[6], 0);
      |     pChain->H[2] = _mm_extract_epi32(ha[5], 0);
      |     pChain->H[3] = _mm_extract_epi32(ha[4], 0);
      |     pChain->H[4] = _mm_extract_epi32(ha[3], 0);
      |     pChain->H[5] = _mm_extract_epi32(ha[2], 0);
      |     pChain->H[6] = _mm_extract_epi32(ha[1], 0);
      |     pChain->H[7] = _mm_extract_epi32(ha[0], 0);
      |
      |     *pcbRemaining = cbData;
      |
      |     SymCryptWipeKnownSize( buf, sizeof( buf ) );
      |     SymCryptWipeKnownSize( &A, sizeof( A ) );
      |     SymCryptWipeKnownSize( &B, sizeof( B ) );
      |     SymCryptWipeKnownSize( &C, sizeof( C ) );
      |     SymCryptWipeKnownSize( &D, sizeof( D ) );
      |     SymCryptWipeKnownSize( &T, sizeof( T ) );
      | }
      |
      |
      | //
      | // XMM implementation 2
      | // We use the XMM registers to compute part of the message schedule.
      | // The load, BSWAP, and part of the message schedule recursion are done in XMM registers.
      | // The rest of the work is done using integers.
      | //
      | // Core2: 0.1 c/B slower than the _ul1
      | // Atom: 1.0 c/B slower than _ul1   (42.34 vs 41.39 c/B)
      | //
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256AppendBlocks_xmm2(
      |     _Inout_                 SYMCRYPT_SHA256_CHAINING_STATE *    pChain,
      |     _In_reads_( cbData )    PCBYTE                              pbData,
      |                             SIZE_T                              cbData,
      |     _Out_                   SIZE_T                            * pcbRemaining )
      | {
      |     SYMCRYPT_ALIGN union { UINT32 ul[16]; __m128i xmm[4]; } W;
      |     SYMCRYPT_ALIGN UINT32 ah[8];
      |     int round;
      |     UINT32 Wt;
      |     const __m128i BYTE_REVERSE_32 = _mm_set_epi8( 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 );
      |
      |     ah[7] = pChain->H[0];
      |     ah[6] = pChain->H[1];
      |     ah[5] = pChain->H[2];
      |     ah[4] = pChain->H[3];
      |     ah[3] = pChain->H[4];
      |     ah[2] = pChain->H[5];
      |     ah[1] = pChain->H[6];
      |     ah[0] = pChain->H[7];
      |
      | #define CROUND( r16, r ) {;\
      |     ah[ r16   &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + SymCryptSha256K[r] + Wt;\
      |     ah[(r16+4)&7] += ah[r16 &7];\
      |     ah[ r16   &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\
      | }
      |
      |
      | //
      | // Initial round that reads the message.
      | // r is the round number 0..15
      | //
      | //    Wt = LOAD_MSBFIRST32( &pbData[ 4*r ] );\
      | //    W.ul[r] = Wt; \
      |
      | #define IROUND( r ) {\
      |     Wt = W.ul[r];\
      |     CROUND(r,r);\
      |     }
      |
      | //
      | // Subsequent rounds.
      | // r16 is the round number mod 16. rb is the round number minus r16.
      | //
      | #define FROUND(r16, rb) { \
      |     Wt = W.ul[r16];\
      |     CROUND( r16, r16+rb ); \
      | }
      |
      |
      |     while( cbData >= 64 )
      |     {
      |         //
      |         // The code is faster if we directly access the W.ul array, rather than the W.xmm alias.
      |         // I think the compiler gets more confused if you use the W.xmm values.
      |         // We retain them in the union to ensure alignment.
      |         //
      |         _mm_store_si128( (__m128i *)&W.ul[ 0], _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *)&pbData[   0 ] ), BYTE_REVERSE_32 ));
      |         _mm_store_si128( (__m128i *)&W.ul[ 4], _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *)&pbData[  16 ] ), BYTE_REVERSE_32 ));
      |         _mm_store_si128( (__m128i *)&W.ul[ 8], _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *)&pbData[  32 ] ), BYTE_REVERSE_32 ));
      |         _mm_store_si128( (__m128i *)&W.ul[12], _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *)&pbData[  48 ] ), BYTE_REVERSE_32 ));
      |
      |         //
      |         // Initial rounds 0 to 15
      |         //
      |
      |         IROUND(  0 );
      |         IROUND(  1 );
      |         IROUND(  2 );
      |         IROUND(  3 );
      |         IROUND(  4 );
      |         IROUND(  5 );
      |         IROUND(  6 );
      |         IROUND(  7 );
      |         IROUND(  8 );
      |         IROUND(  9 );
      |         IROUND( 10 );
      |         IROUND( 11 );
      |         IROUND( 12 );
      |         IROUND( 13 );
      |         IROUND( 14 );
      |         IROUND( 15 );
      |
      |
      |         //
      |         // Rounds 16 to 63.
      |         //
      |         for( round=16; round<64; round += 16 )
      |         {
      |             __m128i Tmp;
      |
      |             Tmp = _mm_add_epi32( _mm_add_epi32(
      |                     LSIGMA0XMM(_mm_loadu_si128( (__m128i *)&W.ul[1] )),
      |                     _mm_load_si128( (__m128i *)&W.ul[0] ) ),
      |                     _mm_loadu_si128( (__m128i *)&W.ul[9] ) );
      |
      |             //
      |             // The final part of the message schedule can be done in XMM registers, but it isn't worth it.
      |             // The rotates in XMM take two shifts and an OR/XOR, vs one instruction in integer registers.
      |             // As the sigma1( W_{t-2} ) recursion component can only be computed 2 at a time
      |             // (because the result of the first two are the inputs to the second two)
      |             // you lose more than you gain by using XMM registers.
      |             //
      |             //Tmp = _mm_add_epi32( Tmp, LSIGMA1XMM( _mm_srli_si128( _mm_load_si128( (__m128i *)&W.ul[12] ), 8 ) ) );
      |             //Tmp = _mm_add_epi32( Tmp, LSIGMA1XMM( _mm_slli_si128( Tmp, 8 ) ) );
      |             //_mm_store_si128( (__m128i *)&W.ul[0], Tmp );
      |             //
      |
      |             _mm_store_si128( (__m128i *)&W.ul[0], Tmp );
      |             W.ul[0] += LSIGMA1( W.ul[14] );
      |             W.ul[1] += LSIGMA1( W.ul[15] );
      |             W.ul[2] += LSIGMA1( W.ul[0] );
      |             W.ul[3] += LSIGMA1( W.ul[1] );
      |
      |             FROUND(  0, round );
      |             FROUND(  1, round );
      |             FROUND(  2, round );
      |             FROUND(  3, round );
      |
      |             Tmp = _mm_add_epi32( _mm_add_epi32(
      |                     LSIGMA0XMM(_mm_loadu_si128( (__m128i *)&W.ul[5] )),
      |                     _mm_load_si128( (__m128i *)&W.ul[4] ) ),
      |                     _mm_alignr_epi8( _mm_load_si128( (__m128i *)&W.ul[0] ), _mm_load_si128( (__m128i *)&W.ul[12] ), 4) );
      |
      |             _mm_store_si128( (__m128i *)&W.ul[4], Tmp );
      |
      |             W.ul[4] += LSIGMA1( W.ul[2] );
      |             W.ul[5] += LSIGMA1( W.ul[3] );
      |             W.ul[6] += LSIGMA1( W.ul[4] );
      |             W.ul[7] += LSIGMA1( W.ul[5] );
      |
      |             FROUND(  4, round );
      |             FROUND(  5, round );
      |             FROUND(  6, round );
      |             FROUND(  7, round );
      |
      |             Tmp = _mm_add_epi32( _mm_add_epi32(
      |                     LSIGMA0XMM(_mm_loadu_si128( (__m128i *)&W.ul[9] )),
      |                     _mm_load_si128( (__m128i *)&W.ul[8] ) ),
      |                     _mm_loadu_si128( (__m128i *)&W.ul[1] ) );
      |
      |             _mm_store_si128( (__m128i *)&W.ul[8], Tmp );
      |             W.ul[ 8] += LSIGMA1( W.ul[6] );
      |             W.ul[ 9] += LSIGMA1( W.ul[7] );
      |             W.ul[10] += LSIGMA1( W.ul[8] );
      |             W.ul[11] += LSIGMA1( W.ul[9] );
      |
      |             FROUND(  8, round );
      |             FROUND(  9, round );
      |             FROUND( 10, round );
      |             FROUND( 11, round );
      |
      |
      |             Tmp = _mm_add_epi32( _mm_add_epi32(
      |                     LSIGMA0XMM( _mm_alignr_epi8( _mm_load_si128( (__m128i *)&W.ul[0] ), _mm_load_si128( (__m128i *)&W.ul[12] ), 4) ),
      |                     _mm_load_si128( (__m128i *)&W.ul[12] ) ),
      |                     _mm_loadu_si128( (__m128i *)&W.ul[5] ) );
      |
      |             _mm_store_si128( (__m128i *)&W.ul[12], Tmp );
      |             W.ul[12] += LSIGMA1( W.ul[10] );
      |             W.ul[13] += LSIGMA1( W.ul[11] );
      |             W.ul[14] += LSIGMA1( W.ul[12] );
      |             W.ul[15] += LSIGMA1( W.ul[13] );
      |
      |             FROUND( 12, round );
      |             FROUND( 13, round );
      |             FROUND( 14, round );
      |             FROUND( 15, round );
      |         }
      |
      |         pChain->H[0] = ah[7] = ah[7] + pChain->H[0];
      |         pChain->H[1] = ah[6] = ah[6] + pChain->H[1];
      |         pChain->H[2] = ah[5] = ah[5] + pChain->H[2];
      |         pChain->H[3] = ah[4] = ah[4] + pChain->H[3];
      |         pChain->H[4] = ah[3] = ah[3] + pChain->H[4];
      |         pChain->H[5] = ah[2] = ah[2] + pChain->H[5];
      |         pChain->H[6] = ah[1] = ah[1] + pChain->H[6];
      |         pChain->H[7] = ah[0] = ah[0] + pChain->H[7];
      |
      |         pbData += 64;
      |         cbData -= 64;
      |
      |     }
      |
      |     *pcbRemaining = cbData;
      |
      |     //
      |     // Wipe the variables
      |     //
      |     SymCryptWipeKnownSize( ah, sizeof( ah ) );
      |     SymCryptWipeKnownSize( &W, sizeof( W ) );
      |     SYMCRYPT_FORCE_WRITE32( &Wt, 0 );
      |
      | #undef IROUND
      | #undef FROUND
      | #undef CROUND
      | }
      |
      | #endif
      |
      | //
      | // SHA-NI Implementation
      | //
      |
      | #if SYMCRYPT_MS_VC
      | // Intrinsic definitions included here
      | // until the header is updated.
      | // *******************************
      | extern __m128i _mm_sha256rnds2_epu32(__m128i, __m128i, __m128i);
      | extern __m128i _mm_sha256msg1_epu32(__m128i, __m128i);
      | extern __m128i _mm_sha256msg2_epu32(__m128i, __m128i);
      | // *******************************
      | #endif
      |
      | // For the SHA-NI implementation we will utilize 128-bit XMM registers. Each
      | // XMM state will be denoted as (R_3, R_2, R_1, R_0), where each R_i
      | // is a 32-bit word and R_i refers to bits [32*i : (32*i + 31)] of the
      | // 128-bit XMM state.
      | //
      | // The following macro updates the state variables A,B,C,...,H of the SHA algorithms
      | // for 4 rounds using:
      | //  - The current round number t with 0 <= t <= 63 and t a multiple of 4.
      | //  - A current message XMM state _MSG which consists of 4 32-bit words
      | //      ( W_(t+3), W_(t+2), W_(t+1), W_(t+0) ).
      | //  - Two XMM states _ABEF and _CDGH which contain the variables
      | //      ( A, B, E, F ) and ( C, D, G, H ) respectively.
      |
      | #define SHANI_UPDATE_STATE( _round, _MSG, _ABEF, _CDGH ) \
     0|     _MSG = _mm_add_epi32( _MSG, *(__m128i *)&SymCryptSha256K[_round] );     /* Add the K_t constants to the W_t's */    \
     0|     _CDGH = _mm_sha256rnds2_epu32( _CDGH, _ABEF, _MSG );                    /* 2 rounds using SHA-NI */                 \
     0|     _MSG = _mm_shuffle_epi32( _MSG, 0x0e );                                 /* Move words 2 & 3 to positions 0 & 1 */   \
     0|     _ABEF = _mm_sha256rnds2_epu32( _ABEF, _CDGH, _MSG );                    /* 2 rounds using SHA-NI */
      |
      | // For the SHA message schedule (i.e. to create words W_16 to W_63) we use 4 XMM states / accumulators.
      | // Each accumulator holds 4 words.
      | //
      | // The final result for each word will be of the form W_t = X_t + Y_t, where
      | //          X_t = W_(t-16) + \sigma_0(W_(t-15)) and
      | //          Y_t = W_(t- 7) + \sigma_1(W_(t- 2))
      | //
      | //          The X_t's are calculated by the _mm_sha256msg1_epu32 intrinsic.
      | //          The \sigma_1(W_(t-2)) part of the Y_t's by the _mm_sha256msg2_epu32 intrinsic.
      | //
      | // Remarks:
      | //      - Calculation of the first four X_t's (i.e. 16<=t<=19) can start from round 4 (since 19-15 = 4).
      | //      - Calculation of the first four Y_t's can start from round 12 (since 19-7=12 and W_(19-7) is calculated
      | //        in the intrinsic call).
      | //      - Due to the W_(t-7) term, producing the Y_t's needs special shifting via the _mm_alignr_epi8 intrinsic and
      | //        adding the correct accumulator into another variable MTEMP.
      | //
      | // For rounds 16 - 51 we execute the following macro in a loop. For all the other rounds we
      | // use specific code.
      | //
      | // The loop invariant to be satisfied at the beginning of iteration i (corresponding to rounds
      | // (16+4*i) to (19+4*i) ) is the following:
      | //      _MSG_0 = ( W_(19 + 4*i), W_(18 + 4*i), W_(17 + 4*i), W_(16 + 4*i) )
      | //      _MSG_1 = ( X_(23 + 4*i), X_(22 + 4*i), X_(21 + 4*i), X_(20 + 4*i) )
      | //      _MSG_2 = ( X_(27 + 4*i), X_(26 + 4*i), X_(25 + 4*i), X_(24 + 4*i) )
      | //      _MSG_3 = ( W_(15 + 4*i), W_(14 + 4*i), W_(13 + 4*i), W_(12 + 4*i) )
      | //
      | #define SHANI_MESSAGE_SCHEDULE( _MSG_0, _MSG_1, _MSG_2, _MSG_3, _MTEMP ) \
     0|     _MTEMP = _mm_alignr_epi8( _MSG_0, _MSG_3, 4);       /* _MTEMP := ( W_(16 + 4*i), W_(15 + 4*i), W_(14 + 4*i), W_(13 + 4*i) ) */          \
     0|     _MSG_1 = _mm_add_epi32( _MSG_1, _MTEMP);            /* _MSG_1 := _MSG_1 + ( W_(16 + 4*i), W_(15 + 4*i), W_(14 + 4*i), W_(13 + 4*i) ) */ \
     0|     _MSG_1 = _mm_sha256msg2_epu32( _MSG_1, _MSG_0 );    /* _MSG_1 := ( W_(23 + 4*i), W_(22 + 4*i), W_(21 + 4*i), W_(20 + 4*i) ) */          \
     0|     _MSG_3 = _mm_sha256msg1_epu32( _MSG_3, _MSG_0 );    /* _MSG_3 := ( X_(31+4*i), X_(30+4*i), X_(29+4*i), X_(28+4*i) ) */
      |
      | // After each iteration the subsequent call rotates the accumulators so that the loop
      | // invariant is preserved (please verify!):
      | //          -- MSG_0 <---- MSG_1 <--- MSG_2 <--- MSG_3 <--
      | //          |                                            |
      | //          ----------------------------------------------
      |
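//
// In scalar form, the X/Y split above is just a regrouping of the standard
// message-schedule recurrence, using the LSIGMA macros defined earlier in
// this file:
//
//     W[t] = W[t-16] + LSIGMA0( W[t-15] )      // X_t, from _mm_sha256msg1_epu32
//          + W[t-7]  + LSIGMA1( W[t-2]  );     // Y_t; _mm_sha256msg2_epu32 adds the sigma_1 term
//
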
      | VOID
      | SYMCRYPT_CALL
      | SymCryptSha256AppendBlocks_shani(
      |     _Inout_                 SYMCRYPT_SHA256_CHAINING_STATE *    pChain,
      |     _In_reads_( cbData )    PCBYTE                              pbData,
      |                             SIZE_T                              cbData,
      |     _Out_                   SIZE_T                            * pcbRemaining )
     0| {
     0|     const __m128i BYTE_REVERSE_32 = _mm_set_epi8( 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 );
      |
      |     // Our chain state is in order A, B, ..., H.
      |     // First load our chaining state
     0|     __m128i DCBA = _mm_loadu_si128( (__m128i *)&(pChain->H[0]) );   // (D, C, B, A)
     0|     __m128i HGFE = _mm_loadu_si128( (__m128i *)&(pChain->H[4]) );   // (H, G, F, E)
     0|     __m128i FEBA = _mm_unpacklo_epi64( DCBA, HGFE );                // (F, E, B, A)
     0|     __m128i HGDC = _mm_unpackhi_epi64( DCBA, HGFE );                // (H, G, D, C)
     0|     __m128i ABEF = _mm_shuffle_epi32( FEBA, 0x1b );                 // (A, B, E, F)
     0|     __m128i CDGH = _mm_shuffle_epi32( HGDC, 0x1b );                 // (C, D, G, H)
      |
     0|     while( cbData >= 64 )
     0|     {
      |         // Save the current state for the feed-forward later
     0|         __m128i ABEF_start = ABEF;
     0|         __m128i CDGH_start = CDGH;
      |
      |         // Current message and temporary state
     0|         __m128i MSG;
      |
      |         // Accumulators
     0|         __m128i MSG_0;
     0|         __m128i MSG_1;
     0|         __m128i MSG_2;
     0|         __m128i MSG_3;
      |
      |         // Rounds 0-3
     0|         MSG = _mm_loadu_si128( (__m128i *)pbData );         // Reversed word - ( M_3, M_2, M_1, M_0 )
     0|         pbData += 16;
     0|         MSG = _mm_shuffle_epi8( MSG, BYTE_REVERSE_32 );     // Reverse each word
     0|         MSG_0 = MSG;                                        // MSG_0 := ( W_3 = M_3, W_2 = M_2, W_1 = M_1, W_0 = M_0 )
      |
     0|         SHANI_UPDATE_STATE( 0, MSG, ABEF, CDGH );
      |
      |         // Rounds 4-7
     0|         MSG = _mm_loadu_si128( (__m128i *)pbData );         // Reversed word - ( M_7, M_6, M_5, M_4 )
     0|         pbData += 16;
     0|         MSG = _mm_shuffle_epi8( MSG, BYTE_REVERSE_32 );     // Reverse each word
     0|         MSG_1 = MSG;                                        // MSG_1 := ( W_7 = M_7, W_6 = M_6, W_5 = M_5, W_4 = M_4 )
      |
     0|         SHANI_UPDATE_STATE( 4, MSG, ABEF, CDGH );
      |
     0|         MSG_0 = _mm_sha256msg1_epu32( MSG_0, MSG_1 );       // MSG_0 := ( X_19, X_18, X_17, X_16 ) =
      |                                                             // ( W_3 + \sigma_0(W_4), ..., W_0 + \sigma_0(W_1) )
      |
      |         // Rounds 8-11
     0|         MSG = _mm_loadu_si128( (__m128i *)pbData );         // Reversed word - ( M_11, M_10, M_9, M_8 )
     0|         pbData += 16;
     0|         MSG = _mm_shuffle_epi8( MSG, BYTE_REVERSE_32 );     // Reverse each word
     0|         MSG_2 = MSG;                                        // MSG_2 := ( W_11 = M_11, W_10 = M_10, W_9 = M_9, W_8 = M_8 )
      |
     0|         SHANI_UPDATE_STATE( 8, MSG, ABEF, CDGH );
      |
     0|         MSG_1 = _mm_sha256msg1_epu32( MSG_1, MSG_2 );       // MSG_1 := ( X_23, X_22, X_21, X_20 )
      |
      |         // Rounds 12-15
     0|         MSG = _mm_loadu_si128( (__m128i *)pbData );         // Reversed word - ( M_15, M_14, M_13, M_12 )
     0|         pbData += 16;
     0|         MSG = _mm_shuffle_epi8( MSG, BYTE_REVERSE_32 );     // Reverse each word
     0|         MSG_3 = MSG;                                        // MSG_3 := ( W_15 = M_15, W_14 = M_14, W_13 = M_13, W_12 = M_12 )
      |
     0|         SHANI_UPDATE_STATE( 12, MSG, ABEF, CDGH );
      |
     0|         MSG = _mm_alignr_epi8( MSG_3, MSG_2, 4);            // MSG := ( W_12, W_11, W_10, W_9 )
     0|         MSG_0 = _mm_add_epi32( MSG_0, MSG);                 // MSG_0 := MSG_0 + ( W_12, W_11, W_10, W_9 )
     0|         MSG_0 = _mm_sha256msg2_epu32( MSG_0, MSG_3 );       // MSG_0 := ( W_19, W_18, W_17, W_16 ) =
      |                                                             // ( X_19 + W_12 + \sigma_1(W_17), ..., X_16 + W_9 + \sigma_1(W_14) )
      |
     0|         MSG_2 = _mm_sha256msg1_epu32( MSG_2, MSG_3 );       // MSG_2 := ( X_27, X_26, X_25, X_24 )
      |
      |
      |         // Rounds 16 - 19
     0|         MSG = MSG_0;
     0|         SHANI_UPDATE_STATE( 16, MSG, ABEF, CDGH );
     0|         SHANI_MESSAGE_SCHEDULE( MSG_0, MSG_1, MSG_2, MSG_3, MSG );
      |
      |         // Rounds 20 - 23
     0|         MSG = MSG_1;
     0|         SHANI_UPDATE_STATE( 20, MSG, ABEF, CDGH );
     0|         SHANI_MESSAGE_SCHEDULE( MSG_1, MSG_2, MSG_3, MSG_0, MSG );
      |
      |         // Rounds 24 - 27
     0|         MSG = MSG_2;
     0|         SHANI_UPDATE_STATE( 24, MSG, ABEF, CDGH );
     0|         SHANI_MESSAGE_SCHEDULE( MSG_2, MSG_3, MSG_0, MSG_1, MSG );
      |
      |         // Rounds 28 - 31
     0|         MSG = MSG_3;
     0|         SHANI_UPDATE_STATE( 28, MSG, ABEF, CDGH );
     0|         SHANI_MESSAGE_SCHEDULE( MSG_3, MSG_0, MSG_1, MSG_2, MSG );
      |
      |         // Rounds 32 - 35
     0|         MSG = MSG_0;
     0|         SHANI_UPDATE_STATE( 32, MSG, ABEF, CDGH );
     0|         SHANI_MESSAGE_SCHEDULE( MSG_0, MSG_1, MSG_2, MSG_3, MSG );
      |
      |         // Rounds 36 - 39
     0|         MSG = MSG_1;
     0|         SHANI_UPDATE_STATE( 36, MSG, ABEF, CDGH );
     0|         SHANI_MESSAGE_SCHEDULE( MSG_1, MSG_2, MSG_3, MSG_0, MSG );
      |
      |         // Rounds 40 - 43
     0|         MSG = MSG_2;
     0|         SHANI_UPDATE_STATE( 40, MSG, ABEF, CDGH );
     0|         SHANI_MESSAGE_SCHEDULE( MSG_2, MSG_3, MSG_0, MSG_1, MSG );
      |
      |         // Rounds 44 - 47
     0|         MSG = MSG_3;
     0|         SHANI_UPDATE_STATE( 44, MSG, ABEF, CDGH );
     0|         SHANI_MESSAGE_SCHEDULE( MSG_3, MSG_0, MSG_1, MSG_2, MSG );
      |
      |         // Rounds 48 - 51
     0|         MSG = MSG_0;
     0|         SHANI_UPDATE_STATE( 48, MSG, ABEF, CDGH );
     0|         SHANI_MESSAGE_SCHEDULE( MSG_0, MSG_1, MSG_2, MSG_3, MSG );
      |
      |         // Rounds 52 - 55
     0|         MSG = MSG_1;                                                    // ( W_55, W_54, W_53, W_52 )
     0|         SHANI_UPDATE_STATE( 52, MSG, ABEF, CDGH );
      |
     0|         MSG = _mm_alignr_epi8( MSG_1, MSG_0, 4);                        // MSG := ( W_52, W_51, W_50, W_49 )
     0|         MSG_2 = _mm_add_epi32( MSG_2, MSG);                             // MSG_2 := MSG_2 + ( W_52, W_51, W_50, W_49 )
     0|         MSG_2 = _mm_sha256msg2_epu32( MSG_2, MSG_1 );                   // Calculate ( W_59, W_58, W_57, W_56 )
      |
      |         // Rounds 56 - 59
     0|         MSG = MSG_2;                                                    // ( W_59, W_58, W_57, W_56 )
     0|         SHANI_UPDATE_STATE( 56, MSG, ABEF, CDGH );
      |
     0|         MSG = _mm_alignr_epi8( MSG_2, MSG_1, 4);                        // MSG := ( W_56, W_55, W_54, W_53 )
     0|         MSG_3 = _mm_add_epi32( MSG_3, MSG);                             // MSG_3 := MSG_3 + ( W_56, W_55, W_54, W_53 )
     0|         MSG_3 = _mm_sha256msg2_epu32( MSG_3, MSG_2 );                   // Calculate ( W_63, W_62, W_61, W_60 )
      |
      |         // Rounds 60 - 63
     0|         SHANI_UPDATE_STATE( 60, MSG_3, ABEF, CDGH );
      |
      |         // Add the feed-forward
     0|         ABEF = _mm_add_epi32( ABEF, ABEF_start );
     0|         CDGH = _mm_add_epi32( CDGH, CDGH_start );
      |
     0|         cbData -= 64;
     0|     }
      |
      |     // Unpack the state registers and store them in the state
     0|     FEBA = _mm_shuffle_epi32( ABEF, 0x1b );
     0|     HGDC = _mm_shuffle_epi32( CDGH, 0x1b );
     0|     DCBA = _mm_unpacklo_epi64( FEBA, HGDC );                        // (D, C, B, A)
     0|     HGFE = _mm_unpackhi_epi64( FEBA, HGDC );                        // (H, G, F, E)
     0|     _mm_storeu_si128 ( (__m128i *)&(pChain->H[0]), DCBA);           // (D, C, B, A)
     0|     _mm_storeu_si128 ( (__m128i *)&(pChain->H[4]), HGFE);           // (H, G, F, E)
      |
     0|     *pcbRemaining = cbData;
     0| }
      |
      | #undef SHANI_UPDATE_STATE
      | #undef SHANI_MESSAGE_SCHEDULE
      |
      | #endif  // SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
      |
      | #if SYMCRYPT_CPU_ARM64
      | /*
      | ARM64 has special SHA-256 instructions.
      |
      | SHA256H and SHA256H2 implement 4 rounds of SHA-256. The inputs are two registers containing the 256-bit state,
      | and one register containing 128 bits of expanded message plus the round constants.
      | These instructions perform the same computation, but SHA256H returns the first half of the 256-bit result,
      | and SHA256H2 returns the second half of the 256-bit result.
      |
      | SHA256H( ABCD, EFGH, W )
      | where the least significant word of the ABCD vector is A. The W vector contains W_i + K_i for the four rounds being computed.
      |
      | SHA256SU0 is the first message schedule update function.
      | It takes 2 inputs and produces 1 output.
      | We describe the vectors for i=0,1,2,3:
      | Inputs:  [W_{t-16+i}], [W_{t-12+i}]
      | Output: [Sigma0(W_{t-15+i}) + W_{t-16+i}]
      |
      | SHA256SU1 is the second message schedule update function.
      | It takes 3 inputs and produces 1 output.
      | Input 1: output of SHA256SU0: [Sigma0(W_{t-15+i}) + W_{t-16+i}]
      | Input 2: [W_{t-8+i}]
      | Input 3: [W_{t-4+i}]
      | Output: [W_{t+i}]
      |
      | */
1456
1457
#define vldq(_p)     (*(__n128 *)(_p))
1458
#define vstq(_p, _v) (*(__n128 *)(_p) = (_v) )
1459
1460
VOID
SYMCRYPT_CALL
SymCryptSha256AppendBlocks_instr(
    _Inout_                 SYMCRYPT_SHA256_CHAINING_STATE *    pChain,
    _In_reads_( cbData )    PCBYTE                              pbData,
                            SIZE_T                              cbData,
    _Out_                   SIZE_T                            * pcbRemaining )
{
    //
    // Armv8 has 32 Neon registers, so we can use a lot of variables:
    // 16 for the constants, 4 for the message, 2 for the current state, 2 for the starting state,
    // total = 24, which leaves enough for some temp values.
    //
    __n128      ABCD, ABCDstart;
    __n128      EFGH, EFGHstart;
    __n128      W0, W1, W2, W3;
    __n128      K0, K1, K2, K3, K4, K5, K6, K7, K8, K9, K10, K11, K12, K13, K14, K15;

    __n128      Wr;
    __n128      t;

    ABCD = ABCDstart = vldq( &pChain->H[0] );
    EFGH = EFGHstart = vldq( &pChain->H[4] );

    K0  = vldq( &SymCryptSha256K[ 4 *  0 ] );
    K1  = vldq( &SymCryptSha256K[ 4 *  1 ] );
    K2  = vldq( &SymCryptSha256K[ 4 *  2 ] );
    K3  = vldq( &SymCryptSha256K[ 4 *  3 ] );
    K4  = vldq( &SymCryptSha256K[ 4 *  4 ] );
    K5  = vldq( &SymCryptSha256K[ 4 *  5 ] );
    K6  = vldq( &SymCryptSha256K[ 4 *  6 ] );
    K7  = vldq( &SymCryptSha256K[ 4 *  7 ] );
    K8  = vldq( &SymCryptSha256K[ 4 *  8 ] );
    K9  = vldq( &SymCryptSha256K[ 4 *  9 ] );
    K10 = vldq( &SymCryptSha256K[ 4 * 10 ] );
    K11 = vldq( &SymCryptSha256K[ 4 * 11 ] );
    K12 = vldq( &SymCryptSha256K[ 4 * 12 ] );
    K13 = vldq( &SymCryptSha256K[ 4 * 13 ] );
    K14 = vldq( &SymCryptSha256K[ 4 * 14 ] );
    K15 = vldq( &SymCryptSha256K[ 4 * 15 ] );

    while( cbData >= 64 )
    {
        W0 = vrev32q_u8( vldq( &pbData[ 0] ) );
        W1 = vrev32q_u8( vldq( &pbData[16] ) );
        W2 = vrev32q_u8( vldq( &pbData[32] ) );
        W3 = vrev32q_u8( vldq( &pbData[48] ) );

        //
        // The sha256h/sha256h2 instructions overwrite one of the two state input registers,
        // so we have to keep a copy of one of the input states.
        //
#define ROUNDOP {\
    t = ABCD;\
    ABCD = vsha256hq_u32 ( ABCD, EFGH, Wr );\
    EFGH = vsha256h2q_u32( EFGH, t, Wr );\
    }

        Wr = vaddq_u32( W0, K0 );
        ROUNDOP;
        Wr = vaddq_u32( W1, K1 );
        ROUNDOP;
        Wr = vaddq_u32( W2, K2 );
        ROUNDOP;
        Wr = vaddq_u32( W3, K3 );
        ROUNDOP;

        t = vsha256su0q_u32( W0, W1 );
        W0 = vsha256su1q_u32( t, W2, W3 );
        Wr = vaddq_u32( W0, K4 );
        ROUNDOP;

        t = vsha256su0q_u32( W1, W2 );
        W1 = vsha256su1q_u32( t, W3, W0 );
        Wr = vaddq_u32( W1, K5 );
        ROUNDOP;

        t = vsha256su0q_u32( W2, W3 );
        W2 = vsha256su1q_u32( t, W0, W1 );
        Wr = vaddq_u32( W2, K6 );
        ROUNDOP;

        t = vsha256su0q_u32( W3, W0 );
        W3 = vsha256su1q_u32( t, W1, W2 );
        Wr = vaddq_u32( W3, K7 );
        ROUNDOP;

        t = vsha256su0q_u32( W0, W1 );
        W0 = vsha256su1q_u32( t, W2, W3 );
        Wr = vaddq_u32( W0, K8 );
        ROUNDOP;

        t = vsha256su0q_u32( W1, W2 );
        W1 = vsha256su1q_u32( t, W3, W0 );
        Wr = vaddq_u32( W1, K9 );
        ROUNDOP;

        t = vsha256su0q_u32( W2, W3 );
        W2 = vsha256su1q_u32( t, W0, W1 );
        Wr = vaddq_u32( W2, K10 );
        ROUNDOP;

        t = vsha256su0q_u32( W3, W0 );
        W3 = vsha256su1q_u32( t, W1, W2 );
        Wr = vaddq_u32( W3, K11 );
        ROUNDOP;

        t = vsha256su0q_u32( W0, W1 );
        W0 = vsha256su1q_u32( t, W2, W3 );
        Wr = vaddq_u32( W0, K12 );
        ROUNDOP;

        t = vsha256su0q_u32( W1, W2 );
        W1 = vsha256su1q_u32( t, W3, W0 );
        Wr = vaddq_u32( W1, K13 );
        ROUNDOP;

        t = vsha256su0q_u32( W2, W3 );
        W2 = vsha256su1q_u32( t, W0, W1 );
        Wr = vaddq_u32( W2, K14 );
        ROUNDOP;

        t = vsha256su0q_u32( W3, W0 );
        W3 = vsha256su1q_u32( t, W1, W2 );
        Wr = vaddq_u32( W3, K15 );
        ROUNDOP;

        // Add the feed-forward and set up for the next block
        ABCDstart = ABCD = vaddq_u32( ABCDstart, ABCD );
        EFGHstart = EFGH = vaddq_u32( EFGHstart, EFGH );

        pbData += 64;
        cbData -= 64;
#undef ROUNDOP
    }

    *pcbRemaining = cbData;
    vstq( &pChain->H[0], ABCD );
    vstq( &pChain->H[4], EFGH );

    //
    // All our local variables should be in registers, so there is no way to wipe them.
    //
}

#endif  // SYMCRYPT_CPU_ARM64


//
// Easy switch between different implementations
//
//FORCEINLINE
VOID
SYMCRYPT_CALL
SymCryptSha256AppendBlocks(
    _Inout_                 SYMCRYPT_SHA256_CHAINING_STATE* pChain,
    _In_reads_(cbData)      PCBYTE                          pbData,
                            SIZE_T                          cbData,
    _Out_                   SIZE_T*                         pcbRemaining)
{
#if SYMCRYPT_CPU_AMD64

    SYMCRYPT_EXTENDED_SAVE_DATA SaveData;

    if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURES_FOR_SHANI_CODE) &&
        SymCryptSaveXmm(&SaveData) == SYMCRYPT_NO_ERROR)
    {
        SymCryptSha256AppendBlocks_shani(pChain, pbData, cbData, pcbRemaining);

        SymCryptRestoreXmm(&SaveData);
    }
    // Temporarily disabling use of Ymm in SHA2
    // else if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURE_AVX2 | SYMCRYPT_CPU_FEATURE_BMI2) &&
    //         SymCryptSaveYmm(&SaveData) == SYMCRYPT_NO_ERROR)
    // {
    //     //SymCryptSha256AppendBlocks_ul1(pChain, pbData, cbData, pcbRemaining);
    //     //SymCryptSha256AppendBlocks_ymm_8blocks(pChain, pbData, cbData, pcbRemaining);
    //     SymCryptSha256AppendBlocks_ymm_avx2_asm(pChain, pbData, cbData, pcbRemaining);

    //     SymCryptRestoreYmm(&SaveData);
    // }
    else if (SYMCRYPT_CPU_FEATURES_PRESENT(SYMCRYPT_CPU_FEATURE_SSSE3 | SYMCRYPT_CPU_FEATURE_BMI2) &&
            SymCryptSaveXmm(&SaveData) == SYMCRYPT_NO_ERROR)
    {
        //SymCryptSha256AppendBlocks_xmm_4blocks(pChain, pbData, cbData, pcbRemaining);
        SymCryptSha256AppendBlocks_xmm_ssse3_asm(pChain, pbData, cbData, pcbRemaining);

        SymCryptRestoreXmm(&SaveData);
    }
    else
    {
        SymCryptSha256AppendBlocks_ul1( pChain, pbData, cbData, pcbRemaining );
        //SymCryptSha256AppendBlocks_ul2(pChain, pbData, cbData, pcbRemaining);
    }
#elif SYMCRYPT_CPU_X86
    SYMCRYPT_EXTENDED_SAVE_DATA  SaveData;

    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_SHANI_CODE | SYMCRYPT_CPU_FEATURE_SAVEXMM_NOFAIL ) &&
        SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR )
    {
        SymCryptSha256AppendBlocks_shani( pChain, pbData, cbData, pcbRemaining );
        SymCryptRestoreXmm( &SaveData );
    }
    else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSSE3 | SYMCRYPT_CPU_FEATURE_BMI2 ) &&
        SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR )
    {
        SymCryptSha256AppendBlocks_xmm_4blocks( pChain, pbData, cbData, pcbRemaining );
        SymCryptRestoreXmm( &SaveData );
    }
    else
    {
        SymCryptSha256AppendBlocks_ul1( pChain, pbData, cbData, pcbRemaining );
    }
#elif SYMCRYPT_CPU_ARM64
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_SHA256 ) )
    {
        SymCryptSha256AppendBlocks_instr( pChain, pbData, cbData, pcbRemaining );
    }
    else
    {
        SymCryptSha256AppendBlocks_ul1( pChain, pbData, cbData, pcbRemaining );
    }
#else
    SymCryptSha256AppendBlocks_ul1( pChain, pbData, cbData, pcbRemaining );
#endif

    //SymCryptSha256AppendBlocks_ul2( pChain, pbData, cbData, pcbRemaining );
    //SymCryptSha256AppendBlocks_xmm1( pChain, pbData, cbData, pcbRemaining );  // !!! Needs Save/restore logic
    //SymCryptSha256AppendBlocks_xmm2( pChain, pbData, cbData, pcbRemaining );
}
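
//
// Illustrative usage sketch, not part of the library: how the dispatcher
// above is ultimately exercised through the public incremental hashing API.
// pbResult must point to SYMCRYPT_SHA256_RESULT_SIZE (32) bytes.
//
static void
SymCryptSha256UsageSketch( PCBYTE pbData, SIZE_T cbData, PBYTE pbResult )
{
    SYMCRYPT_SHA256_STATE state;

    SymCryptSha256Init( &state );                       // load the FIPS 180-2 initial state
    SymCryptSha256Append( &state, pbData, cbData );     // whole blocks flow into SymCryptSha256AppendBlocks
    SymCryptSha256Result( &state, pbResult );           // pad, process the final block(s), output 32 bytes
}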