/src/SymCrypt/lib/sha256-xmm.c
#include "precomp.h"

#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64


extern SYMCRYPT_ALIGN_AT(256) const UINT32 SymCryptSha256K[64];


// Endianness transformation for 4 32-bit values in an XMM register
const SYMCRYPT_ALIGN_AT(16) UINT32 BYTE_REVERSE_32[4] = {
    0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
};
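
// Within each 32-bit lane, this mask maps destination byte i to source byte
// (3 - i). For example, a message word stored big-endian in memory as the
// bytes 01 02 03 04 loads as 0x04030201 on a little-endian CPU; after
// _mm_shuffle_epi8 with this mask it becomes 0x01020304, the value the
// SHA-256 round functions operate on.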

// Shuffle 32-bit words in an XMM register: W3 W2 W1 W0 -> 0 0 W2 W0
// Used by the SSSE3 assembly implementation
const SYMCRYPT_ALIGN_AT(16) UINT32 XMM_PACKLOW[4] = {
    0x03020100, 0x0b0a0908, 0x80808080, 0x80808080,
};

// Shuffle 32-bit words in an XMM register: W3 W2 W1 W0 -> W2 W0 0 0
// Used by the SSSE3 assembly implementation
const SYMCRYPT_ALIGN_AT(16) UINT32 XMM_PACKHIGH[4] = {
    0x80808080, 0x80808080, 0x03020100, 0x0b0a0908,
};
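
// Note: PSHUFB zeroes any destination byte whose mask byte has its high bit
// set, which is what the 0x80 entries do here. XMM_PACKLOW keeps words 0 and 2
// in the low half of the register, XMM_PACKHIGH keeps the same two words in
// the high half, and the remaining lanes are cleared.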


#if SYMCRYPT_MS_VC
#define RORX_U32    _rorx_u32
#define RORX_U64    _rorx_u64
#else
// TODO: implement _rorx functions for clang
#define RORX_U32    ROR32
#define RORX_U64    ROR64
#endif // SYMCRYPT_MS_VC
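
// _rorx_u32/_rorx_u64 compile to the BMI2 RORX instruction, which rotates
// without reading or writing flags and has a separate destination register,
// which helps instruction scheduling in the tight round functions below.
// The fallback path uses ordinary rotates.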


//
// For documentation on these functions, see FIPS 180-2.
//
// MAJ and CH are the functions Maj and Ch from the standard.
// CSIGMA0 and CSIGMA1 are the capital sigma functions.
// LSIGMA0 and LSIGMA1 are the lowercase sigma functions.
//
// The canonical definitions of the MAJ and CH functions are:
//#define MAJ( x, y, z )    (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
//#define CH( x, y, z )     (((x) & (y)) ^ ((~(x)) & (z)))
// We use the optimized versions defined below.
//
#define MAJ( x, y, z )  ((((z) | (y)) & (x)) | ((z) & (y)))
#define CH( x, y, z )   ((((z) ^ (y)) & (x)) ^ (z))
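
// Both optimized forms are bitwise-equivalent to the canonical ones:
// - MAJ: (((z)|(y))&(x)) | ((z)&(y)) is 1 exactly when at least two of
//   x, y, z are 1, i.e. the majority function.
// - CH: (((z)^(y))&(x)) ^ (z) selects y where x is 1 and z where x is 0
//   (x=1 gives z^y^z = y; x=0 gives z), using one AND and two XORs instead
//   of two ANDs, a NOT, and an XOR.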

#define LSIGMA0( x )    (ROR32((x), 7) ^ ROR32((x), 18) ^ ((x)>> 3))
#define LSIGMA1( x )    (ROR32((x), 17) ^ ROR32((x), 19) ^ ((x)>>10))

#define CSIGMA0( x )    (RORX_U32((x), 2) ^ RORX_U32((x), 13) ^ RORX_U32((x), 22))
#define CSIGMA1( x )    (RORX_U32((x), 6) ^ RORX_U32((x), 11) ^ RORX_U32((x), 25))


#define LSIGMA0XMM( x ) \
    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
        _mm_slli_epi32(x,25)  , _mm_srli_epi32(x,  7) ),\
        _mm_slli_epi32(x,14) ), _mm_srli_epi32(x, 18) ),\
        _mm_srli_epi32(x,  3) )
#define LSIGMA1XMM( x ) \
    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
        _mm_slli_epi32(x,15)  , _mm_srli_epi32(x, 17) ),\
        _mm_slli_epi32(x,13) ), _mm_srli_epi32(x, 19) ),\
        _mm_srli_epi32(x,10) )
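
// SSE2 has no 32-bit rotate instruction, so each rotation is emulated with a
// shift pair: ROR32(x,n) == (x << (32-n)) ^ (x >> n). LSIGMA0XMM thus computes
// ROR(x,7) ^ ROR(x,18) ^ (x >> 3) in all four lanes at once, matching the
// scalar LSIGMA0 above; LSIGMA1XMM likewise matches LSIGMA1.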



// Initial loading of message words and endianness transformation.
// bl : The number of blocks to load, 1 <= bl <= 4.
//
// When bl < 4, the high-order lanes of the XMM registers corresponding to the missing blocks are unused.
//
#define SHA256_MSG_LOAD_4BLOCKS(bl) { \
        for (SIZE_T i = 0; i < (bl); i++) \
        { \
            Wx.xmm[i +  0] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE +  0]), kBYTE_REVERSE_32); \
            Wx.xmm[i +  4] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE + 16]), kBYTE_REVERSE_32); \
            Wx.xmm[i +  8] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE + 32]), kBYTE_REVERSE_32); \
            Wx.xmm[i + 12] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE + 48]), kBYTE_REVERSE_32); \
        } \
    }
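
// After loading, Wx.xmm[i + 4*q] holds words 4q..4q+3 of block i, already
// byte-reversed. For the full 4-block case:
//   Wx.xmm[ 0.. 3] : words  0.. 3 of blocks 0..3
//   Wx.xmm[ 4.. 7] : words  4.. 7 of blocks 0..3
//   Wx.xmm[ 8..11] : words  8..11 of blocks 0..3
//   Wx.xmm[12..15] : words 12..15 of blocks 0..3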

// Shuffles the initially loaded message words from multiple blocks
// so that each XMM register contains message words with the same index
// within a block (e.g. Wx.xmm[0] contains the first words of each block).
//
// This macro is used four times to transform the 64-byte message blocks;
// ind=0 processes the first quarter (16 bytes), ind=1 the second quarter, and so on.
//
#define SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(ind) { \
        __m128i t1, t2, t3, t4; \
        t1 = _mm_unpacklo_epi32(Wx.xmm[4 * (ind) + 0], Wx.xmm[4 * (ind) + 1]); \
        t2 = _mm_unpacklo_epi32(Wx.xmm[4 * (ind) + 2], Wx.xmm[4 * (ind) + 3]); \
        t3 = _mm_unpackhi_epi32(Wx.xmm[4 * (ind) + 0], Wx.xmm[4 * (ind) + 1]); \
        t4 = _mm_unpackhi_epi32(Wx.xmm[4 * (ind) + 2], Wx.xmm[4 * (ind) + 3]); \
        Wx.xmm[4 * (ind) + 0] = _mm_unpacklo_epi64(t1, t2); \
        Wx.xmm[4 * (ind) + 1] = _mm_unpackhi_epi64(t1, t2); \
        Wx.xmm[4 * (ind) + 2] = _mm_unpacklo_epi64(t3, t4); \
        Wx.xmm[4 * (ind) + 3] = _mm_unpackhi_epi64(t3, t4); \
    }
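
// This is the standard 4x4 transpose of 32-bit words via unpack operations.
// With input rows A, B, C, D (one register per block, lanes listed high to
// low as in the shuffle comments above):
//   t1 = [ B1 A1 B0 A0 ]    t2 = [ D1 C1 D0 C0 ]
//   t3 = [ B3 A3 B2 A2 ]    t4 = [ D3 C3 D2 C2 ]
// and the 64-bit unpacks then yield
//   [ D0 C0 B0 A0 ], [ D1 C1 B1 A1 ], [ D2 C2 B2 A2 ], [ D3 C3 B3 A3 ].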

#define SHA256_MSG_TRANSPOSE_4BLOCKS() { \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(0); \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(1); \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(2); \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(3); \
    }

// One round of message schedule: computes the message word for round r ( 16 <= r < 64 ).
// Also adds the round constant for round (r-16) to the corresponding message word.
#define SHA256_MSG_EXPAND_4BLOCKS_1ROUND(r) { \
        Wx.xmm[r] = _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(Wx.xmm[r - 16], Wx.xmm[r - 7]), \
                        LSIGMA0XMM(Wx.xmm[r - 15])), LSIGMA1XMM(Wx.xmm[r - 2])); \
        Wx.xmm[r - 16] = _mm_add_epi32(Wx.xmm[r - 16], _mm_set1_epi32(SymCryptSha256K[r - 16])); \
    }
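
// This evaluates the SHA-256 message schedule recurrence
//   W[r] = LSIGMA1(W[r-2]) + W[r-7] + LSIGMA0(W[r-15]) + W[r-16]
// for four blocks at once, one block per 32-bit lane. Folding K[r-16] into
// W[r-16] here means each round of CROUND_4BLOCKS adds one precombined value
// instead of a separate message word and round constant.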

// Four rounds of message schedule: generates message words for rounds r, r+1, r+2, r+3.
#define SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS(r) { \
        SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 0); SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 1); \
        SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 2); SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 3); \
    }
// Sixteen rounds of message schedule: generates message words for rounds r, ..., r+15.
#define SHA256_MSG_EXPAND_4BLOCKS_16ROUNDS(r) { \
        SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 0); SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 4); \
        SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 8); SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 12); \
    }

// Core round function using message words from the Wx array.
// Wx contains interleaved expanded message words from multiple blocks,
// i.e. the message words for round r for each block are followed by the
// message words for round (r+1) for each block.
//
// r16 : round number mod 16
// rb  : base round number, so that (rb+r16) gives the actual round number
// b   : message block index, b = 0..3
#define CROUND_4BLOCKS(r16, rb, b) { \
        Wt = Wx.ul4[(rb)+(r16)][b]; \
        ah[ r16   &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + Wt;\
        ah[(r16+4)&7] += ah[r16 &7];\
        ah[ r16   &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\
    }
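
// The ah[] indexing implements the a..h variable rotation without moving
// data: for round r, working variable h lives in ah[r & 7] and a in
// ah[(r+7) & 7]. Each round overwrites two slots in place (the h slot becomes
// the new a, the d slot becomes the new e), so the rotation happens by
// advancing indices rather than shuffling eight variables. The mapping
// repeats every 8 rounds, which is why the loops below unroll 8 or 16 rounds
// at a time.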

//
// Core round function
//
// r16 : round number mod 16
// r   : round number, r = 0..63
//
#define CROUND( r16, r ) { \
        ah[ r16   &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + SymCryptSha256K[r] + Wt;\
        ah[(r16+4)&7] += ah[r16 &7];\
        ah[ r16   &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\
    }

//
// Initial round that reads the message.
// r is the round number, 0..15.
//
#define IROUND( r ) { \
        Wt = SYMCRYPT_LOAD_MSBFIRST32( &pbData[ 4*r ] );\
        Wx.ul[r] = Wt; \
        CROUND(r,r);\
    }

//
// Subsequent rounds.
// r16 is the round number mod 16; rb is the round number minus r16.
//
#define FROUND(r16, rb) { \
        Wt = LSIGMA1( Wx.ul[(r16-2) & 15] ) + Wx.ul[(r16-7) & 15] + \
             LSIGMA0( Wx.ul[(r16-15) & 15] ) + Wx.ul[r16 & 15]; \
        Wx.ul[r16] = Wt; \
        CROUND( r16, r16+rb ); \
    }
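
// Only a sliding window of 16 message words is needed at any time, so the
// one-block path keeps Wx.ul[] as a 16-entry circular buffer: IROUND fills it
// with the words for rounds 0..15, and FROUND overwrites the slot of round
// r-16 with the word for round r, using the same recurrence as the 4-block
// message expansion above.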



VOID
SYMCRYPT_CALL
SymCryptSha256AppendBlocks_xmm_4blocks(
    _Inout_             SYMCRYPT_SHA256_CHAINING_STATE* pChain,
    _In_reads_(cbData)  PCBYTE                          pbData,
                        SIZE_T                          cbData,
    _Out_               SIZE_T*                         pcbRemaining)
{

    SYMCRYPT_ALIGN union { UINT32 ul[16]; UINT32 ul4[64][4]; __m128i xmm[64]; } Wx;
    SYMCRYPT_ALIGN UINT32 ah[8];
    UINT32 Wt;
    SIZE_T uWipeSize = (cbData >= (3 * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE)) ? (64 * 4 * sizeof(UINT32)) : (16 * sizeof(UINT32));
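    // If the multi-block path below runs, all 64*4 expanded message words in
    // Wx are written and must be wiped on exit; otherwise only the 16-word
    // circular buffer of the one-block path is used. The size is recorded
    // here, before cbData is consumed.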

    const __m128i kBYTE_REVERSE_32 = _mm_load_si128((const __m128i*)BYTE_REVERSE_32);

    while (cbData >= (3 * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE))
    {
        // If we have 4 or more blocks then process 4, else process whatever is left.
        SIZE_T numBlocks = (cbData >= 4 * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE) ? 4 : (cbData / SYMCRYPT_SHA256_INPUT_BLOCK_SIZE);
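        // Note: the vectorized path is only entered with at least 3 blocks of
        // data; for 1 or 2 blocks the scalar loop below is used, presumably
        // because the transpose and interleaved expansion do not pay off for
        // fewer blocks.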

        SHA256_MSG_LOAD_4BLOCKS(numBlocks);
        SHA256_MSG_TRANSPOSE_4BLOCKS();

        for (int j = 16; j < 64; j += 16)
        {
            SHA256_MSG_EXPAND_4BLOCKS_16ROUNDS(j);
        }

        // Round constants for rounds 0..47 were added during message expansion;
        // add the remaining ones here.
        for (int i = 48; i < 64; i++)
        {
            Wx.xmm[i] = _mm_add_epi32(Wx.xmm[i], _mm_set1_epi32(SymCryptSha256K[i]));
        }

        for (SIZE_T bl = 0; bl < numBlocks; bl++)
        {
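            // The chaining state is loaded in reverse order (ah[7] = a, ...,
            // ah[0] = h) so that round 0 finds h in ah[0], matching the
            // rotating-index scheme described at CROUND_4BLOCKS.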
            ah[7] = pChain->H[0];
            ah[6] = pChain->H[1];
            ah[5] = pChain->H[2];
            ah[4] = pChain->H[3];
            ah[3] = pChain->H[4];
            ah[2] = pChain->H[5];
            ah[1] = pChain->H[6];
            ah[0] = pChain->H[7];

            for (int iterCount = 0; iterCount < (64/8); iterCount++)
            {
                const int roundBase = iterCount*8;
                CROUND_4BLOCKS( 0, roundBase, bl);
                CROUND_4BLOCKS( 1, roundBase, bl);
                CROUND_4BLOCKS( 2, roundBase, bl);
                CROUND_4BLOCKS( 3, roundBase, bl);
                CROUND_4BLOCKS( 4, roundBase, bl);
                CROUND_4BLOCKS( 5, roundBase, bl);
                CROUND_4BLOCKS( 6, roundBase, bl);
                CROUND_4BLOCKS( 7, roundBase, bl);
                //CROUND_4BLOCKS( 8, roundBase, bl);
                //CROUND_4BLOCKS( 9, roundBase, bl);
                //CROUND_4BLOCKS(10, roundBase, bl);
                //CROUND_4BLOCKS(11, roundBase, bl);
                //CROUND_4BLOCKS(12, roundBase, bl);
                //CROUND_4BLOCKS(13, roundBase, bl);
                //CROUND_4BLOCKS(14, roundBase, bl);
                //CROUND_4BLOCKS(15, roundBase, bl);
            }
|
248 | 0 | pChain->H[0] = ah[7] + pChain->H[0]; |
249 | 0 | pChain->H[1] = ah[6] + pChain->H[1]; |
250 | 0 | pChain->H[2] = ah[5] + pChain->H[2]; |
251 | 0 | pChain->H[3] = ah[4] + pChain->H[3]; |
252 | 0 | pChain->H[4] = ah[3] + pChain->H[4]; |
253 | 0 | pChain->H[5] = ah[2] + pChain->H[5]; |
254 | 0 | pChain->H[6] = ah[1] + pChain->H[6]; |
255 | 0 | pChain->H[7] = ah[0] + pChain->H[7]; |
256 | 0 | } |
257 | |
|
258 | 0 | pbData += (numBlocks * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE); |
259 | 0 | cbData -= (numBlocks * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE); |
260 | 0 | } |


    while (cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE)
    {
        ah[7] = pChain->H[0];
        ah[6] = pChain->H[1];
        ah[5] = pChain->H[2];
        ah[4] = pChain->H[3];
        ah[3] = pChain->H[4];
        ah[2] = pChain->H[5];
        ah[1] = pChain->H[6];
        ah[0] = pChain->H[7];

        //
        // initial rounds 0 to 15
        //

        IROUND(0);
        IROUND(1);
        IROUND(2);
        IROUND(3);
        IROUND(4);
        IROUND(5);
        IROUND(6);
        IROUND(7);
        IROUND(8);
        IROUND(9);
        IROUND(10);
        IROUND(11);
        IROUND(12);
        IROUND(13);
        IROUND(14);
        IROUND(15);


        //
        // rounds 16 to 63
        //
        for (int iterCount = 1; iterCount < (64/16); iterCount++)
        {
            const int roundBase = iterCount*16;
            FROUND(0, roundBase);
            FROUND(1, roundBase);
            FROUND(2, roundBase);
            FROUND(3, roundBase);
            FROUND(4, roundBase);
            FROUND(5, roundBase);
            FROUND(6, roundBase);
            FROUND(7, roundBase);
            FROUND(8, roundBase);
            FROUND(9, roundBase);
            FROUND(10, roundBase);
            FROUND(11, roundBase);
            FROUND(12, roundBase);
            FROUND(13, roundBase);
            FROUND(14, roundBase);
            FROUND(15, roundBase);
        }

        pChain->H[0] = ah[7] + pChain->H[0];
        pChain->H[1] = ah[6] + pChain->H[1];
        pChain->H[2] = ah[5] + pChain->H[2];
        pChain->H[3] = ah[4] + pChain->H[3];
        pChain->H[4] = ah[3] + pChain->H[4];
        pChain->H[5] = ah[2] + pChain->H[5];
        pChain->H[6] = ah[1] + pChain->H[6];
        pChain->H[7] = ah[0] + pChain->H[7];

        pbData += SYMCRYPT_SHA256_INPUT_BLOCK_SIZE;
        cbData -= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE;
    }
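
    // cbData is now less than one block; report the leftover byte count back
    // to the caller.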
    *pcbRemaining = cbData;

    //
    // Wipe the variables.
    //
    SymCryptWipe(&Wx, uWipeSize);
    SymCryptWipeKnownSize(ah, sizeof(ah));
}

#endif // SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64