Coverage Report

Created: 2024-11-21 07:03

/src/SymCrypt/lib/sha256-xmm.c
Every executable line in this file has an execution count of 0; the file was not exercised.
#include "precomp.h"

#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64

extern SYMCRYPT_ALIGN_AT(256) const UINT32 SymCryptSha256K[64];

// Endianness transformation for 4 32-bit values in an XMM register
const SYMCRYPT_ALIGN_AT(16) UINT32 BYTE_REVERSE_32[4] = {
    0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
};
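
// Editor's sketch (compile separately; not part of this file) showing the
// effect of the BYTE_REVERSE_32 mask: _mm_shuffle_epi8 with this control
// vector byte-swaps each 32-bit lane, converting big-endian message words
// to host order. Requires an SSSE3-capable x86/x64 compiler.
#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>   // _mm_shuffle_epi8 (SSSE3)

int main(void)
{
    const uint32_t mask[4] = { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
    const uint32_t in[4]   = { 0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 };
    uint32_t out[4];

    __m128i v = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)in),
                                 _mm_loadu_si128((const __m128i*)mask));
    _mm_storeu_si128((__m128i*)out, v);

    // Expect each lane byte-reversed: 44332211 88776655 ccbbaa99 00ffeedd
    printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
    return 0;
}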

// Shuffle 32-bit words in an XMM register: W3 W2 W1 W0 -> 0 0 W2 W0
// Used by the SSSE3 assembly implementation
const SYMCRYPT_ALIGN_AT(16) UINT32 XMM_PACKLOW[4] = {
    0x03020100, 0x0b0a0908, 0x80808080, 0x80808080,
};

// Shuffle 32-bit words in an XMM register: W3 W2 W1 W0 -> W2 W0 0 0
// Used by the SSSE3 assembly implementation
const SYMCRYPT_ALIGN_AT(16) UINT32 XMM_PACKHIGH[4] = {
    0x80808080, 0x80808080, 0x03020100, 0x0b0a0908,
};
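
// Editor's sketch (compile separately; not part of this file) demonstrating
// XMM_PACKLOW: a control byte with its high bit set makes _mm_shuffle_epi8
// write zero to that destination byte, so the mask selects W0 and W2 into
// the low lanes and zeroes the rest, matching the comment above.
#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>

int main(void)
{
    const uint32_t packlow[4] = { 0x03020100, 0x0b0a0908, 0x80808080, 0x80808080 };
    const uint32_t w[4] = { 0xaaaa0000, 0xbbbb1111, 0xcccc2222, 0xdddd3333 }; // W0..W3
    uint32_t out[4];

    __m128i v = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)w),
                                 _mm_loadu_si128((const __m128i*)packlow));
    _mm_storeu_si128((__m128i*)out, v);

    // Expect lanes (low to high) W0 W2 0 0: aaaa0000 cccc2222 00000000 00000000
    printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
    return 0;
}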

#if SYMCRYPT_MS_VC
#define RORX_U32  _rorx_u32
#define RORX_U64  _rorx_u64
#else
// TODO: implement _rorx functions for clang
#define RORX_U32  ROR32
#define RORX_U64  ROR64
#endif // SYMCRYPT_MS_VC
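
// The ROR32/ROR64 fallbacks above are SymCrypt macros defined elsewhere in
// the library. A portable rotate of the kind they presumably expand to looks
// like this (editor's sketch; compile separately, not part of this file):
#include <stdint.h>
#include <stdio.h>

static inline uint32_t ror32_portable(uint32_t x, unsigned n)
{
    // n is always a constant in 1..31 for the sigma functions below, so the
    // undefined-behavior corner n == 0 never arises here.
    return (x >> n) | (x << (32 - n));
}

int main(void)
{
    printf("%08x\n", ror32_portable(0x80000000u, 7));  // prints 01000000
    return 0;
}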

//
// For documentation on these functions, see FIPS 180-2.
//
// MAJ and CH are the functions Maj and Ch from the standard.
// CSIGMA0 and CSIGMA1 are the capital sigma functions.
// LSIGMA0 and LSIGMA1 are the lowercase sigma functions.
//
// The canonical definitions of the MAJ and CH functions are:
//#define MAJ( x, y, z )    (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
//#define CH( x, y, z )     (((x) & (y)) ^ ((~(x)) & (z)))
// We use the optimized versions defined below.
//
#define MAJ( x, y, z )  ((((z) | (y)) & (x)) | ((z) & (y)))
#define CH( x, y, z )   ((((z) ^ (y)) & (x)) ^ (z))
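
// The optimized MAJ/CH above compute the same functions as the canonical
// forms with fewer operations. Since all four are bitwise, checking every
// combination of a single bit proves equality for all 32-bit inputs.
// Editor's sketch (compile separately; not part of this file):
#include <stdint.h>
#include <stdio.h>

#define MAJ_REF( x, y, z )  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define CH_REF( x, y, z )   (((x) & (y)) ^ ((~(x)) & (z)))
#define MAJ_OPT( x, y, z )  ((((z) | (y)) & (x)) | ((z) & (y)))
#define CH_OPT( x, y, z )   ((((z) ^ (y)) & (x)) ^ (z))

int main(void)
{
    for (uint32_t v = 0; v < 8; v++)
    {
        uint32_t x = v & 1, y = (v >> 1) & 1, z = (v >> 2) & 1;
        if ((MAJ_REF(x, y, z) & 1) != (MAJ_OPT(x, y, z) & 1) ||
            (CH_REF(x, y, z) & 1) != (CH_OPT(x, y, z) & 1))
        {
            printf("mismatch at x=%u y=%u z=%u\n", x, y, z);
            return 1;
        }
    }
    printf("optimized MAJ and CH match the canonical definitions\n");
    return 0;
}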

#define LSIGMA0( x )    (ROR32((x),  7) ^ ROR32((x), 18) ^ ((x)>> 3))
#define LSIGMA1( x )    (ROR32((x), 17) ^ ROR32((x), 19) ^ ((x)>>10))

#define CSIGMA0( x )    (RORX_U32((x),  2) ^ RORX_U32((x), 13) ^ RORX_U32((x), 22))
#define CSIGMA1( x )    (RORX_U32((x),  6) ^ RORX_U32((x), 11) ^ RORX_U32((x), 25))

#define LSIGMA0XMM( x ) \
    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
        _mm_slli_epi32(x,25)  , _mm_srli_epi32(x,  7) ),\
        _mm_slli_epi32(x,14) ), _mm_srli_epi32(x, 18) ),\
        _mm_srli_epi32(x, 3) )
#define LSIGMA1XMM( x ) \
    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
        _mm_slli_epi32(x,15)  , _mm_srli_epi32(x, 17) ),\
        _mm_slli_epi32(x,13) ), _mm_srli_epi32(x, 19) ),\
        _mm_srli_epi32(x,10) )
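
// Why the vector macros XOR shift pairs: SSE2 has no 32-bit rotate, so each
// ROR32(x,n) becomes (x << (32-n)) ^ (x >> n); XOR is safe because the two
// shifted halves occupy disjoint bits. Editor's sketch cross-checking the
// vector sigma against the scalar definition (compile separately):
#include <emmintrin.h>   // SSE2
#include <stdint.h>
#include <stdio.h>

static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static uint32_t lsigma0_scalar(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }

static __m128i lsigma0_xmm(__m128i x)   // same expression as LSIGMA0XMM above
{
    return _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(
        _mm_slli_epi32(x, 25), _mm_srli_epi32(x, 7)),
        _mm_slli_epi32(x, 14)), _mm_srli_epi32(x, 18)),
        _mm_srli_epi32(x, 3));
}

int main(void)
{
    uint32_t in[4] = { 0x12345678, 0x9abcdef0, 0x0f1e2d3c, 0xffffffff };
    uint32_t out[4];
    _mm_storeu_si128((__m128i*)out, lsigma0_xmm(_mm_loadu_si128((const __m128i*)in)));
    for (int i = 0; i < 4; i++)
        printf("%08x %08x %s\n", out[i], lsigma0_scalar(in[i]),
               out[i] == lsigma0_scalar(in[i]) ? "ok" : "MISMATCH");
    return 0;
}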

// Initial loading of message words and endianness transformation.
// bl : the number of blocks to load, 1 <= bl <= 4.
//
// When bl < 4, the high-order lanes of the XMM registers corresponding to the
// missing blocks are unused.
//
#define SHA256_MSG_LOAD_4BLOCKS(bl) { \
        for(SIZE_T i = 0; i < bl; i++) \
        { \
            Wx.xmm[i +  0] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE +  0]), kBYTE_REVERSE_32); \
            Wx.xmm[i +  4] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE + 16]), kBYTE_REVERSE_32); \
            Wx.xmm[i +  8] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE + 32]), kBYTE_REVERSE_32); \
            Wx.xmm[i + 12] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE + 48]), kBYTE_REVERSE_32); \
        } \
}

// Shuffles the initially loaded message words from multiple blocks
// so that each XMM register contains message words with the same index
// within a block (e.g. Wx.xmm[0] contains the first word of each block).
//
// This macro must be invoked four times to transpose a full 64-byte message
// block: ind=0 processes the first quarter (16 bytes), ind=1 the second
// quarter, and so on.
//
#define SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(ind)  { \
        __m128i t1, t2, t3, t4; \
        t1 = _mm_unpacklo_epi32(Wx.xmm[4 * (ind) + 0], Wx.xmm[4 * (ind) + 1]); \
        t2 = _mm_unpacklo_epi32(Wx.xmm[4 * (ind) + 2], Wx.xmm[4 * (ind) + 3]); \
        t3 = _mm_unpackhi_epi32(Wx.xmm[4 * (ind) + 0], Wx.xmm[4 * (ind) + 1]); \
        t4 = _mm_unpackhi_epi32(Wx.xmm[4 * (ind) + 2], Wx.xmm[4 * (ind) + 3]); \
        Wx.xmm[4 * (ind) + 0] = _mm_unpacklo_epi64(t1, t2); \
        Wx.xmm[4 * (ind) + 1] = _mm_unpackhi_epi64(t1, t2); \
        Wx.xmm[4 * (ind) + 2] = _mm_unpacklo_epi64(t3, t4); \
        Wx.xmm[4 * (ind) + 3] = _mm_unpackhi_epi64(t3, t4); \
}

#define SHA256_MSG_TRANSPOSE_4BLOCKS() { \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(0); \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(1); \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(2); \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(3); \
}
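
// Editor's sketch (compile separately; not part of this file) illustrating
// the 4x4 transpose pattern the macro above applies to message words: four
// registers holding rows end up holding columns after two unpack stages.
// SSE2 only.
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t m[4][4] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 },
                         { 8, 9, 10, 11 }, { 12, 13, 14, 15 } };
    __m128i r0 = _mm_loadu_si128((const __m128i*)m[0]);
    __m128i r1 = _mm_loadu_si128((const __m128i*)m[1]);
    __m128i r2 = _mm_loadu_si128((const __m128i*)m[2]);
    __m128i r3 = _mm_loadu_si128((const __m128i*)m[3]);

    // Stage 1: interleave 32-bit words of row pairs.
    __m128i t0 = _mm_unpacklo_epi32(r0, r1);   // m00 m10 m01 m11
    __m128i t1 = _mm_unpacklo_epi32(r2, r3);   // m20 m30 m21 m31
    __m128i t2 = _mm_unpackhi_epi32(r0, r1);   // m02 m12 m03 m13
    __m128i t3 = _mm_unpackhi_epi32(r2, r3);   // m22 m32 m23 m33

    // Stage 2: interleave 64-bit halves to finish the transpose.
    _mm_storeu_si128((__m128i*)m[0], _mm_unpacklo_epi64(t0, t1));  // column 0
    _mm_storeu_si128((__m128i*)m[1], _mm_unpackhi_epi64(t0, t1));  // column 1
    _mm_storeu_si128((__m128i*)m[2], _mm_unpacklo_epi64(t2, t3));  // column 2
    _mm_storeu_si128((__m128i*)m[3], _mm_unpackhi_epi64(t2, t3));  // column 3

    for (int i = 0; i < 4; i++)
        printf("%2u %2u %2u %2u\n", (unsigned)m[i][0], (unsigned)m[i][1],
               (unsigned)m[i][2], (unsigned)m[i][3]);
    return 0;
}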

// One round of the message schedule; updates the r-th message word ( 16 <= r < 64 ).
// Also adds the round constant for round (r-16).
#define SHA256_MSG_EXPAND_4BLOCKS_1ROUND(r) { \
        Wx.xmm[r] = _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(Wx.xmm[r - 16], Wx.xmm[r - 7]), \
                                LSIGMA0XMM(Wx.xmm[r - 15])), LSIGMA1XMM(Wx.xmm[r - 2])); \
        Wx.xmm[r - 16] = _mm_add_epi32(Wx.xmm[r - 16], _mm_set1_epi32(SymCryptSha256K[r - 16])); \
}

// Four rounds of the message schedule. Generates message words for rounds r, r+1, r+2, r+3.
#define SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS(r) { \
        SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 0); SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 1); \
        SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 2); SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 3); \
}

// Sixteen rounds of the message schedule. Generates message words for rounds r, ..., r+15.
#define SHA256_MSG_EXPAND_4BLOCKS_16ROUNDS(r) { \
        SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 0); SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 4); \
        SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 8); SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 12); \
}
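
// Scalar form of the message-schedule recurrence that the macros above apply
// to four blocks at once, one block per 32-bit lane. Editor's sketch
// (compile separately; not part of this file); only the recurrence itself is
// taken from this file, the constant add is omitted.
#include <stdint.h>
#include <stdio.h>

static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static uint32_t ls0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static uint32_t ls1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

int main(void)
{
    uint32_t W[64] = { 1 };  // W[0] = 1, W[1..15] = 0

    // W[16..63] are derived from the 16 message words, exactly as
    // SHA256_MSG_EXPAND_4BLOCKS_1ROUND does per lane.
    for (int r = 16; r < 64; r++)
        W[r] = ls1(W[r - 2]) + W[r - 7] + ls0(W[r - 15]) + W[r - 16];

    // Hand-checkable values: W[16] = 00000001, W[18] = ls1(W[16]) = 0000a000
    printf("W[16]=%08x W[18]=%08x W[63]=%08x\n", W[16], W[18], W[63]);
    return 0;
}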

// Core round function using message words from the Wx array.
// Wx contains interleaved expanded message words from up to 4 blocks:
// the words for round r, one lane per block, are followed by the words
// for round r+1.
//
// r16 : round number mod 16
// rb  : base round number, so that (rb+r16) gives the actual round number
// b   : message block index, b = 0..3
#define CROUND_4BLOCKS(r16, rb, b) {   \
    Wt = Wx.ul4[(rb)+(r16)][b]; \
    ah[ r16   &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + Wt;\
    ah[(r16+4)&7] += ah[r16 &7];\
    ah[ r16   &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\
}

//
// Core round function
//
// r16 : round number mod 16
// r   : round number, r = 0..63
//
#define CROUND( r16, r ) {\
    ah[ r16   &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + SymCryptSha256K[r] + Wt;\
    ah[(r16+4)&7] += ah[r16 &7];\
    ah[ r16   &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\
}
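
// For reference, the textbook form of one compression round (FIPS 180-2).
// CROUND produces the same state transition but keeps a..h in the ah[] array
// and rotates the indices each round instead of moving eight variables.
// Editor's sketch (compile separately; not part of this file); the initial
// state and K[0] are the standard SHA-256 constants.
#include <stdint.h>
#include <stdio.h>

static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

static void sha256_round_textbook(uint32_t s[8], uint32_t kt, uint32_t wt)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
    uint32_t t1 = h + (ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25))
                    + ((e & f) ^ (~e & g)) + kt + wt;
    uint32_t t2 = (ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22))
                    + ((a & b) ^ (a & c) ^ (b & c));
    // The variable shuffle below is what the rotating-index trick avoids.
    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

int main(void)
{
    uint32_t s[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                      0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
    sha256_round_textbook(s, 0x428a2f98, 0);   // one round, Wt = 0
    for (int i = 0; i < 8; i++) printf("%08x ", s[i]);
    printf("\n");
    return 0;
}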

//
// Initial round that reads the message.
// r is the round number, 0..15.
//
#define IROUND( r ) {\
    Wt = SYMCRYPT_LOAD_MSBFIRST32( &pbData[ 4*r ] );\
    Wx.ul[r] = Wt; \
    CROUND(r,r);\
}
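
// Semantics of an MSB-first (big-endian) 32-bit load, which is what
// SYMCRYPT_LOAD_MSBFIRST32 provides; the real macro is defined elsewhere in
// the library, so this standalone equivalent is an assumption. Editor's
// sketch (compile separately; not part of this file):
#include <stdint.h>
#include <stdio.h>

static uint32_t load_msbfirst32(const uint8_t* p)
{
    return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
           ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
}

int main(void)
{
    const uint8_t msg[4] = { 0xde, 0xad, 0xbe, 0xef };
    printf("%08x\n", load_msbfirst32(msg));  // prints deadbeef on any endianness
    return 0;
}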

//
// Subsequent rounds.
// r16 is the round number mod 16; rb is the round number minus r16.
//
#define FROUND(r16, rb) {                                    \
    Wt = LSIGMA1( Wx.ul[(r16-2) & 15] ) +   Wx.ul[(r16-7) & 15] +    \
         LSIGMA0( Wx.ul[(r16-15) & 15]) +   Wx.ul[r16 & 15];         \
    Wx.ul[r16] = Wt; \
    CROUND( r16, r16+rb ); \
}

VOID
SYMCRYPT_CALL
SymCryptSha256AppendBlocks_xmm_4blocks(
    _Inout_                 SYMCRYPT_SHA256_CHAINING_STATE* pChain,
    _In_reads_(cbData)      PCBYTE                          pbData,
                            SIZE_T                          cbData,
    _Out_                   SIZE_T*                         pcbRemaining)
{
    SYMCRYPT_ALIGN union { UINT32 ul[16]; UINT32 ul4[64][4]; __m128i xmm[64]; } Wx;
    SYMCRYPT_ALIGN UINT32 ah[8];
    UINT32 Wt;
    SIZE_T uWipeSize = (cbData >= (3 * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE)) ? (64 * 4 * sizeof(UINT32)) : (16 * sizeof(UINT32));

    const __m128i kBYTE_REVERSE_32 = _mm_load_si128((const __m128i*)BYTE_REVERSE_32);

    while (cbData >= (3 * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE))
    {
        // If we have 4 or more blocks then process 4, else process whatever is left.
        SIZE_T numBlocks = (cbData >= 4 * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE) ? 4 : (cbData / SYMCRYPT_SHA256_INPUT_BLOCK_SIZE);

        SHA256_MSG_LOAD_4BLOCKS(numBlocks);
        SHA256_MSG_TRANSPOSE_4BLOCKS();

        for (int j = 16; j < 64; j += 16)
        {
            SHA256_MSG_EXPAND_4BLOCKS_16ROUNDS(j);
        }

        // Message expansion added the round constants for rounds 0..47; add the remaining ones here.
        for (int i = 48; i < 64; i++)
        {
            Wx.xmm[i] = _mm_add_epi32(Wx.xmm[i], _mm_set1_epi32(SymCryptSha256K[i]));
        }

        for (SIZE_T bl = 0; bl < numBlocks; bl++)
        {
            ah[7] = pChain->H[0];
            ah[6] = pChain->H[1];
            ah[5] = pChain->H[2];
            ah[4] = pChain->H[3];
            ah[3] = pChain->H[4];
            ah[2] = pChain->H[5];
            ah[1] = pChain->H[6];
            ah[0] = pChain->H[7];

            for (int iterCount = 0; iterCount < (64/8); iterCount++)
            {
                const int roundBase = iterCount*8;
                CROUND_4BLOCKS( 0, roundBase, bl);
                CROUND_4BLOCKS( 1, roundBase, bl);
                CROUND_4BLOCKS( 2, roundBase, bl);
                CROUND_4BLOCKS( 3, roundBase, bl);
                CROUND_4BLOCKS( 4, roundBase, bl);
                CROUND_4BLOCKS( 5, roundBase, bl);
                CROUND_4BLOCKS( 6, roundBase, bl);
                CROUND_4BLOCKS( 7, roundBase, bl);
                //CROUND_4BLOCKS( 8, roundBase, bl);
                //CROUND_4BLOCKS( 9, roundBase, bl);
                //CROUND_4BLOCKS(10, roundBase, bl);
                //CROUND_4BLOCKS(11, roundBase, bl);
                //CROUND_4BLOCKS(12, roundBase, bl);
                //CROUND_4BLOCKS(13, roundBase, bl);
                //CROUND_4BLOCKS(14, roundBase, bl);
                //CROUND_4BLOCKS(15, roundBase, bl);
            }

            pChain->H[0] = ah[7] + pChain->H[0];
            pChain->H[1] = ah[6] + pChain->H[1];
            pChain->H[2] = ah[5] + pChain->H[2];
            pChain->H[3] = ah[4] + pChain->H[3];
            pChain->H[4] = ah[3] + pChain->H[4];
            pChain->H[5] = ah[2] + pChain->H[5];
            pChain->H[6] = ah[1] + pChain->H[6];
            pChain->H[7] = ah[0] + pChain->H[7];
        }

        pbData += (numBlocks * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE);
        cbData -= (numBlocks * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE);
    }

    while (cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE)
    {
        ah[7] = pChain->H[0];
        ah[6] = pChain->H[1];
        ah[5] = pChain->H[2];
        ah[4] = pChain->H[3];
        ah[3] = pChain->H[4];
        ah[2] = pChain->H[5];
        ah[1] = pChain->H[6];
        ah[0] = pChain->H[7];

        //
        // initial rounds 0 to 15
        //
        IROUND(0);
        IROUND(1);
        IROUND(2);
        IROUND(3);
        IROUND(4);
        IROUND(5);
        IROUND(6);
        IROUND(7);
        IROUND(8);
        IROUND(9);
        IROUND(10);
        IROUND(11);
        IROUND(12);
        IROUND(13);
        IROUND(14);
        IROUND(15);

        //
        // rounds 16 to 63
        //
        for (int iterCount = 1; iterCount < (64/16); iterCount++)
        {
            const int roundBase = iterCount*16;
            FROUND(0, roundBase);
            FROUND(1, roundBase);
            FROUND(2, roundBase);
            FROUND(3, roundBase);
            FROUND(4, roundBase);
            FROUND(5, roundBase);
            FROUND(6, roundBase);
            FROUND(7, roundBase);
            FROUND(8, roundBase);
            FROUND(9, roundBase);
            FROUND(10, roundBase);
            FROUND(11, roundBase);
            FROUND(12, roundBase);
            FROUND(13, roundBase);
            FROUND(14, roundBase);
            FROUND(15, roundBase);
        }

        pChain->H[0] = ah[7] + pChain->H[0];
        pChain->H[1] = ah[6] + pChain->H[1];
        pChain->H[2] = ah[5] + pChain->H[2];
        pChain->H[3] = ah[4] + pChain->H[3];
        pChain->H[4] = ah[3] + pChain->H[4];
        pChain->H[5] = ah[2] + pChain->H[5];
        pChain->H[6] = ah[1] + pChain->H[6];
        pChain->H[7] = ah[0] + pChain->H[7];

        pbData += SYMCRYPT_SHA256_INPUT_BLOCK_SIZE;
        cbData -= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE;
    }

    *pcbRemaining = cbData;

    //
    // Wipe the variables.
    //
    SymCryptWipe(&Wx, uWipeSize);
    SymCryptWipeKnownSize(ah, sizeof(ah));
}
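
// Hypothetical caller sketch (editor's addition, not from the library),
// based only on the signature and the pChain->H usage above. It assumes
// SYMCRYPT_SHA256_CHAINING_STATE exposes the UINT32 H[8] member used in this
// file and that SYMCRYPT_SHA256_INPUT_BLOCK_SIZE is 64 bytes.
//
//     SYMCRYPT_SHA256_CHAINING_STATE chain;
//     static const UINT32 initialH[8] = {        // FIPS 180-2 initial state
//         0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
//         0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
//     };
//     memcpy(chain.H, initialH, sizeof(initialH));
//
//     BYTE buf[300] = { 0 };                     // 4 full blocks + 44 bytes
//     SIZE_T cbRemaining;
//     SymCryptSha256AppendBlocks_xmm_4blocks(&chain, buf, sizeof(buf), &cbRemaining);
//     // cbRemaining == 300 % 64 == 44; the caller buffers that tail until
//     // enough data arrives to form another 64-byte block.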

#endif // SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64