/src/SymCrypt/lib/sha256-xmm.c
#include "precomp.h"

#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64


extern SYMCRYPT_ALIGN_AT(256) const UINT32 SymCryptSha256K[64];


// Endianness transformation for 4 32-bit values in an XMM register
const SYMCRYPT_ALIGN_AT(16) UINT32 BYTE_REVERSE_32[4] = {
    0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
};
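
// Within each 32-bit lane, this mask maps destination byte i to source byte
// (3 - i). For example, a message word stored big-endian in memory as the
// bytes 01 02 03 04 loads as 0x04030201 on a little-endian CPU; after
// _mm_shuffle_epi8 with this mask it becomes 0x01020304, the value the
// SHA-256 round functions operate on.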

// Shuffle 32-bit words in an XMM register: W3 W2 W1 W0 -> 0 0 W2 W0
// Used by the SSSE3 assembly implementation
const SYMCRYPT_ALIGN_AT(16) UINT32 XMM_PACKLOW[4] = {
    0x03020100, 0x0b0a0908, 0x80808080, 0x80808080,
};

// Shuffle 32-bit words in an XMM register: W3 W2 W1 W0 -> W2 W0 0 0
// Used by the SSSE3 assembly implementation
const SYMCRYPT_ALIGN_AT(16) UINT32 XMM_PACKHIGH[4] = {
    0x80808080, 0x80808080, 0x03020100, 0x0b0a0908,
};
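
// Note: PSHUFB zeroes any destination byte whose mask byte has its high bit
// set, which is what the 0x80 entries do here. XMM_PACKLOW keeps words 0 and 2
// in the low half of the register, XMM_PACKHIGH keeps the same two words in
// the high half, and the remaining lanes are cleared.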


#if SYMCRYPT_MS_VC
#define RORX_U32    _rorx_u32
#define RORX_U64    _rorx_u64
#else
// TODO: implement _rorx functions for clang
#define RORX_U32    ROR32
#define RORX_U64    ROR64
#endif // SYMCRYPT_MS_VC
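
// _rorx_u32/_rorx_u64 compile to the BMI2 RORX instruction, which rotates
// without reading or writing flags and has a separate destination register,
// which helps instruction scheduling in the tight round functions below.
// The fallback path uses ordinary rotates.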


//
// For documentation on these functions, see FIPS 180-2.
//
// MAJ and CH are the functions Maj and Ch from the standard.
// CSIGMA0 and CSIGMA1 are the capital sigma functions.
// LSIGMA0 and LSIGMA1 are the lowercase sigma functions.
//
// The canonical definitions of the MAJ and CH functions are:
//#define MAJ( x, y, z )    (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
//#define CH( x, y, z )     (((x) & (y)) ^ ((~(x)) & (z)))
// We use the optimized versions defined below.
//
#define MAJ( x, y, z )  ((((z) | (y)) & (x)) | ((z) & (y)))
#define CH( x, y, z )   ((((z) ^ (y)) & (x)) ^ (z))
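
// Both optimized forms are bitwise-equivalent to the canonical ones:
// - MAJ: (((z)|(y))&(x)) | ((z)&(y)) is 1 exactly when at least two of
//   x, y, z are 1, i.e. the majority function.
// - CH: (((z)^(y))&(x)) ^ (z) selects y where x is 1 and z where x is 0
//   (x=1 gives z^y^z = y; x=0 gives z), using one AND and two XORs instead
//   of two ANDs, a NOT, and an XOR.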

#define LSIGMA0( x )    (ROR32((x), 7) ^ ROR32((x), 18) ^ ((x)>> 3))
#define LSIGMA1( x )    (ROR32((x), 17) ^ ROR32((x), 19) ^ ((x)>>10))

#define CSIGMA0( x )    (RORX_U32((x), 2) ^ RORX_U32((x), 13) ^ RORX_U32((x), 22))
#define CSIGMA1( x )    (RORX_U32((x), 6) ^ RORX_U32((x), 11) ^ RORX_U32((x), 25))


#define LSIGMA0XMM( x ) \
    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
        _mm_slli_epi32(x,25)  , _mm_srli_epi32(x,  7) ),\
        _mm_slli_epi32(x,14) ), _mm_srli_epi32(x, 18) ),\
        _mm_srli_epi32(x,  3) )
#define LSIGMA1XMM( x ) \
    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
        _mm_slli_epi32(x,15)  , _mm_srli_epi32(x, 17) ),\
        _mm_slli_epi32(x,13) ), _mm_srli_epi32(x, 19) ),\
        _mm_srli_epi32(x,10) )
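
// SSE2 has no 32-bit rotate instruction, so each rotation is emulated with a
// shift pair: ROR32(x,n) == (x << (32-n)) ^ (x >> n). LSIGMA0XMM thus computes
// ROR(x,7) ^ ROR(x,18) ^ (x >> 3) in all four lanes at once, matching the
// scalar LSIGMA0 above; LSIGMA1XMM likewise matches LSIGMA1.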



// Initial loading of message words and endianness transformation.
// bl : The number of blocks to load, 1 <= bl <= 4.
//
// When bl < 4, the high-order lanes of the XMM registers corresponding to the missing blocks are unused.
//
#define SHA256_MSG_LOAD_4BLOCKS(bl) { \
        for (SIZE_T i = 0; i < (bl); i++) \
        { \
            Wx.xmm[i +  0] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE +  0]), kBYTE_REVERSE_32); \
            Wx.xmm[i +  4] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE + 16]), kBYTE_REVERSE_32); \
            Wx.xmm[i +  8] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE + 32]), kBYTE_REVERSE_32); \
            Wx.xmm[i + 12] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &pbData[i * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE + 48]), kBYTE_REVERSE_32); \
        } \
    }
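
// After loading, Wx.xmm[i + 4*q] holds words 4q..4q+3 of block i, already
// byte-reversed. For the full 4-block case:
//   Wx.xmm[ 0.. 3] : words  0.. 3 of blocks 0..3
//   Wx.xmm[ 4.. 7] : words  4.. 7 of blocks 0..3
//   Wx.xmm[ 8..11] : words  8..11 of blocks 0..3
//   Wx.xmm[12..15] : words 12..15 of blocks 0..3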

// Shuffles the initially loaded message words from multiple blocks
// so that each XMM register contains message words with the same index
// within a block (e.g. Wx.xmm[0] contains the first words of each block).
//
// This macro is used four times to transform the 64-byte message blocks;
// ind=0 processes the first quarter (16 bytes), ind=1 the second quarter, and so on.
//
#define SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(ind) { \
        __m128i t1, t2, t3, t4; \
        t1 = _mm_unpacklo_epi32(Wx.xmm[4 * (ind) + 0], Wx.xmm[4 * (ind) + 1]); \
        t2 = _mm_unpacklo_epi32(Wx.xmm[4 * (ind) + 2], Wx.xmm[4 * (ind) + 3]); \
        t3 = _mm_unpackhi_epi32(Wx.xmm[4 * (ind) + 0], Wx.xmm[4 * (ind) + 1]); \
        t4 = _mm_unpackhi_epi32(Wx.xmm[4 * (ind) + 2], Wx.xmm[4 * (ind) + 3]); \
        Wx.xmm[4 * (ind) + 0] = _mm_unpacklo_epi64(t1, t2); \
        Wx.xmm[4 * (ind) + 1] = _mm_unpackhi_epi64(t1, t2); \
        Wx.xmm[4 * (ind) + 2] = _mm_unpacklo_epi64(t3, t4); \
        Wx.xmm[4 * (ind) + 3] = _mm_unpackhi_epi64(t3, t4); \
    }
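
// This is the standard 4x4 transpose of 32-bit words via unpack operations.
// With input rows A, B, C, D (one register per block, lanes listed high to
// low as in the shuffle comments above):
//   t1 = [ B1 A1 B0 A0 ]    t2 = [ D1 C1 D0 C0 ]
//   t3 = [ B3 A3 B2 A2 ]    t4 = [ D3 C3 D2 C2 ]
// and the 64-bit unpacks then yield
//   [ D0 C0 B0 A0 ], [ D1 C1 B1 A1 ], [ D2 C2 B2 A2 ], [ D3 C3 B3 A3 ].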

#define SHA256_MSG_TRANSPOSE_4BLOCKS() { \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(0); \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(1); \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(2); \
        SHA256_MSG_TRANSPOSE_QUARTER_4BLOCKS(3); \
    }

// One round of message schedule: computes the message word for round r ( 16 <= r < 64 ).
// Also adds the round constant for round (r-16) to the corresponding message word.
#define SHA256_MSG_EXPAND_4BLOCKS_1ROUND(r) { \
        Wx.xmm[r] = _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(Wx.xmm[r - 16], Wx.xmm[r - 7]), \
                        LSIGMA0XMM(Wx.xmm[r - 15])), LSIGMA1XMM(Wx.xmm[r - 2])); \
        Wx.xmm[r - 16] = _mm_add_epi32(Wx.xmm[r - 16], _mm_set1_epi32(SymCryptSha256K[r - 16])); \
    }
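
// This evaluates the SHA-256 message schedule recurrence
//   W[r] = LSIGMA1(W[r-2]) + W[r-7] + LSIGMA0(W[r-15]) + W[r-16]
// for four blocks at once, one block per 32-bit lane. Folding K[r-16] into
// W[r-16] here means each round of CROUND_4BLOCKS adds one precombined value
// instead of a separate message word and round constant.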

// Four rounds of message schedule: generates message words for rounds r, r+1, r+2, r+3.
#define SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS(r) { \
        SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 0); SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 1); \
        SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 2); SHA256_MSG_EXPAND_4BLOCKS_1ROUND((r) + 3); \
    }
// Sixteen rounds of message schedule: generates message words for rounds r, ..., r+15.
#define SHA256_MSG_EXPAND_4BLOCKS_16ROUNDS(r) { \
        SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 0); SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 4); \
        SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 8); SHA256_MSG_EXPAND_4BLOCKS_4ROUNDS((r) + 12); \
    }

// Core round function using message words from the Wx array.
// Wx contains interleaved expanded message words from multiple blocks,
// i.e. the message words for round r for each block are followed by the
// message words for round (r+1) for each block.
//
// r16 : round number mod 16
// rb  : base round number, so that (rb+r16) gives the actual round number
// b   : message block index, b = 0..3
#define CROUND_4BLOCKS(r16, rb, b) { \
        Wt = Wx.ul4[(rb)+(r16)][b]; \
        ah[ r16   &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + Wt;\
        ah[(r16+4)&7] += ah[r16 &7];\
        ah[ r16   &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\
    }
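
// The ah[] indexing implements the a..h variable rotation without moving
// data: for round r, working variable h lives in ah[r & 7] and a in
// ah[(r+7) & 7]. Each round overwrites two slots in place (the h slot becomes
// the new a, the d slot becomes the new e), so the rotation happens by
// advancing indices rather than shuffling eight variables. The mapping
// repeats every 8 rounds, which is why the loops below unroll 8 or 16 rounds
// at a time.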

//
// Core round function
//
// r16 : round number mod 16
// r   : round number, r = 0..63
//
#define CROUND( r16, r ) { \
        ah[ r16   &7] += CSIGMA1(ah[(r16+3)&7]) + CH(ah[(r16+3)&7], ah[(r16+2)&7], ah[(r16+1)&7]) + SymCryptSha256K[r] + Wt;\
        ah[(r16+4)&7] += ah[r16 &7];\
        ah[ r16   &7] += CSIGMA0(ah[(r16+7)&7]) + MAJ(ah[(r16+7)&7], ah[(r16+6)&7], ah[(r16+5)&7]);\
    }

//
// Initial round that reads the message.
// r is the round number, 0..15.
//
#define IROUND( r ) { \
        Wt = SYMCRYPT_LOAD_MSBFIRST32( &pbData[ 4*r ] );\
        Wx.ul[r] = Wt; \
        CROUND(r,r);\
    }

//
// Subsequent rounds.
// r16 is the round number mod 16; rb is the round number minus r16.
//
#define FROUND(r16, rb) { \
        Wt = LSIGMA1( Wx.ul[(r16-2) & 15] ) + Wx.ul[(r16-7) & 15] + \
             LSIGMA0( Wx.ul[(r16-15) & 15] ) + Wx.ul[r16 & 15]; \
        Wx.ul[r16] = Wt; \
        CROUND( r16, r16+rb ); \
    }
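
// Only a sliding window of 16 message words is needed at any time, so the
// one-block path keeps Wx.ul[] as a 16-entry circular buffer: IROUND fills it
// with the words for rounds 0..15, and FROUND overwrites the slot of round
// r-16 with the word for round r, using the same recurrence as the 4-block
// message expansion above.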



VOID
SYMCRYPT_CALL
SymCryptSha256AppendBlocks_xmm_4blocks(
    _Inout_             SYMCRYPT_SHA256_CHAINING_STATE* pChain,
    _In_reads_(cbData)  PCBYTE                          pbData,
                        SIZE_T                          cbData,
    _Out_               SIZE_T*                         pcbRemaining)
{

    SYMCRYPT_ALIGN union { UINT32 ul[16]; UINT32 ul4[64][4]; __m128i xmm[64]; } Wx;
    SYMCRYPT_ALIGN UINT32 ah[8];
    UINT32 Wt;
    SIZE_T uWipeSize = (cbData >= (3 * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE)) ? (64 * 4 * sizeof(UINT32)) : (16 * sizeof(UINT32));
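    // If the multi-block path below runs, all 64*4 expanded message words in
    // Wx are written and must be wiped on exit; otherwise only the 16-word
    // circular buffer of the one-block path is used. The size is recorded
    // here, before cbData is consumed.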

    const __m128i kBYTE_REVERSE_32 = _mm_load_si128((const __m128i*)BYTE_REVERSE_32);

    while (cbData >= (3 * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE))
    {
        // If we have 4 or more blocks then process 4, else process whatever is left.
        SIZE_T numBlocks = (cbData >= 4 * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE) ? 4 : (cbData / SYMCRYPT_SHA256_INPUT_BLOCK_SIZE);
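        // Note: the vectorized path is only entered with at least 3 blocks of
        // data; for 1 or 2 blocks the scalar loop below is used, presumably
        // because the transpose and interleaved expansion do not pay off for
        // fewer blocks.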

        SHA256_MSG_LOAD_4BLOCKS(numBlocks);
        SHA256_MSG_TRANSPOSE_4BLOCKS();

        for (int j = 16; j < 64; j += 16)
        {
            SHA256_MSG_EXPAND_4BLOCKS_16ROUNDS(j);
        }

        // Round constants for rounds 0..47 were added during message expansion;
        // add the remaining ones here.
        for (int i = 48; i < 64; i++)
        {
            Wx.xmm[i] = _mm_add_epi32(Wx.xmm[i], _mm_set1_epi32(SymCryptSha256K[i]));
        }

        for (SIZE_T bl = 0; bl < numBlocks; bl++)
        {
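            // The chaining state is loaded in reverse order (ah[7] = a, ...,
            // ah[0] = h) so that round 0 finds h in ah[0], matching the
            // rotating-index scheme described at CROUND_4BLOCKS.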
            ah[7] = pChain->H[0];
            ah[6] = pChain->H[1];
            ah[5] = pChain->H[2];
            ah[4] = pChain->H[3];
            ah[3] = pChain->H[4];
            ah[2] = pChain->H[5];
            ah[1] = pChain->H[6];
            ah[0] = pChain->H[7];

            for (int iterCount = 0; iterCount < (64/8); iterCount++)
            {
                const int roundBase = iterCount*8;
                CROUND_4BLOCKS( 0, roundBase, bl);
                CROUND_4BLOCKS( 1, roundBase, bl);
                CROUND_4BLOCKS( 2, roundBase, bl);
                CROUND_4BLOCKS( 3, roundBase, bl);
                CROUND_4BLOCKS( 4, roundBase, bl);
                CROUND_4BLOCKS( 5, roundBase, bl);
                CROUND_4BLOCKS( 6, roundBase, bl);
                CROUND_4BLOCKS( 7, roundBase, bl);
                //CROUND_4BLOCKS( 8, roundBase, bl);
                //CROUND_4BLOCKS( 9, roundBase, bl);
                //CROUND_4BLOCKS(10, roundBase, bl);
                //CROUND_4BLOCKS(11, roundBase, bl);
                //CROUND_4BLOCKS(12, roundBase, bl);
                //CROUND_4BLOCKS(13, roundBase, bl);
                //CROUND_4BLOCKS(14, roundBase, bl);
                //CROUND_4BLOCKS(15, roundBase, bl);
            }
|
248 | 0 | pChain->H[0] = ah[7] + pChain->H[0]; |
249 | 0 | pChain->H[1] = ah[6] + pChain->H[1]; |
250 | 0 | pChain->H[2] = ah[5] + pChain->H[2]; |
251 | 0 | pChain->H[3] = ah[4] + pChain->H[3]; |
252 | 0 | pChain->H[4] = ah[3] + pChain->H[4]; |
253 | 0 | pChain->H[5] = ah[2] + pChain->H[5]; |
254 | 0 | pChain->H[6] = ah[1] + pChain->H[6]; |
255 | 0 | pChain->H[7] = ah[0] + pChain->H[7]; |
256 | 0 | } |
257 | |
|
258 | 0 | pbData += (numBlocks * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE); |
259 | 0 | cbData -= (numBlocks * SYMCRYPT_SHA256_INPUT_BLOCK_SIZE); |
260 | 0 | } |


    while (cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE)
    {
        ah[7] = pChain->H[0];
        ah[6] = pChain->H[1];
        ah[5] = pChain->H[2];
        ah[4] = pChain->H[3];
        ah[3] = pChain->H[4];
        ah[2] = pChain->H[5];
        ah[1] = pChain->H[6];
        ah[0] = pChain->H[7];

        //
        // initial rounds 0 to 15
        //

        IROUND(0);
        IROUND(1);
        IROUND(2);
        IROUND(3);
        IROUND(4);
        IROUND(5);
        IROUND(6);
        IROUND(7);
        IROUND(8);
        IROUND(9);
        IROUND(10);
        IROUND(11);
        IROUND(12);
        IROUND(13);
        IROUND(14);
        IROUND(15);


        //
        // rounds 16 to 63
        //
        for (int iterCount = 1; iterCount < (64/16); iterCount++)
        {
            const int roundBase = iterCount*16;
            FROUND(0, roundBase);
            FROUND(1, roundBase);
            FROUND(2, roundBase);
            FROUND(3, roundBase);
            FROUND(4, roundBase);
            FROUND(5, roundBase);
            FROUND(6, roundBase);
            FROUND(7, roundBase);
            FROUND(8, roundBase);
            FROUND(9, roundBase);
            FROUND(10, roundBase);
            FROUND(11, roundBase);
            FROUND(12, roundBase);
            FROUND(13, roundBase);
            FROUND(14, roundBase);
            FROUND(15, roundBase);
        }

        pChain->H[0] = ah[7] + pChain->H[0];
        pChain->H[1] = ah[6] + pChain->H[1];
        pChain->H[2] = ah[5] + pChain->H[2];
        pChain->H[3] = ah[4] + pChain->H[3];
        pChain->H[4] = ah[3] + pChain->H[4];
        pChain->H[5] = ah[2] + pChain->H[5];
        pChain->H[6] = ah[1] + pChain->H[6];
        pChain->H[7] = ah[0] + pChain->H[7];

        pbData += SYMCRYPT_SHA256_INPUT_BLOCK_SIZE;
        cbData -= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE;
    }
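
    // cbData is now less than one block; report the leftover byte count back
    // to the caller.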
    *pcbRemaining = cbData;

    //
    // Wipe the variables.
    //
    SymCryptWipe(&Wx, uWipeSize);
    SymCryptWipeKnownSize(ah, sizeof(ah));
}

#endif // SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64