/src/nss/lib/freebl/sha256-x86.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* This Source Code Form is subject to the terms of the Mozilla Public  | 
2  |  |  * License, v. 2.0. If a copy of the MPL was not distributed with this  | 
3  |  |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */  | 
4  |  |  | 
5  |  | #ifdef USE_HW_SHA2  | 
6  |  |  | 
7  |  | #include <immintrin.h>  | 
8  |  |  | 
9  |  | #ifdef FREEBL_NO_DEPEND  | 
10  |  | #include "stubs.h"  | 
11  |  | #endif  | 
12  |  |  | 
13  |  | #include "blapii.h"  | 
14  |  | #include "prcpucfg.h"  | 
15  |  | #include "prtypes.h" /* for PRUintXX */  | 
16  |  | #include "prlong.h"  | 
17  |  | #include "blapi.h"  | 
18  |  | #include "sha256.h"  | 
19  |  |  | 
20  |  | /* SHA-256 constants, K256. */  | 
21  |  | pre_align static const PRUint32 K256[64] post_align = { | 
22  |  |     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,  | 
23  |  |     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,  | 
24  |  |     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,  | 
25  |  |     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,  | 
26  |  |     0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,  | 
27  |  |     0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,  | 
28  |  |     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,  | 
29  |  |     0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,  | 
30  |  |     0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,  | 
31  |  |     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,  | 
32  |  |     0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,  | 
33  |  |     0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,  | 
34  |  |     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,  | 
35  |  |     0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,  | 
36  |  |     0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,  | 
37  |  |     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2  | 
38  |  | };  | 
39  |  |  | 
/*
 * ROUND(n, x0, x1, x2, x3) -- run SHA-256 rounds 4n .. 4n+3.
 *
 *   n        literal group index 0..15; it is token-pasted onto "k", so the
 *            expansion site must have locals named k0 .. k15 in scope.
 *   x0..x3   the four __m128i message-schedule vectors, x0 being the one
 *            consumed by this group. For n < 12, x0 is overwritten with the
 *            schedule words needed four groups later.
 *
 * The macro also reads and writes the locals w0 and w1 of the expansion
 * site (the two halves of the working state); each _mm_sha256rnds2_epu32
 * call advances the state by two rounds and the halves swap roles between
 * the two calls.
 *
 * The expansion is a bare { } block on purpose: call sites invoke ROUND
 * without a trailing semicolon.
 */
#define ROUND(n, x0, x1, x2, x3)                                  \
    {                                                             \
        const __m128i wk_lo = _mm_add_epi32(x0, k##n);            \
        const __m128i wk_hi = _mm_shuffle_epi32(wk_lo, 0x0e);     \
        w1 = _mm_sha256rnds2_epu32(w1, w0, wk_lo);                \
        w0 = _mm_sha256rnds2_epu32(w0, w1, wk_hi);                \
        if (n < 12) {                                             \
            x0 = _mm_sha256msg1_epu32(x0, x1);                    \
            x0 = _mm_add_epi32(x0, _mm_alignr_epi8(x3, x2, 4));   \
            x0 = _mm_sha256msg2_epu32(x0, x3);                    \
        }                                                         \
    }
52  |  |  | 
53  |  | void  | 
54  |  | SHA256_Compress_Native(SHA256Context *ctx)  | 
55  | 0  | { | 
56  | 0  |     __m128i h0, h1, th;  | 
57  | 0  |     __m128i a, b, c, d;  | 
58  | 0  |     __m128i w0, w1;  | 
59  | 0  |     const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);  | 
60  |  | 
  | 
61  | 0  |     const __m128i *K = (__m128i *)K256;  | 
62  | 0  |     const __m128i k0 = _mm_load_si128(K);  | 
63  | 0  |     const __m128i k1 = _mm_load_si128(K + 1);  | 
64  | 0  |     const __m128i k2 = _mm_load_si128(K + 2);  | 
65  | 0  |     const __m128i k3 = _mm_load_si128(K + 3);  | 
66  | 0  |     const __m128i k4 = _mm_load_si128(K + 4);  | 
67  | 0  |     const __m128i k5 = _mm_load_si128(K + 5);  | 
68  | 0  |     const __m128i k6 = _mm_load_si128(K + 6);  | 
69  | 0  |     const __m128i k7 = _mm_load_si128(K + 7);  | 
70  | 0  |     const __m128i k8 = _mm_load_si128(K + 8);  | 
71  | 0  |     const __m128i k9 = _mm_load_si128(K + 9);  | 
72  | 0  |     const __m128i k10 = _mm_load_si128(K + 10);  | 
73  | 0  |     const __m128i k11 = _mm_load_si128(K + 11);  | 
74  | 0  |     const __m128i k12 = _mm_load_si128(K + 12);  | 
75  | 0  |     const __m128i k13 = _mm_load_si128(K + 13);  | 
76  | 0  |     const __m128i k14 = _mm_load_si128(K + 14);  | 
77  | 0  |     const __m128i k15 = _mm_load_si128(K + 15);  | 
78  |  | 
  | 
79  | 0  |     const __m128i *input = (__m128i *)ctx->u.b;  | 
80  |  | 
  | 
81  | 0  |     h0 = _mm_loadu_si128((__m128i *)(ctx->h));  | 
82  | 0  |     h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));  | 
83  |  |  | 
84  |  |     /* H0123:4567 -> H01256:H2367 */  | 
85  | 0  |     th = _mm_shuffle_epi32(h0, 0xb1);  | 
86  | 0  |     h1 = _mm_shuffle_epi32(h1, 0x1b);  | 
87  | 0  |     h0 = _mm_alignr_epi8(th, h1, 8);  | 
88  | 0  |     h1 = _mm_blend_epi16(h1, th, 0xf0);  | 
89  |  | 
  | 
90  | 0  |     a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);  | 
91  | 0  |     b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);  | 
92  | 0  |     c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);  | 
93  | 0  |     d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);  | 
94  |  | 
  | 
95  | 0  |     w0 = h0;  | 
96  | 0  |     w1 = h1;  | 
97  |  | 
  | 
98  | 0  |     ROUND(0, a, b, c, d)  | 
99  | 0  |     ROUND(1, b, c, d, a)  | 
100  | 0  |     ROUND(2, c, d, a, b)  | 
101  | 0  |     ROUND(3, d, a, b, c)  | 
102  | 0  |     ROUND(4, a, b, c, d)  | 
103  | 0  |     ROUND(5, b, c, d, a)  | 
104  | 0  |     ROUND(6, c, d, a, b)  | 
105  | 0  |     ROUND(7, d, a, b, c)  | 
106  | 0  |     ROUND(8, a, b, c, d)  | 
107  | 0  |     ROUND(9, b, c, d, a)  | 
108  | 0  |     ROUND(10, c, d, a, b)  | 
109  | 0  |     ROUND(11, d, a, b, c)  | 
110  | 0  |     ROUND(12, a, b, c, d)  | 
111  | 0  |     ROUND(13, b, c, d, a)  | 
112  | 0  |     ROUND(14, c, d, a, b)  | 
113  | 0  |     ROUND(15, d, a, b, c)  | 
114  |  | 
  | 
115  | 0  |     h0 = _mm_add_epi32(h0, w0);  | 
116  | 0  |     h1 = _mm_add_epi32(h1, w1);  | 
117  |  |  | 
118  |  |     /* H0145:2367 -> H0123:4567 */  | 
119  | 0  |     th = _mm_shuffle_epi32(h0, 0x1b);  | 
120  | 0  |     h1 = _mm_shuffle_epi32(h1, 0xb1);  | 
121  | 0  |     h0 = _mm_blend_epi16(th, h1, 0xf0);  | 
122  | 0  |     h1 = _mm_alignr_epi8(h1, th, 8);  | 
123  |  | 
  | 
124  | 0  |     _mm_storeu_si128((__m128i *)ctx->h, h0);  | 
125  | 0  |     _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);  | 
126  | 0  | }  | 
127  |  |  | 
128  |  | void  | 
129  |  | SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,  | 
130  |  |                      unsigned int inputLen)  | 
131  | 0  | { | 
132  | 0  |     __m128i h0, h1, th;  | 
133  | 0  |     const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);  | 
134  |  | 
  | 
135  | 0  |     const __m128i *K = (__m128i *)K256;  | 
136  | 0  |     const __m128i k0 = _mm_load_si128(K);  | 
137  | 0  |     const __m128i k1 = _mm_load_si128(K + 1);  | 
138  | 0  |     const __m128i k2 = _mm_load_si128(K + 2);  | 
139  | 0  |     const __m128i k3 = _mm_load_si128(K + 3);  | 
140  | 0  |     const __m128i k4 = _mm_load_si128(K + 4);  | 
141  | 0  |     const __m128i k5 = _mm_load_si128(K + 5);  | 
142  | 0  |     const __m128i k6 = _mm_load_si128(K + 6);  | 
143  | 0  |     const __m128i k7 = _mm_load_si128(K + 7);  | 
144  | 0  |     const __m128i k8 = _mm_load_si128(K + 8);  | 
145  | 0  |     const __m128i k9 = _mm_load_si128(K + 9);  | 
146  | 0  |     const __m128i k10 = _mm_load_si128(K + 10);  | 
147  | 0  |     const __m128i k11 = _mm_load_si128(K + 11);  | 
148  | 0  |     const __m128i k12 = _mm_load_si128(K + 12);  | 
149  | 0  |     const __m128i k13 = _mm_load_si128(K + 13);  | 
150  | 0  |     const __m128i k14 = _mm_load_si128(K + 14);  | 
151  | 0  |     const __m128i k15 = _mm_load_si128(K + 15);  | 
152  |  | 
  | 
153  | 0  |     unsigned int inBuf = ctx->sizeLo & 0x3f;  | 
154  | 0  |     if (!inputLen) { | 
155  | 0  |         return;  | 
156  | 0  |     }  | 
157  |  |  | 
158  |  |     /* Add inputLen into the count of bytes processed, before processing */  | 
159  | 0  |     if ((ctx->sizeLo += inputLen) < inputLen) { | 
160  | 0  |         ctx->sizeHi++;  | 
161  | 0  |     }  | 
162  |  |  | 
163  |  |     /* if data already in buffer, attempt to fill rest of buffer */  | 
164  | 0  |     if (inBuf) { | 
165  | 0  |         unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;  | 
166  | 0  |         if (inputLen < todo) { | 
167  | 0  |             todo = inputLen;  | 
168  | 0  |         }  | 
169  | 0  |         memcpy(ctx->u.b + inBuf, input, todo);  | 
170  | 0  |         input += todo;  | 
171  | 0  |         inputLen -= todo;  | 
172  | 0  |         if (inBuf + todo == SHA256_BLOCK_LENGTH) { | 
173  | 0  |             SHA256_Compress_Native(ctx);  | 
174  | 0  |         }  | 
175  | 0  |     }  | 
176  |  | 
  | 
177  | 0  |     h0 = _mm_loadu_si128((__m128i *)(ctx->h));  | 
178  | 0  |     h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));  | 
179  |  |  | 
180  |  |     /* H0123:4567 -> H01256:H2367 */  | 
181  | 0  |     th = _mm_shuffle_epi32(h0, 0xb1);  | 
182  | 0  |     h1 = _mm_shuffle_epi32(h1, 0x1b);  | 
183  | 0  |     h0 = _mm_alignr_epi8(th, h1, 8);  | 
184  | 0  |     h1 = _mm_blend_epi16(h1, th, 0xf0);  | 
185  |  |  | 
186  |  |     /* if enough data to fill one or more whole buffers, process them. */  | 
187  | 0  |     while (inputLen >= SHA256_BLOCK_LENGTH) { | 
188  | 0  |         __m128i a, b, c, d;  | 
189  | 0  |         __m128i w0, w1;  | 
190  | 0  |         a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);  | 
191  | 0  |         b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);  | 
192  | 0  |         c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);  | 
193  | 0  |         d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);  | 
194  | 0  |         input += SHA256_BLOCK_LENGTH;  | 
195  | 0  |         inputLen -= SHA256_BLOCK_LENGTH;  | 
196  |  | 
  | 
197  | 0  |         w0 = h0;  | 
198  | 0  |         w1 = h1;  | 
199  |  | 
  | 
200  | 0  |         ROUND(0, a, b, c, d)  | 
201  | 0  |         ROUND(1, b, c, d, a)  | 
202  | 0  |         ROUND(2, c, d, a, b)  | 
203  | 0  |         ROUND(3, d, a, b, c)  | 
204  | 0  |         ROUND(4, a, b, c, d)  | 
205  | 0  |         ROUND(5, b, c, d, a)  | 
206  | 0  |         ROUND(6, c, d, a, b)  | 
207  | 0  |         ROUND(7, d, a, b, c)  | 
208  | 0  |         ROUND(8, a, b, c, d)  | 
209  | 0  |         ROUND(9, b, c, d, a)  | 
210  | 0  |         ROUND(10, c, d, a, b)  | 
211  | 0  |         ROUND(11, d, a, b, c)  | 
212  | 0  |         ROUND(12, a, b, c, d)  | 
213  | 0  |         ROUND(13, b, c, d, a)  | 
214  | 0  |         ROUND(14, c, d, a, b)  | 
215  | 0  |         ROUND(15, d, a, b, c)  | 
216  |  | 
  | 
217  | 0  |         h0 = _mm_add_epi32(h0, w0);  | 
218  | 0  |         h1 = _mm_add_epi32(h1, w1);  | 
219  | 0  |     }  | 
220  |  |  | 
221  |  |     // H01234567 -> H01256 and H2367  | 
222  | 0  |     th = _mm_shuffle_epi32(h0, 0x1b);  | 
223  | 0  |     h1 = _mm_shuffle_epi32(h1, 0xb1);  | 
224  | 0  |     h0 = _mm_blend_epi16(th, h1, 0xf0);  | 
225  | 0  |     h1 = _mm_alignr_epi8(h1, th, 8);  | 
226  |  | 
  | 
227  | 0  |     _mm_storeu_si128((__m128i *)ctx->h, h0);  | 
228  | 0  |     _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);  | 
229  |  |  | 
230  |  |     /* if data left over, fill it into buffer */  | 
231  | 0  |     if (inputLen) { | 
232  | 0  |         memcpy(ctx->u.b, input, inputLen);  | 
233  | 0  |     }  | 
234  | 0  | }  | 
235  |  |  | 
236  |  | #endif /* USE_HW_SHA2 */  |