/src/nss/lib/freebl/sha256-x86.c
Line | Count | Source |
1 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
2 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
3 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
4 | | |
5 | | #ifdef USE_HW_SHA2 |
6 | | |
7 | | #include <immintrin.h> |
8 | | |
9 | | #ifdef FREEBL_NO_DEPEND |
10 | | #include "stubs.h" |
11 | | #endif |
12 | | |
13 | | #include "blapii.h" |
14 | | #include "prcpucfg.h" |
15 | | #include "prtypes.h" /* for PRUintXX */ |
16 | | #include "prlong.h" |
17 | | #include "blapi.h" |
18 | | #include "sha256.h" |
19 | | |
20 | | /* SHA-256 constants, K256. */ |
21 | | pre_align static const PRUint32 K256[64] post_align = { |
22 | | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
23 | | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
24 | | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
25 | | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, |
26 | | 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, |
27 | | 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, |
28 | | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, |
29 | | 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, |
30 | | 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, |
31 | | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, |
32 | | 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, |
33 | | 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, |
34 | | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, |
35 | | 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, |
36 | | 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, |
37 | | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 |
38 | | }; |
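/* The constants are grouped four per 128-bit vector and fetched with aligned
 * loads (_mm_load_si128) below, so pre_align/post_align (from blapii.h) are
 * assumed here to give the table the required 16-byte alignment. */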
39 | | |
40 | | #define ROUND(n, a, b, c, d) \ |
41 | 0 | { \ |
42 | 0 | __m128i t = _mm_add_epi32(a, k##n); \ |
43 | 0 | w1 = _mm_sha256rnds2_epu32(w1, w0, t); \ |
44 | 0 | t = _mm_shuffle_epi32(t, 0x0e); \ |
45 | 0 | w0 = _mm_sha256rnds2_epu32(w0, w1, t); \ |
46 | 0 | if (n < 12) { \ |
47 | 0 | a = _mm_sha256msg1_epu32(a, b); \ |
48 | 0 | a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \ |
49 | 0 | a = _mm_sha256msg2_epu32(a, d); \ |
50 | 0 | } \ |
51 | 0 | } |
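/* Each ROUND(n, a, b, c, d) advances the state by four rounds: the two
 * _mm_sha256rnds2_epu32 calls do two rounds each, consuming the low and high
 * halves of t = W[4n..4n+3] + K256[4n..4n+3].  While n < 12 the macro also
 * extends the message schedule (_mm_sha256msg1_epu32/_mm_sha256msg2_epu32
 * plus the _mm_alignr_epi8 add), leaving the next four message words
 * W[4n+16..4n+19] in `a`.  Sixteen invocations cover all 64 rounds. */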
52 | | |
53 | | void |
54 | | SHA256_Compress_Native(SHA256Context *ctx) |
55 | 0 | { |
56 | 0 | __m128i h0, h1, th; |
57 | 0 | __m128i a, b, c, d; |
58 | 0 | __m128i w0, w1; |
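    /* The mask below byte-reverses each 32-bit lane with _mm_shuffle_epi8,
     * converting the block's big-endian message words to host order. */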
59 | 0 | const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); |
60 | |
61 | 0 | const __m128i *K = (__m128i *)K256; |
62 | 0 | const __m128i k0 = _mm_load_si128(K); |
63 | 0 | const __m128i k1 = _mm_load_si128(K + 1); |
64 | 0 | const __m128i k2 = _mm_load_si128(K + 2); |
65 | 0 | const __m128i k3 = _mm_load_si128(K + 3); |
66 | 0 | const __m128i k4 = _mm_load_si128(K + 4); |
67 | 0 | const __m128i k5 = _mm_load_si128(K + 5); |
68 | 0 | const __m128i k6 = _mm_load_si128(K + 6); |
69 | 0 | const __m128i k7 = _mm_load_si128(K + 7); |
70 | 0 | const __m128i k8 = _mm_load_si128(K + 8); |
71 | 0 | const __m128i k9 = _mm_load_si128(K + 9); |
72 | 0 | const __m128i k10 = _mm_load_si128(K + 10); |
73 | 0 | const __m128i k11 = _mm_load_si128(K + 11); |
74 | 0 | const __m128i k12 = _mm_load_si128(K + 12); |
75 | 0 | const __m128i k13 = _mm_load_si128(K + 13); |
76 | 0 | const __m128i k14 = _mm_load_si128(K + 14); |
77 | 0 | const __m128i k15 = _mm_load_si128(K + 15); |
78 | |
79 | 0 | const __m128i *input = (__m128i *)ctx->u.b; |
80 | |
81 | 0 | h0 = _mm_loadu_si128((__m128i *)(ctx->h)); |
82 | 0 | h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4)); |
83 | | |
84 | | /* H0123:4567 -> H0145:H2367 */ |
85 | 0 | th = _mm_shuffle_epi32(h0, 0xb1); |
86 | 0 | h1 = _mm_shuffle_epi32(h1, 0x1b); |
87 | 0 | h0 = _mm_alignr_epi8(th, h1, 8); |
88 | 0 | h1 = _mm_blend_epi16(h1, th, 0xf0); |
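    /* h0 now holds (A,B,E,F) and h1 holds (C,D,G,H), the two-register layout
     * that SHA256RNDS2 works on; the inverse shuffle after the rounds puts
     * the state back into canonical H0..H7 order before it is stored. */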
89 | |
90 | 0 | a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle); |
91 | 0 | b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle); |
92 | 0 | c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle); |
93 | 0 | d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle); |
94 | |
95 | 0 | w0 = h0; |
96 | 0 | w1 = h1; |
97 | |
98 | 0 | ROUND(0, a, b, c, d) |
99 | 0 | ROUND(1, b, c, d, a) |
100 | 0 | ROUND(2, c, d, a, b) |
101 | 0 | ROUND(3, d, a, b, c) |
102 | 0 | ROUND(4, a, b, c, d) |
103 | 0 | ROUND(5, b, c, d, a) |
104 | 0 | ROUND(6, c, d, a, b) |
105 | 0 | ROUND(7, d, a, b, c) |
106 | 0 | ROUND(8, a, b, c, d) |
107 | 0 | ROUND(9, b, c, d, a) |
108 | 0 | ROUND(10, c, d, a, b) |
109 | 0 | ROUND(11, d, a, b, c) |
110 | 0 | ROUND(12, a, b, c, d) |
111 | 0 | ROUND(13, b, c, d, a) |
112 | 0 | ROUND(14, c, d, a, b) |
113 | 0 | ROUND(15, d, a, b, c) |
114 | |
115 | 0 | h0 = _mm_add_epi32(h0, w0); |
116 | 0 | h1 = _mm_add_epi32(h1, w1); |
117 | | |
118 | | /* H0145:2367 -> H0123:4567 */ |
119 | 0 | th = _mm_shuffle_epi32(h0, 0x1b); |
120 | 0 | h1 = _mm_shuffle_epi32(h1, 0xb1); |
121 | 0 | h0 = _mm_blend_epi16(th, h1, 0xf0); |
122 | 0 | h1 = _mm_alignr_epi8(h1, th, 8); |
123 | |
|
124 | 0 | _mm_storeu_si128((__m128i *)ctx->h, h0); |
125 | 0 | _mm_storeu_si128((__m128i *)(ctx->h + 4), h1); |
126 | 0 | } |
127 | | |
128 | | void |
129 | | SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input, |
130 | | unsigned int inputLen) |
131 | 0 | { |
132 | 0 | __m128i h0, h1, th; |
133 | 0 | const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); |
134 | |
135 | 0 | const __m128i *K = (__m128i *)K256; |
136 | 0 | const __m128i k0 = _mm_load_si128(K); |
137 | 0 | const __m128i k1 = _mm_load_si128(K + 1); |
138 | 0 | const __m128i k2 = _mm_load_si128(K + 2); |
139 | 0 | const __m128i k3 = _mm_load_si128(K + 3); |
140 | 0 | const __m128i k4 = _mm_load_si128(K + 4); |
141 | 0 | const __m128i k5 = _mm_load_si128(K + 5); |
142 | 0 | const __m128i k6 = _mm_load_si128(K + 6); |
143 | 0 | const __m128i k7 = _mm_load_si128(K + 7); |
144 | 0 | const __m128i k8 = _mm_load_si128(K + 8); |
145 | 0 | const __m128i k9 = _mm_load_si128(K + 9); |
146 | 0 | const __m128i k10 = _mm_load_si128(K + 10); |
147 | 0 | const __m128i k11 = _mm_load_si128(K + 11); |
148 | 0 | const __m128i k12 = _mm_load_si128(K + 12); |
149 | 0 | const __m128i k13 = _mm_load_si128(K + 13); |
150 | 0 | const __m128i k14 = _mm_load_si128(K + 14); |
151 | 0 | const __m128i k15 = _mm_load_si128(K + 15); |
152 | |
153 | 0 | unsigned int inBuf = ctx->sizeLo & 0x3f; |
154 | 0 | if (!inputLen) { |
155 | 0 | return; |
156 | 0 | } |
157 | | |
158 | | /* Add inputLen into the count of bytes processed, before processing */ |
159 | 0 | if ((ctx->sizeLo += inputLen) < inputLen) { |
160 | 0 | ctx->sizeHi++; |
161 | 0 | } |
162 | | |
163 | | /* if data already in buffer, attempt to fill rest of buffer */ |
164 | 0 | if (inBuf) { |
165 | 0 | unsigned int todo = SHA256_BLOCK_LENGTH - inBuf; |
166 | 0 | if (inputLen < todo) { |
167 | 0 | todo = inputLen; |
168 | 0 | } |
169 | 0 | memcpy(ctx->u.b + inBuf, input, todo); |
170 | 0 | input += todo; |
171 | 0 | inputLen -= todo; |
172 | 0 | if (inBuf + todo == SHA256_BLOCK_LENGTH) { |
173 | 0 | SHA256_Compress_Native(ctx); |
174 | 0 | } |
175 | 0 | } |
176 | |
177 | 0 | h0 = _mm_loadu_si128((__m128i *)(ctx->h)); |
178 | 0 | h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4)); |
179 | | |
180 | | /* H0123:4567 -> H0145:H2367 */ |
181 | 0 | th = _mm_shuffle_epi32(h0, 0xb1); |
182 | 0 | h1 = _mm_shuffle_epi32(h1, 0x1b); |
183 | 0 | h0 = _mm_alignr_epi8(th, h1, 8); |
184 | 0 | h1 = _mm_blend_epi16(h1, th, 0xf0); |
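    /* The working state stays in h0/h1 for the whole block loop below, so
     * ctx->h is read once per call and written back once after the loop
     * instead of once per 64-byte block. */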
185 | | |
186 | | /* if enough data to fill one or more whole buffers, process them. */ |
187 | 0 | while (inputLen >= SHA256_BLOCK_LENGTH) { |
188 | 0 | __m128i a, b, c, d; |
189 | 0 | __m128i w0, w1; |
190 | 0 | a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle); |
191 | 0 | b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle); |
192 | 0 | c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle); |
193 | 0 | d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle); |
194 | 0 | input += SHA256_BLOCK_LENGTH; |
195 | 0 | inputLen -= SHA256_BLOCK_LENGTH; |
196 | |
197 | 0 | w0 = h0; |
198 | 0 | w1 = h1; |
199 | |
200 | 0 | ROUND(0, a, b, c, d) |
201 | 0 | ROUND(1, b, c, d, a) |
202 | 0 | ROUND(2, c, d, a, b) |
203 | 0 | ROUND(3, d, a, b, c) |
204 | 0 | ROUND(4, a, b, c, d) |
205 | 0 | ROUND(5, b, c, d, a) |
206 | 0 | ROUND(6, c, d, a, b) |
207 | 0 | ROUND(7, d, a, b, c) |
208 | 0 | ROUND(8, a, b, c, d) |
209 | 0 | ROUND(9, b, c, d, a) |
210 | 0 | ROUND(10, c, d, a, b) |
211 | 0 | ROUND(11, d, a, b, c) |
212 | 0 | ROUND(12, a, b, c, d) |
213 | 0 | ROUND(13, b, c, d, a) |
214 | 0 | ROUND(14, c, d, a, b) |
215 | 0 | ROUND(15, d, a, b, c) |
216 | |
217 | 0 | h0 = _mm_add_epi32(h0, w0); |
218 | 0 | h1 = _mm_add_epi32(h1, w1); |
219 | 0 | } |
220 | | |
221 | | /* H0145:2367 -> H0123:4567 */ |
222 | 0 | th = _mm_shuffle_epi32(h0, 0x1b); |
223 | 0 | h1 = _mm_shuffle_epi32(h1, 0xb1); |
224 | 0 | h0 = _mm_blend_epi16(th, h1, 0xf0); |
225 | 0 | h1 = _mm_alignr_epi8(h1, th, 8); |
226 | |
227 | 0 | _mm_storeu_si128((__m128i *)ctx->h, h0); |
228 | 0 | _mm_storeu_si128((__m128i *)(ctx->h + 4), h1); |
229 | | |
230 | | /* if data left over, fill it into buffer */ |
231 | 0 | if (inputLen) { |
232 | 0 | memcpy(ctx->u.b, input, inputLen); |
233 | 0 | } |
234 | 0 | } |
235 | | |
236 | | #endif /* USE_HW_SHA2 */ |
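/* Illustrative sketch only, not NSS's actual feature detection: the _Native
 * routines above execute SHA-NI plus SSSE3/SSE4.1 instructions, so callers
 * must confirm CPU support before dispatching to them.  The hypothetical
 * helper below queries CPUID directly using GCC/Clang's <cpuid.h>; its name
 * and placement are assumptions made for this example. */
#include <cpuid.h>

static int
sha256_x86_has_sha_ni(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* SHA extensions: CPUID.(EAX=07H,ECX=0):EBX bit 29. */
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) ||
        !((ebx >> 29) & 1)) {
        return 0;
    }
    /* SSSE3 (ECX bit 9) and SSE4.1 (ECX bit 19) from CPUID leaf 1, needed
     * for the shuffle/alignr/blend helpers used alongside the SHA rounds. */
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        return 0;
    }
    return ((ecx >> 9) & 1) && ((ecx >> 19) & 1);
}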