/src/cryptopp/sha_simd.cpp
Line | Count | Source |
1 | | // sha_simd.cpp - written and placed in the public domain by |
2 | | // Jeffrey Walton, Uri Blumenthal and Marcel Raad. |
3 | | // |
4 | | // This source file uses intrinsics to gain access to SHA-NI and |
5 | | // ARMv8a SHA instructions. A separate source file is needed |
6 | | // because additional CXXFLAGS are required to enable the |
7 | | // appropriate instruction sets in some build configurations. |
8 | | |
9 | | #include "pch.h" |
10 | | #include "config.h" |
11 | | #include "sha.h" |
12 | | #include "misc.h" |
13 | | |
14 | | #if defined(CRYPTOPP_DISABLE_SHA_ASM) |
15 | | # undef CRYPTOPP_X86_ASM_AVAILABLE |
16 | | # undef CRYPTOPP_X32_ASM_AVAILABLE |
17 | | # undef CRYPTOPP_X64_ASM_AVAILABLE |
18 | | # undef CRYPTOPP_SSE2_ASM_AVAILABLE |
19 | | #endif |
20 | | |
21 | | #if (CRYPTOPP_SHANI_AVAILABLE) |
22 | | # include <nmmintrin.h> |
23 | | # include <immintrin.h> |
24 | | #endif |
25 | | |
26 | | // Android makes <arm_acle.h> available with ARMv7-a |
27 | | #if (CRYPTOPP_BOOL_ARMV8) |
28 | | # if (CRYPTOPP_ARM_NEON_HEADER) |
29 | | # include <arm_neon.h> |
30 | | # endif |
31 | | # if (CRYPTOPP_ARM_ACLE_HEADER) |
32 | | # include <stdint.h> |
33 | | # include <arm_acle.h> |
34 | | # endif |
35 | | #endif |
36 | | |
37 | | #if CRYPTOPP_POWER8_SHA_AVAILABLE |
38 | | # include "ppc_simd.h" |
39 | | #endif |
40 | | |
41 | | #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY |
42 | | # include <signal.h> |
43 | | # include <setjmp.h> |
44 | | #endif |
45 | | |
46 | | #ifndef EXCEPTION_EXECUTE_HANDLER |
47 | | # define EXCEPTION_EXECUTE_HANDLER 1 |
48 | | #endif |
49 | | |
50 | | // Squash MS LNK4221 and libtool warnings |
51 | | extern const char SHA_SIMD_FNAME[] = __FILE__; |
52 | | |
53 | | NAMESPACE_BEGIN(CryptoPP) |
54 | | |
55 | | // ***************** SHA key tables ******************** |
56 | | |
57 | | extern const word32 SHA256_K[64]; |
58 | | extern const word64 SHA512_K[80]; |
59 | | |
60 | | // ***************** SIGILL probes ******************** |
61 | | |
62 | | #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY |
63 | | extern "C" { |
64 | | typedef void (*SigHandler)(int); |
65 | | |
66 | | static jmp_buf s_jmpSIGILL; |
67 | | static void SigIllHandler(int) |
68 | 0 | { |
69 | 0 | longjmp(s_jmpSIGILL, 1); |
70 | 0 | } |
71 | | } |
72 | | #endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY |
73 | | |
74 | | #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8) |
75 | | bool CPU_ProbeSHA1() |
76 | | { |
77 | | #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) |
78 | | return false; |
79 | | #elif (CRYPTOPP_ARM_SHA1_AVAILABLE) |
80 | | # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) |
81 | | volatile bool result = true; |
82 | | __try |
83 | | { |
84 | | unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12}; |
85 | | uint32x4_t data1 = vld1q_u32(w+0); |
86 | | uint32x4_t data2 = vld1q_u32(w+4); |
87 | | uint32x4_t data3 = vld1q_u32(w+8); |
88 | | |
89 | | uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); |
90 | | uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); |
91 | | uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); |
92 | | uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); |
93 | | uint32x4_t r5 = vsha1su1q_u32 (data1, data2); |
94 | | |
95 | | result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); |
96 | | } |
97 | | __except (EXCEPTION_EXECUTE_HANDLER) |
98 | | { |
99 | | return false; |
100 | | } |
101 | | return result; |
102 | | # else |
103 | | |
104 | | // longjmp clobber warnings: locals modified after setjmp() must be volatile. |
105 | | // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 |
106 | | volatile bool result = true; |
107 | | |
108 | | volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); |
109 | | if (oldHandler == SIG_ERR) |
110 | | return false; |
111 | | |
112 | | volatile sigset_t oldMask; |
113 | | if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) |
114 | | { |
115 | | signal(SIGILL, oldHandler); |
116 | | return false; |
117 | | } |
118 | | |
119 | | if (setjmp(s_jmpSIGILL)) |
120 | | result = false; |
121 | | else |
122 | | { |
123 | | unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12}; |
124 | | uint32x4_t data1 = vld1q_u32(w+0); |
125 | | uint32x4_t data2 = vld1q_u32(w+4); |
126 | | uint32x4_t data3 = vld1q_u32(w+8); |
127 | | |
128 | | uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); |
129 | | uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); |
130 | | uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); |
131 | | uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); |
132 | | uint32x4_t r5 = vsha1su1q_u32 (data1, data2); |
133 | | |
134 | | result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); |
135 | | } |
136 | | |
137 | | sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); |
138 | | signal(SIGILL, oldHandler); |
139 | | return result; |
140 | | # endif |
141 | | #else |
142 | | return false; |
143 | | #endif // CRYPTOPP_ARM_SHA1_AVAILABLE |
144 | | } |
145 | | |
146 | | bool CPU_ProbeSHA256() |
147 | | { |
148 | | #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) |
149 | | return false; |
150 | | #elif (CRYPTOPP_ARM_SHA2_AVAILABLE) |
151 | | # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) |
152 | | volatile bool result = true; |
153 | | __try |
154 | | { |
155 | | unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12}; |
156 | | uint32x4_t data1 = vld1q_u32(w+0); |
157 | | uint32x4_t data2 = vld1q_u32(w+4); |
158 | | uint32x4_t data3 = vld1q_u32(w+8); |
159 | | |
160 | | uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); |
161 | | uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); |
162 | | uint32x4_t r3 = vsha256su0q_u32 (data1, data2); |
163 | | uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); |
164 | | |
165 | | result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3)); |
166 | | } |
167 | | __except (EXCEPTION_EXECUTE_HANDLER) |
168 | | { |
169 | | return false; |
170 | | } |
171 | | return result; |
172 | | # else |
173 | | |
174 | | // longjmp clobber warnings: locals modified after setjmp() must be volatile. |
175 | | // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 |
176 | | volatile bool result = true; |
177 | | |
178 | | volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); |
179 | | if (oldHandler == SIG_ERR) |
180 | | return false; |
181 | | |
182 | | volatile sigset_t oldMask; |
183 | | if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) |
184 | | { |
185 | | signal(SIGILL, oldHandler); |
186 | | return false; |
187 | | } |
188 | | |
189 | | if (setjmp(s_jmpSIGILL)) |
190 | | result = false; |
191 | | else |
192 | | { |
193 | | unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12}; |
194 | | uint32x4_t data1 = vld1q_u32(w+0); |
195 | | uint32x4_t data2 = vld1q_u32(w+4); |
196 | | uint32x4_t data3 = vld1q_u32(w+8); |
197 | | |
198 | | uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); |
199 | | uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); |
200 | | uint32x4_t r3 = vsha256su0q_u32 (data1, data2); |
201 | | uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); |
202 | | |
203 | | result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3)); |
204 | | } |
205 | | |
206 | | sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); |
207 | | signal(SIGILL, oldHandler); |
208 | | return result; |
209 | | # endif |
210 | | #else |
211 | | return false; |
212 | | #endif // CRYPTOPP_ARM_SHA2_AVAILABLE |
213 | | } |
214 | | #endif // ARM32 or ARM64 |
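
// Illustrative sketch, not part of sha_simd.cpp: the probes above install a
// SIGILL handler, setjmp(), execute one candidate instruction, and longjmp()
// back if the CPU faults. The general pattern, plus the usual "probe once,
// cache the flag" dispatch, is sketched below. ProbeInstruction and
// HasHardwareSHA1 are hypothetical names used only for illustration; they
// are not Crypto++ APIs.
#include <setjmp.h>
#include <signal.h>

static jmp_buf s_probeJmp;
extern "C" {
    static void HandleSigIll(int) { longjmp(s_probeJmp, 1); }
}

// Run fn() once under a temporary SIGILL handler and report whether it
// executed without faulting. The flag stays volatile so its value is well
// defined after a longjmp back out of the handler.
template <class F>
static bool ProbeInstruction(F fn)
{
    volatile bool ok = true;
    void (*oldHandler)(int) = signal(SIGILL, HandleSigIll);
    if (oldHandler == SIG_ERR)
        return false;

    if (setjmp(s_probeJmp))
        ok = false;          // reached via longjmp from the handler
    else
        fn();                // may raise SIGILL on hardware without the extension

    signal(SIGILL, oldHandler);
    return ok;
}

// Typical use: probe once, cache the answer, branch on the cached flag.
static bool HasHardwareSHA1()
{
    static const bool available = ProbeInstruction([] { /* execute one SHA1 intrinsic here */ });
    return available;
}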
215 | | |
216 | | // ***************** Intel x86 SHA ******************** |
217 | | |
218 | | ///////////////////////////////////// |
219 | | // start of Walton and Gulley code // |
220 | | ///////////////////////////////////// |
221 | | |
222 | | #if CRYPTOPP_SHANI_AVAILABLE |
223 | | // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. |
224 | | void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order) |
225 | 71.7k | { |
226 | 71.7k | CRYPTOPP_ASSERT(state); |
227 | 71.7k | CRYPTOPP_ASSERT(data); |
228 | 71.7k | CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); |
229 | | |
230 | 71.7k | __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; |
231 | 71.7k | __m128i MASK, MSG0, MSG1, MSG2, MSG3; |
232 | | |
233 | | // Load initial values |
234 | 71.7k | ABCD = _mm_loadu_si128(CONST_M128_CAST(state)); |
235 | 71.7k | E0 = _mm_set_epi32(state[4], 0, 0, 0); |
236 | 71.7k | ABCD = _mm_shuffle_epi32(ABCD, 0x1B); |
237 | | |
238 | | // IA-32 SHA is little endian, SHA::Transform is big endian, |
239 | | // and SHA::HashMultipleBlocks can be either. ByteOrder |
240 | | // allows us to avoid extra endian reversals. It saves 1.0 cpb. |
241 | 71.7k | MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement |
242 | 71.7k | _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) : |
243 | 71.7k | _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ; |
244 | | |
245 | 989k | while (length >= SHA1::BLOCKSIZE) |
246 | 917k | { |
247 | | // Save current hash |
248 | 917k | ABCD_SAVE = ABCD; |
249 | 917k | E0_SAVE = E0; |
250 | | |
251 | | // Rounds 0-3 |
252 | 917k | MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0)); |
253 | 917k | MSG0 = _mm_shuffle_epi8(MSG0, MASK); |
254 | 917k | E0 = _mm_add_epi32(E0, MSG0); |
255 | 917k | E1 = ABCD; |
256 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); |
257 | | |
258 | | // Rounds 4-7 |
259 | 917k | MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); |
260 | 917k | MSG1 = _mm_shuffle_epi8(MSG1, MASK); |
261 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG1); |
262 | 917k | E0 = ABCD; |
263 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); |
264 | 917k | MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); |
265 | | |
266 | | // Rounds 8-11 |
267 | 917k | MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); |
268 | 917k | MSG2 = _mm_shuffle_epi8(MSG2, MASK); |
269 | 917k | E0 = _mm_sha1nexte_epu32(E0, MSG2); |
270 | 917k | E1 = ABCD; |
271 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); |
272 | 917k | MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); |
273 | 917k | MSG0 = _mm_xor_si128(MSG0, MSG2); |
274 | | |
275 | | // Rounds 12-15 |
276 | 917k | MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); |
277 | 917k | MSG3 = _mm_shuffle_epi8(MSG3, MASK); |
278 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG3); |
279 | 917k | E0 = ABCD; |
280 | 917k | MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); |
281 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); |
282 | 917k | MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); |
283 | 917k | MSG1 = _mm_xor_si128(MSG1, MSG3); |
284 | | |
285 | | // Rounds 16-19 |
286 | 917k | E0 = _mm_sha1nexte_epu32(E0, MSG0); |
287 | 917k | E1 = ABCD; |
288 | 917k | MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); |
289 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); |
290 | 917k | MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); |
291 | 917k | MSG2 = _mm_xor_si128(MSG2, MSG0); |
292 | | |
293 | | // Rounds 20-23 |
294 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG1); |
295 | 917k | E0 = ABCD; |
296 | 917k | MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); |
297 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); |
298 | 917k | MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); |
299 | 917k | MSG3 = _mm_xor_si128(MSG3, MSG1); |
300 | | |
301 | | // Rounds 24-27 |
302 | 917k | E0 = _mm_sha1nexte_epu32(E0, MSG2); |
303 | 917k | E1 = ABCD; |
304 | 917k | MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); |
305 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); |
306 | 917k | MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); |
307 | 917k | MSG0 = _mm_xor_si128(MSG0, MSG2); |
308 | | |
309 | | // Rounds 28-31 |
310 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG3); |
311 | 917k | E0 = ABCD; |
312 | 917k | MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); |
313 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); |
314 | 917k | MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); |
315 | 917k | MSG1 = _mm_xor_si128(MSG1, MSG3); |
316 | | |
317 | | // Rounds 32-35 |
318 | 917k | E0 = _mm_sha1nexte_epu32(E0, MSG0); |
319 | 917k | E1 = ABCD; |
320 | 917k | MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); |
321 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); |
322 | 917k | MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); |
323 | 917k | MSG2 = _mm_xor_si128(MSG2, MSG0); |
324 | | |
325 | | // Rounds 36-39 |
326 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG1); |
327 | 917k | E0 = ABCD; |
328 | 917k | MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); |
329 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); |
330 | 917k | MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); |
331 | 917k | MSG3 = _mm_xor_si128(MSG3, MSG1); |
332 | | |
333 | | // Rounds 40-43 |
334 | 917k | E0 = _mm_sha1nexte_epu32(E0, MSG2); |
335 | 917k | E1 = ABCD; |
336 | 917k | MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); |
337 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); |
338 | 917k | MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); |
339 | 917k | MSG0 = _mm_xor_si128(MSG0, MSG2); |
340 | | |
341 | | // Rounds 44-47 |
342 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG3); |
343 | 917k | E0 = ABCD; |
344 | 917k | MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); |
345 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); |
346 | 917k | MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); |
347 | 917k | MSG1 = _mm_xor_si128(MSG1, MSG3); |
348 | | |
349 | | // Rounds 48-51 |
350 | 917k | E0 = _mm_sha1nexte_epu32(E0, MSG0); |
351 | 917k | E1 = ABCD; |
352 | 917k | MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); |
353 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); |
354 | 917k | MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); |
355 | 917k | MSG2 = _mm_xor_si128(MSG2, MSG0); |
356 | | |
357 | | // Rounds 52-55 |
358 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG1); |
359 | 917k | E0 = ABCD; |
360 | 917k | MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); |
361 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); |
362 | 917k | MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); |
363 | 917k | MSG3 = _mm_xor_si128(MSG3, MSG1); |
364 | | |
365 | | // Rounds 56-59 |
366 | 917k | E0 = _mm_sha1nexte_epu32(E0, MSG2); |
367 | 917k | E1 = ABCD; |
368 | 917k | MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); |
369 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); |
370 | 917k | MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); |
371 | 917k | MSG0 = _mm_xor_si128(MSG0, MSG2); |
372 | | |
373 | | // Rounds 60-63 |
374 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG3); |
375 | 917k | E0 = ABCD; |
376 | 917k | MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); |
377 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); |
378 | 917k | MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); |
379 | 917k | MSG1 = _mm_xor_si128(MSG1, MSG3); |
380 | | |
381 | | // Rounds 64-67 |
382 | 917k | E0 = _mm_sha1nexte_epu32(E0, MSG0); |
383 | 917k | E1 = ABCD; |
384 | 917k | MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); |
385 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); |
386 | 917k | MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); |
387 | 917k | MSG2 = _mm_xor_si128(MSG2, MSG0); |
388 | | |
389 | | // Rounds 68-71 |
390 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG1); |
391 | 917k | E0 = ABCD; |
392 | 917k | MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); |
393 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); |
394 | 917k | MSG3 = _mm_xor_si128(MSG3, MSG1); |
395 | | |
396 | | // Rounds 72-75 |
397 | 917k | E0 = _mm_sha1nexte_epu32(E0, MSG2); |
398 | 917k | E1 = ABCD; |
399 | 917k | MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); |
400 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); |
401 | | |
402 | | // Rounds 76-79 |
403 | 917k | E1 = _mm_sha1nexte_epu32(E1, MSG3); |
404 | 917k | E0 = ABCD; |
405 | 917k | ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); |
406 | | |
407 | | // Add values back to state |
408 | 917k | E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); |
409 | 917k | ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); |
410 | | |
411 | 917k | data += SHA1::BLOCKSIZE/sizeof(word32); |
412 | 917k | length -= SHA1::BLOCKSIZE; |
413 | 917k | } |
414 | | |
415 | | // Save state |
416 | 71.7k | ABCD = _mm_shuffle_epi32(ABCD, 0x1B); |
417 | 71.7k | _mm_storeu_si128(M128_CAST(state), ABCD); |
418 | 71.7k | state[4] = _mm_extract_epi32(E0, 3); |
419 | 71.7k | } |
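
// Illustrative sketch, not part of sha_simd.cpp: the BIG_ENDIAN_ORDER mask
// used above reverses all 16 bytes of each loaded message-word group, i.e.
// it byte-swaps every 32-bit word and reverses the word order in one shuffle
// (the LITTLE_ENDIAN_ORDER variant only reverses the word order); the rounds
// above consume the words in that reversed arrangement. A standalone check,
// assuming an SSSE3-enabled x86 compile (e.g. -mssse3):
#include <tmmintrin.h>   // _mm_shuffle_epi8 (SSSE3); pulls in SSE2 as well
#include <cstdint>
#include <cassert>

int main()
{
    uint8_t in[16], out[16];
    for (int i = 0; i < 16; ++i)
        in[i] = static_cast<uint8_t>(i);

    const __m128i mask = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
    __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
    v = _mm_shuffle_epi8(v, mask);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), v);

    for (int i = 0; i < 16; ++i)
        assert(out[i] == in[15 - i]);   // full 16-byte reversal
    return 0;
}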
420 | | |
421 | | // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. |
422 | | void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order) |
423 | 179k | { |
424 | 179k | CRYPTOPP_ASSERT(state); |
425 | 179k | CRYPTOPP_ASSERT(data); |
426 | 179k | CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); |
427 | | |
428 | 179k | __m128i STATE0, STATE1; |
429 | 179k | __m128i MSG, TMP, MASK; |
430 | 179k | __m128i TMSG0, TMSG1, TMSG2, TMSG3; |
431 | 179k | __m128i ABEF_SAVE, CDGH_SAVE; |
432 | | |
433 | | // Load initial values |
434 | 179k | TMP = _mm_loadu_si128(M128_CAST(&state[0])); |
435 | 179k | STATE1 = _mm_loadu_si128(M128_CAST(&state[4])); |
436 | | |
437 | | // IA-32 SHA is little endian, SHA::Transform is big endian, |
438 | | // and SHA::HashMultipleBlocks can be either. ByteOrder |
439 | | // allows us to avoid extra endian reversals. It saves 1.0 cpb. |
440 | 179k | MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement |
441 | 179k | _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) : |
442 | 179k | _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ; |
443 | | |
444 | 179k | TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB |
445 | 179k | STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH |
446 | 179k | STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF |
447 | 179k | STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH |
448 | | |
449 | 2.02M | while (length >= SHA256::BLOCKSIZE) |
450 | 1.84M | { |
451 | | // Save current hash |
452 | 1.84M | ABEF_SAVE = STATE0; |
453 | 1.84M | CDGH_SAVE = STATE1; |
454 | | |
455 | | // Rounds 0-3 |
456 | 1.84M | MSG = _mm_loadu_si128(CONST_M128_CAST(data+0)); |
457 | 1.84M | TMSG0 = _mm_shuffle_epi8(MSG, MASK); |
458 | 1.84M | MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98))); |
459 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
460 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
461 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
462 | | |
463 | | // Rounds 4-7 |
464 | 1.84M | TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); |
465 | 1.84M | TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); |
466 | 1.84M | MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B))); |
467 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
468 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
469 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
470 | 1.84M | TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); |
471 | | |
472 | | // Rounds 8-11 |
473 | 1.84M | TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); |
474 | 1.84M | TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); |
475 | 1.84M | MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98))); |
476 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
477 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
478 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
479 | 1.84M | TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); |
480 | | |
481 | | // Rounds 12-15 |
482 | 1.84M | TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); |
483 | 1.84M | TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); |
484 | 1.84M | MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74))); |
485 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
486 | 1.84M | TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); |
487 | 1.84M | TMSG0 = _mm_add_epi32(TMSG0, TMP); |
488 | 1.84M | TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); |
489 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
490 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
491 | 1.84M | TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); |
492 | | |
493 | | // Rounds 16-19 |
494 | 1.84M | MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1))); |
495 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
496 | 1.84M | TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); |
497 | 1.84M | TMSG1 = _mm_add_epi32(TMSG1, TMP); |
498 | 1.84M | TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); |
499 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
500 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
501 | 1.84M | TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); |
502 | | |
503 | | // Rounds 20-23 |
504 | 1.84M | MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F))); |
505 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
506 | 1.84M | TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); |
507 | 1.84M | TMSG2 = _mm_add_epi32(TMSG2, TMP); |
508 | 1.84M | TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); |
509 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
510 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
511 | 1.84M | TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); |
512 | | |
513 | | // Rounds 24-27 |
514 | 1.84M | MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152))); |
515 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
516 | 1.84M | TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); |
517 | 1.84M | TMSG3 = _mm_add_epi32(TMSG3, TMP); |
518 | 1.84M | TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); |
519 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
520 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
521 | 1.84M | TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); |
522 | | |
523 | | // Rounds 28-31 |
524 | 1.84M | MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3))); |
525 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
526 | 1.84M | TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); |
527 | 1.84M | TMSG0 = _mm_add_epi32(TMSG0, TMP); |
528 | 1.84M | TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); |
529 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
530 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
531 | 1.84M | TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); |
532 | | |
533 | | // Rounds 32-35 |
534 | 1.84M | MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85))); |
535 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
536 | 1.84M | TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); |
537 | 1.84M | TMSG1 = _mm_add_epi32(TMSG1, TMP); |
538 | 1.84M | TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); |
539 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
540 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
541 | 1.84M | TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); |
542 | | |
543 | | // Rounds 36-39 |
544 | 1.84M | MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354))); |
545 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
546 | 1.84M | TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); |
547 | 1.84M | TMSG2 = _mm_add_epi32(TMSG2, TMP); |
548 | 1.84M | TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); |
549 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
550 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
551 | 1.84M | TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); |
552 | | |
553 | | // Rounds 40-43 |
554 | 1.84M | MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1))); |
555 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
556 | 1.84M | TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); |
557 | 1.84M | TMSG3 = _mm_add_epi32(TMSG3, TMP); |
558 | 1.84M | TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); |
559 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
560 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
561 | 1.84M | TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); |
562 | | |
563 | | // Rounds 44-47 |
564 | 1.84M | MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819))); |
565 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
566 | 1.84M | TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); |
567 | 1.84M | TMSG0 = _mm_add_epi32(TMSG0, TMP); |
568 | 1.84M | TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); |
569 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
570 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
571 | 1.84M | TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); |
572 | | |
573 | | // Rounds 48-51 |
574 | 1.84M | MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116))); |
575 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
576 | 1.84M | TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); |
577 | 1.84M | TMSG1 = _mm_add_epi32(TMSG1, TMP); |
578 | 1.84M | TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); |
579 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
580 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
581 | 1.84M | TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); |
582 | | |
583 | | // Rounds 52-55 |
584 | 1.84M | MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3))); |
585 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
586 | 1.84M | TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); |
587 | 1.84M | TMSG2 = _mm_add_epi32(TMSG2, TMP); |
588 | 1.84M | TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); |
589 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
590 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
591 | | |
592 | | // Rounds 56-59 |
593 | 1.84M | MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE))); |
594 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
595 | 1.84M | TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); |
596 | 1.84M | TMSG3 = _mm_add_epi32(TMSG3, TMP); |
597 | 1.84M | TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); |
598 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
599 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
600 | | |
601 | | // Rounds 60-63 |
602 | 1.84M | MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA))); |
603 | 1.84M | STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); |
604 | 1.84M | MSG = _mm_shuffle_epi32(MSG, 0x0E); |
605 | 1.84M | STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); |
606 | | |
607 | | // Add values back to state |
608 | 1.84M | STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); |
609 | 1.84M | STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); |
610 | | |
611 | 1.84M | data += SHA256::BLOCKSIZE/sizeof(word32); |
612 | 1.84M | length -= SHA256::BLOCKSIZE; |
613 | 1.84M | } |
614 | | |
615 | 179k | TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA |
616 | 179k | STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG |
617 | 179k | STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA |
618 | 179k | STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE |
619 | | |
620 | | // Save state |
621 | 179k | _mm_storeu_si128(M128_CAST(&state[0]), STATE0); |
622 | 179k | _mm_storeu_si128(M128_CAST(&state[4]), STATE1); |
623 | 179k | } |
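
// Illustrative sketch, not part of sha_simd.cpp: each _mm_set_epi64x
// immediate above is simply four consecutive FIPS 180-4 round constants K[i]
// packed into one 128-bit vector, lowest index in the lowest lane. For the
// first group (rounds 0-3):
#include <emmintrin.h>   // SSE2
#include <cstdint>
#include <cstring>
#include <cassert>

int main()
{
    const uint32_t K[4] = {0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5};

    const __m128i packed = _mm_set_epi64x(
        static_cast<long long>(0xE9B5DBA5B5C0FBCFULL),
        static_cast<long long>(0x71374491428A2F98ULL));
    const __m128i fromK = _mm_loadu_si128(reinterpret_cast<const __m128i*>(K));

    uint8_t a[16], b[16];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(a), packed);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(b), fromK);
    assert(std::memcmp(a, b, 16) == 0);   // K[0] lands in lane 0, K[3] in lane 3
    return 0;
}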
624 | | #endif // CRYPTOPP_SHANI_AVAILABLE |
625 | | |
626 | | /////////////////////////////////// |
627 | | // end of Walton and Gulley code // |
628 | | /////////////////////////////////// |
629 | | |
630 | | // ***************** ARMV8 SHA ******************** |
631 | | |
632 | | ///////////////////////////////////////////////////////////// |
633 | | // start of Walton, Schneiders, O'Rourke and Hovsmith code // |
634 | | ///////////////////////////////////////////////////////////// |
635 | | |
636 | | #if CRYPTOPP_ARM_SHA1_AVAILABLE |
637 | | void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order) |
638 | | { |
639 | | CRYPTOPP_ASSERT(state); |
640 | | CRYPTOPP_ASSERT(data); |
641 | | CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); |
642 | | |
643 | | uint32x4_t C0, C1, C2, C3; |
644 | | uint32x4_t ABCD, ABCD_SAVED; |
645 | | uint32x4_t MSG0, MSG1, MSG2, MSG3; |
646 | | uint32x4_t TMP0, TMP1; |
647 | | uint32_t E0, E0_SAVED, E1; |
648 | | |
649 | | // Load initial values |
650 | | C0 = vdupq_n_u32(0x5A827999); |
651 | | C1 = vdupq_n_u32(0x6ED9EBA1); |
652 | | C2 = vdupq_n_u32(0x8F1BBCDC); |
653 | | C3 = vdupq_n_u32(0xCA62C1D6); |
654 | | |
655 | | ABCD = vld1q_u32(&state[0]); |
656 | | E0 = state[4]; |
657 | | |
658 | | while (length >= SHA1::BLOCKSIZE) |
659 | | { |
660 | | // Save current hash |
661 | | ABCD_SAVED = ABCD; |
662 | | E0_SAVED = E0; |
663 | | |
664 | | MSG0 = vld1q_u32(data + 0); |
665 | | MSG1 = vld1q_u32(data + 4); |
666 | | MSG2 = vld1q_u32(data + 8); |
667 | | MSG3 = vld1q_u32(data + 12); |
668 | | |
669 | | if (order == BIG_ENDIAN_ORDER) // Data arrangement |
670 | | { |
671 | | MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); |
672 | | MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); |
673 | | MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); |
674 | | MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); |
675 | | } |
676 | | |
677 | | TMP0 = vaddq_u32(MSG0, C0); |
678 | | TMP1 = vaddq_u32(MSG1, C0); |
679 | | |
680 | | // Rounds 0-3 |
681 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
682 | | ABCD = vsha1cq_u32(ABCD, E0, TMP0); |
683 | | TMP0 = vaddq_u32(MSG2, C0); |
684 | | MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); |
685 | | |
686 | | // Rounds 4-7 |
687 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
688 | | ABCD = vsha1cq_u32(ABCD, E1, TMP1); |
689 | | TMP1 = vaddq_u32(MSG3, C0); |
690 | | MSG0 = vsha1su1q_u32(MSG0, MSG3); |
691 | | MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); |
692 | | |
693 | | // Rounds 8-11 |
694 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
695 | | ABCD = vsha1cq_u32(ABCD, E0, TMP0); |
696 | | TMP0 = vaddq_u32(MSG0, C0); |
697 | | MSG1 = vsha1su1q_u32(MSG1, MSG0); |
698 | | MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); |
699 | | |
700 | | // Rounds 12-15 |
701 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
702 | | ABCD = vsha1cq_u32(ABCD, E1, TMP1); |
703 | | TMP1 = vaddq_u32(MSG1, C1); |
704 | | MSG2 = vsha1su1q_u32(MSG2, MSG1); |
705 | | MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); |
706 | | |
707 | | // Rounds 16-19 |
708 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
709 | | ABCD = vsha1cq_u32(ABCD, E0, TMP0); |
710 | | TMP0 = vaddq_u32(MSG2, C1); |
711 | | MSG3 = vsha1su1q_u32(MSG3, MSG2); |
712 | | MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); |
713 | | |
714 | | // Rounds 20-23 |
715 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
716 | | ABCD = vsha1pq_u32(ABCD, E1, TMP1); |
717 | | TMP1 = vaddq_u32(MSG3, C1); |
718 | | MSG0 = vsha1su1q_u32(MSG0, MSG3); |
719 | | MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); |
720 | | |
721 | | // Rounds 24-27 |
722 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
723 | | ABCD = vsha1pq_u32(ABCD, E0, TMP0); |
724 | | TMP0 = vaddq_u32(MSG0, C1); |
725 | | MSG1 = vsha1su1q_u32(MSG1, MSG0); |
726 | | MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); |
727 | | |
728 | | // Rounds 28-31 |
729 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
730 | | ABCD = vsha1pq_u32(ABCD, E1, TMP1); |
731 | | TMP1 = vaddq_u32(MSG1, C1); |
732 | | MSG2 = vsha1su1q_u32(MSG2, MSG1); |
733 | | MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); |
734 | | |
735 | | // Rounds 32-35 |
736 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
737 | | ABCD = vsha1pq_u32(ABCD, E0, TMP0); |
738 | | TMP0 = vaddq_u32(MSG2, C2); |
739 | | MSG3 = vsha1su1q_u32(MSG3, MSG2); |
740 | | MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); |
741 | | |
742 | | // Rounds 36-39 |
743 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
744 | | ABCD = vsha1pq_u32(ABCD, E1, TMP1); |
745 | | TMP1 = vaddq_u32(MSG3, C2); |
746 | | MSG0 = vsha1su1q_u32(MSG0, MSG3); |
747 | | MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); |
748 | | |
749 | | // Rounds 40-43 |
750 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
751 | | ABCD = vsha1mq_u32(ABCD, E0, TMP0); |
752 | | TMP0 = vaddq_u32(MSG0, C2); |
753 | | MSG1 = vsha1su1q_u32(MSG1, MSG0); |
754 | | MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); |
755 | | |
756 | | // Rounds 44-47 |
757 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
758 | | ABCD = vsha1mq_u32(ABCD, E1, TMP1); |
759 | | TMP1 = vaddq_u32(MSG1, C2); |
760 | | MSG2 = vsha1su1q_u32(MSG2, MSG1); |
761 | | MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); |
762 | | |
763 | | // Rounds 48-51 |
764 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
765 | | ABCD = vsha1mq_u32(ABCD, E0, TMP0); |
766 | | TMP0 = vaddq_u32(MSG2, C2); |
767 | | MSG3 = vsha1su1q_u32(MSG3, MSG2); |
768 | | MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); |
769 | | |
770 | | // Rounds 52-55 |
771 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
772 | | ABCD = vsha1mq_u32(ABCD, E1, TMP1); |
773 | | TMP1 = vaddq_u32(MSG3, C3); |
774 | | MSG0 = vsha1su1q_u32(MSG0, MSG3); |
775 | | MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); |
776 | | |
777 | | // Rounds 56-59 |
778 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
779 | | ABCD = vsha1mq_u32(ABCD, E0, TMP0); |
780 | | TMP0 = vaddq_u32(MSG0, C3); |
781 | | MSG1 = vsha1su1q_u32(MSG1, MSG0); |
782 | | MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); |
783 | | |
784 | | // Rounds 60-63 |
785 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
786 | | ABCD = vsha1pq_u32(ABCD, E1, TMP1); |
787 | | TMP1 = vaddq_u32(MSG1, C3); |
788 | | MSG2 = vsha1su1q_u32(MSG2, MSG1); |
789 | | MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); |
790 | | |
791 | | // Rounds 64-67 |
792 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
793 | | ABCD = vsha1pq_u32(ABCD, E0, TMP0); |
794 | | TMP0 = vaddq_u32(MSG2, C3); |
795 | | MSG3 = vsha1su1q_u32(MSG3, MSG2); |
796 | | MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); |
797 | | |
798 | | // Rounds 68-71 |
799 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
800 | | ABCD = vsha1pq_u32(ABCD, E1, TMP1); |
801 | | TMP1 = vaddq_u32(MSG3, C3); |
802 | | MSG0 = vsha1su1q_u32(MSG0, MSG3); |
803 | | |
804 | | // Rounds 72-75 |
805 | | E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
806 | | ABCD = vsha1pq_u32(ABCD, E0, TMP0); |
807 | | |
808 | | // Rounds 76-79 |
809 | | E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); |
810 | | ABCD = vsha1pq_u32(ABCD, E1, TMP1); |
811 | | |
812 | | E0 += E0_SAVED; |
813 | | ABCD = vaddq_u32(ABCD_SAVED, ABCD); |
814 | | |
815 | | data += SHA1::BLOCKSIZE/sizeof(word32); |
816 | | length -= SHA1::BLOCKSIZE; |
817 | | } |
818 | | |
819 | | // Save state |
820 | | vst1q_u32(&state[0], ABCD); |
821 | | state[4] = E0; |
822 | | } |
823 | | #endif // CRYPTOPP_ARM_SHA1_AVAILABLE |
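
// Illustrative sketch, not part of sha_simd.cpp: for BIG_ENDIAN_ORDER input
// the ARMv8 routines above and below convert the message words with
// vrev32q_u8, which reverses the bytes inside each 32-bit lane (a per-word
// byte swap). A standalone check, assuming a NEON-enabled GCC or Clang
// toolchain (__builtin_bswap32 is a GCC/Clang builtin):
#include <arm_neon.h>
#include <cstdint>
#include <cassert>

int main()
{
    const uint32_t in[4] = {0x00010203U, 0x04050607U, 0x08090A0BU, 0x0C0D0E0FU};
    uint32_t out[4];

    uint32x4_t v = vld1q_u32(in);
    v = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v)));
    vst1q_u32(out, v);

    for (int i = 0; i < 4; ++i)
        assert(out[i] == __builtin_bswap32(in[i]));   // per-lane byte swap
    return 0;
}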
824 | | |
825 | | #if CRYPTOPP_ARM_SHA2_AVAILABLE |
826 | | void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order) |
827 | | { |
828 | | CRYPTOPP_ASSERT(state); |
829 | | CRYPTOPP_ASSERT(data); |
830 | | CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); |
831 | | |
832 | | uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; |
833 | | uint32x4_t MSG0, MSG1, MSG2, MSG3; |
834 | | uint32x4_t TMP0, TMP1, TMP2; |
835 | | |
836 | | // Load initial values |
837 | | STATE0 = vld1q_u32(&state[0]); |
838 | | STATE1 = vld1q_u32(&state[4]); |
839 | | |
840 | | while (length >= SHA256::BLOCKSIZE) |
841 | | { |
842 | | // Save current hash |
843 | | ABEF_SAVE = STATE0; |
844 | | CDGH_SAVE = STATE1; |
845 | | |
846 | | // Load message |
847 | | MSG0 = vld1q_u32(data + 0); |
848 | | MSG1 = vld1q_u32(data + 4); |
849 | | MSG2 = vld1q_u32(data + 8); |
850 | | MSG3 = vld1q_u32(data + 12); |
851 | | |
852 | | if (order == BIG_ENDIAN_ORDER) // Data arrangement |
853 | | { |
854 | | MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); |
855 | | MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); |
856 | | MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); |
857 | | MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); |
858 | | } |
859 | | |
860 | | TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00])); |
861 | | |
862 | | // Rounds 0-3 |
863 | | MSG0 = vsha256su0q_u32(MSG0, MSG1); |
864 | | TMP2 = STATE0; |
865 | | TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04])); |
866 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); |
867 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); |
868 | | MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); |
869 | | |
870 | | // Rounds 4-7 |
871 | | MSG1 = vsha256su0q_u32(MSG1, MSG2); |
872 | | TMP2 = STATE0; |
873 | | TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08])); |
874 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); |
875 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); |
876 | | MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); |
877 | | |
878 | | // Rounds 8-11 |
879 | | MSG2 = vsha256su0q_u32(MSG2, MSG3); |
880 | | TMP2 = STATE0; |
881 | | TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c])); |
882 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); |
883 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); |
884 | | MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); |
885 | | |
886 | | // Rounds 12-15 |
887 | | MSG3 = vsha256su0q_u32(MSG3, MSG0); |
888 | | TMP2 = STATE0; |
889 | | TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10])); |
890 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); |
891 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); |
892 | | MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); |
893 | | |
894 | | // Rounds 16-19 |
895 | | MSG0 = vsha256su0q_u32(MSG0, MSG1); |
896 | | TMP2 = STATE0; |
897 | | TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14])); |
898 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); |
899 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); |
900 | | MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); |
901 | | |
902 | | // Rounds 20-23 |
903 | | MSG1 = vsha256su0q_u32(MSG1, MSG2); |
904 | | TMP2 = STATE0; |
905 | | TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18])); |
906 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); |
907 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); |
908 | | MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); |
909 | | |
910 | | // Rounds 24-27 |
911 | | MSG2 = vsha256su0q_u32(MSG2, MSG3); |
912 | | TMP2 = STATE0; |
913 | | TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c])); |
914 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); |
915 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); |
916 | | MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); |
917 | | |
918 | | // Rounds 28-31 |
919 | | MSG3 = vsha256su0q_u32(MSG3, MSG0); |
920 | | TMP2 = STATE0; |
921 | | TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20])); |
922 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); |
923 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); |
924 | | MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); |
925 | | |
926 | | // Rounds 32-35 |
927 | | MSG0 = vsha256su0q_u32(MSG0, MSG1); |
928 | | TMP2 = STATE0; |
929 | | TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24])); |
930 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); |
931 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); |
932 | | MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); |
933 | | |
934 | | // Rounds 36-39 |
935 | | MSG1 = vsha256su0q_u32(MSG1, MSG2); |
936 | | TMP2 = STATE0; |
937 | | TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28])); |
938 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); |
939 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); |
940 | | MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); |
941 | | |
942 | | // Rounds 40-43 |
943 | | MSG2 = vsha256su0q_u32(MSG2, MSG3); |
944 | | TMP2 = STATE0; |
945 | | TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c])); |
946 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); |
947 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); |
948 | | MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); |
949 | | |
950 | | // Rounds 44-47 |
951 | | MSG3 = vsha256su0q_u32(MSG3, MSG0); |
952 | | TMP2 = STATE0; |
953 | | TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30])); |
954 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); |
955 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); |
956 | | MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); |
957 | | |
958 | | // Rounds 48-51 |
959 | | TMP2 = STATE0; |
960 | | TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34])); |
961 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); |
962 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); |
963 | | |
964 | | // Rounds 52-55 |
965 | | TMP2 = STATE0; |
966 | | TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38])); |
967 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); |
968 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); |
969 | | |
970 | | // Rounds 56-59 |
971 | | TMP2 = STATE0; |
972 | | TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c])); |
973 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); |
974 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); |
975 | | |
976 | | // Rounds 60-63 |
977 | | TMP2 = STATE0; |
978 | | STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); |
979 | | STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); |
980 | | |
981 | | // Add back to state |
982 | | STATE0 = vaddq_u32(STATE0, ABEF_SAVE); |
983 | | STATE1 = vaddq_u32(STATE1, CDGH_SAVE); |
984 | | |
985 | | data += SHA256::BLOCKSIZE/sizeof(word32); |
986 | | length -= SHA256::BLOCKSIZE; |
987 | | } |
988 | | |
989 | | // Save state |
990 | | vst1q_u32(&state[0], STATE0); |
991 | | vst1q_u32(&state[4], STATE1); |
992 | | } |
993 | | #endif // CRYPTOPP_ARM_SHA2_AVAILABLE |
994 | | |
995 | | /////////////////////////////////////////////////////////// |
996 | | // end of Walton, Schneiders, O'Rourke and Hovsmith code // |
997 | | /////////////////////////////////////////////////////////// |
998 | | |
999 | | // ***************** Power8 SHA ******************** |
1000 | | |
1001 | | ////////////////////////////////////////////////// |
1002 | | // start Gustavo, Serra, Scalet and Walton code // |
1003 | | ////////////////////////////////////////////////// |
1004 | | |
1005 | | #if CRYPTOPP_POWER8_SHA_AVAILABLE |
1006 | | |
1007 | | // Indexes into the S[] array |
1008 | | enum {A=0, B=1, C, D, E, F, G, H}; |
1009 | | |
1010 | | inline |
1011 | | uint32x4_p VecLoad32(const word32* data, int offset) |
1012 | | { |
1013 | | #if (CRYPTOPP_LITTLE_ENDIAN) |
1014 | | const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; |
1015 | | const uint32x4_p val = VecLoad(offset, data); |
1016 | | return (uint32x4_p)VecPermute(val, val, mask); |
1017 | | #else |
1018 | | return VecLoad(offset, data); |
1019 | | #endif |
1020 | | } |
1021 | | |
1022 | | template<class T> inline |
1023 | | void VecStore32(const T data, word32 dest[4]) |
1024 | | { |
1025 | | VecStore(data, dest); |
1026 | | } |
1027 | | |
1028 | | inline |
1029 | | uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z) |
1030 | | { |
1031 | | // The trick below is due to Andy Polyakov and Jack Lloyd |
1032 | | return vec_sel(z,y,x); |
1033 | | } |
1034 | | |
1035 | | inline |
1036 | | uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z) |
1037 | | { |
1038 | | // The trick below is due to Andy Polyakov and Jack Lloyd |
1039 | | return vec_sel(y, z, VecXor(x, y)); |
1040 | | } |
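
// Illustrative sketch, not part of sha_simd.cpp: the vec_sel forms above
// (and their 64-bit counterparts later in the file) are the usual SHA-2 Ch
// and Maj functions. Selecting y-bits where x is set and z-bits elsewhere is
// exactly Ch(x,y,z); selecting z where x and y disagree (mask x^y) and y
// where they agree is Maj(x,y,z). A scalar check of both identities:
#include <cstdint>
#include <cassert>

static uint32_t sel(uint32_t a, uint32_t b, uint32_t m) { return (a & ~m) | (b & m); } // vec_sel analogue
static uint32_t Ch (uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (~x & z); }
static uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }

int main()
{
    // With x=0xF0.., y=0xCC.., z=0xAA.. the bit positions cover every
    // (x,y,z) truth-table combination; the loop also tries the other mixes.
    const uint32_t v[3] = {0xF0F0F0F0U, 0xCCCCCCCCU, 0xAAAAAAAAU};
    for (uint32_t x : v)
        for (uint32_t y : v)
            for (uint32_t z : v)
            {
                assert(sel(z, y, x)     == Ch(x, y, z));
                assert(sel(y, z, x ^ y) == Maj(x, y, z));
            }
    return 0;
}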
1041 | | |
1042 | | inline |
1043 | | uint32x4_p Vector_sigma0(const uint32x4_p val) |
1044 | | { |
1045 | | return VecSHA256<0,0>(val); |
1046 | | } |
1047 | | |
1048 | | inline |
1049 | | uint32x4_p Vector_sigma1(const uint32x4_p val) |
1050 | | { |
1051 | | return VecSHA256<0,0xf>(val); |
1052 | | } |
1053 | | |
1054 | | inline |
1055 | | uint32x4_p VectorSigma0(const uint32x4_p val) |
1056 | | { |
1057 | | return VecSHA256<1,0>(val); |
1058 | | } |
1059 | | |
1060 | | inline |
1061 | | uint32x4_p VectorSigma1(const uint32x4_p val) |
1062 | | { |
1063 | | return VecSHA256<1,0xf>(val); |
1064 | | } |
1065 | | |
1066 | | inline |
1067 | | uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b, |
1068 | | const uint32x4_p c, const uint32x4_p d) |
1069 | | { |
1070 | | const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0}; |
1071 | | const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; |
1072 | | return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2); |
1073 | | } |
1074 | | |
1075 | | template <unsigned int R> inline |
1076 | | void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M) |
1077 | | { |
1078 | | uint32x4_p T1, T2; |
1079 | | |
1080 | | W[R] = M; |
1081 | | T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M; |
1082 | | T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]); |
1083 | | |
1084 | | S[H] = S[G]; S[G] = S[F]; S[F] = S[E]; |
1085 | | S[E] = S[D] + T1; |
1086 | | S[D] = S[C]; S[C] = S[B]; S[B] = S[A]; |
1087 | | S[A] = T1 + T2; |
1088 | | } |
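
// Illustrative scalar reference, not part of sha_simd.cpp: SHA256_ROUND1 and
// SHA256_ROUND2 above carry out the standard FIPS 180-4 compression step.
// Written against plain 32-bit words, the same per-round update reads:
#include <cstdint>

static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32U - n)); }
static uint32_t BigSigma0(uint32_t x) { return rotr32(x,2) ^ rotr32(x,13) ^ rotr32(x,22); }
static uint32_t BigSigma1(uint32_t x) { return rotr32(x,6) ^ rotr32(x,11) ^ rotr32(x,25); }
static uint32_t Ch (uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (~x & z); }
static uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }

// One round over working variables S[0..7] = {a,b,c,d,e,f,g,h}; call it 64
// times per block with K[t] and the schedule word W[t].
static void SHA256_ScalarRound(uint32_t S[8], uint32_t K, uint32_t W)
{
    const uint32_t T1 = S[7] + BigSigma1(S[4]) + Ch(S[4], S[5], S[6]) + K + W;
    const uint32_t T2 = BigSigma0(S[0]) + Maj(S[0], S[1], S[2]);
    S[7] = S[6]; S[6] = S[5]; S[5] = S[4]; S[4] = S[3] + T1;
    S[3] = S[2]; S[2] = S[1]; S[1] = S[0]; S[0] = T1 + T2;
}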
1089 | | |
1090 | | template <unsigned int R> inline |
1091 | | void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K) |
1092 | | { |
1093 | | // Indexes into the W[] array |
1094 | | enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf}; |
1095 | | |
1096 | | const uint32x4_p s0 = Vector_sigma0(W[IDX1]); |
1097 | | const uint32x4_p s1 = Vector_sigma1(W[IDX14]); |
1098 | | |
1099 | | uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]); |
1100 | | T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K; |
1101 | | uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]); |
1102 | | |
1103 | | S[H] = S[G]; S[G] = S[F]; S[F] = S[E]; |
1104 | | S[E] = S[D] + T1; |
1105 | | S[D] = S[C]; S[C] = S[B]; S[B] = S[A]; |
1106 | | S[A] = T1 + T2; |
1107 | | } |
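
// Illustrative sketch, not part of sha_simd.cpp: SHA256_ROUND2 expands the
// message schedule in a rolling 16-entry window, so the indices (R+1)&0xf,
// (R+9)&0xf and (R+14)&0xf are W[t-15], W[t-7] and W[t-2] of FIPS 180-4,
// and (R+0)&0xf holds W[t-16], overwritten with the new W[t]. SHA512_ROUND2
// further down uses the same window with 64-bit words. A scalar check that
// the rolling window matches the direct recurrence:
#include <cstdint>
#include <cassert>

static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32U - n)); }
static uint32_t sigma0(uint32_t x) { return rotr32(x,7) ^ rotr32(x,18) ^ (x >> 3); }
static uint32_t sigma1(uint32_t x) { return rotr32(x,17) ^ rotr32(x,19) ^ (x >> 10); }

// Produce W[t] (t >= 16) while keeping only the last 16 schedule words.
static uint32_t NextScheduleWord(uint32_t W[16], unsigned t)
{
    const unsigned i0 = t & 0xf, i1 = (t + 1) & 0xf, i9 = (t + 9) & 0xf, i14 = (t + 14) & 0xf;
    return W[i0] += sigma0(W[i1]) + sigma1(W[i14]) + W[i9];
}

int main()
{
    uint32_t full[64], W[16];
    for (unsigned t = 0; t < 16; ++t)
        full[t] = W[t] = 0x01020304U * (t + 1);   // arbitrary message words

    for (unsigned t = 16; t < 64; ++t)
    {
        full[t] = full[t-16] + sigma0(full[t-15]) + full[t-7] + sigma1(full[t-2]);
        assert(NextScheduleWord(W, t) == full[t]);   // rolling window agrees
    }
    return 0;
}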
1108 | | |
1109 | | void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order) |
1110 | | { |
1111 | | CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data); |
1112 | | CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); |
1113 | | CRYPTOPP_UNUSED(order); |
1114 | | |
1115 | | const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K); |
1116 | | const uint32_t* m = reinterpret_cast<const uint32_t*>(data); |
1117 | | |
1118 | | uint32x4_p abcd = VecLoad(state+0); |
1119 | | uint32x4_p efgh = VecLoad(state+4); |
1120 | | uint32x4_p W[16], S[8], vm, vk; |
1121 | | |
1122 | | size_t blocks = length / SHA256::BLOCKSIZE; |
1123 | | while (blocks--) |
1124 | | { |
1125 | | unsigned int offset=0; |
1126 | | |
1127 | | S[A] = abcd; S[E] = efgh; |
1128 | | S[B] = VecShiftLeftOctet<4>(S[A]); |
1129 | | S[F] = VecShiftLeftOctet<4>(S[E]); |
1130 | | S[C] = VecShiftLeftOctet<4>(S[B]); |
1131 | | S[G] = VecShiftLeftOctet<4>(S[F]); |
1132 | | S[D] = VecShiftLeftOctet<4>(S[C]); |
1133 | | S[H] = VecShiftLeftOctet<4>(S[G]); |
1134 | | |
1135 | | // Rounds 0-15 |
1136 | | vk = VecLoad(offset, k); |
1137 | | vm = VecLoad32(m, offset); |
1138 | | SHA256_ROUND1<0>(W,S, vk,vm); |
1139 | | offset+=16; |
1140 | | |
1141 | | vk = VecShiftLeftOctet<4>(vk); |
1142 | | vm = VecShiftLeftOctet<4>(vm); |
1143 | | SHA256_ROUND1<1>(W,S, vk,vm); |
1144 | | |
1145 | | vk = VecShiftLeftOctet<4>(vk); |
1146 | | vm = VecShiftLeftOctet<4>(vm); |
1147 | | SHA256_ROUND1<2>(W,S, vk,vm); |
1148 | | |
1149 | | vk = VecShiftLeftOctet<4>(vk); |
1150 | | vm = VecShiftLeftOctet<4>(vm); |
1151 | | SHA256_ROUND1<3>(W,S, vk,vm); |
1152 | | |
1153 | | vk = VecLoad(offset, k); |
1154 | | vm = VecLoad32(m, offset); |
1155 | | SHA256_ROUND1<4>(W,S, vk,vm); |
1156 | | offset+=16; |
1157 | | |
1158 | | vk = VecShiftLeftOctet<4>(vk); |
1159 | | vm = VecShiftLeftOctet<4>(vm); |
1160 | | SHA256_ROUND1<5>(W,S, vk,vm); |
1161 | | |
1162 | | vk = VecShiftLeftOctet<4>(vk); |
1163 | | vm = VecShiftLeftOctet<4>(vm); |
1164 | | SHA256_ROUND1<6>(W,S, vk,vm); |
1165 | | |
1166 | | vk = VecShiftLeftOctet<4>(vk); |
1167 | | vm = VecShiftLeftOctet<4>(vm); |
1168 | | SHA256_ROUND1<7>(W,S, vk,vm); |
1169 | | |
1170 | | vk = VecLoad(offset, k); |
1171 | | vm = VecLoad32(m, offset); |
1172 | | SHA256_ROUND1<8>(W,S, vk,vm); |
1173 | | offset+=16; |
1174 | | |
1175 | | vk = VecShiftLeftOctet<4>(vk); |
1176 | | vm = VecShiftLeftOctet<4>(vm); |
1177 | | SHA256_ROUND1<9>(W,S, vk,vm); |
1178 | | |
1179 | | vk = VecShiftLeftOctet<4>(vk); |
1180 | | vm = VecShiftLeftOctet<4>(vm); |
1181 | | SHA256_ROUND1<10>(W,S, vk,vm); |
1182 | | |
1183 | | vk = VecShiftLeftOctet<4>(vk); |
1184 | | vm = VecShiftLeftOctet<4>(vm); |
1185 | | SHA256_ROUND1<11>(W,S, vk,vm); |
1186 | | |
1187 | | vk = VecLoad(offset, k); |
1188 | | vm = VecLoad32(m, offset); |
1189 | | SHA256_ROUND1<12>(W,S, vk,vm); |
1190 | | offset+=16; |
1191 | | |
1192 | | vk = VecShiftLeftOctet<4>(vk); |
1193 | | vm = VecShiftLeftOctet<4>(vm); |
1194 | | SHA256_ROUND1<13>(W,S, vk,vm); |
1195 | | |
1196 | | vk = VecShiftLeftOctet<4>(vk); |
1197 | | vm = VecShiftLeftOctet<4>(vm); |
1198 | | SHA256_ROUND1<14>(W,S, vk,vm); |
1199 | | |
1200 | | vk = VecShiftLeftOctet<4>(vk); |
1201 | | vm = VecShiftLeftOctet<4>(vm); |
1202 | | SHA256_ROUND1<15>(W,S, vk,vm); |
1203 | | |
1204 | | m += 16; // 32-bit words, not bytes |
1205 | | |
1206 | | // Rounds 16-63 |
1207 | | for (unsigned int i=16; i<64; i+=16) |
1208 | | { |
1209 | | vk = VecLoad(offset, k); |
1210 | | SHA256_ROUND2<0>(W,S, vk); |
1211 | | SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk)); |
1212 | | SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk)); |
1213 | | SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk)); |
1214 | | offset+=16; |
1215 | | |
1216 | | vk = VecLoad(offset, k); |
1217 | | SHA256_ROUND2<4>(W,S, vk); |
1218 | | SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk)); |
1219 | | SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk)); |
1220 | | SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk)); |
1221 | | offset+=16; |
1222 | | |
1223 | | vk = VecLoad(offset, k); |
1224 | | SHA256_ROUND2<8>(W,S, vk); |
1225 | | SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk)); |
1226 | | SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk)); |
1227 | | SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk)); |
1228 | | offset+=16; |
1229 | | |
1230 | | vk = VecLoad(offset, k); |
1231 | | SHA256_ROUND2<12>(W,S, vk); |
1232 | | SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk)); |
1233 | | SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk)); |
1234 | | SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk)); |
1235 | | offset+=16; |
1236 | | } |
1237 | | |
1238 | | abcd += VectorPack(S[A],S[B],S[C],S[D]); |
1239 | | efgh += VectorPack(S[E],S[F],S[G],S[H]); |
1240 | | } |
1241 | | |
1242 | | VecStore32(abcd, state+0); |
1243 | | VecStore32(efgh, state+4); |
1244 | | } |
1245 | | |
1246 | | inline |
1247 | | void VecStore64(const uint64x2_p val, word64* data) |
1248 | | { |
1249 | | VecStore(val, data); |
1250 | | } |
1251 | | |
1252 | | inline |
1253 | | uint64x2_p VecLoad64(const word64* data, int offset) |
1254 | | { |
1255 | | #if (CRYPTOPP_LITTLE_ENDIAN) |
1256 | | const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}; |
1257 | | return VecPermute(VecLoad(offset, data), mask); |
1258 | | #else |
1259 | | return VecLoad(offset, data); |
1260 | | #endif |
1261 | | } |
1262 | | |
1263 | | inline |
1264 | | uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z) |
1265 | | { |
1266 | | // The trick below is due to Andy Polyakov and Jack Lloyd |
1267 | | return vec_sel(z,y,x); |
1268 | | } |
1269 | | |
1270 | | inline |
1271 | | uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z) |
1272 | | { |
1273 | | // The trick below is due to Andy Polyakov and Jack Lloyd |
1274 | | return vec_sel(y, z, VecXor(x, y)); |
1275 | | } |
1276 | | |
1277 | | inline |
1278 | | uint64x2_p Vector_sigma0(const uint64x2_p val) |
1279 | | { |
1280 | | return VecSHA512<0,0>(val); |
1281 | | } |
1282 | | |
1283 | | inline |
1284 | | uint64x2_p Vector_sigma1(const uint64x2_p val) |
1285 | | { |
1286 | | return VecSHA512<0,0xf>(val); |
1287 | | } |
1288 | | |
1289 | | inline |
1290 | | uint64x2_p VectorSigma0(const uint64x2_p val) |
1291 | | { |
1292 | | return VecSHA512<1,0>(val); |
1293 | | } |
1294 | | |
1295 | | inline |
1296 | | uint64x2_p VectorSigma1(const uint64x2_p val) |
1297 | | { |
1298 | | return VecSHA512<1,0xf>(val); |
1299 | | } |
1300 | | |
1301 | | inline |
1302 | | uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y) |
1303 | | { |
1304 | | const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; |
1305 | | return VecPermute(x,y,m); |
1306 | | } |
1307 | | |
1308 | | template <unsigned int R> inline |
1309 | | void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M) |
1310 | | { |
1311 | | uint64x2_p T1, T2; |
1312 | | |
1313 | | W[R] = M; |
1314 | | T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M; |
1315 | | T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]); |
1316 | | |
1317 | | S[H] = S[G]; S[G] = S[F]; S[F] = S[E]; |
1318 | | S[E] = S[D] + T1; |
1319 | | S[D] = S[C]; S[C] = S[B]; S[B] = S[A]; |
1320 | | S[A] = T1 + T2; |
1321 | | } |
1322 | | |
1323 | | template <unsigned int R> inline |
1324 | | void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K) |
1325 | | { |
1326 | | // Indexes into the W[] array |
1327 | | enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf}; |
1328 | | |
1329 | | const uint64x2_p s0 = Vector_sigma0(W[IDX1]); |
1330 | | const uint64x2_p s1 = Vector_sigma1(W[IDX14]); |
1331 | | |
1332 | | uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]); |
1333 | | T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K; |
1334 | | uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]); |
1335 | | |
1336 | | S[H] = S[G]; S[G] = S[F]; S[F] = S[E]; |
1337 | | S[E] = S[D] + T1; |
1338 | | S[D] = S[C]; S[C] = S[B]; S[B] = S[A]; |
1339 | | S[A] = T1 + T2; |
1340 | | } |
1341 | | |
1342 | | void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order) |
1343 | | { |
1344 | | CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data); |
1345 | | CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE); |
1346 | | CRYPTOPP_UNUSED(order); |
1347 | | |
1348 | | const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K); |
1349 | | const uint64_t* m = reinterpret_cast<const uint64_t*>(data); |
1350 | | |
1351 | | uint64x2_p ab = VecLoad(state+0); |
1352 | | uint64x2_p cd = VecLoad(state+2); |
1353 | | uint64x2_p ef = VecLoad(state+4); |
1354 | | uint64x2_p gh = VecLoad(state+6); |
1355 | | uint64x2_p W[16], S[8], vm, vk; |
1356 | | |
1357 | | size_t blocks = length / SHA512::BLOCKSIZE; |
1358 | | while (blocks--) |
1359 | | { |
1360 | | unsigned int offset=0; |
1361 | | |
1362 | | S[A] = ab; S[C] = cd; |
1363 | | S[E] = ef; S[G] = gh; |
1364 | | S[B] = VecShiftLeftOctet<8>(S[A]); |
1365 | | S[D] = VecShiftLeftOctet<8>(S[C]); |
1366 | | S[F] = VecShiftLeftOctet<8>(S[E]); |
1367 | | S[H] = VecShiftLeftOctet<8>(S[G]); |
1368 | | |
1369 | | // Rounds 0-15 |
1370 | | vk = VecLoad(offset, k); |
1371 | | vm = VecLoad64(m, offset); |
1372 | | SHA512_ROUND1<0>(W,S, vk,vm); |
1373 | | offset+=16; |
1374 | | |
1375 | | vk = VecShiftLeftOctet<8>(vk); |
1376 | | vm = VecShiftLeftOctet<8>(vm); |
1377 | | SHA512_ROUND1<1>(W,S, vk,vm); |
1378 | | |
1379 | | vk = VecLoad(offset, k); |
1380 | | vm = VecLoad64(m, offset); |
1381 | | SHA512_ROUND1<2>(W,S, vk,vm); |
1382 | | offset+=16; |
1383 | | |
1384 | | vk = VecShiftLeftOctet<8>(vk); |
1385 | | vm = VecShiftLeftOctet<8>(vm); |
1386 | | SHA512_ROUND1<3>(W,S, vk,vm); |
1387 | | |
1388 | | vk = VecLoad(offset, k); |
1389 | | vm = VecLoad64(m, offset); |
1390 | | SHA512_ROUND1<4>(W,S, vk,vm); |
1391 | | offset+=16; |
1392 | | |
1393 | | vk = VecShiftLeftOctet<8>(vk); |
1394 | | vm = VecShiftLeftOctet<8>(vm); |
1395 | | SHA512_ROUND1<5>(W,S, vk,vm); |
1396 | | |
1397 | | vk = VecLoad(offset, k); |
1398 | | vm = VecLoad64(m, offset); |
1399 | | SHA512_ROUND1<6>(W,S, vk,vm); |
1400 | | offset+=16; |
1401 | | |
1402 | | vk = VecShiftLeftOctet<8>(vk); |
1403 | | vm = VecShiftLeftOctet<8>(vm); |
1404 | | SHA512_ROUND1<7>(W,S, vk,vm); |
1405 | | |
1406 | | vk = VecLoad(offset, k); |
1407 | | vm = VecLoad64(m, offset); |
1408 | | SHA512_ROUND1<8>(W,S, vk,vm); |
1409 | | offset+=16; |
1410 | | |
1411 | | vk = VecShiftLeftOctet<8>(vk); |
1412 | | vm = VecShiftLeftOctet<8>(vm); |
1413 | | SHA512_ROUND1<9>(W,S, vk,vm); |
1414 | | |
1415 | | vk = VecLoad(offset, k); |
1416 | | vm = VecLoad64(m, offset); |
1417 | | SHA512_ROUND1<10>(W,S, vk,vm); |
1418 | | offset+=16; |
1419 | | |
1420 | | vk = VecShiftLeftOctet<8>(vk); |
1421 | | vm = VecShiftLeftOctet<8>(vm); |
1422 | | SHA512_ROUND1<11>(W,S, vk,vm); |
1423 | | |
1424 | | vk = VecLoad(offset, k); |
1425 | | vm = VecLoad64(m, offset); |
1426 | | SHA512_ROUND1<12>(W,S, vk,vm); |
1427 | | offset+=16; |
1428 | | |
1429 | | vk = VecShiftLeftOctet<8>(vk); |
1430 | | vm = VecShiftLeftOctet<8>(vm); |
1431 | | SHA512_ROUND1<13>(W,S, vk,vm); |
1432 | | |
1433 | | vk = VecLoad(offset, k); |
1434 | | vm = VecLoad64(m, offset); |
1435 | | SHA512_ROUND1<14>(W,S, vk,vm); |
1436 | | offset+=16; |
1437 | | |
1438 | | vk = VecShiftLeftOctet<8>(vk); |
1439 | | vm = VecShiftLeftOctet<8>(vm); |
1440 | | SHA512_ROUND1<15>(W,S, vk,vm); |
1441 | | |
1442 | | m += 16; // 64-bit words, not bytes |
1443 | | |
1444 | | // Rounds 16-79 |
1445 | | for (unsigned int i=16; i<80; i+=16) |
1446 | | { |
1447 | | vk = VecLoad(offset, k); |
1448 | | SHA512_ROUND2<0>(W,S, vk); |
1449 | | SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk)); |
1450 | | offset+=16; |
1451 | | |
1452 | | vk = VecLoad(offset, k); |
1453 | | SHA512_ROUND2<2>(W,S, vk); |
1454 | | SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk)); |
1455 | | offset+=16; |
1456 | | |
1457 | | vk = VecLoad(offset, k); |
1458 | | SHA512_ROUND2<4>(W,S, vk); |
1459 | | SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk)); |
1460 | | offset+=16; |
1461 | | |
1462 | | vk = VecLoad(offset, k); |
1463 | | SHA512_ROUND2<6>(W,S, vk); |
1464 | | SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk)); |
1465 | | offset+=16; |
1466 | | |
1467 | | vk = VecLoad(offset, k); |
1468 | | SHA512_ROUND2<8>(W,S, vk); |
1469 | | SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk)); |
1470 | | offset+=16; |
1471 | | |
1472 | | vk = VecLoad(offset, k); |
1473 | | SHA512_ROUND2<10>(W,S, vk); |
1474 | | SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk)); |
1475 | | offset+=16; |
1476 | | |
1477 | | vk = VecLoad(offset, k); |
1478 | | SHA512_ROUND2<12>(W,S, vk); |
1479 | | SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk)); |
1480 | | offset+=16; |
1481 | | |
1482 | | vk = VecLoad(offset, k); |
1483 | | SHA512_ROUND2<14>(W,S, vk); |
1484 | | SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk)); |
1485 | | offset+=16; |
1486 | | } |
1487 | | |
1488 | | ab += VectorPack(S[A],S[B]); |
1489 | | cd += VectorPack(S[C],S[D]); |
1490 | | ef += VectorPack(S[E],S[F]); |
1491 | | gh += VectorPack(S[G],S[H]); |
1492 | | } |
1493 | | |
1494 | | VecStore64(ab, state+0); |
1495 | | VecStore64(cd, state+2); |
1496 | | VecStore64(ef, state+4); |
1497 | | VecStore64(gh, state+6); |
1498 | | } |
1499 | | |
1500 | | #endif // CRYPTOPP_POWER8_SHA_AVAILABLE |
1501 | | |
1502 | | //////////////////////////////////////////////// |
1503 | | // end Gustavo, Serra, Scalet and Walton code // |
1504 | | //////////////////////////////////////////////// |
1505 | | |
1506 | | NAMESPACE_END |