Coverage Report

Created: 2024-11-21 07:03

/src/cryptopp/sha_simd.cpp
Line | Count | Source
1
// sha_simd.cpp - written and placed in the public domain by
2
//                Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3
//
4
//    This source file uses intrinsics to gain access to SHA-NI and
5
//    ARMv8a SHA instructions. A separate source file is needed
6
//    because additional CXXFLAGS are required to enable the
7
//    appropriate instruction sets in some build configurations.
8
9
#include "pch.h"
10
#include "config.h"
11
#include "sha.h"
12
#include "misc.h"
13
14
#if defined(CRYPTOPP_DISABLE_SHA_ASM)
15
# undef CRYPTOPP_X86_ASM_AVAILABLE
16
# undef CRYPTOPP_X32_ASM_AVAILABLE
17
# undef CRYPTOPP_X64_ASM_AVAILABLE
18
# undef CRYPTOPP_SSE2_ASM_AVAILABLE
19
#endif
20
21
#if (CRYPTOPP_SHANI_AVAILABLE)
22
# include <nmmintrin.h>
23
# include <immintrin.h>
24
#endif
25
26
// Android makes <arm_acle.h> available with ARMv7-a
27
#if (CRYPTOPP_BOOL_ARMV8)
28
# if (CRYPTOPP_ARM_NEON_HEADER)
29
#  include <arm_neon.h>
30
# endif
31
# if (CRYPTOPP_ARM_ACLE_HEADER)
32
#  include <stdint.h>
33
#  include <arm_acle.h>
34
# endif
35
#endif
36
37
#if CRYPTOPP_POWER8_SHA_AVAILABLE
38
# include "ppc_simd.h"
39
#endif
40
41
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
42
# include <signal.h>
43
# include <setjmp.h>
44
#endif
45
46
#ifndef EXCEPTION_EXECUTE_HANDLER
47
# define EXCEPTION_EXECUTE_HANDLER 1
48
#endif
49
50
// Squash MS LNK4221 and libtool warnings
51
extern const char SHA_SIMD_FNAME[] = __FILE__;
52
53
NAMESPACE_BEGIN(CryptoPP)
54
55
// ***************** SHA key tables ********************
56
57
extern const word32 SHA256_K[64];
58
extern const word64 SHA512_K[80];
59
60
// ***************** SIGILL probes ********************
61
62
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
63
extern "C" {
64
    typedef void (*SigHandler)(int);
65
66
    static jmp_buf s_jmpSIGILL;
67
    static void SigIllHandler(int)
68
0
    {
69
0
        longjmp(s_jmpSIGILL, 1);
70
0
    }
71
}
72
#endif  // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
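Both probes below instantiate the same idiom: install a SIGILL handler, setjmp, execute one candidate instruction, and longjmp back out if the CPU traps. A condensed, standalone sketch of that idiom, with hypothetical names (ProbeInstruction, ProbeSigIll, s_probeJmp are not library identifiers) and assuming POSIX signal handling:

// Illustrative sketch only; mirrors the pattern used by the probes below.
#include <setjmp.h>
#include <signal.h>

static jmp_buf s_probeJmp;
extern "C" { static void ProbeSigIll(int) { longjmp(s_probeJmp, 1); } }

template <class F>
static bool ProbeInstruction(F tryOne)
{
    volatile bool result = true;
    void (*oldHandler)(int) = signal(SIGILL, ProbeSigIll);
    if (oldHandler == SIG_ERR)
        return false;

    if (setjmp(s_probeJmp))
        result = false;   // SIGILL fired: the instruction is not supported
    else
        tryOne();         // execute one instruction from the feature under test

    signal(SIGILL, oldHandler);
    return result;
}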
73
74
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
75
bool CPU_ProbeSHA1()
76
{
77
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
78
    return false;
79
#elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
80
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
81
    volatile bool result = true;
82
    __try
83
    {
84
        unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
85
        uint32x4_t data1 = vld1q_u32(w+0);
86
        uint32x4_t data2 = vld1q_u32(w+4);
87
        uint32x4_t data3 = vld1q_u32(w+8);
88
89
        uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
90
        uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
91
        uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
92
        uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
93
        uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
94
95
        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
96
    }
97
    __except (EXCEPTION_EXECUTE_HANDLER)
98
    {
99
        return false;
100
    }
101
    return result;
102
# else
103
104
    // longjmp and clobber warnings. Volatile is required.
105
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
106
    volatile bool result = true;
107
108
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
109
    if (oldHandler == SIG_ERR)
110
        return false;
111
112
    volatile sigset_t oldMask;
113
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
114
    {
115
        signal(SIGILL, oldHandler);
116
        return false;
117
    }
118
119
    if (setjmp(s_jmpSIGILL))
120
        result = false;
121
    else
122
    {
123
        unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
124
        uint32x4_t data1 = vld1q_u32(w+0);
125
        uint32x4_t data2 = vld1q_u32(w+4);
126
        uint32x4_t data3 = vld1q_u32(w+8);
127
128
        uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
129
        uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
130
        uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
131
        uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
132
        uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
133
134
        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
135
    }
136
137
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
138
    signal(SIGILL, oldHandler);
139
    return result;
140
# endif
141
#else
142
    return false;
143
#endif  // CRYPTOPP_ARM_SHA1_AVAILABLE
144
}
145
146
bool CPU_ProbeSHA256()
147
{
148
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
149
    return false;
150
#elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
151
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
152
    volatile bool result = true;
153
    __try
154
    {
155
        unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
156
        uint32x4_t data1 = vld1q_u32(w+0);
157
        uint32x4_t data2 = vld1q_u32(w+4);
158
        uint32x4_t data3 = vld1q_u32(w+8);
159
160
        uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
161
        uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
162
        uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
163
        uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
164
165
        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
166
    }
167
    __except (EXCEPTION_EXECUTE_HANDLER)
168
    {
169
        return false;
170
    }
171
    return result;
172
# else
173
174
    // longjmp and clobber warnings. Volatile is required.
175
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
176
    volatile bool result = true;
177
178
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
179
    if (oldHandler == SIG_ERR)
180
        return false;
181
182
    volatile sigset_t oldMask;
183
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
184
    {
185
        signal(SIGILL, oldHandler);
186
        return false;
187
    }
188
189
    if (setjmp(s_jmpSIGILL))
190
        result = false;
191
    else
192
    {
193
        unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
194
        uint32x4_t data1 = vld1q_u32(w+0);
195
        uint32x4_t data2 = vld1q_u32(w+4);
196
        uint32x4_t data3 = vld1q_u32(w+8);
197
198
        uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
199
        uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
200
        uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
201
        uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
202
203
        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
204
    }
205
206
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
207
    signal(SIGILL, oldHandler);
208
    return result;
209
# endif
210
#else
211
    return false;
212
#endif  // CRYPTOPP_ARM_SHA2_AVAILABLE
213
}
214
#endif  // ARM32 or ARM64
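The probes are relatively expensive (each call may take and recover from a SIGILL), so callers normally run them once and cache the answer. A minimal illustration of that pattern; this is not the library's actual feature-detection code:

// Illustrative only: probe once, remember the result for later queries.
static bool HasARMSHA256()
{
    static const bool available = CPU_ProbeSHA256();
    return available;
}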
215
216
// ***************** Intel x86 SHA ********************
217
218
/////////////////////////////////////
219
// start of Walton and Gulley code //
220
/////////////////////////////////////
221
222
#if CRYPTOPP_SHANI_AVAILABLE
223
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
224
void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
225
71.7k
{
226
71.7k
    CRYPTOPP_ASSERT(state);
227
71.7k
    CRYPTOPP_ASSERT(data);
228
71.7k
    CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
229
230
71.7k
    __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
231
71.7k
    __m128i MASK, MSG0, MSG1, MSG2, MSG3;
232
233
    // Load initial values
234
71.7k
    ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
235
71.7k
    E0 = _mm_set_epi32(state[4], 0, 0, 0);
236
71.7k
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
237
238
    // IA-32 SHA is little endian, SHA::Transform is big endian,
239
    // and SHA::HashMultipleBlocks can be either. ByteOrder
240
    // allows us to avoid extra endian reversals. It saves 1.0 cpb.
241
71.7k
    MASK = order == BIG_ENDIAN_ORDER ?  // Data arrangement
242
71.7k
           _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
243
71.7k
           _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
244
245
989k
    while (length >= SHA1::BLOCKSIZE)
246
917k
    {
247
        // Save current hash
248
917k
        ABCD_SAVE = ABCD;
249
917k
        E0_SAVE = E0;
250
251
        // Rounds 0-3
252
917k
        MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
253
917k
        MSG0 = _mm_shuffle_epi8(MSG0, MASK);
254
917k
        E0 = _mm_add_epi32(E0, MSG0);
255
917k
        E1 = ABCD;
256
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
257
258
        // Rounds 4-7
259
917k
        MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
260
917k
        MSG1 = _mm_shuffle_epi8(MSG1, MASK);
261
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
262
917k
        E0 = ABCD;
263
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
264
917k
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
265
266
        // Rounds 8-11
267
917k
        MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
268
917k
        MSG2 = _mm_shuffle_epi8(MSG2, MASK);
269
917k
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
270
917k
        E1 = ABCD;
271
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
272
917k
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
273
917k
        MSG0 = _mm_xor_si128(MSG0, MSG2);
274
275
        // Rounds 12-15
276
917k
        MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
277
917k
        MSG3 = _mm_shuffle_epi8(MSG3, MASK);
278
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
279
917k
        E0 = ABCD;
280
917k
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
281
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
282
917k
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
283
917k
        MSG1 = _mm_xor_si128(MSG1, MSG3);
284
285
        // Rounds 16-19
286
917k
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
287
917k
        E1 = ABCD;
288
917k
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
289
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
290
917k
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
291
917k
        MSG2 = _mm_xor_si128(MSG2, MSG0);
292
293
        // Rounds 20-23
294
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
295
917k
        E0 = ABCD;
296
917k
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
297
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
298
917k
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
299
917k
        MSG3 = _mm_xor_si128(MSG3, MSG1);
300
301
        // Rounds 24-27
302
917k
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
303
917k
        E1 = ABCD;
304
917k
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
305
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
306
917k
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
307
917k
        MSG0 = _mm_xor_si128(MSG0, MSG2);
308
309
        // Rounds 28-31
310
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
311
917k
        E0 = ABCD;
312
917k
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
313
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
314
917k
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
315
917k
        MSG1 = _mm_xor_si128(MSG1, MSG3);
316
317
        // Rounds 32-35
318
917k
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
319
917k
        E1 = ABCD;
320
917k
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
321
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
322
917k
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
323
917k
        MSG2 = _mm_xor_si128(MSG2, MSG0);
324
325
        // Rounds 36-39
326
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
327
917k
        E0 = ABCD;
328
917k
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
329
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
330
917k
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
331
917k
        MSG3 = _mm_xor_si128(MSG3, MSG1);
332
333
        // Rounds 40-43
334
917k
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
335
917k
        E1 = ABCD;
336
917k
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
337
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
338
917k
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
339
917k
        MSG0 = _mm_xor_si128(MSG0, MSG2);
340
341
        // Rounds 44-47
342
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
343
917k
        E0 = ABCD;
344
917k
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
345
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
346
917k
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
347
917k
        MSG1 = _mm_xor_si128(MSG1, MSG3);
348
349
        // Rounds 48-51
350
917k
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
351
917k
        E1 = ABCD;
352
917k
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
353
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
354
917k
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
355
917k
        MSG2 = _mm_xor_si128(MSG2, MSG0);
356
357
        // Rounds 52-55
358
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
359
917k
        E0 = ABCD;
360
917k
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
361
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
362
917k
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
363
917k
        MSG3 = _mm_xor_si128(MSG3, MSG1);
364
365
        // Rounds 56-59
366
917k
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
367
917k
        E1 = ABCD;
368
917k
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
369
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
370
917k
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
371
917k
        MSG0 = _mm_xor_si128(MSG0, MSG2);
372
373
        // Rounds 60-63
374
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
375
917k
        E0 = ABCD;
376
917k
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
377
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
378
917k
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
379
917k
        MSG1 = _mm_xor_si128(MSG1, MSG3);
380
381
        // Rounds 64-67
382
917k
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
383
917k
        E1 = ABCD;
384
917k
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
385
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
386
917k
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
387
917k
        MSG2 = _mm_xor_si128(MSG2, MSG0);
388
389
        // Rounds 68-71
390
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
391
917k
        E0 = ABCD;
392
917k
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
393
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
394
917k
        MSG3 = _mm_xor_si128(MSG3, MSG1);
395
396
        // Rounds 72-75
397
917k
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
398
917k
        E1 = ABCD;
399
917k
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
400
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
401
402
        // Rounds 76-79
403
917k
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
404
917k
        E0 = ABCD;
405
917k
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
406
407
        // Add values back to state
408
917k
        E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
409
917k
        ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
410
411
917k
        data += SHA1::BLOCKSIZE/sizeof(word32);
412
917k
        length -= SHA1::BLOCKSIZE;
413
917k
    }
414
415
    // Save state
416
71.7k
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
417
71.7k
    _mm_storeu_si128(M128_CAST(state), ABCD);
418
71.7k
    state[4] = _mm_extract_epi32(E0, 3);
419
71.7k
}
420
421
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
422
void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
423
179k
{
424
179k
    CRYPTOPP_ASSERT(state);
425
179k
    CRYPTOPP_ASSERT(data);
426
179k
    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
427
428
179k
    __m128i STATE0, STATE1;
429
179k
    __m128i MSG, TMP, MASK;
430
179k
    __m128i TMSG0, TMSG1, TMSG2, TMSG3;
431
179k
    __m128i ABEF_SAVE, CDGH_SAVE;
432
433
    // Load initial values
434
179k
    TMP    = _mm_loadu_si128(M128_CAST(&state[0]));
435
179k
    STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
436
437
    // IA-32 SHA is little endian, SHA::Transform is big endian,
438
    // and SHA::HashMultipleBlocks can be either. ByteOrder
439
    // allows us to avoid extra endian reversals. It saves 1.0 cpb.
440
179k
    MASK = order == BIG_ENDIAN_ORDER ?  // Data arrangement
441
179k
           _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
442
179k
           _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
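    // With BIG_ENDIAN_ORDER the mask byte-reverses each 32-bit word during the
    // _mm_shuffle_epi8 calls below; with LITTLE_ENDIAN_ORDER it is an identity shuffle.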
443
444
179k
    TMP = _mm_shuffle_epi32(TMP, 0xB1);          // CDAB
445
179k
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
446
179k
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    // ABEF
447
179k
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
448
449
2.02M
    while (length >= SHA256::BLOCKSIZE)
450
1.84M
    {
451
        // Save current hash
452
1.84M
        ABEF_SAVE = STATE0;
453
1.84M
        CDGH_SAVE = STATE1;
454
455
        // Rounds 0-3
456
1.84M
        MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
457
1.84M
        TMSG0 = _mm_shuffle_epi8(MSG, MASK);
458
1.84M
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
459
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
460
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
461
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
462
463
        // Rounds 4-7
464
1.84M
        TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
465
1.84M
        TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
466
1.84M
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
467
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
468
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
469
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
470
1.84M
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
471
472
        // Rounds 8-11
473
1.84M
        TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
474
1.84M
        TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
475
1.84M
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
476
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
477
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
478
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
479
1.84M
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
480
481
        // Rounds 12-15
482
1.84M
        TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
483
1.84M
        TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
484
1.84M
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
485
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
486
1.84M
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
487
1.84M
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
488
1.84M
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
489
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
490
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
491
1.84M
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
492
493
        // Rounds 16-19
494
1.84M
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
495
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
496
1.84M
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
497
1.84M
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
498
1.84M
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
499
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
500
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
501
1.84M
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
502
503
        // Rounds 20-23
504
1.84M
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
505
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
506
1.84M
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
507
1.84M
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
508
1.84M
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
509
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
510
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
511
1.84M
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
512
513
        // Rounds 24-27
514
1.84M
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
515
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
516
1.84M
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
517
1.84M
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
518
1.84M
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
519
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
520
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
521
1.84M
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
522
523
        // Rounds 28-31
524
1.84M
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
525
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
526
1.84M
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
527
1.84M
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
528
1.84M
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
529
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
530
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
531
1.84M
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
532
533
        // Rounds 32-35
534
1.84M
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
535
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
536
1.84M
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
537
1.84M
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
538
1.84M
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
539
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
540
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
541
1.84M
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
542
543
        // Rounds 36-39
544
1.84M
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
545
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
546
1.84M
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
547
1.84M
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
548
1.84M
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
549
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
550
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
551
1.84M
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
552
553
        // Rounds 40-43
554
1.84M
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
555
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
556
1.84M
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
557
1.84M
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
558
1.84M
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
559
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
560
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
561
1.84M
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
562
563
        // Rounds 44-47
564
1.84M
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
565
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
566
1.84M
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
567
1.84M
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
568
1.84M
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
569
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
570
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
571
1.84M
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
572
573
        // Rounds 48-51
574
1.84M
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
575
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
576
1.84M
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
577
1.84M
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
578
1.84M
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
579
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
580
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
581
1.84M
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
582
583
        // Rounds 52-55
584
1.84M
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
585
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
586
1.84M
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
587
1.84M
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
588
1.84M
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
589
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
590
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
591
592
        // Rounds 56-59
593
1.84M
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
594
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
595
1.84M
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
596
1.84M
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
597
1.84M
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
598
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
599
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
600
601
        // Rounds 60-63
602
1.84M
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
603
1.84M
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
604
1.84M
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
605
1.84M
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
606
607
        // Add values back to state
608
1.84M
        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
609
1.84M
        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
610
611
1.84M
        data += SHA256::BLOCKSIZE/sizeof(word32);
612
1.84M
        length -= SHA256::BLOCKSIZE;
613
1.84M
    }
614
615
179k
    TMP = _mm_shuffle_epi32(STATE0, 0x1B);       // FEBA
616
179k
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    // DCHG
617
179k
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
618
179k
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // HGFE
619
620
    // Save state
621
179k
    _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
622
179k
    _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
623
179k
}
624
#endif  // CRYPTOPP_SHANI_AVAILABLE
625
626
///////////////////////////////////
627
// end of Walton and Gulley code //
628
///////////////////////////////////
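For reference, a minimal sketch of driving the SHA-NI routine above directly on one pre-padded 64-byte block. This is illustrative only; callers normally go through the SHA256 hash object, which handles padding, buffering and runtime dispatch. It assumes block[] already holds the big-endian padded message bytes:

// Illustrative sketch, not the library's public API.
word32 state[8] = {
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,   // SHA-256 IV
    0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
const byte block[SHA256::BLOCKSIZE] = { /* padded message bytes */ };
SHA256_HashMultipleBlocks_SHANI(state, reinterpret_cast<const word32*>(block),
                                SHA256::BLOCKSIZE, BIG_ENDIAN_ORDER);
// state[] now holds the updated chaining value (the digest words once the
// final padded block has been processed).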
629
630
// ***************** ARMV8 SHA ********************
631
632
/////////////////////////////////////////////////////////////
633
// start of Walton, Schneiders, O'Rourke and Hovsmith code //
634
/////////////////////////////////////////////////////////////
635
636
#if CRYPTOPP_ARM_SHA1_AVAILABLE
637
void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
638
{
639
    CRYPTOPP_ASSERT(state);
640
    CRYPTOPP_ASSERT(data);
641
    CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
642
643
    uint32x4_t C0, C1, C2, C3;
644
    uint32x4_t ABCD, ABCD_SAVED;
645
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
646
    uint32x4_t TMP0, TMP1;
647
    uint32_t   E0, E0_SAVED, E1;
648
649
    // Load initial values
650
    C0 = vdupq_n_u32(0x5A827999);
651
    C1 = vdupq_n_u32(0x6ED9EBA1);
652
    C2 = vdupq_n_u32(0x8F1BBCDC);
653
    C3 = vdupq_n_u32(0xCA62C1D6);
654
655
    ABCD = vld1q_u32(&state[0]);
656
    E0 = state[4];
657
658
    while (length >= SHA1::BLOCKSIZE)
659
    {
660
        // Save current hash
661
        ABCD_SAVED = ABCD;
662
        E0_SAVED = E0;
663
664
        MSG0 = vld1q_u32(data +  0);
665
        MSG1 = vld1q_u32(data +  4);
666
        MSG2 = vld1q_u32(data +  8);
667
        MSG3 = vld1q_u32(data + 12);
668
669
        if (order == BIG_ENDIAN_ORDER)  // Data arrangement
670
        {
671
            MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
672
            MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
673
            MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
674
            MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
675
        }
676
677
        TMP0 = vaddq_u32(MSG0, C0);
678
        TMP1 = vaddq_u32(MSG1, C0);
679
680
        // Rounds 0-3
681
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
682
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
683
        TMP0 = vaddq_u32(MSG2, C0);
684
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
685
686
        // Rounds 4-7
687
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
688
        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
689
        TMP1 = vaddq_u32(MSG3, C0);
690
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
691
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
692
693
        // Rounds 8-11
694
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
695
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
696
        TMP0 = vaddq_u32(MSG0, C0);
697
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
698
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
699
700
        // Rounds 12-15
701
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
702
        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
703
        TMP1 = vaddq_u32(MSG1, C1);
704
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
705
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
706
707
        // Rounds 16-19
708
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
709
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
710
        TMP0 = vaddq_u32(MSG2, C1);
711
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
712
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
713
714
        // Rounds 20-23
715
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
716
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
717
        TMP1 = vaddq_u32(MSG3, C1);
718
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
719
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
720
721
        // Rounds 24-27
722
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
723
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
724
        TMP0 = vaddq_u32(MSG0, C1);
725
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
726
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
727
728
        // Rounds 28-31
729
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
730
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
731
        TMP1 = vaddq_u32(MSG1, C1);
732
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
733
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
734
735
        // Rounds 32-35
736
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
737
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
738
        TMP0 = vaddq_u32(MSG2, C2);
739
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
740
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
741
742
        // Rounds 36-39
743
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
744
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
745
        TMP1 = vaddq_u32(MSG3, C2);
746
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
747
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
748
749
        // Rounds 40-43
750
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
751
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
752
        TMP0 = vaddq_u32(MSG0, C2);
753
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
754
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
755
756
        // Rounds 44-47
757
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
758
        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
759
        TMP1 = vaddq_u32(MSG1, C2);
760
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
761
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
762
763
        // Rounds 48-51
764
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
765
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
766
        TMP0 = vaddq_u32(MSG2, C2);
767
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
768
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
769
770
        // Rounds 52-55
771
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
772
        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
773
        TMP1 = vaddq_u32(MSG3, C3);
774
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
775
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
776
777
        // Rounds 56-59
778
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
779
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
780
        TMP0 = vaddq_u32(MSG0, C3);
781
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
782
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
783
784
        // Rounds 60-63
785
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
786
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
787
        TMP1 = vaddq_u32(MSG1, C3);
788
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
789
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
790
791
        // Rounds 64-67
792
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
793
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
794
        TMP0 = vaddq_u32(MSG2, C3);
795
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
796
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
797
798
        // Rounds 68-71
799
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
800
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
801
        TMP1 = vaddq_u32(MSG3, C3);
802
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
803
804
        // Rounds 72-75
805
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
806
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
807
808
        // Rounds 76-79
809
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
810
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
811
812
        E0 += E0_SAVED;
813
        ABCD = vaddq_u32(ABCD_SAVED, ABCD);
814
815
        data += SHA1::BLOCKSIZE/sizeof(word32);
816
        length -= SHA1::BLOCKSIZE;
817
    }
818
819
    // Save state
820
    vst1q_u32(&state[0], ABCD);
821
    state[4] = E0;
822
}
823
#endif  // CRYPTOPP_ARM_SHA1_AVAILABLE
824
825
#if CRYPTOPP_ARM_SHA2_AVAILABLE
826
void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
827
{
828
    CRYPTOPP_ASSERT(state);
829
    CRYPTOPP_ASSERT(data);
830
    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
831
832
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
833
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
834
    uint32x4_t TMP0, TMP1, TMP2;
835
836
    // Load initial values
837
    STATE0 = vld1q_u32(&state[0]);
838
    STATE1 = vld1q_u32(&state[4]);
839
840
    while (length >= SHA256::BLOCKSIZE)
841
    {
842
        // Save current hash
843
        ABEF_SAVE = STATE0;
844
        CDGH_SAVE = STATE1;
845
846
        // Load message
847
        MSG0 = vld1q_u32(data +  0);
848
        MSG1 = vld1q_u32(data +  4);
849
        MSG2 = vld1q_u32(data +  8);
850
        MSG3 = vld1q_u32(data + 12);
851
852
        if (order == BIG_ENDIAN_ORDER)  // Data arrangement
853
        {
854
            MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
855
            MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
856
            MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
857
            MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
858
        }
859
860
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
861
862
        // Rounds 0-3
863
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
864
        TMP2 = STATE0;
865
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
866
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
867
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
868
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
869
870
        // Rounds 4-7
871
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
872
        TMP2 = STATE0;
873
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
874
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
875
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
876
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
877
878
        // Rounds 8-11
879
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
880
        TMP2 = STATE0;
881
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
882
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
883
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
884
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
885
886
        // Rounds 12-15
887
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
888
        TMP2 = STATE0;
889
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
890
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
891
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
892
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
893
894
        // Rounds 16-19
895
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
896
        TMP2 = STATE0;
897
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
898
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
899
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
900
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
901
902
        // Rounds 20-23
903
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
904
        TMP2 = STATE0;
905
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
906
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
907
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
908
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
909
910
        // Rounds 24-27
911
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
912
        TMP2 = STATE0;
913
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
914
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
915
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
916
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
917
918
        // Rounds 28-31
919
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
920
        TMP2 = STATE0;
921
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
922
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
923
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
924
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
925
926
        // Rounds 32-35
927
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
928
        TMP2 = STATE0;
929
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
930
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
931
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
932
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
933
934
        // Rounds 36-39
935
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
936
        TMP2 = STATE0;
937
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
938
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
939
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
940
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
941
942
        // Rounds 40-43
943
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
944
        TMP2 = STATE0;
945
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
946
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
947
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
948
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
949
950
        // Rounds 44-47
951
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
952
        TMP2 = STATE0;
953
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
954
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
955
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
956
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
957
958
        // Rounds 48-51
959
        TMP2 = STATE0;
960
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
961
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
962
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
963
964
        // Rounds 52-55
965
        TMP2 = STATE0;
966
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
967
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
968
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
969
970
        // Rounds 56-59
971
        TMP2 = STATE0;
972
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
973
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
974
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
975
976
        // Rounds 60-63
977
        TMP2 = STATE0;
978
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
979
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
980
981
        // Add back to state
982
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
983
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
984
985
        data += SHA256::BLOCKSIZE/sizeof(word32);
986
        length -= SHA256::BLOCKSIZE;
987
    }
988
989
    // Save state
990
    vst1q_u32(&state[0], STATE0);
991
    vst1q_u32(&state[4], STATE1);
992
}
993
#endif  // CRYPTOPP_ARM_SHA2_AVAILABLE
994
995
///////////////////////////////////////////////////////////
996
// end of Walton, Schneiders, O'Rourke and Hovsmith code //
997
///////////////////////////////////////////////////////////
998
999
// ***************** Power8 SHA ********************
1000
1001
//////////////////////////////////////////////////
1002
// start Gustavo, Serra, Scalet and Walton code //
1003
//////////////////////////////////////////////////
1004
1005
#if CRYPTOPP_POWER8_SHA_AVAILABLE
1006
1007
// Indexes into the S[] array
1008
enum {A=0, B=1, C, D, E, F, G, H};
1009
1010
inline
1011
uint32x4_p VecLoad32(const word32* data, int offset)
1012
{
1013
#if (CRYPTOPP_LITTLE_ENDIAN)
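    // Byte-swap each 32-bit word on little-endian targets so the big-endian
    // message words are loaded correctly.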
1014
    const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
1015
    const uint32x4_p val = VecLoad(offset, data);
1016
    return (uint32x4_p)VecPermute(val, val, mask);
1017
#else
1018
    return VecLoad(offset, data);
1019
#endif
1020
}
1021
1022
template<class T> inline
1023
void VecStore32(const T data, word32 dest[4])
1024
{
1025
    VecStore(data, dest);
1026
}
1027
1028
inline
1029
uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1030
{
1031
    // The trick below is due to Andy Polyakov and Jack Lloyd
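    // Ch(x,y,z) = (x&y) ^ (~x&z) selects bits of y where x is 1 and bits of z
    // where x is 0, which is exactly what vec_sel(z, y, x) computes.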
1032
    return vec_sel(z,y,x);
1033
}
1034
1035
inline
1036
uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1037
{
1038
    // The trick below is due to Andy Polyakov and Jack Lloyd
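    // Maj(x,y,z): where x and y agree the majority is that shared value (y);
    // where they differ, z casts the deciding vote, hence vec_sel(y, z, x^y).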
1039
    return vec_sel(y, z, VecXor(x, y));
1040
}
1041
1042
inline
1043
uint32x4_p Vector_sigma0(const uint32x4_p val)
1044
{
1045
    return VecSHA256<0,0>(val);
1046
}
1047
1048
inline
1049
uint32x4_p Vector_sigma1(const uint32x4_p val)
1050
{
1051
    return VecSHA256<0,0xf>(val);
1052
}
1053
1054
inline
1055
uint32x4_p VectorSigma0(const uint32x4_p val)
1056
{
1057
    return VecSHA256<1,0>(val);
1058
}
1059
1060
inline
1061
uint32x4_p VectorSigma1(const uint32x4_p val)
1062
{
1063
    return VecSHA256<1,0xf>(val);
1064
}
1065
1066
inline
1067
uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
1068
                       const uint32x4_p c, const uint32x4_p d)
1069
{
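    // Gather element 0 of a, b, c and d into a single vector {a0, b0, c0, d0}.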
1070
    const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1071
    const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1072
    return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
1073
}
1074
1075
template <unsigned int R> inline
1076
void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
1077
{
1078
    uint32x4_p T1, T2;
1079
1080
    W[R] = M;
1081
    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1082
    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1083
1084
    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1085
    S[E] = S[D] + T1;
1086
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1087
    S[A] = T1 + T2;
1088
}
1089
1090
template <unsigned int R> inline
1091
void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
1092
{
1093
    // Indexes into the W[] array
1094
    enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
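    // W[] is a 16-entry rolling window over the message schedule:
    //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
    // with IDX1 = t-15, IDX9 = t-7 and IDX14 = t-2, all taken modulo 16.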
1095
1096
    const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
1097
    const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
1098
1099
    uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1100
    T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1101
    uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1102
1103
    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1104
    S[E] = S[D] + T1;
1105
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1106
    S[A] = T1 + T2;
1107
}
1108
1109
void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1110
{
1111
    CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1112
    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1113
    CRYPTOPP_UNUSED(order);
1114
1115
    const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1116
    const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1117
1118
    uint32x4_p abcd = VecLoad(state+0);
1119
    uint32x4_p efgh = VecLoad(state+4);
1120
    uint32x4_p W[16], S[8], vm, vk;
1121
1122
    size_t blocks = length / SHA256::BLOCKSIZE;
1123
    while (blocks--)
1124
    {
1125
        unsigned int offset=0;
1126
1127
        S[A] = abcd; S[E] = efgh;
1128
        S[B] = VecShiftLeftOctet<4>(S[A]);
1129
        S[F] = VecShiftLeftOctet<4>(S[E]);
1130
        S[C] = VecShiftLeftOctet<4>(S[B]);
1131
        S[G] = VecShiftLeftOctet<4>(S[F]);
1132
        S[D] = VecShiftLeftOctet<4>(S[C]);
1133
        S[H] = VecShiftLeftOctet<4>(S[G]);
1134
1135
        // Rounds 0-15
1136
        vk = VecLoad(offset, k);
1137
        vm = VecLoad32(m, offset);
1138
        SHA256_ROUND1<0>(W,S, vk,vm);
1139
        offset+=16;
1140
1141
        vk = VecShiftLeftOctet<4>(vk);
1142
        vm = VecShiftLeftOctet<4>(vm);
1143
        SHA256_ROUND1<1>(W,S, vk,vm);
1144
1145
        vk = VecShiftLeftOctet<4>(vk);
1146
        vm = VecShiftLeftOctet<4>(vm);
1147
        SHA256_ROUND1<2>(W,S, vk,vm);
1148
1149
        vk = VecShiftLeftOctet<4>(vk);
1150
        vm = VecShiftLeftOctet<4>(vm);
1151
        SHA256_ROUND1<3>(W,S, vk,vm);
1152
1153
        vk = VecLoad(offset, k);
1154
        vm = VecLoad32(m, offset);
1155
        SHA256_ROUND1<4>(W,S, vk,vm);
1156
        offset+=16;
1157
1158
        vk = VecShiftLeftOctet<4>(vk);
1159
        vm = VecShiftLeftOctet<4>(vm);
1160
        SHA256_ROUND1<5>(W,S, vk,vm);
1161
1162
        vk = VecShiftLeftOctet<4>(vk);
1163
        vm = VecShiftLeftOctet<4>(vm);
1164
        SHA256_ROUND1<6>(W,S, vk,vm);
1165
1166
        vk = VecShiftLeftOctet<4>(vk);
1167
        vm = VecShiftLeftOctet<4>(vm);
1168
        SHA256_ROUND1<7>(W,S, vk,vm);
1169
1170
        vk = VecLoad(offset, k);
1171
        vm = VecLoad32(m, offset);
1172
        SHA256_ROUND1<8>(W,S, vk,vm);
1173
        offset+=16;
1174
1175
        vk = VecShiftLeftOctet<4>(vk);
1176
        vm = VecShiftLeftOctet<4>(vm);
1177
        SHA256_ROUND1<9>(W,S, vk,vm);
1178
1179
        vk = VecShiftLeftOctet<4>(vk);
1180
        vm = VecShiftLeftOctet<4>(vm);
1181
        SHA256_ROUND1<10>(W,S, vk,vm);
1182
1183
        vk = VecShiftLeftOctet<4>(vk);
1184
        vm = VecShiftLeftOctet<4>(vm);
1185
        SHA256_ROUND1<11>(W,S, vk,vm);
1186
1187
        vk = VecLoad(offset, k);
1188
        vm = VecLoad32(m, offset);
1189
        SHA256_ROUND1<12>(W,S, vk,vm);
1190
        offset+=16;
1191
1192
        vk = VecShiftLeftOctet<4>(vk);
1193
        vm = VecShiftLeftOctet<4>(vm);
1194
        SHA256_ROUND1<13>(W,S, vk,vm);
1195
1196
        vk = VecShiftLeftOctet<4>(vk);
1197
        vm = VecShiftLeftOctet<4>(vm);
1198
        SHA256_ROUND1<14>(W,S, vk,vm);
1199
1200
        vk = VecShiftLeftOctet<4>(vk);
1201
        vm = VecShiftLeftOctet<4>(vm);
1202
        SHA256_ROUND1<15>(W,S, vk,vm);
1203
1204
        m += 16; // 32-bit words, not bytes
1205
1206
        // Rounds 16-63
1207
        for (unsigned int i=16; i<64; i+=16)
1208
        {
1209
            vk = VecLoad(offset, k);
1210
            SHA256_ROUND2<0>(W,S, vk);
1211
            SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
1212
            SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
1213
            SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
1214
            offset+=16;
1215
1216
            vk = VecLoad(offset, k);
1217
            SHA256_ROUND2<4>(W,S, vk);
1218
            SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
1219
            SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
1220
            SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
1221
            offset+=16;
1222
1223
            vk = VecLoad(offset, k);
1224
            SHA256_ROUND2<8>(W,S, vk);
1225
            SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
1226
            SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
1227
            SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
1228
            offset+=16;
1229
1230
            vk = VecLoad(offset, k);
1231
            SHA256_ROUND2<12>(W,S, vk);
1232
            SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
1233
            SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
1234
            SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
1235
            offset+=16;
1236
        }
1237
1238
        abcd += VectorPack(S[A],S[B],S[C],S[D]);
1239
        efgh += VectorPack(S[E],S[F],S[G],S[H]);
1240
    }
1241
1242
    VecStore32(abcd, state+0);
1243
    VecStore32(efgh, state+4);
1244
}
1245
1246
inline
1247
void VecStore64(const uint64x2_p val, word64* data)
1248
{
1249
    VecStore(val, data);
1250
}
1251
1252
inline
1253
uint64x2_p VecLoad64(const word64* data, int offset)
1254
{
1255
#if (CRYPTOPP_LITTLE_ENDIAN)
1256
    const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1257
    return VecPermute(VecLoad(offset, data), mask);
1258
#else
1259
    return VecLoad(offset, data);
1260
#endif
1261
}
1262
1263
inline
1264
uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1265
{
1266
    // The trick below is due to Andy Polyakov and Jack Lloyd
1267
    return vec_sel(z,y,x);
1268
}
1269
1270
inline
1271
uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1272
{
1273
    // The trick below is due to Andy Polyakov and Jack Lloyd
1274
    return vec_sel(y, z, VecXor(x, y));
1275
}
1276
1277
inline
1278
uint64x2_p Vector_sigma0(const uint64x2_p val)
1279
{
1280
    return VecSHA512<0,0>(val);
1281
}
1282
1283
inline
1284
uint64x2_p Vector_sigma1(const uint64x2_p val)
1285
{
1286
    return VecSHA512<0,0xf>(val);
1287
}
1288
1289
inline
1290
uint64x2_p VectorSigma0(const uint64x2_p val)
1291
{
1292
    return VecSHA512<1,0>(val);
1293
}
1294
1295
inline
1296
uint64x2_p VectorSigma1(const uint64x2_p val)
1297
{
1298
    return VecSHA512<1,0xf>(val);
1299
}
1300
1301
inline
1302
uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
1303
{
1304
    const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1305
    return VecPermute(x,y,m);
1306
}
1307
1308
template <unsigned int R> inline
1309
void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
1310
{
1311
    uint64x2_p T1, T2;
1312
1313
    W[R] = M;
1314
    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1315
    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1316
1317
    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1318
    S[E] = S[D] + T1;
1319
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1320
    S[A] = T1 + T2;
1321
}
1322
1323
template <unsigned int R> inline
1324
void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
1325
{
1326
    // Indexes into the W[] array
1327
    enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
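    // Same 16-entry rolling message schedule as SHA256_ROUND2 above, using the
    // SHA-512 sigma functions.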
1328
1329
    const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
1330
    const uint64x2_p s1 = Vector_sigma1(W[IDX14]);
1331
1332
    uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1333
    T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1334
    uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1335
1336
    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1337
    S[E] = S[D] + T1;
1338
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1339
    S[A] = T1 + T2;
1340
}
1341
1342
void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1343
{
1344
    CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1345
    CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1346
    CRYPTOPP_UNUSED(order);
1347
1348
    const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1349
    const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1350
1351
    uint64x2_p ab = VecLoad(state+0);
1352
    uint64x2_p cd = VecLoad(state+2);
1353
    uint64x2_p ef = VecLoad(state+4);
1354
    uint64x2_p gh = VecLoad(state+6);
1355
    uint64x2_p W[16], S[8], vm, vk;
1356
1357
    size_t blocks = length / SHA512::BLOCKSIZE;
1358
    while (blocks--)
1359
    {
1360
        unsigned int offset=0;
1361
1362
        S[A] = ab; S[C] = cd;
1363
        S[E] = ef; S[G] = gh;
1364
        S[B] = VecShiftLeftOctet<8>(S[A]);
1365
        S[D] = VecShiftLeftOctet<8>(S[C]);
1366
        S[F] = VecShiftLeftOctet<8>(S[E]);
1367
        S[H] = VecShiftLeftOctet<8>(S[G]);
1368
1369
        // Rounds 0-15
1370
        vk = VecLoad(offset, k);
1371
        vm = VecLoad64(m, offset);
1372
        SHA512_ROUND1<0>(W,S, vk,vm);
1373
        offset+=16;
1374
1375
        vk = VecShiftLeftOctet<8>(vk);
1376
        vm = VecShiftLeftOctet<8>(vm);
1377
        SHA512_ROUND1<1>(W,S, vk,vm);
1378
1379
        vk = VecLoad(offset, k);
1380
        vm = VecLoad64(m, offset);
1381
        SHA512_ROUND1<2>(W,S, vk,vm);
1382
        offset+=16;
1383
1384
        vk = VecShiftLeftOctet<8>(vk);
1385
        vm = VecShiftLeftOctet<8>(vm);
1386
        SHA512_ROUND1<3>(W,S, vk,vm);
1387
1388
        vk = VecLoad(offset, k);
1389
        vm = VecLoad64(m, offset);
1390
        SHA512_ROUND1<4>(W,S, vk,vm);
1391
        offset+=16;
1392
1393
        vk = VecShiftLeftOctet<8>(vk);
1394
        vm = VecShiftLeftOctet<8>(vm);
1395
        SHA512_ROUND1<5>(W,S, vk,vm);
1396
1397
        vk = VecLoad(offset, k);
1398
        vm = VecLoad64(m, offset);
1399
        SHA512_ROUND1<6>(W,S, vk,vm);
1400
        offset+=16;
1401
1402
        vk = VecShiftLeftOctet<8>(vk);
1403
        vm = VecShiftLeftOctet<8>(vm);
1404
        SHA512_ROUND1<7>(W,S, vk,vm);
1405
1406
        vk = VecLoad(offset, k);
1407
        vm = VecLoad64(m, offset);
1408
        SHA512_ROUND1<8>(W,S, vk,vm);
1409
        offset+=16;
1410
1411
        vk = VecShiftLeftOctet<8>(vk);
1412
        vm = VecShiftLeftOctet<8>(vm);
1413
        SHA512_ROUND1<9>(W,S, vk,vm);
1414
1415
        vk = VecLoad(offset, k);
1416
        vm = VecLoad64(m, offset);
1417
        SHA512_ROUND1<10>(W,S, vk,vm);
1418
        offset+=16;
1419
1420
        vk = VecShiftLeftOctet<8>(vk);
1421
        vm = VecShiftLeftOctet<8>(vm);
1422
        SHA512_ROUND1<11>(W,S, vk,vm);
1423
1424
        vk = VecLoad(offset, k);
1425
        vm = VecLoad64(m, offset);
1426
        SHA512_ROUND1<12>(W,S, vk,vm);
1427
        offset+=16;
1428
1429
        vk = VecShiftLeftOctet<8>(vk);
1430
        vm = VecShiftLeftOctet<8>(vm);
1431
        SHA512_ROUND1<13>(W,S, vk,vm);
1432
1433
        vk = VecLoad(offset, k);
1434
        vm = VecLoad64(m, offset);
1435
        SHA512_ROUND1<14>(W,S, vk,vm);
1436
        offset+=16;
1437
1438
        vk = VecShiftLeftOctet<8>(vk);
1439
        vm = VecShiftLeftOctet<8>(vm);
1440
        SHA512_ROUND1<15>(W,S, vk,vm);
1441
1442
        m += 16; // 64-bit words, not bytes
1443
1444
        // Rounds 16-79
1445
        for (unsigned int i=16; i<80; i+=16)
1446
        {
1447
            vk = VecLoad(offset, k);
1448
            SHA512_ROUND2<0>(W,S, vk);
1449
            SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
1450
            offset+=16;
1451
1452
            vk = VecLoad(offset, k);
1453
            SHA512_ROUND2<2>(W,S, vk);
1454
            SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
1455
            offset+=16;
1456
1457
            vk = VecLoad(offset, k);
1458
            SHA512_ROUND2<4>(W,S, vk);
1459
            SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
1460
            offset+=16;
1461
1462
            vk = VecLoad(offset, k);
1463
            SHA512_ROUND2<6>(W,S, vk);
1464
            SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
1465
            offset+=16;
1466
1467
            vk = VecLoad(offset, k);
1468
            SHA512_ROUND2<8>(W,S, vk);
1469
            SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
1470
            offset+=16;
1471
1472
            vk = VecLoad(offset, k);
1473
            SHA512_ROUND2<10>(W,S, vk);
1474
            SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
1475
            offset+=16;
1476
1477
            vk = VecLoad(offset, k);
1478
            SHA512_ROUND2<12>(W,S, vk);
1479
            SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
1480
            offset+=16;
1481
1482
            vk = VecLoad(offset, k);
1483
            SHA512_ROUND2<14>(W,S, vk);
1484
            SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
1485
            offset+=16;
1486
        }
1487
1488
        ab += VectorPack(S[A],S[B]);
1489
        cd += VectorPack(S[C],S[D]);
1490
        ef += VectorPack(S[E],S[F]);
1491
        gh += VectorPack(S[G],S[H]);
1492
    }
1493
1494
    VecStore64(ab, state+0);
1495
    VecStore64(cd, state+2);
1496
    VecStore64(ef, state+4);
1497
    VecStore64(gh, state+6);
1498
}
1499
1500
#endif  // CRYPTOPP_POWER8_SHA_AVAILABLE
1501
1502
////////////////////////////////////////////////
1503
// end Gustavo, Serra, Scalet and Walton code //
1504
////////////////////////////////////////////////
1505
1506
NAMESPACE_END