Coverage Report

Created: 2024-11-21 07:03

/src/cryptopp/adv_simd.h

Count | Source
// adv_simd.h - written and placed in the public domain by Jeffrey Walton

/// \file adv_simd.h
/// \brief Template for AdvancedProcessBlocks and SIMD processing

//    The SIMD based implementations for ciphers that use SSE, NEON and Power7
//    have a common pattern. Namely, they have a specialized implementation of
//    AdvancedProcessBlocks which processes multiple blocks using hardware
//    acceleration. After several implementations we noticed a lot of copy and
//    paste occurring. adv_simd.h provides a template to avoid the copy and paste.
//
//    There are 7 templates provided in this file. The number following the
//    function name, 128, is the block size in bits. The name following the
//    block size is the arrangement and acceleration. For example 4x1_SSE means
//    Intel SSE using two encrypt (or decrypt) functions: one that operates on
//    4 SIMD words, and one that operates on 1 SIMD word.
//
//      * AdvancedProcessBlocks128_4x1_SSE
//      * AdvancedProcessBlocks128_6x2_SSE
//      * AdvancedProcessBlocks128_4x1_NEON
//      * AdvancedProcessBlocks128_6x1_NEON
//      * AdvancedProcessBlocks128_6x2_NEON
//      * AdvancedProcessBlocks128_4x1_ALTIVEC
//      * AdvancedProcessBlocks128_6x1_ALTIVEC
//
//    If an arrangement ends in 2, like 6x2, then the template will handle the
//    single block case by padding with 0's and using the two SIMD word
//    function. This happens at most one time when processing multiple blocks.
//    The extra processing of a zero block is trivial and worth the tradeoff.
//
//    The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
//    of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
//    results in a failed link due to the const/non-const mismatch.
//
//    In July 2020 the library stopped using the 64-bit block version of
//    AdvancedProcessBlocks. Testing showed unreliable results and failed
//    self tests on occasion. Also see Issue 945 and
//    https://github.com/weidai11/cryptopp/commit/dd7598e638bb.
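
// A minimal usage sketch (not part of this header; the cipher, kernel and
// member names below are hypothetical stand-ins): a cipher supplies its
// single-block and multi-block round functions plus its key schedule, and
// forwards its AdvancedProcessBlocks override to the matching template.
//
//   size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks,
//           const byte *xorBlocks, byte *outBlocks,
//           size_t length, word32 flags) const
//   {
//       return AdvancedProcessBlocks128_6x1_NEON(SPECK128_Enc_Block,
//           SPECK128_Enc_6_Blocks, m_rkeys.begin(), m_rounds,
//           inBlocks, xorBlocks, outBlocks, length, flags);
//   }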

#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"
#include "stdcpp.h"

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <emmintrin.h>
# include <xmmintrin.h>
#endif

// SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

// ************************ All block ciphers *********************** //

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput);
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel);
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter);
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection);
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers);

ANONYMOUS_NAMESPACE_END
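
// How the flags drive the pointer increments computed inside each template
// below (a sketch of the two common call shapes; the values follow from the
// inIncrement/xorIncrement/outIncrement expressions in the function bodies):
//
//   Bulk ECB/CBC-style call:  flags = BT_AllowParallel
//     inIncrement = outIncrement = blockSize (16), so the input and output
//     pointers advance block by block.
//
//   CTR keystream call:  flags = BT_AllowParallel | BT_InBlockIsCounter
//     inIncrement = 0, so inBlocks always points at the counter block and
//     the template increments that counter in place as it consumes it.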

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE) || (CRYPTOPP_ARM_ASIMD_AVAILABLE) || \
    defined(CRYPTOPP_DOXYGEN_PROCESSING)
NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 1 NEON SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
///  same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
            const W *subKeys, size_t rounds, const byte *inBlocks,
            const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
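
// A note on the counter constant above: vld1q_u8 loads the 16-byte big-endian
// counter so that, reinterpreted as uint64x2_t on a little-endian CPU, bytes
// 8..15 form the second lane with byte 15 as its most significant byte. The
// initializer w_one = {0, 0<<24, 0, 1<<24} puts word32 1<<24 in lane 3, which
// makes that 64-bit lane equal to 0x0100000000000000, so vaddq_u64 adds
// exactly 1 to counter byte 15 and touches nothing else:
//
//   counter bytes 14,15:  0x12 0x34
//   after adding s_one:   0x12 0x35
//
// The add cannot carry from byte 15 into byte 14. As the Altivec comments
// later in this file explain, CTR_ModePolicy detects a wrap of the last
// counter byte after this function returns and increments the next to last
// byte itself.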

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. The vector type is
///  usually uint32x4_t or uint64x2_t. F1 and F4 must use the same word and
///  vector type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
            const W *subKeys, size_t rounds, const byte *inBlocks,
            const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const uint32x4_t one = s_one;
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
                block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
                block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
                vst1q_u8(const_cast<byte*>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
                    vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
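
// The F1/F4 kernels are expected to take their blocks by reference and
// transform them in place, since the template stores block0..block3 back out
// after the call. A sketch of matching kernel signatures (hypothetical
// names):
//
//   void HC_Enc_Block(uint32x4_t &block,
//           const word32 *subkeys, unsigned int rounds);
//   void HC_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
//           uint32x4_t &block2, uint32x4_t &block3,
//           const word32 *subkeys, unsigned int rounds);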

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD words
///  at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
///  same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
            const W *subKeys, size_t rounds, const byte *inBlocks,
            const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
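
// Each template returns the number of bytes it did not consume so the caller
// can finish a partial trailing block itself. A worked example for this 6x2
// arrangement with length = 100 and BT_AllowParallel set: the 6-block loop
// runs once and consumes 96 bytes, the 2-block and single-block loops find
// less than a full block remaining, and the function returns 4.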

NAMESPACE_END  // CryptoPP

#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

// *************************** Intel SSE ************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief SunCC workaround
/// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
/// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
///  224, SunCC and failed compile for rijndael.cpp</A>
# define MAYBE_CONST const
/// \brief SunCC workaround
/// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
/// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
///  224, SunCC and failed compile for rijndael.cpp</A>
# define MAYBE_UNCONST_CAST(T, x) (x)
#elif (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
 1.55k | # define MAYBE_CONST const
 1.55k | # define MAYBE_UNCONST_CAST(T, x) (x)
#endif
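
// How the workaround reads at a call site (a sketch; the kernel and member
// names are hypothetical). Under the affected SunCC versions MAYBE_CONST
// expands to nothing so F1/F4 see one consistent type; everywhere else the
// macros are const-preserving no-ops:
//
//   MAYBE_CONST word32 *sk = MAYBE_UNCONST_CAST(word32*, m_key.begin());
//   return AdvancedProcessBlocks128_4x1_SSE(Enc_One_Block, Enc_Four_Blocks,
//       sk, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);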

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Clang workaround
/// \details Clang issues spurious alignment warnings
/// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
///  20670, _mm_loadu_si128 parameter has wrong type</A>
# define M128_CAST(x) ((__m128i *)(void *)(x))
/// \brief Clang workaround
/// \details Clang issues spurious alignment warnings
/// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
///  20670, _mm_loadu_si128 parameter has wrong type</A>
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#else
# ifndef M128_CAST
#  define M128_CAST(x) ((__m128i *)(void *)(x))
# endif
# ifndef CONST_M128_CAST
#  define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
# endif
#endif

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD words
///  at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
///  same word type.
       | template <typename F2, typename F6, typename W>
       | inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
       |         MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
       |         const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
    16 | {
    16 |     CRYPTOPP_ASSERT(subKeys);
    16 |     CRYPTOPP_ASSERT(inBlocks);
    16 |     CRYPTOPP_ASSERT(outBlocks);
    16 |     CRYPTOPP_ASSERT(length >= 16);
       |
    16 |     const size_t blockSize = 16;
       |     // const size_t xmmBlockSize = 16;
       |
    16 |     size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    16 |     size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    16 |     size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
       |
       |     // Clang and Coverity are generating findings using xorBlocks as a flag.
    16 |     const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    16 |     const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));
       |
    16 |     if (flags & BT_ReverseDirection)
     0 |     {
     0 |         inBlocks = PtrAdd(inBlocks, length - blockSize);
     0 |         xorBlocks = PtrAdd(xorBlocks, length - blockSize);
     0 |         outBlocks = PtrAdd(outBlocks, length - blockSize);
     0 |         inIncrement = 0-inIncrement;
     0 |         xorIncrement = 0-xorIncrement;
     0 |         outIncrement = 0-outIncrement;
     0 |     }
       |
    16 |     if (flags & BT_AllowParallel)
    16 |     {
    26 |         while (length >= 6*blockSize)
    10 |         {
    10 |             __m128i block0, block1, block2, block3, block4, block5;
    10 |             if (flags & BT_InBlockIsCounter)
     0 |             {
       |                 // Increment of 1 in big-endian compatible with the ctr byte array.
     0 |                 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
     0 |                 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
     0 |                 block1 = _mm_add_epi32(block0, s_one);
     0 |                 block2 = _mm_add_epi32(block1, s_one);
     0 |                 block3 = _mm_add_epi32(block2, s_one);
     0 |                 block4 = _mm_add_epi32(block3, s_one);
     0 |                 block5 = _mm_add_epi32(block4, s_one);
     0 |                 _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
     0 |             }
    10 |             else
    10 |             {
    10 |                 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    10 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    10 |                 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    10 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    10 |                 block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    10 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    10 |                 block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    10 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    10 |                 block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    10 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    10 |                 block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    10 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    10 |             }
       |
    10 |             if (xorInput)
     0 |             {
     0 |                 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |             }
       |
    10 |             func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
       |
    10 |             if (xorOutput)
     0 |             {
     0 |                 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |             }
       |
    10 |             _mm_storeu_si128(M128_CAST(outBlocks), block0);
    10 |             outBlocks = PtrAdd(outBlocks, outIncrement);
    10 |             _mm_storeu_si128(M128_CAST(outBlocks), block1);
    10 |             outBlocks = PtrAdd(outBlocks, outIncrement);
    10 |             _mm_storeu_si128(M128_CAST(outBlocks), block2);
    10 |             outBlocks = PtrAdd(outBlocks, outIncrement);
    10 |             _mm_storeu_si128(M128_CAST(outBlocks), block3);
    10 |             outBlocks = PtrAdd(outBlocks, outIncrement);
    10 |             _mm_storeu_si128(M128_CAST(outBlocks), block4);
    10 |             outBlocks = PtrAdd(outBlocks, outIncrement);
    10 |             _mm_storeu_si128(M128_CAST(outBlocks), block5);
    10 |             outBlocks = PtrAdd(outBlocks, outIncrement);
       |
    10 |             length -= 6*blockSize;
    10 |         }
       |
    19 |         while (length >= 2*blockSize)
     3 |         {
     3 |             __m128i block0, block1;
     3 |             if (flags & BT_InBlockIsCounter)
     0 |             {
       |                 // Increment of 1 in big-endian compatible with the ctr byte array.
     0 |                 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
     0 |                 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
     0 |                 block1 = _mm_add_epi32(block0, s_one);
     0 |                 _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
     0 |             }
     3 |             else
     3 |             {
     3 |                 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
     3 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
     3 |                 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
     3 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
     3 |             }
       |
     3 |             if (xorInput)
     0 |             {
     0 |                 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |             }
       |
     3 |             func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
       |
     3 |             if (xorOutput)
     0 |             {
     0 |                 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |             }
       |
     3 |             _mm_storeu_si128(M128_CAST(outBlocks), block0);
     3 |             outBlocks = PtrAdd(outBlocks, outIncrement);
     3 |             _mm_storeu_si128(M128_CAST(outBlocks), block1);
     3 |             outBlocks = PtrAdd(outBlocks, outIncrement);
       |
     3 |             length -= 2*blockSize;
     3 |         }
    16 |     }
       |
    31 |     while (length >= blockSize)
    15 |     {
    15 |         __m128i block, zero = _mm_setzero_si128();
    15 |         block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
       |
    15 |         if (xorInput)
     0 |             block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
       |
    15 |         if (flags & BT_InBlockIsCounter)
     0 |             const_cast<byte *>(inBlocks)[15]++;
       |
    15 |         func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
       |
    15 |         if (xorOutput)
     0 |             block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
       |
    15 |         _mm_storeu_si128(M128_CAST(outBlocks), block);
       |
    15 |         inBlocks = PtrAdd(inBlocks, inIncrement);
    15 |         outBlocks = PtrAdd(outBlocks, outIncrement);
    15 |         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
    15 |         length -= blockSize;
    15 |     }
       |
    16 |     return length;
    16 | }
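
// Batch decomposition in this 6x2 arrangement, as a worked example: for 16
// blocks (256 bytes) with BT_AllowParallel, the 6-block loop runs twice
// (96 + 96 bytes) and the 2-block loop twice (32 + 32 bytes), so func6 and
// func2 are each invoked twice and the single-block fallback (func2 with a
// zero block) never runs.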

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
///  same word type.
       | template <typename F1, typename F4, typename W>
       | inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
       |         MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
       |         const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
   530 | {
   530 |     CRYPTOPP_ASSERT(subKeys);
   530 |     CRYPTOPP_ASSERT(inBlocks);
   530 |     CRYPTOPP_ASSERT(outBlocks);
   530 |     CRYPTOPP_ASSERT(length >= 16);
       |
   530 |     const size_t blockSize = 16;
       |     // const size_t xmmBlockSize = 16;
       |
   530 |     size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
   530 |     size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
   530 |     size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
       |
       |     // Clang and Coverity are generating findings using xorBlocks as a flag.
   530 |     const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
   530 |     const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));
       |
   530 |     if (flags & BT_ReverseDirection)
     3 |     {
     3 |         inBlocks = PtrAdd(inBlocks, length - blockSize);
     3 |         xorBlocks = PtrAdd(xorBlocks, length - blockSize);
     3 |         outBlocks = PtrAdd(outBlocks, length - blockSize);
     3 |         inIncrement = 0-inIncrement;
     3 |         xorIncrement = 0-xorIncrement;
     3 |         outIncrement = 0-outIncrement;
     3 |     }
       |
   530 |     if (flags & BT_AllowParallel)
   287 |     {
 2.63k |         while (length >= 4*blockSize)
 2.34k |         {
 2.34k |             __m128i block0, block1, block2, block3;
 2.34k |             if (flags & BT_InBlockIsCounter)
 2.29k |             {
       |                 // Increment of 1 in big-endian compatible with the ctr byte array.
 2.29k |                 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
 2.29k |                 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
 2.29k |                 block1 = _mm_add_epi32(block0, s_one);
 2.29k |                 block2 = _mm_add_epi32(block1, s_one);
 2.29k |                 block3 = _mm_add_epi32(block2, s_one);
 2.29k |                 _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
 2.29k |             }
    51 |             else
    51 |             {
    51 |                 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    51 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    51 |                 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    51 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    51 |                 block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    51 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    51 |                 block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
    51 |                 inBlocks = PtrAdd(inBlocks, inIncrement);
    51 |             }
       |
 2.34k |             if (xorInput)
     0 |             {
     0 |                 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |                 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
     0 |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     0 |             }
       |
 2.34k |             func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
       |
 2.34k |             if (xorOutput)
 2.29k |             {
 2.29k |                 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 2.29k |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 2.29k |                 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 2.29k |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 2.29k |                 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 2.29k |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 2.29k |                 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 2.29k |                 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 2.29k |             }
       |
 2.34k |             _mm_storeu_si128(M128_CAST(outBlocks), block0);
 2.34k |             outBlocks = PtrAdd(outBlocks, outIncrement);
 2.34k |             _mm_storeu_si128(M128_CAST(outBlocks), block1);
 2.34k |             outBlocks = PtrAdd(outBlocks, outIncrement);
 2.34k |             _mm_storeu_si128(M128_CAST(outBlocks), block2);
 2.34k |             outBlocks = PtrAdd(outBlocks, outIncrement);
 2.34k |             _mm_storeu_si128(M128_CAST(outBlocks), block3);
 2.34k |             outBlocks = PtrAdd(outBlocks, outIncrement);
       |
 2.34k |             length -= 4*blockSize;
 2.34k |         }
   287 |     }
       |
 1.16k |     while (length >= blockSize)
   637 |     {
   637 |         __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
       |
   637 |         if (xorInput)
    22 |             block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
       |
   637 |         if (flags & BT_InBlockIsCounter)
   289 |             const_cast<byte *>(inBlocks)[15]++;
       |
   637 |         func1(block, subKeys, static_cast<unsigned int>(rounds));
       |
   637 |         if (xorOutput)
   234 |             block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
       |
   637 |         _mm_storeu_si128(M128_CAST(outBlocks), block);
       |
   637 |         inBlocks = PtrAdd(inBlocks, inIncrement);
   637 |         outBlocks = PtrAdd(outBlocks, outIncrement);
   637 |         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
   637 |         length -= blockSize;
   637 |     }
       |
   530 |     return length;
   530 | }
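
// BT_ReverseDirection walks the buffers from the last block to the first,
// which matters when output overlaps input, for example for in-place CBC
// decryption. A worked example for length = 48: the pointers start at offset
// 32 (length - blockSize), and the negated increments then step them through
// offsets 16 and 0.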

NAMESPACE_END  // CryptoPP

#endif  // CRYPTOPP_SSSE3_AVAILABLE

// ************************** Altivec/Power 4 ************************** //

#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
///  same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
        const W *subKeys, size_t rounds, const byte *inBlocks,
        const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;
    // const size_t simdBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_p block0, block1, block2, block3;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
                // CTR_ModePolicy::OperateKeystream is wired such that after
                // returning from this function CTR_ModePolicy will detect wrap on
                // the last counter byte and increment the next to last byte.
                // The problem is, with a big-endian load, inBlocks[15] is really
                // located at index 15. The vector addition using a 32-bit element
                // generates a carry into inBlocks[14] and then CTR_ModePolicy
                // increments inBlocks[14] too.
                const_cast<byte*>(inBlocks)[15] += 4;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
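
// Unlike the SSE and NEON counter paths, which store the next counter back
// with a vector add, this 4x1 Altivec path bumps the in-memory counter byte
// directly: the loop consumed counters c..c+3, so byte 15 advances by 4, and
// CTR_ModePolicy repairs any wrap of that byte after the call, as the hack
// comment above describes.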

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
///  same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
        const W *subKeys, size_t rounds, const byte *inBlocks,
        const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;
    // const size_t simdBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);
                block4 = VecAdd(block3, s_one);
                block5 = VecAdd(block4, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
                // CTR_ModePolicy::OperateKeystream is wired such that after
                // returning from this function CTR_ModePolicy will detect wrap on
                // the last counter byte and increment the next to last byte.
                // The problem is, with a big-endian load, inBlocks[15] is really
                // located at index 15. The vector addition using a 32-bit element
                // generates a carry into inBlocks[14] and then CTR_ModePolicy
                // increments inBlocks[14] too.
                //
                // To find this bug we needed a test case with a ctr of 0xNN...FA.
                // The last octet is 0xFA and adding 6 creates the wrap to trigger
                // the issue. If the last octet was 0xFC then 4 would trigger it.
                // We dumb-lucked into the test with SPECK-128. The test case of
                // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
                uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
                VecStoreBE(temp, const_cast<byte*>(inBlocks));
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
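
// The counter store in the 6-block loop above adds s_one to block5 as
// uint8x16_p rather than as uint32x4_p deliberately: a byte-wise VecAdd
// cannot carry out of byte 15, so when the last counter byte wraps the
// stored value leaves byte 14 alone and CTR_ModePolicy performs the carry
// itself. The SPECK-128 test case with IV 348ECA9766C09F04 826520DE47A212FA
// mentioned above exercises exactly this wrap.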

NAMESPACE_END  // CryptoPP

#endif  // __ALTIVEC__

#endif  // CRYPTOPP_ADVANCED_SIMD_TEMPLATES