Coverage Report

Created: 2024-11-21 07:03

/src/cryptopp/chacha_avx.cpp
Line | Count | Source
   1 |       | // chacha_avx.cpp - written and placed in the public domain by
   2 |       | //                  Jack Lloyd and Jeffrey Walton
   3 |       | //
   4 |       | //    This source file uses intrinsics and built-ins to gain access to
   5 |       | //    AVX2 instructions. A separate source file is needed because
   6 |       | //    additional CXXFLAGS are required to enable the appropriate
   7 |       | //    instruction sets in some build configurations.
   8 |       | //
   9 |       | //    AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
  10 |       | //    to Jack Lloyd and the Botan team for allowing us to use it.
  11 |       | //
  12 |       | //    Here are some relative numbers for ChaCha8:
  13 |       | //    * Intel Skylake,   3.0 GHz: AVX2 at 4411 MB/s; 0.57 cpb.
  14 |       | //    * Intel Broadwell, 2.3 GHz: AVX2 at 3828 MB/s; 0.58 cpb.
  15 |       | //    * AMD Bulldozer,   3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.
  16 |       |
  17 |       | #include "pch.h"
  18 |       | #include "config.h"
  19 |       |
  20 |       | #include "chacha.h"
  21 |       | #include "misc.h"
  22 |       |
  23 |       | #if defined(CRYPTOPP_AVX2_AVAILABLE)
  24 |       | # include <xmmintrin.h>
  25 |       | # include <emmintrin.h>
  26 |       | # include <immintrin.h>
  27 |       | #endif
  28 |       |
  29 |       | // Squash MS LNK4221 and libtool warnings
  30 |       | extern const char CHACHA_AVX_FNAME[] = __FILE__;
  31 |       |
  32 |       | // Sun Studio 12.4 OK, 12.5 and 12.6 compile error.
  33 |       | #if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
  34 |       | # define MAYBE_CONST
  35 |       | #else
  36 |       | # define MAYBE_CONST const
  37 |       | #endif
  38 |       |
  39 |       | // VS2017 and global optimization bug. Also see
  40 |       | // https://github.com/weidai11/cryptopp/issues/649 and
  41 |       | // https://github.com/weidai11/cryptopp/issues/735. The
  42 |       | // 649 issue affects AES but it is the same here. The 735
  43 |       | // issue is ChaCha AVX2 cut-in where it surfaced again.
  44 |       | #if (CRYPTOPP_MSC_VERSION >= 1910) && (CRYPTOPP_MSC_VERSION <= 1916)
  45 |       | # ifndef CRYPTOPP_DEBUG
  46 |       | #  pragma optimize("", off)
  47 |       | #  pragma optimize("ts", on)
  48 |       | # endif
  49 |       | #endif
  50 |       |
  51 |       | // The data is aligned, but Clang issues a warning based on the type
  52 |       | // and not the actual alignment of the variable and data.
  53 |       | #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
  54 |       | # pragma GCC diagnostic ignored "-Wcast-align"
  55 |       | #endif
  56 |       |
  57 |       | ANONYMOUS_NAMESPACE_BEGIN
  58 |       |
  59 |       | #if (CRYPTOPP_AVX2_AVAILABLE)
  60 |       |
  61 |       | template <unsigned int R>
  62 |       | inline __m256i RotateLeft(const __m256i val)
  63 |     0 | {
  64 |     0 |     return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
  65 |     0 | }
     |       | Unexecuted instantiation: chacha_avx.cpp:long long __vector(4) (anonymous namespace)::RotateLeft<12u>(long long __vector(4))
     |       | Unexecuted instantiation: chacha_avx.cpp:long long __vector(4) (anonymous namespace)::RotateLeft<7u>(long long __vector(4))
  66 |       |
  67 |       | template <>
  68 |       | inline __m256i RotateLeft<8>(const __m256i val)
  69 |     0 | {
  70 |     0 |     const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
  71 |     0 |                                          14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
  72 |     0 |     return _mm256_shuffle_epi8(val, mask);
  73 |     0 | }
  74 |       |
  75 |       | template <>
  76 |       | inline __m256i RotateLeft<16>(const __m256i val)
  77 |     0 | {
  78 |     0 |     const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
  79 |     0 |                                          13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
  80 |     0 |     return _mm256_shuffle_epi8(val, mask);
  81 |     0 | }
  82 |       |
  83 |       | #endif  // CRYPTOPP_AVX2_AVAILABLE
  84 |       |
  85 |       | ANONYMOUS_NAMESPACE_END
  86 |       |
  87 |       | NAMESPACE_BEGIN(CryptoPP)
  88 |       |
  89 |       | #if (CRYPTOPP_AVX2_AVAILABLE)
  90 |       |
  91 |       | void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
  92 |     0 | {
  93 |     0 |     const __m256i state0 = _mm256_broadcastsi128_si256(
  94 |     0 |         _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+0*4)));
  95 |     0 |     const __m256i state1 = _mm256_broadcastsi128_si256(
  96 |     0 |         _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+1*4)));
  97 |     0 |     const __m256i state2 = _mm256_broadcastsi128_si256(
  98 |     0 |         _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+2*4)));
  99 |     0 |     const __m256i state3 = _mm256_broadcastsi128_si256(
 100 |     0 |         _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));
 101 |       |
 102 |     0 |     const word32 C = 0xFFFFFFFFu - state[12];
 103 |     0 |     const __m256i CTR0 = _mm256_set_epi32(0, 0,     0, 0, 0, 0, C < 4, 4);
 104 |     0 |     const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5);
 105 |     0 |     const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6);
 106 |     0 |     const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7);
 107 |       |
 108 |     0 |     __m256i X0_0 = state0;
 109 |     0 |     __m256i X0_1 = state1;
 110 |     0 |     __m256i X0_2 = state2;
 111 |     0 |     __m256i X0_3 = _mm256_add_epi32(state3, CTR0);
 112 |       |
 113 |     0 |     __m256i X1_0 = state0;
 114 |     0 |     __m256i X1_1 = state1;
 115 |     0 |     __m256i X1_2 = state2;
 116 |     0 |     __m256i X1_3 = _mm256_add_epi32(state3, CTR1);
 117 |       |
 118 |     0 |     __m256i X2_0 = state0;
 119 |     0 |     __m256i X2_1 = state1;
 120 |     0 |     __m256i X2_2 = state2;
 121 |     0 |     __m256i X2_3 = _mm256_add_epi32(state3, CTR2);
 122 |       |
 123 |     0 |     __m256i X3_0 = state0;
 124 |     0 |     __m256i X3_1 = state1;
 125 |     0 |     __m256i X3_2 = state2;
 126 |     0 |     __m256i X3_3 = _mm256_add_epi32(state3, CTR3);
 127 |       |
 128 |     0 |     for (int i = static_cast<int>(rounds); i > 0; i -= 2)
 129 |     0 |     {
 130 |     0 |         X0_0 = _mm256_add_epi32(X0_0, X0_1);
 131 |     0 |         X1_0 = _mm256_add_epi32(X1_0, X1_1);
 132 |     0 |         X2_0 = _mm256_add_epi32(X2_0, X2_1);
 133 |     0 |         X3_0 = _mm256_add_epi32(X3_0, X3_1);
 134 |       |
 135 |     0 |         X0_3 = _mm256_xor_si256(X0_3, X0_0);
 136 |     0 |         X1_3 = _mm256_xor_si256(X1_3, X1_0);
 137 |     0 |         X2_3 = _mm256_xor_si256(X2_3, X2_0);
 138 |     0 |         X3_3 = _mm256_xor_si256(X3_3, X3_0);
 139 |       |
 140 |     0 |         X0_3 = RotateLeft<16>(X0_3);
 141 |     0 |         X1_3 = RotateLeft<16>(X1_3);
 142 |     0 |         X2_3 = RotateLeft<16>(X2_3);
 143 |     0 |         X3_3 = RotateLeft<16>(X3_3);
 144 |       |
 145 |     0 |         X0_2 = _mm256_add_epi32(X0_2, X0_3);
 146 |     0 |         X1_2 = _mm256_add_epi32(X1_2, X1_3);
 147 |     0 |         X2_2 = _mm256_add_epi32(X2_2, X2_3);
 148 |     0 |         X3_2 = _mm256_add_epi32(X3_2, X3_3);
 149 |       |
 150 |     0 |         X0_1 = _mm256_xor_si256(X0_1, X0_2);
 151 |     0 |         X1_1 = _mm256_xor_si256(X1_1, X1_2);
 152 |     0 |         X2_1 = _mm256_xor_si256(X2_1, X2_2);
 153 |     0 |         X3_1 = _mm256_xor_si256(X3_1, X3_2);
 154 |       |
 155 |     0 |         X0_1 = RotateLeft<12>(X0_1);
 156 |     0 |         X1_1 = RotateLeft<12>(X1_1);
 157 |     0 |         X2_1 = RotateLeft<12>(X2_1);
 158 |     0 |         X3_1 = RotateLeft<12>(X3_1);
 159 |       |
 160 |     0 |         X0_0 = _mm256_add_epi32(X0_0, X0_1);
 161 |     0 |         X1_0 = _mm256_add_epi32(X1_0, X1_1);
 162 |     0 |         X2_0 = _mm256_add_epi32(X2_0, X2_1);
 163 |     0 |         X3_0 = _mm256_add_epi32(X3_0, X3_1);
 164 |       |
 165 |     0 |         X0_3 = _mm256_xor_si256(X0_3, X0_0);
 166 |     0 |         X1_3 = _mm256_xor_si256(X1_3, X1_0);
 167 |     0 |         X2_3 = _mm256_xor_si256(X2_3, X2_0);
 168 |     0 |         X3_3 = _mm256_xor_si256(X3_3, X3_0);
 169 |       |
 170 |     0 |         X0_3 = RotateLeft<8>(X0_3);
 171 |     0 |         X1_3 = RotateLeft<8>(X1_3);
 172 |     0 |         X2_3 = RotateLeft<8>(X2_3);
 173 |     0 |         X3_3 = RotateLeft<8>(X3_3);
 174 |       |
 175 |     0 |         X0_2 = _mm256_add_epi32(X0_2, X0_3);
 176 |     0 |         X1_2 = _mm256_add_epi32(X1_2, X1_3);
 177 |     0 |         X2_2 = _mm256_add_epi32(X2_2, X2_3);
 178 |     0 |         X3_2 = _mm256_add_epi32(X3_2, X3_3);
 179 |       |
 180 |     0 |         X0_1 = _mm256_xor_si256(X0_1, X0_2);
 181 |     0 |         X1_1 = _mm256_xor_si256(X1_1, X1_2);
 182 |     0 |         X2_1 = _mm256_xor_si256(X2_1, X2_2);
 183 |     0 |         X3_1 = _mm256_xor_si256(X3_1, X3_2);
 184 |       |
 185 |     0 |         X0_1 = RotateLeft<7>(X0_1);
 186 |     0 |         X1_1 = RotateLeft<7>(X1_1);
 187 |     0 |         X2_1 = RotateLeft<7>(X2_1);
 188 |     0 |         X3_1 = RotateLeft<7>(X3_1);
 189 |       |
 190 |     0 |         X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
 191 |     0 |         X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
 192 |     0 |         X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));
 193 |       |
 194 |     0 |         X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
 195 |     0 |         X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
 196 |     0 |         X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));
 197 |       |
 198 |     0 |         X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
 199 |     0 |         X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
 200 |     0 |         X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));
 201 |       |
 202 |     0 |         X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
 203 |     0 |         X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
 204 |     0 |         X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));
 205 |       |
 206 |     0 |         X0_0 = _mm256_add_epi32(X0_0, X0_1);
 207 |     0 |         X1_0 = _mm256_add_epi32(X1_0, X1_1);
 208 |     0 |         X2_0 = _mm256_add_epi32(X2_0, X2_1);
 209 |     0 |         X3_0 = _mm256_add_epi32(X3_0, X3_1);
 210 |       |
 211 |     0 |         X0_3 = _mm256_xor_si256(X0_3, X0_0);
 212 |     0 |         X1_3 = _mm256_xor_si256(X1_3, X1_0);
 213 |     0 |         X2_3 = _mm256_xor_si256(X2_3, X2_0);
 214 |     0 |         X3_3 = _mm256_xor_si256(X3_3, X3_0);
 215 |       |
 216 |     0 |         X0_3 = RotateLeft<16>(X0_3);
 217 |     0 |         X1_3 = RotateLeft<16>(X1_3);
 218 |     0 |         X2_3 = RotateLeft<16>(X2_3);
 219 |     0 |         X3_3 = RotateLeft<16>(X3_3);
 220 |       |
 221 |     0 |         X0_2 = _mm256_add_epi32(X0_2, X0_3);
 222 |     0 |         X1_2 = _mm256_add_epi32(X1_2, X1_3);
 223 |     0 |         X2_2 = _mm256_add_epi32(X2_2, X2_3);
 224 |     0 |         X3_2 = _mm256_add_epi32(X3_2, X3_3);
 225 |       |
 226 |     0 |         X0_1 = _mm256_xor_si256(X0_1, X0_2);
 227 |     0 |         X1_1 = _mm256_xor_si256(X1_1, X1_2);
 228 |     0 |         X2_1 = _mm256_xor_si256(X2_1, X2_2);
 229 |     0 |         X3_1 = _mm256_xor_si256(X3_1, X3_2);
 230 |       |
 231 |     0 |         X0_1 = RotateLeft<12>(X0_1);
 232 |     0 |         X1_1 = RotateLeft<12>(X1_1);
 233 |     0 |         X2_1 = RotateLeft<12>(X2_1);
 234 |     0 |         X3_1 = RotateLeft<12>(X3_1);
 235 |       |
 236 |     0 |         X0_0 = _mm256_add_epi32(X0_0, X0_1);
 237 |     0 |         X1_0 = _mm256_add_epi32(X1_0, X1_1);
 238 |     0 |         X2_0 = _mm256_add_epi32(X2_0, X2_1);
 239 |     0 |         X3_0 = _mm256_add_epi32(X3_0, X3_1);
 240 |       |
 241 |     0 |         X0_3 = _mm256_xor_si256(X0_3, X0_0);
 242 |     0 |         X1_3 = _mm256_xor_si256(X1_3, X1_0);
 243 |     0 |         X2_3 = _mm256_xor_si256(X2_3, X2_0);
 244 |     0 |         X3_3 = _mm256_xor_si256(X3_3, X3_0);
 245 |       |
 246 |     0 |         X0_3 = RotateLeft<8>(X0_3);
 247 |     0 |         X1_3 = RotateLeft<8>(X1_3);
 248 |     0 |         X2_3 = RotateLeft<8>(X2_3);
 249 |     0 |         X3_3 = RotateLeft<8>(X3_3);
 250 |       |
 251 |     0 |         X0_2 = _mm256_add_epi32(X0_2, X0_3);
 252 |     0 |         X1_2 = _mm256_add_epi32(X1_2, X1_3);
 253 |     0 |         X2_2 = _mm256_add_epi32(X2_2, X2_3);
 254 |     0 |         X3_2 = _mm256_add_epi32(X3_2, X3_3);
 255 |       |
 256 |     0 |         X0_1 = _mm256_xor_si256(X0_1, X0_2);
 257 |     0 |         X1_1 = _mm256_xor_si256(X1_1, X1_2);
 258 |     0 |         X2_1 = _mm256_xor_si256(X2_1, X2_2);
 259 |     0 |         X3_1 = _mm256_xor_si256(X3_1, X3_2);
 260 |       |
 261 |     0 |         X0_1 = RotateLeft<7>(X0_1);
 262 |     0 |         X1_1 = RotateLeft<7>(X1_1);
 263 |     0 |         X2_1 = RotateLeft<7>(X2_1);
 264 |     0 |         X3_1 = RotateLeft<7>(X3_1);
 265 |       |
 266 |     0 |         X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
 267 |     0 |         X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
 268 |     0 |         X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));
 269 |       |
 270 |     0 |         X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
 271 |     0 |         X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
 272 |     0 |         X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));
 273 |       |
 274 |     0 |         X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
 275 |     0 |         X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
 276 |     0 |         X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));
 277 |       |
 278 |     0 |         X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
 279 |     0 |         X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
 280 |     0 |         X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
 281 |     0 |     }
 282 |       |
 283 |     0 |     X0_0 = _mm256_add_epi32(X0_0, state0);
 284 |     0 |     X0_1 = _mm256_add_epi32(X0_1, state1);
 285 |     0 |     X0_2 = _mm256_add_epi32(X0_2, state2);
 286 |     0 |     X0_3 = _mm256_add_epi32(X0_3, state3);
 287 |     0 |     X0_3 = _mm256_add_epi32(X0_3, CTR0);
 288 |       |
 289 |     0 |     X1_0 = _mm256_add_epi32(X1_0, state0);
 290 |     0 |     X1_1 = _mm256_add_epi32(X1_1, state1);
 291 |     0 |     X1_2 = _mm256_add_epi32(X1_2, state2);
 292 |     0 |     X1_3 = _mm256_add_epi32(X1_3, state3);
 293 |     0 |     X1_3 = _mm256_add_epi32(X1_3, CTR1);
 294 |       |
 295 |     0 |     X2_0 = _mm256_add_epi32(X2_0, state0);
 296 |     0 |     X2_1 = _mm256_add_epi32(X2_1, state1);
 297 |     0 |     X2_2 = _mm256_add_epi32(X2_2, state2);
 298 |     0 |     X2_3 = _mm256_add_epi32(X2_3, state3);
 299 |     0 |     X2_3 = _mm256_add_epi32(X2_3, CTR2);
 300 |       |
 301 |     0 |     X3_0 = _mm256_add_epi32(X3_0, state0);
 302 |     0 |     X3_1 = _mm256_add_epi32(X3_1, state1);
 303 |     0 |     X3_2 = _mm256_add_epi32(X3_2, state2);
 304 |     0 |     X3_3 = _mm256_add_epi32(X3_3, state3);
 305 |     0 |     X3_3 = _mm256_add_epi32(X3_3, CTR3);
 306 |       |
 307 |     0 |     if (input)
 308 |     0 |     {
 309 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
 310 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
 311 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+0*32)))));
 312 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
 313 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
 314 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+1*32)))));
 315 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
 316 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
 317 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+2*32)))));
 318 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
 319 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
 320 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+3*32)))));
 321 |     0 |     }
 322 |     0 |     else
 323 |     0 |     {
 324 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
 325 |     0 |             _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
 326 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
 327 |     0 |             _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
 328 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
 329 |     0 |             _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
 330 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
 331 |     0 |             _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
 332 |     0 |     }
 333 |       |
 334 |     0 |     if (input)
 335 |     0 |     {
 336 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
 337 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
 338 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+4*32)))));
 339 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
 340 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
 341 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+5*32)))));
 342 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
 343 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
 344 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+6*32)))));
 345 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
 346 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
 347 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+7*32)))));
 348 |     0 |     }
 349 |     0 |     else
 350 |     0 |     {
 351 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
 352 |     0 |             _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
 353 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
 354 |     0 |             _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
 355 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
 356 |     0 |             _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
 357 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
 358 |     0 |             _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
 359 |     0 |     }
 360 |       |
 361 |     0 |     if (input)
 362 |     0 |     {
 363 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
 364 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
 365 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+8*32)))));
 366 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
 367 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
 368 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+9*32)))));
 369 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
 370 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
 371 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+10*32)))));
 372 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
 373 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
 374 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+11*32)))));
 375 |     0 |     }
 376 |     0 |     else
 377 |     0 |     {
 378 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
 379 |     0 |             _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
 380 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
 381 |     0 |             _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
 382 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
 383 |     0 |             _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
 384 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
 385 |     0 |             _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
 386 |     0 |     }
 387 |       |
 388 |     0 |     if (input)
 389 |     0 |     {
 390 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
 391 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
 392 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+12*32)))));
 393 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
 394 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
 395 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+13*32)))));
 396 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
 397 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
 398 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+14*32)))));
 399 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
 400 |     0 |             _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
 401 |     0 |             _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+15*32)))));
 402 |     0 |     }
 403 |     0 |     else
 404 |     0 |     {
 405 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
 406 |     0 |             _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
 407 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
 408 |     0 |             _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
 409 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
 410 |     0 |             _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
 411 |     0 |         _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
 412 |     0 |             _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
 413 |     0 |     }
 414 |       |
 415 |       |     // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
 416 |     0 |     _mm256_zeroupper();
 417 |     0 | }
 418 |       |
 419 |       | #endif  // CRYPTOPP_AVX2_AVAILABLE
 420 |       |
 421 |       | NAMESPACE_END
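
Note: every executable line of chacha_avx.cpp is reported with a count of 0 and both RotateLeft instantiations are marked unexecuted, so the AVX2 keystream path was never exercised by this test run. For orientation when reading the listing: ChaCha_OperateKeystream_AVX2 performs sixteen 32-byte stores per call (512 bytes, i.e. eight 64-byte ChaCha blocks); each X*_0..X*_3 register group carries the four state rows of two blocks, one per 128-bit lane, and the round loop (lines 128-281) applies the standard ChaCha quarter-round with rotations of 16, 12, 8 and 7. The scalar sketch below is not part of the file under test; the names rotl32 and chacha_quarter_round are illustrative only, and the code simply restates the quarter-round so the vectorized add/xor/rotate pattern is easier to follow.

    // Scalar ChaCha quarter-round (RFC 8439) -- a minimal reference sketch,
    // not taken from chacha_avx.cpp. The AVX2 loop performs the same steps
    // with _mm256_add_epi32, _mm256_xor_si256 and RotateLeft<16/12/8/7>,
    // operating on whole rows of several blocks at once.
    #include <cstdint>

    static inline uint32_t rotl32(uint32_t v, unsigned int r)
    {
        // r is 16, 12, 8 or 7 here, so the shift amounts never reach 0 or 32
        return (v << r) | (v >> (32 - r));
    }

    static inline void chacha_quarter_round(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d)
    {
        a += b; d ^= a; d = rotl32(d, 16);
        c += d; b ^= c; b = rotl32(b, 12);
        a += b; d ^= a; d = rotl32(d, 8);
        c += d; b ^= c; b = rotl32(b, 7);
    }

The RotateLeft<8> and RotateLeft<16> specializations in the listing replace the generic shift-and-OR with a single _mm256_shuffle_epi8, since a rotation by a whole number of bytes can be expressed as a byte permutation.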