/src/cryptopp/chacha_avx.cpp
// chacha_avx.cpp - written and placed in the public domain by
//                  Jack Lloyd and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// AVX2 instructions. A separate source file is needed because
// additional CXXFLAGS are required to enable the appropriate
// instruction sets in some build configurations.
//
// AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
// to Jack Lloyd and the Botan team for allowing us to use it.
//
// Here are some relative numbers for ChaCha8:
// * Intel Skylake, 3.0 GHz: AVX2 at 4411 MB/s; 0.57 cpb.
// * Intel Broadwell, 2.3 GHz: AVX2 at 3828 MB/s; 0.58 cpb.
// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.

#include "pch.h"
#include "config.h"

#include "chacha.h"
#include "misc.h"

#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
# include <immintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char CHACHA_AVX_FNAME[] = __FILE__;

// Sun Studio 12.4 is OK; 12.5 and 12.6 fail with a compile error.
#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif
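// MAYBE_CONST is used below in the casts applied to the input pointer
// for _mm256_loadu_si256; the affected Sun Studio releases appear to
// reject the const-qualified form.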

// VS2017 and global optimization bug. Also see
// https://github.com/weidai11/cryptopp/issues/649 and
// https://github.com/weidai11/cryptopp/issues/735. Issue 649
// affects AES, but the same problem shows up here. Issue 735 is
// the ChaCha AVX2 cut-in, where the bug surfaced again.
#if (CRYPTOPP_MSC_VERSION >= 1910) && (CRYPTOPP_MSC_VERSION <= 1916)
# ifndef CRYPTOPP_DEBUG
#  pragma optimize("", off)
#  pragma optimize("ts", on)
# endif
#endif

// The data is aligned, but Clang issues the warning based on the
// pointer type rather than the actual alignment of the variable and data.
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic ignored "-Wcast-align"
#endif

ANONYMOUS_NAMESPACE_BEGIN

#if (CRYPTOPP_AVX2_AVAILABLE)

template <unsigned int R>
inline __m256i RotateLeft(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
}

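// Rotations by a multiple of 8 bits can be performed with a byte
// shuffle instead of two shifts and an OR, so RotateLeft<8> and
// RotateLeft<16> are specialized to a single _mm256_shuffle_epi8
// using a per-lane byte-permutation mask.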
template <>
inline __m256i RotateLeft<8>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
                                         14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm256_shuffle_epi8(val, mask);
}

template <>
inline __m256i RotateLeft<16>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
                                         13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
    return _mm256_shuffle_epi8(val, mask);
}

#endif // CRYPTOPP_AVX2_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_AVX2_AVAILABLE)

void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    const __m256i state0 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+0*4)));
    const __m256i state1 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+1*4)));
    const __m256i state2 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+2*4)));
    const __m256i state3 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));
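    // Each 16-byte row of the ChaCha state is broadcast into both 128-bit
    // lanes of a ymm register, so every register set below works on two
    // blocks at once. With four register sets, one call produces eight
    // 64-byte keystream blocks (512 bytes).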
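    // Per-block counter increments. C measures the distance from state[12]
    // (the low counter word) to 0xFFFFFFFF, so an expression such as (C < 4)
    // evaluates to 1 exactly when state[12]+4 wraps, carrying into the next
    // word of the counter.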
    const word32 C = 0xFFFFFFFFu - state[12];
    const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4);
    const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5);
    const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6);
    const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7);

    __m256i X0_0 = state0;
    __m256i X0_1 = state1;
    __m256i X0_2 = state2;
    __m256i X0_3 = _mm256_add_epi32(state3, CTR0);

    __m256i X1_0 = state0;
    __m256i X1_1 = state1;
    __m256i X1_2 = state2;
    __m256i X1_3 = _mm256_add_epi32(state3, CTR1);

    __m256i X2_0 = state0;
    __m256i X2_1 = state1;
    __m256i X2_2 = state2;
    __m256i X2_3 = _mm256_add_epi32(state3, CTR2);

    __m256i X3_0 = state0;
    __m256i X3_1 = state1;
    __m256i X3_2 = state2;
    __m256i X3_3 = _mm256_add_epi32(state3, CTR3);

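    // Each iteration performs one ChaCha double round (a column round
    // followed by a diagonal round) on all four register sets. For
    // reference, a scalar quarter round on (a,b,c,d) is:
    //   a += b; d ^= a; d = rotl(d, 16);
    //   c += d; b ^= c; b = rotl(b, 12);
    //   a += b; d ^= a; d = rotl(d,  8);
    //   c += d; b ^= c; b = rotl(b,  7);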
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);

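        // Rotate the b, c and d rows within each 128-bit lane by one, two
        // and three 32-bit positions. This lines the diagonals up in the
        // columns, so the quarter rounds below (the same sequence as above)
        // operate on the diagonals.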
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);

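        // Undo the diagonalization: rotate the b, c and d rows back into
        // column order before the next iteration.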
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
    }

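    // Feed-forward: add the original input state (including the per-block
    // counters) to the working state to form the keystream, as the ChaCha
    // specification requires.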
    X0_0 = _mm256_add_epi32(X0_0, state0);
    X0_1 = _mm256_add_epi32(X0_1, state1);
    X0_2 = _mm256_add_epi32(X0_2, state2);
    X0_3 = _mm256_add_epi32(X0_3, state3);
    X0_3 = _mm256_add_epi32(X0_3, CTR0);

    X1_0 = _mm256_add_epi32(X1_0, state0);
    X1_1 = _mm256_add_epi32(X1_1, state1);
    X1_2 = _mm256_add_epi32(X1_2, state2);
    X1_3 = _mm256_add_epi32(X1_3, state3);
    X1_3 = _mm256_add_epi32(X1_3, CTR1);

    X2_0 = _mm256_add_epi32(X2_0, state0);
    X2_1 = _mm256_add_epi32(X2_1, state1);
    X2_2 = _mm256_add_epi32(X2_2, state2);
    X2_3 = _mm256_add_epi32(X2_3, state3);
    X2_3 = _mm256_add_epi32(X2_3, CTR2);

    X3_0 = _mm256_add_epi32(X3_0, state0);
    X3_1 = _mm256_add_epi32(X3_1, state1);
    X3_2 = _mm256_add_epi32(X3_2, state2);
    X3_3 = _mm256_add_epi32(X3_3, state3);
    X3_3 = _mm256_add_epi32(X3_3, CTR3);

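    // Write the keystream, XORing it with the input when a message is
    // supplied. Each register holds one row of two different blocks, one
    // per 128-bit lane. _mm256_permute2x128_si256 with control 1 + (3 << 4)
    // gathers the high lanes of a register pair, which hold the blocks at
    // counter offsets +0..+3; those four blocks are written first.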
    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+0*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+1*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+2*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+3*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+4*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+5*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+6*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+7*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
    }
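    // The remaining stores use control 0 + (2 << 4) to gather the low
    // 128-bit lanes, which hold the blocks at counter offsets +4..+7.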
    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+8*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+9*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+10*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+11*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+12*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+13*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+14*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+15*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
    }

    // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
    _mm256_zeroupper();
}

#endif // CRYPTOPP_AVX2_AVAILABLE

NAMESPACE_END