Coverage Report

Created: 2023-02-13 06:21

/src/botan/src/lib/block/threefish_512/threefish_512_avx2/threefish_512_avx2.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Threefish-512 using AVX2
3
* (C) 2013,2016 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/internal/threefish_512.h>
9
#include <botan/internal/simd_avx2.h>
10
#include <immintrin.h>
11
12
namespace Botan {
13
14
namespace {
15
16
/*
* Regroup the eight 64-bit words held in X0:X1 so that the even-indexed
* words end up in X0 and the odd-indexed words in X1:
*
*   (W0,W1,W2,W3),(W4,W5,W6,W7) -> (W0,W2,W4,W6),(W1,W3,W5,W7)
*/
BOTAN_AVX2_FN
inline void interleave_epi64(__m256i& X0, __m256i& X1)
   {
   // Within each 128-bit lane, pair up the low (resp. high) qwords of X0 and X1
   const __m256i even_words = _mm256_unpacklo_epi64(X0, X1);
   const __m256i odd_words = _mm256_unpackhi_epi64(X0, X1);

   // Swap the two middle qwords so the words come out in ascending order
   X0 = _mm256_permute4x64_epi64(even_words, _MM_SHUFFLE(3,1,2,0));
   X1 = _mm256_permute4x64_epi64(odd_words, _MM_SHUFFLE(3,1,2,0));
   }
28
29
/*
* Undo interleave_epi64: applies the same two steps in the opposite order
* (swap the middle qwords of each register first, then unpack low/high
* pairs), restoring the original word order across X0:X1.
*/
BOTAN_AVX2_FN
inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
   {
   // Swap the two middle qwords of each input register
   const __m256i evens_swapped = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
   const __m256i odds_swapped = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));

   // Re-pair the qwords lane by lane
   X0 = _mm256_unpacklo_epi64(evens_swapped, odds_swapped);
   X1 = _mm256_unpackhi_epi64(evens_swapped, odds_swapped);
   }
38
39
/*
* Advance the vectorized key schedule by one step.
*
* R0 and R1 are updated in place; R2 is only read. With qword 0 written
* last in each tuple below, the register contents progress like so, and
* return to their starting values once all rounds are complete, so the
* keys need not be reloaded before processing the next block:
*
*                R0        R1        R2
*    K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3)
*    K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5)
*    K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7)
*
*    K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0)
*    K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2)
*    K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4)
*
*    K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6)
*    K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8)
*    K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1)
*
* In terms of the incoming values:
*    new R0 = (R1[1], R1[2], R1[3], R0[0])   (qword 0 first)
*    new R1 = (R2[1], R2[2], R2[3], R1[0])
*
* Uses 3 permutes and 2 blends, is there a faster way?
*/
BOTAN_AVX2_FN
inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
   {
   // Rotate R1 and R2 down by one qword; qword 0 wraps around to the top
   const __m256i r1_rotated = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
   const __m256i r2_rotated = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));

   // Broadcast R0[0]; only the top qword of this value is consumed below
   const __m256i r0_lowest = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));

   // Mask 0xC0 selects dwords 6-7 (the top qword) from the second operand
   R0 = _mm256_blend_epi32(r1_rotated, r0_lowest, 0xC0);
   R1 = _mm256_blend_epi32(r2_rotated, r1_rotated, 0xC0);
   }
74
75
76
}
77
78
/*
* Encrypt `blocks` consecutive 64-byte blocks from `in` into `out` using
* AVX2.
*
* Blocks are processed two at a time while at least two remain; a final
* odd block is handled by the single-block path. Each block is loaded as
* two 256-bit vectors with unaligned loads, interleaved into even/odd
* word vectors, run through an initial key injection followed by nine
* groups of eight rounds (two key injections per group), deinterleaved
* and stored with unaligned stores.
*
* Reads the key schedule words m_K[0..8] and the tweak words m_T[0..2].
*
* @param in      input bytes, 64 * blocks bytes are read
* @param out     output bytes, 64 * blocks bytes are written
* @param blocks  number of 64-byte blocks to process
*/
BOTAN_AVX2_FN
void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   // Per-qword left-rotation amounts for each of the eight round types
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

   /*
   * One round on a single block: X0 += X1, X1 = rotl(X1, SHL) ^ X0,
   * followed by the word permutation expressed as two qword shuffles.
   */
#define THREEFISH_ROUND(X0, X1, SHL)                                                \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

   /* Same round applied to two independent blocks (X0,X1) and (X2,X3) */
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL)                                      \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X2 = _mm256_add_epi64(X2, X3);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X3 = _mm256_xor_si256(X3, X2);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

   /*
   * Key injection for one block: adds K0 to X0 and K1 to X1, the round
   * counter R to the top qword of X1, and tweak words selected from the
   * vector T by the qword indices T0I/T1I. Qword 0 of T is zero, so the
   * shuffles leave a tweak word only in qword 3 of the X0 addend and in
   * qword 2 of the X1 addend.
   */
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                        \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
   } while(0)

   /*
   * Key injection for two blocks; the round counter is folded into the
   * X1/X3 tweak addend once and shared by both blocks.
   */
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)              \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0));       \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X2 = _mm256_add_epi64(X2, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X3 = _mm256_add_epi64(X3, K1);                                             \
      T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X2 = _mm256_add_epi64(X2, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
      X3 = _mm256_add_epi64(X3, T1);                                             \
   } while(0)

   /*
   * Eight rounds on one block with a key injection after each group of
   * four. rotate_keys first advances K1 and K2 in place (K0 is only read).
   */
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2)        \
   do {                                                        \
      rotate_keys(K1, K2, K0);                                 \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                       \
      THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1);         \
                                                               \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                       \
      THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0);       \
   } while(0)

   /* Two-block variant of THREEFISH_ENC_8_ROUNDS */
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \
   do {                                                                  \
      rotate_keys(K1, K2, K0);                                           \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                       \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1);         \
                                                                         \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                       \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0);       \
   } while(0)

   // Starting key vectors; rotate_keys mutates these as the rounds progress
   __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
   __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);

   // Tweak vector read by the key injection macros. It does not depend on
   // the block being processed, so it is built once for both loops.
   const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   // At most one block remains after the two-block loop
   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      interleave_epi64(X0, X1);

      THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3);

      THREEFISH_ENC_8_ROUNDS(X0, X1,  1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

   // Zero every YMM register before returning (presumably so key-dependent
   // values do not linger in vector registers)
   _mm256_zeroall();

   // Undefine every helper macro defined above so none leaks past this function
#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_ENC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
   }
256
257
/*
* Decrypt `blocks` consecutive 64-byte blocks from `in` into `out` using
* AVX2.
*
* Mirrors the encryption routine in reverse: nine groups of eight inverse
* rounds (each group begins with a key subtraction and has a second one
* halfway through), then a final key subtraction. Pairs of blocks are
* processed together while at least two remain; a trailing single block
* takes the one-block path.
*
* Reads the key schedule words m_K[0..8] and the tweak words m_T[0..2].
*
* @param in      input bytes, 64 * blocks bytes are read
* @param out     output bytes, 64 * blocks bytes are written
* @param blocks  number of 64-byte blocks to process
*/
BOTAN_AVX2_FN
void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   // Per-qword rotation amounts, one vector per round type
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

   /*
   * Inverse round on one block: shuffle the qwords of X0 and X1, then
   * X1 = rotr(X1 ^ X0, SHR) and X0 -= X1.
   */
#define THREEFISH_ROUND(X0, X1, SHR)                                                \
   do {                                                                             \
      const __m256i LEFT_AMT = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR);       \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      const __m256i WRAPPED = _mm256_sllv_epi64(X1, LEFT_AMT);                      \
      const __m256i SHIFTED = _mm256_srlv_epi64(X1, SHR);                           \
      X1 = _mm256_or_si256(WRAPPED, SHIFTED);                                       \
      X0 = _mm256_sub_epi64(X0, X1);                                                \
   } while(0)

   /* Inverse round applied to two independent blocks (X0,X1) and (X2,X3) */
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHR)                                      \
   do {                                                                             \
      const __m256i LEFT_AMT = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR);       \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));                   \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X3 = _mm256_xor_si256(X3, X2);                                                \
      const __m256i WRAPPED_1 = _mm256_sllv_epi64(X1, LEFT_AMT);                    \
      const __m256i SHIFTED_1 = _mm256_srlv_epi64(X1, SHR);                         \
      X1 = _mm256_or_si256(WRAPPED_1, SHIFTED_1);                                   \
      const __m256i WRAPPED_3 = _mm256_sllv_epi64(X3, LEFT_AMT);                    \
      const __m256i SHIFTED_3 = _mm256_srlv_epi64(X3, SHR);                         \
      X3 = _mm256_or_si256(WRAPPED_3, SHIFTED_3);                                   \
      X0 = _mm256_sub_epi64(X0, X1);                                                \
      X2 = _mm256_sub_epi64(X2, X3);                                                \
   } while(0)

   /*
   * Key subtraction for one block: removes KA from X0 and KB from X1,
   * the round counter R from the top qword of X1, and the tweak words
   * picked out of the vector T by the qword indices TA_IDX/TB_IDX.
   */
#define THREEFISH_INJECT_KEY(X0, X1, R, KA, KB, TA_IDX, TB_IDX)                            \
   do {                                                                                    \
      const __m256i TWEAK_A = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(TA_IDX, 0, 0, 0));   \
      const __m256i TWEAK_B = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, TB_IDX, 0, 0));   \
      X0 = _mm256_sub_epi64(X0, KA);                                                       \
      X1 = _mm256_sub_epi64(X1, KB);                                                       \
      X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0));                            \
      X0 = _mm256_sub_epi64(X0, TWEAK_A);                                                  \
      X1 = _mm256_sub_epi64(X1, TWEAK_B);                                                  \
   } while(0)

   /*
   * Key subtraction for two blocks. The round counter is added into the
   * second tweak vector once, and that sum is subtracted from X1 and X3.
   */
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, KA, KB, TA_IDX, TB_IDX)                  \
   do {                                                                                    \
      const __m256i TWEAK_A = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(TA_IDX, 0, 0, 0));   \
      __m256i TWEAK_B = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, TB_IDX, 0, 0));         \
      X0 = _mm256_sub_epi64(X0, KA);                                                       \
      X2 = _mm256_sub_epi64(X2, KA);                                                       \
      X1 = _mm256_sub_epi64(X1, KB);                                                       \
      X3 = _mm256_sub_epi64(X3, KB);                                                       \
      TWEAK_B = _mm256_add_epi64(TWEAK_B, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_sub_epi64(X0, TWEAK_A);                                                  \
      X2 = _mm256_sub_epi64(X2, TWEAK_A);                                                  \
      X1 = _mm256_sub_epi64(X1, TWEAK_B);                                                  \
      X3 = _mm256_sub_epi64(X3, TWEAK_B);                                                  \
   } while(0)

   /*
   * Eight inverse rounds on one block: subtract the (R+1) subkey, undo
   * rounds 8..5, subtract the R subkey, undo rounds 4..1.
   */
#define THREEFISH_DEC_8_ROUNDS(X0, X1, R, KA, KB, KC, TA, TB, TC)      \
   do {                                                      \
      THREEFISH_INJECT_KEY(X0, X1, R+1, KB, KC, TC, TA);     \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                     \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                     \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                     \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                     \
                                                             \
      THREEFISH_INJECT_KEY(X0, X1, R, KA, KB, TA, TB);       \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                     \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                     \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                     \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                     \
   } while(0)

   /* Two-block variant of THREEFISH_DEC_8_ROUNDS */
#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, KA, KB, KC, TA, TB, TC) \
   do {                                                                  \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, KB, KC, TC, TA);       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                       \
                                                                         \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, KA, KB, TA, TB);         \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                       \
   } while(0)

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   /*
   v1.0 key schedule: 9 ymm registers (only need 2 or 3)
   (0,1,2,3),(4,5,6,7) [8]
   then mutating with vpermq

   Unlike the encryption path, all nine key vectors are materialized up
   front here and never modified.
   */
   const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
   const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
   const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
   const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
   const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
   const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
   const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
   const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

   // Two blocks per iteration for as long as a full pair is available
   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      // Tweak vector referenced by name inside the key macros
      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 1, K1,K2,K3, 1, 2, 3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   // Whatever is left over (zero or one block) goes through the 1-block path
   while(blocks > 0)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);

      THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 1, K1,K2,K3, 1, 2, 3);

      THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);

      --blocks;
      }

#undef THREEFISH_DEC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_DEC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2

   // Zero every YMM register before returning
   _mm256_zeroall();
   }
444
445
}