Coverage Report

Created: 2022-06-23 06:44

/src/botan/src/lib/block/threefish_512/threefish_512_avx2/threefish_512_avx2.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Threefish-512 using AVX2
3
* (C) 2013,2016 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/internal/threefish_512.h>
9
#include <immintrin.h>
10
11
namespace Botan {
12
13
namespace {
14
15
BOTAN_FUNC_ISA("avx2")
inline void interleave_epi64(__m256i& X0, __m256i& X1)
   {
   /*
   * Regroup the eight qwords held in the pair (X0,X1) by index parity:
   *
   *   (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7)
   *
   * The unpack step pairs up qwords within each 128-bit half, which
   * leaves the two middle qwords of each result swapped; the 64-bit
   * permute (selector 3,1,2,0) puts them back in ascending order.
   */
   const __m256i even_words = _mm256_unpacklo_epi64(X0, X1);
   const __m256i odd_words = _mm256_unpackhi_epi64(X0, X1);

   X0 = _mm256_permute4x64_epi64(even_words, _MM_SHUFFLE(3,1,2,0));
   X1 = _mm256_permute4x64_epi64(odd_words, _MM_SHUFFLE(3,1,2,0));
   }
27
28
BOTAN_FUNC_ISA("avx2")
inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
   {
   /*
   * Performs the same two steps as interleave_epi64 in the opposite
   * order: first swap the two middle qwords of each register
   * (selector 3,1,2,0), then unpack the low and high qwords of each
   * 128-bit half across the pair.
   */
   const __m256i swapped0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
   const __m256i swapped1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));

   X0 = _mm256_unpacklo_epi64(swapped0, swapped1);
   X1 = _mm256_unpackhi_epi64(swapped0, swapped1);
   }
37
38
BOTAN_FUNC_ISA("avx2")
inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
   {
   /*
   Advance the vectorised key schedule by one step, in place.

   The table lists the key word indices held by the three vectors as
   they are passed in (tuples are written highest qword first, the
   same order _mm256_set_epi64x takes its arguments). After the last
   row the values are back at the first row, so nothing has to be
   reloaded before the next block is processed.

                 R0        R1        R2
     K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3)
     K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5)
     K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7)

     K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0)
     K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2)
     K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4)

     K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6)
     K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8)
     K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1)

   Going from one row to the next:
     the first column is the previous R2, untouched (R2 is only read)
     R0 becomes R1 shifted down one qword, with old R0's lowest qword on top
     R1 becomes R2 shifted down one qword, with old R1's lowest qword on top

   This costs 3 permutes and 2 blends; a cheaper sequence may exist.
   */
   const __m256i r0_lowest = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));
   const __m256i r1_shifted = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
   const __m256i r2_shifted = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));

   // Mask 0xC0 takes dwords 6 and 7 (the highest qword) from the second
   // operand and everything else from the first.
   R0 = _mm256_blend_epi32(r1_shifted, r0_lowest, 0xC0);
   R1 = _mm256_blend_epi32(r2_shifted, r1_shifted, 0xC0);
   }
73
74
75
}
76
77
/*
* Encrypt `blocks` consecutive 64-byte blocks from `in` into `out`.
*
* Input and output are accessed with unaligned loads/stores, so no
* particular alignment is required. Blocks are processed two at a time
* while at least two remain; a trailing single block goes through the
* one-block path.
*
* Each block is split into even-indexed and odd-indexed words
* (interleave_epi64), run through 9 groups of 8 rounds with a subkey
* injection before the first round and after every 4 rounds (subkeys
* 0..18), then merged back (deinterleave_epi64).
*/
BOTAN_FUNC_ISA("avx2")
void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   // Per-lane left-rotation amounts for the eight rounds of a round group
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

   // One round on a single block: add, rotate left by SHL, xor, then
   // permute the words of both halves for the next round.
#define THREEFISH_ROUND(X0, X1, SHL)                                                \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

   // Same round applied to two independent blocks, (X0,X1) and (X2,X3)
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL)                                      \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X2 = _mm256_add_epi64(X2, X3);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X3 = _mm256_xor_si256(X3, X2);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

   // Add subkey R to one block. K0/K1 are the key vectors for the even
   // and odd words. T0I/T1I pick lanes of the tweak vector T (which must
   // be in scope at the expansion site): T[T0I] lands in the top lane of
   // X0, T[T1I] in lane 2 of X1, and every other lane receives T[0],
   // which is zero. The subkey number R is added to the top lane of X1.
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                        \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
   } while(0)

   // Two-block subkey injection; the subkey number is folded into T1
   // once so that it is shared by both blocks.
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)              \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0));       \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X2 = _mm256_add_epi64(X2, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X3 = _mm256_add_epi64(X3, K1);                                             \
      T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X2 = _mm256_add_epi64(X2, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
      X3 = _mm256_add_epi64(X3, T1);                                             \
   } while(0)

   // Eight rounds on one block: advance the key vectors, then 4 rounds +
   // subkey R, 4 rounds + subkey R+1.
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2) \
   do {                                                           \
      rotate_keys(K1, K2, K0);                                    \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                          \
      THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1);            \
                                                                  \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                          \
      THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0);          \
   } while(0)

   // Eight rounds on two blocks at once
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \
   do {                                                                     \
      rotate_keys(K1, K2, K0);                                              \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                          \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1);            \
                                                                            \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                          \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0);          \
   } while(0)

   // Starting point of the rotating key schedule (see rotate_keys). The
   // vectors are mutated during a block and return to these values once
   // all nine round groups have run.
   __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
   __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      // Tweak lanes, lowest first: 0, T_64[2], T_64[1], T_64[0]
      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   // At most one block is left at this point
   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);

      THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3);

      THREEFISH_ENC_8_ROUNDS(X0, X1,  1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

   // Clear every vector register so no key or state material is left behind
   _mm256_zeroall();

   // Undefine exactly the helpers defined above so that none of them
   // leaks into the code that follows this function.
#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_ENC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
   }
255
256
/*
* Decrypt `blocks` consecutive 64-byte blocks from `in` into `out`.
*
* Mirrors avx2_encrypt_n: subkeys are subtracted in the order 18..0 and
* every round is undone (inverse word permutation, xor, rotate right,
* subtract). Two blocks are handled per iteration while possible, and a
* remaining single block goes through the one-block path.
*/
BOTAN_FUNC_ISA("avx2")
void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   // Per-lane rotation amounts; here they are applied as right rotations
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

   // ---- single-block helpers ----

   // Undo one round: reverse the word permutation, xor, rotate right by
   // SHR, subtract.
#define THREEFISH_ROUND(X0, X1, SHR)                                                \
   do {                                                                             \
      const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR);            \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X0 = _mm256_sub_epi64(X0, X1);                                                \
   } while(0)

   // Subtract subkey R: key vectors K0/K1, the tweak lanes selected by
   // T0I/T1I (taken from T at the expansion site) and the subkey number.
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                        \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_sub_epi64(X0, K0);                                             \
      X1 = _mm256_sub_epi64(X1, K1);                                             \
      X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0));                  \
      X0 = _mm256_sub_epi64(X0, T0);                                             \
      X1 = _mm256_sub_epi64(X1, T1);                                             \
   } while(0)

   // Undo eight rounds: remove subkey R+1, rounds 8..5, remove subkey R,
   // rounds 4..1.
#define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \
   do {                                                           \
      THREEFISH_INJECT_KEY(X0, X1, R+1, K2, K3, T2, T0);          \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                          \
                                                                  \
      THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1);            \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                          \
   } while(0)

   // ---- two-block helpers ----

#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHR)                                      \
   do {                                                                             \
      const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR);            \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));                   \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X3 = _mm256_xor_si256(X3, X2);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X0 = _mm256_sub_epi64(X0, X1);                                                \
      X2 = _mm256_sub_epi64(X2, X3);                                                \
   } while(0)

   // The subkey number is first added into T1, and the combined value is
   // then subtracted from both odd-word vectors.
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)              \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0));       \
      X0 = _mm256_sub_epi64(X0, K0);                                             \
      X2 = _mm256_sub_epi64(X2, K0);                                             \
      X1 = _mm256_sub_epi64(X1, K1);                                             \
      X3 = _mm256_sub_epi64(X3, K1);                                             \
      T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_sub_epi64(X0, T0);                                             \
      X2 = _mm256_sub_epi64(X2, T0);                                             \
      X1 = _mm256_sub_epi64(X1, T1);                                             \
      X3 = _mm256_sub_epi64(X3, T1);                                             \
   } while(0)

#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
   do {                                                                     \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K2, K3, T2, T0);          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                          \
                                                                            \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1);            \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                          \
   } while(0)

   /*
   All nine positions of the key schedule are built up front rather than
   being rotated on the fly: Kn holds K[n], K[n+2], K[n+4], K[n+6]
   (indices mod 9), lowest lane first. This takes nine vectors where two
   or three that are permuted as the rounds progress would be enough.
   */
   const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
   const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
   const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
   const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
   const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
   const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
   const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
   const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

   // Tweak lanes, lowest first: 0, T_64[2], T_64[1], T_64[0]. The value
   // is the same for every block, so it is built once here.
   const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

   const __m256i* src = reinterpret_cast<const __m256i*>(in);
   __m256i* dst = reinterpret_cast<__m256i*>(out);

   // Main path: two blocks (four vectors) per iteration
   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(src);
      __m256i X1 = _mm256_loadu_si256(src + 1);
      __m256i X2 = _mm256_loadu_si256(src + 2);
      __m256i X3 = _mm256_loadu_si256(src + 3);
      src += 4;

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  1, K1,K2,K3, 1, 2, 3);

      // Finally remove subkey 0
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(dst, X0);
      _mm256_storeu_si256(dst + 1, X1);
      _mm256_storeu_si256(dst + 2, X2);
      _mm256_storeu_si256(dst + 3, X3);
      dst += 4;

      blocks -= 2;
      }

   // Tail: whatever is left (at most one block)
   while(blocks > 0)
      {
      __m256i X0 = _mm256_loadu_si256(src);
      __m256i X1 = _mm256_loadu_si256(src + 1);
      src += 2;

      interleave_epi64(X0, X1);

      THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  1, K1,K2,K3, 1, 2, 3);

      THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(dst, X0);
      _mm256_storeu_si256(dst + 1, X1);
      dst += 2;

      --blocks;
      }

#undef THREEFISH_DEC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_DEC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2

   // Clear every vector register so no key or state material is left behind
   _mm256_zeroall();
   }
443
444
}