Coverage Report

Created: 2020-10-17 06:46

/src/botan/src/lib/block/threefish_512/threefish_512_avx2/threefish_512_avx2.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Threefish-512 using AVX2
3
* (C) 2013,2016 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/threefish_512.h>
9
#include <immintrin.h>
10
11
namespace Botan {
12
13
namespace {
14
15
BOTAN_FUNC_ISA("avx2")
inline void interleave_epi64(__m256i& X0, __m256i& X1)
   {
   /*
   * Regroup the eight 64-bit lanes held in X0:X1 by the parity of
   * their position. Writing lanes lowest first, the inputs
   *    X0 = (a0,a1,a2,a3)   X1 = (a4,a5,a6,a7)
   * become
   *    X0 = (a0,a2,a4,a6)   X1 = (a1,a3,a5,a7)
   * Both arguments are updated in place.
   */

   // The unpack instructions work on each 128-bit half separately, so
   // these come out as (a0,a4,a2,a6) and (a1,a5,a3,a7)
   const __m256i even_lanes = _mm256_unpacklo_epi64(X0, X1);
   const __m256i odd_lanes = _mm256_unpackhi_epi64(X0, X1);

   // Swapping the two middle lanes yields the final ordering
   X0 = _mm256_permute4x64_epi64(even_lanes, _MM_SHUFFLE(3,1,2,0));
   X1 = _mm256_permute4x64_epi64(odd_lanes, _MM_SHUFFLE(3,1,2,0));
   }
27
28
BOTAN_FUNC_ISA("avx2")
inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
   {
   /*
   * Inverse of interleave_epi64. Writing lanes lowest first, the inputs
   *    X0 = (b0,b1,b2,b3)   X1 = (c0,c1,c2,c3)
   * become
   *    X0 = (b0,c0,b1,c1)   X1 = (b2,c2,b3,c3)
   * Both arguments are updated in place.
   */

   // Swap the two middle lanes of each input first ...
   const __m256i b_swapped = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
   const __m256i c_swapped = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));

   // ... so that the per-128-bit-half unpacks zip the lanes back together
   X0 = _mm256_unpacklo_epi64(b_swapped, c_swapped);
   X1 = _mm256_unpackhi_epi64(b_swapped, c_swapped);
   }
37
38
BOTAN_FUNC_ISA("avx2")
inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
   {
   /*
   Advance two of the three key schedule registers. R2 is only read
   (it is passed by value); R0 and R1 are overwritten.

   Writing lanes lowest first, the update is

     new R0 = (R1[1], R1[2], R1[3], R0[0])
     new R1 = (R2[1], R2[2], R2[3], R1[0])

   The callers cycle which register plays which role, and the key
   schedule then progresses as shown below (key word indexes, highest
   lane first). The values loop back to the originals once all rounds
   of a block are done, so nothing has to be reloaded before starting
   the next block.

                 R0        R1        R2
     K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3)
     K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5)
     K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7)

     K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0)
     K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2)
     K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4)

     K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6)
     K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8)
     K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1)

   This costs 3 permutes and 2 blends; a cheaper sequence may exist.
   */

   // Lane 0 of R0 copied into every lane
   const __m256i r0_lane0 = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));
   // R1 and R2 with every lane moved down one position, lane 0 wrapping to the top
   const __m256i r1_rotated = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
   const __m256i r2_rotated = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));

   // Mask 0xC0 selects the two highest 32-bit elements (the top 64-bit
   // lane) from the second operand and everything else from the first
   R0 = _mm256_blend_epi32(r1_rotated, r0_lane0, 0xC0);
   R1 = _mm256_blend_epi32(r2_rotated, r1_rotated, 0xC0);
   }
73
74
75
}
76
77
/*
* Encrypt `blocks` consecutive 64-byte blocks from `in` into `out` using AVX2.
*
* Each block is read as two unaligned 256-bit vectors and then regrouped with
* interleave_epi64 so that one register holds the even-positioned 64-bit
* words and the other the odd-positioned ones. Blocks are handled two at a
* time while at least two remain; a final odd block goes through the
* single-block path.
*
* Reads key words m_K[0..8] and tweak words m_T[0..2].
*/
BOTAN_FUNC_ISA("avx2")
void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   // Per-lane rotation amounts for the eight rounds of one cycle
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

/*
* One round on a single block: X0 += X1, rotate each lane of X1 left by the
* matching lane of SHL, X1 ^= X0, then permute the lanes of both registers.
*/
#define THREEFISH_ROUND(X0, X1, SHL)                                                \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

/*
* Same round applied to two independent blocks, (X0,X1) and (X2,X3), with
* the operations of the two blocks alternated.
*/
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL)                                      \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X2 = _mm256_add_epi64(X2, X3);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X3 = _mm256_xor_si256(X3, X2);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

/*
* Add a subkey to one block: K0 to X0 and K1 to X1, the counter R to the top
* lane of X1, lane T0I of T to the top lane of X0 and lane T1I of T to lane 2
* of X1. The unselected lanes of the two tweak vectors are copies of lane 0
* of T, which is zero. Relies on a variable named T being in scope.
*/
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                        \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
   } while(0)

/*
* Two-block subkey addition. The counter is folded into T1 once so that it
* is shared by both blocks.
*/
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)              \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0));       \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X2 = _mm256_add_epi64(X2, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X3 = _mm256_add_epi64(X3, K1);                                             \
      T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X2 = _mm256_add_epi64(X2, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
      X3 = _mm256_add_epi64(X3, T1);                                             \
   } while(0)

/*
* Eight rounds on one block: advance the key registers, then four rounds,
* subkey R, four more rounds, subkey R+1.
*/
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2) \
   do {                                                           \
      rotate_keys(K1, K2, K0);                                    \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                          \
      THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1);            \
                                                                  \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                          \
      THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0);          \
   } while(0)

/* Two-block counterpart of THREEFISH_ENC_8_ROUNDS */
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \
   do {                                                                     \
      rotate_keys(K1, K2, K0);                                              \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                          \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1);            \
                                                                            \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                          \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0);          \
   } while(0)

   // Starting key registers. rotate_keys mutates them during a block and
   // they return to these values after the nine 8-round groups, so they are
   // loaded only once.
   __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
   __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);

   // Tweak words in lanes 3..1 with lane 0 zero. The value is the same for
   // every block, so it is built once here; the INJECT_KEY macros pick it
   // up by name.
   const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      // Initial subkey (counter 0), then 9 groups of 8 rounds
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   // At most one block is left at this point
   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      interleave_epi64(X0, X1);

      THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3);

      THREEFISH_ENC_8_ROUNDS(X0, X1,  1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

   // NOTE(review): presumably clears key-dependent values from the vector
   // registers before returning - confirm intent
   _mm256_zeroall();

   // Every macro defined in this function is removed again so that nothing
   // leaks into the rest of the translation unit
#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_ENC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
   }
255
256
/*
* Decrypt `blocks` consecutive 64-byte blocks from `in` into `out` using AVX2.
*
* Mirrors avx2_encrypt_n: subkeys are subtracted instead of added and the
* rounds run in the opposite order. Two blocks are processed per iteration
* while at least two remain, then a possible last block on its own.
*
* Reads key words m_K[0..8] and tweak words m_T[0..2].
*/
BOTAN_FUNC_ISA("avx2")
void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* key_words = m_K.data();
   const uint64_t* tweak_words = m_T.data();

   // Per-lane rotation amounts, identical to the encryption direction
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

/*
* One inverse round on a single block: permute the lanes, A1 ^= A0, rotate
* each lane of A1 right by the matching lane of ROT, then A0 -= A1.
*/
#define THREEFISH_ROUND(A0, A1, ROT)                                          \
   do {                                                                       \
      const __m256i rot_inv = _mm256_sub_epi64(_mm256_set1_epi64x(64), ROT);  \
      A0 = _mm256_permute4x64_epi64(A0, _MM_SHUFFLE(2, 1, 0, 3));             \
      A1 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(1, 2, 3, 0));             \
      A1 = _mm256_xor_si256(A1, A0);                                          \
      const __m256i a1_hi = _mm256_sllv_epi64(A1, rot_inv);                   \
      const __m256i a1_lo = _mm256_srlv_epi64(A1, ROT);                       \
      A1 = _mm256_or_si256(a1_hi, a1_lo);                                     \
      A0 = _mm256_sub_epi64(A0, A1);                                          \
   } while(0)

/* The same inverse round applied to two blocks, (A0,A1) and (B0,B1) */
#define THREEFISH_ROUND_2(A0, A1, B0, B1, ROT)                                \
   do {                                                                       \
      const __m256i rot_inv = _mm256_sub_epi64(_mm256_set1_epi64x(64), ROT);  \
      A0 = _mm256_permute4x64_epi64(A0, _MM_SHUFFLE(2, 1, 0, 3));             \
      B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3));             \
      A1 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(1, 2, 3, 0));             \
      B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(1, 2, 3, 0));             \
      A1 = _mm256_xor_si256(A1, A0);                                          \
      B1 = _mm256_xor_si256(B1, B0);                                          \
      const __m256i a1_hi = _mm256_sllv_epi64(A1, rot_inv);                   \
      const __m256i a1_lo = _mm256_srlv_epi64(A1, ROT);                       \
      A1 = _mm256_or_si256(a1_hi, a1_lo);                                     \
      const __m256i b1_hi = _mm256_sllv_epi64(B1, rot_inv);                   \
      const __m256i b1_lo = _mm256_srlv_epi64(B1, ROT);                       \
      B1 = _mm256_or_si256(b1_hi, b1_lo);                                     \
      A0 = _mm256_sub_epi64(A0, A1);                                          \
      B0 = _mm256_sub_epi64(B0, B1);                                          \
   } while(0)

/*
* Remove a subkey from one block: subtract KA from A0 and KB from A1, the
* counter R from the top lane of A1, lane TI0 of T from the top lane of A0
* and lane TI1 of T from lane 2 of A1. Relies on a variable named T being
* in scope at the point of use.
*/
#define THREEFISH_INJECT_KEY(A0, A1, R, KA, KB, TI0, TI1)                         \
   do {                                                                           \
      const __m256i tw0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(TI0, 0, 0, 0)); \
      const __m256i tw1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, TI1, 0, 0)); \
      A0 = _mm256_sub_epi64(A0, KA);                                              \
      A1 = _mm256_sub_epi64(A1, KB);                                              \
      A1 = _mm256_sub_epi64(A1, _mm256_set_epi64x(R, 0, 0, 0));                   \
      A0 = _mm256_sub_epi64(A0, tw0);                                             \
      A1 = _mm256_sub_epi64(A1, tw1);                                             \
   } while(0)

/*
* Eight inverse rounds on one block: remove subkey R+1, undo rounds 8..5,
* remove subkey R, undo rounds 4..1.
*/
#define THREEFISH_DEC_8_ROUNDS(A0, A1, R, KA, KB, KC, TI0, TI1, TI2) \
   do {                                                              \
      THREEFISH_INJECT_KEY(A0, A1, R+1, KB, KC, TI2, TI0);           \
      THREEFISH_ROUND(A0, A1, ROTATE_8);                             \
      THREEFISH_ROUND(A0, A1, ROTATE_7);                             \
      THREEFISH_ROUND(A0, A1, ROTATE_6);                             \
      THREEFISH_ROUND(A0, A1, ROTATE_5);                             \
                                                                     \
      THREEFISH_INJECT_KEY(A0, A1, R, KA, KB, TI0, TI1);             \
      THREEFISH_ROUND(A0, A1, ROTATE_4);                             \
      THREEFISH_ROUND(A0, A1, ROTATE_3);                             \
      THREEFISH_ROUND(A0, A1, ROTATE_2);                             \
      THREEFISH_ROUND(A0, A1, ROTATE_1);                             \
   } while(0)

/*
* Two-block subkey removal. The counter is added into tw1 once and the sum
* is then subtracted from both blocks.
*/
#define THREEFISH_INJECT_KEY_2(A0, A1, B0, B1, R, KA, KB, TI0, TI1)               \
   do {                                                                           \
      const __m256i tw0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(TI0, 0, 0, 0)); \
      __m256i tw1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, TI1, 0, 0));       \
      A0 = _mm256_sub_epi64(A0, KA);                                              \
      B0 = _mm256_sub_epi64(B0, KA);                                              \
      A1 = _mm256_sub_epi64(A1, KB);                                              \
      B1 = _mm256_sub_epi64(B1, KB);                                              \
      tw1 = _mm256_add_epi64(tw1, _mm256_set_epi64x(R,0,0,0));                    \
      A0 = _mm256_sub_epi64(A0, tw0);                                             \
      B0 = _mm256_sub_epi64(B0, tw0);                                             \
      A1 = _mm256_sub_epi64(A1, tw1);                                             \
      B1 = _mm256_sub_epi64(B1, tw1);                                             \
   } while(0)

/* Two-block counterpart of THREEFISH_DEC_8_ROUNDS */
#define THREEFISH_DEC_2_8_ROUNDS(A0, A1, B0, B1, R, KA, KB, KC, TI0, TI1, TI2) \
   do {                                                                        \
      THREEFISH_INJECT_KEY_2(A0, A1, B0, B1, R+1, KB, KC, TI2, TI0);           \
      THREEFISH_ROUND_2(A0, A1, B0, B1, ROTATE_8);                             \
      THREEFISH_ROUND_2(A0, A1, B0, B1, ROTATE_7);                             \
      THREEFISH_ROUND_2(A0, A1, B0, B1, ROTATE_6);                             \
      THREEFISH_ROUND_2(A0, A1, B0, B1, ROTATE_5);                             \
                                                                               \
      THREEFISH_INJECT_KEY_2(A0, A1, B0, B1, R, KA, KB, TI0, TI1);             \
      THREEFISH_ROUND_2(A0, A1, B0, B1, ROTATE_4);                             \
      THREEFISH_ROUND_2(A0, A1, B0, B1, ROTATE_3);                             \
      THREEFISH_ROUND_2(A0, A1, B0, B1, ROTATE_2);                             \
      THREEFISH_ROUND_2(A0, A1, B0, B1, ROTATE_1);                             \
   } while(0)

   /*
   Unlike the encryption path, every key register is precomputed here:
   nine vectors where two or three would do if they were derived from
   one another with lane permutes, as rotate_keys does when encrypting.
   */
   const __m256i K0 = _mm256_set_epi64x(key_words[6], key_words[4], key_words[2], key_words[0]);
   const __m256i K1 = _mm256_set_epi64x(key_words[7], key_words[5], key_words[3], key_words[1]);
   const __m256i K2 = _mm256_set_epi64x(key_words[8], key_words[6], key_words[4], key_words[2]);
   const __m256i K3 = _mm256_set_epi64x(key_words[0], key_words[7], key_words[5], key_words[3]);
   const __m256i K4 = _mm256_set_epi64x(key_words[1], key_words[8], key_words[6], key_words[4]);
   const __m256i K5 = _mm256_set_epi64x(key_words[2], key_words[0], key_words[7], key_words[5]);
   const __m256i K6 = _mm256_set_epi64x(key_words[3], key_words[1], key_words[8], key_words[6]);
   const __m256i K7 = _mm256_set_epi64x(key_words[4], key_words[2], key_words[0], key_words[7]);
   const __m256i K8 = _mm256_set_epi64x(key_words[5], key_words[3], key_words[1], key_words[8]);

   const __m256i* src = reinterpret_cast<const __m256i*>(in);
   __m256i* dst = reinterpret_cast<__m256i*>(out);

   // Two blocks (four vectors) per iteration
   for(; blocks >= 2; blocks -= 2)
      {
      __m256i X0 = _mm256_loadu_si256(src + 0);
      __m256i X1 = _mm256_loadu_si256(src + 1);
      __m256i X2 = _mm256_loadu_si256(src + 2);
      __m256i X3 = _mm256_loadu_si256(src + 3);
      src += 4;

      // Tweak words in lanes 3..1, lane 0 zero; used by the INJECT_KEY macros
      const __m256i T = _mm256_set_epi64x(tweak_words[0], tweak_words[1], tweak_words[2], 0);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 1, K1,K2,K3, 1, 2, 3);

      // Finally remove the initial subkey (counter 0)
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(dst + 0, X0);
      _mm256_storeu_si256(dst + 1, X1);
      _mm256_storeu_si256(dst + 2, X2);
      _mm256_storeu_si256(dst + 3, X3);
      dst += 4;
      }

   // Whatever is left (at most one block) goes through the single-block path
   while(blocks > 0)
      {
      __m256i X0 = _mm256_loadu_si256(src + 0);
      __m256i X1 = _mm256_loadu_si256(src + 1);
      src += 2;

      const __m256i T = _mm256_set_epi64x(tweak_words[0], tweak_words[1], tweak_words[2], 0);

      interleave_epi64(X0, X1);

      THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 1, K1,K2,K3, 1, 2, 3);

      THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(dst + 0, X0);
      _mm256_storeu_si256(dst + 1, X1);
      dst += 2;

      --blocks;
      }

   // Drop the function-local macros again
#undef THREEFISH_DEC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_DEC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2

   _mm256_zeroall();
   }
443
444
}