Coverage Report

Created: 2020-02-14 15:38

/src/botan/src/lib/block/threefish_512/threefish_512_avx2/threefish_512_avx2.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Threefish-512 using AVX2
3
* (C) 2013,2016 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/threefish_512.h>
9
#include <immintrin.h>
10
11
namespace Botan {
12
13
namespace {
14
15
BOTAN_FUNC_ISA("avx2")
inline void interleave_epi64(__m256i& X0, __m256i& X1)
   {
   /*
   * Regroup the eight qwords held in the pair (X0, X1) so that X0 ends
   * up with the even-numbered words and X1 with the odd-numbered ones:
   *
   *   (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7)
   *
   * The unpack step pairs up qwords from the two inputs; the permute
   * with _MM_SHUFFLE(3,1,2,0) then swaps the two middle qwords of each
   * intermediate to put them in ascending order.
   */
   const __m256i low_pairs = _mm256_unpacklo_epi64(X0, X1);
   const __m256i high_pairs = _mm256_unpackhi_epi64(X0, X1);

   X0 = _mm256_permute4x64_epi64(low_pairs, _MM_SHUFFLE(3,1,2,0));
   X1 = _mm256_permute4x64_epi64(high_pairs, _MM_SHUFFLE(3,1,2,0));
   }
27
28
BOTAN_FUNC_ISA("avx2")
inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
   {
   /*
   * Applies the steps of interleave_epi64 in reverse order: first swap
   * the two middle qwords of each register, then unpack the pair so the
   * qwords of X0 and X1 alternate again.
   *
   *   (X0,X2,X4,X6),(X1,X3,X5,X7) -> (X0,X1,X2,X3),(X4,X5,X6,X7)
   */
   const __m256i swapped_0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
   const __m256i swapped_1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));

   X0 = _mm256_unpacklo_epi64(swapped_0, swapped_1);
   X1 = _mm256_unpackhi_epi64(swapped_0, swapped_1);
   }
37
38
BOTAN_FUNC_ISA("avx2")
inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
   {
   /*
   * Advance the vectorized key schedule by one step.
   *
   * Writing [i] for 64-bit lane i (lane 0 being the lowest), this does
   *
   *   R0 <- (R1[1], R1[2], R1[3], R0[0])
   *   R1 <- (R2[1], R2[2], R2[3], R1[0])
   *
   * R2 is taken by value and is left untouched in the caller.
   *
   * The table shows the key word indexes held by three registers at
   * each step, highest lane first (the order used by _mm256_set_epi64x).
   * Each row follows from the previous one: the old R2 becomes the new
   * first column and the two values written here become the second and
   * third columns. After nine steps the registers are back to where
   * they started, so nothing has to be reloaded for the next block.
   *
   *               R0        R1        R2
   *   K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3)
   *   K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5)
   *   K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7)
   *
   *   K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0)
   *   K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2)
   *   K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4)
   *
   *   K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6)
   *   K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8)
   *   K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1)
   *
   * Cost is 3 permutes and 2 blends; a cheaper sequence may exist.
   */

   // Lane 0 of R0 copied into every lane; only the top lane is used below
   const __m256i R0_low = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));

   // Each lane moved down by one position, lane 0 wrapping around to lane 3
   const __m256i R1_rot = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
   const __m256i R2_rot = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));

   // Mask 0xC0 picks the top two dwords (the highest qword) from the second operand
   R0 = _mm256_blend_epi32(R1_rot, R0_low, 0xC0);
   R1 = _mm256_blend_epi32(R2_rot, R1_rot, 0xC0);
   }
73
74
75
}
76
77
/*
* Encrypt `blocks` consecutive 64-byte blocks from `in` into `out`.
*
* Each block is loaded as two 256-bit vectors and interleaved so that one
* register carries words 0,2,4,6 and the other words 1,3,5,7. Blocks are
* handled two at a time while at least two remain; any remaining block is
* processed on its own. Reads key words K[0..8] from m_K and tweak words
* T[0..2] from m_T. Loads and stores are unaligned, so `in` and `out` need
* no particular alignment.
*/
BOTAN_FUNC_ISA("avx2")
void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   // Per-lane left-rotation amounts for the eight rounds of each
   // 8-round group (highest lane listed first)
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

/*
* One round on a single block: X0 += X1, X1 = rotl(X1, SHL) ^ X0 in every
* lane, followed by the lane permutation of both registers. The rotate is
* built from a variable left shift and a variable right shift by 64-SHL.
*/
#define THREEFISH_ROUND(X0, X1, SHL)                                                \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

/*
* Same round applied to two independent blocks, (X0,X1) and (X2,X3), with
* the operations of the two blocks interleaved.
*/
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL)                                      \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X2 = _mm256_add_epi64(X2, X3);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X3 = _mm256_xor_si256(X3, X2);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

/*
* Subkey addition for a single block. Adds K0 to X0 and K1 to X1, the
* round number R to the highest lane of X1, lane T0I of the tweak vector T
* to the highest lane of X0 and lane T1I of T to lane 2 of X1. All other
* lanes of the two tweak terms come from lane 0 of T, which is zero.
* Expects a variable named T to be in scope at the point of use.
*/
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                        \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
   } while(0)

/*
* Subkey addition for two blocks at once. The round number is folded into
* the odd-word tweak term first so it is only added once per block.
*/
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)              \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0));       \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X2 = _mm256_add_epi64(X2, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X3 = _mm256_add_epi64(X3, K1);                                             \
      T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0));                     \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X2 = _mm256_add_epi64(X2, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
      X3 = _mm256_add_epi64(X3, T1);                                             \
   } while(0)

/*
* Eight rounds on one block: advance the key registers, run four rounds,
* add subkey R, run four more rounds, add subkey R+1. T0/T1/T2 are lane
* indexes into T, not vectors.
*/
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2)        \
   do {                                                        \
      rotate_keys(K1, K2, K0);                                 \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                       \
      THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1);         \
                                                               \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                       \
      THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0);       \
   } while(0)

/*
* Two-block counterpart of THREEFISH_ENC_8_ROUNDS.
*/
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \
   do {                                                                  \
      rotate_keys(K1, K2, K0);                                           \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                       \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1);         \
                                                                         \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                       \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0);       \
   } while(0)

   // Starting key registers (highest lane first). rotate_keys mutates
   // these in place as the rounds progress.
   __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
   __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   // Main loop: two blocks (four vectors) per iteration
   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      // Lane 0 is zero; lanes 1,2,3 hold T[2],T[1],T[0]
      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3,  9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   // Tail: whatever is left after the two-block loop
   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);

      THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3);

      THREEFISH_ENC_8_ROUNDS(X0, X1,  1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1,  9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

   // Zeroes every ymm register; presumably so that key-dependent values
   // do not linger in vector registers after returning (intent not
   // stated in this file)
   _mm256_zeroall();

   // Undefine exactly the helper macros defined above so that none of
   // them remain visible after this function
#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_ENC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
   }
255
256
/*
* Decrypt `blocks` consecutive 64-byte blocks from `in` into `out`.
*
* Runs the subkey subtractions and inverse rounds from round group 17 down
* to the initial subkey 0. Blocks are handled in pairs while at least two
* remain, then any leftover block singly. Reads key words K[0..8] from m_K
* and tweak words T[0..2] from m_T; loads and stores are unaligned.
*/
BOTAN_FUNC_ISA("avx2")
void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   // Per-lane rotation amounts, highest lane first; here they are
   // applied as right rotations and consumed from ROTATE_8 down to ROTATE_1
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

/*
* Inverse round, one block: permute both registers, then
* X1 = rotr(X1 ^ X0, ROT) and X0 -= X1 in every lane. The right rotate is
* assembled from a right shift by ROT and a left shift by 64-ROT.
*/
#define THREEFISH_ROUND(X0, X1, ROT)                                                    \
   do {                                                                                 \
      const __m256i INV_ROT = _mm256_sub_epi64(_mm256_set1_epi64x(64), ROT);            \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));                       \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                       \
      X1 = _mm256_xor_si256(X1, X0);                                                    \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, INV_ROT), _mm256_srlv_epi64(X1, ROT)); \
      X0 = _mm256_sub_epi64(X0, X1);                                                    \
   } while(0)

/*
* Subkey removal, one block. Subtracts KE from X0 and KO from X1, the
* round number R from the highest lane of X1, lane TEI of T from the
* highest lane of X0 and lane TOI of T from lane 2 of X1. Requires a
* variable named T in scope where the macro is used.
*/
#define THREEFISH_INJECT_KEY(X0, X1, R, KE, KO, TEI, TOI)                             \
   do {                                                                               \
      const __m256i TW_EVEN = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(TEI, 0, 0, 0)); \
      const __m256i TW_ODD = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, TOI, 0, 0));  \
      X0 = _mm256_sub_epi64(X0, KE);                                                  \
      X1 = _mm256_sub_epi64(X1, KO);                                                  \
      X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0));                       \
      X0 = _mm256_sub_epi64(X0, TW_EVEN);                                             \
      X1 = _mm256_sub_epi64(X1, TW_ODD);                                              \
   } while(0)

/*
* Eight inverse rounds on one block: remove subkey R+1, undo rounds 8..5,
* remove subkey R, undo rounds 4..1. TA/TB/TC are lane indexes into T.
*/
#define THREEFISH_DEC_8_ROUNDS(X0, X1, R, KA, KB, KC, TA, TB, TC) \
   do {                                                           \
      THREEFISH_INJECT_KEY(X0, X1, R+1, KB, KC, TC, TA);          \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                          \
                                                                  \
      THREEFISH_INJECT_KEY(X0, X1, R, KA, KB, TA, TB);            \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                          \
   } while(0)

/*
* Inverse round on two independent blocks, (X0,X1) and (X2,X3), with the
* two blocks' operations interleaved.
*/
#define THREEFISH_ROUND_2(X0, X1, X2, X3, ROT)                                          \
   do {                                                                                 \
      const __m256i INV_ROT = _mm256_sub_epi64(_mm256_set1_epi64x(64), ROT);            \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));                       \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3));                       \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                       \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                       \
      X1 = _mm256_xor_si256(X1, X0);                                                    \
      X3 = _mm256_xor_si256(X3, X2);                                                    \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, INV_ROT), _mm256_srlv_epi64(X1, ROT)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, INV_ROT), _mm256_srlv_epi64(X3, ROT)); \
      X0 = _mm256_sub_epi64(X0, X1);                                                    \
      X2 = _mm256_sub_epi64(X2, X3);                                                    \
   } while(0)

/*
* Subkey removal for two blocks. The round number is first added into the
* odd-word tweak term, which is then subtracted once from each block.
*/
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, KE, KO, TEI, TOI)                   \
   do {                                                                               \
      const __m256i TW_EVEN = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(TEI, 0, 0, 0)); \
      __m256i TW_ODD = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, TOI, 0, 0));        \
      X0 = _mm256_sub_epi64(X0, KE);                                                  \
      X2 = _mm256_sub_epi64(X2, KE);                                                  \
      X1 = _mm256_sub_epi64(X1, KO);                                                  \
      X3 = _mm256_sub_epi64(X3, KO);                                                  \
      TW_ODD = _mm256_add_epi64(TW_ODD, _mm256_set_epi64x(R,0,0,0));                  \
      X0 = _mm256_sub_epi64(X0, TW_EVEN);                                             \
      X2 = _mm256_sub_epi64(X2, TW_EVEN);                                             \
      X1 = _mm256_sub_epi64(X1, TW_ODD);                                              \
      X3 = _mm256_sub_epi64(X3, TW_ODD);                                              \
   } while(0)

/*
* Two-block counterpart of THREEFISH_DEC_8_ROUNDS.
*/
#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, KA, KB, KC, TA, TB, TC) \
   do {                                                                     \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, KB, KC, TC, TA);          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                          \
                                                                            \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, KA, KB, TA, TB);            \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                          \
   } while(0)

   /*
   * First-cut key schedule: all nine key vectors are built up front, one
   * ymm value each. Kn holds K[n], K[n+2], K[n+4], K[n+6] (indexes taken
   * mod 9) from the lowest lane to the highest. As the original note
   * says, only 2 or 3 registers are really needed if they are instead
   * mutated with vpermq as the rounds go.
   */
   const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
   const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
   const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
   const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
   const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
   const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
   const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
   const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

   const __m256i* src = reinterpret_cast<const __m256i*>(in);
   __m256i* dst = reinterpret_cast<__m256i*>(out);

   // Paired path: each pass consumes and produces four vectors (two blocks)
   for(; blocks >= 2; blocks -= 2)
      {
      __m256i X0 = _mm256_loadu_si256(src + 0);
      __m256i X1 = _mm256_loadu_si256(src + 1);
      __m256i X2 = _mm256_loadu_si256(src + 2);
      __m256i X3 = _mm256_loadu_si256(src + 3);
      src += 4;

      // Lane 0 is zero; lanes 1,2,3 hold T[2],T[1],T[0]
      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3,  1, K1,K2,K3, 1, 2, 3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(dst + 0, X0);
      _mm256_storeu_si256(dst + 1, X1);
      _mm256_storeu_si256(dst + 2, X2);
      _mm256_storeu_si256(dst + 3, X3);
      dst += 4;
      }

   // Single-block path for whatever the paired loop left over
   for(; blocks > 0; --blocks)
      {
      __m256i X0 = _mm256_loadu_si256(src + 0);
      __m256i X1 = _mm256_loadu_si256(src + 1);
      src += 2;

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);

      THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1,  1, K1,K2,K3, 1, 2, 3);

      THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(dst + 0, X0);
      _mm256_storeu_si256(dst + 1, X1);
      dst += 2;
      }

   // The helper macros are local to this function
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_DEC_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
#undef THREEFISH_DEC_2_8_ROUNDS

   // Zeroes every ymm register before returning
   _mm256_zeroall();
   }
443
444
}