/src/botan/src/lib/block/threefish_512/threefish_512_avx2/threefish_512_avx2.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Threefish-512 using AVX2 |
3 | | * (C) 2013,2016 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/internal/threefish_512.h> |
9 | | #include <botan/internal/simd_avx2.h> |
10 | | #include <immintrin.h> |
11 | | |
12 | | namespace Botan { |
13 | | |
14 | | namespace { |
15 | | |
16 | | BOTAN_AVX2_FN |
17 | | inline void interleave_epi64(__m256i& X0, __m256i& X1) |
18 | 0 | { |
19 | | // interleave X0 and X1 qwords |
20 | | // (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7) |
21 | |
|
22 | 0 | const __m256i T0 = _mm256_unpacklo_epi64(X0, X1); |
23 | 0 | const __m256i T1 = _mm256_unpackhi_epi64(X0, X1); |
24 | |
|
25 | 0 | X0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0)); |
26 | 0 | X1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0)); |
27 | 0 | } |
28 | | |
29 | | BOTAN_AVX2_FN |
30 | | inline void deinterleave_epi64(__m256i& X0, __m256i& X1) |
31 | 0 | { |
32 | 0 | const __m256i T0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0)); |
33 | 0 | const __m256i T1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0)); |
34 | |
|
35 | 0 | X0 = _mm256_unpacklo_epi64(T0, T1); |
36 | 0 | X1 = _mm256_unpackhi_epi64(T0, T1); |
37 | 0 | } |
38 | | |
// Advance the vectorized Threefish key schedule by one injection step.
// R0 and R1 are updated in place; R2 is consumed by value.
BOTAN_AVX2_FN
inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
   {
   /*
   Behold. The key schedule progresses like so. The values
   loop back to the originals after the rounds are complete
   so we don't need to reload for starting the next block.

                 R0        R1        R2
     K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3)
     K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5)
     K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7)

     K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0)
     K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2)
     K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4)

     K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6)
     K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8)
     K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1)

   To compute the values for the next round:
     X0 is X2 from the last round
     X1 becomes (X0[4],X1[1:3])
     X2 becomes (X1[4],X2[1:3])

   Uses 3 permutes and 2 blends, is there a faster way?
   */
   // T0: broadcast preparation of R0's low qword; T1/T2: rotate qwords down by one
   __m256i T0 = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));
   __m256i T1 = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
   __m256i T2 = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));

   // 0xC0 selects the top 64-bit element (two 32-bit lanes) from the
   // second operand, merging the carried-in word into the rotated vector
   R0 = _mm256_blend_epi32(T1, T0, 0xC0);
   R1 = _mm256_blend_epi32(T2, T1, 0xC0);
   }
74 | | |
75 | | |
76 | | } |
77 | | |
78 | | BOTAN_AVX2_FN |
79 | | void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
80 | 0 | { |
81 | 0 | _mm256_zeroupper(); |
82 | |
|
83 | 0 | const uint64_t* K = m_K.data(); |
84 | 0 | const uint64_t* T_64 = m_T.data(); |
85 | |
|
86 | 0 | const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46); |
87 | 0 | const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33); |
88 | 0 | const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17); |
89 | 0 | const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44); |
90 | 0 | const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39); |
91 | 0 | const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13); |
92 | 0 | const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25); |
93 | 0 | const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8); |
94 | |
|
95 | 0 | #define THREEFISH_ROUND(X0, X1, SHL) \ |
96 | 0 | do { \ |
97 | 0 | const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \ |
98 | 0 | X0 = _mm256_add_epi64(X0, X1); \ |
99 | 0 | X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ |
100 | 0 | X1 = _mm256_xor_si256(X1, X0); \ |
101 | 0 | X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \ |
102 | 0 | X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ |
103 | 0 | } while(0) |
104 | |
|
105 | 0 | #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL) \ |
106 | 0 | do { \ |
107 | 0 | const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \ |
108 | 0 | X0 = _mm256_add_epi64(X0, X1); \ |
109 | 0 | X2 = _mm256_add_epi64(X2, X3); \ |
110 | 0 | X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ |
111 | 0 | X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \ |
112 | 0 | X1 = _mm256_xor_si256(X1, X0); \ |
113 | 0 | X3 = _mm256_xor_si256(X3, X2); \ |
114 | 0 | X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \ |
115 | 0 | X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1)); \ |
116 | 0 | X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ |
117 | 0 | X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \ |
118 | 0 | } while(0) |
119 | |
|
120 | 0 | #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \ |
121 | 0 | do { \ |
122 | 0 | const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ |
123 | 0 | const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ |
124 | 0 | X0 = _mm256_add_epi64(X0, K0); \ |
125 | 0 | X1 = _mm256_add_epi64(X1, K1); \ |
126 | 0 | X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0)); \ |
127 | 0 | X0 = _mm256_add_epi64(X0, T0); \ |
128 | 0 | X1 = _mm256_add_epi64(X1, T1); \ |
129 | 0 | } while(0) |
130 | |
|
131 | 0 | #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \ |
132 | 0 | do { \ |
133 | 0 | const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ |
134 | 0 | __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ |
135 | 0 | X0 = _mm256_add_epi64(X0, K0); \ |
136 | 0 | X2 = _mm256_add_epi64(X2, K0); \ |
137 | 0 | X1 = _mm256_add_epi64(X1, K1); \ |
138 | 0 | X3 = _mm256_add_epi64(X3, K1); \ |
139 | 0 | T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \ |
140 | 0 | X0 = _mm256_add_epi64(X0, T0); \ |
141 | 0 | X2 = _mm256_add_epi64(X2, T0); \ |
142 | 0 | X1 = _mm256_add_epi64(X1, T1); \ |
143 | 0 | X3 = _mm256_add_epi64(X3, T1); \ |
144 | 0 | } while(0) |
145 | |
|
146 | 0 | #define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2) \ |
147 | 0 | do { \ |
148 | 0 | rotate_keys(K1, K2, K0); \ |
149 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_1); \ |
150 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_2); \ |
151 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_3); \ |
152 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_4); \ |
153 | 0 | THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1); \ |
154 | 0 | \ |
155 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_5); \ |
156 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_6); \ |
157 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_7); \ |
158 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_8); \ |
159 | 0 | THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0); \ |
160 | 0 | } while(0) |
161 | |
|
162 | 0 | #define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \ |
163 | 0 | do { \ |
164 | 0 | rotate_keys(K1, K2, K0); \ |
165 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \ |
166 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \ |
167 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \ |
168 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \ |
169 | 0 | THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1); \ |
170 | 0 | \ |
171 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \ |
172 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \ |
173 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \ |
174 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \ |
175 | 0 | THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0); \ |
176 | 0 | } while(0) |
177 | |
|
178 | 0 | __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]); |
179 | 0 | __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]); |
180 | 0 | __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]); |
181 | |
|
182 | 0 | const __m256i* in_mm = reinterpret_cast<const __m256i*>(in); |
183 | 0 | __m256i* out_mm = reinterpret_cast<__m256i*>(out); |
184 | |
|
185 | 0 | while(blocks >= 2) |
186 | 0 | { |
187 | 0 | __m256i X0 = _mm256_loadu_si256(in_mm++); |
188 | 0 | __m256i X1 = _mm256_loadu_si256(in_mm++); |
189 | 0 | __m256i X2 = _mm256_loadu_si256(in_mm++); |
190 | 0 | __m256i X3 = _mm256_loadu_si256(in_mm++); |
191 | |
|
192 | 0 | const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); |
193 | |
|
194 | 0 | interleave_epi64(X0, X1); |
195 | 0 | interleave_epi64(X2, X3); |
196 | |
|
197 | 0 | THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3); |
198 | |
|
199 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 1, K2,K0,K1, 1, 2, 3); |
200 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 3, K1,K2,K0, 2, 3, 1); |
201 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 5, K0,K1,K2, 3, 1, 2); |
202 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 7, K2,K0,K1, 1, 2, 3); |
203 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 9, K1,K2,K0, 2, 3, 1); |
204 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2); |
205 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3); |
206 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1); |
207 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2); |
208 | |
|
209 | 0 | deinterleave_epi64(X0, X1); |
210 | 0 | deinterleave_epi64(X2, X3); |
211 | |
|
212 | 0 | _mm256_storeu_si256(out_mm++, X0); |
213 | 0 | _mm256_storeu_si256(out_mm++, X1); |
214 | 0 | _mm256_storeu_si256(out_mm++, X2); |
215 | 0 | _mm256_storeu_si256(out_mm++, X3); |
216 | |
|
217 | 0 | blocks -= 2; |
218 | 0 | } |
219 | |
|
220 | 0 | for(size_t i = 0; i != blocks; ++i) |
221 | 0 | { |
222 | 0 | __m256i X0 = _mm256_loadu_si256(in_mm++); |
223 | 0 | __m256i X1 = _mm256_loadu_si256(in_mm++); |
224 | |
|
225 | 0 | const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); |
226 | |
|
227 | 0 | interleave_epi64(X0, X1); |
228 | |
|
229 | 0 | THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3); |
230 | |
|
231 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 1, K2,K0,K1, 1, 2, 3); |
232 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 3, K1,K2,K0, 2, 3, 1); |
233 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 5, K0,K1,K2, 3, 1, 2); |
234 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 7, K2,K0,K1, 1, 2, 3); |
235 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 9, K1,K2,K0, 2, 3, 1); |
236 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2); |
237 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3); |
238 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1); |
239 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2); |
240 | |
|
241 | 0 | deinterleave_epi64(X0, X1); |
242 | |
|
243 | 0 | _mm256_storeu_si256(out_mm++, X0); |
244 | 0 | _mm256_storeu_si256(out_mm++, X1); |
245 | 0 | } |
246 | |
|
247 | 0 | _mm256_zeroall(); |
248 | |
|
249 | 0 | #undef THREEFISH_ENC_8_ROUNDS |
250 | 0 | #undef THREEFISH_ROUND |
251 | 0 | #undef THREEFISH_INJECT_KEY |
252 | 0 | #undef THREEFISH_DEC_2_8_ROUNDS |
253 | 0 | #undef THREEFISH_ROUND_2 |
254 | 0 | #undef THREEFISH_INJECT_KEY_2 |
255 | 0 | } |
256 | | |
/*
* Decrypt `blocks` 64-byte Threefish-512 blocks from `in` to `out`:
* the exact inverse of avx2_encrypt_n. Processes two blocks in
* parallel while at least two remain, then one at a time. Reads the
* expanded key from m_K (9 words) and the tweak from m_T (3 words).
*/
BOTAN_AVX2_FN
void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   // Same rotation constants as encryption; the rounds below apply them
   // as right rotations to invert the MIX function
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

   // Inverse round: undo the word permutation first, then xor, rotate
   // right (sllv/srlv pair emulates the rotate) and subtract
#define THREEFISH_ROUND(X0, X1, SHR)                                                \
   do {                                                                             \
      const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR);            \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X0 = _mm256_sub_epi64(X0, X1);                                                \
   } while(0)

   // Inverse round over two blocks at once (X0/X1 and X2/X3)
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHR)                                      \
   do {                                                                             \
      const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR);            \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));                   \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X3 = _mm256_xor_si256(X3, X2);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X0 = _mm256_sub_epi64(X0, X1);                                                \
      X2 = _mm256_sub_epi64(X2, X3);                                                \
   } while(0)

   // Inverse subkey injection: subtract key vectors, tweak words and the
   // round counter R (encryption added them)
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                  \
   do {                                                                    \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_sub_epi64(X0, K0);                                       \
      X1 = _mm256_sub_epi64(X1, K1);                                       \
      X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0));            \
      X0 = _mm256_sub_epi64(X0, T0);                                       \
      X1 = _mm256_sub_epi64(X1, T1);                                       \
   } while(0)

   // Eight inverse rounds: injections and rounds run in reverse order
   // relative to THREEFISH_ENC_8_ROUNDS
#define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \
   do {                                                           \
      THREEFISH_INJECT_KEY(X0, X1, R+1, K2, K3, T2, T0);          \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                          \
                                                                  \
      THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1);            \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                          \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                          \
   } while(0)

#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)        \
   do {                                                                    \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_sub_epi64(X0, K0);                                       \
      X2 = _mm256_sub_epi64(X2, K0);                                       \
      X1 = _mm256_sub_epi64(X1, K1);                                       \
      X3 = _mm256_sub_epi64(X3, K1);                                       \
      T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0));               \
      X0 = _mm256_sub_epi64(X0, T0);                                       \
      X2 = _mm256_sub_epi64(X2, T0);                                       \
      X1 = _mm256_sub_epi64(X1, T1);                                       \
      X3 = _mm256_sub_epi64(X3, T1);                                       \
   } while(0)

#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
   do {                                                                     \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K2, K3, T2, T0);          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                          \
                                                                            \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1);            \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                          \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                          \
   } while(0)

   /*
   v1.0 key schedule: 9 ymm registers (only need 2 or 3)
   (0,1,2,3),(4,5,6,7) [8]
   then mutating with vpermq
   */
   // Unlike encryption, all 9 subkey arrangements are materialized up
   // front since decryption walks the schedule backwards
   const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
   const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
   const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
   const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
   const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
   const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
   const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
   const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      // Round numbers descend 17..1, undoing encryption's 1..17 sequence
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 1, K1,K2,K3, 1, 2, 3);

      // Undo the initial (round 0) key injection done by encryption
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);

      THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 1, K1,K2,K3, 1, 2, 3);

      THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

#undef THREEFISH_DEC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_DEC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2

   // Clear key material from the YMM registers
   _mm256_zeroall();
   }
444 | | |
445 | | } |