/src/botan/src/lib/block/threefish_512/threefish_512_avx2/threefish_512_avx2.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Threefish-512 using AVX2 |
3 | | * (C) 2013,2016 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/threefish_512.h> |
9 | | #include <immintrin.h> |
10 | | |
11 | | namespace Botan { |
12 | | |
13 | | namespace { |
14 | | |
15 | | BOTAN_FUNC_ISA("avx2") |
16 | | inline void interleave_epi64(__m256i& X0, __m256i& X1) |
17 | 0 | { |
18 | | // interleave X0 and X1 qwords |
19 | | // (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7) |
20 | 0 |
|
21 | 0 | const __m256i T0 = _mm256_unpacklo_epi64(X0, X1); |
22 | 0 | const __m256i T1 = _mm256_unpackhi_epi64(X0, X1); |
23 | 0 |
|
24 | 0 | X0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0)); |
25 | 0 | X1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0)); |
26 | 0 | } |
27 | | |
28 | | BOTAN_FUNC_ISA("avx2") |
29 | | inline void deinterleave_epi64(__m256i& X0, __m256i& X1) |
30 | 0 | { |
31 | 0 | const __m256i T0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0)); |
32 | 0 | const __m256i T1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0)); |
33 | 0 |
|
34 | 0 | X0 = _mm256_unpacklo_epi64(T0, T1); |
35 | 0 | X1 = _mm256_unpackhi_epi64(T0, T1); |
36 | 0 | } |
37 | | |
38 | | BOTAN_FUNC_ISA("avx2") |
39 | | inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2) |
40 | 0 | { |
41 | | /* |
42 | | Behold. The key schedule progresses like so. The values |
43 | | loop back to the originals after the rounds are complete |
44 | | so we don't need to reload for starting the next block. |
45 | | |
46 | | R0 R1 R2 |
47 | | K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3) |
48 | | K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5) |
49 | | K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7) |
50 | | |
51 | | K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0) |
52 | | K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2) |
53 | | K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4) |
54 | | |
55 | | K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6) |
56 | | K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8) |
57 | | K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1) |
58 | | |
59 | | To compute the values for the next round: |
60 | | X0 is X2 from the last round |
61 | | X1 becomes (X0[4],X1[1:3]) |
62 | | X2 becomes (X1[4],X2[1:3]) |
63 | | |
64 | | Uses 3 permutes and 2 blends, is there a faster way? |
65 | | */ |
66 | 0 | __m256i T0 = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0)); |
67 | 0 | __m256i T1 = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1)); |
68 | 0 | __m256i T2 = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1)); |
69 | 0 |
|
70 | 0 | R0 = _mm256_blend_epi32(T1, T0, 0xC0); |
71 | 0 | R1 = _mm256_blend_epi32(T2, T1, 0xC0); |
72 | 0 | } |
73 | | |
74 | | |
75 | | } |
76 | | |
77 | | BOTAN_FUNC_ISA("avx2") |
78 | | void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
79 | 0 | { |
80 | 0 | _mm256_zeroupper(); |
81 | 0 |
|
82 | 0 | const uint64_t* K = m_K.data(); |
83 | 0 | const uint64_t* T_64 = m_T.data(); |
84 | 0 |
|
85 | 0 | const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46); |
86 | 0 | const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33); |
87 | 0 | const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17); |
88 | 0 | const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44); |
89 | 0 | const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39); |
90 | 0 | const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13); |
91 | 0 | const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25); |
92 | 0 | const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8); |
93 | 0 |
|
94 | 0 | #define THREEFISH_ROUND(X0, X1, SHL) \ |
95 | 0 | do { \ |
96 | 0 | const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \ |
97 | 0 | X0 = _mm256_add_epi64(X0, X1); \ |
98 | 0 | X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ |
99 | 0 | X1 = _mm256_xor_si256(X1, X0); \ |
100 | 0 | X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \ |
101 | 0 | X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ |
102 | 0 | } while(0) |
103 | 0 |
|
104 | 0 | #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL) \ |
105 | 0 | do { \ |
106 | 0 | const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \ |
107 | 0 | X0 = _mm256_add_epi64(X0, X1); \ |
108 | 0 | X2 = _mm256_add_epi64(X2, X3); \ |
109 | 0 | X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ |
110 | 0 | X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \ |
111 | 0 | X1 = _mm256_xor_si256(X1, X0); \ |
112 | 0 | X3 = _mm256_xor_si256(X3, X2); \ |
113 | 0 | X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \ |
114 | 0 | X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1)); \ |
115 | 0 | X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ |
116 | 0 | X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \ |
117 | 0 | } while(0) |
118 | 0 |
|
119 | 0 | #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \ |
120 | 0 | do { \ |
121 | 0 | const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ |
122 | 0 | const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ |
123 | 0 | X0 = _mm256_add_epi64(X0, K0); \ |
124 | 0 | X1 = _mm256_add_epi64(X1, K1); \ |
125 | 0 | X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0)); \ |
126 | 0 | X0 = _mm256_add_epi64(X0, T0); \ |
127 | 0 | X1 = _mm256_add_epi64(X1, T1); \ |
128 | 0 | } while(0) |
129 | 0 |
|
130 | 0 | #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \ |
131 | 0 | do { \ |
132 | 0 | const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ |
133 | 0 | __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ |
134 | 0 | X0 = _mm256_add_epi64(X0, K0); \ |
135 | 0 | X2 = _mm256_add_epi64(X2, K0); \ |
136 | 0 | X1 = _mm256_add_epi64(X1, K1); \ |
137 | 0 | X3 = _mm256_add_epi64(X3, K1); \ |
138 | 0 | T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \ |
139 | 0 | X0 = _mm256_add_epi64(X0, T0); \ |
140 | 0 | X2 = _mm256_add_epi64(X2, T0); \ |
141 | 0 | X1 = _mm256_add_epi64(X1, T1); \ |
142 | 0 | X3 = _mm256_add_epi64(X3, T1); \ |
143 | 0 | } while(0) |
144 | 0 |
|
145 | 0 | #define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2) \ |
146 | 0 | do { \ |
147 | 0 | rotate_keys(K1, K2, K0); \ |
148 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_1); \ |
149 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_2); \ |
150 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_3); \ |
151 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_4); \ |
152 | 0 | THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1); \ |
153 | 0 | \ |
154 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_5); \ |
155 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_6); \ |
156 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_7); \ |
157 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_8); \ |
158 | 0 | THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0); \ |
159 | 0 | } while(0) |
160 | 0 |
|
161 | 0 | #define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \ |
162 | 0 | do { \ |
163 | 0 | rotate_keys(K1, K2, K0); \ |
164 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \ |
165 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \ |
166 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \ |
167 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \ |
168 | 0 | THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1); \ |
169 | 0 | \ |
170 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \ |
171 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \ |
172 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \ |
173 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \ |
174 | 0 | THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0); \ |
175 | 0 | } while(0) |
176 | 0 |
|
177 | 0 | __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]); |
178 | 0 | __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]); |
179 | 0 | __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]); |
180 | 0 |
|
181 | 0 | const __m256i* in_mm = reinterpret_cast<const __m256i*>(in); |
182 | 0 | __m256i* out_mm = reinterpret_cast<__m256i*>(out); |
183 | 0 |
|
184 | 0 | while(blocks >= 2) |
185 | 0 | { |
186 | 0 | __m256i X0 = _mm256_loadu_si256(in_mm++); |
187 | 0 | __m256i X1 = _mm256_loadu_si256(in_mm++); |
188 | 0 | __m256i X2 = _mm256_loadu_si256(in_mm++); |
189 | 0 | __m256i X3 = _mm256_loadu_si256(in_mm++); |
190 | 0 |
|
191 | 0 | const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); |
192 | 0 |
|
193 | 0 | interleave_epi64(X0, X1); |
194 | 0 | interleave_epi64(X2, X3); |
195 | 0 |
|
196 | 0 | THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3); |
197 | 0 |
|
198 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 1, K2,K0,K1, 1, 2, 3); |
199 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 3, K1,K2,K0, 2, 3, 1); |
200 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 5, K0,K1,K2, 3, 1, 2); |
201 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 7, K2,K0,K1, 1, 2, 3); |
202 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 9, K1,K2,K0, 2, 3, 1); |
203 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2); |
204 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3); |
205 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1); |
206 | 0 | THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2); |
207 | 0 |
|
208 | 0 | deinterleave_epi64(X0, X1); |
209 | 0 | deinterleave_epi64(X2, X3); |
210 | 0 |
|
211 | 0 | _mm256_storeu_si256(out_mm++, X0); |
212 | 0 | _mm256_storeu_si256(out_mm++, X1); |
213 | 0 | _mm256_storeu_si256(out_mm++, X2); |
214 | 0 | _mm256_storeu_si256(out_mm++, X3); |
215 | 0 |
|
216 | 0 | blocks -= 2; |
217 | 0 | } |
218 | 0 |
|
219 | 0 | for(size_t i = 0; i != blocks; ++i) |
220 | 0 | { |
221 | 0 | __m256i X0 = _mm256_loadu_si256(in_mm++); |
222 | 0 | __m256i X1 = _mm256_loadu_si256(in_mm++); |
223 | 0 |
|
224 | 0 | const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); |
225 | 0 |
|
226 | 0 | interleave_epi64(X0, X1); |
227 | 0 |
|
228 | 0 | THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3); |
229 | 0 |
|
230 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 1, K2,K0,K1, 1, 2, 3); |
231 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 3, K1,K2,K0, 2, 3, 1); |
232 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 5, K0,K1,K2, 3, 1, 2); |
233 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 7, K2,K0,K1, 1, 2, 3); |
234 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 9, K1,K2,K0, 2, 3, 1); |
235 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2); |
236 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3); |
237 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1); |
238 | 0 | THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2); |
239 | 0 |
|
240 | 0 | deinterleave_epi64(X0, X1); |
241 | 0 |
|
242 | 0 | _mm256_storeu_si256(out_mm++, X0); |
243 | 0 | _mm256_storeu_si256(out_mm++, X1); |
244 | 0 | } |
245 | 0 |
|
246 | 0 | _mm256_zeroall(); |
247 | 0 |
|
248 | 0 | #undef THREEFISH_ENC_8_ROUNDS |
249 | 0 | #undef THREEFISH_ROUND |
250 | 0 | #undef THREEFISH_INJECT_KEY |
251 | 0 | #undef THREEFISH_DEC_2_8_ROUNDS |
252 | 0 | #undef THREEFISH_ROUND_2 |
253 | 0 | #undef THREEFISH_INJECT_KEY_2 |
254 | 0 | } |
255 | | |
256 | | BOTAN_FUNC_ISA("avx2") |
257 | | void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
258 | 0 | { |
259 | 0 | _mm256_zeroupper(); |
260 | 0 |
|
261 | 0 | const uint64_t* K = m_K.data(); |
262 | 0 | const uint64_t* T_64 = m_T.data(); |
263 | 0 |
|
264 | 0 | const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46); |
265 | 0 | const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33); |
266 | 0 | const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17); |
267 | 0 | const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44); |
268 | 0 | const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39); |
269 | 0 | const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13); |
270 | 0 | const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25); |
271 | 0 | const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8); |
272 | 0 |
|
273 | 0 | #define THREEFISH_ROUND(X0, X1, SHR) \ |
274 | 0 | do { \ |
275 | 0 | const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \ |
276 | 0 | X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \ |
277 | 0 | X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ |
278 | 0 | X1 = _mm256_xor_si256(X1, X0); \ |
279 | 0 | X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ |
280 | 0 | X0 = _mm256_sub_epi64(X0, X1); \ |
281 | 0 | } while(0) |
282 | 0 |
|
283 | 0 | #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHR) \ |
284 | 0 | do { \ |
285 | 0 | const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \ |
286 | 0 | X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \ |
287 | 0 | X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3)); \ |
288 | 0 | X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ |
289 | 0 | X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \ |
290 | 0 | X1 = _mm256_xor_si256(X1, X0); \ |
291 | 0 | X3 = _mm256_xor_si256(X3, X2); \ |
292 | 0 | X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ |
293 | 0 | X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \ |
294 | 0 | X0 = _mm256_sub_epi64(X0, X1); \ |
295 | 0 | X2 = _mm256_sub_epi64(X2, X3); \ |
296 | 0 | } while(0) |
297 | 0 |
|
298 | 0 | #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \ |
299 | 0 | do { \ |
300 | 0 | const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ |
301 | 0 | const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ |
302 | 0 | X0 = _mm256_sub_epi64(X0, K0); \ |
303 | 0 | X1 = _mm256_sub_epi64(X1, K1); \ |
304 | 0 | X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0)); \ |
305 | 0 | X0 = _mm256_sub_epi64(X0, T0); \ |
306 | 0 | X1 = _mm256_sub_epi64(X1, T1); \ |
307 | 0 | } while(0) |
308 | 0 |
|
309 | 0 | #define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \ |
310 | 0 | do { \ |
311 | 0 | THREEFISH_INJECT_KEY(X0, X1, R+1, K2, K3, T2, T0); \ |
312 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_8); \ |
313 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_7); \ |
314 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_6); \ |
315 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_5); \ |
316 | 0 | \ |
317 | 0 | THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \ |
318 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_4); \ |
319 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_3); \ |
320 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_2); \ |
321 | 0 | THREEFISH_ROUND(X0, X1, ROTATE_1); \ |
322 | 0 | } while(0) |
323 | 0 |
|
324 | 0 | #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \ |
325 | 0 | do { \ |
326 | 0 | const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ |
327 | 0 | __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ |
328 | 0 | X0 = _mm256_sub_epi64(X0, K0); \ |
329 | 0 | X2 = _mm256_sub_epi64(X2, K0); \ |
330 | 0 | X1 = _mm256_sub_epi64(X1, K1); \ |
331 | 0 | X3 = _mm256_sub_epi64(X3, K1); \ |
332 | 0 | T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \ |
333 | 0 | X0 = _mm256_sub_epi64(X0, T0); \ |
334 | 0 | X2 = _mm256_sub_epi64(X2, T0); \ |
335 | 0 | X1 = _mm256_sub_epi64(X1, T1); \ |
336 | 0 | X3 = _mm256_sub_epi64(X3, T1); \ |
337 | 0 | } while(0) |
338 | 0 |
|
339 | 0 | #define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \ |
340 | 0 | do { \ |
341 | 0 | THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K2, K3, T2, T0); \ |
342 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \ |
343 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \ |
344 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \ |
345 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \ |
346 | 0 | \ |
347 | 0 | THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \ |
348 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \ |
349 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \ |
350 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \ |
351 | 0 | THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \ |
352 | 0 | } while(0) |
353 | 0 |
|
354 | | /* |
355 | | v1.0 key schedule: 9 ymm registers (only need 2 or 3) |
356 | | (0,1,2,3),(4,5,6,7) [8] |
357 | | then mutating with vpermq |
358 | | */ |
359 | 0 | const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]); |
360 | 0 | const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]); |
361 | 0 | const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]); |
362 | 0 | const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]); |
363 | 0 | const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]); |
364 | 0 | const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]); |
365 | 0 | const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]); |
366 | 0 | const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]); |
367 | 0 | const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]); |
368 | 0 |
|
369 | 0 | const __m256i* in_mm = reinterpret_cast<const __m256i*>(in); |
370 | 0 | __m256i* out_mm = reinterpret_cast<__m256i*>(out); |
371 | 0 |
|
372 | 0 | while(blocks >= 2) |
373 | 0 | { |
374 | 0 | __m256i X0 = _mm256_loadu_si256(in_mm++); |
375 | 0 | __m256i X1 = _mm256_loadu_si256(in_mm++); |
376 | 0 | __m256i X2 = _mm256_loadu_si256(in_mm++); |
377 | 0 | __m256i X3 = _mm256_loadu_si256(in_mm++); |
378 | 0 |
|
379 | 0 | const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); |
380 | 0 |
|
381 | 0 | interleave_epi64(X0, X1); |
382 | 0 | interleave_epi64(X2, X3); |
383 | 0 |
|
384 | 0 | THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2); |
385 | 0 | THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1); |
386 | 0 | THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3); |
387 | 0 | THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2); |
388 | 0 | THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 9, K0,K1,K2, 2, 3, 1); |
389 | 0 | THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 7, K7,K8,K0, 1, 2, 3); |
390 | 0 | THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 5, K5,K6,K7, 3, 1, 2); |
391 | 0 | THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 3, K3,K4,K5, 2, 3, 1); |
392 | 0 | THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 1, K1,K2,K3, 1, 2, 3); |
393 | 0 |
|
394 | 0 | THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3); |
395 | 0 |
|
396 | 0 | deinterleave_epi64(X0, X1); |
397 | 0 | deinterleave_epi64(X2, X3); |
398 | 0 |
|
399 | 0 | _mm256_storeu_si256(out_mm++, X0); |
400 | 0 | _mm256_storeu_si256(out_mm++, X1); |
401 | 0 | _mm256_storeu_si256(out_mm++, X2); |
402 | 0 | _mm256_storeu_si256(out_mm++, X3); |
403 | 0 |
|
404 | 0 | blocks -= 2; |
405 | 0 | } |
406 | 0 |
|
407 | 0 | for(size_t i = 0; i != blocks; ++i) |
408 | 0 | { |
409 | 0 | __m256i X0 = _mm256_loadu_si256(in_mm++); |
410 | 0 | __m256i X1 = _mm256_loadu_si256(in_mm++); |
411 | 0 |
|
412 | 0 | const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); |
413 | 0 |
|
414 | 0 | interleave_epi64(X0, X1); |
415 | 0 |
|
416 | 0 | THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2); |
417 | 0 | THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1); |
418 | 0 | THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3); |
419 | 0 | THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2); |
420 | 0 | THREEFISH_DEC_8_ROUNDS(X0, X1, 9, K0,K1,K2, 2, 3, 1); |
421 | 0 | THREEFISH_DEC_8_ROUNDS(X0, X1, 7, K7,K8,K0, 1, 2, 3); |
422 | 0 | THREEFISH_DEC_8_ROUNDS(X0, X1, 5, K5,K6,K7, 3, 1, 2); |
423 | 0 | THREEFISH_DEC_8_ROUNDS(X0, X1, 3, K3,K4,K5, 2, 3, 1); |
424 | 0 | THREEFISH_DEC_8_ROUNDS(X0, X1, 1, K1,K2,K3, 1, 2, 3); |
425 | 0 |
|
426 | 0 | THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3); |
427 | 0 |
|
428 | 0 | deinterleave_epi64(X0, X1); |
429 | 0 |
|
430 | 0 | _mm256_storeu_si256(out_mm++, X0); |
431 | 0 | _mm256_storeu_si256(out_mm++, X1); |
432 | 0 | } |
433 | 0 |
|
434 | 0 | #undef THREEFISH_DEC_8_ROUNDS |
435 | 0 | #undef THREEFISH_ROUND |
436 | 0 | #undef THREEFISH_INJECT_KEY |
437 | 0 | #undef THREEFISH_DEC_2_8_ROUNDS |
438 | 0 | #undef THREEFISH_ROUND_2 |
439 | 0 | #undef THREEFISH_INJECT_KEY_2 |
440 | 0 |
|
441 | 0 | _mm256_zeroall(); |
442 | 0 | } |
443 | | |
444 | | } |