/src/cryptopp/blake2b_simd.cpp
Line | Count | Source |
1 | | // blake2b_simd.cpp - written and placed in the public domain by |
2 | | // Samuel Neves, Jeffrey Walton, Uri Blumenthal |
3 | | // and Marcel Raad. |
4 | | // |
5 | | // This source file uses intrinsics to gain access to ARMv7a/ARMv8a |
6 | | // NEON, Power8 and SSE4.1 instructions. A separate source file is |
7 | | // needed because additional CXXFLAGS are required to enable the |
8 | | // appropriate instruction sets in some build configurations. |
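 | | // |
 | | // For example, a GNU-style build might compile just this translation unit |
 | | // with -msse4.1 (x86/x64), -march=armv8-a (Aarch64) or -mcpu=power8 (PPC64) |
 | | // so the intrinsics below are available, while the rest of the library keeps |
 | | // the baseline flags. The options are illustrative; the library's makefiles |
 | | // set the real ones per platform. |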
9 | | |
10 | | #include "pch.h" |
11 | | #include "config.h" |
12 | | #include "misc.h" |
13 | | #include "blake2.h" |
14 | | |
15 | | // Uncomment for benchmarking C++ against SSE4.1, NEON or Altivec. |
16 | | // Do so in both blake2.cpp and blake2b_simd.cpp. |
17 | | // #undef CRYPTOPP_SSE41_AVAILABLE |
18 | | // #undef CRYPTOPP_ARM_NEON_AVAILABLE |
19 | | // #undef CRYPTOPP_ALTIVEC_AVAILABLE |
20 | | |
21 | | // Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about |
22 | | // 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367. |
23 | | #if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT) |
24 | | # undef CRYPTOPP_ARM_NEON_AVAILABLE |
25 | | #endif |
26 | | |
27 | | // BLAKE2s bug on AIX 7.1 (POWER7) with XLC 12.01 |
28 | | // https://github.com/weidai11/cryptopp/issues/743 |
29 | | #if defined(__xlC__) && (__xlC__ < 0x0d01) |
30 | | # define CRYPTOPP_DISABLE_ALTIVEC 1 |
31 | | # undef CRYPTOPP_POWER8_AVAILABLE |
32 | | # undef CRYPTOPP_ALTIVEC_AVAILABLE |
33 | | #endif |
34 | | |
35 | | #if defined(__XOP__) |
36 | | # if defined(CRYPTOPP_GCC_COMPATIBLE) |
37 | | # include <x86intrin.h> |
38 | | # endif |
39 | | # include <ammintrin.h> |
40 | | #endif // XOP |
41 | | |
42 | | #if (CRYPTOPP_SSE41_AVAILABLE) |
43 | | # include <emmintrin.h> |
44 | | # include <tmmintrin.h> |
45 | | # include <smmintrin.h> |
46 | | #endif |
47 | | |
48 | | #if (CRYPTOPP_ARM_NEON_HEADER) |
49 | | # include <arm_neon.h> |
50 | | #endif |
51 | | |
52 | | #if (CRYPTOPP_ARM_ACLE_HEADER) |
53 | | # include <stdint.h> |
54 | | # include <arm_acle.h> |
55 | | #endif |
56 | | |
57 | | #if (CRYPTOPP_POWER8_AVAILABLE) |
58 | | # include "ppc_simd.h" |
59 | | #endif |
60 | | |
61 | | #if defined(CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE) |
62 | | /* Ignore "warning: vec_lvsl is deprecated..." */ |
63 | | # pragma GCC diagnostic ignored "-Wdeprecated" |
64 | | #endif |
65 | | |
66 | | // Squash MS LNK4221 and libtool warnings |
67 | | extern const char BLAKE2B_SIMD_FNAME[] = __FILE__; |
68 | | |
69 | | NAMESPACE_BEGIN(CryptoPP) |
70 | | |
71 | | // Exported by blake2.cpp |
72 | | extern const word32 BLAKE2S_IV[8]; |
73 | | extern const word64 BLAKE2B_IV[8]; |
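 | | // A rough sketch of the runtime dispatch expected in blake2.cpp (illustrative |
 | | // only; the final comment stands in for the portable C++ fallback routine): |
 | | // |
 | | //   if (HasSSE41()) |
 | | //       BLAKE2_Compress64_SSE4(input, state); |
 | | //   else if (HasNEON()) |
 | | //       BLAKE2_Compress64_NEON(input, state); |
 | | //   else if (HasPower8()) |
 | | //       BLAKE2_Compress64_POWER8(input, state); |
 | | //   else |
 | | //       /* portable C++ compression in blake2.cpp */; |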
74 | | |
75 | | #if CRYPTOPP_SSE41_AVAILABLE |
76 | | |
77 | 3.67M | #define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p)) |
78 | 816k | #define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r) |
79 | | #define TOF(reg) _mm_castsi128_ps((reg)) |
80 | | #define TOI(reg) _mm_castps_si128((reg)) |
81 | | |
82 | | void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state) |
83 | 204k | { |
84 | 204k | #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \ |
85 | 204k | do { \ |
86 | 204k | b0 = _mm_unpacklo_epi64(m0, m1); \ |
87 | 204k | b1 = _mm_unpacklo_epi64(m2, m3); \ |
88 | 204k | } while(0) |
89 | | |
90 | 204k | #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \ |
91 | 204k | do { \ |
92 | 204k | b0 = _mm_unpackhi_epi64(m0, m1); \ |
93 | 204k | b1 = _mm_unpackhi_epi64(m2, m3); \ |
94 | 204k | } while(0) |
95 | | |
96 | 204k | #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \ |
97 | 204k | do { \ |
98 | 204k | b0 = _mm_unpacklo_epi64(m4, m5); \ |
99 | 204k | b1 = _mm_unpacklo_epi64(m6, m7); \ |
100 | 204k | } while(0) |
101 | | |
102 | 204k | #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \ |
103 | 204k | do { \ |
104 | 204k | b0 = _mm_unpackhi_epi64(m4, m5); \ |
105 | 204k | b1 = _mm_unpackhi_epi64(m6, m7); \ |
106 | 204k | } while(0) |
107 | | |
108 | 204k | #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \ |
109 | 204k | do { \ |
110 | 204k | b0 = _mm_unpacklo_epi64(m7, m2); \ |
111 | 204k | b1 = _mm_unpackhi_epi64(m4, m6); \ |
112 | 204k | } while(0) |
113 | | |
114 | 204k | #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \ |
115 | 204k | do { \ |
116 | 204k | b0 = _mm_unpacklo_epi64(m5, m4); \ |
117 | 204k | b1 = _mm_alignr_epi8(m3, m7, 8); \ |
118 | 204k | } while(0) |
119 | | |
120 | 204k | #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \ |
121 | 204k | do { \ |
122 | 204k | b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ |
123 | 204k | b1 = _mm_unpackhi_epi64(m5, m2); \ |
124 | 204k | } while(0) |
125 | | |
126 | 204k | #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \ |
127 | 204k | do { \ |
128 | 204k | b0 = _mm_unpacklo_epi64(m6, m1); \ |
129 | 204k | b1 = _mm_unpackhi_epi64(m3, m1); \ |
130 | 204k | } while(0) |
131 | | |
132 | 204k | #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \ |
133 | 204k | do { \ |
134 | 204k | b0 = _mm_alignr_epi8(m6, m5, 8); \ |
135 | 204k | b1 = _mm_unpackhi_epi64(m2, m7); \ |
136 | 204k | } while(0) |
137 | | |
138 | 204k | #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \ |
139 | 204k | do { \ |
140 | 204k | b0 = _mm_unpacklo_epi64(m4, m0); \ |
141 | 204k | b1 = _mm_blend_epi16(m1, m6, 0xF0); \ |
142 | 204k | } while(0) |
143 | | |
144 | 204k | #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \ |
145 | 204k | do { \ |
146 | 204k | b0 = _mm_blend_epi16(m5, m1, 0xF0); \ |
147 | 204k | b1 = _mm_unpackhi_epi64(m3, m4); \ |
148 | 204k | } while(0) |
149 | | |
150 | 204k | #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \ |
151 | 204k | do { \ |
152 | 204k | b0 = _mm_unpacklo_epi64(m7, m3); \ |
153 | 204k | b1 = _mm_alignr_epi8(m2, m0, 8); \ |
154 | 204k | } while(0) |
155 | | |
156 | 204k | #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \ |
157 | 204k | do { \ |
158 | 204k | b0 = _mm_unpackhi_epi64(m3, m1); \ |
159 | 204k | b1 = _mm_unpackhi_epi64(m6, m5); \ |
160 | 204k | } while(0) |
161 | | |
162 | 204k | #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \ |
163 | 204k | do { \ |
164 | 204k | b0 = _mm_unpackhi_epi64(m4, m0); \ |
165 | 204k | b1 = _mm_unpacklo_epi64(m6, m7); \ |
166 | 204k | } while(0) |
167 | | |
168 | 204k | #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \ |
169 | 204k | do { \ |
170 | 204k | b0 = _mm_blend_epi16(m1, m2, 0xF0); \ |
171 | 204k | b1 = _mm_blend_epi16(m2, m7, 0xF0); \ |
172 | 204k | } while(0) |
173 | | |
174 | 204k | #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \ |
175 | 204k | do { \ |
176 | 204k | b0 = _mm_unpacklo_epi64(m3, m5); \ |
177 | 204k | b1 = _mm_unpacklo_epi64(m0, m4); \ |
178 | 204k | } while(0) |
179 | | |
180 | 204k | #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \ |
181 | 204k | do { \ |
182 | 204k | b0 = _mm_unpackhi_epi64(m4, m2); \ |
183 | 204k | b1 = _mm_unpacklo_epi64(m1, m5); \ |
184 | 204k | } while(0) |
185 | | |
186 | 204k | #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \ |
187 | 204k | do { \ |
188 | 204k | b0 = _mm_blend_epi16(m0, m3, 0xF0); \ |
189 | 204k | b1 = _mm_blend_epi16(m2, m7, 0xF0); \ |
190 | 204k | } while(0) |
191 | | |
192 | 204k | #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \ |
193 | 204k | do { \ |
194 | 204k | b0 = _mm_blend_epi16(m7, m5, 0xF0); \ |
195 | 204k | b1 = _mm_blend_epi16(m3, m1, 0xF0); \ |
196 | 204k | } while(0) |
197 | | |
198 | 204k | #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \ |
199 | 204k | do { \ |
200 | 204k | b0 = _mm_alignr_epi8(m6, m0, 8); \ |
201 | 204k | b1 = _mm_blend_epi16(m4, m6, 0xF0); \ |
202 | 204k | } while(0) |
203 | | |
204 | 204k | #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \ |
205 | 204k | do { \ |
206 | 204k | b0 = _mm_unpacklo_epi64(m1, m3); \ |
207 | 204k | b1 = _mm_unpacklo_epi64(m0, m4); \ |
208 | 204k | } while(0) |
209 | | |
210 | 204k | #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \ |
211 | 204k | do { \ |
212 | 204k | b0 = _mm_unpacklo_epi64(m6, m5); \ |
213 | 204k | b1 = _mm_unpackhi_epi64(m5, m1); \ |
214 | 204k | } while(0) |
215 | | |
216 | 204k | #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \ |
217 | 204k | do { \ |
218 | 204k | b0 = _mm_blend_epi16(m2, m3, 0xF0); \ |
219 | 204k | b1 = _mm_unpackhi_epi64(m7, m0); \ |
220 | 204k | } while(0) |
221 | | |
222 | 204k | #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \ |
223 | 204k | do { \ |
224 | 204k | b0 = _mm_unpackhi_epi64(m6, m2); \ |
225 | 204k | b1 = _mm_blend_epi16(m7, m4, 0xF0); \ |
226 | 204k | } while(0) |
227 | | |
228 | 204k | #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \ |
229 | 204k | do { \ |
230 | 204k | b0 = _mm_blend_epi16(m6, m0, 0xF0); \ |
231 | 204k | b1 = _mm_unpacklo_epi64(m7, m2); \ |
232 | 204k | } while(0) |
233 | | |
234 | 204k | #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \ |
235 | 204k | do { \ |
236 | 204k | b0 = _mm_unpackhi_epi64(m2, m7); \ |
237 | 204k | b1 = _mm_alignr_epi8(m5, m6, 8); \ |
238 | 204k | } while(0) |
239 | | |
240 | 204k | #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \ |
241 | 204k | do { \ |
242 | 204k | b0 = _mm_unpacklo_epi64(m0, m3); \ |
243 | 204k | b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ |
244 | 204k | } while(0) |
245 | | |
246 | 204k | #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \ |
247 | 204k | do { \ |
248 | 204k | b0 = _mm_unpackhi_epi64(m3, m1); \ |
249 | 204k | b1 = _mm_blend_epi16(m1, m5, 0xF0); \ |
250 | 204k | } while(0) |
251 | | |
252 | 204k | #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \ |
253 | 204k | do { \ |
254 | 204k | b0 = _mm_unpackhi_epi64(m6, m3); \ |
255 | 204k | b1 = _mm_blend_epi16(m6, m1, 0xF0); \ |
256 | 204k | } while(0) |
257 | | |
258 | 204k | #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \ |
259 | 204k | do { \ |
260 | 204k | b0 = _mm_alignr_epi8(m7, m5, 8); \ |
261 | 204k | b1 = _mm_unpackhi_epi64(m0, m4); \ |
262 | 204k | } while(0) |
263 | | |
264 | 204k | #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \ |
265 | 204k | do { \ |
266 | 204k | b0 = _mm_unpackhi_epi64(m2, m7); \ |
267 | 204k | b1 = _mm_unpacklo_epi64(m4, m1); \ |
268 | 204k | } while(0) |
269 | | |
270 | 204k | #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \ |
271 | 204k | do { \ |
272 | 204k | b0 = _mm_unpacklo_epi64(m0, m2); \ |
273 | 204k | b1 = _mm_unpacklo_epi64(m3, m5); \ |
274 | 204k | } while(0) |
275 | | |
276 | 204k | #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \ |
277 | 204k | do { \ |
278 | 204k | b0 = _mm_unpacklo_epi64(m3, m7); \ |
279 | 204k | b1 = _mm_alignr_epi8(m0, m5, 8); \ |
280 | 204k | } while(0) |
281 | | |
282 | 204k | #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \ |
283 | 204k | do { \ |
284 | 204k | b0 = _mm_unpackhi_epi64(m7, m4); \ |
285 | 204k | b1 = _mm_alignr_epi8(m4, m1, 8); \ |
286 | 204k | } while(0) |
287 | | |
288 | 204k | #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \ |
289 | 204k | do { \ |
290 | 204k | b0 = m6; \ |
291 | 204k | b1 = _mm_alignr_epi8(m5, m0, 8); \ |
292 | 204k | } while(0) |
293 | | |
294 | 204k | #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \ |
295 | 204k | do { \ |
296 | 204k | b0 = _mm_blend_epi16(m1, m3, 0xF0); \ |
297 | 204k | b1 = m2; \ |
298 | 204k | } while(0) |
299 | | |
300 | 204k | #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \ |
301 | 204k | do { \ |
302 | 204k | b0 = _mm_unpacklo_epi64(m5, m4); \ |
303 | 204k | b1 = _mm_unpackhi_epi64(m3, m0); \ |
304 | 204k | } while(0) |
305 | | |
306 | 204k | #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \ |
307 | 204k | do { \ |
308 | 204k | b0 = _mm_unpacklo_epi64(m1, m2); \ |
309 | 204k | b1 = _mm_blend_epi16(m3, m2, 0xF0); \ |
310 | 204k | } while(0) |
311 | | |
312 | 204k | #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \ |
313 | 204k | do { \ |
314 | 204k | b0 = _mm_unpackhi_epi64(m7, m4); \ |
315 | 204k | b1 = _mm_unpackhi_epi64(m1, m6); \ |
316 | 204k | } while(0) |
317 | | |
318 | 204k | #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \ |
319 | 204k | do { \ |
320 | 204k | b0 = _mm_alignr_epi8(m7, m5, 8); \ |
321 | 204k | b1 = _mm_unpacklo_epi64(m6, m0); \ |
322 | 204k | } while(0) |
323 | | |
324 | 204k | #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \ |
325 | 204k | do { \ |
326 | 204k | b0 = _mm_unpacklo_epi64(m0, m1); \ |
327 | 204k | b1 = _mm_unpacklo_epi64(m2, m3); \ |
328 | 204k | } while(0) |
329 | | |
330 | 204k | #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \ |
331 | 204k | do { \ |
332 | 204k | b0 = _mm_unpackhi_epi64(m0, m1); \ |
333 | 204k | b1 = _mm_unpackhi_epi64(m2, m3); \ |
334 | 204k | } while(0) |
335 | | |
336 | 204k | #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \ |
337 | 204k | do { \ |
338 | 204k | b0 = _mm_unpacklo_epi64(m4, m5); \ |
339 | 204k | b1 = _mm_unpacklo_epi64(m6, m7); \ |
340 | 204k | } while(0) |
341 | | |
342 | 204k | #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \ |
343 | 204k | do { \ |
344 | 204k | b0 = _mm_unpackhi_epi64(m4, m5); \ |
345 | 204k | b1 = _mm_unpackhi_epi64(m6, m7); \ |
346 | 204k | } while(0) |
347 | | |
348 | 204k | #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \ |
349 | 204k | do { \ |
350 | 204k | b0 = _mm_unpacklo_epi64(m7, m2); \ |
351 | 204k | b1 = _mm_unpackhi_epi64(m4, m6); \ |
352 | 204k | } while(0) |
353 | | |
354 | 204k | #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \ |
355 | 204k | do { \ |
356 | 204k | b0 = _mm_unpacklo_epi64(m5, m4); \ |
357 | 204k | b1 = _mm_alignr_epi8(m3, m7, 8); \ |
358 | 204k | } while(0) |
359 | | |
360 | 204k | #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \ |
361 | 204k | do { \ |
362 | 204k | b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ |
363 | 204k | b1 = _mm_unpackhi_epi64(m5, m2); \ |
364 | 204k | } while(0) |
365 | | |
366 | 204k | #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \ |
367 | 204k | do { \ |
368 | 204k | b0 = _mm_unpacklo_epi64(m6, m1); \ |
369 | 204k | b1 = _mm_unpackhi_epi64(m3, m1); \ |
370 | 204k | } while(0) |
371 | | |
372 | | #ifdef __XOP__ |
373 | | # define MM_ROTI_EPI64(r, c) \ |
374 | | _mm_roti_epi64(r, c) |
375 | | #else |
376 | 204k | # define MM_ROTI_EPI64(x, c) \ |
377 | 39.1M | (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ |
378 | 39.1M | : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ |
379 | 29.3M | : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ |
380 | 19.5M | : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ |
381 | 9.79M | : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) |
382 | 204k | #endif |
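 | | // MM_ROTI_EPI64(x, -n) is a 64-bit rotate-right by n. The scalar reference is |
 | | // (x >> n) | (x << (64 - n)); the special cases map it onto cheaper forms: |
 | | // n == 32 is a 32-bit lane swap, n == 24 and n == 16 are byte shuffles through |
 | | // r24/r16, and n == 63 equals a rotate-left by 1, computed as (x >> 63) ^ (x + x). |
 | | // With XOP the native _mm_roti_epi64 is used instead. |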
383 | | |
384 | 204k | #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
385 | 4.89M | row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ |
386 | 4.89M | row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ |
387 | 4.89M | \ |
388 | 4.89M | row4l = _mm_xor_si128(row4l, row1l); \ |
389 | 4.89M | row4h = _mm_xor_si128(row4h, row1h); \ |
390 | 4.89M | \ |
391 | 4.89M | row4l = MM_ROTI_EPI64(row4l, -32); \ |
392 | 4.89M | row4h = MM_ROTI_EPI64(row4h, -32); \ |
393 | 4.89M | \ |
394 | 4.89M | row3l = _mm_add_epi64(row3l, row4l); \ |
395 | 4.89M | row3h = _mm_add_epi64(row3h, row4h); \ |
396 | 4.89M | \ |
397 | 4.89M | row2l = _mm_xor_si128(row2l, row3l); \ |
398 | 4.89M | row2h = _mm_xor_si128(row2h, row3h); \ |
399 | 4.89M | \ |
400 | 4.89M | row2l = MM_ROTI_EPI64(row2l, -24); \ |
401 | 4.89M | row2h = MM_ROTI_EPI64(row2h, -24); |
402 | | |
403 | 204k | #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
404 | 4.89M | row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ |
405 | 4.89M | row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ |
406 | 4.89M | \ |
407 | 4.89M | row4l = _mm_xor_si128(row4l, row1l); \ |
408 | 4.89M | row4h = _mm_xor_si128(row4h, row1h); \ |
409 | 4.89M | \ |
410 | 4.89M | row4l = MM_ROTI_EPI64(row4l, -16); \ |
411 | 4.89M | row4h = MM_ROTI_EPI64(row4h, -16); \ |
412 | 4.89M | \ |
413 | 4.89M | row3l = _mm_add_epi64(row3l, row4l); \ |
414 | 4.89M | row3h = _mm_add_epi64(row3h, row4h); \ |
415 | 4.89M | \ |
416 | 4.89M | row2l = _mm_xor_si128(row2l, row3l); \ |
417 | 4.89M | row2h = _mm_xor_si128(row2h, row3h); \ |
418 | 4.89M | \ |
419 | 4.89M | row2l = MM_ROTI_EPI64(row2l, -63); \ |
420 | 4.89M | row2h = MM_ROTI_EPI64(row2h, -63);
421 | 204k | |
422 | 204k | #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
423 | 2.44M | t0 = row4l;\ |
424 | 2.44M | t1 = row2l;\ |
425 | 2.44M | row4l = row3l;\ |
426 | 2.44M | row3l = row3h;\ |
427 | 2.44M | row3h = row4l;\ |
428 | 2.44M | row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ |
429 | 2.44M | row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ |
430 | 2.44M | row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ |
431 | 2.44M | row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) |
432 | | |
433 | 204k | #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
434 | 2.44M | t0 = row3l;\ |
435 | 2.44M | row3l = row3h;\ |
436 | 2.44M | row3h = t0;\ |
437 | 2.44M | t0 = row2l;\ |
438 | 2.44M | t1 = row4l;\ |
439 | 2.44M | row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ |
440 | 2.44M | row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ |
441 | 2.44M | row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ |
442 | 2.44M | row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) |
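 | | // DIAGONALIZE rotates rows 2, 3 and 4 of the 4x4 BLAKE2b state left by one, |
 | | // two and three columns so the same column-wise G function can be applied to |
 | | // the diagonals; UNDIAGONALIZE rotates them back. |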
443 | | |
444 | 204k | #define BLAKE2B_ROUND(r) \ |
445 | 2.44M | BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \ |
446 | 2.44M | BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
447 | 2.44M | BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \ |
448 | 2.44M | BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
449 | 2.44M | BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ |
450 | 2.44M | BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \ |
451 | 2.44M | BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
452 | 2.44M | BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \ |
453 | 2.44M | BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
454 | 2.44M | BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); |
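 | | // Each round applies G to the four columns and then to the four diagonals. |
 | | // The BLAKE2B_LOAD_MSG_r_i macros hard-code the sigma message permutation for |
 | | // round r (rounds 10 and 11 reuse the permutations of rounds 0 and 1), so no |
 | | // sigma table lookup is needed at runtime. |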
455 | | |
456 | 204k | __m128i row1l, row1h; |
457 | 204k | __m128i row2l, row2h; |
458 | 204k | __m128i row3l, row3h; |
459 | 204k | __m128i row4l, row4h; |
460 | 204k | __m128i b0, b1; |
461 | 204k | __m128i t0, t1; |
462 | | |
463 | 204k | const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); |
464 | 204k | const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); |
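 | | // r16 and r24 are _mm_shuffle_epi8 masks: each 8-byte lane is rotated right |
 | | // by 2 and 3 bytes respectively, i.e. each 64-bit word is rotated right by |
 | | // 16 and 24 bits. |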
465 | | |
466 | 204k | const __m128i m0 = LOADU(input + 00); |
467 | 204k | const __m128i m1 = LOADU(input + 16); |
468 | 204k | const __m128i m2 = LOADU(input + 32); |
469 | 204k | const __m128i m3 = LOADU(input + 48); |
470 | 204k | const __m128i m4 = LOADU(input + 64); |
471 | 204k | const __m128i m5 = LOADU(input + 80); |
472 | 204k | const __m128i m6 = LOADU(input + 96); |
473 | 204k | const __m128i m7 = LOADU(input + 112); |
474 | | |
475 | 204k | row1l = LOADU(state.h()+0); |
476 | 204k | row1h = LOADU(state.h()+2); |
477 | 204k | row2l = LOADU(state.h()+4); |
478 | 204k | row2h = LOADU(state.h()+6); |
479 | 204k | row3l = LOADU(BLAKE2B_IV+0); |
480 | 204k | row3h = LOADU(BLAKE2B_IV+2); |
481 | 204k | row4l = _mm_xor_si128(LOADU(BLAKE2B_IV+4), LOADU(state.t()+0)); |
482 | 204k | row4h = _mm_xor_si128(LOADU(BLAKE2B_IV+6), LOADU(state.f()+0)); |
483 | | |
484 | 204k | BLAKE2B_ROUND(0); |
485 | 204k | BLAKE2B_ROUND(1); |
486 | 204k | BLAKE2B_ROUND(2); |
487 | 204k | BLAKE2B_ROUND(3); |
488 | 204k | BLAKE2B_ROUND(4); |
489 | 204k | BLAKE2B_ROUND(5); |
490 | 204k | BLAKE2B_ROUND(6); |
491 | 204k | BLAKE2B_ROUND(7); |
492 | 204k | BLAKE2B_ROUND(8); |
493 | 204k | BLAKE2B_ROUND(9); |
494 | 204k | BLAKE2B_ROUND(10); |
495 | 204k | BLAKE2B_ROUND(11); |
496 | | |
497 | 204k | row1l = _mm_xor_si128(row3l, row1l); |
498 | 204k | row1h = _mm_xor_si128(row3h, row1h); |
499 | 204k | STOREU(state.h()+0, _mm_xor_si128(LOADU(state.h()+0), row1l)); |
500 | 204k | STOREU(state.h()+2, _mm_xor_si128(LOADU(state.h()+2), row1h)); |
501 | 204k | row2l = _mm_xor_si128(row4l, row2l); |
502 | 204k | row2h = _mm_xor_si128(row4h, row2h); |
503 | 204k | STOREU(state.h()+4, _mm_xor_si128(LOADU(state.h()+4), row2l)); |
504 | 204k | STOREU(state.h()+6, _mm_xor_si128(LOADU(state.h()+6), row2h)); |
505 | 204k | } |
506 | | #endif // CRYPTOPP_SSE41_AVAILABLE |
507 | | |
508 | | #if CRYPTOPP_ARM_NEON_AVAILABLE |
509 | | void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state) |
510 | | { |
511 | | #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \ |
512 | | do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0) |
513 | | |
514 | | #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \ |
515 | | do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0) |
516 | | |
517 | | #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \ |
518 | | do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0) |
519 | | |
520 | | #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \ |
521 | | do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0) |
522 | | |
523 | | #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \ |
524 | | do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0) |
525 | | |
526 | | #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \ |
527 | | do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0) |
528 | | |
529 | | #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \ |
530 | | do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0) |
531 | | |
532 | | #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \ |
533 | | do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0) |
534 | | |
535 | | #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \ |
536 | | do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0) |
537 | | |
538 | | #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \ |
539 | | do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0) |
540 | | |
541 | | #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \ |
542 | | do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0) |
543 | | |
544 | | #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \ |
545 | | do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0) |
546 | | |
547 | | #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \ |
548 | | do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0) |
549 | | |
550 | | #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \ |
551 | | do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0) |
552 | | |
553 | | #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \ |
554 | | do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0) |
555 | | |
556 | | #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \ |
557 | | do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0) |
558 | | |
559 | | #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \ |
560 | | do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0) |
561 | | |
562 | | #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \ |
563 | | do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0) |
564 | | |
565 | | #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \ |
566 | | do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0) |
567 | | |
568 | | #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \ |
569 | | do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0) |
570 | | |
571 | | #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \ |
572 | | do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0) |
573 | | |
574 | | #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \ |
575 | | do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0) |
576 | | |
577 | | #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \ |
578 | | do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0) |
579 | | |
580 | | #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \ |
581 | | do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0) |
582 | | |
583 | | #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \ |
584 | | do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0) |
585 | | |
586 | | #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \ |
587 | | do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0) |
588 | | |
589 | | #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \ |
590 | | do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0) |
591 | | |
592 | | #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \ |
593 | | do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0) |
594 | | |
595 | | #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \ |
596 | | do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0) |
597 | | |
598 | | #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \ |
599 | | do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0) |
600 | | |
601 | | #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \ |
602 | | do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0) |
603 | | |
604 | | #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \ |
605 | | do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0) |
606 | | |
607 | | #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \ |
608 | | do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0) |
609 | | |
610 | | #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \ |
611 | | do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0) |
612 | | |
613 | | #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \ |
614 | | do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0) |
615 | | |
616 | | #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \ |
617 | | do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0) |
618 | | |
619 | | #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \ |
620 | | do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0) |
621 | | |
622 | | #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \ |
623 | | do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0) |
624 | | |
625 | | #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \ |
626 | | do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0) |
627 | | |
628 | | #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \ |
629 | | do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0) |
630 | | |
631 | | #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \ |
632 | | do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0) |
633 | | |
634 | | #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \ |
635 | | do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0) |
636 | | |
637 | | #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \ |
638 | | do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0) |
639 | | |
640 | | #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \ |
641 | | do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0) |
642 | | |
643 | | #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \ |
644 | | do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0) |
645 | | |
646 | | #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \ |
647 | | do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0) |
648 | | |
649 | | #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \ |
650 | | do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0) |
651 | | |
652 | | #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \ |
653 | | do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0) |
654 | | |
655 | | #define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x)))) |
656 | | |
657 | | #define vrorq_n_u64_24(x) vcombine_u64( \ |
658 | | vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \ |
659 | | vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3))) |
660 | | |
661 | | #define vrorq_n_u64_16(x) vcombine_u64( \ |
662 | | vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \ |
663 | | vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2))) |
664 | | |
665 | | #define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63)) |
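 | | // NEON has no 64-bit vector rotate, so the four rotations are emulated: |
 | | // rotate-right by 32 swaps the 32-bit halves of each lane, 24 and 16 rotate |
 | | // the bytes of each half with vext_u8, and 63 is a rotate-left by 1, computed |
 | | // as (x + x) ^ (x >> 63). |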
666 | | |
667 | | #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
668 | | do { \ |
669 | | row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \ |
670 | | row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \ |
671 | | row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \ |
672 | | row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \ |
673 | | row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \ |
674 | | row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \ |
675 | | row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \ |
676 | | } while(0) |
677 | | |
678 | | #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
679 | | do { \ |
680 | | row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \ |
681 | | row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \ |
682 | | row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \ |
683 | | row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \ |
684 | | row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \ |
685 | | row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \ |
686 | | row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \ |
687 | | } while(0) |
688 | | |
689 | | #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
690 | | do { \ |
691 | | uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \ |
692 | | uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \ |
693 | | row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \ |
694 | | t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \ |
695 | | row4l = t0; row4h = t1; \ |
696 | | } while(0) |
697 | | |
698 | | #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
699 | | do { \ |
700 | | uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \ |
701 | | uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \ |
702 | | row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \ |
703 | | t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \ |
704 | | row4l = t0; row4h = t1; \ |
705 | | } while(0) |
706 | | |
707 | | #define BLAKE2B_ROUND(r) \ |
708 | | do { \ |
709 | | uint64x2_t b0, b1; \ |
710 | | BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \ |
711 | | BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
712 | | BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \ |
713 | | BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
714 | | BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ |
715 | | BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \ |
716 | | BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
717 | | BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \ |
718 | | BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
719 | | BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ |
720 | | } while(0) |
721 | | |
722 | | const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00)); |
723 | | const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16)); |
724 | | const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32)); |
725 | | const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48)); |
726 | | const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64)); |
727 | | const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80)); |
728 | | const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96)); |
729 | | const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112)); |
730 | | |
731 | | uint64x2_t row1l, row1h, row2l, row2h; |
732 | | uint64x2_t row3l, row3h, row4l, row4h; |
733 | | |
734 | | const uint64x2_t h0 = row1l = vld1q_u64(state.h()+0); |
735 | | const uint64x2_t h1 = row1h = vld1q_u64(state.h()+2); |
736 | | const uint64x2_t h2 = row2l = vld1q_u64(state.h()+4); |
737 | | const uint64x2_t h3 = row2h = vld1q_u64(state.h()+6); |
738 | | |
739 | | row3l = vld1q_u64(BLAKE2B_IV+0); |
740 | | row3h = vld1q_u64(BLAKE2B_IV+2); |
741 | | row4l = veorq_u64(vld1q_u64(BLAKE2B_IV+4), vld1q_u64(state.t()+0)); |
742 | | row4h = veorq_u64(vld1q_u64(BLAKE2B_IV+6), vld1q_u64(state.f()+0)); |
743 | | |
744 | | BLAKE2B_ROUND(0); |
745 | | BLAKE2B_ROUND(1); |
746 | | BLAKE2B_ROUND(2); |
747 | | BLAKE2B_ROUND(3); |
748 | | BLAKE2B_ROUND(4); |
749 | | BLAKE2B_ROUND(5); |
750 | | BLAKE2B_ROUND(6); |
751 | | BLAKE2B_ROUND(7); |
752 | | BLAKE2B_ROUND(8); |
753 | | BLAKE2B_ROUND(9); |
754 | | BLAKE2B_ROUND(10); |
755 | | BLAKE2B_ROUND(11); |
756 | | |
757 | | vst1q_u64(state.h()+0, veorq_u64(h0, veorq_u64(row1l, row3l))); |
758 | | vst1q_u64(state.h()+2, veorq_u64(h1, veorq_u64(row1h, row3h))); |
759 | | vst1q_u64(state.h()+4, veorq_u64(h2, veorq_u64(row2l, row4l))); |
760 | | vst1q_u64(state.h()+6, veorq_u64(h3, veorq_u64(row2h, row4h))); |
761 | | } |
762 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
763 | | |
764 | | #if (CRYPTOPP_POWER8_AVAILABLE) |
765 | | |
766 | | inline uint64x2_p VecLoad64(const void* p) |
767 | | { |
768 | | return (uint64x2_p)vec_xl(0, CONST_V32_CAST(p)); |
769 | | } |
770 | | |
771 | | inline uint64x2_p VecLoad64LE(const void* p, const uint8x16_p le_mask) |
772 | | { |
773 | | #if defined(CRYPTOPP_BIG_ENDIAN) |
774 | | const uint32x4_p v = vec_xl(0, CONST_V32_CAST(p)); |
775 | | return (uint64x2_p)VecPermute(v, v, le_mask); |
776 | | #else |
777 | | CRYPTOPP_UNUSED(le_mask); |
778 | | return (uint64x2_p)vec_xl(0, CONST_V32_CAST(p)); |
779 | | #endif |
780 | | } |
781 | | |
782 | | inline void VecStore64(void* p, const uint64x2_p x) |
783 | | { |
784 | | vec_xst((uint32x4_p)x, 0, NCONST_V32_CAST(p)); |
785 | | } |
786 | | |
787 | | inline void VecStore64LE(void* p, const uint64x2_p x, const uint8x16_p le_mask) |
788 | | { |
789 | | #if defined(CRYPTOPP_BIG_ENDIAN) |
790 | | const uint64x2_p v = VecPermute(x, x, le_mask); |
791 | | vec_xst((uint32x4_p)v, 0, NCONST_V32_CAST(p)); |
792 | | #else |
793 | | CRYPTOPP_UNUSED(le_mask); |
794 | | vec_xst((uint32x4_p)x, 0, NCONST_V32_CAST(p)); |
795 | | #endif |
796 | | } |
797 | | |
798 | | #if defined(CRYPTOPP_BIG_ENDIAN) |
799 | | #define vec_shl_8(a,b) (uint64x2_p)vec_sld((uint8x16_p)a,(uint8x16_p)b,8) |
800 | | #else |
801 | | #define vec_shl_8(a,b) (uint64x2_p)vec_sld((uint8x16_p)b,(uint8x16_p)a,8) |
802 | | #endif |
803 | | |
804 | | #define vec_merge_hi(a, b) vec_mergeh(a,b) |
805 | | #define vec_merge_hi_lo(a, b) vec_mergeh(a,(uint64x2_p)vec_sld((uint8x16_p)b,(uint8x16_p)b,8)) |
806 | | #define vec_merge_lo(a, b) vec_mergel(a,b) |
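 | | // vec_shl_8(a, b) yields the second 64-bit word of a followed by the first |
 | | // word of b, the same combination the SSE4.1 path gets from |
 | | // _mm_alignr_epi8(b, a, 8) and the NEON path from vextq_u64(a, b, 1); the |
 | | // vec_sld operands are swapped on little-endian so both targets agree. |
 | | // vec_merge_hi and vec_merge_lo play the role of _mm_unpacklo_epi64 and |
 | | // _mm_unpackhi_epi64, and vec_merge_hi_lo matches _mm_blend_epi16(a, b, 0xF0). |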
807 | | |
808 | | void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) |
809 | | { |
810 | | #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \ |
811 | | do { \ |
812 | | b0 = vec_merge_hi(m0, m1); \ |
813 | | b1 = vec_merge_hi(m2, m3); \ |
814 | | } while(0) |
815 | | |
816 | | #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \ |
817 | | do { \ |
818 | | b0 = vec_merge_lo(m0, m1); \ |
819 | | b1 = vec_merge_lo(m2, m3); \ |
820 | | } while(0) |
821 | | |
822 | | #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \ |
823 | | do { \ |
824 | | b0 = vec_merge_hi(m4, m5); \ |
825 | | b1 = vec_merge_hi(m6, m7); \ |
826 | | } while(0) |
827 | | |
828 | | #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \ |
829 | | do { \ |
830 | | b0 = vec_merge_lo(m4, m5); \ |
831 | | b1 = vec_merge_lo(m6, m7); \ |
832 | | } while(0) |
833 | | |
834 | | #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \ |
835 | | do { \ |
836 | | b0 = vec_merge_hi(m7, m2); \ |
837 | | b1 = vec_merge_lo(m4, m6); \ |
838 | | } while(0) |
839 | | |
840 | | #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \ |
841 | | do { \ |
842 | | b0 = vec_merge_hi(m5, m4); \ |
843 | | b1 = vec_shl_8(m7, m3); \ |
844 | | } while(0) |
845 | | |
846 | | #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \ |
847 | | do { \ |
848 | | b0 = vec_shl_8(m0, m0); \ |
849 | | b1 = vec_merge_lo(m5, m2); \ |
850 | | } while(0) |
851 | | |
852 | | #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \ |
853 | | do { \ |
854 | | b0 = vec_merge_hi(m6, m1); \ |
855 | | b1 = vec_merge_lo(m3, m1); \ |
856 | | } while(0) |
857 | | |
858 | | #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \ |
859 | | do { \ |
860 | | b0 = vec_shl_8(m5, m6); \ |
861 | | b1 = vec_merge_lo(m2, m7); \ |
862 | | } while(0) |
863 | | |
864 | | #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \ |
865 | | do { \ |
866 | | b0 = vec_merge_hi(m4, m0); \ |
867 | | b1 = vec_merge_hi_lo(m1, m6); \ |
868 | | } while(0) |
869 | | |
870 | | #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \ |
871 | | do { \ |
872 | | b0 = vec_merge_hi_lo(m5, m1); \ |
873 | | b1 = vec_merge_lo(m3, m4); \ |
874 | | } while(0) |
875 | | |
876 | | #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \ |
877 | | do { \ |
878 | | b0 = vec_merge_hi(m7, m3); \ |
879 | | b1 = vec_shl_8(m0, m2); \ |
880 | | } while(0) |
881 | | |
882 | | #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \ |
883 | | do { \ |
884 | | b0 = vec_merge_lo(m3, m1); \ |
885 | | b1 = vec_merge_lo(m6, m5); \ |
886 | | } while(0) |
887 | | |
888 | | #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \ |
889 | | do { \ |
890 | | b0 = vec_merge_lo(m4, m0); \ |
891 | | b1 = vec_merge_hi(m6, m7); \ |
892 | | } while(0) |
893 | | |
894 | | #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \ |
895 | | do { \ |
896 | | b0 = vec_merge_hi_lo(m1, m2); \ |
897 | | b1 = vec_merge_hi_lo(m2, m7); \ |
898 | | } while(0) |
899 | | |
900 | | #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \ |
901 | | do { \ |
902 | | b0 = vec_merge_hi(m3, m5); \ |
903 | | b1 = vec_merge_hi(m0, m4); \ |
904 | | } while(0) |
905 | | |
906 | | #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \ |
907 | | do { \ |
908 | | b0 = vec_merge_lo(m4, m2); \ |
909 | | b1 = vec_merge_hi(m1, m5); \ |
910 | | } while(0) |
911 | | |
912 | | #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \ |
913 | | do { \ |
914 | | b0 = vec_merge_hi_lo(m0, m3); \ |
915 | | b1 = vec_merge_hi_lo(m2, m7); \ |
916 | | } while(0) |
917 | | |
918 | | #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \ |
919 | | do { \ |
920 | | b0 = vec_merge_hi_lo(m7, m5); \ |
921 | | b1 = vec_merge_hi_lo(m3, m1); \ |
922 | | } while(0) |
923 | | |
924 | | #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \ |
925 | | do { \ |
926 | | b0 = vec_shl_8(m0, m6); \ |
927 | | b1 = vec_merge_hi_lo(m4, m6); \ |
928 | | } while(0) |
929 | | |
930 | | #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \ |
931 | | do { \ |
932 | | b0 = vec_merge_hi(m1, m3); \ |
933 | | b1 = vec_merge_hi(m0, m4); \ |
934 | | } while(0) |
935 | | |
936 | | #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \ |
937 | | do { \ |
938 | | b0 = vec_merge_hi(m6, m5); \ |
939 | | b1 = vec_merge_lo(m5, m1); \ |
940 | | } while(0) |
941 | | |
942 | | #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \ |
943 | | do { \ |
944 | | b0 = vec_merge_hi_lo(m2, m3); \ |
945 | | b1 = vec_merge_lo(m7, m0); \ |
946 | | } while(0) |
947 | | |
948 | | #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \ |
949 | | do { \ |
950 | | b0 = vec_merge_lo(m6, m2); \ |
951 | | b1 = vec_merge_hi_lo(m7, m4); \ |
952 | | } while(0) |
953 | | |
954 | | #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \ |
955 | | do { \ |
956 | | b0 = vec_merge_hi_lo(m6, m0); \ |
957 | | b1 = vec_merge_hi(m7, m2); \ |
958 | | } while(0) |
959 | | |
960 | | #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \ |
961 | | do { \ |
962 | | b0 = vec_merge_lo(m2, m7); \ |
963 | | b1 = vec_shl_8(m6, m5); \ |
964 | | } while(0) |
965 | | |
966 | | #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \ |
967 | | do { \ |
968 | | b0 = vec_merge_hi(m0, m3); \ |
969 | | b1 = vec_shl_8(m4, m4); \ |
970 | | } while(0) |
971 | | |
972 | | #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \ |
973 | | do { \ |
974 | | b0 = vec_merge_lo(m3, m1); \ |
975 | | b1 = vec_merge_hi_lo(m1, m5); \ |
976 | | } while(0) |
977 | | |
978 | | #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \ |
979 | | do { \ |
980 | | b0 = vec_merge_lo(m6, m3); \ |
981 | | b1 = vec_merge_hi_lo(m6, m1); \ |
982 | | } while(0) |
983 | | |
984 | | #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \ |
985 | | do { \ |
986 | | b0 = vec_shl_8(m5, m7); \ |
987 | | b1 = vec_merge_lo(m0, m4); \ |
988 | | } while(0) |
989 | | |
990 | | #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \ |
991 | | do { \ |
992 | | b0 = vec_merge_lo(m2, m7); \ |
993 | | b1 = vec_merge_hi(m4, m1); \ |
994 | | } while(0) |
995 | | |
996 | | #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \ |
997 | | do { \ |
998 | | b0 = vec_merge_hi(m0, m2); \ |
999 | | b1 = vec_merge_hi(m3, m5); \ |
1000 | | } while(0) |
1001 | | |
1002 | | #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \ |
1003 | | do { \ |
1004 | | b0 = vec_merge_hi(m3, m7); \ |
1005 | | b1 = vec_shl_8(m5, m0); \ |
1006 | | } while(0) |
1007 | | |
1008 | | #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \ |
1009 | | do { \ |
1010 | | b0 = vec_merge_lo(m7, m4); \ |
1011 | | b1 = vec_shl_8(m1, m4); \ |
1012 | | } while(0) |
1013 | | |
1014 | | #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \ |
1015 | | do { \ |
1016 | | b0 = m6; \ |
1017 | | b1 = vec_shl_8(m0, m5); \ |
1018 | | } while(0) |
1019 | | |
1020 | | #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \ |
1021 | | do { \ |
1022 | | b0 = vec_merge_hi_lo(m1, m3); \ |
1023 | | b1 = m2; \ |
1024 | | } while(0) |
1025 | | |
1026 | | #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \ |
1027 | | do { \ |
1028 | | b0 = vec_merge_hi(m5, m4); \ |
1029 | | b1 = vec_merge_lo(m3, m0); \ |
1030 | | } while(0) |
1031 | | |
1032 | | #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \ |
1033 | | do { \ |
1034 | | b0 = vec_merge_hi(m1, m2); \ |
1035 | | b1 = vec_merge_hi_lo(m3, m2); \ |
1036 | | } while(0) |
1037 | | |
1038 | | #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \ |
1039 | | do { \ |
1040 | | b0 = vec_merge_lo(m7, m4); \ |
1041 | | b1 = vec_merge_lo(m1, m6); \ |
1042 | | } while(0) |
1043 | | |
1044 | | #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \ |
1045 | | do { \ |
1046 | | b0 = vec_shl_8(m5, m7); \ |
1047 | | b1 = vec_merge_hi(m6, m0); \ |
1048 | | } while(0) |
1049 | | |
1050 | | #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \ |
1051 | | do { \ |
1052 | | b0 = vec_merge_hi(m0, m1); \ |
1053 | | b1 = vec_merge_hi(m2, m3); \ |
1054 | | } while(0) |
1055 | | |
1056 | | #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \ |
1057 | | do { \ |
1058 | | b0 = vec_merge_lo(m0, m1); \ |
1059 | | b1 = vec_merge_lo(m2, m3); \ |
1060 | | } while(0) |
1061 | | |
1062 | | #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \ |
1063 | | do { \ |
1064 | | b0 = vec_merge_hi(m4, m5); \ |
1065 | | b1 = vec_merge_hi(m6, m7); \ |
1066 | | } while(0) |
1067 | | |
1068 | | #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \ |
1069 | | do { \ |
1070 | | b0 = vec_merge_lo(m4, m5); \ |
1071 | | b1 = vec_merge_lo(m6, m7); \ |
1072 | | } while(0) |
1073 | | |
1074 | | #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \ |
1075 | | do { \ |
1076 | | b0 = vec_merge_hi(m7, m2); \ |
1077 | | b1 = vec_merge_lo(m4, m6); \ |
1078 | | } while(0) |
1079 | | |
1080 | | #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \ |
1081 | | do { \ |
1082 | | b0 = vec_merge_hi(m5, m4); \ |
1083 | | b1 = vec_shl_8(m7, m3); \ |
1084 | | } while(0) |
1085 | | |
1086 | | #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \ |
1087 | | do { \ |
1088 | | b0 = vec_shl_8(m0, m0); \ |
1089 | | b1 = vec_merge_lo(m5, m2); \ |
1090 | | } while(0) |
1091 | | |
1092 | | #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \ |
1093 | | do { \ |
1094 | | b0 = vec_merge_hi(m6, m1); \ |
1095 | | b1 = vec_merge_lo(m3, m1); \ |
1096 | | } while(0) |
1097 | | |
1098 | | // POWER8 has a packed 64-bit rotate, but only as a left rotate |
1099 | | const uint64x2_p ROR16_MASK = { 64-16, 64-16 }; |
1100 | | const uint64x2_p ROR24_MASK = { 64-24, 64-24 }; |
1101 | | const uint64x2_p ROR32_MASK = { 64-32, 64-32 }; |
1102 | | const uint64x2_p ROR63_MASK = { 64-63, 64-63 }; |
1103 | | |
1104 | | #define vec_ror_32(x) vec_rl(x, ROR32_MASK) |
1105 | | #define vec_ror_24(x) vec_rl(x, ROR24_MASK) |
1106 | | #define vec_ror_16(x) vec_rl(x, ROR16_MASK) |
1107 | | #define vec_ror_63(x) vec_rl(x, ROR63_MASK) |
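 | | // vec_rl rotates left, so a right rotation by n is expressed as a left |
 | | // rotation by 64-n; e.g. vec_ror_24 rotates left by 40, which is the same as |
 | | // rotating right by 24. |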
1108 | | |
1109 | | #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
1110 | | do { \ |
1111 | | row1l = VecAdd(VecAdd(row1l, b0), row2l); \ |
1112 | | row1h = VecAdd(VecAdd(row1h, b1), row2h); \ |
1113 | | row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \ |
1114 | | row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \ |
1115 | | row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \ |
1116 | | row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \ |
1117 | | row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \ |
1118 | | } while(0) |
1119 | | |
1120 | | #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
1121 | | do { \ |
1122 | | row1l = VecAdd(VecAdd(row1l, b0), row2l); \ |
1123 | | row1h = VecAdd(VecAdd(row1h, b1), row2h); \ |
1124 | | row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \ |
1125 | | row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \ |
1126 | | row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \ |
1127 | | row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \ |
1128 | | row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \ |
1129 | | } while(0) |
1130 | | |
1131 | | #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
1132 | | do { \ |
1133 | | uint64x2_p t0 = vec_shl_8(row2l, row2h); \ |
1134 | | uint64x2_p t1 = vec_shl_8(row2h, row2l); \ |
1135 | | row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \ |
1136 | | t0 = vec_shl_8(row4h, row4l); t1 = vec_shl_8(row4l, row4h); \ |
1137 | | row4l = t0; row4h = t1; \ |
1138 | | } while(0) |
1139 | | |
1140 | | #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
1141 | | do { \ |
1142 | | uint64x2_p t0 = vec_shl_8(row2h, row2l); \ |
1143 | | uint64x2_p t1 = vec_shl_8(row2l, row2h); \ |
1144 | | row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \ |
1145 | | t0 = vec_shl_8(row4l, row4h); t1 = vec_shl_8(row4h, row4l); \ |
1146 | | row4l = t0; row4h = t1; \ |
1147 | | } while(0) |
1148 | | |
1149 | | #define BLAKE2B_ROUND(r) \ |
1150 | | do { \ |
1151 | | uint64x2_p b0, b1; \ |
1152 | | BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \ |
1153 | | BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
1154 | | BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \ |
1155 | | BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
1156 | | BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ |
1157 | | BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \ |
1158 | | BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
1159 | | BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \ |
1160 | | BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
1161 | | BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ |
1162 | | } while(0) |
1163 | | |
1164 | | // Possibly unaligned user messages |
1165 | | uint64x2_p m0, m1, m2, m3, m4, m5, m6, m7; |
1166 | | // Endian conversion mask |
1167 | | const uint8x16_p le_mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8}; |
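 | | // BLAKE2b is defined on little-endian words. On big-endian targets le_mask |
 | | // reverses the bytes within each 8-byte lane of a loaded vector; on |
 | | // little-endian targets it is passed through but has no effect. |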
1168 | | |
1169 | | #if defined(_ARCH_PWR9) |
1170 | | // POWER9 provides vector loads for chars and shorts |
1171 | | m0 = (uint64x2_p) vec_xl( 0, CONST_V8_CAST( input )); |
1172 | | m1 = (uint64x2_p) vec_xl( 16, CONST_V8_CAST( input )); |
1173 | | m2 = (uint64x2_p) vec_xl( 32, CONST_V8_CAST( input )); |
1174 | | m3 = (uint64x2_p) vec_xl( 48, CONST_V8_CAST( input )); |
1175 | | m4 = (uint64x2_p) vec_xl( 64, CONST_V8_CAST( input )); |
1176 | | m5 = (uint64x2_p) vec_xl( 80, CONST_V8_CAST( input )); |
1177 | | m6 = (uint64x2_p) vec_xl( 96, CONST_V8_CAST( input )); |
1178 | | m7 = (uint64x2_p) vec_xl(112, CONST_V8_CAST( input )); |
1179 | | |
1180 | | # if defined(CRYPTOPP_BIG_ENDIAN) |
1181 | | m0 = vec_perm(m0, m0, le_mask); |
1182 | | m1 = vec_perm(m1, m1, le_mask); |
1183 | | m2 = vec_perm(m2, m2, le_mask); |
1184 | | m3 = vec_perm(m3, m3, le_mask); |
1185 | | m4 = vec_perm(m4, m4, le_mask); |
1186 | | m5 = vec_perm(m5, m5, le_mask); |
1187 | | m6 = vec_perm(m6, m6, le_mask); |
1188 | | m7 = vec_perm(m7, m7, le_mask); |
1189 | | # endif |
1190 | | #else |
1191 | | // Altivec only provides 16-byte aligned loads |
1192 | | // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf |
1193 | | m0 = (uint64x2_p) vec_ld( 0, CONST_V8_CAST( input )); |
1194 | | m1 = (uint64x2_p) vec_ld( 16, CONST_V8_CAST( input )); |
1195 | | m2 = (uint64x2_p) vec_ld( 32, CONST_V8_CAST( input )); |
1196 | | m3 = (uint64x2_p) vec_ld( 48, CONST_V8_CAST( input )); |
1197 | | m4 = (uint64x2_p) vec_ld( 64, CONST_V8_CAST( input )); |
1198 | | m5 = (uint64x2_p) vec_ld( 80, CONST_V8_CAST( input )); |
1199 | | m6 = (uint64x2_p) vec_ld( 96, CONST_V8_CAST( input )); |
1200 | | m7 = (uint64x2_p) vec_ld(112, CONST_V8_CAST( input )); |
1201 | | |
1202 | | // Alignment check for load of the message buffer |
1203 | | const uintptr_t addr = (uintptr_t)input; |
1204 | | if (addr%16 == 0) |
1205 | | { |
1206 | | // Already aligned. Perform a little-endian swap as required |
1207 | | # if defined(CRYPTOPP_BIG_ENDIAN) |
1208 | | m0 = vec_perm(m0, m0, le_mask); |
1209 | | m1 = vec_perm(m1, m1, le_mask); |
1210 | | m2 = vec_perm(m2, m2, le_mask); |
1211 | | m3 = vec_perm(m3, m3, le_mask); |
1212 | | m4 = vec_perm(m4, m4, le_mask); |
1213 | | m5 = vec_perm(m5, m5, le_mask); |
1214 | | m6 = vec_perm(m6, m6, le_mask); |
1215 | | m7 = vec_perm(m7, m7, le_mask); |
1216 | | # endif |
1217 | | } |
1218 | | else |
1219 | | { |
1220 | | // Not aligned. Fix vectors and perform a little-endian swap as required |
1221 | | // http://mirror.informatimago.com/next/developer.apple.com/ |
1222 | | // hardwaredrivers/ve/code_optimization.html |
1223 | | uint64x2_p ex; uint8x16_p perm; |
1224 | | ex = (uint64x2_p) vec_ld(112+15, CONST_V8_CAST( input )); |
1225 | | perm = vec_lvsl(0, CONST_V8_CAST( addr )); |
1226 | | |
1227 | | # if defined(CRYPTOPP_BIG_ENDIAN) |
1228 | | // Combine the vector permute with the little-endian swap |
1229 | | perm = vec_perm(perm, perm, le_mask); |
1230 | | # endif |
1231 | | |
1232 | | m0 = vec_perm(m0, m1, perm); |
1233 | | m1 = vec_perm(m1, m2, perm); |
1234 | | m2 = vec_perm(m2, m3, perm); |
1235 | | m3 = vec_perm(m3, m4, perm); |
1236 | | m4 = vec_perm(m4, m5, perm); |
1237 | | m5 = vec_perm(m5, m6, perm); |
1238 | | m6 = vec_perm(m6, m7, perm); |
1239 | | m7 = vec_perm(m7, ex, perm); |
1240 | | } |
1241 | | #endif |
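 | | // When the message is not 16-byte aligned the block above uses the classic |
 | | // Altivec technique: vec_ld fetches the aligned vectors that straddle the |
 | | // 128-byte block (plus one extra load at offset 112+15 for the tail), and |
 | | // vec_perm with the vec_lvsl(addr) control recombines each adjacent pair |
 | | // into the unaligned 16 bytes actually requested. |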
1242 | | |
1243 | | uint64x2_p row1l, row1h, row2l, row2h; |
1244 | | uint64x2_p row3l, row3h, row4l, row4h; |
1245 | | |
1246 | | const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0, le_mask); |
1247 | | const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2, le_mask); |
1248 | | const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4, le_mask); |
1249 | | const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6, le_mask); |
1250 | | |
1251 | | row3l = VecLoad64(BLAKE2B_IV+0); |
1252 | | row3h = VecLoad64(BLAKE2B_IV+2); |
1253 | | row4l = VecXor(VecLoad64(BLAKE2B_IV+4), VecLoad64(state.t()+0)); |
1254 | | row4h = VecXor(VecLoad64(BLAKE2B_IV+6), VecLoad64(state.f()+0)); |
1255 | | |
1256 | | BLAKE2B_ROUND(0); |
1257 | | BLAKE2B_ROUND(1); |
1258 | | BLAKE2B_ROUND(2); |
1259 | | BLAKE2B_ROUND(3); |
1260 | | BLAKE2B_ROUND(4); |
1261 | | BLAKE2B_ROUND(5); |
1262 | | BLAKE2B_ROUND(6); |
1263 | | BLAKE2B_ROUND(7); |
1264 | | BLAKE2B_ROUND(8); |
1265 | | BLAKE2B_ROUND(9); |
1266 | | BLAKE2B_ROUND(10); |
1267 | | BLAKE2B_ROUND(11); |
1268 | | |
1269 | | VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)), le_mask); |
1270 | | VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)), le_mask); |
1271 | | VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)), le_mask); |
1272 | | VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)), le_mask); |
1273 | | } |
1274 | | #endif // CRYPTOPP_POWER8_AVAILABLE |
1275 | | |
1276 | | NAMESPACE_END |