Line data Source code
1 :
2 : // Source originally from https://github.com/BLAKE3-team/BLAKE3
3 : // From commit: c0ea395cf91d242f078c23d5f8d87eb9dd5f7b78
4 :
5 : #include "fd_blake3.h"
6 : #include "fd_blake3_private.h"
7 : #include "../../util/simd/fd_sse.h"
8 : #include <assert.h>
9 :
/* _mm_shuffle_ps2 selects 32-bit lanes from two integer vectors a and b
   using a single _MM_SHUFFLE immediate c: the two low result lanes come
   from a, the two high result lanes from b.  The float casts are pure
   bit-casts so lane values are unchanged (SHUFPS has no integer twin
   before AVX2's VPERMD). */
#define _mm_shuffle_ps2(a, b, c)                     \
  (_mm_castps_si128(                                 \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))

/* Rotate each 32-bit lane right by 16 bits == exchange adjacent 16-bit
   halves, which the SSE vector library already provides. */
#define vu_rot16 vb_exch_adj_pair
15 :
16 : static inline __attribute__((always_inline)) vu_t
17 60018 : vu_rot12( vu_t x ) {
18 60018 : return vu_xor( vu_shr( x, 12 ), vu_shl( x, 32-12 ) );
19 60018 : }
20 :
21 : static inline __attribute__((always_inline)) vu_t
22 60018 : vu_rot8( vu_t x ) {
23 60018 : vb_t const mask = vb( 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 );
24 60018 : return _mm_shuffle_epi8( x, mask );
25 60018 : }
26 :
27 : static inline __attribute__((always_inline)) vu_t
28 60018 : vu_rot7( vu_t x ) {
29 60018 : return vu_xor( vu_shr( x, 7 ), vu_shl( x, 32-7 ) );
30 60018 : }
31 :
32 : static inline __attribute__((always_inline)) void
33 : g1( vu_t * row0,
34 : vu_t * row1,
35 : vu_t * row2,
36 : vu_t * row3,
37 60018 : vu_t m ) {
38 60018 : *row0 = vu_add(vu_add(*row0, m), *row1);
39 60018 : *row3 = vu_xor(*row3, *row0);
40 60018 : *row3 = vu_rot16(*row3);
41 60018 : *row2 = vu_add(*row2, *row3);
42 60018 : *row1 = vu_xor(*row1, *row2);
43 60018 : *row1 = vu_rot12(*row1);
44 60018 : }
45 :
46 : static inline __attribute__((always_inline)) void
47 : g2( vu_t * row0,
48 : vu_t * row1,
49 : vu_t * row2,
50 : vu_t * row3,
51 60018 : vu_t m ) {
52 60018 : *row0 = vu_add(vu_add(*row0, m), *row1);
53 60018 : *row3 = vu_xor(*row3, *row0);
54 60018 : *row3 = vu_rot8(*row3);
55 60018 : *row2 = vu_add(*row2, *row3);
56 60018 : *row1 = vu_xor(*row1, *row2);
57 60018 : *row1 = vu_rot7(*row1);
58 60018 : }
59 :
60 : // Note the optimization here of leaving row1 as the unrotated row, rather than
61 : // row0. All the message loads below are adjusted to compensate for this. See
62 : // discussion at https://github.com/sneves/blake2-avx2/pull/4
63 : static inline __attribute__((always_inline)) void
64 30009 : diagonalize(vu_t *row0, vu_t *row2, vu_t *row3) {
65 30009 : *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
66 30009 : *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
67 30009 : *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
68 30009 : }
69 :
70 : static inline __attribute__((always_inline)) void
71 30009 : undiagonalize(vu_t *row0, vu_t *row2, vu_t *row3) {
72 30009 : *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
73 30009 : *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
74 30009 : *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
75 30009 : }
76 :
/* compress_pre runs the full 7-round BLAKE3 compression function on one
   64-byte block, leaving the four 4-lane state rows in rows[0..3].  The
   caller completes the compression by xoring row pairs together (see
   fd_blake3_sse_compress1).  cv is the 8-word input chaining value, ctr
   the 64-bit counter for this block, block_len the number of valid bytes
   in block, flags the BLAKE3 domain-separation flags. */
static inline __attribute__((always_inline)) void
compress_pre( vu_t rows[4],
              uint const cv[ static 8 ],
              uchar const block[ static FD_BLAKE3_BLOCK_SZ ],
              uint block_len,
              ulong ctr,
              uint flags ) {
  /* Initialize the 4x4 state: rows 0-1 hold the chaining value, row 2 the
     first four IV words, row 3 the counter (split into low/high 32-bit
     halves), block length and flags. */
  rows[0] = vu_ld( cv );
  rows[1] = vu_ld( cv+4 );
  rows[2] = vu( FD_BLAKE3_IV[0], FD_BLAKE3_IV[1], FD_BLAKE3_IV[2], FD_BLAKE3_IV[3] );
  rows[3] = vu( (uint)(ctr&UINT_MAX), (uint)(ctr>>32),
                block_len, flags );

  /* Load the 64-byte message block as four vectors of four 32-bit words.
     Unaligned loads: block may point into the caller's message buffer. */
  vu_t m0 = vb_ldu( block );    vu_t m1 = vb_ldu( block+16 );
  vu_t m2 = vb_ldu( block+32 ); vu_t m3 = vb_ldu( block+48 );

  vu_t t0, t1, t2, t3, tt;

  // Round 1. The first round permutes the message words from the original
  // input order, into the groups that get mixed in parallel.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  /* The permuted words feed the next round's fixed permutation. */
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3 (same fixed permutation as round 2).
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7 (final round: no need to permute the message words again).
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}
249 :
/* fd_blake3_sse_compress1 hashes a single chunk (or parent node) of up to
   FD_BLAKE3_CHUNK_SZ bytes at msg, writing the 32-byte chaining value to
   out.  counter is the chunk counter, flags the BLAKE3 domain flags.  If
   in_chain is non-NULL it supplies the input chaining value in place of
   the IV (and out additionally receives 64 bytes of extended output —
   presumably for XOF expansion; see the FIXME notes below).  If out_chain
   is non-NULL, instead of compressing the final block this saves the
   block and its input chaining value so the expand stage can finish it
   later — TODO(review): confirm this contract against the XOF callers. */
void
fd_blake3_sse_compress1( uchar * restrict       out,
                         uchar const * restrict msg,
                         uint                   msg_sz,
                         ulong                  counter,
                         uint const             flags,
                         uchar * restrict       out_chain,
                         uchar const * restrict in_chain ) {
  FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1(out=%p,msg=%p,sz=%u,counter=%lu,flags=%02x)",
                    (void *)out, (void *)msg, msg_sz, counter, flags ));
  assert( msg_sz<=FD_BLAKE3_CHUNK_SZ );

  /* Chaining value starts at the IV unless the caller supplied one. */
  uint cv[8] = { FD_BLAKE3_IV[0], FD_BLAKE3_IV[1], FD_BLAKE3_IV[2], FD_BLAKE3_IV[3],
                 FD_BLAKE3_IV[4], FD_BLAKE3_IV[5], FD_BLAKE3_IV[6], FD_BLAKE3_IV[7] };
  if( FD_UNLIKELY( in_chain ) ) {
    memcpy( cv, in_chain, FD_BLAKE3_OUTCHAIN_SZ );
  }
  vu_t rows[4];

  /* Parent nodes never carry CHUNK_START/CHUNK_END, so mask those bits
     out when the PARENT flag is set; otherwise the mask passes all bits. */
  uint flag_mask = ~fd_uint_if( flags&FD_BLAKE3_FLAG_PARENT,
                                FD_BLAKE3_FLAG_CHUNK_START|FD_BLAKE3_FLAG_CHUNK_END,
                                0U );

  /* The first block of a chunk gets CHUNK_START (suppressed for parents
     via flag_mask, and for resumed chunks via the in_chain check below). */
  uint block_flags = flags | (flag_mask & FD_BLAKE3_FLAG_CHUNK_START);
  if( FD_UNLIKELY( in_chain && !(flags&FD_BLAKE3_FLAG_CHUNK_START) ) ) {
    block_flags &= ~FD_BLAKE3_FLAG_CHUNK_START;
  }
  /* Compress the message one 64-byte block at a time.  Runs at least once
     so a zero-length message still compresses one (zero-padded) block. */
  do {
    uint block_sz = fd_uint_min( msg_sz, FD_BLAKE3_BLOCK_SZ );
    /* Set CHUNK_END optimistically, then strip CHUNK_END/ROOT again unless
       this is the last block (msg_sz<=FD_BLAKE3_BLOCK_SZ), and strip chunk
       flags entirely for parent nodes (via flag_mask). */
    block_flags |= FD_BLAKE3_FLAG_CHUNK_END;
    block_flags &= (flag_mask & ~fd_uint_if( msg_sz<=FD_BLAKE3_BLOCK_SZ, 0, (FD_BLAKE3_FLAG_CHUNK_END|FD_BLAKE3_FLAG_ROOT) ) );

    /* A short final block is zero-padded into an aligned scratch buffer. */
    uchar         tail[ FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(16)));
    uchar const * restrict block;
    if( FD_LIKELY( msg_sz>=FD_BLAKE3_BLOCK_SZ ) ) {
      block = msg;
    } else {
      vb_st( tail,    vu_zero() );
      vb_st( tail+16, vu_zero() );
      vb_st( tail+32, vu_zero() );
      vb_st( tail+48, vu_zero() );
      fd_memcpy( tail, msg, msg_sz );
      block = tail;
    }

    /* XOF mode: instead of compressing the final block, hand the block
       and its input chaining value to the caller for later expansion. */
    if( FD_UNLIKELY( out_chain && (block_flags & FD_BLAKE3_FLAG_CHUNK_END) ) ) {
      /* FIXME better document and polish the transition from the compress
         part to the expand part. */
      fd_memcpy( out,       block, FD_BLAKE3_BLOCK_SZ ); /* FIXME DOCUMENT OVERLOADING OF OUT ARGUMENT */
      fd_memcpy( out_chain, cv,    FD_BLAKE3_OUTCHAIN_SZ );
      FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1: done (XOF mode)" ));
      return;
    }

    FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1: sz=%u counter=%u flags=%x", block_sz, (uint)counter, block_flags ));
    compress_pre( rows, cv, block, block_sz, counter, block_flags );
    if( FD_UNLIKELY( in_chain ) ) {
      /* FIXME UGLY */
      /* Extended (second 32 bytes of) output: cv xor'd with the low state
         rows — NOTE(review): looks like XOF second-half output; confirm
         against the expand stage. */
      vu_stu( out+32, vu_xor( vu_ld( cv   ), rows[2] ) );
      vu_stu( out+48, vu_xor( vu_ld( cv+4 ), rows[3] ) );
    }
    /* Finish the compression: new chaining value = upper rows xor lower rows. */
    vu_st( cv,   vu_xor( rows[0], rows[2] ) );
    vu_st( cv+4, vu_xor( rows[1], rows[3] ) );
    msg    += FD_BLAKE3_BLOCK_SZ;
    msg_sz -= block_sz;
    /* Subsequent blocks revert to the caller's flags (CHUNK_START only on
       the first block; CHUNK_END re-added above on the last). */
    block_flags = flags;
  } while( (int)msg_sz>0 );

  /* Emit the final 32-byte chaining value. */
  vu_stu( out,    vu_ld( cv   ) );
  vu_stu( out+16, vu_ld( cv+4 ) );

  FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1: done" ));
}
|