Line data Source code
1 : #include "fd_blake3.h"
2 : #include "fd_blake3_private.h"
3 : #include <assert.h>
4 :
5 : /* Hash state machine *************************************************/
6 :
7 : static FD_FN_UNUSED fd_blake3_pos_t *
8 : fd_blake3_pos_init( fd_blake3_pos_t * s,
9 : uchar const * data,
10 4259 : ulong sz ) {
11 4259 : *s = (fd_blake3_pos_t) {
12 4259 : .input = data,
13 4259 : .input_sz = sz,
14 4259 : .magic = FD_BLAKE3_MAGIC,
15 4259 : };
16 4259 : return s;
17 4259 : }
18 :
19 : /* fd_blake3_l0_complete returns 1 if all leaf nodes have been hashed,
20 : 0 otherwise. */
21 :
22 : FD_FN_PURE static inline int
23 170337 : fd_blake3_l0_complete( fd_blake3_pos_t const * s ) {
24 170337 : return ( s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ ) >= fd_ulong_max( s->input_sz, 64 );
25 170337 : }
26 :
27 : FD_FN_PURE static inline int
28 : fd_blake3_is_finished( fd_blake3_pos_t const * s,
29 73817 : ulong tick ) {
30 73817 : int l0_complete = fd_blake3_l0_complete( s );
31 73817 : int ln_complete = s->live_cnt == 1UL;
32 73817 : int idle = tick >= s->next_tick;
33 73817 : return l0_complete & ln_complete & idle;
34 73817 : }
35 :
36 : static fd_blake3_op_t *
37 : fd_blake3_prepare_leaf( fd_blake3_pos_t * restrict s,
38 : fd_blake3_buf_t * restrict buf,
39 : fd_blake3_op_t * restrict op,
40 11436 : ulong tick ) {
41 :
42 11436 : ulong msg_off = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
43 11436 : ulong msg_sz = fd_ulong_min( s->input_sz - msg_off, 1024UL );
44 11436 : uchar const * msg = s->input + msg_off;
45 11436 : uchar * out = buf->slots[ s->layer ][ s->head.uc[ s->layer ] ];
46 :
47 11436 : int flags = fd_int_if( s->input_sz <= FD_BLAKE3_CHUNK_SZ, FD_BLAKE3_FLAG_ROOT, 0 );
48 :
49 11436 : *op = (fd_blake3_op_t) {
50 11436 : .msg = msg,
51 11436 : .out = out,
52 11436 : .counter = s->leaf_idx,
53 11436 : .sz = (ushort)msg_sz,
54 11436 : .flags = (uchar)flags
55 11436 : };
56 :
57 11436 : s->head.uc[ 0 ] = (uchar)( s->head.uc[ 0 ]+1 );
58 11436 : s->leaf_idx++;
59 11436 : s->live_cnt++;
60 11436 : s->next_tick = tick+1;
61 :
62 11436 : return op;
63 :
64 11436 : }
65 :
/* fd_blake3_seek_branch decides whether a branch (parent) compression
   can be scheduled and, if so, positions s->layer at the layer that
   should receive the merged output.  Returns 1 if a branch op can be
   prepared at s->layer, 0 if the scheduler must wait (children still
   in flight) or no merge is possible yet.

   For each layer j, head.uc[j]-tail.uc[j] is the number of pending
   (unconsumed) chaining values in that layer: a diff of 2 or more
   means a pair is ready to merge; a diff of exactly 1 is a lone node
   on the right edge of an unbalanced tree that must eventually be
   promoted upward once all leaves are consumed. */

static int
fd_blake3_seek_branch( fd_blake3_pos_t * restrict s,
                       fd_blake3_buf_t * restrict buf,
                       ulong                      tick ) {

  /* Only the root remains: nothing left to merge */
  if( s->live_cnt == 1UL )
    return 0;

  /* While leaves are still being produced, only merge full pairs one
     layer below the current one */
  if( !fd_blake3_l0_complete( s ) )
    return ( s->tail.uc[ s->layer - 1 ] + 1 ) <
           ( s->head.uc[ s->layer - 1 ] );

# if FD_HAS_AVX

  /* Pending node count for all 32 layers in a single vector */
  wb_t diff = wb_sub( s->head.wb, s->tail.wb );

  /* Find the lowest layer holding a mergeable pair (diff > 1) */
  uint mergeable_layers = (uint)_mm256_movemask_epi8( wb_gt( diff, wb_bcast( 1 ) ) );
  int merge_layer = fd_uint_find_lsb_w_default( mergeable_layers, -1 );
  if( merge_layer>=0 ) {
    if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
      return 0; /* still waiting for previous merge */
    s->layer = (uint)merge_layer+1U;
    return 1;
  }

  /* No pair anywhere: promote the lowest lone node (diff == 1) up to
     the next layer that also holds a lone node */
  uint single_layers = (uint)_mm256_movemask_epi8( wb_eq( diff, wb_bcast( 1 ) ) );
  uint single_lo = (uint)fd_uint_find_lsb( single_layers );
  uint single_hi = (uint)fd_uint_find_lsb( single_layers & ( ~fd_uint_mask_lsb( (int)(single_lo+1U) ) ) );

  /* Copy the 32 byte chaining value from the lower layer's tail slot
     into the higher layer's head slot */
  wb_t node = wb_ld( buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ] );
  wb_st( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ], node );

# else /* FD_HAS_AVX */

  /* Scalar fallback: same logic as the AVX path above */

  uchar diff[ 32 ];
  for( ulong j=0UL; j<32UL; j++ ) diff[j] = (uchar)( s->head.uc[j] - s->tail.uc[j] );

  /* Lowest layer with a mergeable pair */
  int merge_layer = -1;
  for( uint j=0U; j<32U; j++ ) {
    if( diff[j]>1 ) {
      merge_layer = (int)j;
      break;
    }
  }
  if( merge_layer>=0 ) {
    if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
      return 0; /* still waiting for previous merge */
    s->layer = (uint)(merge_layer+1);
    return 1;
  }

  /* Two lowest layers holding lone nodes */
  uint j=0U;
  uint single_lo = 0UL;
  uint single_hi = 0UL;
  for( ; j<32U; j++ ) {
    if( diff[j] ) {
      single_lo = j;
      break;
    }
  }
  j++;
  for( ; j<32U; j++ ) {
    if( diff[j] ) {
      single_hi = j;
      break;
    }
  }

  memcpy( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ],
          buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ],
          32UL );

# endif /* FD_HAS_AVX */

  FD_BLAKE3_TRACE(( "fd_blake3_seek_branch: moving up %u/%u to %u/%u",
                    single_lo, s->tail.uc[ single_lo ],
                    single_hi, s->head.uc[ single_hi ] ));

  if( ((uint)single_hi >= s->layer) & (tick < s->next_tick) )
    return 0; /* still waiting for previous merge */

  /* Commit the promotion: lone node leaves single_lo, joins single_hi */
  s->head.uc[ single_lo ] = (uchar)( s->head.uc[ single_lo ]-1 );
  s->head.uc[ single_hi ] = (uchar)( s->head.uc[ single_hi ]+1 );

  s->layer = (uint)single_hi+1U;
  return 1;
}
153 :
/* fd_blake3_prepare_branch schedules compression of a parent (branch)
   node: the two pending chaining values at layer-1's tail form the
   64 byte message, the resulting chaining value goes into the current
   layer's head slot.  Returns op on success, NULL if no branch work
   can be scheduled this tick. */

static fd_blake3_op_t *
fd_blake3_prepare_branch( fd_blake3_pos_t * restrict s,
                          fd_blake3_buf_t * restrict buf,
                          fd_blake3_op_t  * restrict op,
                          ulong                      tick ) {

  if( !fd_blake3_seek_branch( s, buf, tick ) )
    return NULL;

  assert( s->layer < FD_BLAKE3_ROW_CNT );

  /* Message is the pair of children one layer down; slots are 32 bytes
     each and adjacent, so the tail slot gives a 64 byte span */
  uchar const * msg = buf->slots[ s->layer-1U ][ s->tail.uc[ s->layer-1U ] ];
  uchar * out = buf->slots[ s->layer ][ s->head.uc[ s->layer ] ];

  /* Consume the two children, produce one parent */
  s->head.uc[ s->layer ] = (uchar)( s->head.uc[ s->layer ]+1 );
  s->tail.uc[ s->layer-1 ] = (uchar)( s->tail.uc[ s->layer-1 ]+2 );
  s->live_cnt--;
  s->next_tick = tick+1;

  /* If only one live node remains after this merge, it is the root */
  uint flags = FD_BLAKE3_FLAG_PARENT |
               fd_uint_if( s->live_cnt==1UL, FD_BLAKE3_FLAG_ROOT, 0u );

  *op = (fd_blake3_op_t) {
    .msg     = msg,
    .out     = out,
    .counter = 0UL,
    .sz      = 64U,
    .flags   = (uchar)flags
  };
  return op;

}
186 :
/* fd_blake3_advance retires fully drained layer rings (head==tail
   implies all nodes in the layer were consumed, so both cursors reset
   to 0) and then steers s->layer toward the next layer that has work:
   move up when the current layer's row is full or all leaves are done,
   stay when the layer below still has pending nodes, otherwise drop
   back to layer 0 to hash more leaves. */

static void
fd_blake3_advance( fd_blake3_pos_t * restrict s ) {

# if FD_HAS_AVX

  /* Reset cursors of every drained layer in one vector op */
  wb_t mask  = wb_eq( s->tail.wb, s->head.wb );
  s->tail.wb = wb_andnot( mask, s->tail.wb );
  s->head.wb = wb_andnot( mask, s->head.wb );

# else /* FD_HAS_AVX */

  for( ulong j=0UL; j<32UL; j++ ) {
    if( s->tail.uc[j] == s->head.uc[j] ) {
      s->tail.uc[j] = 0;
      s->head.uc[j] = 0;
    }
  }

# endif /* FD_HAS_AVX */

  if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT ) {
    /* Current layer's row is full: merge results one layer up */
    s->layer++;
  }
  else if( ( s->layer > 0UL ) &&
           ( s->tail.uc[ s->layer-1 ] < s->head.uc[ s->layer-1 ] ) ) {
    /* Layer below still has unconsumed nodes: keep merging here */
    /* pass */
  }
  else if( fd_blake3_l0_complete( s ) ) {
    /* No more leaves to hash: drain upward */
    s->layer++;
  }
  else if( s->layer > 0UL ) {
    /* More leaves available: go back to hashing chunks */
    s->layer = 0UL;
  }

}
222 :
/* fd_blake3_prepare produces the next compression op for the given
   tick, choosing between a leaf (chunk) and a branch (parent) op based
   on the scheduler position.  Returns op on success, NULL if the hash
   is finished or no op can be issued at this tick (e.g. dependencies
   still in flight). */

static fd_blake3_op_t *
fd_blake3_prepare( fd_blake3_pos_t * restrict s,
                   fd_blake3_buf_t * restrict buf,
                   fd_blake3_op_t  * restrict op,
                   ulong                      tick ) {

  assert( s->layer < FD_BLAKE3_ROW_CNT );

  if( fd_blake3_is_finished( s, tick ) )
    return NULL;

  /* Only reposition once the previously issued op has completed */
  if( tick >= s->next_tick )
    fd_blake3_advance( s );

  if( s->layer != 0 )
    return fd_blake3_prepare_branch( s, buf, op, tick );

  /* Layer 0: hash another chunk unless the row is full or all leaves
     are already scheduled */
  if( ( s->head.uc[0] >= FD_BLAKE3_COL_CNT ) |
      ( fd_blake3_l0_complete( s ) ) ) {
    return NULL;
  }

  return fd_blake3_prepare_leaf( s, buf, op, tick );

}
248 :
249 : #if FD_BLAKE3_PARA_MAX>1
250 :
/* fd_blake3_prepare_fast does streamlined hashing of full chunks or
   full branches: it issues one op covering up to n parallel lanes at
   once (n typically FD_BLAKE3_PARA_MAX), bypassing the general
   scheduler.  Returns op on success, NULL when fewer than min full
   chunks remain (caller then falls back to the slow scheduler). */

static fd_blake3_op_t *
fd_blake3_prepare_fast( fd_blake3_pos_t * restrict s,
                        fd_blake3_buf_t * restrict buf,
                        fd_blake3_op_t  * restrict op,
                        ulong                      n,
                        ulong                      min ) {

  /* If the layer below is completely full, merge the whole row in one
     parallel branch compression */
  if( s->layer && s->head.uc[ s->layer-1 ]==FD_BLAKE3_COL_CNT ) {
    op->msg     = buf->rows[ s->layer-1 ];
    op->out     = buf->rows[ s->layer ] + (s->head.uc[ s->layer ]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
    op->counter = 0UL;
    op->flags   = FD_BLAKE3_FLAG_PARENT;

    /* Assume that branch layer is fully hashed (up to col cnt) */
    s->head.uc[ s->layer-1 ] = 0;
    s->head.uc[ s->layer   ] = (uchar)( (ulong)s->head.uc[ s->layer ]+n );
    s->live_cnt -= n;
    s->layer     = fd_uint_if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT,
                               s->layer+1U, 0U );

    return op;
  }

  /* Otherwise hash up to n full leaf chunks in parallel */
  ulong pos   = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
  ulong avail = fd_ulong_align_dn( s->input_sz - pos, FD_BLAKE3_CHUNK_SZ ) >> FD_BLAKE3_CHUNK_LG_SZ;
  n = fd_ulong_min( n, avail );

  /* This constants controls the threshold when to use the (slow)
     scheduler instead of fast single-message hashing.  Carefully tuned
     for best overall performance. */
  if( n<min ) return NULL;

  op->msg     = s->input + (s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ);
  op->out     = buf->rows[0] + (s->head.uc[0]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
  op->counter = s->leaf_idx;
  op->flags   = 0;

  s->head.uc[0] = (uchar)( (ulong)s->head.uc[0]+n );
  s->leaf_idx  += n;
  s->live_cnt  += n;
  s->layer      = fd_uint_if( s->head.uc[0]==FD_BLAKE3_COL_CNT, 1U, 0U );

  return op;
}
298 :
/* fd_blake3_batch_hash runs op_cnt (<=FD_BLAKE3_PARA_MAX) independent
   compressions in one SIMD call by transposing the ops into the
   parallel-array layout the vector backends expect.  Each op yields a
   32 byte output chaining value at its out pointer. */

static void
fd_blake3_batch_hash( fd_blake3_op_t const * ops,
                      ulong                  op_cnt ) {
  uchar const * batch_data   [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
  uint          batch_data_sz[ FD_BLAKE3_PARA_MAX ] = {0};
  uchar *       batch_hash   [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
  ulong         batch_ctr    [ FD_BLAKE3_PARA_MAX ];
  uint          batch_flags  [ FD_BLAKE3_PARA_MAX ];
  for( ulong j=0UL; j<op_cnt; j++ ) {
    batch_data   [ j ] = ops[ j ].msg;
    batch_hash   [ j ] = ops[ j ].out;
    batch_data_sz[ j ] = ops[ j ].sz;
    batch_ctr    [ j ] = ops[ j ].counter;
    batch_flags  [ j ] = ops[ j ].flags;
  }
#if FD_HAS_AVX512
  fd_blake3_avx512_compress16( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
#elif FD_HAS_AVX
  fd_blake3_avx_compress8    ( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
#else
#error "FIXME missing para support"
#endif
}
322 :
323 : #endif
324 :
325 : /* Simple API *********************************************************/
326 :
327 : ulong
328 0 : fd_blake3_align( void ) {
329 0 : return FD_BLAKE3_ALIGN;
330 0 : }
331 :
332 : ulong
333 0 : fd_blake3_footprint( void ) {
334 0 : return FD_BLAKE3_FOOTPRINT;
335 0 : }
336 :
337 : void *
338 0 : fd_blake3_new( void * shmem ) {
339 0 : fd_blake3_t * sha = (fd_blake3_t *)shmem;
340 :
341 0 : if( FD_UNLIKELY( !shmem ) ) {
342 0 : FD_LOG_WARNING(( "NULL shmem" ));
343 0 : return NULL;
344 0 : }
345 :
346 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, fd_blake3_align() ) ) ) {
347 0 : FD_LOG_WARNING(( "misaligned shmem" ));
348 0 : return NULL;
349 0 : }
350 :
351 0 : ulong footprint = fd_blake3_footprint();
352 :
353 0 : fd_memset( sha, 0, footprint );
354 :
355 0 : FD_COMPILER_MFENCE();
356 0 : FD_VOLATILE( sha->pos.magic ) = FD_BLAKE3_MAGIC;
357 0 : FD_COMPILER_MFENCE();
358 :
359 0 : return (void *)sha;
360 0 : }
361 :
362 : fd_blake3_t *
363 0 : fd_blake3_join( void * shsha ) {
364 :
365 0 : if( FD_UNLIKELY( !shsha ) ) {
366 0 : FD_LOG_WARNING(( "NULL shsha" ));
367 0 : return NULL;
368 0 : }
369 :
370 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
371 0 : FD_LOG_WARNING(( "misaligned shsha" ));
372 0 : return NULL;
373 0 : }
374 :
375 0 : fd_blake3_t * sha = (fd_blake3_t *)shsha;
376 :
377 0 : if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
378 0 : FD_LOG_WARNING(( "bad magic" ));
379 0 : return NULL;
380 0 : }
381 :
382 0 : return sha;
383 0 : }
384 :
385 : void *
386 0 : fd_blake3_leave( fd_blake3_t * sha ) {
387 :
388 0 : if( FD_UNLIKELY( !sha ) ) {
389 0 : FD_LOG_WARNING(( "NULL sha" ));
390 0 : return NULL;
391 0 : }
392 :
393 0 : return (void *)sha;
394 0 : }
395 :
396 : void *
397 0 : fd_blake3_delete( void * shsha ) {
398 :
399 0 : if( FD_UNLIKELY( !shsha ) ) {
400 0 : FD_LOG_WARNING(( "NULL shsha" ));
401 0 : return NULL;
402 0 : }
403 :
404 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
405 0 : FD_LOG_WARNING(( "misaligned shsha" ));
406 0 : return NULL;
407 0 : }
408 :
409 0 : fd_blake3_t * sha = (fd_blake3_t *)shsha;
410 :
411 0 : if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
412 0 : FD_LOG_WARNING(( "bad magic" ));
413 0 : return NULL;
414 0 : }
415 :
416 0 : FD_COMPILER_MFENCE();
417 0 : FD_VOLATILE( sha->pos.magic ) = 0UL;
418 0 : FD_COMPILER_MFENCE();
419 :
420 0 : return (void *)sha;
421 0 : }
422 :
423 :
424 : fd_blake3_t *
425 4259 : fd_blake3_init( fd_blake3_t * sha ) {
426 4259 : FD_BLAKE3_TRACE(( "fd_blake3_init(sha=%p)", (void *)sha ));
427 4259 : fd_blake3_pos_init( &sha->pos, NULL, 0UL );
428 4259 : sha->block_sz = 0UL;
429 4259 : return sha;
430 4259 : }
431 :
432 : #if FD_BLAKE3_PARA_MAX>1
433 :
/* fd_blake3_append_blocks (SIMD variant) hashes buf_cnt full buffers
   starting at data using the fast parallel path: one parallel leaf
   compression per buffer, followed by any parent merges the fast
   scheduler issues (loop continues while ops carry the PARENT flag). */

static void
fd_blake3_append_blocks( fd_blake3_pos_t * s,
                         fd_blake3_buf_t * tbl,
                         uchar const *     data,
                         ulong             buf_cnt ) {
  /* Rebase input so (leaf_idx<<CHUNK_LG_SZ) indexes into data */
  s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
  for( ulong i=0UL; i<buf_cnt; i++ ) {
    fd_blake3_op_t op[1];
    do {
      if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, FD_BLAKE3_PARA_MAX ) )
        return;
#if FD_HAS_AVX512
      fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
#elif FD_HAS_AVX
      fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
#else
#error "missing para support"
#endif
    } while( op->flags & FD_BLAKE3_FLAG_PARENT );
  }
}
455 :
456 : #else
457 :
/* fd_blake3_append_blocks (scalar variant) hashes buf_cnt full buffers
   via the general scheduler, one compression per iteration: parent
   merges are interleaved for free; leaf compressions count against
   buf_cnt. */

static void
fd_blake3_append_blocks( fd_blake3_pos_t * s,
                         fd_blake3_buf_t * tbl,
                         uchar const *     data,
                         ulong             buf_cnt ) {
  (void)buf_cnt;
  /* Rebase input so (leaf_idx<<CHUNK_LG_SZ) indexes into data */
  s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
  fd_blake3_op_t op[1];
  while( buf_cnt ) {
    if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) ) {
      FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: no more ops to prepare" ));
      break;
    }
    if( op->flags & FD_BLAKE3_FLAG_PARENT ) {
      FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing output chaining values (layer %u)", s->layer ));
      fd_blake3_ref_compress1( op->out, op->msg, 64UL, op->counter, op->flags, NULL, NULL );
    } else {
      FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing %lu leaf chunks", FD_BLAKE3_COL_CNT ));
      fd_blake3_ref_compress1( op->out, op->msg, FD_BLAKE3_CHUNK_SZ, op->counter, op->flags, NULL, NULL );
      buf_cnt--;
    }
    s->next_tick++;
  }
}
482 :
483 : #endif
484 :
/* fd_blake3_append streams sz bytes at _data into the hash state.
   Full FD_BLAKE3_PRIVATE_BUF_MAX sized buffers are hashed eagerly;
   any tail is kept in sha->block for the next append or fini.
   Returns sha. */

fd_blake3_t *
fd_blake3_append( fd_blake3_t * sha,
                  void const *  _data,
                  ulong         sz ) {

  /* If no data to append, we are done */

  if( FD_UNLIKELY( !sz ) ) return sha;
  FD_BLAKE3_TRACE(( "fd_blake3_append(sha=%p,data=%p,sz=%lu)", (void *)sha, _data, sz ));

  /* Unpack inputs */

  fd_blake3_pos_t * s        = &sha->pos;
  fd_blake3_buf_t * tbl      = &sha->buf;
  uchar *           buf      = sha->block;
  ulong             buf_used = sha->block_sz;

  uchar const * data = (uchar const *)_data;

  /* Update input_sz */

  s->input_sz += sz;

  /* Edge case: For the first completed 1024 bytes of input, don't
     immediately hash, since it is not clear whether this chunk has
     the root flag set. */
  if( FD_UNLIKELY( FD_BLAKE3_PARA_MAX==1 && s->input_sz==1024UL ) ) {
    fd_memcpy( buf + buf_used, data, sz );
    sha->block_sz = FD_BLAKE3_CHUNK_SZ;
    return sha;
  }

  /* Handle buffered bytes from previous appends */

  if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */

    /* If the append isn't large enough to complete the current block,
       buffer these bytes too and return */

    ulong buf_rem = FD_BLAKE3_PRIVATE_BUF_MAX - buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
    if( FD_UNLIKELY( sz < buf_rem ) ) { /* optimize for large append */
      fd_memcpy( buf + buf_used, data, sz );
      sha->block_sz = buf_used + sz;
      return sha;
    }

    /* Otherwise, buffer enough leading bytes of data to complete the
       block, update the hash and then continue processing any remaining
       bytes of data. */

    fd_memcpy( buf + buf_used, data, buf_rem );
    data += buf_rem;
    sz   -= buf_rem;

    fd_blake3_append_blocks( s, tbl, buf, 1UL );
    sha->block_sz = 0UL;
  }

  /* Append the bulk of the data */

  ulong buf_cnt = sz >> FD_BLAKE3_PRIVATE_LG_BUF_MAX;
  if( FD_LIKELY( buf_cnt ) ) fd_blake3_append_blocks( s, tbl, data, buf_cnt ); /* optimized for large append */

  /* Buffer any leftover bytes */

  buf_used = sz & (FD_BLAKE3_PRIVATE_BUF_MAX-1UL); /* In [0,FD_BLAKE3_PRIVATE_BUF_MAX) */
  if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
    fd_memcpy( buf, data + (buf_cnt << FD_BLAKE3_PRIVATE_LG_BUF_MAX), buf_used );
    sha->block_sz = buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
  }

  FD_BLAKE3_TRACE(( "fd_blake3_append: done" ));
  return sha;
}
559 :
/* fd_blake3_single_hash drives the scheduler to completion and returns
   a pointer to the 32 byte root hash inside tbl.  The SIMD build
   batches up to FD_BLAKE3_PARA_MAX ops per tick; the scalar build
   compresses one op at a time. */

static void const *
fd_blake3_single_hash( fd_blake3_pos_t * s,
                       fd_blake3_buf_t * tbl ) {
#if FD_BLAKE3_PARA_MAX>1
  ulong tick = 0UL;
  while( !fd_blake3_is_finished( s, tick ) ) {
    /* Collect as many independent ops as the backend can do at once */
    fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
    ulong op_cnt = 0UL;
    while( op_cnt<FD_BLAKE3_PARA_MAX ) {
      fd_blake3_op_t * op = &ops[ op_cnt ];
      if( !fd_blake3_prepare( s, tbl, op, tick ) )
        break;
      op_cnt++;
    }

    fd_blake3_batch_hash( ops, op_cnt );
    tick++;
  }
#else
  while( !fd_blake3_is_finished( s, s->next_tick ) ) {
    fd_blake3_op_t op[1] = {0};
    if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) )
      break;
    s->next_tick++;
    FD_BLAKE3_TRACE(( "fd_blake3_single_hash: compressing %hu bytes at layer %u, counter %lu, flags 0x%x",
                      op->sz, s->layer, op->counter, op->flags ));
#   if FD_HAS_SSE
    fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
#   else
    fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
#   endif
  }
#endif
  /* Root chaining value lands in the first slot of the final layer */
  return tbl->slots[ s->layer ][0];
}
595 :
/* fd_blake3_fini finishes the streaming computation, writing the
   32 byte BLAKE3 hash to hash.  Returns hash. */

void *
fd_blake3_fini( fd_blake3_t * sha,
                void *        hash ) {

  /* Unpack inputs */

  fd_blake3_pos_t * s        = &sha->pos;
  fd_blake3_buf_t * tbl      = &sha->buf;
  uchar *           buf      = sha->block;
  ulong             buf_used = sha->block_sz;
  FD_BLAKE3_TRACE(( "fd_blake3_fini(sha=%p,sz=%lu)", (void *)sha, s->input_sz ));

  /* Rebase input onto the partial-block buffer so remaining bytes are
     consumed from there.  TODO HACKY!! */
  s->input    = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
  s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;

  void const * hash_ = fd_blake3_single_hash( s, tbl );
  memcpy( hash, hash_, 32UL );
  return hash;
}
616 :
617 : /* fd_blake3_fini_xof_compress performs BLAKE3 compression (input
618 : hashing) for all blocks in the hash tree except for the root block.
619 : Root compression inputs are returned via the function's out pointers:
620 : On return, root_msg[0..64] contains the padded message input for the
621 : root block, root_cv_pre[0..64] contains the output chaining value of
622 : the previous block (or the BLAKE3 IV if root block is the only block
623 : in the hash operation, i.e. <=64 byte hash input).
624 : Other values (counter, flags, size) are re-derived by the XOF
625 : implementation using the blake3 state object. */
626 :
/* See the block comment above for the contract: hashes everything
   except the root block; root_msg receives the root block's padded
   message and root_cv_pre the preceding chaining value (or BLAKE3 IV
   for single-block inputs). */

void
fd_blake3_fini_xof_compress( fd_blake3_t * sha,
                             uchar *       root_msg,
                             uchar *       root_cv_pre ) {
  fd_blake3_pos_t * s        = &sha->pos;
  fd_blake3_buf_t * tbl      = &sha->buf;
  uchar *           buf      = sha->block;
  ulong             buf_used = sha->block_sz;

  /* Rebase input onto the partial-block buffer.  TODO HACKY!! */
  s->input    = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
  s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;

  /* The root block is contained in a leaf.  Process all but the last
     blocks of the chunk.  (The last block is the "root" block) */
  if( s->input_sz<=FD_BLAKE3_CHUNK_SZ ) {
    fd_blake3_op_t op[1];
    if( !fd_blake3_prepare_leaf( s, tbl, op, s->next_tick ) )
      FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare compression of <=1024 byte message (duplicate call to fini?)" ));
#if FD_HAS_SSE
    fd_blake3_sse_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
#else
    fd_blake3_ref_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
#endif
    return;
  }

  /* The root block is a branch node.  Continue working until there are
     only two blocks remaining (live_cnt==2: the root's two children). */
  ulong tick = sha->pos.next_tick+1;
  for(;;) {
    int l0_complete = fd_blake3_l0_complete( s );
    int ln_complete = s->live_cnt == 2UL;
    if( l0_complete & ln_complete ) break;

#if FD_BLAKE3_PARA_MAX>1
    /* Batch as many independent ops as possible per tick */
    fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
    ulong op_cnt = 0UL;
    while( op_cnt<FD_BLAKE3_PARA_MAX ) {
      fd_blake3_op_t * op = &ops[ op_cnt ];
      if( !fd_blake3_prepare( s, tbl, op, tick ) )
        break;
      op_cnt++;
    }
    if( FD_UNLIKELY( !op_cnt ) ) {
      FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare branch compression with live_cnt=%lu (duplicate call to fini?)", s->live_cnt ));
    }

    fd_blake3_batch_hash( ops, op_cnt );
#else
    fd_blake3_op_t op[1] = {0};
    if( !fd_blake3_prepare( s, tbl, op, tick ) )
      break;
#   if FD_HAS_SSE
    fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
#   else
    fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
#   endif
#endif
    tick++;
  }
}
689 :
/* fd_blake3_fini_2048 finishes the streaming computation in XOF mode,
   expanding the root block into a 2048 byte output at hash (32 root
   compressions with incrementing counters).  Returns hash. */

void *
fd_blake3_fini_2048( fd_blake3_t * sha,
                     void *        hash ) {
  FD_BLAKE3_TRACE(( "fd_blake3_fini_2048(sha=%p,hash=%p)", (void *)sha, hash ));

  /* Compress input until the last remaining piece of work is the BLAKE3
     root block.  This root block is put through the compression
     function repeatedly to "expand" the hash output (XOF hashing).
     Solana uses this to generate a 2048 byte 'LtHash' value.
     fd_blake3 does this SIMD-parallel for better performance. */
  uchar root_msg   [ 64 ] __attribute__((aligned(64)));
  uchar root_cv_pre[ 32 ] __attribute__((aligned(32)));
  fd_blake3_fini_xof_compress( sha, root_msg, root_cv_pre );

  /* Restore root block details */
  uint  last_block_sz    = 64u;
  uint  last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_PARENT;
  ulong ctr0             = 0UL;
  if( sha->pos.input_sz<=FD_BLAKE3_CHUNK_SZ ) {
    /* Root block is the last block of a single leaf chunk: recover its
       size (partial final block unless 64 byte aligned) and flags */
    last_block_sz = (uint)sha->pos.input_sz & 63u;
    if( fd_ulong_is_aligned( sha->pos.input_sz, 64 ) ) last_block_sz = 64;
    if( FD_UNLIKELY( sha->pos.input_sz==0UL ) ) last_block_sz = 0u;
    last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_CHUNK_END;
    if( sha->pos.input_sz<=FD_BLAKE3_BLOCK_SZ ) last_block_flags |= FD_BLAKE3_FLAG_CHUNK_START;
    ctr0 = sha->pos.leaf_idx-1UL;
  } else {
    /* Root block is a branch: its message is the final two chaining
       values; it chains from the BLAKE3 IV */
    fd_blake3_op_t op[1];
    if( FD_UNLIKELY( !fd_blake3_prepare( &sha->pos, &sha->buf, op, sha->pos.next_tick+1UL ) ) ) {
      FD_LOG_ERR(( "fd_blake3_fini_2048 invariant violation: failed to prepare branch root compression (duplicate call to fini?)" ));
    }
    memcpy( root_msg, op->msg, 64UL );
    memcpy( root_cv_pre, FD_BLAKE3_IV, 32UL );
  }
  FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: sz=%lu ctr0=%lu flags=%x",
                    sha->pos.input_sz, ctr0, last_block_flags ));

  /* Expand LtHash
     For now, this uses the generic AVX2/AVX512 compress backend.
     Could write a more optimized version in the future saving some of
     the matrix transpose work. */
  for( ulong i=0UL; i<32UL; i+=FD_BLAKE3_PARA_MAX ) {
#if FD_HAS_AVX512
    ulong batch_data [ 16 ] __attribute__((aligned(64)));
    /* */ for( ulong j=0; j<16; j++ ) batch_data [ j ] = (ulong)root_msg;
    uint   batch_sz   [ 16 ]; for( ulong j=0; j<16; j++ ) batch_sz   [ j ] = last_block_sz;
    ulong  batch_ctr  [ 16 ]; for( ulong j=0; j<16; j++ ) batch_ctr  [ j ] = ctr0+i+j;
    uint   batch_flags[ 16 ]; for( ulong j=0; j<16; j++ ) batch_flags[ j ] = last_block_flags;
    void * batch_hash [ 16 ]; for( ulong j=0; j<16; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
    void * batch_cv   [ 16 ]; for( ulong j=0; j<16; j++ ) batch_cv   [ j ] = root_cv_pre;
    fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
#elif FD_HAS_AVX
    ulong  batch_data [ 8 ]; for( ulong j=0; j<8; j++ ) batch_data [ j ] = (ulong)root_msg;
    uint   batch_sz   [ 8 ]; for( ulong j=0; j<8; j++ ) batch_sz   [ j ] = last_block_sz;
    ulong  batch_ctr  [ 8 ]; for( ulong j=0; j<8; j++ ) batch_ctr  [ j ] = ctr0+i+j;
    uint   batch_flags[ 8 ]; for( ulong j=0; j<8; j++ ) batch_flags[ j ] = last_block_flags;
    void * batch_hash [ 8 ]; for( ulong j=0; j<8; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
    void * batch_cv   [ 8 ]; for( ulong j=0; j<8; j++ ) batch_cv   [ j ] = root_cv_pre;
    fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
#elif FD_HAS_SSE
    fd_blake3_sse_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
#else
    fd_blake3_ref_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
#endif
  }

  FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: done" ));
  return hash;
}
758 :
/* fd_blake3_hash computes the 32 byte BLAKE3 hash of the sz byte
   message at data in a single call (no streaming state needed).
   Bulk full chunks go through the fast parallel path first; the
   general scheduler finishes the remainder.  Returns hash. */

void *
fd_blake3_hash( void const * data,
                ulong        sz,
                void *       hash ) {

  fd_blake3_buf_t tbl[1];
  fd_blake3_pos_t s[1];
  fd_blake3_pos_init( s, data, sz );

#if FD_BLAKE3_PARA_MAX>1
  /* Fast path: hash full chunks in parallel while at least 4 remain */
  for(;;) {
    fd_blake3_op_t op[1];
    if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, 4 ) )
      break;
#if FD_HAS_AVX512
    fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
#elif FD_HAS_AVX
    fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
#else
#error "missing para support"
#endif
  }
#endif

  /* Slow path: finish whatever the fast path left behind */
  void const * hash_ = fd_blake3_single_hash( s, tbl );
  memcpy( hash, hash_, 32UL );
  return hash;
}
787 :
788 : #if FD_HAS_AVX
789 :
790 : void
791 : fd_blake3_lthash_batch8(
792 : void const * batch_data[8], /* align=32 ele_align=1 */
793 : uint const batch_sz [8], /* align=32 */
794 : void * out_lthash /* align=32 */
795 0 : ) {
796 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 32 ) ) ) {
797 0 : FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
798 0 : }
799 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 32 ) ) ) {
800 0 : FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
801 0 : }
802 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 32 ) ) ) {
803 0 : FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
804 0 : }
805 :
806 0 : ulong batch_ctr [ 8 ] = {0};
807 0 : uint batch_flags[ 8 ]; for( uint i=0; i<8; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
808 0 : fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
809 0 : }
810 :
811 : #endif
812 :
813 : #if FD_HAS_AVX512
814 :
815 : void
816 : fd_blake3_lthash_batch16(
817 : void const * batch_data[16], /* align=32 ele_align=1 */
818 : uint const batch_sz [16], /* align=32 */
819 : void * out_lthash /* align=32 */
820 0 : ) {
821 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 64 ) ) ) {
822 0 : FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
823 0 : }
824 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 64 ) ) ) {
825 0 : FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
826 0 : }
827 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 64 ) ) ) {
828 0 : FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
829 0 : }
830 :
831 0 : ulong batch_ctr [ 16 ] = {0};
832 0 : uint batch_flags[ 16 ]; for( uint i=0; i<16; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
833 0 : fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
834 0 : }
835 :
836 : #endif
|