LCOV - code coverage report
Current view: top level - ballet/blake3 - fd_blake3.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 309 432 71.5 %
Date: 2026-03-19 18:19:27 Functions: 17 26 65.4 %

          Line data    Source code
       1             : #include "fd_blake3.h"
       2             : #include "fd_blake3_private.h"
       3             : #include <assert.h>
       4             : 
       5             : /* Hash state machine *************************************************/
       6             : 
       7             : static FD_FN_UNUSED fd_blake3_pos_t *
       8             : fd_blake3_pos_init( fd_blake3_pos_t * s,
       9             :                     uchar const *     data,
      10        4259 :                     ulong             sz ) {
      11        4259 :   *s = (fd_blake3_pos_t) {
      12        4259 :     .input    = data,
      13        4259 :     .input_sz = sz,
      14        4259 :     .magic    = FD_BLAKE3_MAGIC,
      15        4259 :   };
      16        4259 :   return s;
      17        4259 : }
      18             : 
      19             : /* fd_blake3_l0_complete returns 1 if all leaf nodes have been hashed,
      20             :    0 otherwise. */
      21             : 
      22             : FD_FN_PURE static inline int
      23      170337 : fd_blake3_l0_complete( fd_blake3_pos_t const * s ) {
      24      170337 :   return ( s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ ) >= fd_ulong_max( s->input_sz, 64 );
      25      170337 : }
      26             : 
      27             : FD_FN_PURE static inline int
      28             : fd_blake3_is_finished( fd_blake3_pos_t const * s,
      29       73817 :                        ulong                   tick ) {
      30       73817 :   int l0_complete = fd_blake3_l0_complete( s );
      31       73817 :   int ln_complete = s->live_cnt == 1UL;
      32       73817 :   int idle        = tick >= s->next_tick;
      33       73817 :   return l0_complete & ln_complete & idle;
      34       73817 : }
      35             : 
      36             : static fd_blake3_op_t *
      37             : fd_blake3_prepare_leaf( fd_blake3_pos_t * restrict s,
      38             :                         fd_blake3_buf_t * restrict buf,
      39             :                         fd_blake3_op_t *  restrict op,
      40       11436 :                         ulong                      tick ) {
      41             : 
      42       11436 :   ulong         msg_off = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
      43       11436 :   ulong         msg_sz  = fd_ulong_min( s->input_sz - msg_off, 1024UL );
      44       11436 :   uchar const * msg     = s->input + msg_off;
      45       11436 :   uchar       * out     = buf->slots[ s->layer ][ s->head.uc[ s->layer ] ];
      46             : 
      47       11436 :   int flags = fd_int_if( s->input_sz <= FD_BLAKE3_CHUNK_SZ, FD_BLAKE3_FLAG_ROOT, 0 );
      48             : 
      49       11436 :   *op = (fd_blake3_op_t) {
      50       11436 :     .msg     = msg,
      51       11436 :     .out     = out,
      52       11436 :     .counter = s->leaf_idx,
      53       11436 :     .sz      = (ushort)msg_sz,
      54       11436 :     .flags   = (uchar)flags
      55       11436 :   };
      56             : 
      57       11436 :   s->head.uc[ 0 ] = (uchar)( s->head.uc[ 0 ]+1 );
      58       11436 :   s->leaf_idx++;
      59       11436 :   s->live_cnt++;
      60       11436 :   s->next_tick = tick+1;
      61             : 
      62       11436 :   return op;
      63             : 
      64       11436 : }
      65             : 
      66             : static int
      67             : fd_blake3_seek_branch( fd_blake3_pos_t * restrict s,
      68             :                        fd_blake3_buf_t * restrict buf,
      69       61645 :                        ulong                      tick ) {
      70             : 
      71       61645 :   if( s->live_cnt == 1UL )
      72           0 :     return 0;
      73             : 
      74       61645 :   if( !fd_blake3_l0_complete( s ) )
      75       28704 :     return ( s->tail.uc[ s->layer - 1 ] + 1 ) <
      76       28704 :            ( s->head.uc[ s->layer - 1 ]     );
      77             : 
      78       32941 : # if FD_HAS_AVX
      79             : 
      80       32941 :   wb_t diff = wb_sub( s->head.wb, s->tail.wb );
      81             : 
      82       32941 :   uint mergeable_layers = (uint)_mm256_movemask_epi8( wb_gt( diff, wb_bcast( 1 ) ) );
      83       32941 :   int  merge_layer = fd_uint_find_lsb_w_default( mergeable_layers, -1 );
      84       32941 :   if( merge_layer>=0 ) {
      85       28771 :     if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
      86        4797 :       return 0;  /* still waiting for previous merge */
      87       23974 :     s->layer = (uint)merge_layer+1U;
      88       23974 :     return 1;
      89       28771 :   }
      90             : 
      91        4170 :   uint single_layers = (uint)_mm256_movemask_epi8( wb_eq( diff, wb_bcast( 1 ) ) );
      92        4170 :   uint single_lo = (uint)fd_uint_find_lsb( single_layers );
      93        4170 :   uint single_hi = (uint)fd_uint_find_lsb( single_layers & ( ~fd_uint_mask_lsb( (int)(single_lo+1U) ) ) );
      94             : 
      95        4170 :   wb_t node = wb_ld( buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ] );
      96        4170 :               wb_st( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ], node );
      97             : 
      98             : # else /* FD_HAS_AVX */
      99             : 
     100             :   uchar diff[ 32 ];
     101             :   for( ulong j=0UL; j<32UL; j++ ) diff[j] = (uchar)( s->head.uc[j] - s->tail.uc[j] );
     102             : 
     103             :   int merge_layer = -1;
     104             :   for( uint j=0U; j<32U; j++ ) {
     105             :     if( diff[j]>1 ) {
     106             :       merge_layer = (int)j;
     107             :       break;
     108             :     }
     109             :   }
     110             :   if( merge_layer>=0 ) {
     111             :     if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
     112             :       return 0;  /* still waiting for previous merge */
     113             :     s->layer = (uint)(merge_layer+1);
     114             :     return 1;
     115             :   }
     116             : 
     117             :   uint j=0U;
     118             :   uint single_lo = 0UL;
     119             :   uint single_hi = 0UL;
     120             :   for( ; j<32U; j++ ) {
     121             :     if( diff[j] ) {
     122             :       single_lo = j;
     123             :       break;
     124             :     }
     125             :   }
     126             :   j++;
     127             :   for( ; j<32U; j++ ) {
     128             :     if( diff[j] ) {
     129             :       single_hi = j;
     130             :       break;
     131             :     }
     132             :   }
     133             : 
     134             :   memcpy( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ],
     135             :           buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ],
     136             :           32UL );
     137             : 
     138             : # endif /* FD_HAS_AVX */
     139             : 
     140        4170 :   FD_BLAKE3_TRACE(( "fd_blake3_seek_branch: moving up %u/%u to %u/%u",
     141        4170 :                     single_lo, s->tail.uc[ single_lo ],
     142        4170 :                     single_hi, s->head.uc[ single_hi ] ));
     143             : 
     144        4170 :   if( ((uint)single_hi >= s->layer) & (tick < s->next_tick) )
     145        1796 :     return 0;  /* still waiting for previous merge */
     146             : 
     147        2374 :   s->head.uc[ single_lo ] = (uchar)( s->head.uc[ single_lo ]-1 );
     148        2374 :   s->head.uc[ single_hi ] = (uchar)( s->head.uc[ single_hi ]+1 );
     149             : 
     150        2374 :   s->layer = (uint)single_hi+1U;
     151        2374 :   return 1;
     152        4170 : }
     153             : 
     154             : static fd_blake3_op_t *
     155             : fd_blake3_prepare_branch( fd_blake3_pos_t * restrict s,
     156             :                           fd_blake3_buf_t * restrict buf,
     157             :                           fd_blake3_op_t *  restrict op,
     158       61645 :                           ulong                      tick ) {
     159             : 
     160       61645 :   if( !fd_blake3_seek_branch( s, buf, tick ) )
     161        6593 :     return NULL;
     162             : 
     163       55052 :   assert( s->layer < FD_BLAKE3_ROW_CNT );
     164             : 
     165           0 :   uchar const * msg = buf->slots[ s->layer-1U ][ s->tail.uc[ s->layer-1U ] ];
     166       55052 :   uchar       * out = buf->slots[ s->layer    ][ s->head.uc[ s->layer    ] ];
     167             : 
     168       55052 :   s->head.uc[ s->layer   ] = (uchar)( s->head.uc[ s->layer   ]+1 );
     169       55052 :   s->tail.uc[ s->layer-1 ] = (uchar)( s->tail.uc[ s->layer-1 ]+2 );
     170       55052 :   s->live_cnt--;
     171       55052 :   s->next_tick = tick+1;
     172             : 
     173       55052 :   uint flags = FD_BLAKE3_FLAG_PARENT |
     174       55052 :                fd_uint_if( s->live_cnt==1UL, FD_BLAKE3_FLAG_ROOT, 0u );
     175             : 
     176       55052 :   *op = (fd_blake3_op_t) {
     177       55052 :     .msg     = msg,
     178       55052 :     .out     = out,
     179       55052 :     .counter = 0UL,
     180       55052 :     .sz      = 64U,
     181       55052 :     .flags   = (uchar)flags
     182       55052 :   };
     183       55052 :   return op;
     184             : 
     185       61645 : }
     186             : 
     187             : static void
     188       13231 : fd_blake3_advance( fd_blake3_pos_t * restrict s ) {
     189             : 
     190       13231 : # if FD_HAS_AVX
     191             : 
     192       13231 :   wb_t mask = wb_eq( s->tail.wb, s->head.wb );
     193       13231 :   s->tail.wb = wb_andnot( mask, s->tail.wb );
     194       13231 :   s->head.wb = wb_andnot( mask, s->head.wb );
     195             : 
     196             : # else /* FD_HAS_AVX */
     197             : 
     198             :   for( ulong j=0UL; j<32UL; j++ ) {
     199             :     if( s->tail.uc[j] == s->head.uc[j] ) {
     200             :       s->tail.uc[j] = 0;
     201             :       s->head.uc[j] = 0;
     202             :     }
     203             :   }
     204             : 
     205             : # endif /* FD_HAS_AVX */
     206             : 
     207       13231 :   if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT ) {
     208        1196 :     s->layer++;
     209        1196 :   }
     210       12035 :   else if( ( s->layer > 0UL ) &&
     211       12035 :            ( s->tail.uc[ s->layer-1 ] < s->head.uc[ s->layer-1 ] ) ) {
     212             :     /* pass */
     213        2394 :   }
     214        9641 :   else if( fd_blake3_l0_complete( s ) ) {
     215        7195 :     s->layer++;
     216        7195 :   }
     217        2446 :   else if( s->layer > 0UL ) {
     218         598 :     s->layer = 0UL;
     219         598 :   }
     220             : 
     221       13231 : }
     222             : 
     223             : static fd_blake3_op_t *
     224             : fd_blake3_prepare( fd_blake3_pos_t * restrict s,
     225             :                    fd_blake3_buf_t * restrict buf,
     226             :                    fd_blake3_op_t *  restrict op,
     227       73716 :                    ulong                      tick ) {
     228             : 
     229       73716 :   assert( s->layer < FD_BLAKE3_ROW_CNT );
     230             : 
     231       73716 :   if( fd_blake3_is_finished( s, tick ) )
     232           0 :     return NULL;
     233             : 
     234       73716 :   if( tick >= s->next_tick )
     235       13231 :     fd_blake3_advance( s );
     236             : 
     237       73716 :   if( s->layer != 0 )
     238       61647 :     return fd_blake3_prepare_branch( s, buf, op, tick );
     239             : 
     240       12069 :   if( ( s->head.uc[0] >= FD_BLAKE3_COL_CNT ) |
     241       12069 :       ( fd_blake3_l0_complete( s )         ) ) {
     242        2450 :     return NULL;
     243        2450 :   }
     244             : 
     245        9619 :   return fd_blake3_prepare_leaf( s, buf, op, tick );
     246             : 
     247       12069 : }
     248             : 
     249             : #if FD_BLAKE3_PARA_MAX>1
     250             : 
     251             : /* fd_blake3_prepare_fast does streamlined hashing of full chunks or
     252             :    full branches. */
     253             : 
     254             : static fd_blake3_op_t *
     255             : fd_blake3_prepare_fast( fd_blake3_pos_t * restrict s,
     256             :                         fd_blake3_buf_t * restrict buf,
     257             :                         fd_blake3_op_t *  restrict op,
     258             :                         ulong                      n,
     259        7773 :                         ulong                      min ) {
     260             : 
     261        7773 :   if( s->layer && s->head.uc[ s->layer-1 ]==FD_BLAKE3_COL_CNT ) {
     262        2392 :     op->msg     = buf->rows[ s->layer-1 ];
     263        2392 :     op->out     = buf->rows[ s->layer ] + (s->head.uc[ s->layer ]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
     264        2392 :     op->counter = 0UL;
     265        2392 :     op->flags   = FD_BLAKE3_FLAG_PARENT;
     266             : 
     267             :     /* Assume that branch layer is fully hashed (up to col cnt) */
     268        2392 :     s->head.uc[ s->layer-1 ] =  0;
     269        2392 :     s->head.uc[ s->layer   ] = (uchar)( (ulong)s->head.uc[ s->layer ]+n );
     270        2392 :     s->live_cnt -= n;
     271        2392 :     s->layer = fd_uint_if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT,
     272        2392 :                            s->layer+1U, 0U );
     273             : 
     274        2392 :     return op;
     275        2392 :   }
     276             : 
     277        5381 :   ulong pos   = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
     278        5381 :   ulong avail = fd_ulong_align_dn( s->input_sz - pos, FD_BLAKE3_CHUNK_SZ ) >> FD_BLAKE3_CHUNK_LG_SZ;
     279        5381 :   n = fd_ulong_min( n, avail );
     280             : 
     281             :   /* This constants controls the threshold when to use the (slow)
     282             :      scheduler instead of fast single-message hashing.  Carefully tuned
     283             :      for best overall performance. */
     284        5381 :   if( n<min ) return NULL;
     285             : 
     286        5381 :   op->msg     = s->input + (s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ);
     287        5381 :   op->out     = buf->rows[0] + (s->head.uc[0]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
     288        5381 :   op->counter = s->leaf_idx;
     289        5381 :   op->flags   = 0;
     290             : 
     291        5381 :   s->head.uc[0] = (uchar)( (ulong)s->head.uc[0]+n );
     292        5381 :   s->leaf_idx   += n;
     293        5381 :   s->live_cnt   += n;
     294        5381 :   s->layer      =  fd_uint_if( s->head.uc[0]==FD_BLAKE3_COL_CNT, 1U, 0U );
     295             : 
     296        5381 :   return op;
     297        5381 : }
     298             : 
     299             : static void
     300             : fd_blake3_batch_hash( fd_blake3_op_t const * ops,
     301       10836 :                       ulong                  op_cnt ) {
     302       10836 :   uchar const * batch_data   [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
     303       10836 :   uint          batch_data_sz[ FD_BLAKE3_PARA_MAX ] = {0};
     304       10836 :   uchar *       batch_hash   [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
     305       10836 :   ulong         batch_ctr    [ FD_BLAKE3_PARA_MAX ];
     306       10836 :   uint          batch_flags  [ FD_BLAKE3_PARA_MAX ];
     307       73141 :   for( ulong j=0UL; j<op_cnt; j++ ) {
     308       62305 :     batch_data   [ j ] = ops[ j ].msg;
     309       62305 :     batch_hash   [ j ] = ops[ j ].out;
     310       62305 :     batch_data_sz[ j ] = ops[ j ].sz;
     311       62305 :     batch_ctr    [ j ] = ops[ j ].counter;
     312       62305 :     batch_flags  [ j ] = ops[ j ].flags;
     313       62305 :   }
     314       10836 : #if FD_HAS_AVX512
     315       10836 :   fd_blake3_avx512_compress16( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
     316             : #elif FD_HAS_AVX
     317             :   fd_blake3_avx_compress8    ( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
     318             : #else
     319             :   #error "FIXME missing para support"
     320             : #endif
     321       10836 : }
     322             : 
     323             : #endif
     324             : 
     325             : /* Simple API *********************************************************/
     326             : 
     327             : ulong
     328           0 : fd_blake3_align( void ) {
     329           0 :   return FD_BLAKE3_ALIGN;
     330           0 : }
     331             : 
     332             : ulong
     333           0 : fd_blake3_footprint( void ) {
     334           0 :   return FD_BLAKE3_FOOTPRINT;
     335           0 : }
     336             : 
     337             : void *
     338           0 : fd_blake3_new( void * shmem ) {
     339           0 :   fd_blake3_t * sha = (fd_blake3_t *)shmem;
     340             : 
     341           0 :   if( FD_UNLIKELY( !shmem ) ) {
     342           0 :     FD_LOG_WARNING(( "NULL shmem" ));
     343           0 :     return NULL;
     344           0 :   }
     345             : 
     346           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, fd_blake3_align() ) ) ) {
     347           0 :     FD_LOG_WARNING(( "misaligned shmem" ));
     348           0 :     return NULL;
     349           0 :   }
     350             : 
     351           0 :   ulong footprint = fd_blake3_footprint();
     352             : 
     353           0 :   fd_memset( sha, 0, footprint );
     354             : 
     355           0 :   FD_COMPILER_MFENCE();
     356           0 :   FD_VOLATILE( sha->pos.magic ) = FD_BLAKE3_MAGIC;
     357           0 :   FD_COMPILER_MFENCE();
     358             : 
     359           0 :   return (void *)sha;
     360           0 : }
     361             : 
     362             : fd_blake3_t *
     363           0 : fd_blake3_join( void * shsha ) {
     364             : 
     365           0 :   if( FD_UNLIKELY( !shsha ) ) {
     366           0 :     FD_LOG_WARNING(( "NULL shsha" ));
     367           0 :     return NULL;
     368           0 :   }
     369             : 
     370           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
     371           0 :     FD_LOG_WARNING(( "misaligned shsha" ));
     372           0 :     return NULL;
     373           0 :   }
     374             : 
     375           0 :   fd_blake3_t * sha = (fd_blake3_t *)shsha;
     376             : 
     377           0 :   if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
     378           0 :     FD_LOG_WARNING(( "bad magic" ));
     379           0 :     return NULL;
     380           0 :   }
     381             : 
     382           0 :   return sha;
     383           0 : }
     384             : 
     385             : void *
     386           0 : fd_blake3_leave( fd_blake3_t * sha ) {
     387             : 
     388           0 :   if( FD_UNLIKELY( !sha ) ) {
     389           0 :     FD_LOG_WARNING(( "NULL sha" ));
     390           0 :     return NULL;
     391           0 :   }
     392             : 
     393           0 :   return (void *)sha;
     394           0 : }
     395             : 
     396             : void *
     397           0 : fd_blake3_delete( void * shsha ) {
     398             : 
     399           0 :   if( FD_UNLIKELY( !shsha ) ) {
     400           0 :     FD_LOG_WARNING(( "NULL shsha" ));
     401           0 :     return NULL;
     402           0 :   }
     403             : 
     404           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
     405           0 :     FD_LOG_WARNING(( "misaligned shsha" ));
     406           0 :     return NULL;
     407           0 :   }
     408             : 
     409           0 :   fd_blake3_t * sha = (fd_blake3_t *)shsha;
     410             : 
     411           0 :   if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
     412           0 :     FD_LOG_WARNING(( "bad magic" ));
     413           0 :     return NULL;
     414           0 :   }
     415             : 
     416           0 :   FD_COMPILER_MFENCE();
     417           0 :   FD_VOLATILE( sha->pos.magic ) = 0UL;
     418           0 :   FD_COMPILER_MFENCE();
     419             : 
     420           0 :   return (void *)sha;
     421           0 : }
     422             : 
     423             : 
     424             : fd_blake3_t *
     425        4259 : fd_blake3_init( fd_blake3_t * sha ) {
     426        4259 :   FD_BLAKE3_TRACE(( "fd_blake3_init(sha=%p)", (void *)sha ));
     427        4259 :   fd_blake3_pos_init( &sha->pos, NULL, 0UL );
     428        4259 :   sha->block_sz = 0UL;
     429        4259 :   return sha;
     430        4259 : }
     431             : 
     432             : #if FD_BLAKE3_PARA_MAX>1
     433             : 
     434             : static void
     435             : fd_blake3_append_blocks( fd_blake3_pos_t * s,
     436             :                          fd_blake3_buf_t * tbl,
     437             :                          uchar const *     data,
     438        1797 :                          ulong             buf_cnt ) {
     439        1797 :   s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
     440        7179 :   for( ulong i=0UL; i<buf_cnt; i++ ) {
     441        5382 :     fd_blake3_op_t op[1];
     442        7774 :     do {
     443        7774 :       if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, FD_BLAKE3_PARA_MAX ) )
     444           0 :         return;
     445        7774 : #if FD_HAS_AVX512
     446        7774 :       fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
     447             : #elif FD_HAS_AVX
     448             :       fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
     449             : #else
     450             :       #error "missing para support"
     451             : #endif
     452        7774 :     } while( op->flags & FD_BLAKE3_FLAG_PARENT );
     453        5382 :   }
     454        1797 : }
     455             : 
     456             : #else
     457             : 
     458             : static void
     459             : fd_blake3_append_blocks( fd_blake3_pos_t * s,
     460             :                          fd_blake3_buf_t * tbl,
     461             :                          uchar const *     data,
     462             :                          ulong             buf_cnt ) {
     463             :   (void)buf_cnt;
     464             :   s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
     465             :   fd_blake3_op_t op[1];
     466             :   while( buf_cnt ) {
     467             :     if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) ) {
     468             :       FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: no more ops to prepare" ));
     469             :       break;
     470             :     }
     471             :     if( op->flags & FD_BLAKE3_FLAG_PARENT ) {
     472             :       FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing output chaining values (layer %u)", s->layer ));
     473             :       fd_blake3_ref_compress1( op->out, op->msg, 64UL, op->counter, op->flags, NULL, NULL );
     474             :     } else {
     475             :       FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing %lu leaf chunks", FD_BLAKE3_COL_CNT ));
     476             :       fd_blake3_ref_compress1( op->out, op->msg, FD_BLAKE3_CHUNK_SZ, op->counter, op->flags, NULL, NULL );
     477             :       buf_cnt--;
     478             :     }
     479             :     s->next_tick++;
     480             :   }
     481             : }
     482             : 
     483             : #endif
     484             : 
     485             : fd_blake3_t *
     486             : fd_blake3_append( fd_blake3_t * sha,
     487             :                   void const *  _data,
     488       21193 :                   ulong         sz ) {
     489             : 
     490             :   /* If no data to append, we are done */
     491             : 
     492       21193 :   if( FD_UNLIKELY( !sz ) ) return sha;
     493       19998 :   FD_BLAKE3_TRACE(( "fd_blake3_append(sha=%p,data=%p,sz=%lu)", (void *)sha, _data, sz ));
     494             : 
     495             :   /* Unpack inputs */
     496             : 
     497       19998 :   fd_blake3_pos_t * s        = &sha->pos;
     498       19998 :   fd_blake3_buf_t * tbl      = &sha->buf;
     499       19998 :   uchar *           buf      = sha->block;
     500       19998 :   ulong             buf_used = sha->block_sz;
     501             : 
     502       19998 :   uchar const * data = (uchar const *)_data;
     503             : 
     504             :   /* Update input_sz */
     505             : 
     506       19998 :   s->input_sz += sz;
     507             : 
     508             :   /* Edge case: For the first completed 1024 bytes of input, don't
     509             :      immediately hash, since it is not clear whether this chunk has
     510             :      the root flag set. */
     511       19998 :   if( FD_UNLIKELY( FD_BLAKE3_PARA_MAX==1 && s->input_sz==1024UL ) ) {
     512           0 :     fd_memcpy( buf + buf_used, data, sz );
     513           0 :     sha->block_sz = FD_BLAKE3_CHUNK_SZ;
     514           0 :     return sha;
     515           0 :   }
     516             : 
     517             :   /* Handle buffered bytes from previous appends */
     518             : 
     519       19998 :   if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
     520             : 
     521             :     /* If the append isn't large enough to complete the current block,
     522             :        buffer these bytes too and return */
     523             : 
     524       15750 :     ulong buf_rem = FD_BLAKE3_PRIVATE_BUF_MAX - buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
     525       15750 :     if( FD_UNLIKELY( sz < buf_rem ) ) { /* optimize for large append */
     526       14554 :       fd_memcpy( buf + buf_used, data, sz );
     527       14554 :       sha->block_sz = buf_used + sz;
     528       14554 :       return sha;
     529       14554 :     }
     530             : 
     531             :     /* Otherwise, buffer enough leading bytes of data to complete the
     532             :        block, update the hash and then continue processing any remaining
     533             :        bytes of data. */
     534             : 
     535        1196 :     fd_memcpy( buf + buf_used, data, buf_rem );
     536        1196 :     data += buf_rem;
     537        1196 :     sz   -= buf_rem;
     538             : 
     539        1196 :     fd_blake3_append_blocks( s, tbl, buf, 1UL );
     540        1196 :     sha->block_sz = 0UL;
     541        1196 :   }
     542             : 
     543             :   /* Append the bulk of the data */
     544             : 
     545        5444 :   ulong buf_cnt = sz >> FD_BLAKE3_PRIVATE_LG_BUF_MAX;
     546        5444 :   if( FD_LIKELY( buf_cnt ) ) fd_blake3_append_blocks( s, tbl, data, buf_cnt ); /* optimized for large append */
     547             : 
     548             :   /* Buffer any leftover bytes */
     549             : 
     550        5444 :   buf_used = sz & (FD_BLAKE3_PRIVATE_BUF_MAX-1UL); /* In [0,FD_BLAKE3_PRIVATE_BUF_MAX) */
     551        5450 :   if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
     552        5450 :     fd_memcpy( buf, data + (buf_cnt << FD_BLAKE3_PRIVATE_LG_BUF_MAX), buf_used );
     553        5450 :     sha->block_sz = buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
     554        5450 :   }
     555             : 
     556        5444 :   FD_BLAKE3_TRACE(( "fd_blake3_append: done" ));
     557        5444 :   return sha;
     558       19998 : }
     559             : 
     560             : static void const *
     561             : fd_blake3_single_hash( fd_blake3_pos_t * s,
     562          51 :                        fd_blake3_buf_t * tbl ) {
     563          51 : #if FD_BLAKE3_PARA_MAX>1
     564          51 :   ulong tick = 0UL;
     565         102 :   while( !fd_blake3_is_finished( s, tick ) ) {
     566          51 :     fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
     567          51 :     ulong          op_cnt = 0UL;
     568         102 :     while( op_cnt<FD_BLAKE3_PARA_MAX ) {
     569         102 :       fd_blake3_op_t * op = &ops[ op_cnt ];
     570         102 :       if( !fd_blake3_prepare( s, tbl, op, tick ) )
     571          51 :         break;
     572          51 :       op_cnt++;
     573          51 :     }
     574             : 
     575          51 :     fd_blake3_batch_hash( ops, op_cnt );
     576          51 :     tick++;
     577          51 :   }
     578             : #else
     579             :   while( !fd_blake3_is_finished( s, s->next_tick ) ) {
     580             :     fd_blake3_op_t op[1] = {0};
     581             :     if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) )
     582             :       break;
     583             :     s->next_tick++;
     584             :     FD_BLAKE3_TRACE(( "fd_blake3_single_hash: compressing %hu bytes at layer %u, counter %lu, flags 0x%x",
     585             :                       op->sz, s->layer, op->counter, op->flags ));
     586             : #   if FD_HAS_SSE
     587             :     fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     588             : #   else
     589             :     fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     590             : #   endif
     591             :   }
     592             : #endif
     593          51 :   return tbl->slots[ s->layer ][0];
     594          51 : }
     595             : 
     596             : void *
     597             : fd_blake3_fini( fd_blake3_t * sha,
     598          51 :                 void *        hash ) {
     599             : 
     600             :   /* Unpack inputs */
     601             : 
     602          51 :   fd_blake3_pos_t * s        = &sha->pos;
     603          51 :   fd_blake3_buf_t * tbl      = &sha->buf;
     604          51 :   uchar *           buf      = sha->block;
     605          51 :   ulong             buf_used = sha->block_sz;
     606          51 :   FD_BLAKE3_TRACE(( "fd_blake3_fini(sha=%p,sz=%lu)", (void *)sha, s->input_sz ));
     607             : 
     608             :   /* TODO HACKY!! */
     609          51 :   s->input    = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
     610          51 :   s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
     611             : 
     612          51 :   void const * hash_ = fd_blake3_single_hash( s, tbl );
     613          51 :   memcpy( hash, hash_, 32UL );
     614          51 :   return hash;
     615          51 : }
     616             : 
     617             : /* fd_blake3_fini_xof_compress performs BLAKE3 compression (input
     618             :    hashing) for all blocks in the hash tree except for the root block.
     619             :    Root compression inputs are returned via the function's out pointers:
     620             :    On return, root_msg[0..64] contains the padded message input for the
     621             :    root block, root_cv_pre[0..64] contains the output chaining value of
     622             :    the previous block (or the BLAKE3 IV if root block is the only block
     623             :    in the hash operation, i.e. <=64 byte hash input).
     624             :    Other values (counter, flags, size) are re-derived by the XOF
     625             :    implementation using the blake3 state object. */
     626             : 
     627             : void
     628             : fd_blake3_fini_xof_compress( fd_blake3_t * sha,
     629             :                              uchar *       root_msg,
     630        4207 :                              uchar *       root_cv_pre ) {
     631        4207 :   fd_blake3_pos_t * s        = &sha->pos;
     632        4207 :   fd_blake3_buf_t * tbl      = &sha->buf;
     633        4207 :   uchar *           buf      = sha->block;
     634        4207 :   ulong             buf_used = sha->block_sz;
     635             : 
     636             :   /* TODO HACKY!! */
     637        4207 :   s->input    = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
     638        4207 :   s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
     639             : 
     640             :   /* The root block is contained in a leaf.  Process all but the last
     641             :      blocks of the chunk.  (The last block is the "root" block) */
     642        4207 :   if( s->input_sz<=FD_BLAKE3_CHUNK_SZ ) {
     643        1807 :     fd_blake3_op_t op[1];
     644        1807 :     if( !fd_blake3_prepare_leaf( s, tbl, op, s->next_tick ) )
     645           0 :       FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare compression of <=1024 byte message (duplicate call to fini?)" ));
     646        1807 : #if FD_HAS_SSE
     647        1807 :     fd_blake3_sse_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
     648             : #else
     649             :     fd_blake3_ref_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
     650             : #endif
     651        1807 :     return;
     652        1807 :   }
     653             : 
     654             :   /* The root block is a branch node.  Continue working until there are
     655             :      only two blocks remaining. */
     656        2400 :   ulong tick = sha->pos.next_tick+1;
     657       13183 :   for(;;) {
     658       13183 :     int l0_complete = fd_blake3_l0_complete( s );
     659       13183 :     int ln_complete = s->live_cnt == 2UL;
     660       13183 :     if( l0_complete & ln_complete ) break;
     661             : 
     662       10784 : #if FD_BLAKE3_PARA_MAX>1
     663       10784 :     fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
     664       10784 :     ulong          op_cnt = 0UL;
     665       73006 :     while( op_cnt<FD_BLAKE3_PARA_MAX ) {
     666       71213 :       fd_blake3_op_t * op = &ops[ op_cnt ];
     667       71213 :       if( !fd_blake3_prepare( s, tbl, op, tick ) )
     668        8991 :         break;
     669       62222 :       op_cnt++;
     670       62222 :     }
     671       10784 :     if( FD_UNLIKELY( !op_cnt ) ) {
     672           0 :       FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare branch compression with live_cnt=%lu (duplicate call to fini?)", s->live_cnt ));
     673           0 :     }
     674             : 
     675       10784 :     fd_blake3_batch_hash( ops, op_cnt );
     676             : #else
     677             :     fd_blake3_op_t op[1] = {0};
     678             :     if( !fd_blake3_prepare( s, tbl, op, tick ) )
     679             :       break;
     680             : #   if FD_HAS_SSE
     681             :     fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     682             : #   else
     683             :     fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     684             : #   endif
     685             : #endif
     686       10784 :     tick++;
     687       10784 :   }
     688        2400 : }
     689             : 
     690             : void *
     691             : fd_blake3_fini_2048( fd_blake3_t * sha,
     692        4207 :                      void *        hash ) {
     693        4207 :   FD_BLAKE3_TRACE(( "fd_blake3_fini_2048(sha=%p,hash=%p)", (void *)sha, hash ));
     694             : 
     695             :   /* Compress input until the last remaining piece of work is the BLAKE3
     696             :      root block.  This root block is put through the compression
     697             :      function repeatedly to "expand" the hash output (XOF hashing).
     698             :      Solana uses this to generate a 2048 byte 'LtHash' value.
     699             :      fd_blake3 does this SIMD-parallel for better performance. */
     700        4207 :   uchar root_msg   [ 64 ] __attribute__((aligned(64)));
     701        4207 :   uchar root_cv_pre[ 32 ] __attribute__((aligned(32)));
     702        4207 :   fd_blake3_fini_xof_compress( sha, root_msg, root_cv_pre );
     703             : 
     704             :   /* Restore root block details */
     705        4207 :   uint          last_block_sz    = 64u;
     706        4207 :   uint          last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_PARENT;
     707        4207 :   ulong         ctr0             = 0UL;
     708        4207 :   if( sha->pos.input_sz<=FD_BLAKE3_CHUNK_SZ ) {
     709        1810 :     last_block_sz    = (uint)sha->pos.input_sz & 63u;
     710        1810 :     if( fd_ulong_is_aligned( sha->pos.input_sz, 64 ) ) last_block_sz = 64;
     711        1810 :     if( FD_UNLIKELY( sha->pos.input_sz==0UL        ) ) last_block_sz = 0u;
     712        1810 :     last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_CHUNK_END;
     713        1810 :     if( sha->pos.input_sz<=FD_BLAKE3_BLOCK_SZ ) last_block_flags |= FD_BLAKE3_FLAG_CHUNK_START;
     714        1810 :     ctr0             = sha->pos.leaf_idx-1UL;
     715        2397 :   } else {
     716        2397 :     fd_blake3_op_t op[1];
     717        2397 :     if( FD_UNLIKELY( !fd_blake3_prepare( &sha->pos, &sha->buf, op, sha->pos.next_tick+1UL ) ) ) {
     718           0 :       FD_LOG_ERR(( "fd_blake3_fini_2048 invariant violation: failed to prepare branch root compression (duplicate call to fini?)" ));
     719           0 :     }
     720        2397 :     memcpy( root_msg,    op->msg,      64UL );
     721        2397 :     memcpy( root_cv_pre, FD_BLAKE3_IV, 32UL );
     722        2397 :   }
     723        4207 :   FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: sz=%lu ctr0=%lu flags=%x",
     724        4207 :                     sha->pos.input_sz, ctr0, last_block_flags ));
     725             : 
     726             :   /* Expand LtHash
     727             :      For now, this uses the generic AVX2/AVX512 compress backend.
     728             :      Could write a more optimized version in the future saving some of
     729             :      the matrix transpose work. */
     730       12620 :   for( ulong i=0UL; i<32UL; i+=FD_BLAKE3_PARA_MAX ) {
     731        8413 : #if FD_HAS_AVX512
     732        8413 :     ulong  batch_data [ 16 ] __attribute__((aligned(64)));
     733      143021 :     /*                     */ for( ulong j=0; j<16; j++ ) batch_data [ j ] = (ulong)root_msg;
     734      143021 :     uint   batch_sz   [ 16 ]; for( ulong j=0; j<16; j++ ) batch_sz   [ j ] = last_block_sz;
     735      143021 :     ulong  batch_ctr  [ 16 ]; for( ulong j=0; j<16; j++ ) batch_ctr  [ j ] = ctr0+i+j;
     736      143021 :     uint   batch_flags[ 16 ]; for( ulong j=0; j<16; j++ ) batch_flags[ j ] = last_block_flags;
     737      143021 :     void * batch_hash [ 16 ]; for( ulong j=0; j<16; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
     738      143021 :     void * batch_cv   [ 16 ]; for( ulong j=0; j<16; j++ ) batch_cv   [ j ] = root_cv_pre;
     739        8413 :     fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
     740             : #elif FD_HAS_AVX
     741             :     ulong  batch_data [ 8 ]; for( ulong j=0; j<8; j++ ) batch_data [ j ] = (ulong)root_msg;
     742             :     uint   batch_sz   [ 8 ]; for( ulong j=0; j<8; j++ ) batch_sz   [ j ] = last_block_sz;
     743             :     ulong  batch_ctr  [ 8 ]; for( ulong j=0; j<8; j++ ) batch_ctr  [ j ] = ctr0+i+j;
     744             :     uint   batch_flags[ 8 ]; for( ulong j=0; j<8; j++ ) batch_flags[ j ] = last_block_flags;
     745             :     void * batch_hash [ 8 ]; for( ulong j=0; j<8; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
     746             :     void * batch_cv   [ 8 ]; for( ulong j=0; j<8; j++ ) batch_cv   [ j ] = root_cv_pre;
     747             :     fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
     748             : #elif FD_HAS_SSE
     749             :     fd_blake3_sse_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
     750             : #else
     751             :     fd_blake3_ref_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
     752             : #endif
     753        8413 :   }
     754             : 
     755        4207 :   FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: done" ));
     756        4207 :   return hash;
     757        4207 : }
     758             : 
     759             : void *
     760             : fd_blake3_hash( void const * data,
     761             :                 ulong        sz,
     762           0 :                 void *       hash ) {
     763             : 
     764           0 :   fd_blake3_buf_t tbl[1];
     765           0 :   fd_blake3_pos_t s[1];
     766           0 :   fd_blake3_pos_init( s, data, sz );
     767             : 
     768           0 : #if FD_BLAKE3_PARA_MAX>1
     769           0 :   for(;;) {
     770           0 :     fd_blake3_op_t op[1];
     771           0 :     if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, 4 ) )
     772           0 :       break;
     773           0 : #if FD_HAS_AVX512
     774           0 :     fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
     775             : #elif FD_HAS_AVX
     776             :     fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
     777             : #else
     778             :     #error "missing para support"
     779             : #endif
     780           0 :   }
     781           0 : #endif
     782             : 
     783           0 :   void const * hash_ = fd_blake3_single_hash( s, tbl );
     784           0 :   memcpy( hash, hash_, 32UL );
     785           0 :   return hash;
     786           0 : }
     787             : 
     788             : #if FD_HAS_AVX
     789             : 
     790             : void
     791             : fd_blake3_lthash_batch8(
     792             :     void const * batch_data[8],  /* align=32 ele_align=1 */
     793             :     uint const   batch_sz  [8],  /* align=32 */
     794             :     void *       out_lthash      /* align=32 */
     795           0 : ) {
     796           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 32 ) ) ) {
     797           0 :     FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
     798           0 :   }
     799           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 32 ) ) ) {
     800           0 :     FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
     801           0 :   }
     802           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 32 ) ) ) {
     803           0 :     FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
     804           0 :   }
     805             : 
     806           0 :   ulong batch_ctr  [ 8 ] = {0};
     807           0 :   uint  batch_flags[ 8 ]; for( uint i=0; i<8; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
     808           0 :   fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
     809           0 : }
     810             : 
     811             : #endif
     812             : 
     813             : #if FD_HAS_AVX512
     814             : 
     815             : void
     816             : fd_blake3_lthash_batch16(
     817             :     void const * batch_data[16],  /* align=32 ele_align=1 */
     818             :     uint const   batch_sz  [16],  /* align=32 */
     819             :     void *       out_lthash      /* align=32 */
     820           0 : ) {
     821           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 64 ) ) ) {
     822           0 :     FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
     823           0 :   }
     824           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 64 ) ) ) {
     825           0 :     FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
     826           0 :   }
     827           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 64 ) ) ) {
     828           0 :     FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
     829           0 :   }
     830             : 
     831           0 :   ulong batch_ctr  [ 16 ] = {0};
     832           0 :   uint  batch_flags[ 16 ]; for( uint i=0; i<16; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
     833           0 :   fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
     834           0 : }
     835             : 
     836             : #endif

Generated by: LCOV version 1.14