LCOV - code coverage report
Current view: top level - ballet/blake3 - fd_blake3_avx2.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 0 485 0.0 %
Date: 2026-03-19 18:19:27 Functions: 0 6 0.0 %

          Line data    Source code
       1             : 
       2             : // Source originally from https://github.com/BLAKE3-team/BLAKE3
       3             : // From commit: 64747d48ffe9d1fbf4b71e94cabeb8a211461081
       4             : 
       5             : #include "fd_blake3.h"
       6             : #include "fd_blake3_private.h"
       7             : #include "../../util/simd/fd_avx.h"
       8             : #include <assert.h>
       9             : 
      10           0 : #define wu_rot16 wb_exch_adj_pair
      11             : 
      12             : static inline __attribute__((always_inline)) wu_t
      13           0 : wu_rot12( wu_t x ) {
      14           0 :   return wu_ror( x, 12 );
      15           0 : }
      16             : 
      17             : static inline __attribute__((always_inline)) wu_t
      18           0 : wu_rot8( wu_t x ) {
      19           0 :   wb_t const mask =
      20           0 :     wb( 1,2,3,0,  5,6,7,4,  9,10,11,8,  13,14,15,12,
      21           0 :         1,2,3,0,  5,6,7,4,  9,10,11,8,  13,14,15,12 );
      22           0 :   return _mm256_shuffle_epi8( x, mask );
      23           0 : }
      24             : 
      25             : static inline __attribute__((always_inline)) wu_t
      26           0 : wu_rot7( wu_t x ) {
      27           0 :   return wu_ror( x, 7 );
      28           0 : }
      29             : 
/* round_fn8: apply one BLAKE3 round (round index r in [0,7)) to eight
   independent compression states held in transposed, lane-sliced form:
   v[i] holds state word i for all 8 lanes, and m[i] holds message word
   i for all 8 lanes.  The message word fed into each mix is selected
   through FD_BLAKE3_MSG_SCHEDULE[r][...].  The first half of the
   function mixes the four "columns" (v0,v4,v8,v12), (v1,v5,v9,v13),
   (v2,v6,v10,v14), (v3,v7,v11,v15); the second half mixes the four
   "diagonals" (v0,v5,v10,v15), (v1,v6,v11,v12), (v2,v7,v8,v13),
   (v3,v4,v9,v14).  Each quarter-round is the ChaCha-style sequence
   a+=msg; a+=b; d^=a; d=ror16(d); c+=d; b^=c; b=ror12(b); then the
   same with the second message word and rotations 8 and 7.  The four
   quarter-rounds of each half are interleaved statement-by-statement
   for instruction-level parallelism — do not reorder. */
static inline __attribute__((always_inline)) void
round_fn8( wu_t  v[16],
           wu_t  m[16],
           ulong r ) {
  /* Column step, first message word per column (schedule slots 0,2,4,6) */
  v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][0]]);
  v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][2]]);
  v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][4]]);
  v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][6]]);
  v[ 0] = wu_add(v[0], v[4]);
  v[ 1] = wu_add(v[1], v[5]);
  v[ 2] = wu_add(v[2], v[6]);
  v[ 3] = wu_add(v[3], v[7]);
  v[12] = wu_xor(v[12], v[0]);
  v[13] = wu_xor(v[13], v[1]);
  v[14] = wu_xor(v[14], v[2]);
  v[15] = wu_xor(v[15], v[3]);
  v[12] = wu_rot16(v[12]);
  v[13] = wu_rot16(v[13]);
  v[14] = wu_rot16(v[14]);
  v[15] = wu_rot16(v[15]);
  v[ 8] = wu_add(v[8], v[12]);
  v[ 9] = wu_add(v[9], v[13]);
  v[10] = wu_add(v[10], v[14]);
  v[11] = wu_add(v[11], v[15]);
  v[ 4] = wu_xor(v[4], v[8]);
  v[ 5] = wu_xor(v[5], v[9]);
  v[ 6] = wu_xor(v[6], v[10]);
  v[ 7] = wu_xor(v[7], v[11]);
  v[ 4] = wu_rot12(v[4]);
  v[ 5] = wu_rot12(v[5]);
  v[ 6] = wu_rot12(v[6]);
  v[ 7] = wu_rot12(v[7]);
  /* Column step, second message word per column (schedule slots 1,3,5,7) */
  v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][1]]);
  v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][3]]);
  v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][5]]);
  v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][7]]);
  v[ 0] = wu_add(v[0], v[4]);
  v[ 1] = wu_add(v[1], v[5]);
  v[ 2] = wu_add(v[2], v[6]);
  v[ 3] = wu_add(v[3], v[7]);
  v[12] = wu_xor(v[12], v[0]);
  v[13] = wu_xor(v[13], v[1]);
  v[14] = wu_xor(v[14], v[2]);
  v[15] = wu_xor(v[15], v[3]);
  v[12] = wu_rot8(v[12]);
  v[13] = wu_rot8(v[13]);
  v[14] = wu_rot8(v[14]);
  v[15] = wu_rot8(v[15]);
  v[ 8] = wu_add(v[8], v[12]);
  v[ 9] = wu_add(v[9], v[13]);
  v[10] = wu_add(v[10], v[14]);
  v[11] = wu_add(v[11], v[15]);
  v[ 4] = wu_xor(v[4], v[8]);
  v[ 5] = wu_xor(v[5], v[9]);
  v[ 6] = wu_xor(v[6], v[10]);
  v[ 7] = wu_xor(v[7], v[11]);
  v[ 4] = wu_rot7(v[4]);
  v[ 5] = wu_rot7(v[5]);
  v[ 6] = wu_rot7(v[6]);
  v[ 7] = wu_rot7(v[7]);

  /* Diagonal step, first message word per diagonal (slots 8,10,12,14) */
  v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][8]]);
  v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][10]]);
  v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][12]]);
  v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][14]]);
  v[ 0] = wu_add(v[0], v[5]);
  v[ 1] = wu_add(v[1], v[6]);
  v[ 2] = wu_add(v[2], v[7]);
  v[ 3] = wu_add(v[3], v[4]);
  v[15] = wu_xor(v[15], v[0]);
  v[12] = wu_xor(v[12], v[1]);
  v[13] = wu_xor(v[13], v[2]);
  v[14] = wu_xor(v[14], v[3]);
  v[15] = wu_rot16(v[15]);
  v[12] = wu_rot16(v[12]);
  v[13] = wu_rot16(v[13]);
  v[14] = wu_rot16(v[14]);
  v[10] = wu_add(v[10], v[15]);
  v[11] = wu_add(v[11], v[12]);
  v[ 8] = wu_add(v[8], v[13]);
  v[ 9] = wu_add(v[9], v[14]);
  v[ 5] = wu_xor(v[5], v[10]);
  v[ 6] = wu_xor(v[6], v[11]);
  v[ 7] = wu_xor(v[7], v[8]);
  v[ 4] = wu_xor(v[4], v[9]);
  v[ 5] = wu_rot12(v[5]);
  v[ 6] = wu_rot12(v[6]);
  v[ 7] = wu_rot12(v[7]);
  v[ 4] = wu_rot12(v[4]);
  /* Diagonal step, second message word per diagonal (slots 9,11,13,15) */
  v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][9]]);
  v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][11]]);
  v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][13]]);
  v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][15]]);
  v[ 0] = wu_add(v[0], v[5]);
  v[ 1] = wu_add(v[1], v[6]);
  v[ 2] = wu_add(v[2], v[7]);
  v[ 3] = wu_add(v[3], v[4]);
  v[15] = wu_xor(v[15], v[0]);
  v[12] = wu_xor(v[12], v[1]);
  v[13] = wu_xor(v[13], v[2]);
  v[14] = wu_xor(v[14], v[3]);
  v[15] = wu_rot8(v[15]);
  v[12] = wu_rot8(v[12]);
  v[13] = wu_rot8(v[13]);
  v[14] = wu_rot8(v[14]);
  v[10] = wu_add(v[10], v[15]);
  v[11] = wu_add(v[11], v[12]);
  v[ 8] = wu_add(v[8], v[13]);
  v[ 9] = wu_add(v[9], v[14]);
  v[ 5] = wu_xor(v[5], v[10]);
  v[ 6] = wu_xor(v[6], v[11]);
  v[ 7] = wu_xor(v[7], v[8]);
  v[ 4] = wu_xor(v[4], v[9]);
  v[ 5] = wu_rot7(v[5]);
  v[ 6] = wu_rot7(v[6]);
  v[ 7] = wu_rot7(v[7]);
  v[ 4] = wu_rot7(v[4]);
}
     148             : 
     149             : void
     150             : fd_blake3_avx_compress8( ulong                   batch_cnt,
     151             :                          void   const * restrict _batch_data,
     152             :                          uint   const * restrict batch_sz,
     153             :                          ulong  const * restrict ctr_vec,
     154             :                          uint   const * restrict batch_flags,
     155             :                          void * const * restrict _batch_hash,
     156             :                          ushort *       restrict lthash,
     157             :                          uint                    out_sz,
     158           0 :                          void const *   restrict batch_cv ) {
     159           0 :   if( FD_UNLIKELY( lthash && batch_cnt!=8 ) ) FD_LOG_ERR(( "Lane masking not supported for fd_blake3_avx_compress8 in LtHash mode" ));
     160           0 :   if( FD_UNLIKELY( batch_cnt==0 || batch_cnt>8 ) ) FD_LOG_ERR(( "Invalid batch_cnt %lu", batch_cnt ));
     161             : 
     162           0 :   ulong const * batch_data = (ulong const *)_batch_data;
     163             : 
     164           0 :   if( FD_UNLIKELY( batch_cnt==1 ) ) {
     165           0 :     fd_blake3_sse_compress1( (uchar *)(_batch_hash[0]),
     166           0 :                              (uchar const *)(batch_data[0]),
     167           0 :                              batch_sz[0],
     168           0 :                              ctr_vec[0],
     169           0 :                              batch_flags[0],
     170           0 :                              NULL,
     171           0 :                              NULL );
     172           0 :     return;
     173           0 :   }
     174             : 
     175             : #if FD_BLAKE3_TRACING
     176             :   /* This log_line buffer is oversized by a fair bit (due to all the
     177             :      NULL terminators) but that's fine */
     178             :   char log_line[
     179             :       sizeof( "fd_blake3_avx_compress8" )+
     180             :       sizeof( "(batch_cnt=" )+21+
     181             :       sizeof( ",sz=["       )+(8*11)+sizeof( "]" )+
     182             :       sizeof( ",counter=["  )+(8*21)+sizeof( "]" )+
     183             :       sizeof( ",flags=["    )+(8* 2)+sizeof( "]" )+
     184             :       sizeof( ",custom_cv"  )+
     185             :       sizeof( ",lthash" )+
     186             :       sizeof( ")" ) ];
     187             : 
     188             :   char * p = fd_cstr_init( log_line );
     189             :   p = fd_cstr_append_text( p, "fd_blake3_avx_compress8(batch_cnt=", 34UL );
     190             :   p = fd_cstr_append_ulong_as_text( p, 0, 0, batch_cnt, fd_uchar_base10_dig_cnt( (uchar)batch_cnt ) );
     191             :   p = fd_cstr_append_text( p, ",sz=[", 5UL );
     192             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     193             :     p = fd_cstr_append_uint_as_text( p, ' ', 0, batch_sz[ i ], fd_uint_base10_dig_cnt( batch_sz[ i ] ) );
     194             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     195             :   }
     196             :   p = fd_cstr_append_text( p, "],counter=[", 11UL );
     197             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     198             :     p = fd_cstr_append_ulong_as_text( p, ' ', 0, ctr_vec[ i ], fd_ulong_base10_dig_cnt( ctr_vec[ i ] ) );
     199             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     200             :   }
     201             :   p = fd_cstr_append_text( p, "],flags=[", 9UL );
     202             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     203             :     static char const hex_lut[ 16 ] = {
     204             :       '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
     205             :     };
     206             :     p = fd_cstr_append_char( p, hex_lut[ batch_flags[ i ]&0xf ] );
     207             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     208             :   }
     209             :   p = fd_cstr_append_char( p, ']' );
     210             :   if( batch_cv ) p = fd_cstr_append_text( p, ",custom_cv", 10UL );
     211             :   if( lthash   ) p = fd_cstr_append_text( p, ",lthash", 7UL );
     212             :   p = fd_cstr_append_char( p, ')' );
     213             :   ulong line_len = (ulong)( p-log_line );
     214             :   fd_cstr_fini( p );
     215             : 
     216             :   FD_BLAKE3_TRACE(( "%.*s", (int)line_len, log_line ));
     217             : #endif
     218             : 
     219             :   /* We can only process input blocks of 64 bytes, but message data size
     220             :      is not necessarily a multiple of 64.  We compute the tail block of
     221             :      each message here.  We then process complete blocks of the original
     222             :      message in place, switching to processing to these  tail blocks in
     223             :      the same pass toward the end. */
     224             : 
     225           0 :   ulong batch_tail_data[ 8 ] __attribute__((aligned(32)));
     226           0 :   ulong batch_tail_rem [ 8 ] __attribute__((aligned(32)));
     227             : 
     228           0 :   uchar scratch[ 8*FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(128)));
     229           0 :   do {
     230           0 :     ulong scratch_free = (ulong)scratch;
     231             : 
     232           0 :     wv_t zero = wv_zero();
     233             : 
     234           0 :     for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {
     235             : 
     236             :       /* Allocate the tail blocks for this message */
     237             : 
     238           0 :       ulong data = batch_data[ batch_idx ];
     239           0 :       ulong sz   = batch_sz  [ batch_idx ];
     240             : 
     241           0 :       ulong tail_data     = scratch_free;
     242           0 :       ulong tail_data_sz  = sz & (FD_BLAKE3_BLOCK_SZ-1UL);
     243           0 :       ulong tail_data_off = fd_ulong_align_dn( sz, FD_BLAKE3_BLOCK_SZ );
     244             : 
     245           0 :       batch_tail_data[ batch_idx ] = tail_data;
     246           0 :       batch_tail_rem [ batch_idx ] = (ulong)( (!!tail_data_sz) ^ (!sz) );  /* (hash 1 tail block if 0 sz) */
     247             : 
     248           0 :       scratch_free += FD_BLAKE3_BLOCK_SZ;
     249             : 
     250             :       /* Populate the tail blocks.  We first clear the blocks.  Then we
     251             :          copy any straggler data bytes into the tail. */
     252             : 
     253           0 :       wv_st( (ulong *) tail_data,     zero );
     254           0 :       wv_st( (ulong *)(tail_data+32), zero );
     255             : 
     256           0 : #     if 1
     257             :       /* See fd_sha256_private_batch_avx */
     258           0 :       ulong src = (ulong)data + tail_data_off;
     259           0 :       ulong dst = tail_data;
     260           0 :       ulong rem = tail_data_sz;
     261           0 :       while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
     262           0 :       while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src );             dst +=  8UL; src +=  8UL; rem -=  8UL; }
     263           0 :       if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src );             dst +=  4UL; src +=  4UL; rem -=  4UL; }
     264           0 :       if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src );             dst +=  2UL; src +=  2UL; rem -=  2UL; }
     265           0 :       if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src );             dst++;                                 }
     266             : #     else
     267             :       fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
     268             : #     endif
     269           0 :     }
     270           0 :   } while(0);
     271             : 
     272             : 
     273           0 :   wu_t const iv0 = wu_bcast( FD_BLAKE3_IV[0] );
     274           0 :   wu_t const iv1 = wu_bcast( FD_BLAKE3_IV[1] );
     275           0 :   wu_t const iv2 = wu_bcast( FD_BLAKE3_IV[2] );
     276           0 :   wu_t const iv3 = wu_bcast( FD_BLAKE3_IV[3] );
     277           0 :   wu_t const iv4 = wu_bcast( FD_BLAKE3_IV[4] );
     278           0 :   wu_t const iv5 = wu_bcast( FD_BLAKE3_IV[5] );
     279           0 :   wu_t const iv6 = wu_bcast( FD_BLAKE3_IV[6] );
     280           0 :   wu_t const iv7 = wu_bcast( FD_BLAKE3_IV[7] );
     281             : 
     282           0 :   wu_t h0=iv0; wu_t h1=iv1; wu_t h2=iv2; wu_t h3=iv3;
     283           0 :   wu_t h4=iv4; wu_t h5=iv5; wu_t h6=iv6; wu_t h7=iv7;
     284           0 :   if( FD_UNLIKELY( batch_cv ) ) {
     285             :     /* If the input chaining value is overridden, transpose the input to
     286             :        AVX representation (8x8 transpose). */
     287           0 :     __m256i const ** cv_vec = (__m256i const **)batch_cv;
     288           0 :     wu_t cv[8];
     289           0 :     for( ulong i=0UL; i<8UL; i++ ) cv[i] = _mm256_loadu_si256( cv_vec[ i ] );
     290           0 :     wu_transpose_8x8( cv[0], cv[1], cv[2], cv[3], cv[4], cv[5], cv[6], cv[7],
     291           0 :                       h0,    h1,    h2,    h3,    h4,    h5,    h6,    h7 );
     292           0 :   }
     293             : 
     294           0 :   wu_t ctr_lo = wu( ctr_vec[0],     ctr_vec[1],     ctr_vec[2],     ctr_vec[3],
     295           0 :                     ctr_vec[4],     ctr_vec[5],     ctr_vec[6],     ctr_vec[7] );
     296           0 :   wu_t ctr_hi = wu( ctr_vec[0]>>32, ctr_vec[1]>>32, ctr_vec[2]>>32, ctr_vec[3]>>32,
     297           0 :                     ctr_vec[4]>>32, ctr_vec[5]>>32, ctr_vec[6]>>32, ctr_vec[7]>>32 );
     298           0 :   wu_t flags = wu_ldu( batch_flags );
     299           0 :   wu_t off   = wu_zero();
     300           0 :   wu_t sz    = wu_ldu( batch_sz );
     301             : 
     302           0 :   wv_t wv_64        = wv_bcast( FD_BLAKE3_BLOCK_SZ );
     303           0 :   wv_t W_sentinel   = wv_bcast( (ulong)scratch );
     304           0 :   wc_t batch_lane   = wc_unpack( (1<<batch_cnt)-1 );
     305             : 
     306           0 :   wv_t tail_lo      = wv_ld( batch_tail_data   );
     307           0 :   wv_t tail_hi      = wv_ld( batch_tail_data+4 );
     308             : 
     309           0 :   wv_t tail_rem_lo  = wv_ld( batch_tail_rem    );
     310           0 :   wv_t tail_rem_hi  = wv_ld( batch_tail_rem+4  );
     311             : 
     312           0 :   wv_t W_lo         = wv_ld( batch_data        );
     313           0 :   wv_t W_hi         = wv_ld( batch_data+4      );
     314             : 
     315           0 :   wv_t batch_sz_lo  = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( sz, 0 ) );
     316           0 :   wv_t batch_sz_hi  = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( sz, 1 ) );
     317             : 
     318           0 :   wv_t block_rem_lo = wv_notczero( wc_expand( batch_lane, 0 ),
     319           0 :                         wv_add( wv_shr( batch_sz_lo, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_lo ) );
     320           0 :   wv_t block_rem_hi = wv_notczero( wc_expand( batch_lane, 1 ),
     321           0 :                         wv_add( wv_shr( batch_sz_hi, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_hi ) );
     322             : 
     323             :   /* Upper half of the compression function output.
     324             :      Usually thrown away, but kept in the final compression round if
     325             :      out_sz==64. */
     326           0 :   wu_t hu[8] = {0};
     327             : 
     328           0 :   ulong lthash_rem    = lthash ? 32 : 0; /* Number of LtHash (XOF) blocks remaining */
     329           0 :   int   compress_done = 0;
     330           0 :   for(;;) {
     331             :     /* Switch lanes that have hit the end of their in-place bulk
     332             :        processing to their out-of-place scratch tail regions as
     333             :        necessary. */
     334             : 
     335           0 :     W_lo = wv_if( wv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
     336           0 :     W_hi = wv_if( wv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );
     337             : 
     338             :     /* Derive per-block flags and block sizes */
     339             : 
     340           0 :     wc_t block_first = wu_eq( off, wu_zero() );
     341           0 :     wc_t block_last  = wi_lt( sz,  wu_add( off, wu_bcast( FD_BLAKE3_BLOCK_SZ+1 ) ) );
     342             : 
     343             :     /* Suppress root flag unless last block */
     344             : 
     345           0 :     wu_t root_mask = wu_or( block_last, wu_bcast( ~FD_BLAKE3_FLAG_ROOT ) );
     346           0 :     wu_t block_flags = wu_and( flags, root_mask );
     347             : 
     348             :     /* LtHash mode ends compression one early */
     349             : 
     350           0 :     wc_t active_lane_lo;
     351           0 :     wc_t active_lane_hi;
     352           0 :     if( FD_UNLIKELY( lthash ) ) {
     353             :       /* Compress until root block */
     354           0 :       wu_t all_root = wu_bcast( FD_BLAKE3_FLAG_ROOT );
     355           0 :       wu_t not_root = wu_ne( wu_and( block_flags, all_root ), all_root );
     356           0 :       active_lane_lo = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( not_root, 0 ) );
     357           0 :       active_lane_hi = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( not_root, 1 ) );
     358           0 :     } else {
     359             :       /* Complete when there is no more input data */
     360           0 :       active_lane_lo = wv_to_wc( block_rem_lo );
     361           0 :       active_lane_hi = wv_to_wc( block_rem_hi );
     362           0 :     }
     363             : 
     364             :     /* Suppress CHUNK_{START,END} flags unless leaf node */
     365             : 
     366           0 :     wc_t is_parent = wu_shl( flags, 5 );  /* shift FLAG_PARENT into AVX condition bit */
     367           0 :     wu_t chunk_flags = wu_if( block_last,  wu_bcast( FD_BLAKE3_FLAG_CHUNK_END   ), wu_zero() );
     368           0 :     if( out_sz==32 ) {
     369             :       /* Hacky: out_sz==64 is only used for post-compress XOF hashing,
     370             :          so use that as a hint when to suppress the 'CHUNK_START' flag. */
     371           0 :       chunk_flags = wu_or( chunk_flags, wu_if( block_first, wu_bcast( FD_BLAKE3_FLAG_CHUNK_START ), wu_zero() ) );
     372           0 :     }
     373           0 :     wu_t block_sz = wu_min( wu_sub( sz, off ), wu_bcast( FD_BLAKE3_BLOCK_SZ ) );
     374           0 :     block_flags = wu_or( block_flags, wu_if( is_parent, wu_zero(), chunk_flags ) );
     375             : 
     376             :     /* Check if we are done compressing */
     377             : 
     378           0 :     compress_done |= !wc_any( wc_or( active_lane_lo, active_lane_hi ) );
     379           0 :     if( FD_UNLIKELY( compress_done ) ) {
     380           0 :       if( FD_UNLIKELY( !lthash_rem ) ) break;
     381           0 :       active_lane_lo = wc_bcast( INT_MAX );
     382           0 :       active_lane_hi = wc_bcast( INT_MAX );
     383             :       /* Load the next message block and fall through to XOF expansion */
     384           0 :     }
     385             : 
     386             :     /* At this point, we have at least 1 block in this message segment
     387             :        pass that has not been processed.  Load the next 64 bytes of
     388             :        each unprocessed block.  Inactive lanes (e.g. message segments
     389             :        in this pass for which we've already processed all the blocks)
     390             :        will load garbage from a sentinel location (and the result of
     391             :        the state computations for the inactive lane will be ignored). */
     392             : 
     393           0 :     wv_t W03 = wv_if( active_lane_lo, W_lo, W_sentinel );
     394           0 :     uchar const * W0 = (uchar const *)wv_extract( W03, 0 );
     395           0 :     uchar const * W1 = (uchar const *)wv_extract( W03, 1 );
     396           0 :     uchar const * W2 = (uchar const *)wv_extract( W03, 2 );
     397           0 :     uchar const * W3 = (uchar const *)wv_extract( W03, 3 );
     398             : 
     399           0 :     wv_t W47 = wv_if( active_lane_hi, W_hi, W_sentinel );
     400           0 :     uchar const * W4 = (uchar const *)wv_extract( W47, 0 );
     401           0 :     uchar const * W5 = (uchar const *)wv_extract( W47, 1 );
     402           0 :     uchar const * W6 = (uchar const *)wv_extract( W47, 2 );
     403           0 :     uchar const * W7 = (uchar const *)wv_extract( W47, 3 );
     404             : 
     405           0 :     wu_t m[16] = { wu_ldu( W0    ), wu_ldu( W1    ), wu_ldu( W2    ), wu_ldu( W3    ),
     406           0 :                    wu_ldu( W4    ), wu_ldu( W5    ), wu_ldu( W6    ), wu_ldu( W7    ),
     407           0 :                    wu_ldu( W0+32 ), wu_ldu( W1+32 ), wu_ldu( W2+32 ), wu_ldu( W3+32 ),
     408           0 :                    wu_ldu( W4+32 ), wu_ldu( W5+32 ), wu_ldu( W6+32 ), wu_ldu( W7+32 ) };
     409             : 
     410           0 :     wu_transpose_8x8( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
     411           0 :                       m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7] );
     412           0 :     wu_transpose_8x8( m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
     413           0 :                       m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );
     414             : 
     415             :     /* Compute the BLAKE3 compression function updates */
     416             : 
     417           0 : compress: (void)0;
     418           0 :     wu_t v[16] = {
     419           0 :         h0,     h1,     h2,       h3,
     420           0 :         h4,     h5,     h6,       h7,
     421           0 :         iv0,    iv1,    iv2,      iv3,
     422           0 :         ctr_lo, ctr_hi, block_sz, block_flags,
     423           0 :     };
     424             : 
     425             :     /* Debug utility */
     426           0 : #define STATE_FMT         "state[%u] =\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x"
     427           0 : #define STATE_FMT_ARGS(v,i) (uint)i,\
     428           0 :         fd_uint_bswap(wu_extract(v[0x0],i)),fd_uint_bswap(wu_extract(v[0x1],i)),fd_uint_bswap(wu_extract(v[0x2],i)),fd_uint_bswap(wu_extract(v[0x3],i)),\
     429           0 :         fd_uint_bswap(wu_extract(v[0x4],i)),fd_uint_bswap(wu_extract(v[0x5],i)),fd_uint_bswap(wu_extract(v[0x6],i)),fd_uint_bswap(wu_extract(v[0x7],i)),\
     430           0 :         fd_uint_bswap(wu_extract(v[0x8],i)),fd_uint_bswap(wu_extract(v[0x9],i)),fd_uint_bswap(wu_extract(v[0xa],i)),fd_uint_bswap(wu_extract(v[0xb],i)),\
     431           0 :         fd_uint_bswap(wu_extract(v[0xc],i)),fd_uint_bswap(wu_extract(v[0xd],i)),fd_uint_bswap(wu_extract(v[0xe],i)),fd_uint_bswap(wu_extract(v[0xf],i))
     432             : 
     433             :     // FD_LOG_NOTICE(( STATE_FMT, STATE_FMT_ARGS(v,0) ));
     434           0 :     round_fn8( v, m, 0 );
     435           0 :     round_fn8( v, m, 1 );
     436           0 :     round_fn8( v, m, 2 );
     437           0 :     round_fn8( v, m, 3 );
     438           0 :     round_fn8( v, m, 4 );
     439           0 :     round_fn8( v, m, 5 );
     440           0 :     round_fn8( v, m, 6 );
     441             :     // FD_LOG_NOTICE(( STATE_FMT, STATE_FMT_ARGS(v,0) ));
     442             : 
     443           0 :     wu_t d[8] = {
     444           0 :       wu_xor( v[ 0], v[ 8] ), wu_xor( v[ 1], v[ 9] ),
     445           0 :       wu_xor( v[ 2], v[10] ), wu_xor( v[ 3], v[11] ),
     446           0 :       wu_xor( v[ 4], v[12] ), wu_xor( v[ 5], v[13] ),
     447           0 :       wu_xor( v[ 6], v[14] ), wu_xor( v[ 7], v[15] )
     448           0 :     };
     449             : 
     450           0 :     if( FD_LIKELY( !compress_done ) ) {
     451             : 
     452             :       /* Apply the state updates to the active lanes */
     453             : 
     454           0 :       wc_t active_lane = wc_narrow( active_lane_lo, active_lane_hi );
     455           0 :       if( FD_UNLIKELY( out_sz==64 ) ) {
     456             :         /* FIXME only export in the last iteration */
     457           0 :         hu[0] = wu_if( active_lane, wu_xor( h0, v[ 8] ), hu[0] );
     458           0 :         hu[1] = wu_if( active_lane, wu_xor( h1, v[ 9] ), hu[1] );
     459           0 :         hu[2] = wu_if( active_lane, wu_xor( h2, v[10] ), hu[2] );
     460           0 :         hu[3] = wu_if( active_lane, wu_xor( h3, v[11] ), hu[3] );
     461           0 :         hu[4] = wu_if( active_lane, wu_xor( h4, v[12] ), hu[4] );
     462           0 :         hu[5] = wu_if( active_lane, wu_xor( h5, v[13] ), hu[5] );
     463           0 :         hu[6] = wu_if( active_lane, wu_xor( h6, v[14] ), hu[6] );
     464           0 :         hu[7] = wu_if( active_lane, wu_xor( h7, v[15] ), hu[7] );
     465           0 :       }
     466           0 :       h0 = wu_if( active_lane, d[0], h0 );
     467           0 :       h1 = wu_if( active_lane, d[1], h1 );
     468           0 :       h2 = wu_if( active_lane, d[2], h2 );
     469           0 :       h3 = wu_if( active_lane, d[3], h3 );
     470           0 :       h4 = wu_if( active_lane, d[4], h4 );
     471           0 :       h5 = wu_if( active_lane, d[5], h5 );
     472           0 :       h6 = wu_if( active_lane, d[6], h6 );
     473           0 :       h7 = wu_if( active_lane, d[7], h7 );
     474             : 
     475             :       /* Advance to the next message segment blocks.  In pseudo code,
     476             :          the below is:
     477             : 
     478             :            W += 64; if( block_rem ) block_rem--;
     479             : 
     480             :          Since wc_to_wv_raw(false/true) is 0UL/~0UL, we can use wv_add /
     481             :          wc_to_wv_raw instead of wv_sub / wc_to_wv to save some ops.
     482             :          (Consider conditional increment / decrement operations?)
     483             : 
     484             :          Also since we do not load anything at W(lane) above unless
     485             :          block_rem(lane) is non-zero, we can omit vector conditional
     486             :          operations for W(lane) below to save some additional ops. */
     487             : 
     488           0 :       W_lo = wv_add( W_lo, wv_if( active_lane_lo, wv_64, wv_zero() ) );
     489           0 :       W_hi = wv_add( W_hi, wv_if( active_lane_hi, wv_64, wv_zero() ) );
     490           0 :       off  = wu_add( off,  wu_if( active_lane, wu_bcast( FD_BLAKE3_BLOCK_SZ ), wv_zero() ) );
     491             : 
     492           0 :       block_rem_lo = wv_add( block_rem_lo, wv_if( active_lane_lo, wc_to_wv_raw( active_lane_lo ), wv_zero() ) );
     493           0 :       block_rem_hi = wv_add( block_rem_hi, wv_if( active_lane_hi, wc_to_wv_raw( active_lane_hi ), wv_zero() ) );
     494             : 
     495           0 :     } else { /* LtHash mode */
     496             : 
     497             :       /* d[i] contains output_off+(i*4) 32-bit words across output[0..8] */
     498           0 :       wu_t dh[ 8 ] = {
     499           0 :         wu_xor( h0, v[0x8] ),
     500           0 :         wu_xor( h1, v[0x9] ),
     501           0 :         wu_xor( h2, v[0xa] ),
     502           0 :         wu_xor( h3, v[0xb] ),
     503           0 :         wu_xor( h4, v[0xc] ),
     504           0 :         wu_xor( h5, v[0xd] ),
     505           0 :         wu_xor( h6, v[0xe] ),
     506           0 :         wu_xor( h7, v[0xf] )
     507           0 :       };
     508             : 
     509             :       /* Transpose outer 8x8 blocks */
     510           0 :       wu_transpose_8x8( d [0],d [1],d [2],d [3],d [4],d [5],d [6],d [7],
     511           0 :                         d [0],d [1],d [2],d [3],d [4],d [5],d [6],d [7] );
     512           0 :       wu_transpose_8x8( dh[0],dh[1],dh[2],dh[3],dh[4],dh[5],dh[6],dh[7],
     513           0 :                         dh[0],dh[1],dh[2],dh[3],dh[4],dh[5],dh[6],dh[7] );
     514             : 
     515             :       /* d[i] contains output[i]+out_off */
     516             : 
     517             :       /* Reduce-add into d[0] */
     518           0 :       d [0] = wh_add( d [0], d [1] ); /* sum(l[0 1]) */
     519           0 :       dh[0] = wh_add( dh[0], dh[1] ); /* sum(h[0 1]) */
     520           0 :       d [2] = wh_add( d [2], d [3] ); /* sum(l[2 3]) */
     521           0 :       dh[2] = wh_add( dh[2], dh[3] ); /* sum(h[2 3]) */
     522           0 :       d [4] = wh_add( d [4], d [5] ); /* sum(l[4 5])*/
     523           0 :       dh[4] = wh_add( dh[4], dh[5] ); /* sum(h[4 5]) */
     524           0 :       d [6] = wh_add( d [6], d [7] ); /* sum(l[6 7]) */
     525           0 :       dh[6] = wh_add( dh[6], dh[7] ); /* sum(h[6 7]) */
     526           0 :       d [0] = wh_add( d [0], d [2] ); /* sum(l[0 1 2 3]) */
     527           0 :       dh[0] = wh_add( dh[0], dh[2] ); /* sum(h[0 1 2 3]) */
     528           0 :       d [4] = wh_add( d [4], d [6] ); /* sum(l[4 5 6 7]) */
     529           0 :       dh[4] = wh_add( dh[4], dh[6] ); /* sum(h[4 5 6 7]) */
     530           0 :       d [0] = wh_add( d [0], d [4] ); /* sum(l[0 1 2 3 4 5 6 7]) */
     531           0 :       dh[0] = wh_add( dh[0], dh[4] ); /* sum(h[0 1 2 3 4 5 6 7]) */
     532           0 :       wh_st( lthash,    d [0] );
     533           0 :       wh_st( lthash+16, dh[0] );
     534             : 
     535             :       /* Wind up for next iteration */
     536           0 :       lthash += 32;
     537           0 :       lthash_rem--;
     538           0 :       wu_t ctr_add   = wu_bcast( 1 );
     539           0 :       /**/ ctr_lo    = wu_add( ctr_lo, ctr_add );
     540           0 :       wu_t ctr_carry = wi_gt ( wu_xor( ctr_add, wu_bcast( 0x80000000 ) ),
     541           0 :                                wu_xor( ctr_lo,  wu_bcast( 0x80000000 ) ) );
     542           0 :       /**/ ctr_hi    = wu_sub( ctr_hi, ctr_carry );
     543           0 :       if( FD_UNLIKELY( !lthash_rem ) ) {
     544           0 :         FD_BLAKE3_TRACE(( "fd_blake3_avx_compress8: done (lthash para)" ));
     545           0 :         return;
     546           0 :       }
     547           0 :       goto compress;
     548             : 
     549           0 : #   undef STATE_FMT
     550           0 : #   undef STATE_FMT_ARGS
     551           0 :     }
     552           0 :   }
     553             : 
     554             :   /* Store the results */
     555             : 
     556           0 :   wu_transpose_8x8( h0, h1, h2, h3, h4, h5, h6, h7,
     557           0 :                     h0, h1, h2, h3, h4, h5, h6, h7 );
     558             : 
     559           0 :   uint * const * batch_hash = (uint * const *)__builtin_assume_aligned( _batch_hash, 32 );
     560           0 :   if( FD_LIKELY( out_sz==32 ) ) {
     561           0 :     switch( batch_cnt ) { /* application dependent prob */
     562           0 :     case 8UL: wu_st( batch_hash[7], h7 ); __attribute__((fallthrough));
     563           0 :     case 7UL: wu_st( batch_hash[6], h6 ); __attribute__((fallthrough));
     564           0 :     case 6UL: wu_st( batch_hash[5], h5 ); __attribute__((fallthrough));
     565           0 :     case 5UL: wu_st( batch_hash[4], h4 ); __attribute__((fallthrough));
     566           0 :     case 4UL: wu_st( batch_hash[3], h3 ); __attribute__((fallthrough));
     567           0 :     case 3UL: wu_st( batch_hash[2], h2 ); __attribute__((fallthrough));
     568           0 :     case 2UL: wu_st( batch_hash[1], h1 ); __attribute__((fallthrough));
     569           0 :     case 1UL: wu_st( batch_hash[0], h0 ); __attribute__((fallthrough));
     570           0 :     default: break;
     571           0 :     }
     572           0 :   } else if( out_sz==64 ) {
     573           0 :     wu_transpose_8x8( hu[0], hu[1], hu[2], hu[3], hu[4], hu[5], hu[6], hu[7],
     574           0 :                       hu[0], hu[1], hu[2], hu[3], hu[4], hu[5], hu[6], hu[7] );
     575           0 :     switch( batch_cnt ) { /* application dependent prob */
     576           0 :     case 8UL: wu_st( batch_hash[7],   h7    );
     577           0 :               wu_st( batch_hash[7]+8, hu[7] ); __attribute__((fallthrough));
     578           0 :     case 7UL: wu_st( batch_hash[6],   h6    );
     579           0 :               wu_st( batch_hash[6]+8, hu[6] ); __attribute__((fallthrough));
     580           0 :     case 6UL: wu_st( batch_hash[5],   h5    );
     581           0 :               wu_st( batch_hash[5]+8, hu[5] ); __attribute__((fallthrough));
     582           0 :     case 5UL: wu_st( batch_hash[4],   h4    );
     583           0 :               wu_st( batch_hash[4]+8, hu[4] ); __attribute__((fallthrough));
     584           0 :     case 4UL: wu_st( batch_hash[3],   h3    );
     585           0 :               wu_st( batch_hash[3]+8, hu[3] ); __attribute__((fallthrough));
     586           0 :     case 3UL: wu_st( batch_hash[2],   h2    );
     587           0 :               wu_st( batch_hash[2]+8, hu[2] ); __attribute__((fallthrough));
     588           0 :     case 2UL: wu_st( batch_hash[1],   h1    );
     589           0 :               wu_st( batch_hash[1]+8, hu[1] ); __attribute__((fallthrough));
     590           0 :     case 1UL: wu_st( batch_hash[0],   h0    );
     591           0 :               wu_st( batch_hash[0]+8, hu[0] ); __attribute__((fallthrough));
     592           0 :     default: break;
     593           0 :     }
     594           0 :   } else {
     595           0 :     FD_LOG_ERR(( "Invalid out_sz %u", out_sz ));
     596           0 :   }
     597           0 : }
     598             : 
/* fd_blake3_avx_compress8_fast hashes 8 equal-size, contiguous message
   segments in parallel (one segment per 32-bit AVX2 lane) and stores
   the 8 resulting 32-byte chaining values to _out (32-byte aligned,
   FD_BLAKE3_OUTCHAIN_SZ byte stride between outputs).

   If flags has FD_BLAKE3_FLAG_PARENT set, each segment is a
   2^(FD_BLAKE3_OUTCHAIN_LG_SZ+1) byte parent node hashed with counter
   zero; otherwise each segment is a full 2^FD_BLAKE3_CHUNK_LG_SZ byte
   leaf chunk and lane i uses 64-bit counter counter+i.  Segment i
   starts at msg + i*segment_sz (segments are contiguous in msg).
   Assumes all 8 segments are complete (no partial blocks); msg may be
   unaligned. */

void
fd_blake3_avx_compress8_fast( uchar const * restrict msg,
                              uchar       * restrict _out,
                              ulong                  counter,
                              uchar                  flags ) {
  FD_BLAKE3_TRACE(( "fd_blake3_avx_compress8_fast(msg=%p,out=%p,counter=%lu,flags=%02x)", (void *)msg, (void *)_out, counter, flags ));

  uchar * restrict out = __builtin_assume_aligned( _out, 32 );

  /* Select the per-lane segment size: parent nodes are two chained
     output chain values wide, leaves are a full chunk. */

  int   parent = flags & FD_BLAKE3_FLAG_PARENT;
  int   lg_sz  = fd_int_if( parent, FD_BLAKE3_OUTCHAIN_LG_SZ+1, FD_BLAKE3_CHUNK_LG_SZ );
  ulong sz     = 1UL<<lg_sz;

  /* counters stay the same for each block.  Across chunks, they
     increment if we are hashing leaves.  Otherwise, they are zero. */

  /* ctr_add is the lane index (0..7) for leaves and 0 for parents
     (the mask wu_bcast( parent ? 0 : UINT_MAX ) zeroes it out in
     parent mode). */

  wu_t ctr_add   = wu_and( wu_bcast( parent ? 0 : UINT_MAX ),
                           wu( 0, 1, 2, 3, 4, 5, 6, 7 ) );
  /* Per-lane 64-bit counter = counter + ctr_add, computed as a 32-bit
     low-word add plus an explicit carry into the high word.  Biasing
     both compare operands by 2^31 turns the signed wi_gt into an
     unsigned ctr_add > ctr_lo test, i.e. "low word wrapped"; lanes
     that carried compare as all-ones (-1), so subtracting ctr_carry
     adds 1 to the high word exactly where needed. */
  wu_t ctr_lo    = wu_add( wu_bcast( counter ), ctr_add );
  wu_t ctr_carry = wi_gt ( wu_xor( ctr_add, wu_bcast( 0x80000000 ) ),
                           wu_xor( ctr_lo,  wu_bcast( 0x80000000 ) ) );
  wu_t ctr_hi    = wu_sub( wu_bcast( counter>>32 ), ctr_carry );
  wu_t sz_vec    = wu_bcast( FD_BLAKE3_BLOCK_SZ );  /* every block is full */

  wu_t const iv0 = wu_bcast( FD_BLAKE3_IV[0] );
  wu_t const iv1 = wu_bcast( FD_BLAKE3_IV[1] );
  wu_t const iv2 = wu_bcast( FD_BLAKE3_IV[2] );
  wu_t const iv3 = wu_bcast( FD_BLAKE3_IV[3] );
  wu_t const iv4 = wu_bcast( FD_BLAKE3_IV[4] );
  wu_t const iv5 = wu_bcast( FD_BLAKE3_IV[5] );
  wu_t const iv6 = wu_bcast( FD_BLAKE3_IV[6] );
  wu_t const iv7 = wu_bcast( FD_BLAKE3_IV[7] );

  /* h0..h7 hold the running chaining value of all 8 lanes (word i of
     each lane's CV in hi), seeded with the BLAKE3 IV. */

  wu_t h0=iv0; wu_t h1=iv1; wu_t h2=iv2; wu_t h3=iv3;
  wu_t h4=iv4; wu_t h5=iv5; wu_t h6=iv6; wu_t h7=iv7;

  /* Compress the segments one 64-byte block at a time (all 8 lanes in
     lockstep; every lane is at the same block offset). */

  ulong off = 0UL;
  do {
    ulong const off_next = off+FD_BLAKE3_BLOCK_SZ;
    /* Leaves get CHUNK_START on the first block and CHUNK_END on the
       last block of the chunk; parents use the caller's flags as-is. */
    uint chunk_flags =
        ( off     ==0UL ? FD_BLAKE3_FLAG_CHUNK_START : 0u ) |
        ( off_next==sz  ? FD_BLAKE3_FLAG_CHUNK_END   : 0u );
    uint flags_ = flags | fd_uint_if( parent, 0, chunk_flags );
    wu_t flags_vec = wu_bcast( flags_ );

    /* Load the current 64-byte block of each segment: m[i] / m[i+8]
       hold the low / high 32 bytes of segment i's block. */

    wu_t m[16];
    m[ 0] = wu_ldu( msg + (0<<lg_sz) + off      );
    m[ 1] = wu_ldu( msg + (1<<lg_sz) + off      );
    m[ 2] = wu_ldu( msg + (2<<lg_sz) + off      );
    m[ 3] = wu_ldu( msg + (3<<lg_sz) + off      );
    m[ 4] = wu_ldu( msg + (4<<lg_sz) + off      );
    m[ 5] = wu_ldu( msg + (5<<lg_sz) + off      );
    m[ 6] = wu_ldu( msg + (6<<lg_sz) + off      );
    m[ 7] = wu_ldu( msg + (7<<lg_sz) + off      );
    m[ 8] = wu_ldu( msg + (0<<lg_sz) + off + 32 );
    m[ 9] = wu_ldu( msg + (1<<lg_sz) + off + 32 );
    m[10] = wu_ldu( msg + (2<<lg_sz) + off + 32 );
    m[11] = wu_ldu( msg + (3<<lg_sz) + off + 32 );
    m[12] = wu_ldu( msg + (4<<lg_sz) + off + 32 );
    m[13] = wu_ldu( msg + (5<<lg_sz) + off + 32 );
    m[14] = wu_ldu( msg + (6<<lg_sz) + off + 32 );
    m[15] = wu_ldu( msg + (7<<lg_sz) + off + 32 );

    /* Transpose from one-register-per-segment layout to
       one-register-per-message-word layout so that m[j] holds message
       word j of all 8 lanes, as required by round_fn8. */

    wu_transpose_8x8( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                      m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7] );
    wu_transpose_8x8( m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
                      m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );

    /* Standard BLAKE3 compression state: chaining value, IV[0..3],
       counter lo/hi, block length, flags (one copy per lane). */

    wu_t v[16] = {
        h0,     h1,     h2,     h3,
        h4,     h5,     h6,     h7,
        iv0,    iv1,    iv2,    iv3,
        ctr_lo, ctr_hi, sz_vec, flags_vec,
    };

    /* 7 rounds, then the feed-forward XOR yields the next chaining
       value.  (Only the truncated 32-byte output is needed here, so
       the h XOR into v[8..15] is skipped.) */

    round_fn8( v, m, 0 );
    round_fn8( v, m, 1 );
    round_fn8( v, m, 2 );
    round_fn8( v, m, 3 );
    round_fn8( v, m, 4 );
    round_fn8( v, m, 5 );
    round_fn8( v, m, 6 );

    h0 = wu_xor( v[ 0], v[ 8] );
    h1 = wu_xor( v[ 1], v[ 9] );
    h2 = wu_xor( v[ 2], v[10] );
    h3 = wu_xor( v[ 3], v[11] );
    h4 = wu_xor( v[ 4], v[12] );
    h5 = wu_xor( v[ 5], v[13] );
    h6 = wu_xor( v[ 6], v[14] );
    h7 = wu_xor( v[ 7], v[15] );

    off = off_next;
  } while( off!=sz );

  /* Transpose back so hi holds the full 32-byte chaining value of
     lane i, then store each lane's result at out + i*OUTCHAIN_SZ. */

  wu_transpose_8x8( h0, h1, h2, h3, h4, h5, h6, h7,
                    h0, h1, h2, h3, h4, h5, h6, h7 );

  wu_st( (uint *)( out + (0UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h0 );
  wu_st( (uint *)( out + (1UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h1 );
  wu_st( (uint *)( out + (2UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h2 );
  wu_st( (uint *)( out + (3UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h3 );
  wu_st( (uint *)( out + (4UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h4 );
  wu_st( (uint *)( out + (5UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h5 );
  wu_st( (uint *)( out + (6UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h6 );
  wu_st( (uint *)( out + (7UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h7 );
}

Generated by: LCOV version 1.14