LCOV - code coverage report
Current view: top level - ballet/blake3 - fd_blake3_avx512.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 528 538 98.1 %
Date: 2026-03-19 18:19:27 Functions: 3 3 100.0 %

          Line data    Source code
       1             : 
       2             : // Source originally from https://github.com/BLAKE3-team/BLAKE3
       3             : // From commit: c0ea395cf91d242f078c23d5f8d87eb9dd5f7b78
       4             : 
       5             : #include "fd_blake3_private.h"
       6             : #include "../../util/simd/fd_avx512.h"
       7             : #include "../../util/simd/fd_avx.h"
       8             : 
/* round_fn16 applies one full round (round index r, 0<=r<7) of the
   BLAKE3 compression function simultaneously across 16 independent
   hash lanes.  v[16] is the 4x4 state matrix (one wwu_t = 16 lanes of
   one 32-bit state word); m[16] is the 16-word message block, also in
   lane-transposed form.  FD_BLAKE3_MSG_SCHEDULE[r][i] selects which
   message word feeds each of the 8 G-function applications, per the
   BLAKE3 message permutation schedule.

   The body is the standard BLAKE3 round, fully unrolled: first the
   four "column" G functions (on state columns {0,4,8,c}, {1,5,9,d},
   {2,6,a,e}, {3,7,b,f}), then the four "diagonal" G functions (on
   diagonals {0,5,a,f}, {1,6,b,c}, {2,7,8,d}, {3,4,9,e}).  Each G is
   two add/add/xor/ror half-steps with the BLAKE3 rotation constants
   16, 12, 8, 7 (wwu_ror is a lanewise 32-bit rotate — see
   fd_avx512.h).  Statement order is load-bearing; do not reorder
   across the half-step boundaries. */
static inline __attribute__((always_inline)) void
round_fn16( wwu_t v[16],
            wwu_t m[16],
            ulong r ) {
  /* Column step: first half of G for all four columns.
     a += m[schedule[2i]]; a += b; d ^= a; d >>>= 16; */
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][0]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][2]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][4]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][6]]);
  v[0x0] = wwu_add(v[0x0], v[0x4]);
  v[0x1] = wwu_add(v[0x1], v[0x5]);
  v[0x2] = wwu_add(v[0x2], v[0x6]);
  v[0x3] = wwu_add(v[0x3], v[0x7]);
  v[0xc] = wwu_xor(v[0xc], v[0x0]);
  v[0xd] = wwu_xor(v[0xd], v[0x1]);
  v[0xe] = wwu_xor(v[0xe], v[0x2]);
  v[0xf] = wwu_xor(v[0xf], v[0x3]);
  v[0xc] = wwu_ror(v[0xc], 16);
  v[0xd] = wwu_ror(v[0xd], 16);
  v[0xe] = wwu_ror(v[0xe], 16);
  v[0xf] = wwu_ror(v[0xf], 16);
  /* c += d; b ^= c; b >>>= 12; */
  v[0x8] = wwu_add(v[0x8], v[0xc]);
  v[0x9] = wwu_add(v[0x9], v[0xd]);
  v[0xa] = wwu_add(v[0xa], v[0xe]);
  v[0xb] = wwu_add(v[0xb], v[0xf]);
  v[0x4] = wwu_xor(v[0x4], v[0x8]);
  v[0x5] = wwu_xor(v[0x5], v[0x9]);
  v[0x6] = wwu_xor(v[0x6], v[0xa]);
  v[0x7] = wwu_xor(v[0x7], v[0xb]);
  v[0x4] = wwu_ror(v[0x4], 12);
  v[0x5] = wwu_ror(v[0x5], 12);
  v[0x6] = wwu_ror(v[0x6], 12);
  v[0x7] = wwu_ror(v[0x7], 12);
  /* Second half of G for the columns:
     a += m[schedule[2i+1]]; a += b; d ^= a; d >>>= 8; */
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][1]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][3]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][5]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][7]]);
  v[0x0] = wwu_add(v[0x0], v[0x4]);
  v[0x1] = wwu_add(v[0x1], v[0x5]);
  v[0x2] = wwu_add(v[0x2], v[0x6]);
  v[0x3] = wwu_add(v[0x3], v[0x7]);
  v[0xc] = wwu_xor(v[0xc], v[0x0]);
  v[0xd] = wwu_xor(v[0xd], v[0x1]);
  v[0xe] = wwu_xor(v[0xe], v[0x2]);
  v[0xf] = wwu_xor(v[0xf], v[0x3]);
  v[0xc] = wwu_ror(v[0xc], 8);
  v[0xd] = wwu_ror(v[0xd], 8);
  v[0xe] = wwu_ror(v[0xe], 8);
  v[0xf] = wwu_ror(v[0xf], 8);
  /* c += d; b ^= c; b >>>= 7; */
  v[0x8] = wwu_add(v[0x8], v[0xc]);
  v[0x9] = wwu_add(v[0x9], v[0xd]);
  v[0xa] = wwu_add(v[0xa], v[0xe]);
  v[0xb] = wwu_add(v[0xb], v[0xf]);
  v[0x4] = wwu_xor(v[0x4], v[0x8]);
  v[0x5] = wwu_xor(v[0x5], v[0x9]);
  v[0x6] = wwu_xor(v[0x6], v[0xa]);
  v[0x7] = wwu_xor(v[0x7], v[0xb]);
  v[0x4] = wwu_ror(v[0x4], 7);
  v[0x5] = wwu_ror(v[0x5], 7);
  v[0x6] = wwu_ror(v[0x6], 7);
  v[0x7] = wwu_ror(v[0x7], 7);

  /* Diagonal step: same four half-steps, but G now operates on the
     state diagonals — note the rotated b/c/d index patterns below
     (e.g. v[0x0] pairs with v[0x5]/v[0xa]/v[0xf]).  Message words
     schedule[8..15] feed this step. */
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][8]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][10]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][12]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][14]]);
  v[0x0] = wwu_add(v[0x0], v[0x5]);
  v[0x1] = wwu_add(v[0x1], v[0x6]);
  v[0x2] = wwu_add(v[0x2], v[0x7]);
  v[0x3] = wwu_add(v[0x3], v[0x4]);
  v[0xf] = wwu_xor(v[0xf], v[0x0]);
  v[0xc] = wwu_xor(v[0xc], v[0x1]);
  v[0xd] = wwu_xor(v[0xd], v[0x2]);
  v[0xe] = wwu_xor(v[0xe], v[0x3]);
  v[0xf] = wwu_ror(v[0xf], 16);
  v[0xc] = wwu_ror(v[0xc], 16);
  v[0xd] = wwu_ror(v[0xd], 16);
  v[0xe] = wwu_ror(v[0xe], 16);
  v[0xa] = wwu_add(v[0xa], v[0xf]);
  v[0xb] = wwu_add(v[0xb], v[0xc]);
  v[0x8] = wwu_add(v[0x8], v[0xd]);
  v[0x9] = wwu_add(v[0x9], v[0xe]);
  v[0x5] = wwu_xor(v[0x5], v[0xa]);
  v[0x6] = wwu_xor(v[0x6], v[0xb]);
  v[0x7] = wwu_xor(v[0x7], v[0x8]);
  v[0x4] = wwu_xor(v[0x4], v[0x9]);
  v[0x5] = wwu_ror(v[0x5], 12);
  v[0x6] = wwu_ror(v[0x6], 12);
  v[0x7] = wwu_ror(v[0x7], 12);
  v[0x4] = wwu_ror(v[0x4], 12);
  /* Second half of G for the diagonals (rotations 8 then 7). */
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][9]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][11]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][13]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][15]]);
  v[0x0] = wwu_add(v[0x0], v[0x5]);
  v[0x1] = wwu_add(v[0x1], v[0x6]);
  v[0x2] = wwu_add(v[0x2], v[0x7]);
  v[0x3] = wwu_add(v[0x3], v[0x4]);
  v[0xf] = wwu_xor(v[0xf], v[0x0]);
  v[0xc] = wwu_xor(v[0xc], v[0x1]);
  v[0xd] = wwu_xor(v[0xd], v[0x2]);
  v[0xe] = wwu_xor(v[0xe], v[0x3]);
  v[0xf] = wwu_ror(v[0xf], 8);
  v[0xc] = wwu_ror(v[0xc], 8);
  v[0xd] = wwu_ror(v[0xd], 8);
  v[0xe] = wwu_ror(v[0xe], 8);
  v[0xa] = wwu_add(v[0xa], v[0xf]);
  v[0xb] = wwu_add(v[0xb], v[0xc]);
  v[0x8] = wwu_add(v[0x8], v[0xd]);
  v[0x9] = wwu_add(v[0x9], v[0xe]);
  v[0x5] = wwu_xor(v[0x5], v[0xa]);
  v[0x6] = wwu_xor(v[0x6], v[0xb]);
  v[0x7] = wwu_xor(v[0x7], v[0x8]);
  v[0x4] = wwu_xor(v[0x4], v[0x9]);
  v[0x5] = wwu_ror(v[0x5], 7);
  v[0x6] = wwu_ror(v[0x6], 7);
  v[0x7] = wwu_ror(v[0x7], 7);
  v[0x4] = wwu_ror(v[0x4], 7);
}
     127             : 
     128             : void
     129             : fd_blake3_avx512_compress16( ulong                   batch_cnt,
     130             :                              void const   * restrict _batch_data,
     131             :                              uint const   * restrict batch_sz,
     132             :                              ulong const  * restrict ctr_vec,
     133             :                              uint const   * restrict batch_flags,
     134             :                              void * const * restrict _batch_hash,
     135             :                              ushort *       restrict lthash,
     136             :                              uint                    out_sz,
     137       19244 :                              void const *   restrict batch_cv ) {
     138       19244 :   if( FD_UNLIKELY( lthash && batch_cnt!=16 ) ) FD_LOG_ERR(( "Lane masking not supported for fd_blake3_avx512_compress16 in LtHash mode" ));
     139       19244 :   if( FD_UNLIKELY( batch_cnt==0 || batch_cnt>16 ) ) FD_LOG_ERR(( "Invalid batch_cnt %lu", batch_cnt ));
     140             : 
     141             :   /* We can only process input blocks of 64 bytes, but message data size
     142             :      is not necessarily a multiple of 64.  We compute the tail block of
     143             :      each message here.  We then process complete blocks of the original
     144             :      message in place, switching to processing to these  tail blocks in
     145             :      the same pass toward the end. */
     146             : 
     147       19244 :   ulong const * batch_data = (ulong const *)_batch_data;
     148             : 
     149       19244 :   if( FD_UNLIKELY( batch_cnt==1 ) ) {
     150        1851 :     fd_blake3_sse_compress1( (uchar *)(_batch_hash[0]),
     151        1851 :                              (uchar const *)(batch_data[0]),
     152        1851 :                              batch_sz[0],
     153        1851 :                              ctr_vec[0],
     154        1851 :                              batch_flags[0],
     155        1851 :                              NULL,
     156        1851 :                              NULL );
     157        1851 :     return;
     158        1851 :   }
     159             : 
     160             : #if FD_BLAKE3_TRACING
     161             :   /* This log_line buffer is oversized by a fair bit (due to all the
     162             :      NULL terminators) but that's fine */
     163             :   char log_line[
     164             :       sizeof( "fd_blake3_avx512_compress16" )+
     165             :       sizeof( "(batch_cnt=" )+21+
     166             :       sizeof( ",sz=["       )+(16*11)+sizeof( "]" )+
     167             :       sizeof( ",counter=["  )+(16*21)+sizeof( "]" )+
     168             :       sizeof( ",flags=["    )+(16* 2)+sizeof( "]" )+
     169             :       sizeof( ",custom_cv"  )+
     170             :       sizeof( ",lthash" )+
     171             :       sizeof( ")" ) ];
     172             : 
     173             :   char * p = fd_cstr_init( log_line );
     174             :   p = fd_cstr_append_text( p, "fd_blake3_avx512_compress16(batch_cnt=", 38UL );
     175             :   p = fd_cstr_append_ulong_as_text( p, 0, 0, batch_cnt, fd_uchar_base10_dig_cnt( (uchar)batch_cnt ) );
     176             :   p = fd_cstr_append_text( p, ",sz=[", 5UL );
     177             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     178             :     p = fd_cstr_append_uint_as_text( p, ' ', 0, batch_sz[ i ], fd_uint_base10_dig_cnt( batch_sz[ i ] ) );
     179             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     180             :   }
     181             :   p = fd_cstr_append_text( p, "],counter=[", 11UL );
     182             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     183             :     p = fd_cstr_append_ulong_as_text( p, ' ', 0, ctr_vec[ i ], fd_ulong_base10_dig_cnt( ctr_vec[ i ] ) );
     184             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     185             :   }
     186             :   p = fd_cstr_append_text( p, "],flags=[", 9UL );
     187             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     188             :     static char const hex_lut[ 16 ] = {
     189             :       '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
     190             :     };
     191             :     p = fd_cstr_append_char( p, hex_lut[ batch_flags[ i ]&0xf ] );
     192             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     193             :   }
     194             :   p = fd_cstr_append_char( p, ']' );
     195             :   if( batch_cv ) p = fd_cstr_append_text( p, ",custom_cv", 10UL );
     196             :   if( lthash   ) p = fd_cstr_append_text( p, ",lthash", 7UL );
     197             :   p = fd_cstr_append_char( p, ')' );
     198             :   ulong line_len = (ulong)( p-log_line );
     199             :   fd_cstr_fini( p );
     200             : 
     201             :   FD_BLAKE3_TRACE(( "%.*s", (int)line_len, log_line ));
     202             : #endif
     203             : 
     204       17393 :   ulong batch_tail_data[ 16 ] __attribute__((aligned(64)));
     205       17393 :   ulong batch_tail_rem [ 16 ] __attribute__((aligned(64)));
     206             : 
     207       17393 :   uchar scratch[ 16*FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(128)));
     208       17393 :   do {
     209       17393 :     ulong scratch_free = (ulong)scratch;
     210             : 
     211       17393 :     wwv_t zero = wwv_zero();
     212             : 
     213      212292 :     for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {
     214             : 
     215             :       /* Allocate the tail blocks for this message */
     216             : 
     217      194899 :       ulong data = batch_data[ batch_idx ];
     218      194899 :       ulong sz   = batch_sz  [ batch_idx ];
     219             : 
     220      194899 :       ulong tail_data     = scratch_free;
     221      194899 :       ulong tail_data_sz  = sz & (FD_BLAKE3_BLOCK_SZ-1UL);
     222      194899 :       ulong tail_data_off = fd_ulong_align_dn( sz, FD_BLAKE3_BLOCK_SZ );
     223             : 
     224      194899 :       batch_tail_data[ batch_idx ] = tail_data;
     225      194899 :       batch_tail_rem [ batch_idx ] = (!!tail_data_sz) ^ (!sz);  /* (hash 1 tail block if 0 sz) */
     226             : 
     227      194899 :       scratch_free += FD_BLAKE3_BLOCK_SZ;
     228             : 
     229             :       /* Populate the tail blocks.  We first clear the blocks.  Then we
     230             :          copy any straggler data bytes into the tail. */
     231             : 
     232      194899 :       wwv_st( (ulong *) tail_data, zero );
     233             : 
     234      194899 : #     if 1
     235             :       /* See fd_sha256_private_batch_avx */
     236      194899 :       ulong src = (ulong)data + tail_data_off;
     237      194899 :       ulong dst = tail_data;
     238      194899 :       ulong rem = tail_data_sz;
     239      214582 :       while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
     240      275692 :       while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src );             dst +=  8UL; src +=  8UL; rem -=  8UL; }
     241      194899 :       if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src );             dst +=  4UL; src +=  4UL; rem -=  4UL; }
     242      194899 :       if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src );             dst +=  2UL; src +=  2UL; rem -=  2UL; }
     243      194899 :       if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src );             dst++;                                 }
     244             : #     else
     245             :       fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
     246             : #     endif
     247      194899 :     }
     248       17393 :   } while(0);
     249             : 
     250             : 
     251       17393 :   wwu_t const iv0 = wwu_bcast( FD_BLAKE3_IV[0] );
     252       17393 :   wwu_t const iv1 = wwu_bcast( FD_BLAKE3_IV[1] );
     253       17393 :   wwu_t const iv2 = wwu_bcast( FD_BLAKE3_IV[2] );
     254       17393 :   wwu_t const iv3 = wwu_bcast( FD_BLAKE3_IV[3] );
     255       17393 :   wwu_t const iv4 = wwu_bcast( FD_BLAKE3_IV[4] );
     256       17393 :   wwu_t const iv5 = wwu_bcast( FD_BLAKE3_IV[5] );
     257       17393 :   wwu_t const iv6 = wwu_bcast( FD_BLAKE3_IV[6] );
     258       17393 :   wwu_t const iv7 = wwu_bcast( FD_BLAKE3_IV[7] );
     259             : 
     260       17393 :   wwu_t h0=iv0; wwu_t h1=iv1; wwu_t h2=iv2; wwu_t h3=iv3;
     261       17393 :   wwu_t h4=iv4; wwu_t h5=iv5; wwu_t h6=iv6; wwu_t h7=iv7;
     262       17393 :   if( FD_UNLIKELY( batch_cv ) ) {
     263             :     /* If the input chaining value is overridden, transpose the input
     264             :        to AVX512 representation.  (wwu 16x8 transpose)  FIXME There's
     265             :        probably a way to do this using AVX512 instead of AVX. */
     266        8415 :     __m256i const ** cv_vec = (__m256i const **)batch_cv;
     267        8415 :     wu_t cv_lo[8]; wu_t cv_hi[8];
     268        8415 :     cv_lo[ 0 ] = _mm256_loadu_si256( cv_vec[  0 ] );
     269        8415 :     cv_lo[ 1 ] = _mm256_loadu_si256( cv_vec[  1 ] );
     270        8415 :     cv_lo[ 2 ] = _mm256_loadu_si256( cv_vec[  2 ] );
     271        8415 :     cv_lo[ 3 ] = _mm256_loadu_si256( cv_vec[  3 ] );
     272        8415 :     cv_lo[ 4 ] = _mm256_loadu_si256( cv_vec[  4 ] );
     273        8415 :     cv_lo[ 5 ] = _mm256_loadu_si256( cv_vec[  5 ] );
     274        8415 :     cv_lo[ 6 ] = _mm256_loadu_si256( cv_vec[  6 ] );
     275        8415 :     cv_lo[ 7 ] = _mm256_loadu_si256( cv_vec[  7 ] );
     276        8415 :     cv_hi[ 0 ] = _mm256_loadu_si256( cv_vec[  8 ] );
     277        8415 :     cv_hi[ 1 ] = _mm256_loadu_si256( cv_vec[  9 ] );
     278        8415 :     cv_hi[ 2 ] = _mm256_loadu_si256( cv_vec[ 10 ] );
     279        8415 :     cv_hi[ 3 ] = _mm256_loadu_si256( cv_vec[ 11 ] );
     280        8415 :     cv_hi[ 4 ] = _mm256_loadu_si256( cv_vec[ 12 ] );
     281        8415 :     cv_hi[ 5 ] = _mm256_loadu_si256( cv_vec[ 13 ] );
     282        8415 :     cv_hi[ 6 ] = _mm256_loadu_si256( cv_vec[ 14 ] );
     283        8415 :     cv_hi[ 7 ] = _mm256_loadu_si256( cv_vec[ 15 ] );
     284        8415 :     wu_transpose_8x8( cv_lo[0], cv_lo[1], cv_lo[2], cv_lo[3], cv_lo[4], cv_lo[5], cv_lo[6], cv_lo[7],
     285        8415 :                       cv_lo[0], cv_lo[1], cv_lo[2], cv_lo[3], cv_lo[4], cv_lo[5], cv_lo[6], cv_lo[7] );
     286        8415 :     wu_transpose_8x8( cv_hi[0], cv_hi[1], cv_hi[2], cv_hi[3], cv_hi[4], cv_hi[5], cv_hi[6], cv_hi[7],
     287        8415 :                       cv_hi[0], cv_hi[1], cv_hi[2], cv_hi[3], cv_hi[4], cv_hi[5], cv_hi[6], cv_hi[7] );
     288        8415 :     h0 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 0 ] ), cv_hi[ 0 ], 1 );
     289        8415 :     h1 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 1 ] ), cv_hi[ 1 ], 1 );
     290        8415 :     h2 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 2 ] ), cv_hi[ 2 ], 1 );
     291        8415 :     h3 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 3 ] ), cv_hi[ 3 ], 1 );
     292        8415 :     h4 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 4 ] ), cv_hi[ 4 ], 1 );
     293        8415 :     h5 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 5 ] ), cv_hi[ 5 ], 1 );
     294        8415 :     h6 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 6 ] ), cv_hi[ 6 ], 1 );
     295        8415 :     h7 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 7 ] ), cv_hi[ 7 ], 1 );
     296        8415 :   }
     297             : 
     298       17393 :   wwu_t ctr_lo = wwu( ctr_vec[ 0],     ctr_vec[ 1],     ctr_vec[ 2],     ctr_vec[ 3],
     299       17393 :                       ctr_vec[ 4],     ctr_vec[ 5],     ctr_vec[ 6],     ctr_vec[ 7],
     300       17393 :                       ctr_vec[ 8],     ctr_vec[ 9],     ctr_vec[10],     ctr_vec[11],
     301       17393 :                       ctr_vec[12],     ctr_vec[13],     ctr_vec[14],     ctr_vec[15] );
     302       17393 :   wwu_t ctr_hi = wwu( ctr_vec[ 0]>>32, ctr_vec[ 1]>>32, ctr_vec[ 2]>>32, ctr_vec[ 3]>>32,
     303       17393 :                       ctr_vec[ 4]>>32, ctr_vec[ 5]>>32, ctr_vec[ 6]>>32, ctr_vec[ 7]>>32,
     304       17393 :                       ctr_vec[ 8]>>32, ctr_vec[ 9]>>32, ctr_vec[10]>>32, ctr_vec[11]>>32,
     305       17393 :                       ctr_vec[12]>>32, ctr_vec[13]>>32, ctr_vec[14]>>32, ctr_vec[15]>>32 );
     306       17393 :   wwu_t flags = wwu_ldu( batch_flags );
     307       17393 :   wwu_t off   = wwu_zero();
     308       17393 :   wwu_t sz    = wwu_ldu( batch_sz    );
     309             : 
     310       17393 :   wwv_t zero         = wwv_zero();
     311       17393 :   wwv_t one          = wwv_one();
     312       17393 :   wwu_t wwu_64       = wwu_bcast( FD_BLAKE3_BLOCK_SZ );
     313       17393 :   wwv_t wwv_64       = wwv_bcast( FD_BLAKE3_BLOCK_SZ );
     314       17393 :   wwv_t W_sentinel   = wwv_bcast( (ulong)scratch );
     315             :   //wwc_t batch_lane   = wc_unpack( (1<<batch_cnt)-1 );
     316             : 
     317       17393 :   wwv_t tail_lo      = wwv_ld( batch_tail_data   );
     318       17393 :   wwv_t tail_hi      = wwv_ld( batch_tail_data+8 );
     319             : 
     320       17393 :   wwv_t tail_rem_lo  = wwv_ld( batch_tail_rem    );
     321       17393 :   wwv_t tail_rem_hi  = wwv_ld( batch_tail_rem+8  );
     322             : 
     323       17393 :   wwv_t W_lo         = wwv_ld( batch_data        );
     324       17393 :   wwv_t W_hi         = wwv_ld( batch_data+8      );
     325             : 
     326       17393 :   wwv_t batch_sz_lo  = _mm512_cvtepi32_epi64( _mm512_extracti32x8_epi32( sz, 0 ) );
     327       17393 :   wwv_t batch_sz_hi  = _mm512_cvtepi32_epi64( _mm512_extracti32x8_epi32( sz, 1 ) );
     328             : 
     329       17393 :   wwv_t block_rem_lo = wwv_if( ((1<<batch_cnt)-1) & 0xff,
     330       17393 :                                wwv_add( wwv_shr( batch_sz_lo, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_lo ), zero );
     331       17393 :   wwv_t block_rem_hi = wwv_if( ((1<<batch_cnt)-1) >> 8,
     332       17393 :                                wwv_add( wwv_shr( batch_sz_hi, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_hi ), zero );
     333             : 
     334             :   /* Upper half of the compression function output.
     335             :      Usually thrown away, but kept in the final compression round if
     336             :      out_sz==64. */
     337       17393 :   wwu_t hu[8] = {0};
     338             : 
     339       17393 :   ulong lthash_rem    = lthash ? 32 : 0; /* Number of LtHash (XOF) blocks remaining */
     340       17393 :   int   compress_done = 0;
     341       61609 :   for(;;) {
     342             :     /* Switch lanes that have hit the end of their in-place bulk
     343             :        processing to their out-of-place scratch tail regions as
     344             :        necessary. */
     345             : 
     346       61609 :     W_lo = wwv_if( wwv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
     347       61609 :     W_hi = wwv_if( wwv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );
     348             : 
     349             :     /* Derive per-block flags and block sizes */
     350             : 
     351       61609 :     int block_first = wwu_eq( off, wwu_zero() );
     352       61609 :     int block_last  = wwi_le( sz, wwu_add( off, wwu_bcast( FD_BLAKE3_BLOCK_SZ ) ) );
     353             : 
     354             :     /* Suppress root flag unless last block */
     355             : 
     356       61609 :     wwu_t root_mask   = wwu_if( block_last, wwu_bcast( UINT_MAX ), wwu_bcast( ~FD_BLAKE3_FLAG_ROOT ) );
     357       61609 :     wwu_t block_flags = wwu_and( flags, root_mask );
     358             : 
     359             :     /* Mask lanes that completed */
     360             : 
     361       61609 :     int active_lane_lo;
     362       61609 :     int active_lane_hi;
     363       61609 :     if( FD_UNLIKELY( lthash ) ) {
     364             :       /* Compress until root block */
     365           0 :       wwu_t all_root = wwu_bcast( FD_BLAKE3_FLAG_ROOT );
     366           0 :       int   not_root = wwu_ne( wwu_and( block_flags, all_root ), all_root );
     367           0 :       active_lane_lo = (int)(__mmask8)not_root;
     368           0 :       active_lane_hi = (int)(__mmask8)(not_root>>8);
     369       61609 :     } else {
     370             :       /* Complete when there is no more input data */
     371       61609 :       active_lane_lo = wwv_ne( block_rem_lo, zero );
     372       61609 :       active_lane_hi = wwv_ne( block_rem_hi, zero );
     373       61609 :     }
     374             : 
     375             :     /* Suppress CHUNK_{START,END} flags unless leaf node */
     376             : 
     377       61609 :     int is_parent = wwu_ne( wwu_and( flags, wwu_bcast( FD_BLAKE3_FLAG_PARENT ) ), wwu_zero() );
     378       61609 :     wwu_t chunk_flags = wwu_if( block_last, wwu_bcast( FD_BLAKE3_FLAG_CHUNK_END ), wwu_zero() );
     379       61609 :     if( out_sz==32 ) {
     380             :       /* Hacky: out_sz==64 is only used for post-compress XOF hashing,
     381             :          so use that as a hint when to suppress the 'CHUNK_START' flag. */
     382       44849 :       chunk_flags = wwu_or( chunk_flags, wwu_if( block_first, wwu_bcast( FD_BLAKE3_FLAG_CHUNK_START ), wwu_zero() ) );
     383       44849 :     }
     384       61609 :     wwu_t block_sz = wwu_min( wwu_sub( sz, off ), wwu_64 );
     385       61609 :     block_flags = wwu_or( block_flags, wwu_if( is_parent, wwu_zero(), chunk_flags ) );
     386             : 
     387             :     /* Check if we are done compressing */
     388             : 
     389       61609 :     compress_done |= !(active_lane_lo | active_lane_hi);
     390       61609 :     if( FD_UNLIKELY( compress_done ) ) {
     391       17389 :       if( FD_UNLIKELY( !lthash_rem ) ) break;
     392           0 :       active_lane_lo = 0xff;
     393           0 :       active_lane_hi = 0xff;
     394             :       /* Load the next message block and fall through to XOF expansion */
     395           0 :     }
     396             : 
     397             :     /* At this point, we have at least 1 block in this message segment
     398             :        pass that has not been processed.  Load the next 64 bytes of
     399             :        each unprocessed block.  Inactive lanes (e.g. message segments
     400             :        in this pass for which we've already processed all the blocks)
     401             :        will load garbage from a sentinel location (and the result of
     402             :        the state computations for the inactive lane will be ignored). */
     403             : 
     404       44220 :     ulong _W0; ulong _W1; ulong _W2; ulong _W3; ulong _W4; ulong _W5; ulong _W6; ulong _W7;
     405       44220 :     ulong _W8; ulong _W9; ulong _Wa; ulong _Wb; ulong _Wc; ulong _Wd; ulong _We; ulong _Wf;
     406       44220 :     wwv_unpack( wwv_if( active_lane_lo, W_lo, W_sentinel ), _W0, _W1, _W2, _W3, _W4, _W5, _W6, _W7 );
     407       44220 :     wwv_unpack( wwv_if( active_lane_hi, W_hi, W_sentinel ), _W8, _W9, _Wa, _Wb, _Wc, _Wd, _We, _Wf );
     408       44220 :     uchar const * W0 = (uchar const *)_W0; uchar const * W1 = (uchar const *)_W1;
     409       44220 :     uchar const * W2 = (uchar const *)_W2; uchar const * W3 = (uchar const *)_W3;
     410       44220 :     uchar const * W4 = (uchar const *)_W4; uchar const * W5 = (uchar const *)_W5;
     411       44220 :     uchar const * W6 = (uchar const *)_W6; uchar const * W7 = (uchar const *)_W7;
     412       44220 :     uchar const * W8 = (uchar const *)_W8; uchar const * W9 = (uchar const *)_W9;
     413       44220 :     uchar const * Wa = (uchar const *)_Wa; uchar const * Wb = (uchar const *)_Wb;
     414       44220 :     uchar const * Wc = (uchar const *)_Wc; uchar const * Wd = (uchar const *)_Wd;
     415       44220 :     uchar const * We = (uchar const *)_We; uchar const * Wf = (uchar const *)_Wf;
     416             : 
     417       44220 :     wwu_t m[16];
     418       44220 :     m[0x0] = wwu_ldu( W0 );  m[0x1] = wwu_ldu( W1 );
     419       44220 :     m[0x2] = wwu_ldu( W2 );  m[0x3] = wwu_ldu( W3 );
     420       44220 :     m[0x4] = wwu_ldu( W4 );  m[0x5] = wwu_ldu( W5 );
     421       44220 :     m[0x6] = wwu_ldu( W6 );  m[0x7] = wwu_ldu( W7 );
     422       44220 :     m[0x8] = wwu_ldu( W8 );  m[0x9] = wwu_ldu( W9 );
     423       44220 :     m[0xa] = wwu_ldu( Wa );  m[0xb] = wwu_ldu( Wb );
     424       44220 :     m[0xc] = wwu_ldu( Wc );  m[0xd] = wwu_ldu( Wd );
     425       44220 :     m[0xe] = wwu_ldu( We );  m[0xf] = wwu_ldu( Wf );
     426             : 
     427       44220 :     wwu_transpose_16x16( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
     428       44220 :                          m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
     429       44220 :                          m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
     430       44220 :                          m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );
     431             : 
     432             :     /* Compute the BLAKE3 compression function updates */
     433             : 
     434       44220 : compress: (void)0;
     435       44157 :     wwu_t v[16] = {
     436       44157 :         h0,     h1,     h2,       h3,
     437       44157 :         h4,     h5,     h6,       h7,
     438       44157 :         iv0,    iv1,    iv2,      iv3,
     439       44157 :         ctr_lo, ctr_hi, block_sz, block_flags,
     440       44157 :     };
     441             : 
     442             :     /* Debug utility */
     443       44157 : #define STATE_FMT         "[%u] =\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x"
     444       44157 : #define STATE_FMT_ARGS(v,i) (uint)i,\
     445       44157 :         fd_uint_bswap(wwu_extract(v[0x0],i)),fd_uint_bswap(wwu_extract(v[0x1],i)),fd_uint_bswap(wwu_extract(v[0x2],i)),fd_uint_bswap(wwu_extract(v[0x3],i)),\
     446       44157 :         fd_uint_bswap(wwu_extract(v[0x4],i)),fd_uint_bswap(wwu_extract(v[0x5],i)),fd_uint_bswap(wwu_extract(v[0x6],i)),fd_uint_bswap(wwu_extract(v[0x7],i)),\
     447       44157 :         fd_uint_bswap(wwu_extract(v[0x8],i)),fd_uint_bswap(wwu_extract(v[0x9],i)),fd_uint_bswap(wwu_extract(v[0xa],i)),fd_uint_bswap(wwu_extract(v[0xb],i)),\
     448       44157 :         fd_uint_bswap(wwu_extract(v[0xc],i)),fd_uint_bswap(wwu_extract(v[0xd],i)),fd_uint_bswap(wwu_extract(v[0xe],i)),fd_uint_bswap(wwu_extract(v[0xf],i))
     449             : 
     450             :     // FD_LOG_NOTICE(( "pre " STATE_FMT, STATE_FMT_ARGS(v,0) ));
     451       44157 :     round_fn16( v, m, 0 );
     452       44157 :     round_fn16( v, m, 1 );
     453       44157 :     round_fn16( v, m, 2 );
     454       44157 :     round_fn16( v, m, 3 );
     455       44157 :     round_fn16( v, m, 4 );
     456       44157 :     round_fn16( v, m, 5 );
     457       44157 :     round_fn16( v, m, 6 );
     458             :     // FD_LOG_NOTICE(( "post" STATE_FMT, STATE_FMT_ARGS(v,0) ));
     459             : 
     460       44270 :     if( FD_LIKELY( !compress_done ) ) {
     461             : 
     462             :       /* Apply the state updates to the active lanes */
     463             : 
     464       44270 :       int active_lane = active_lane_lo | (active_lane_hi<<8);
     465       44270 :       FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: compress lanes %02x%02x", active_lane_hi, active_lane_lo ));
     466             : 
     467       44270 :       if( FD_UNLIKELY( out_sz==64 ) ) {
     468             :         /* FIXME only export in the last iteration */
     469        8414 :         hu[0] = wwu_xor_if( active_lane, h0, v[ 8], hu[0] );
     470        8414 :         hu[1] = wwu_xor_if( active_lane, h1, v[ 9], hu[1] );
     471        8414 :         hu[2] = wwu_xor_if( active_lane, h2, v[10], hu[2] );
     472        8414 :         hu[3] = wwu_xor_if( active_lane, h3, v[11], hu[3] );
     473        8414 :         hu[4] = wwu_xor_if( active_lane, h4, v[12], hu[4] );
     474        8414 :         hu[5] = wwu_xor_if( active_lane, h5, v[13], hu[5] );
     475        8414 :         hu[6] = wwu_xor_if( active_lane, h6, v[14], hu[6] );
     476        8414 :         hu[7] = wwu_xor_if( active_lane, h7, v[15], hu[7] );
     477        8414 :       }
     478       44270 :       h0 = wwu_xor_if( active_lane, v[ 0], v[ 8], h0 );
     479       44270 :       h1 = wwu_xor_if( active_lane, v[ 1], v[ 9], h1 );
     480       44270 :       h2 = wwu_xor_if( active_lane, v[ 2], v[10], h2 );
     481       44270 :       h3 = wwu_xor_if( active_lane, v[ 3], v[11], h3 );
     482       44270 :       h4 = wwu_xor_if( active_lane, v[ 4], v[12], h4 );
     483       44270 :       h5 = wwu_xor_if( active_lane, v[ 5], v[13], h5 );
     484       44270 :       h6 = wwu_xor_if( active_lane, v[ 6], v[14], h6 );
     485       44270 :       h7 = wwu_xor_if( active_lane, v[ 7], v[15], h7 );
     486             : 
     487             :       /* Advance to the next message segment blocks.  In pseudo code,
     488             :          the below is:
     489             : 
     490             :            W += 64; if( block_rem ) block_rem--;
     491             : 
     492             :          Since we do not load anything at W(lane) above unless
     493             :          block_rem(lane) is non-zero, we can omit vector conditional
     494             :          operations for W(lane) below. */
     495             : 
     496       44270 :       W_lo = wwv_add_if( active_lane_lo, W_lo, wwv_64, W_lo );
     497       44270 :       W_hi = wwv_add_if( active_lane_hi, W_hi, wwv_64, W_hi );
     498       44270 :       off  = wwu_add_if( active_lane,    off,  wwu_64, off  );
     499             : 
     500       44270 :       block_rem_lo = wwv_sub_if( active_lane_lo, block_rem_lo, one, block_rem_lo );
     501       44270 :       block_rem_hi = wwv_sub_if( active_lane_hi, block_rem_hi, one, block_rem_hi );
     502             : 
     503 >1844*10^16 :     } else { /* LtHash mode */
     504             : 
     505             :       /* d[i] contains output_off+(i*4) 32-bit words across output[0..8] */
     506 >1844*10^16 :       FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: expand lanes" ));
     507 >1844*10^16 :       wwu_t d[ 16 ] = {
     508 >1844*10^16 :         wwu_xor( v[0x0], v[0x8] ),
     509 >1844*10^16 :         wwu_xor( v[0x1], v[0x9] ),
     510 >1844*10^16 :         wwu_xor( v[0x2], v[0xa] ),
     511 >1844*10^16 :         wwu_xor( v[0x3], v[0xb] ),
     512 >1844*10^16 :         wwu_xor( v[0x4], v[0xc] ),
     513 >1844*10^16 :         wwu_xor( v[0x5], v[0xd] ),
     514 >1844*10^16 :         wwu_xor( v[0x6], v[0xe] ),
     515 >1844*10^16 :         wwu_xor( v[0x7], v[0xf] ),
     516 >1844*10^16 :         wwu_xor( h0,     v[0x8] ),
     517 >1844*10^16 :         wwu_xor( h1,     v[0x9] ),
     518 >1844*10^16 :         wwu_xor( h2,     v[0xa] ),
     519 >1844*10^16 :         wwu_xor( h3,     v[0xb] ),
     520 >1844*10^16 :         wwu_xor( h4,     v[0xc] ),
     521 >1844*10^16 :         wwu_xor( h5,     v[0xd] ),
     522 >1844*10^16 :         wwu_xor( h6,     v[0xe] ),
     523 >1844*10^16 :         wwu_xor( h7,     v[0xf] )
     524 >1844*10^16 :       };
     525             : 
     526             :       /* Transpose each 8x8 block */
     527 >1844*10^16 :       wwu_transpose_16x16( d[0x0], d[0x1], d[0x2], d[0x3], d[0x4], d[0x5], d[0x6], d[0x7],
     528 >1844*10^16 :                            d[0x8], d[0x9], d[0xa], d[0xb], d[0xc], d[0xd], d[0xe], d[0xf],
     529 >1844*10^16 :                            d[0x0], d[0x1], d[0x2], d[0x3], d[0x4], d[0x5], d[0x6], d[0x7],
     530 >1844*10^16 :                            d[0x8], d[0x9], d[0xa], d[0xb], d[0xc], d[0xd], d[0xe], d[0xf] );
     531             : 
     532             :       /* Reduce-add into d[0] */
     533 >1844*10^16 :       d[0x0] = wwh_add( d[0x0], d[0x1] ); /* sum(l[0 1]) */
     534 >1844*10^16 :       d[0x2] = wwh_add( d[0x2], d[0x3] ); /* sum(l[2 3]) */
     535 >1844*10^16 :       d[0x4] = wwh_add( d[0x4], d[0x5] ); /* sum(l[4 5]) */
     536 >1844*10^16 :       d[0x6] = wwh_add( d[0x6], d[0x7] ); /* sum(l[6 7]) */
     537 >1844*10^16 :       d[0x8] = wwh_add( d[0x8], d[0x9] ); /* sum(l[8 9]) */
     538 >1844*10^16 :       d[0xa] = wwh_add( d[0xa], d[0xb] ); /* sum(l[a b]) */
     539 >1844*10^16 :       d[0xc] = wwh_add( d[0xc], d[0xd] ); /* sum(l[c d]) */
     540 >1844*10^16 :       d[0xe] = wwh_add( d[0xe], d[0xf] ); /* sum(l[e f]) */
     541 >1844*10^16 :       d[0x0] = wwh_add( d[0x0], d[0x2] ); /* sum(l[0 1 2 3]) */
     542 >1844*10^16 :       d[0x4] = wwh_add( d[0x4], d[0x6] ); /* sum(l[4 5 6 7]) */
     543 >1844*10^16 :       d[0x8] = wwh_add( d[0x8], d[0xa] ); /* sum(l[8 9 a b]) */
     544 >1844*10^16 :       d[0xc] = wwh_add( d[0xc], d[0xe] ); /* sum(l[c d e f]) */
     545 >1844*10^16 :       d[0x0] = wwh_add( d[0x0], d[0x4] ); /* sum(l[0 1 2 3 4 5 6 7]) */
     546 >1844*10^16 :       d[0x8] = wwh_add( d[0x8], d[0xc] ); /* sum(l[8 9 a b c d e f]) */
     547 >1844*10^16 :       d[0x0] = wwh_add( d[0x0], d[0x8] ); /* sum(l[0 1 2 3 4 5 6 7 8 9 a b c d e f]) */
     548 >1844*10^16 :       wwh_st( lthash, d[0x0] );
     549             : 
     550             :       /* Wind up for next iteration */
     551 >1844*10^16 :       lthash += 32; /* 64 byte stride */
     552 >1844*10^16 :       lthash_rem--;
     553 >1844*10^16 :       wwu_t ctr_add   = wwu_bcast( 1 );
     554 >1844*10^16 :       /**/  ctr_lo    = wwu_add( ctr_lo, ctr_add );
     555 >1844*10^16 :       int   ctr_carry = wwi_gt ( wwu_xor( ctr_add, wwu_bcast( 0x80000000 ) ),
     556 >1844*10^16 :                                  wwu_xor( ctr_lo,  wwu_bcast( 0x80000000 ) ) );
     557 >1844*10^16 :       /**/  ctr_hi    = wwu_add_if( ctr_carry, ctr_hi, wwu_one(), ctr_hi );
     558 >1844*10^16 :       if( FD_UNLIKELY( !lthash_rem ) ) {
     559           0 :         FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done (lthash para)" ));
     560           0 :         return;
     561           0 :       }
     562 >1844*10^16 :       goto compress;
     563             : 
     564 >1844*10^16 : #   undef STATE_FMT
     565 >1844*10^16 : #   undef STATE_FMT_ARGS
     566 >1844*10^16 :     }
     567       44157 :   }
     568             : 
     569             :   /* Store the results */
     570             : 
     571       17443 :   wwu_t o0; wwu_t o1; wwu_t o2; wwu_t o3; wwu_t o4; wwu_t o5; wwu_t o6; wwu_t o7;
     572       17443 :   wwu_t o8; wwu_t o9; wwu_t oA; wwu_t oB; wwu_t oC; wwu_t oD; wwu_t oE; wwu_t oF;
     573             : 
     574       17443 :   wwu_transpose_16x16( h0,   h1,   h2,   h3,   h4,   h5,   h6,   h7,
     575       17443 :                        hu[0],hu[1],hu[2],hu[3],hu[4],hu[5],hu[6],hu[7],
     576       17443 :                        o0,   o1,   o2,   o3,   o4,   o5,   o6,   o7,
     577       17443 :                        o8,   o9,   oA,   oB,   oC,   oD,   oE,   oF );
     578             : 
     579       17443 :   uint * const * batch_hash = (uint * const *)_batch_hash;
     580       17443 :   if( FD_LIKELY( out_sz==32 ) ) {
     581        8984 :     switch( batch_cnt ) { /* application dependent prob */
     582        1794 :     case 16UL: wu_stu( batch_hash[15], _mm512_castsi512_si256( oF ) ); __attribute__((fallthrough));
     583        1794 :     case 15UL: wu_stu( batch_hash[14], _mm512_castsi512_si256( oE ) ); __attribute__((fallthrough));
     584        1794 :     case 14UL: wu_stu( batch_hash[13], _mm512_castsi512_si256( oD ) ); __attribute__((fallthrough));
     585        1794 :     case 13UL: wu_stu( batch_hash[12], _mm512_castsi512_si256( oC ) ); __attribute__((fallthrough));
     586        1794 :     case 12UL: wu_stu( batch_hash[11], _mm512_castsi512_si256( oB ) ); __attribute__((fallthrough));
     587        1794 :     case 11UL: wu_stu( batch_hash[10], _mm512_castsi512_si256( oA ) ); __attribute__((fallthrough));
     588        2392 :     case 10UL: wu_stu( batch_hash[ 9], _mm512_castsi512_si256( o9 ) ); __attribute__((fallthrough));
     589        2392 :     case  9UL: wu_stu( batch_hash[ 8], _mm512_castsi512_si256( o8 ) ); __attribute__((fallthrough));
     590        2994 :     case  8UL: wu_stu( batch_hash[ 7], _mm512_castsi512_si256( o7 ) ); __attribute__((fallthrough));
     591        2994 :     case  7UL: wu_stu( batch_hash[ 6], _mm512_castsi512_si256( o6 ) ); __attribute__((fallthrough));
     592        3591 :     case  6UL: wu_stu( batch_hash[ 5], _mm512_castsi512_si256( o5 ) ); __attribute__((fallthrough));
     593        4787 :     case  5UL: wu_stu( batch_hash[ 4], _mm512_castsi512_si256( o4 ) ); __attribute__((fallthrough));
     594        5990 :     case  4UL: wu_stu( batch_hash[ 3], _mm512_castsi512_si256( o3 ) ); __attribute__((fallthrough));
     595        6586 :     case  3UL: wu_stu( batch_hash[ 2], _mm512_castsi512_si256( o2 ) ); __attribute__((fallthrough));
     596        8985 :     case  2UL: wu_stu( batch_hash[ 1], _mm512_castsi512_si256( o1 ) ); __attribute__((fallthrough));
     597        8985 :     case  1UL: wu_stu( batch_hash[ 0], _mm512_castsi512_si256( o0 ) ); __attribute__((fallthrough));
     598        8985 :     default: break;
     599        8984 :     }
     600        8984 :     FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done" ));
     601        8984 :   } else if( out_sz==64 ) {
     602        8411 :     switch( batch_cnt ) { /* application dependent prob */
     603        8412 :     case 16UL: wwu_stu( batch_hash[15], oF ); __attribute__((fallthrough));
     604        8412 :     case 15UL: wwu_stu( batch_hash[14], oE ); __attribute__((fallthrough));
     605        8412 :     case 14UL: wwu_stu( batch_hash[13], oD ); __attribute__((fallthrough));
     606        8412 :     case 13UL: wwu_stu( batch_hash[12], oC ); __attribute__((fallthrough));
     607        8412 :     case 12UL: wwu_stu( batch_hash[11], oB ); __attribute__((fallthrough));
     608        8412 :     case 11UL: wwu_stu( batch_hash[10], oA ); __attribute__((fallthrough));
     609        8412 :     case 10UL: wwu_stu( batch_hash[ 9], o9 ); __attribute__((fallthrough));
     610        8412 :     case  9UL: wwu_stu( batch_hash[ 8], o8 ); __attribute__((fallthrough));
     611        8412 :     case  8UL: wwu_stu( batch_hash[ 7], o7 ); __attribute__((fallthrough));
     612        8412 :     case  7UL: wwu_stu( batch_hash[ 6], o6 ); __attribute__((fallthrough));
     613        8412 :     case  6UL: wwu_stu( batch_hash[ 5], o5 ); __attribute__((fallthrough));
     614        8412 :     case  5UL: wwu_stu( batch_hash[ 4], o4 ); __attribute__((fallthrough));
     615        8412 :     case  4UL: wwu_stu( batch_hash[ 3], o3 ); __attribute__((fallthrough));
     616        8412 :     case  3UL: wwu_stu( batch_hash[ 2], o2 ); __attribute__((fallthrough));
     617        8412 :     case  2UL: wwu_stu( batch_hash[ 1], o1 ); __attribute__((fallthrough));
     618        8412 :     case  1UL: wwu_stu( batch_hash[ 0], o0 ); __attribute__((fallthrough));
     619        8412 :     default: break;
     620        8411 :     }
     621        8413 :     FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done (out_sz=64)" ));
     622        8413 :   } else {
     623          48 :     FD_LOG_ERR(( "Invalid out_sz %u", out_sz ));
     624          48 :   }
     625       17443 : }
     626             : 
void
fd_blake3_avx512_compress16_fast( uchar const * restrict msg,
                                  uchar       * restrict out,
                                  ulong                  counter,
                                  uchar                  flags ) {
  /* Hash 16 equally sized, contiguous inputs in parallel, one input
     per 32-bit lane of the AVX-512 registers.  If the PARENT flag is
     set, each input is a pair of child chaining values
     (2^(FD_BLAKE3_OUTCHAIN_LG_SZ+1) bytes); otherwise each input is a
     full leaf chunk (2^FD_BLAKE3_CHUNK_LG_SZ bytes).  msg points to
     the 16 back-to-back inputs; out receives 16 back-to-back 32 byte
     chaining values (one per input, FD_BLAKE3_OUTCHAIN_LG_SZ stride).
     counter is the chunk counter of input 0; lanes 1..15 use
     counter+1..counter+15 when hashing leaves (parents use counter
     unmodified, which callers presumably pass as 0 -- TODO confirm). */

  FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16_fast(msg=%p,out=%p,counter=%lu,flags=%02x)", (void *)msg, (void *)out, counter, flags ));

  int   parent = flags & FD_BLAKE3_FLAG_PARENT;
  int   lg_sz  = fd_int_if( parent, FD_BLAKE3_OUTCHAIN_LG_SZ+1, FD_BLAKE3_CHUNK_LG_SZ );
  ulong sz     = 1UL<<lg_sz;  /* per-lane input size in bytes */

  /* counters stay the same for each block.  Across chunks, they
     increment if we are hashing leaves.  Otherwise, they are zero. */

  /* ctr_add is {0,1,...,15} for leaves (lane i hashes chunk
     counter+i), all zero for parents (the AND with 0 masks it off). */
  wwu_t ctr_add   = wwu_and( wwu_bcast( parent ? 0 : UINT_MAX ),
                             wwu( 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
                                  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf ) );
  /* 64-bit per-lane counter split into 32-bit halves.  The low half is
     counter's low 32 bits plus the lane offset; the carry out of that
     add is detected with an unsigned compare emulated via signed
     compare after flipping the sign bit of both operands (carry iff
     ctr_add > ctr_lo unsigned, i.e. the sum wrapped). */
  wwu_t ctr_lo    = wwu_add( wwu_bcast( counter ), ctr_add );
  int   ctr_carry = wwi_gt ( wwu_xor( ctr_add, wwu_bcast( 0x80000000 ) ),
                             wwu_xor( ctr_lo,  wwu_bcast( 0x80000000 ) ) );
  wwu_t ctr_hi    = wwu_add_if( ctr_carry, wwu_bcast( counter>>32 ), wwu_one(), wwu_bcast( counter>>32 ) );
  wwu_t sz_vec    = wwu_bcast( FD_BLAKE3_BLOCK_SZ );  /* every block compressed here is full sized */

  wwu_t const iv0 = wwu_bcast( FD_BLAKE3_IV[0] );
  wwu_t const iv1 = wwu_bcast( FD_BLAKE3_IV[1] );
  wwu_t const iv2 = wwu_bcast( FD_BLAKE3_IV[2] );
  wwu_t const iv3 = wwu_bcast( FD_BLAKE3_IV[3] );
  wwu_t const iv4 = wwu_bcast( FD_BLAKE3_IV[4] );
  wwu_t const iv5 = wwu_bcast( FD_BLAKE3_IV[5] );
  wwu_t const iv6 = wwu_bcast( FD_BLAKE3_IV[6] );
  wwu_t const iv7 = wwu_bcast( FD_BLAKE3_IV[7] );

  /* h0..h7: running chaining value for each lane, seeded with the IV
     (fresh chunk / parent node, no keying). */
  wwu_t h0=iv0; wwu_t h1=iv1; wwu_t h2=iv2; wwu_t h3=iv3;
  wwu_t h4=iv4; wwu_t h5=iv5; wwu_t h6=iv6; wwu_t h7=iv7;

  /* Compress the input 64 bytes (one BLAKE3 block) at a time.  All
     lanes advance in lockstep since all inputs are the same size. */
  ulong off = 0UL;
  do {
    ulong const off_next = off+FD_BLAKE3_BLOCK_SZ;
    /* CHUNK_START on the first block and CHUNK_END on the last block
       of a chunk; parents never carry the chunk flags. */
    int chunk_flags =
        ( off     ==0UL ? FD_BLAKE3_FLAG_CHUNK_START : 0 ) |
        ( off_next==sz  ? FD_BLAKE3_FLAG_CHUNK_END   : 0 );
    int flags_ = flags | fd_int_if( parent, 0, chunk_flags );
    wwu_t flags_vec = wwu_bcast( flags_ );

    /* Load the current 64 byte block of each lane.  m[i] initially
       holds lane i's block contiguously; the 16x16 transpose below
       re-slices so m[i] holds message word i of every lane, the layout
       round_fn16 expects. */
    wwu_t m[16];
    m[0x0] = wwu_ldu( msg + (0x0<<lg_sz) + off );
    m[0x1] = wwu_ldu( msg + (0x1<<lg_sz) + off );
    m[0x2] = wwu_ldu( msg + (0x2<<lg_sz) + off );
    m[0x3] = wwu_ldu( msg + (0x3<<lg_sz) + off );
    m[0x4] = wwu_ldu( msg + (0x4<<lg_sz) + off );
    m[0x5] = wwu_ldu( msg + (0x5<<lg_sz) + off );
    m[0x6] = wwu_ldu( msg + (0x6<<lg_sz) + off );
    m[0x7] = wwu_ldu( msg + (0x7<<lg_sz) + off );
    m[0x8] = wwu_ldu( msg + (0x8<<lg_sz) + off );
    m[0x9] = wwu_ldu( msg + (0x9<<lg_sz) + off );
    m[0xa] = wwu_ldu( msg + (0xa<<lg_sz) + off );
    m[0xb] = wwu_ldu( msg + (0xb<<lg_sz) + off );
    m[0xc] = wwu_ldu( msg + (0xc<<lg_sz) + off );
    m[0xd] = wwu_ldu( msg + (0xd<<lg_sz) + off );
    m[0xe] = wwu_ldu( msg + (0xe<<lg_sz) + off );
    m[0xf] = wwu_ldu( msg + (0xf<<lg_sz) + off );

    wwu_transpose_16x16( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
                         m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );

    /* Initialize the 16 word compression state: chaining value, first
       four IV words, then counter lo/hi, block size and flags. */
    wwu_t v[16] = {
        h0,     h1,     h2,     h3,
        h4,     h5,     h6,     h7,
        iv0,    iv1,    iv2,    iv3,
        ctr_lo, ctr_hi, sz_vec, flags_vec,
    };

    /* The 7 BLAKE3 rounds */
    round_fn16( v, m, 0 );
    round_fn16( v, m, 1 );
    round_fn16( v, m, 2 );
    round_fn16( v, m, 3 );
    round_fn16( v, m, 4 );
    round_fn16( v, m, 5 );
    round_fn16( v, m, 6 );

    /* Output truncation: next chaining value is the XOR of the two
       state halves (only the first 8 of the 16 output words are kept
       since only 32 byte chaining values are produced here). */
    h0 = wwu_xor( v[ 0], v[ 8] );
    h1 = wwu_xor( v[ 1], v[ 9] );
    h2 = wwu_xor( v[ 2], v[10] );
    h3 = wwu_xor( v[ 3], v[11] );
    h4 = wwu_xor( v[ 4], v[12] );
    h5 = wwu_xor( v[ 5], v[13] );
    h6 = wwu_xor( v[ 6], v[14] );
    h7 = wwu_xor( v[ 7], v[15] );

    off = off_next;
  } while( off!=sz );

  /* Transpose the word-sliced chaining values back to per-lane layout.
     The upper 8 input rows are zero, so only the low 256 bits (8 words
     = 32 bytes) of each output register are meaningful. */
  wwu_t o0; wwu_t o1; wwu_t o2; wwu_t o3; wwu_t o4; wwu_t o5; wwu_t o6; wwu_t o7;
  wwu_t o8; wwu_t o9; wwu_t oA; wwu_t oB; wwu_t oC; wwu_t oD; wwu_t oE; wwu_t oF;

  wwu_t zero = wwu_zero();
  wwu_transpose_16x16( h0,   h1,   h2,   h3,   h4,   h5,   h6,   h7,
                       zero, zero, zero, zero, zero, zero, zero, zero,
                       o0,   o1,   o2,   o3,   o4,   o5,   o6,   o7,
                       o8,   o9,   oA,   oB,   oC,   oD,   oE,   oF );

  /* Store lane i's 32 byte chaining value at out + i*32 (low 256 bits
     of each 512 bit register). */
  wb_st( out + (0x0UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o0 ) );
  wb_st( out + (0x1UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o1 ) );
  wb_st( out + (0x2UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o2 ) );
  wb_st( out + (0x3UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o3 ) );
  wb_st( out + (0x4UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o4 ) );
  wb_st( out + (0x5UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o5 ) );
  wb_st( out + (0x6UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o6 ) );
  wb_st( out + (0x7UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o7 ) );
  wb_st( out + (0x8UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o8 ) );
  wb_st( out + (0x9UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o9 ) );
  wb_st( out + (0xaUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oA ) );
  wb_st( out + (0xbUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oB ) );
  wb_st( out + (0xcUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oC ) );
  wb_st( out + (0xdUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oD ) );
  wb_st( out + (0xeUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oE ) );
  wb_st( out + (0xfUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oF ) );
}

Generated by: LCOV version 1.14