LCOV - code coverage report
Current view: top level - ballet/blake3 - fd_blake3_sse41.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 234 242 96.7 %
Date: 2026-03-19 18:19:27 Functions: 9 9 100.0 %

          Line data    Source code
       1             : 
       2             : // Source originally from https://github.com/BLAKE3-team/BLAKE3
       3             : // From commit: c0ea395cf91d242f078c23d5f8d87eb9dd5f7b78
       4             : 
       5             : #include "fd_blake3.h"
       6             : #include "fd_blake3_private.h"
       7             : #include "../../util/simd/fd_sse.h"
       8             : #include <assert.h>
       9             : 
      10             : #define _mm_shuffle_ps2(a, b, c)                                       \
      11       68528 :   (_mm_castps_si128(                                                   \
      12       68528 :       _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) /* integer-typed wrapper: casts a and b to float vectors, runs _mm_shuffle_ps with control c, casts the result back */
      13             : 
      14       60018 : #define vu_rot16 vb_exch_adj_pair /* alias used by g1 for its first rotation; presumably a 16-bit rotate of each 32-bit lane -- TODO confirm against vb_exch_adj_pair */
      15             : 
      16             : static inline __attribute__((always_inline)) vu_t /* vu_rot12: combine x>>12 with x<<(32-12) */
      17       60018 : vu_rot12( vu_t x ) {
      18       60018 :   return vu_xor( vu_shr( x, 12 ), vu_shl( x, 32-12 ) ); /* a right-rotate by 12 of each 32-bit lane, assuming vu_shr/vu_shl are per-lane logical shifts */
      19       60018 : }
      20             : 
      21             : static inline __attribute__((always_inline)) vu_t /* vu_rot8: byte shuffle that cycles the bytes inside each 4-byte group */
      22       60018 : vu_rot8( vu_t x ) {
      23       60018 :   vb_t const mask = vb( 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 ); /* per group: output byte i takes input byte (i+1)%4 (assumes vb() lists bytes from index 0 up) */
      24       60018 :   return _mm_shuffle_epi8( x, mask ); /* on little-endian 32-bit lanes this is a right-rotate by 8 */
      25       60018 : }
      26             : 
      27             : static inline __attribute__((always_inline)) vu_t /* vu_rot7: combine x>>7 with x<<(32-7) */
      28       60018 : vu_rot7( vu_t x ) {
      29       60018 :   return vu_xor( vu_shr( x, 7 ), vu_shl( x, 32-7 ) ); /* a right-rotate by 7 of each 32-bit lane, assuming vu_shr/vu_shl are per-lane logical shifts */
      30       60018 : }
      31             : 
      32             : static inline __attribute__((always_inline)) void /* g1: first half of a mixing step; updates all four rows in place using message vector m */
      33             : g1( vu_t * row0,
      34             :     vu_t * row1,
      35             :     vu_t * row2,
      36             :     vu_t * row3,
      37       60018 :     vu_t   m ) {
      38       60018 :   *row0 = vu_add(vu_add(*row0, m), *row1); /* row0 += m + row1 */
      39       60018 :   *row3 = vu_xor(*row3, *row0);            /* row3 ^= row0 */
      40       60018 :   *row3 = vu_rot16(*row3);                 /* then rotated via vu_rot16 */
      41       60018 :   *row2 = vu_add(*row2, *row3);            /* row2 += row3 */
      42       60018 :   *row1 = vu_xor(*row1, *row2);            /* row1 ^= row2 */
      43       60018 :   *row1 = vu_rot12(*row1);                 /* then rotated via vu_rot12 */
      44       60018 : }
      45             : 
      46             : static inline __attribute__((always_inline)) void /* g2: second half of a mixing step; same shape as g1 but uses vu_rot8 and vu_rot7 */
      47             : g2( vu_t * row0,
      48             :     vu_t * row1,
      49             :     vu_t * row2,
      50             :     vu_t * row3,
      51       60018 :     vu_t   m ) {
      52       60018 :   *row0 = vu_add(vu_add(*row0, m), *row1); /* row0 += m + row1 */
      53       60018 :   *row3 = vu_xor(*row3, *row0);            /* row3 ^= row0 */
      54       60018 :   *row3 = vu_rot8(*row3);                  /* then rotated via vu_rot8 */
      55       60018 :   *row2 = vu_add(*row2, *row3);            /* row2 += row3 */
      56       60018 :   *row1 = vu_xor(*row1, *row2);            /* row1 ^= row2 */
      57       60018 :   *row1 = vu_rot7(*row1);                  /* then rotated via vu_rot7 */
      58       60018 : }
      59             : 
      60             : // Note the optimization here of leaving row1 as the unrotated row, rather than
      61             : // row0. All the message loads below are adjusted to compensate for this. See
      62             : // discussion at https://github.com/sneves/blake2-avx2/pull/4
      63             : static inline __attribute__((always_inline)) void /* diagonalize: permutes the 32-bit lanes of rows 0, 2 and 3 in place; row1 is not passed and is left as is */
      64       30009 : diagonalize(vu_t *row0, vu_t *row2, vu_t *row3) {
      65       30009 :   *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); /* new lane i = old lane (i+3)%4 */
      66       30009 :   *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); /* new lane i = old lane (i+2)%4, i.e. swap the two 64-bit halves */
      67       30009 :   *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); /* new lane i = old lane (i+1)%4 */
      68       30009 : }
      69             : 
      70             : static inline __attribute__((always_inline)) void /* undiagonalize: inverse lane permutation of diagonalize for rows 0, 2 and 3 */
      71       30009 : undiagonalize(vu_t *row0, vu_t *row2, vu_t *row3) {
      72       30009 :   *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); /* new lane i = old lane (i+1)%4, undoing the (i+3)%4 shuffle */
      73       30009 :   *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); /* half swap is its own inverse */
      74       30009 :   *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); /* new lane i = old lane (i+3)%4, undoing the (i+1)%4 shuffle */
      75       30009 : }
      76             : 
      77             : static inline __attribute__((always_inline)) void /* compress_pre: builds the 4-row state from cv, the IV, counter, block_len and flags, then runs seven rounds over one block; the result is left in rows[] */
      78             : compress_pre( vu_t        rows[4],
      79             :               uint  const cv[ static 8 ],
      80             :               uchar const block[ static FD_BLAKE3_BLOCK_SZ ],
      81             :               uint        block_len,
      82             :               ulong       ctr,
      83        4283 :               uint        flags ) {
      84        4283 :   rows[0] = vu_ld( cv   ); /* rows 0 and 1 come from the eight cv words */
      85        4283 :   rows[1] = vu_ld( cv+4 );
      86        4283 :   rows[2] = vu( FD_BLAKE3_IV[0], FD_BLAKE3_IV[1], FD_BLAKE3_IV[2], FD_BLAKE3_IV[3] ); /* row 2 is the first four IV words */
      87        4283 :   rows[3] = vu( (uint)(ctr&UINT_MAX), (uint)(ctr>>32), /* row 3: low and high 32 bits of ctr, then block_len and flags */
      88        4283 :                 block_len,            flags );
      89             : 
      90        4283 :   vu_t m0 = vb_ldu( block    ); vu_t m1 = vb_ldu( block+16 ); /* the block is read as four 16-byte vectors at offsets 0, 16, 32, 48 */
      91        4283 :   vu_t m2 = vb_ldu( block+32 ); vu_t m3 = vb_ldu( block+48 );
      92             : 
      93        4283 :   vu_t t0, t1, t2, t3, tt; /* t0..t3: the four message vectors fed to g1/g2 within a round; tt: scratch */
      94             : 
      95             :   // Round 1. The first round permutes the message words from the original
      96             :   // input order, into the groups that get mixed in parallel.
      97        4283 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
      98        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
      99        4283 :   t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
     100        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     101        4283 :   diagonalize(&rows[0], &rows[2], &rows[3]); /* second g1/g2 pair of the round runs on the lane-permuted rows */
     102        4283 :   t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
     103        4283 :   t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
     104        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     105        4283 :   t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
     106        4283 :   t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
     107        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     108        4283 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     109        4283 :   m0 = t0; /* this round's message vectors become the input to the next round's permutation */
     110        4283 :   m1 = t1;
     111        4283 :   m2 = t2;
     112        4283 :   m3 = t3;
     113             : 
     114             :   // Round 2. This round and all following rounds apply a fixed permutation
     115             :   // to the message words from the round before.
     116        4283 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     117        4283 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     118        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     119        4283 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     120        4283 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     121        4283 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     122        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     123        4283 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     124        4283 :   t2 = _mm_unpacklo_epi64(m3, m1);
     125        4283 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     126        4283 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     127        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     128        4283 :   t3 = _mm_unpackhi_epi32(m1, m3);
     129        4283 :   tt = _mm_unpacklo_epi32(m2, t3);
     130        4283 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     131        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     132        4283 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     133        4283 :   m0 = t0;
     134        4283 :   m1 = t1;
     135        4283 :   m2 = t2;
     136        4283 :   m3 = t3;
     137             : 
     138             :   // Round 3
     139        4283 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); /* rounds 3 through 7 repeat the round 2 statement sequence verbatim */
     140        4283 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     141        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     142        4283 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     143        4283 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     144        4283 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     145        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     146        4283 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     147        4283 :   t2 = _mm_unpacklo_epi64(m3, m1);
     148        4283 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     149        4283 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     150        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     151        4283 :   t3 = _mm_unpackhi_epi32(m1, m3);
     152        4283 :   tt = _mm_unpacklo_epi32(m2, t3);
     153        4283 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     154        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     155        4283 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     156        4283 :   m0 = t0;
     157        4283 :   m1 = t1;
     158        4283 :   m2 = t2;
     159        4283 :   m3 = t3;
     160             : 
     161             :   // Round 4
     162        4283 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     163        4283 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     164        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     165        4283 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     166        4283 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     167        4283 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     168        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     169        4283 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     170        4283 :   t2 = _mm_unpacklo_epi64(m3, m1);
     171        4283 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     172        4283 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     173        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     174        4283 :   t3 = _mm_unpackhi_epi32(m1, m3);
     175        4283 :   tt = _mm_unpacklo_epi32(m2, t3);
     176        4283 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     177        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     178        4283 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     179        4283 :   m0 = t0;
     180        4283 :   m1 = t1;
     181        4283 :   m2 = t2;
     182        4283 :   m3 = t3;
     183             : 
     184             :   // Round 5
     185        4283 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     186        4283 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     187        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     188        4283 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     189        4283 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     190        4283 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     191        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     192        4283 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     193        4283 :   t2 = _mm_unpacklo_epi64(m3, m1);
     194        4283 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     195        4283 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     196        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     197        4283 :   t3 = _mm_unpackhi_epi32(m1, m3);
     198        4283 :   tt = _mm_unpacklo_epi32(m2, t3);
     199        4283 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     200        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     201        4283 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     202        4283 :   m0 = t0;
     203        4283 :   m1 = t1;
     204        4283 :   m2 = t2;
     205        4283 :   m3 = t3;
     206             : 
     207             :   // Round 6
     208        4283 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     209        4283 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     210        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     211        4283 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     212        4283 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     213        4283 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     214        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     215        4283 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     216        4283 :   t2 = _mm_unpacklo_epi64(m3, m1);
     217        4283 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     218        4283 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     219        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     220        4283 :   t3 = _mm_unpackhi_epi32(m1, m3);
     221        4283 :   tt = _mm_unpacklo_epi32(m2, t3);
     222        4283 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     223        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     224        4283 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     225        4283 :   m0 = t0;
     226        4283 :   m1 = t1;
     227        4283 :   m2 = t2;
     228        4283 :   m3 = t3;
     229             : 
     230             :   // Round 7
     231        4283 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     232        4283 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     233        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     234        4283 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     235        4283 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     236        4283 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     237        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     238        4283 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     239        4283 :   t2 = _mm_unpacklo_epi64(m3, m1);
     240        4283 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     241        4283 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     242        4283 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     243        4283 :   t3 = _mm_unpackhi_epi32(m1, m3);
     244        4283 :   tt = _mm_unpacklo_epi32(m2, t3);
     245        4283 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     246        4283 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     247        4283 :   undiagonalize(&rows[0], &rows[2], &rows[3]); /* last round: no message hand-off; callers combine rows[] themselves */
     248        4283 : }
     249             : 
     250             : void /* fd_blake3_sse_compress1: compresses msg (at most FD_BLAKE3_CHUNK_SZ bytes) block by block and writes the 8-word chaining value to out; if out_chain is set, stops before the final block and hands back that block plus the current cv instead */
     251             : fd_blake3_sse_compress1( uchar * restrict       out,
     252             :                          uchar const * restrict msg,
     253             :                          uint                   msg_sz,
     254             :                          ulong                  counter,
     255             :                          uint const             flags,
     256             :                          uchar * restrict       out_chain,
     257        3658 :                          uchar const * restrict in_chain ) {
     258        3658 :   FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1(out=%p,msg=%p,sz=%u,counter=%lu,flags=%02x)",
     259        3658 :                     (void *)out, (void *)msg, msg_sz, counter, flags ));
     260        3658 :   assert( msg_sz<=FD_BLAKE3_CHUNK_SZ );
     261             : 
     262           0 :   uint cv[8] = { FD_BLAKE3_IV[0], FD_BLAKE3_IV[1], FD_BLAKE3_IV[2], FD_BLAKE3_IV[3], /* chaining value starts as the IV; NOTE(review): no explicit alignment attribute although vu_ld/vu_st are used on cv -- confirm those tolerate the default alignment */
     263        3658 :                  FD_BLAKE3_IV[4], FD_BLAKE3_IV[5], FD_BLAKE3_IV[6], FD_BLAKE3_IV[7] };
     264        3658 :   if( FD_UNLIKELY( in_chain ) ) {
     265           0 :     memcpy( cv, in_chain, FD_BLAKE3_OUTCHAIN_SZ ); /* a caller-supplied input chaining value replaces the IV */
     266           0 :   }
     267        3658 :   vu_t rows[4]; /* state rows filled in by compress_pre */
     268             : 
     269        3658 :   uint flag_mask = ~fd_uint_if( flags&FD_BLAKE3_FLAG_PARENT, /* with PARENT set, CHUNK_START and CHUNK_END are masked out of every block's flags; otherwise the mask is all ones (assumes fd_uint_if(c,t,f) selects t when c is nonzero) */
     270        3658 :                                 FD_BLAKE3_FLAG_CHUNK_START|FD_BLAKE3_FLAG_CHUNK_END,
     271        3658 :                                 0U );
     272             : 
     273        3658 :   uint block_flags = flags | (flag_mask & FD_BLAKE3_FLAG_CHUNK_START); /* first block gets CHUNK_START unless masked */
     274        3658 :   if( FD_UNLIKELY( in_chain && !(flags&FD_BLAKE3_FLAG_CHUNK_START) ) ) {
     275           0 :     block_flags &= ~FD_BLAKE3_FLAG_CHUNK_START; /* with in_chain, CHUNK_START is kept only if the caller passed it */
     276           0 :   }
     277        6094 :   do { /* do-while: the body runs once even when msg_sz is 0 */
     278        6094 :     uint block_sz = fd_uint_min( msg_sz, FD_BLAKE3_BLOCK_SZ );
     279        6094 :     block_flags |= FD_BLAKE3_FLAG_CHUNK_END; /* set tentatively; the next line clears CHUNK_END and ROOT unless this is the last block (msg_sz<=FD_BLAKE3_BLOCK_SZ), and applies flag_mask */
     280        6094 :     block_flags &= (flag_mask & ~fd_uint_if( msg_sz<=FD_BLAKE3_BLOCK_SZ, 0, (FD_BLAKE3_FLAG_CHUNK_END|FD_BLAKE3_FLAG_ROOT) ) );
     281             : 
     282        6094 :     uchar tail[ FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(16))); /* scratch for a zero-padded partial block */
     283        6094 :     uchar const * restrict block;
     284        6094 :     if( FD_LIKELY( msg_sz>=FD_BLAKE3_BLOCK_SZ ) ) {
     285        3631 :       block = msg; /* full block: use the caller's bytes in place */
     286        3631 :     } else {
     287        2463 :       vb_st( tail,    vu_zero() ); /* short block: zero tail, then copy the msg_sz remaining bytes over it */
     288        2463 :       vb_st( tail+16, vu_zero() );
     289        2463 :       vb_st( tail+32, vu_zero() );
     290        2463 :       vb_st( tail+48, vu_zero() );
     291        2463 :       fd_memcpy( tail, msg, msg_sz );
     292        2463 :       block = tail;
     293        2463 :     }
     294             : 
     295        6094 :     if( FD_UNLIKELY( out_chain && (block_flags & FD_BLAKE3_FLAG_CHUNK_END) ) ) {
     296             :       /* FIXME better document and polish the transition from the compress
     297             :                part to the expand part. */
     298        1810 :       fd_memcpy( out,       block, FD_BLAKE3_BLOCK_SZ    ); /* FIXME DOCUMENT OVERLOADING OF OUT ARGUMENT */
     299        1810 :       fd_memcpy( out_chain, cv,    FD_BLAKE3_OUTCHAIN_SZ ); /* cv as it stands before this block; the block itself is not compressed on this path */
     300        1810 :       FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1: done (XOF mode)" ));
     301        1810 :       return;
     302        1810 :     }
     303             : 
     304        4284 :     FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1: sz=%u counter=%u flags=%x", block_sz, (uint)counter, block_flags ));
     305        4284 :     compress_pre( rows, cv, block, block_sz, counter, block_flags );
     306        4284 :     if( FD_UNLIKELY( in_chain ) ) {
     307             :       /* FIXME UGLY */
     308           0 :       vu_stu( out+32, vu_xor( vu_ld( cv   ), rows[2] ) ); /* with in_chain, also writes (old cv ^ rows[2..3]) to out+32 and out+48; must happen before cv is overwritten below */
     309           0 :       vu_stu( out+48, vu_xor( vu_ld( cv+4 ), rows[3] ) );
     310           0 :     }
     311        4284 :     vu_st( cv,   vu_xor( rows[0], rows[2] ) ); /* new chaining value: rows[0]^rows[2] and rows[1]^rows[3] */
     312        4284 :     vu_st( cv+4, vu_xor( rows[1], rows[3] ) );
     313        4284 :     msg    += FD_BLAKE3_BLOCK_SZ;
     314        4284 :     msg_sz -= block_sz;
     315        4284 :     block_flags = flags; /* later blocks start from the caller's flags, without the CHUNK_START added for the first block */
     316        4284 :   } while( (int)msg_sz>0 );
     317             : 
     318        1848 :   vu_stu( out,    vu_ld( cv   ) ); /* final chaining value goes to out as two unaligned vector stores */
     319        1848 :   vu_stu( out+16, vu_ld( cv+4 ) );
     320             : 
     321        1848 :   FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1: done" ));
     322        1848 : }

Generated by: LCOV version 1.14