Line data Source code
1 : #ifndef HEADER_fd_src_disco_pack_fd_chkdup_h
2 : #define HEADER_fd_src_disco_pack_fd_chkdup_h
3 :
4 : #include "../../ballet/fd_ballet_base.h"
5 : #include "../../ballet/txn/fd_txn.h"
6 :
7 : /* fd_chkdup declares a set of functions for ultra-HPC checking if a
8 : list of account addresses contains any duplicates. It's important
9 : that this be fast, because a transaction containing duplicate account
10 : addresses fails to sanitize and is not charged a fee. Although this
11 : check can (and really ought) to be done in parallel, perhaps in the
12 : verify tiles, right now, it's done in pack, which means it's serial
13 : and on the critical path.
14 :
15 : On platforms with AVX, the current implementation uses a fast initial
16 : check which may have false positives (thinking there are duplicates
17 : when there aren't). Any transaction that fails the initial check is
18 : then subjected to the full, precise check. Without AVX, all
19 : transactions use the slow path. */
20 :
21 : /* The functions are defined in the header to facilitate inlining since
22 : they take 10s of cycles in the good case, but should probably be
23 : treated as if they were defined in a .c file. */
24 :
25 : #ifndef FD_CHKDUP_IMPL
26 : # if FD_HAS_AVX512
27 : # include "../../util/simd/fd_avx.h"
28 0 : # define FD_CHKDUP_IMPL 2
29 : # elif FD_HAS_AVX
30 : # define FD_CHKDUP_IMPL 1
31 : # else
32 : # define FD_CHKDUP_IMPL 0
33 : # endif
34 : #endif
35 :
36 :
37 : #if FD_CHKDUP_IMPL==2
38 : # define FD_CHKDUP_ALIGN ( 64UL)
39 : #elif FD_CHKDUP_IMPL==1
40 : # define FD_CHKDUP_ALIGN ( 32UL)
41 : #elif FD_CHKDUP_IMPL==0
42 : # define FD_CHKDUP_ALIGN ( 32UL)
43 : #else
44 : # error "Unrecognized value of FD_CHKDUP_IMPL"
45 : #endif
46 :
47 :
/* FD_CHKDUP_FOOTPRINT gives the scratch memory footprint: the entropy
   region (32*FD_CHKDUP_IMPL bytes, 0 bytes when no AVX) at
   FD_CHKDUP_ALIGN alignment, followed by the slow-path hash map
   (2^8 slots of 32 bytes each) at 32-byte alignment, with the total
   padded out to FD_CHKDUP_ALIGN. */
# define FD_CHKDUP_FOOTPRINT FD_LAYOUT_FINI( FD_LAYOUT_APPEND(           \
                                FD_LAYOUT_APPEND( FD_LAYOUT_INIT,        \
                                   FD_CHKDUP_ALIGN, 32*FD_CHKDUP_IMPL ), \
                                32UL,              (1UL<<8)*32UL      ), \
                             FD_CHKDUP_ALIGN )

/* The map has 2x as many slots as the max account addresses a
   transaction can reference, so the slow path never exceeds 50% load. */
FD_STATIC_ASSERT( (1UL<<8)==2*FD_TXN_ACCT_ADDR_MAX, "hash table size" );
55 :
56 : /* Fixed size (just over 8kB) and safe for declaration on the stack or
57 : inclusion in a struct. */
58 : struct fd_chkdup_private;
59 : typedef struct fd_chkdup_private fd_chkdup_t;
60 :
61 : FD_PROTOTYPES_BEGIN
62 : /* fd_chkdup_{footprint, align} return the footprint and alignment of
63 : the scratch memory that duplicate detection requires. */
static inline ulong fd_chkdup_footprint( void ) { return FD_CHKDUP_FOOTPRINT; } /* just over 8kB, see above */
static inline ulong fd_chkdup_align    ( void ) { return FD_CHKDUP_ALIGN;     } /* 64 with AVX-512, else 32 */
66 :
67 : /* fd_chkdup_new formats an appropriately sized region of memory for use
68 : in duplicate address detection. shmem must point to the first byte
69 : of a region of memory with the appropriate alignment and footprint.
70 : rng must be a pointer to a local join of an RNG. Some slots of the
71 : RNG will be consumed, but no interest in the RNG will be retained
72 : after the function returns. Returns shmem on success and NULL on
73 : failure (logs details). The only failure cases are if shmem is NULL
74 : or not aligned.
75 :
76 : fd_chkdup_join joins the caller to the formatted region of memory.
77 : Returns shmem.
78 :
79 : fd_chkdup_leave unjoins the caller to chkdup. Returns chkdup.
80 : fd_chkdup_delete unformats the region of memory. Returns a pointer
81 : to the unformatted memory region. */
82 : static inline void * fd_chkdup_new ( void * shmem, fd_rng_t * rng );
83 : static inline fd_chkdup_t * fd_chkdup_join ( void * shmem );
84 : static inline void * fd_chkdup_leave ( fd_chkdup_t * chkdup );
85 : static inline void * fd_chkdup_delete( void * shmem );
86 :
87 :
88 : /* fd_chkdup_check{,_slow,_fast} check a list of account addresses for
89 : any duplicate addresses, i.e. an account address that appears twice
90 : in the list. The list does not need to be sorted or have any
91 : particular order. The list may be decomposed into two sublists
92 : (list0 and list1) to facilitate 0-copy usage with address lookup
93 : tables, but list0 and list1 are logically concatenated prior to
94 : checking for duplicates.
95 :
96 : chkdup is a pointer to a valid local join of a chkdup object.
97 :
98 : list0 and list1 point to the first account address of the respective
99 : sublists. The memory they point to need not have any particular
100 : alignment. list0==NULL is okay only if list0_cnt==0, and similarly
101 : for list1. list0 is accessed with indices [0, list0_cnt) and list1
102 : is accessed with indices [0, list1_cnt). list0 and list1 must not
103 : overlap. Requires list0_cnt+list1_cnt<=128, and the function is
104 : somewhat tuned for smaller values.
105 :
106 : fd_chkdup_check and the _slow version return 1 if the list of
107 : transactions contains at least one duplicated account address and 0
108 : otherwise (implying each account address in the provided list is
109 : unique).
110 :
111 : fd_chkdup_check_fast returns 1 if the list of transactions contains
112 : at least one duplicated account address and typically returns 0 if
113 : each account address in the provided list is unique, but may
114 :    sometimes spuriously return 1 even without duplicates.
115 :
116 : WARNING: the _fast version MAY HAVE FALSE POSITIVES. You probably
117 : want the un-suffixed version, which is precise. It uses the fast
118 : version as a fast-path and then does a slower full check if the
119 : fast-path suggests there may be a duplicate.
120 :
121 : However, it's also worth calling out again that the _fast version
122 : only makes errors in one direction. If the list contains duplicates,
123 : it will definitely return 1. If it returns 0, the list definitely
124 : does not contain duplicates. (Those two statements are equivalent).
125 : */
126 : static inline int
127 : fd_chkdup_check ( fd_chkdup_t * chkdup,
128 : fd_acct_addr_t const * list0, ulong list0_cnt,
129 : fd_acct_addr_t const * list1, ulong list1_cnt );
130 : static inline int
131 : fd_chkdup_check_slow( fd_chkdup_t * chkdup,
132 : fd_acct_addr_t const * list0, ulong list0_cnt,
133 : fd_acct_addr_t const * list1, ulong list1_cnt );
134 : static inline int
135 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
136 : fd_acct_addr_t const * list0, ulong list0_cnt,
137 : fd_acct_addr_t const * list1, ulong list1_cnt );
138 :
139 :
140 : /* ----- Implementation details and discussion follow------
141 :
142 : The fast path implementation is somewhat interesting. The normal way
143 : to do this is with a Bloom filter, but Bloom filters do lots of
144 : unpredictable reads and the pack tile is somewhat cache sensitive.
145 : Instead, this implements a variant on a Bloom filter that lives
146 : entirely in AVX registers.
147 :
148 : Basically, we use C W-bit words stored in (C*W)/256 AVX2 registers or
149 : (C*W)/512 AVX512 registers. Each word is a modified Bloom filter
150 : with one associated hash function. For each account address, we
151 : compute C hashes, giving (up to) C positions across the AVX
152 : registers. If any of those positions have an unset bit, then we know
153 : we have not seen the account address before. Finally, we then set
154 : all the positions.
155 :
156 : The only difference between this idea and a normal Bloom filter is
157 : that sometimes the hash function may not select a bit. There's a
158 : tradeoff to be made here: suppose you insert R account addresses, and
159 : R is at least almost as large as W. Then each word fills up, and
160 : false positives become increasingly likely. Only testing and
161 : inserting a bit, say, half the time, effectively halves C, but
162 : prevents each word from getting saturated as quickly, and makes the
163 : algorithm effective for larger values of R. We use Intel's quirky
164 : behavior to get this for free.
165 :
166 : (Note: The R and C notation is supposed to suggest a tabular layout
167 : in which the account addresses are rows and the words are columns.)
168 :
169 : The key insight into making this work quickly is that the vpsllv{d,q}
170 : variable left shift instructions are cheap (1 cycle, can execute on
171 : port 0 or 1 for AVX2, still 1 cycle on port 0 for AVX512). If we can
172 : compute the hashes in parallel with SIMD, then we can variably shift
173 : bcast(0x1) quickly, and select several bits at a time. The rest is
174 : just bitwise logic, which is extremely cheap with AVX. This approach
175 : constrains W to be either 32 or 64, and C to be a multiple of the
176 : number of words in a vector, but those are both pretty acceptable
177 : constraints.
178 :
179 : The final ingredient is a cheap hash function that places the hashes
180 : in the appropriate position for vpsllv. We just xor the account
181 : address with some validator-specific entropy and use a mask to select
182 : certain bits.
183 :
184 : The slow-path implementation uses a hash table to check for
185 : duplicates. This is slower than sorting for transactions with only a
186 : few account addresses, but substantially faster than sorting for
187 : transactions with large numbers of account addresses, which is when
188 : the slow-path matters more anyway.
189 :
190 :
191 : You can see from above there are a variety of knobs to tune. Should
192 : W be 32 or 64? How big should C be? How many bits should we mask
193 : off for the hash, which controls the frequency with which we skip a
194 : word, neither checking nor inserting a bit? It would be good to have
195 : a rigorous understanding of the false positive rate as a function of
196 : these parameters so we can make a decision that minimizes the
197 : expected compute required.
198 :
199 : Unfortunately, the false positive computation is tricky. The key
200 : difficulty is that whether processing account address r results in a
201 : false positive is not independent from whether processing account
202 : address r+1 results in a false positive. This leads to an obnoxious
203 : inclusion-exclusion formula which quickly becomes more unwieldy than
204 : I (or Sage) can handle.
205 :
206 : A dynamic-programming-ish algorithm can compute the false positive
207 : rate in approximately O(R*2^R) time. To start, we just want to
208 : understand a single word/column. Suppose k bits in the word have
209 : been set. Then there are (W-k) hash values that set a new bit, and
210 :    (V+k) hash values that don't set a new bit, where V is the number of hash
211 : values that don't select a bit. The hash values that set a bit are
212 : also exactly the ones that provide information that the account
213 : address is not a false positive. I think about this as "spending a
214 : bit" to know it's not a false positive.
215 :
216 : Denoting the rows at which we spend a bit by a 1 and the rows at
217 : which we don't spend a bit by 0, we might get a column like:
218 :
219 : 1
220 : 0
221 : 1
222 : 1
223 : 0.
224 : The number of ways this column can occur is x1 =
225 : W*(V+1)*(W-1)*(W-2)*(V+3), which means that the probability the
226 : column occurs is x1/(W+V)^5. Expanding to multiple columns is easy,
227 : since the number of ways that two specific columns can occur is just
228 : the product of the number of ways each can occur. For example,
229 : 1 0
230 : 0 1
231 : 1 1
232 : 1 0
233 : 0 0
234 : can occur x1 * x2 ways, where x2 = V*W*(W-1)*(V+2)*(V+2).
235 : A false positive happens when there's a row of all 0s, as in the last
236 : row of the example.
237 :
238 : It's cleaner to count the number of ways not to get a false positive.
239 : This gives us the inclusion-exclusion formula:
240 : (all ways)
241 : - (all ways where row 0 is all 0s)
242 : - (all ways where row 1 is all 0s)
243 : - ...
244 : + (all ways where rows 0 and 1 are both all 0s)
245 : + (all ways where rows 0 and 2 are both all 0s)
246 : + ...
247 : - (all ways where rows 0, 1, and 2 are all 0s)
248 : - (all ways where rows 0, 1, and 3 are all 0s)
249 : +, - ...
250 : + (-1)^R (all ways in which all rows are all 0s).
251 :
252 : There's a nice way to understand each of these terms. For example,
253 : in the R=2, C=3 case, the term in which row 0 is all 0s has the
254 : following elements:
255 : 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
256 : 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 1 0 1 1 1 0 1 1 1
257 : Rather than enumerating all 2^( (R-1)*C ) elements, we'll represent
258 : it as
259 : ( 0 + 0 )^3
260 : ( 0 1 )
261 :
262 : Skipping some steps, the task boils down to counting the number of
263 : ways to get columns that match a mask, then raising that to the Cth
264 : power.
265 :
266 : Now, with all this behind us, our goal is to pick the optimal value
267 : of W,V, and C given a supposed distribution of transactions, and a
268 : performance model. Based on some back of the envelope calculations
269 : based on instruction throughput and latency measurements and
270 : confirmed by some experiments, the fast path code takes about 3.5*R
271 : cycles if using one AVX2 vector (W*C==256) and
272 : 2.5*R+2*ceil(W*C/256)*R cycles otherwise. The slow path takes about
273 : 133*J cycles. Then the expected value of the number of cycles it
274 : takes to process a transaction with R accounts is
275 : R*(2.5+2*ceil(W*C/256) - [W*C<=256]) + FP_{W,V,R,C}*133*R
276 :
277 : Based on a sample of 100,000 slots containing about 100M
278 : transactions, the CDF looks like
279 :
280 : Fraction of transactions containing <= 3 account addresses 71%
281 : <= 13 account addresses 80%
282 : <= 24 account addresses 91%
283 : <= 31 account addresses 95%
284 : <= 44 account addresses 98%
285 : <= 50 account addresses 99%
286 :
287 : Basically, there's a peak at 3 (votes), and then a very long, very
288 : fat tail. When using AVX2, it basically boils down into 2 regimes:
289 : 0 <= R < 28 W=32, C=8, V=0 (one AVX vector)
290 : 28 <= R <= 64 W=32, C=32, V=32 (four AVX vectors)
291 :
292 : This combination has an expected value of about 54 cycles over all
293 : transactions. For a typical transaction with 3 account addresses,
294 : this takes about 10 cycles and the false positive probability is
295 : about 2e-10.
296 :
297 : When using AVX512, the regimes are similar:
298 : 0 <= R < 36 W=32, C=16, V=0 (one AVX vector)
299 : 36 <= R <= 64 W=32, C=32, V=32 (two AVX vectors)
300 : This combination has an expected value of about 33 cycles over all
301 : transactions. Again, the typical 3 account address account takes
302 : about 9 cycles and has a negligible false positive probability. */
303 :
304 :
/* fd_chkdup_waddr_t wraps an account address so it can serve as the
   slot type of the slow-path hash map instantiated below. */
struct fd_chkdup_waddr {
  fd_acct_addr_t key; /* account address */
};
typedef struct fd_chkdup_waddr fd_chkdup_waddr_t;

/* The all-zero account address acts as the map's sentinel "null" key;
   fd_chkdup_check_slow handles it specially since it can't be inserted. */
static const fd_acct_addr_t chkdup_null_addr = {{ 0 }};
310 :
/* Instantiate fd_chkdup_pmap, a 2^8-slot hash map keyed by account
   address, used by the precise slow path.  Hashing uses only the first
   8 bytes of the address, which is fine since equality is still checked
   on the full 32 bytes (MAP_KEY_EQUAL_IS_SLOW tells the template the
   full compare is comparatively expensive). */
#define MAP_NAME              fd_chkdup_pmap
#define MAP_T                 fd_chkdup_waddr_t
#define MAP_KEY_T             fd_acct_addr_t
#define MAP_KEY_NULL          chkdup_null_addr
#define MAP_KEY_INVAL(k)      MAP_KEY_EQUAL(k, chkdup_null_addr)
#define MAP_KEY_EQUAL(k0,k1)  (!memcmp((k0).b,(k1).b, FD_TXN_ACCT_ADDR_SZ))
#define MAP_KEY_EQUAL_IS_SLOW 1
#define MAP_MEMOIZE           0
#define MAP_KEY_HASH(key)     ((uint)fd_ulong_hash( fd_ulong_load_8( (key).b ) ))
#define MAP_LG_SLOT_CNT       8
#include "../../util/tmpl/fd_map.c"
322 :
323 :
/* Scratch state for duplicate detection.  Fixed size (just over 8kB)
   and safe for declaration on the stack or inclusion in a struct. */
struct __attribute__((aligned(FD_CHKDUP_ALIGN))) fd_chkdup_private {
#if FD_CHKDUP_IMPL >= 1
  /* Validator-specific random bytes, xored with each account address
     before hashing (see the discussion above), presumably so addresses
     that collide for one validator don't collide for all of them. */
  uchar entropy[ 32*FD_CHKDUP_IMPL ];
#endif

  /* Backing store for the fd_chkdup_pmap used by the slow path */
  fd_chkdup_waddr_t hashmap[ 1UL<<8 ];
};
331 :
332 : static inline void *
333 : fd_chkdup_new( void * shmem,
334 0 : fd_rng_t * rng ) {
335 0 : fd_chkdup_t * chkdup = (fd_chkdup_t *)shmem;
336 0 : #if FD_CHKDUP_IMPL >= 1
337 0 : for( ulong i=0UL; i<32*FD_CHKDUP_IMPL; i++ ) chkdup->entropy[ i ] = fd_rng_uchar( rng );
338 : #else
339 : (void)rng;
340 : #endif
341 0 : FD_TEST( fd_chkdup_pmap_footprint()==sizeof(chkdup->hashmap) ); /* Known at compile time */
342 :
343 0 : fd_chkdup_pmap_new( chkdup->hashmap );
344 0 : return chkdup;
345 0 : }
346 :
/* join/leave/delete are trivial: the formatted region is directly
   usable as the local handle and unformatting requires no work. */
static inline fd_chkdup_t * fd_chkdup_join  ( void *        shmem  ) { return (fd_chkdup_t *)shmem; }

static inline void *        fd_chkdup_leave ( fd_chkdup_t * chkdup ) { return (void *)chkdup; }
static inline void *        fd_chkdup_delete( void *        shmem  ) { return shmem; }
351 :
352 :
353 : static inline int
354 : fd_chkdup_check ( fd_chkdup_t * chkdup,
355 : fd_acct_addr_t const * list0, ulong list0_cnt,
356 0 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
357 0 : if( FD_LIKELY( 0==fd_chkdup_check_fast( chkdup, list0, list0_cnt, list1, list1_cnt ) ) ) return 0;
358 0 : return fd_chkdup_check_slow( chkdup, list0, list0_cnt, list1, list1_cnt );
359 0 : }
360 :
/* fd_chkdup_check_slow: precise duplicate detection.  Inserts each
   address into the scratch hash map; an insert returning NULL means
   the key was already present, i.e. a duplicate.  The all-zero address
   is the map's sentinel and is tracked separately.  The map is drained
   before returning so repeated calls start from an empty map. */
static inline int
fd_chkdup_check_slow( fd_chkdup_t * chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  /* Precondition: total account count must fit the hashmap and inserted array. */
  FD_TEST( list0_cnt+list1_cnt<=FD_TXN_ACCT_ADDR_MAX );

  fd_chkdup_waddr_t * map = fd_chkdup_pmap_join( chkdup->hashmap );
  fd_chkdup_waddr_t * inserted[ FD_TXN_ACCT_ADDR_MAX ]; /* records successful inserts so they can be undone */
  ulong inserted_cnt = 0UL;

  int any_duplicates = 0;
  int skipped_inval  = 0; /* set once the sentinel (all-zero) address has been seen */
  for( ulong i0=0UL; (i0<list0_cnt) & !any_duplicates; i0++ ) {
    if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list0[ i0 ] ) ) ) {
      /* Okay if this is the 1st, but not if the 2nd */
      any_duplicates |= skipped_inval;
      skipped_inval   = 1;
      continue;
    }
    fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list0[ i0 ] );
    inserted[ inserted_cnt++ ] = ins;       /* branchlessly store, then ... */
    any_duplicates |=         (NULL==ins);
    inserted_cnt   -= (ulong)(NULL==ins);   /* Correct inserted_cnt if we just stored a NULL */
  }
  for( ulong i1=0UL; (i1<list1_cnt) & !any_duplicates; i1++ ) {
    if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list1[ i1 ] ) ) ) {
      any_duplicates |= skipped_inval;
      skipped_inval   = 1;
      continue;
    }
    fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list1[ i1 ] );
    inserted[ inserted_cnt++ ] = ins;
    any_duplicates |=         (NULL==ins);
    inserted_cnt   -= (ulong)(NULL==ins);
  }

  /* Drain the map for the next call.  FIXME: This depends on
     undocumented map behavior for correctness.  Deleting in the
     opposite order of insertion preserves previously inserted
     pointers.  That behavior should be documented. */
  for( ulong i=0UL; i<inserted_cnt; i++ ) fd_chkdup_pmap_remove( map, inserted[ inserted_cnt-1UL-i ] );

  fd_chkdup_pmap_leave( map );

  return any_duplicates;
}
407 :
408 :
409 : #if FD_CHKDUP_IMPL==1
410 :
411 : /* AVX2 implementation */
412 : #include "../../util/simd/fd_avx.h"
/* AVX2 fast path: register-resident Bloom-filter variant described in
   the discussion above.  One-sided: may return 1 spuriously, but a
   return of 0 proves there are no duplicates. */
static inline int
fd_chkdup_check_fast( fd_chkdup_t * chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  /* A list of 0 or 1 addresses can't contain a duplicate */
  if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;

  int any_duplicates = 0;

  const wu_t entropy = wb_ld( chkdup->entropy ); /* validator-specific blinding, set in fd_chkdup_new */
  const wu_t one     = wu_bcast( 1U );


  if( FD_LIKELY( list0_cnt+list1_cnt<28UL ) ) {
    /* Single vector implementation (the W=32, C=8, V=0 regime above) */
    const wu_t mask = wu_bcast( 0x1FU ); /* 5 bits -> a bit position in [0,32) per lane */

    wu_t bloom = wu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wu_t addr   = wb_ldu( list0+i0 );
      wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
      wu_t select = wu_shl_vector( one, masked ); /* exactly one bit set per 32-bit lane */
      /* testc: "Compute the bitwise NOT of a and then AND with b, and
         [return] 1 if the result is zero."  I.e. returns 1 iff every
         selected bit was already set in the filter (possible dup). */
      any_duplicates |= _mm256_testc_si256( bloom, select );
      bloom = wu_or( bloom, select );
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wu_t addr   = wb_ldu( list1+i1 );
      wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
      wu_t select = wu_shl_vector( one, masked );
      any_duplicates |= _mm256_testc_si256( bloom, select );
      bloom = wu_or( bloom, select );
    }
    return any_duplicates;

  } else {
    /* 4-vector implementation: slower but much better false positive
       rate so that we don't have to fall back to the slow path as
       frequently (the W=32, C=32, V=32 regime above). */
    const wu_t mask = wu_bcast( 0x3FU ); /* 6 bits: values >=32 shift the 1 out entirely, skipping that word */

    wu_t bloom0 = wu_zero(); wu_t bloom1 = wu_zero();
    wu_t bloom2 = wu_zero(); wu_t bloom3 = wu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wu_t addr    = wb_ldu( list0+i0 );
      wu_t blinded = wu_xor( addr, entropy );
      /* Four 6-bit hash fields per lane, taken from bits [0,6), [6,12),
         [12,18), [18,24) of each blinded word */
      wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
      wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
      wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
      wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );

      /* Any selected-but-unset bit proves this address is new */
      wu_t any_differences = wu_or(
          wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
          wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );

      bloom0 = wu_or( bloom0, select0 ); bloom1 = wu_or( bloom1, select1 );
      bloom2 = wu_or( bloom2, select2 ); bloom3 = wu_or( bloom3, select3 );

      /* testz: 1 iff any_differences is all zero, i.e. possible dup */
      any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates ); /* keep the compiler from reasoning about / branching on the accumulator */
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wu_t addr    = wb_ldu( list1+i1 );
      wu_t blinded = wu_xor( addr, entropy );
      wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
      wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
      wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
      wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );

      wu_t any_differences = wu_or(
          wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
          wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );

      bloom0 = wu_or( bloom0, select0 ); bloom1 = wu_or( bloom1, select1 );
      bloom2 = wu_or( bloom2, select2 ); bloom3 = wu_or( bloom3, select3 );

      any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    return any_duplicates;
  }
}
495 :
496 : #elif FD_CHKDUP_IMPL==2
497 :
498 : /* AVX512 implementation */
499 : #include "../../util/simd/fd_avx512.h"
/* AVX512 fast path: same register-resident Bloom-filter idea as the
   AVX2 version, using one or two 512-bit vectors.  One-sided: may
   return 1 spuriously, but a return of 0 proves no duplicates. */
static inline int
fd_chkdup_check_fast( fd_chkdup_t * chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  /* A list of 0 or 1 addresses can't contain a duplicate */
  if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;

  int any_duplicates = 0;

  const wwu_t entropy = wwu_ld( (uint const *)chkdup->entropy ); /* validator-specific blinding */
  const wwu_t one     = wwu_bcast( 1U );

  if( FD_LIKELY( list0_cnt+list1_cnt<36UL ) ) {
    /* One vector version (the W=32, C=16, V=0 regime above) */
    /* Our analysis assumed the 64 bytes of hash were all independent,
       but if we just xor and then use the low 5 bits of both parts of
       the vector, we get a lot more false positives than the math
       predicts.  So the upper 256-bit half uses bits [6,11) of each
       blinded word while the lower half uses bits [0,5). */
    const wwu_t mask = wwu_bcast( 0x1FU );
    wwu_t bloom = wwu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      /* Duplicate the 32-byte address into both 256-bit halves */
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      /* 0xFF00 selects lanes 8-15 (upper half) for the >>6 variant */
      wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
      /* rolv of 1 by k (k<32 here) sets exactly bit k, like a shift */
      wwu_t select  = _mm512_rolv_epi32( one, masked );
      wwu_t next    = wwu_or( bloom, select );
      __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE ); /* if non-zero, not a duplicate */
      bloom = next;
      /* kortestz_mask8_u8: "Compute the bitwise OR of 8-bit masks a and
         b.  If the result is all zeroes, [return] 1" */
      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates ); /* keep the compiler from branching on the accumulator */
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
      wwu_t select  = _mm512_rolv_epi32( one, masked );
      wwu_t next    = wwu_or( bloom, select );
      __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE );
      bloom = next;
      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    return any_duplicates;
  } else {
    /* Two vector version (the W=32, C=32, V=32 regime above).  Four
       6-bit hash fields per blinded word, spread across two vectors. */
    const wwu_t mask = wwu_bcast( 0x3FU ); /* values >=32 shift the 1 out entirely, skipping that word */
    const wwu_t shift0 = wwu(  0U,  0U,  0U,  0U,  0U,  0U,  0U,  0U,
                               6U,  6U,  6U,  6U,  6U,  6U,  6U,  6U );
    const wwu_t shift1 = wwu( 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,
                              18U, 18U, 18U, 18U, 18U, 18U, 18U, 18U );
    wwu_t bloom0 = wwu_zero(); wwu_t bloom1 = wwu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) ); wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
      wwu_t select0 = wwu_shl_vector( one, masked0 );                     wwu_t select1 = wwu_shl_vector( one, masked1 );
      wwu_t next0   = wwu_or( bloom0, select0 );                          wwu_t next1   = wwu_or( bloom1, select1 );
      /* Filter unchanged in both vectors -> possible duplicate */
      __mmask8 any_differences = _kor_mask8(
          _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );

      bloom0 = next0; bloom1 = next1;

      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) ); wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
      wwu_t select0 = wwu_shl_vector( one, masked0 );                     wwu_t select1 = wwu_shl_vector( one, masked1 );
      wwu_t next0   = wwu_or( bloom0, select0 );                          wwu_t next1   = wwu_or( bloom1, select1 );
      __mmask8 any_differences = _kor_mask8(
          _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );

      bloom0 = next0; bloom1 = next1;

      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    return any_duplicates;
  }
}
583 :
584 : #else
585 :
586 : static inline int
587 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
588 : fd_acct_addr_t const * list0, ulong list0_cnt,
589 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
590 : (void)chkdup;
591 : (void)list0;
592 : (void)list1;
593 : (void)list0_cnt;
594 : (void)list1_cnt;
595 : return 1;
596 : }
597 :
598 : #endif
599 :
600 :
601 : FD_PROTOTYPES_END
602 :
603 : #endif /* HEADER_fd_src_disco_pack_fd_chkdup_h */
|