Line data Source code
1 : #ifndef HEADER_fd_src_disco_pack_fd_chkdup_h
2 : #define HEADER_fd_src_disco_pack_fd_chkdup_h
3 :
4 : #include "../../ballet/fd_ballet_base.h"
5 : #include "../../ballet/txn/fd_txn.h"
6 :
7 : /* fd_chkdup declares a set of functions for ultra-HPC checking if a
8 : list of account addresses contains any duplicates. It's important
9 : that this be fast, because a transaction containing duplicate account
10 : addresses fails to sanitize and is not charged a fee. Although this
11 : check can (and really ought) to be done in parallel, perhaps in the
12 : verify tiles, right now, it's done in pack, which means it's serial
13 : and on the critical path.
14 :
15 : On platforms with AVX, the current implementation uses a fast initial
16 : check which may have false positives (thinking there are duplicates
17 : when there aren't). Any transaction that fails the initial check is
18 : then subjected to the full, precise check. Without AVX, all
19 : transactions use the slow path. */
20 :
21 : /* The functions are defined in the header to facilitate inlining since
22 : they take 10s of cycles in the good case, but should probably be
23 : treated as if they were defined in a .c file. */
24 :
25 : #ifndef FD_CHKDUP_IMPL
26 : # if FD_HAS_AVX512
27 : # include "../../util/simd/fd_avx.h"
28 0 : # define FD_CHKDUP_IMPL 2
29 : # elif FD_HAS_AVX
30 : # define FD_CHKDUP_IMPL 1
31 : # else
32 : # define FD_CHKDUP_IMPL 0
33 : # endif
34 : #endif
35 :
36 :
37 : #if FD_CHKDUP_IMPL==2
38 : # define FD_CHKDUP_ALIGN ( 64UL)
39 : #elif FD_CHKDUP_IMPL==1
40 : # define FD_CHKDUP_ALIGN ( 32UL)
41 : #elif FD_CHKDUP_IMPL==0
42 : # define FD_CHKDUP_ALIGN ( 32UL)
43 : #else
44 : # error "Unrecognized value of FD_CHKDUP_IMPL"
45 : #endif
46 :
47 :
/* FD_CHKDUP_FOOTPRINT gives the scratch memory footprint: the entropy
   region (32*FD_CHKDUP_IMPL bytes, 0 bytes when no AVX) at
   FD_CHKDUP_ALIGN alignment, followed by the slow-path hash map
   (2^8 slots of 32 bytes each) at 32-byte alignment, with the total
   padded out to FD_CHKDUP_ALIGN. */
# define FD_CHKDUP_FOOTPRINT FD_LAYOUT_FINI( FD_LAYOUT_APPEND(           \
                                FD_LAYOUT_APPEND( FD_LAYOUT_INIT,        \
                                   FD_CHKDUP_ALIGN, 32*FD_CHKDUP_IMPL ), \
                                32UL,              (1UL<<8)*32UL      ), \
                             FD_CHKDUP_ALIGN )

/* The map has 2x as many slots as the max account addresses a
   transaction can reference, so the slow path never exceeds 50% load. */
FD_STATIC_ASSERT( (1UL<<8)==2*FD_TXN_ACCT_ADDR_MAX, "hash table size" );
55 :
56 : /* Fixed size (just over 8kB) and safe for declaration on the stack or
57 : inclusion in a struct. */
58 : struct fd_chkdup_private;
59 : typedef struct fd_chkdup_private fd_chkdup_t;
60 :
61 : FD_PROTOTYPES_BEGIN
62 : /* fd_chkdup_{footprint, align} return the footprint and alignment of
63 : the scratch memory that duplicate detection requires. */
static inline ulong fd_chkdup_footprint( void ) { return FD_CHKDUP_FOOTPRINT; } /* just over 8kB, see above */
static inline ulong fd_chkdup_align    ( void ) { return FD_CHKDUP_ALIGN;     } /* 64 with AVX-512, else 32 */
66 :
67 : /* fd_chkdup_new formats an appropriately sized region of memory for use
68 : in duplicate address detection. shmem must point to the first byte
69 : of a region of memory with the appropriate alignment and footprint.
70 : rng must be a pointer to a local join of an RNG. Some slots of the
71 : RNG will be consumed, but no interest in the RNG will be retained
72 : after the function returns. Returns shmem on success and NULL on
73 : failure (logs details). The only failure cases are if shmem is NULL
74 : or not aligned.
75 :
76 : fd_chkdup_join joins the caller to the formatted region of memory.
77 : Returns shmem.
78 :
79 : fd_chkdup_leave unjoins the caller to chkdup. Returns chkdup.
80 : fd_chkdup_delete unformats the region of memory. Returns a pointer
81 : to the unformatted memory region. */
82 : static inline void * fd_chkdup_new ( void * shmem, fd_rng_t * rng );
83 : static inline fd_chkdup_t * fd_chkdup_join ( void * shmem );
84 : static inline void * fd_chkdup_leave ( fd_chkdup_t * chkdup );
85 : static inline void * fd_chkdup_delete( void * shmem );
86 :
87 :
88 : /* fd_chkdup_check{,_slow,_fast} check a list of account addresses for
89 : any duplicate addresses, i.e. an account address that appears twice
90 : in the list. The list does not need to be sorted or have any
91 : particular order. The list may be decomposed into two sublists
92 : (list0 and list1) to facilitate 0-copy usage with address lookup
93 : tables, but list0 and list1 are logically concatenated prior to
94 : checking for duplicates.
95 :
96 : chkdup is a pointer to a valid local join of a chkdup object.
97 :
98 : list0 and list1 point to the first account address of the respective
99 : sublists. The memory they point to need not have any particular
100 : alignment. list0==NULL is okay only if list0_cnt==0, and similarly
101 : for list1. list0 is accessed with indices [0, list0_cnt) and list1
102 : is accessed with indices [0, list1_cnt). list0 and list1 must not
103 : overlap. Requires list0_cnt+list1_cnt<=128, and the function is
104 : somewhat tuned for smaller values.
105 :
106 : fd_chkdup_check and the _slow version return 1 if the list of
107 : transactions contains at least one duplicated account address and 0
108 : otherwise (implying each account address in the provided list is
109 : unique).
110 :
111 : fd_chkdup_check_fast returns 1 if the list of transactions contains
112 : at least one duplicated account address and typically returns 0 if
113 : each account address in the provided list is unique, but may
114 :    sometimes spuriously return 1 even without duplicates.
115 :
116 : WARNING: the _fast version MAY HAVE FALSE POSITIVES. You probably
117 : want the un-suffixed version, which is precise. It uses the fast
118 : version as a fast-path and then does a slower full check if the
119 : fast-path suggests there may be a duplicate.
120 :
121 : However, it's also worth calling out again that the _fast version
122 : only makes errors in one direction. If the list contains duplicates,
123 : it will definitely return 1. If it returns 0, the list definitely
124 : does not contain duplicates. (Those two statements are equivalent).
125 : */
126 : static inline int
127 : fd_chkdup_check ( fd_chkdup_t * chkdup,
128 : fd_acct_addr_t const * list0, ulong list0_cnt,
129 : fd_acct_addr_t const * list1, ulong list1_cnt );
130 : static inline int
131 : fd_chkdup_check_slow( fd_chkdup_t * chkdup,
132 : fd_acct_addr_t const * list0, ulong list0_cnt,
133 : fd_acct_addr_t const * list1, ulong list1_cnt );
134 : static inline int
135 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
136 : fd_acct_addr_t const * list0, ulong list0_cnt,
137 : fd_acct_addr_t const * list1, ulong list1_cnt );
138 :
139 :
140 : /* ----- Implementation details and discussion follow------
141 :
142 : The fast path implementation is somewhat interesting. The normal way
143 : to do this is with a Bloom filter, but Bloom filters do lots of
144 : unpredictable reads and the pack tile is somewhat cache sensitive.
145 : Instead, this implements a variant on a Bloom filter that lives
146 : entirely in AVX registers.
147 :
148 : Basically, we use C W-bit words stored in (C*W)/256 AVX2 registers or
149 : (C*W)/512 AVX512 registers. Each word is a modified Bloom filter
150 : with one associated hash function. For each account address, we
151 : compute C hashes, giving (up to) C positions across the AVX
152 : registers. If any of those positions have an unset bit, then we know
153 : we have not seen the account address before. Finally, we then set
154 : all the positions.
155 :
156 : The only difference between this idea and a normal Bloom filter is
157 : that sometimes the hash function may not select a bit. There's a
158 : tradeoff to be made here: suppose you insert R account addresses, and
159 : R is at least almost as large as W. Then each word fills up, and
160 : false positives become increasingly likely. Only testing and
161 : inserting a bit, say, half the time, effectively halves C, but
162 : prevents each word from getting saturated as quickly, and makes the
163 : algorithm effective for larger values of R. We use Intel's quirky
164 : behavior to get this for free.
165 :
166 : (Note: The R and C notation is supposed to suggest a tabular layout
167 : in which the account addresses are rows and the words are columns.)
168 :
169 : The key insight into making this work quickly is that the vpsllv{d,q}
170 : variable left shift instructions are cheap (1 cycle, can execute on
171 : port 0 or 1 for AVX2, still 1 cycle on port 0 for AVX512). If we can
172 : compute the hashes in parallel with SIMD, then we can variably shift
173 : bcast(0x1) quickly, and select several bits at a time. The rest is
174 : just bitwise logic, which is extremely cheap with AVX. This approach
175 : constrains W to be either 32 or 64, and C to be a multiple of the
176 : number of words in a vector, but those are both pretty acceptable
177 : constraints.
178 :
179 : The final ingredient is a cheap hash function that places the hashes
180 : in the appropriate position for vpsllv. We just xor the account
181 : address with some validator-specific entropy and use a mask to select
182 : certain bits.
183 :
184 : The slow-path implementation uses a hash table to check for
185 : duplicates. This is slower than sorting for transactions with only a
186 : few account addresses, but substantially faster than sorting for
187 : transactions with large numbers of account addresses, which is when
188 : the slow-path matters more anyway.
189 :
190 :
191 : You can see from above there are a variety of knobs to tune. Should
192 : W be 32 or 64? How big should C be? How many bits should we mask
193 : off for the hash, which controls the frequency with which we skip a
194 : word, neither checking nor inserting a bit? It would be good to have
195 : a rigorous understanding of the false positive rate as a function of
196 : these parameters so we can make a decision that minimizes the
197 : expected compute required.
198 :
199 : Unfortunately, the false positive computation is tricky. The key
200 : difficulty is that whether processing account address r results in a
201 : false positive is not independent from whether processing account
202 : address r+1 results in a false positive. This leads to an obnoxious
203 : inclusion-exclusion formula which quickly becomes more unwieldy than
204 : I (or Sage) can handle.
205 :
206 : A dynamic-programming-ish algorithm can compute the false positive
207 : rate in approximately O(R*2^R) time. To start, we just want to
208 : understand a single word/column. Suppose k bits in the word have
209 : been set. Then there are (W-k) hash values that set a new bit, and
210 :    (V+k) hash values that don't set a new bit, where V is the number of hash
211 : values that don't select a bit. The hash values that set a bit are
212 : also exactly the ones that provide information that the account
213 : address is not a false positive. I think about this as "spending a
214 : bit" to know it's not a false positive.
215 :
216 : Denoting the rows at which we spend a bit by a 1 and the rows at
217 : which we don't spend a bit by 0, we might get a column like:
218 :
219 : 1
220 : 0
221 : 1
222 : 1
223 : 0.
224 : The number of ways this column can occur is x1 =
225 : W*(V+1)*(W-1)*(W-2)*(V+3), which means that the probability the
226 : column occurs is x1/(W+V)^5. Expanding to multiple columns is easy,
227 : since the number of ways that two specific columns can occur is just
228 : the product of the number of ways each can occur. For example,
229 : 1 0
230 : 0 1
231 : 1 1
232 : 1 0
233 : 0 0
234 : can occur x1 * x2 ways, where x2 = V*W*(W-1)*(V+2)*(V+2).
235 : A false positive happens when there's a row of all 0s, as in the last
236 : row of the example.
237 :
238 : It's cleaner to count the number of ways not to get a false positive.
239 : This gives us the inclusion-exclusion formula:
240 : (all ways)
241 : - (all ways where row 0 is all 0s)
242 : - (all ways where row 1 is all 0s)
243 : - ...
244 : + (all ways where rows 0 and 1 are both all 0s)
245 : + (all ways where rows 0 and 2 are both all 0s)
246 : + ...
247 : - (all ways where rows 0, 1, and 2 are all 0s)
248 : - (all ways where rows 0, 1, and 3 are all 0s)
249 : +, - ...
250 : + (-1)^R (all ways in which all rows are all 0s).
251 :
252 : There's a nice way to understand each of these terms. For example,
253 : in the R=2, C=3 case, the term in which row 0 is all 0s has the
254 : following elements:
255 : 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
256 : 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 1 0 1 1 1 0 1 1 1
257 : Rather than enumerating all 2^( (R-1)*C ) elements, we'll represent
258 : it as
259 : ( 0 + 0 )^3
260 : ( 0 1 )
261 :
262 : Skipping some steps, the task boils down to counting the number of
263 : ways to get columns that match a mask, then raising that to the Cth
264 : power.
265 :
266 : Now, with all this behind us, our goal is to pick the optimal value
267 : of W,V, and C given a supposed distribution of transactions, and a
268 : performance model. Based on some back of the envelope calculations
269 : based on instruction throughput and latency measurements and
270 : confirmed by some experiments, the fast path code takes about 3.5*R
271 : cycles if using one AVX2 vector (W*C==256) and
272 : 2.5*R+2*ceil(W*C/256)*R cycles otherwise. The slow path takes about
273 : 133*J cycles. Then the expected value of the number of cycles it
274 : takes to process a transaction with R accounts is
275 : R*(2.5+2*ceil(W*C/256) - [W*C<=256]) + FP_{W,V,R,C}*133*R
276 :
277 : Based on a sample of 100,000 slots containing about 100M
278 : transactions, the CDF looks like
279 :
280 : Fraction of transactions containing <= 3 account addresses 71%
281 : <= 13 account addresses 80%
282 : <= 24 account addresses 91%
283 : <= 31 account addresses 95%
284 : <= 44 account addresses 98%
285 : <= 50 account addresses 99%
286 :
287 : Basically, there's a peak at 3 (votes), and then a very long, very
288 : fat tail. When using AVX2, it basically boils down into 2 regimes:
289 : 0 <= R < 28 W=32, C=8, V=0 (one AVX vector)
290 : 28 <= R <= 64 W=32, C=32, V=32 (four AVX vectors)
291 :
292 : This combination has an expected value of about 54 cycles over all
293 : transactions. For a typical transaction with 3 account addresses,
294 : this takes about 10 cycles and the false positive probability is
295 : about 2e-10.
296 :
297 : When using AVX512, the regimes are similar:
298 : 0 <= R < 36 W=32, C=16, V=0 (one AVX vector)
299 : 36 <= R <= 64 W=32, C=32, V=32 (two AVX vectors)
300 : This combination has an expected value of about 33 cycles over all
301 : transactions. Again, the typical 3 account address account takes
302 : about 9 cycles and has a negligible false positive probability. */
303 :
304 :
/* fd_chkdup_waddr_t wraps an account address so it can serve as the
   slot type of the slow-path hash map instantiated below. */
struct fd_chkdup_waddr {
  fd_acct_addr_t key; /* account address */
};
typedef struct fd_chkdup_waddr fd_chkdup_waddr_t;

/* The all-zero account address acts as the map's sentinel "null" key;
   fd_chkdup_check_slow handles it specially since it can't be inserted. */
static const fd_acct_addr_t chkdup_null_addr = {{ 0 }};
310 :
/* Instantiate fd_chkdup_pmap, a 2^8-slot hash map keyed by account
   address, used by the precise slow path.  Hashing uses only the first
   8 bytes of the address, which is fine since equality is still checked
   on the full 32 bytes (MAP_KEY_EQUAL_IS_SLOW tells the template the
   full compare is comparatively expensive). */
#define MAP_NAME              fd_chkdup_pmap
#define MAP_T                 fd_chkdup_waddr_t
#define MAP_KEY_T             fd_acct_addr_t
#define MAP_KEY_NULL          chkdup_null_addr
#define MAP_KEY_INVAL(k)      MAP_KEY_EQUAL(k, chkdup_null_addr)
#define MAP_KEY_EQUAL(k0,k1)  (!memcmp((k0).b,(k1).b, FD_TXN_ACCT_ADDR_SZ))
#define MAP_KEY_EQUAL_IS_SLOW 1
#define MAP_MEMOIZE           0
#define MAP_KEY_HASH(key)     ((uint)fd_ulong_hash( fd_ulong_load_8( (key).b ) ))
#define MAP_LG_SLOT_CNT       8
#include "../../util/tmpl/fd_map.c"
322 :
323 :
/* Scratch state for duplicate detection.  Fixed size (just over 8kB)
   and safe for declaration on the stack or inclusion in a struct. */
struct __attribute__((aligned(FD_CHKDUP_ALIGN))) fd_chkdup_private {
#if FD_CHKDUP_IMPL >= 1
  /* Validator-specific random bytes, xored with each account address
     before hashing (see the discussion above), presumably so addresses
     that collide for one validator don't collide for all of them. */
  uchar entropy[ 32*FD_CHKDUP_IMPL ];
#endif

  /* Backing store for the fd_chkdup_pmap used by the slow path */
  fd_chkdup_waddr_t hashmap[ 1UL<<8 ];
};
331 :
332 : static inline void *
333 : fd_chkdup_new( void * shmem,
334 0 : fd_rng_t * rng ) {
335 0 : fd_chkdup_t * chkdup = (fd_chkdup_t *)shmem;
336 0 : #if FD_CHKDUP_IMPL >= 1
337 0 : for( ulong i=0UL; i<32*FD_CHKDUP_IMPL; i++ ) chkdup->entropy[ i ] = fd_rng_uchar( rng );
338 : #else
339 : (void)rng;
340 : #endif
341 0 : FD_TEST( fd_chkdup_pmap_footprint()==sizeof(chkdup->hashmap) ); /* Known at compile time */
342 :
343 0 : fd_chkdup_pmap_new( chkdup->hashmap );
344 0 : return chkdup;
345 0 : }
346 :
/* join/leave/delete are trivial: the formatted region is directly
   usable as the local handle and unformatting requires no work. */
static inline fd_chkdup_t * fd_chkdup_join  ( void *        shmem  ) { return (fd_chkdup_t *)shmem; }

static inline void *        fd_chkdup_leave ( fd_chkdup_t * chkdup ) { return (void *)chkdup; }
static inline void *        fd_chkdup_delete( void *        shmem  ) { return shmem; }
351 :
352 :
353 : static inline int
354 : fd_chkdup_check ( fd_chkdup_t * chkdup,
355 : fd_acct_addr_t const * list0, ulong list0_cnt,
356 0 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
357 0 : if( FD_LIKELY( 0==fd_chkdup_check_fast( chkdup, list0, list0_cnt, list1, list1_cnt ) ) ) return 0;
358 0 : return fd_chkdup_check_slow( chkdup, list0, list0_cnt, list1, list1_cnt );
359 0 : }
360 :
/* fd_chkdup_check_slow: precise duplicate detection.  Inserts each
   address into the scratch hash map; an insert returning NULL means
   the key was already present, i.e. a duplicate.  The all-zero address
   is the map's sentinel and is tracked separately.  The map is drained
   before returning so repeated calls start from an empty map. */
static inline int
fd_chkdup_check_slow( fd_chkdup_t * chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  /* Precondition: total account count must fit the hashmap and inserted array. */
  FD_TEST( list0_cnt+list1_cnt<=FD_TXN_ACCT_ADDR_MAX );

  fd_chkdup_waddr_t * map = fd_chkdup_pmap_join( chkdup->hashmap );
  fd_chkdup_waddr_t * inserted[ FD_TXN_ACCT_ADDR_MAX ]; /* records successful inserts so they can be undone */
  ulong inserted_cnt = 0UL;

  int any_duplicates = 0;
  int skipped_inval  = 0; /* set once the sentinel (all-zero) address has been seen */
  for( ulong i0=0UL; (i0<list0_cnt) & !any_duplicates; i0++ ) {
    if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list0[ i0 ] ) ) ) {
      /* Okay if this is the 1st, but not if the 2nd */
      any_duplicates |= skipped_inval;
      skipped_inval   = 1;
      continue;
    }
    fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list0[ i0 ] );
    inserted[ inserted_cnt++ ] = ins;       /* branchlessly store, then ... */
    any_duplicates |=         (NULL==ins);
    inserted_cnt   -= (ulong)(NULL==ins);   /* Correct inserted_cnt if we just stored a NULL */
  }
  for( ulong i1=0UL; (i1<list1_cnt) & !any_duplicates; i1++ ) {
    if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list1[ i1 ] ) ) ) {
      any_duplicates |= skipped_inval;
      skipped_inval   = 1;
      continue;
    }
    fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list1[ i1 ] );
    inserted[ inserted_cnt++ ] = ins;
    any_duplicates |=         (NULL==ins);
    inserted_cnt   -= (ulong)(NULL==ins);
  }

  /* Drain the map for the next call.  FIXME: This depends on
     undocumented map behavior for correctness.  Deleting in the
     opposite order of insertion preserves previously inserted
     pointers.  That behavior should be documented. */
  for( ulong i=0UL; i<inserted_cnt; i++ ) fd_chkdup_pmap_remove( map, inserted[ inserted_cnt-1UL-i ] );

  fd_chkdup_pmap_leave( map );

  return any_duplicates;
}
407 :
408 :
409 : #if FD_CHKDUP_IMPL==1
410 :
411 : /* AVX2 implementation */
412 : #include "../../util/simd/fd_avx.h"
/* AVX2 fast path: register-resident Bloom-filter variant described in
   the discussion above.  One-sided: may return 1 spuriously, but a
   return of 0 proves there are no duplicates. */
static inline int
fd_chkdup_check_fast( fd_chkdup_t * chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  /* A list of 0 or 1 addresses can't contain a duplicate */
  if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;

  int any_duplicates = 0;

  const wu_t entropy = wb_ld( chkdup->entropy ); /* validator-specific blinding, set in fd_chkdup_new */
  const wu_t one     = wu_bcast( 1U );


  if( FD_LIKELY( list0_cnt+list1_cnt<28UL ) ) {
    /* Single vector implementation (the W=32, C=8, V=0 regime above) */
    const wu_t mask = wu_bcast( 0x1FU ); /* 5 bits -> a bit position in [0,32) per lane */

    wu_t bloom = wu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wu_t addr   = wb_ldu( list0+i0 );
      wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
      wu_t select = wu_shl_vector( one, masked ); /* exactly one bit set per 32-bit lane */
      /* testc: "Compute the bitwise NOT of a and then AND with b, and
         [return] 1 if the result is zero."  I.e. returns 1 iff every
         selected bit was already set in the filter (possible dup). */
      any_duplicates |= _mm256_testc_si256( bloom, select );
      bloom = wu_or( bloom, select );
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wu_t addr   = wb_ldu( list1+i1 );
      wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
      wu_t select = wu_shl_vector( one, masked );
      any_duplicates |= _mm256_testc_si256( bloom, select );
      bloom = wu_or( bloom, select );
    }
    return any_duplicates;

  } else {
    /* 4-vector implementation: slower but much better false positive
       rate so that we don't have to fall back to the slow path as
       frequently (the W=32, C=32, V=32 regime above). */
    const wu_t mask = wu_bcast( 0x3FU ); /* 6 bits: values >=32 shift the 1 out entirely, skipping that word */

    wu_t bloom0 = wu_zero(); wu_t bloom1 = wu_zero();
    wu_t bloom2 = wu_zero(); wu_t bloom3 = wu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wu_t addr    = wb_ldu( list0+i0 );
      wu_t blinded = wu_xor( addr, entropy );
      /* Four 6-bit hash fields per lane, taken from bits [0,6), [6,12),
         [12,18), [18,24) of each blinded word */
      wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
      wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
      wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
      wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );

      /* Any selected-but-unset bit proves this address is new */
      wu_t any_differences = wu_or(
          wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
          wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );

      bloom0 = wu_or( bloom0, select0 ); bloom1 = wu_or( bloom1, select1 );
      bloom2 = wu_or( bloom2, select2 ); bloom3 = wu_or( bloom3, select3 );

      /* testz: 1 iff any_differences is all zero, i.e. possible dup */
      any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates ); /* keep the compiler from reasoning about / branching on the accumulator */
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wu_t addr    = wb_ldu( list1+i1 );
      wu_t blinded = wu_xor( addr, entropy );
      wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
      wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
      wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
      wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );

      wu_t any_differences = wu_or(
          wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
          wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );

      bloom0 = wu_or( bloom0, select0 ); bloom1 = wu_or( bloom1, select1 );
      bloom2 = wu_or( bloom2, select2 ); bloom3 = wu_or( bloom3, select3 );

      any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    return any_duplicates;
  }
}
495 :
496 : #elif FD_CHKDUP_IMPL==2
497 :
498 : /* AVX512 implementation */
499 : #include "../../util/simd/fd_avx512.h"
/* AVX512 fast path: same register-resident Bloom-filter idea as the
   AVX2 version, using one or two 512-bit vectors.  One-sided: may
   return 1 spuriously, but a return of 0 proves no duplicates. */
static inline int
fd_chkdup_check_fast( fd_chkdup_t * chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  /* A list of 0 or 1 addresses can't contain a duplicate */
  if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;

  int any_duplicates = 0;

  const wwu_t entropy = wwu_ld( (uint const *)chkdup->entropy ); /* validator-specific blinding */
  const wwu_t one     = wwu_bcast( 1U );

  if( FD_LIKELY( list0_cnt+list1_cnt<36UL ) ) {
    /* One vector version (the W=32, C=16, V=0 regime above) */
    /* Our analysis assumed the 64 bytes of hash were all independent,
       but if we just xor and then use the low 5 bits of both parts of
       the vector, we get a lot more false positives than the math
       predicts.  So the upper 256-bit half uses bits [6,11) of each
       blinded word while the lower half uses bits [0,5). */
    const wwu_t mask = wwu_bcast( 0x1FU );
    wwu_t bloom = wwu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      /* Duplicate the 32-byte address into both 256-bit halves */
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      /* 0xFF00 selects lanes 8-15 (upper half) for the >>6 variant */
      wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
      /* rolv of 1 by k (k<32 here) sets exactly bit k, like a shift */
      wwu_t select  = _mm512_rolv_epi32( one, masked );
      wwu_t next    = wwu_or( bloom, select );
      __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE ); /* if non-zero, not a duplicate */
      bloom = next;
      /* kortestz_mask8_u8: "Compute the bitwise OR of 8-bit masks a and
         b.  If the result is all zeroes, [return] 1" */
      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates ); /* keep the compiler from branching on the accumulator */
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
      wwu_t select  = _mm512_rolv_epi32( one, masked );
      wwu_t next    = wwu_or( bloom, select );
      __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE );
      bloom = next;
      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    return any_duplicates;
  } else {
    /* Two vector version (the W=32, C=32, V=32 regime above).  Four
       6-bit hash fields per blinded word, spread across two vectors. */
    const wwu_t mask = wwu_bcast( 0x3FU ); /* values >=32 shift the 1 out entirely, skipping that word */
    const wwu_t shift0 = wwu(  0U,  0U,  0U,  0U,  0U,  0U,  0U,  0U,
                               6U,  6U,  6U,  6U,  6U,  6U,  6U,  6U );
    const wwu_t shift1 = wwu( 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,
                              18U, 18U, 18U, 18U, 18U, 18U, 18U, 18U );
    wwu_t bloom0 = wwu_zero(); wwu_t bloom1 = wwu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) ); wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
      wwu_t select0 = wwu_shl_vector( one, masked0 );                     wwu_t select1 = wwu_shl_vector( one, masked1 );
      wwu_t next0   = wwu_or( bloom0, select0 );                          wwu_t next1   = wwu_or( bloom1, select1 );
      /* Filter unchanged in both vectors -> possible duplicate */
      __mmask8 any_differences = _kor_mask8(
          _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );

      bloom0 = next0; bloom1 = next1;

      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) ); wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
      wwu_t select0 = wwu_shl_vector( one, masked0 );                     wwu_t select1 = wwu_shl_vector( one, masked1 );
      wwu_t next0   = wwu_or( bloom0, select0 );                          wwu_t next1   = wwu_or( bloom1, select1 );
      __mmask8 any_differences = _kor_mask8(
          _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );

      bloom0 = next0; bloom1 = next1;

      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    return any_duplicates;
  }
}
583 :
584 : #else
585 :
586 : static inline int
587 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
588 : fd_acct_addr_t const * list0, ulong list0_cnt,
589 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
590 : (void)chkdup;
591 : (void)list0;
592 : (void)list1;
593 : (void)list0_cnt;
594 : (void)list1_cnt;
595 : return 1;
596 : }
597 :
598 : #endif
599 :
600 :
601 : FD_PROTOTYPES_END
602 :
603 : #endif /* HEADER_fd_src_disco_pack_fd_chkdup_h */
|