/src/rocksdb/util/bloom_impl.h
Line | Count | Source |
1 | | // Copyright (c) 2019-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Implementation details of various Bloom filter implementations used in |
7 | | // RocksDB. (DynamicBloom is in a separate file for now because it |
8 | | // supports concurrent write.) |
9 | | |
10 | | #pragma once |
11 | | #include <stddef.h> |
12 | | #include <stdint.h> |
13 | | |
14 | | #include <cmath> |
15 | | |
16 | | #include "port/port.h" // for PREFETCH |
17 | | #include "rocksdb/slice.h" |
18 | | #include "util/hash.h" |
19 | | |
20 | | #ifdef __AVX2__ |
21 | | #include <immintrin.h> |
22 | | #endif |
23 | | |
24 | | namespace ROCKSDB_NAMESPACE { |
25 | | |
26 | | class BloomMath { |
27 | | public: |
28 | | // False positive rate of a standard Bloom filter, for given ratio of |
29 | | // filter memory bits to added keys, and number of probes per operation. |
30 | | // (The false positive rate is effectively independent of scale, assuming |
31 | | // the implementation scales OK.) |
32 | 0 | static double StandardFpRate(double bits_per_key, int num_probes) { |
33 | | // Standard very-good-estimate formula. See |
34 | | // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives |
35 | 0 | return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes); |
36 | 0 | } |
37 | | |
38 | | // False positive rate of a "blocked"/"shareded"/"cache-local" Bloom filter, |
39 | | // for given ratio of filter memory bits to added keys, number of probes per |
40 | | // operation (all within the given block or cache line size), and block or |
41 | | // cache line size. |
42 | | static double CacheLocalFpRate(double bits_per_key, int num_probes, |
43 | 0 | int cache_line_bits) { |
44 | 0 | if (bits_per_key <= 0.0) { |
45 | | // Fix a discontinuity |
46 | 0 | return 1.0; |
47 | 0 | } |
48 | 0 | double keys_per_cache_line = cache_line_bits / bits_per_key; |
49 | | // A reasonable estimate is the average of the FP rates for one standard |
50 | | // deviation above and below the mean bucket occupancy. See |
51 | | // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math |
52 | 0 | double keys_stddev = std::sqrt(keys_per_cache_line); |
53 | 0 | double crowded_fp = StandardFpRate( |
54 | 0 | cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes); |
55 | 0 | double uncrowded_fp = StandardFpRate( |
56 | 0 | cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes); |
57 | 0 | return (crowded_fp + uncrowded_fp) / 2; |
58 | 0 | } |
59 | | |
60 | | // False positive rate of querying a new item against `num_keys` items, all |
61 | | // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes |
62 | | // themselves are stored losslessly. See Section 4 of |
63 | | // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf) |
64 | 0 | static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) { |
65 | 0 | double inv_fingerprint_space = std::pow(0.5, fingerprint_bits); |
66 | | // Base estimate assumes each key maps to a unique fingerprint. |
67 | | // Could be > 1 in extreme cases. |
68 | 0 | double base_estimate = num_keys * inv_fingerprint_space; |
69 | | // To account for potential overlap, we choose between two formulas |
70 | 0 | if (base_estimate > 0.0001) { |
71 | | // A very good formula assuming we don't construct a floating point |
72 | | // number extremely close to 1. Always produces a probability < 1. |
73 | 0 | return 1.0 - std::exp(-base_estimate); |
74 | 0 | } else { |
75 | | // A very good formula when base_estimate is far below 1. (Subtract |
76 | | // away the integral-approximated sum that some key has same hash as |
77 | | // one coming before it in a list.) |
78 | 0 | return base_estimate - (base_estimate * base_estimate * 0.5); |
79 | 0 | } |
80 | 0 | } |
81 | | |
82 | | // Returns the probably of either of two independent(-ish) events |
83 | | // happening, given their probabilities. (This is useful for combining |
84 | | // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate |
85 | | // for a hash-efficient Bloom filter's FP rate. See Section 4 of |
86 | | // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf) |
87 | 0 | static double IndependentProbabilitySum(double rate1, double rate2) { |
88 | | // Use formula that avoids floating point extremely close to 1 if |
89 | | // rates are extremely small. |
90 | 0 | return rate1 + rate2 - (rate1 * rate2); |
91 | 0 | } |
92 | | }; |
93 | | |
94 | | // A fast, flexible, and accurate cache-local Bloom implementation with |
95 | | // SIMD-optimized query performance (currently using AVX2 on Intel). Write |
96 | | // performance and non-SIMD read are very good, benefiting from FastRange32 |
97 | | // used in place of % and single-cycle multiplication on recent processors. |
98 | | // |
99 | | // Most other SIMD Bloom implementations sacrifice flexibility and/or |
100 | | // accuracy by requiring num_probes to be a power of two and restricting |
101 | | // where each probe can occur in a cache line. This implementation sacrifices |
102 | | // SIMD-optimization for add (might still be possible, especially with AVX512) |
103 | | // in favor of allowing any num_probes, not crossing cache line boundary, |
104 | | // and accuracy close to theoretical best accuracy for a cache-local Bloom. |
105 | | // E.g. theoretical best for 10 bits/key, num_probes=6, and 512-bit bucket |
106 | | // (Intel cache line size) is 0.9535% FP rate. This implementation yields |
107 | | // about 0.957%. (Compare to LegacyLocalityBloomImpl<false> at 1.138%, or |
108 | | // about 0.951% for 1024-bit buckets, cache line size for some ARM CPUs.) |
109 | | // |
110 | | // This implementation can use a 32-bit hash (let h2 be h1 * 0x9e3779b9) or |
111 | | // a 64-bit hash (split into two uint32s). With many millions of keys, the |
112 | | // false positive rate associated with using a 32-bit hash can dominate the |
113 | | // false positive rate of the underlying filter. At 10 bits/key setting, the |
114 | | // inflection point is about 40 million keys, so 32-bit hash is a bad idea |
115 | | // with 10s of millions of keys or more. |
116 | | // |
117 | | // Despite accepting a 64-bit hash, this implementation uses 32-bit fastrange |
118 | | // to pick a cache line, which can be faster than 64-bit in some cases. |
119 | | // This only hurts accuracy as you get into 10s of GB for a single filter, |
120 | | // and accuracy abruptly breaks down at 256GB (2^32 cache lines). Switch to |
121 | | // 64-bit fastrange if you need filters so big. ;) |
122 | | // |
123 | | // Using only a 32-bit input hash within each cache line has negligible |
124 | | // impact for any reasonable cache line / bucket size, for arbitrary filter |
125 | | // size, and potentially saves intermediate data size in some cases vs. |
126 | | // tracking full 64 bits. (Even in an implementation using 64-bit arithmetic |
127 | | // to generate indices, I might do the same, as a single multiplication |
128 | | // suffices to generate a sufficiently mixed 64 bits from 32 bits.) |
129 | | // |
130 | | // This implementation is currently tied to Intel cache line size, 64 bytes == |
131 | | // 512 bits. If there's sufficient demand for other cache line sizes, this is |
132 | | // a pretty good implementation to extend, but slight performance enhancements |
133 | | // are possible with an alternate implementation (probably not very compatible |
134 | | // with SIMD): |
135 | | // (1) Use rotation in addition to multiplication for remixing |
136 | | // (like murmur hash). (Using multiplication alone *slightly* hurts accuracy |
137 | | // because lower bits never depend on original upper bits.) |
138 | | // (2) Extract more than one bit index from each re-mix. (Only if rotation |
139 | | // or similar is part of remix, because otherwise you're making the |
140 | | // multiplication-only problem worse.) |
141 | | // (3) Re-mix full 64 bit hash, to get maximum number of bit indices per |
142 | | // re-mix. |
143 | | // |
144 | | class FastLocalBloomImpl { |
145 | | public: |
146 | | // NOTE: this has only been validated to enough accuracy for producing |
147 | | // reasonable warnings / user feedback, not for making functional decisions. |
148 | | static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes, |
149 | 0 | int hash_bits) { |
150 | 0 | return BloomMath::IndependentProbabilitySum( |
151 | 0 | BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes, |
152 | 0 | /*cache line bits*/ 512), |
153 | 0 | BloomMath::FingerprintFpRate(keys, hash_bits)); |
154 | 0 | } |
155 | | |
156 | 0 | static inline int ChooseNumProbes(int millibits_per_key) { |
157 | | // Since this implementation can (with AVX2) make up to 8 probes |
158 | | // for the same cost, we pick the most accurate num_probes, based |
159 | | // on actual tests of the implementation. Note that for higher |
160 | | // bits/key, the best choice for cache-local Bloom can be notably |
161 | | // smaller than standard bloom, e.g. 9 instead of 11 @ 16 b/k. |
162 | 0 | if (millibits_per_key <= 2080) { |
163 | 0 | return 1; |
164 | 0 | } else if (millibits_per_key <= 3580) { |
165 | 0 | return 2; |
166 | 0 | } else if (millibits_per_key <= 5100) { |
167 | 0 | return 3; |
168 | 0 | } else if (millibits_per_key <= 6640) { |
169 | 0 | return 4; |
170 | 0 | } else if (millibits_per_key <= 8300) { |
171 | 0 | return 5; |
172 | 0 | } else if (millibits_per_key <= 10070) { |
173 | 0 | return 6; |
174 | 0 | } else if (millibits_per_key <= 11720) { |
175 | 0 | return 7; |
176 | 0 | } else if (millibits_per_key <= 14001) { |
177 | | // Would be something like <= 13800 but sacrificing *slightly* for |
178 | | // more settings using <= 8 probes. |
179 | 0 | return 8; |
180 | 0 | } else if (millibits_per_key <= 16050) { |
181 | 0 | return 9; |
182 | 0 | } else if (millibits_per_key <= 18300) { |
183 | 0 | return 10; |
184 | 0 | } else if (millibits_per_key <= 22001) { |
185 | 0 | return 11; |
186 | 0 | } else if (millibits_per_key <= 25501) { |
187 | 0 | return 12; |
188 | 0 | } else if (millibits_per_key > 50000) { |
189 | | // Top out at 24 probes (three sets of 8) |
190 | 0 | return 24; |
191 | 0 | } else { |
192 | | // Roughly optimal choices for remaining range |
193 | | // e.g. |
194 | | // 28000 -> 12, 28001 -> 13 |
195 | | // 50000 -> 23, 50001 -> 24 |
196 | 0 | return (millibits_per_key - 1) / 2000 - 1; |
197 | 0 | } |
198 | 0 | } |
199 | | |
200 | | static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes, |
201 | 0 | int num_probes, char* data) { |
202 | 0 | uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6; |
203 | 0 | AddHashPrepared(h2, num_probes, data + bytes_to_cache_line); |
204 | 0 | } |
205 | | |
206 | | static inline void AddHashPrepared(uint32_t h2, int num_probes, |
207 | 0 | char* data_at_cache_line) { |
208 | 0 | uint32_t h = h2; |
209 | 0 | for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) { |
210 | | // 9-bit address within 512 bit cache line |
211 | 0 | int bitpos = h >> (32 - 9); |
212 | 0 | data_at_cache_line[bitpos >> 3] |= (uint8_t{1} << (bitpos & 7)); |
213 | 0 | } |
214 | 0 | } |
215 | | |
216 | | static inline void PrepareHash(uint32_t h1, uint32_t len_bytes, |
217 | | const char* data, |
218 | 0 | uint32_t /*out*/* byte_offset) { |
219 | 0 | uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6; |
220 | 0 | PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */); |
221 | 0 | PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */); |
222 | 0 | *byte_offset = bytes_to_cache_line; |
223 | 0 | } |
224 | | |
225 | | static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes, |
226 | 0 | int num_probes, const char* data) { |
227 | 0 | uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6; |
228 | 0 | return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line); |
229 | 0 | } |
230 | | |
231 | | static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes, |
232 | 0 | const char* data_at_cache_line) { |
233 | 0 | uint32_t h = h2; |
234 | 0 | #ifdef __AVX2__ |
235 | 0 | int rem_probes = num_probes; |
236 | | |
237 | | // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18, |
238 | | // etc.} one can insert specialized code for rem_probes <= 2, bypassing |
239 | | // the SIMD code in those cases. There is a detectable but minor overhead |
240 | | // applied to other values of num_probes (when not statically determined), |
241 | | // but smoother performance curve vs. num_probes. But for now, when |
242 | | // in doubt, don't add unnecessary code. |
243 | | |
244 | | // Powers of 32-bit golden ratio, mod 2**32. |
245 | 0 | const __m256i multipliers = |
246 | 0 | _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9, |
247 | 0 | 0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749); |
248 | |
|
249 | 0 | for (;;) { |
250 | | // Eight copies of hash |
251 | 0 | __m256i hash_vector = _mm256_set1_epi32(h); |
252 | | |
253 | | // Same effect as repeated multiplication by 0x9e3779b9 thanks to |
254 | | // associativity of multiplication. |
255 | 0 | hash_vector = _mm256_mullo_epi32(hash_vector, multipliers); |
256 | | |
257 | | // Now the top 9 bits of each of the eight 32-bit values in |
258 | | // hash_vector are bit addresses for probes within the cache line. |
259 | | // While the platform-independent code uses byte addressing (6 bits |
260 | | // to pick a byte + 3 bits to pick a bit within a byte), here we work |
261 | | // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit |
262 | | // within a word) because that works well with AVX2 and is equivalent |
263 | | // under little-endian. |
264 | | |
265 | | // Shift each right by 28 bits to get 4-bit word addresses. |
266 | 0 | const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28); |
267 | | |
268 | | // Gather 32-bit values spread over 512 bits by 4-bit address. In |
269 | | // essence, we are dereferencing eight pointers within the cache |
270 | | // line. |
271 | | // |
272 | | // Option 1: AVX2 gather (seems to be a little slow - understandable) |
273 | | // const __m256i value_vector = |
274 | | // _mm256_i32gather_epi32(static_cast<const int |
275 | | // *>(data_at_cache_line), |
276 | | // word_addresses, |
277 | | // /*bytes / i32*/ 4); |
278 | | // END Option 1 |
279 | | // Potentially unaligned as we're not *always* cache-aligned -> loadu |
280 | 0 | const __m256i* mm_data = |
281 | 0 | reinterpret_cast<const __m256i*>(data_at_cache_line); |
282 | 0 | __m256i lower = _mm256_loadu_si256(mm_data); |
283 | 0 | __m256i upper = _mm256_loadu_si256(mm_data + 1); |
284 | | // Option 2: AVX512VL permute hack |
285 | | // Only negligibly faster than Option 3, so not yet worth supporting |
286 | | // const __m256i value_vector = |
287 | | // _mm256_permutex2var_epi32(lower, word_addresses, upper); |
288 | | // END Option 2 |
289 | | // Option 3: AVX2 permute+blend hack |
290 | | // Use lowest three bits to order probing values, as if all from same |
291 | | // 256 bit piece. |
292 | 0 | lower = _mm256_permutevar8x32_epi32(lower, word_addresses); |
293 | 0 | upper = _mm256_permutevar8x32_epi32(upper, word_addresses); |
294 | | // Just top 1 bit of address, to select between lower and upper. |
295 | 0 | const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31); |
296 | | // Finally: the next 8 probed 32-bit values, in probing sequence order. |
297 | 0 | const __m256i value_vector = |
298 | 0 | _mm256_blendv_epi8(lower, upper, upper_lower_selector); |
299 | | // END Option 3 |
300 | | |
301 | | // We might not need to probe all 8, so build a mask for selecting only |
302 | | // what we need. (The k_selector(s) could be pre-computed but that |
303 | | // doesn't seem to make a noticeable performance difference.) |
304 | 0 | const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); |
305 | | // Subtract rem_probes from each of those constants |
306 | 0 | __m256i k_selector = |
307 | 0 | _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes)); |
308 | | // Negative after subtract -> use/select |
309 | | // Keep only high bit (logical shift right each by 31). |
310 | 0 | k_selector = _mm256_srli_epi32(k_selector, 31); |
311 | | |
312 | | // Strip off the 4 bit word address (shift left) |
313 | 0 | __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4); |
314 | | // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses. |
315 | 0 | bit_addresses = _mm256_srli_epi32(bit_addresses, 27); |
316 | | // Build a bit mask |
317 | 0 | const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses); |
318 | | |
319 | | // Like ((~value_vector) & bit_mask) == 0) |
320 | 0 | bool match = _mm256_testc_si256(value_vector, bit_mask) != 0; |
321 | | |
322 | | // This check first so that it's easy for branch predictor to optimize |
323 | | // num_probes <= 8 case, making it free of unpredictable branches. |
324 | 0 | if (rem_probes <= 8) { |
325 | 0 | return match; |
326 | 0 | } else if (!match) { |
327 | 0 | return false; |
328 | 0 | } |
329 | | // otherwise |
330 | | // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power |
331 | 0 | h *= 0xab25f4c1; |
332 | 0 | rem_probes -= 8; |
333 | 0 | } |
334 | | #else |
335 | | for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) { |
336 | | // 9-bit address within 512 bit cache line |
337 | | int bitpos = h >> (32 - 9); |
338 | | if ((data_at_cache_line[bitpos >> 3] & (char(1) << (bitpos & 7))) == 0) { |
339 | | return false; |
340 | | } |
341 | | } |
342 | | return true; |
343 | | #endif |
344 | 0 | } |
345 | | }; |
346 | | |
347 | | // A legacy Bloom filter implementation with no locality of probes (slow). |
348 | | // It uses double hashing to generate a sequence of hash values. |
349 | | // Asymptotic analysis is in [Kirsch,Mitzenmacher 2006], but known to have |
350 | | // subtle accuracy flaws for practical sizes [Dillinger,Manolios 2004]. |
351 | | // |
352 | | // DO NOT REUSE |
353 | | // |
354 | | class LegacyNoLocalityBloomImpl { |
355 | | public: |
356 | 0 | static inline int ChooseNumProbes(int bits_per_key) { |
357 | | // We intentionally round down to reduce probing cost a little bit |
358 | 0 | int num_probes = static_cast<int>(bits_per_key * 0.69); // 0.69 =~ ln(2) |
359 | 0 | if (num_probes < 1) num_probes = 1; |
360 | 0 | if (num_probes > 30) num_probes = 30; |
361 | 0 | return num_probes; |
362 | 0 | } |
363 | | |
364 | | static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes, |
365 | 0 | char* data) { |
366 | 0 | const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits |
367 | 0 | for (int i = 0; i < num_probes; i++) { |
368 | 0 | const uint32_t bitpos = h % total_bits; |
369 | 0 | data[bitpos / 8] |= (1 << (bitpos % 8)); |
370 | 0 | h += delta; |
371 | 0 | } |
372 | 0 | } |
373 | | |
374 | | static inline bool HashMayMatch(uint32_t h, uint32_t total_bits, |
375 | 0 | int num_probes, const char* data) { |
376 | 0 | const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits |
377 | 0 | for (int i = 0; i < num_probes; i++) { |
378 | 0 | const uint32_t bitpos = h % total_bits; |
379 | 0 | if ((data[bitpos / 8] & (1 << (bitpos % 8))) == 0) { |
380 | 0 | return false; |
381 | 0 | } |
382 | 0 | h += delta; |
383 | 0 | } |
384 | 0 | return true; |
385 | 0 | } |
386 | | }; |
387 | | |
388 | | // A legacy Bloom filter implementation with probes local to a single |
389 | | // cache line (fast). Because SST files might be transported between |
390 | | // platforms, the cache line size is a parameter rather than hard coded. |
391 | | // (But if specified as a constant parameter, an optimizing compiler |
392 | | // should take advantage of that.) |
393 | | // |
394 | | // When ExtraRotates is false, this implementation is notably deficient in |
395 | | // accuracy. Specifically, it uses double hashing with a 1/512 chance of the |
396 | | // increment being zero (when cache line size is 512 bits). Thus, there's a |
397 | | // 1/512 chance of probing only one index, which we'd expect to incur about |
398 | | // a 1/2 * 1/512 or absolute 0.1% FP rate penalty. More detail at |
399 | | // https://github.com/facebook/rocksdb/issues/4120 |
400 | | // |
401 | | // DO NOT REUSE |
402 | | // |
403 | | template <bool ExtraRotates> |
404 | | class LegacyLocalityBloomImpl { |
405 | | private: |
406 | 0 | static inline uint32_t GetLine(uint32_t h, uint32_t num_lines) { |
407 | 0 | uint32_t offset_h = ExtraRotates ? (h >> 11) | (h << 21) : h; |
408 | 0 | return offset_h % num_lines; |
409 | 0 | } Unexecuted instantiation: rocksdb::LegacyLocalityBloomImpl<false>::GetLine(unsigned int, unsigned int) Unexecuted instantiation: rocksdb::LegacyLocalityBloomImpl<true>::GetLine(unsigned int, unsigned int) |
410 | | |
411 | | public: |
412 | | // NOTE: this has only been validated to enough accuracy for producing |
413 | | // reasonable warnings / user feedback, not for making functional decisions. |
414 | 0 | static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) { |
415 | 0 | double bits_per_key = 8.0 * bytes / keys; |
416 | 0 | double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes, |
417 | 0 | /*cache line bits*/ 512); |
418 | 0 | if (!ExtraRotates) { |
419 | | // Good estimate of impact of flaw in index computation. |
420 | | // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key. |
421 | | // The + 22 shifts it nicely to fit for lower bits/key. |
422 | 0 | filter_rate += 0.1 / (bits_per_key * 0.75 + 22); |
423 | 0 | } else { |
424 | | // Not yet validated |
425 | 0 | assert(false); |
426 | 0 | } |
427 | | // Always uses 32-bit hash |
428 | 0 | double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32); |
429 | 0 | return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate); |
430 | 0 | } |
431 | | |
432 | | static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes, |
433 | 0 | char* data, int log2_cache_line_bytes) { |
434 | 0 | const int log2_cache_line_bits = log2_cache_line_bytes + 3; |
435 | |
|
436 | 0 | char* data_at_offset = |
437 | 0 | data + (GetLine(h, num_lines) << log2_cache_line_bytes); |
438 | 0 | const uint32_t delta = (h >> 17) | (h << 15); |
439 | 0 | for (int i = 0; i < num_probes; ++i) { |
440 | | // Mask to bit-within-cache-line address |
441 | 0 | const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1); |
442 | 0 | data_at_offset[bitpos / 8] |= (1 << (bitpos % 8)); |
443 | 0 | if (ExtraRotates) { |
444 | 0 | h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits)); |
445 | 0 | } |
446 | 0 | h += delta; |
447 | 0 | } |
448 | 0 | } Unexecuted instantiation: rocksdb::LegacyLocalityBloomImpl<false>::AddHash(unsigned int, unsigned int, int, char*, int) Unexecuted instantiation: rocksdb::LegacyLocalityBloomImpl<true>::AddHash(unsigned int, unsigned int, int, char*, int) |
449 | | |
450 | | static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines, |
451 | | const char* data, |
452 | | uint32_t /*out*/* byte_offset, |
453 | 0 | int log2_cache_line_bytes) { |
454 | 0 | uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes; |
455 | 0 | PREFETCH(data + b, 0 /* rw */, 1 /* locality */); |
456 | 0 | PREFETCH(data + b + ((1 << log2_cache_line_bytes) - 1), 0 /* rw */, |
457 | 0 | 1 /* locality */); |
458 | 0 | *byte_offset = b; |
459 | 0 | } Unexecuted instantiation: rocksdb::LegacyLocalityBloomImpl<false>::PrepareHashMayMatch(unsigned int, unsigned int, char const*, unsigned int*, int) Unexecuted instantiation: rocksdb::LegacyLocalityBloomImpl<true>::PrepareHashMayMatch(unsigned int, unsigned int, char const*, unsigned int*, int) |
460 | | |
461 | | static inline bool HashMayMatch(uint32_t h, uint32_t num_lines, |
462 | | int num_probes, const char* data, |
463 | 0 | int log2_cache_line_bytes) { |
464 | 0 | uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes; |
465 | 0 | return HashMayMatchPrepared(h, num_probes, data + b, log2_cache_line_bytes); |
466 | 0 | } |
467 | | |
468 | | static inline bool HashMayMatchPrepared(uint32_t h, int num_probes, |
469 | | const char* data_at_offset, |
470 | 0 | int log2_cache_line_bytes) { |
471 | 0 | const int log2_cache_line_bits = log2_cache_line_bytes + 3; |
472 | |
|
473 | 0 | const uint32_t delta = (h >> 17) | (h << 15); |
474 | 0 | for (int i = 0; i < num_probes; ++i) { |
475 | | // Mask to bit-within-cache-line address |
476 | 0 | const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1); |
477 | 0 | if (((data_at_offset[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { |
478 | 0 | return false; |
479 | 0 | } |
480 | 0 | if (ExtraRotates) { |
481 | 0 | h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits)); |
482 | 0 | } |
483 | 0 | h += delta; |
484 | 0 | } |
485 | 0 | return true; |
486 | 0 | } Unexecuted instantiation: rocksdb::LegacyLocalityBloomImpl<false>::HashMayMatchPrepared(unsigned int, int, char const*, int) Unexecuted instantiation: rocksdb::LegacyLocalityBloomImpl<true>::HashMayMatchPrepared(unsigned int, int, char const*, int) |
487 | | }; |
488 | | |
489 | | } // namespace ROCKSDB_NAMESPACE |