/src/tarantool/src/lib/salad/bloom.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef TARANTOOL_LIB_SALAD_BLOOM_H_INCLUDED |
2 | | #define TARANTOOL_LIB_SALAD_BLOOM_H_INCLUDED |
3 | | /* |
4 | | * Copyright 2010-2019, Tarantool AUTHORS, please see AUTHORS file. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or |
7 | | * without modification, are permitted provided that the following |
8 | | * conditions are met: |
9 | | * |
10 | | * 1. Redistributions of source code must retain the above |
11 | | * copyright notice, this list of conditions and the |
12 | | * following disclaimer. |
13 | | * |
14 | | * 2. Redistributions in binary form must reproduce the above |
15 | | * copyright notice, this list of conditions and the following |
16 | | * disclaimer in the documentation and/or other materials |
17 | | * provided with the distribution. |
18 | | * |
19 | | * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND |
20 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
22 | | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
23 | | * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, |
24 | | * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
25 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
26 | | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
27 | | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
28 | | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
29 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF |
30 | | * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
31 | | * SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | /* Classic bloom filter with several improvements |
35 | | * 1) Cache oblivious: |
36 | | * Putze, F.; Sanders, P.; Singler, J. (2007), |
37 | | * "Cache-, Hash- and Space-Efficient Bloom Filters" |
38 | | * http://algo2.iti.kit.edu/singler/publications/cacheefficientbloomfilters-wea2007.pdf |
39 | | * 2) Fast hash function calculation: |
40 | | * Kirsch, Adam; Mitzenmacher, Michael (2006) |
41 | | * "Less Hashing, Same Performance: Building a Better Bloom Filter" |
42 | | * https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf |
43 | | * 3) Using only one hash value that is splitted into several independent parts |
44 | | */ |
45 | | |
46 | | #include <stdint.h> |
47 | | #include <stdbool.h> |
48 | | #include <stddef.h> |
49 | | #include <limits.h> |
50 | | #include "bit/bit.h" |
51 | | |
52 | | #if defined(__cplusplus) |
53 | | extern "C" { |
54 | | #endif /* defined(__cplusplus) */ |
55 | | |
56 | | enum { |
57 | | /* Expected cache line of target processor */ |
58 | | BLOOM_CACHE_LINE = 64, |
59 | | }; |
60 | | |
61 | | typedef uint32_t bloom_hash_t; |
62 | | |
63 | | /** |
64 | | * Cache-line-size block of bloom filter |
65 | | */ |
66 | | struct bloom_block { |
67 | | unsigned char bits[BLOOM_CACHE_LINE]; |
68 | | }; |
69 | | |
70 | | /** |
71 | | * Bloom filter data structure |
72 | | */ |
73 | | struct bloom { |
74 | | /* Number of buckets (blocks) in the table */ |
75 | | uint32_t table_size; |
76 | | /* Number of hash function per value */ |
77 | | uint16_t hash_count; |
78 | | /* Bit field table */ |
79 | | struct bloom_block *table; |
80 | | }; |
81 | | |
82 | | /* {{{ API declaration */ |
83 | | |
84 | | /** |
85 | | * Allocate and initialize an instance of bloom filter |
86 | | * |
87 | | * @param bloom - structure to initialize |
88 | | * @param number_of_values - estimated number of values to be added |
89 | | * @param false_positive_rate - desired false positive rate |
90 | | * @return 0 - OK, -1 - memory error |
91 | | */ |
92 | | int |
93 | | bloom_create(struct bloom *bloom, uint32_t number_of_values, |
94 | | double false_positive_rate); |
95 | | |
96 | | /** |
97 | | * Free resources of the bloom filter |
98 | | * |
99 | | * @param bloom - the bloom filter |
100 | | */ |
101 | | void |
102 | | bloom_destroy(struct bloom *bloom); |
103 | | |
104 | | /** |
105 | | * Add a value into the data set |
106 | | * @param bloom - the bloom filter |
107 | | * @param hash - hash of the value |
108 | | */ |
109 | | static void |
110 | | bloom_add(struct bloom *bloom, bloom_hash_t hash); |
111 | | |
112 | | /** |
113 | | * Query for presence of a value in the data set |
114 | | * @param bloom - the bloom filter |
115 | | * @param hash - hash of the value |
116 | | * @return true - the value could be in data set; false - the value is |
117 | | * definitively not in data set |
118 | | * |
119 | | */ |
120 | | static bool |
121 | | bloom_maybe_has(const struct bloom *bloom, bloom_hash_t hash); |
122 | | |
123 | | /** |
124 | | * Return the expected false positive rate of a bloom filter. |
125 | | * @param bloom - the bloom filter |
126 | | * @param number_of_values - number of values stored in the filter |
127 | | * @return - expected false positive rate |
128 | | */ |
129 | | double |
130 | | bloom_fpr(const struct bloom *bloom, uint32_t number_of_values); |
131 | | |
132 | | /** |
133 | | * Calculate size of a buffer that is needed for storing bloom table |
134 | | * @param bloom - the bloom filter to store |
135 | | * @return - Exact size |
136 | | */ |
137 | | size_t |
138 | | bloom_store_size(const struct bloom *bloom); |
139 | | |
140 | | /** |
141 | | * Store bloom filter table to the given buffer |
142 | | * Other struct bloom members must be stored manually. |
143 | | * @param bloom - the bloom filter to store |
144 | | * @param table - buffer to store to |
145 | | * #return - end of written buffer |
146 | | */ |
147 | | char * |
148 | | bloom_store(const struct bloom *bloom, char *table); |
149 | | |
150 | | /** |
151 | | * Allocate table and load it from given buffer. |
152 | | * Other struct bloom members must be loaded manually. |
153 | | * |
154 | | * @param bloom - structure to load to |
155 | | * @param table - data to load |
156 | | * @return 0 - OK, -1 - memory error |
157 | | */ |
158 | | int |
159 | | bloom_load_table(struct bloom *bloom, const char *table); |
160 | | |
161 | | /* }}} API declaration */ |
162 | | |
163 | | /* {{{ API definition */ |
164 | | |
165 | | static inline void |
166 | | bloom_add(struct bloom *bloom, bloom_hash_t hash) |
167 | 0 | { |
168 | | /* Using lower part of the has for finding a block */ |
169 | 0 | bloom_hash_t pos = hash % bloom->table_size; |
170 | 0 | hash = hash / bloom->table_size; |
171 | | /* __builtin_prefetch(bloom->table + pos, 1); */ |
172 | 0 | const bloom_hash_t bloom_block_bits = BLOOM_CACHE_LINE * CHAR_BIT; |
173 | | /* bit_no in block is less than bloom_block_bits (512). |
174 | | * split the given hash into independent lower part and high part. */ |
175 | 0 | bloom_hash_t hash2 = hash / bloom_block_bits + 1; |
176 | 0 | for (bloom_hash_t i = 0; i < bloom->hash_count; i++) { |
177 | 0 | bloom_hash_t bit_no = hash % bloom_block_bits; |
178 | 0 | bit_set(bloom->table[pos].bits, bit_no); |
179 | | /* Combine two hashes to create required number of hashes */ |
180 | | /* Add i**2 for better distribution */ |
181 | 0 | hash += hash2 + i * i; |
182 | 0 | } |
183 | 0 | } Unexecuted instantiation: vy_stmt.c:bloom_add Unexecuted instantiation: vy_run.c:bloom_add Unexecuted instantiation: tuple_bloom.c:bloom_add Unexecuted instantiation: bloom.c:bloom_add |
184 | | |
185 | | static inline bool |
186 | | bloom_maybe_has(const struct bloom *bloom, bloom_hash_t hash) |
187 | 0 | { |
188 | | /* Using lower part of the has for finding a block */ |
189 | 0 | bloom_hash_t pos = hash % bloom->table_size; |
190 | 0 | hash = hash / bloom->table_size; |
191 | | /* __builtin_prefetch(bloom->table + pos, 0); */ |
192 | 0 | const bloom_hash_t bloom_block_bits = BLOOM_CACHE_LINE * CHAR_BIT; |
193 | | /* bit_no in block is less than bloom_block_bits (512). |
194 | | * split the given hash into independent lower part and high part. */ |
195 | 0 | bloom_hash_t hash2 = hash / bloom_block_bits + 1; |
196 | 0 | for (bloom_hash_t i = 0; i < bloom->hash_count; i++) { |
197 | 0 | bloom_hash_t bit_no = hash % bloom_block_bits; |
198 | 0 | if (!bit_test(bloom->table[pos].bits, bit_no)) |
199 | 0 | return false; |
200 | | /* Combine two hashes to create required number of hashes */ |
201 | | /* Add i**2 for better distribution */ |
202 | 0 | hash += hash2 + i * i; |
203 | 0 | } |
204 | 0 | return true; |
205 | 0 | } Unexecuted instantiation: vy_stmt.c:bloom_maybe_has Unexecuted instantiation: vy_run.c:bloom_maybe_has Unexecuted instantiation: tuple_bloom.c:bloom_maybe_has Unexecuted instantiation: bloom.c:bloom_maybe_has |
206 | | |
207 | | /* }}} API definition */ |
208 | | |
209 | | #if defined(__cplusplus) |
210 | | } /* extern "C" */ |
211 | | #endif /* defined(__cplusplus) */ |
212 | | |
213 | | #endif /* TARANTOOL_LIB_SALAD_BLOOM_H_INCLUDED */ |