/src/postgres/src/backend/access/brin/brin_bloom.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * brin_bloom.c |
3 | | * Implementation of Bloom opclass for BRIN |
4 | | * |
5 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
6 | | * Portions Copyright (c) 1994, Regents of the University of California |
7 | | * |
8 | | * |
9 | | * A BRIN opclass summarizing page range into a bloom filter. |
10 | | * |
11 | | * Bloom filters allow efficient testing whether a given page range contains |
12 | | * a particular value. Therefore, if we summarize each page range into a small |
13 | | * bloom filter, we can easily (and cheaply) test whether it contains values |
14 | | * we get later. |
15 | | * |
16 | | * The index only supports equality operators, similarly to hash indexes. |
17 | | * Bloom indexes are however much smaller, and support only bitmap scans. |
18 | | * |
19 | | * Note: Don't confuse this with bloom indexes, implemented in a contrib |
20 | | * module. That extension implements an entirely new AM, building a bloom |
21 | | * filter on multiple columns in a single row. This opclass works with an |
22 | | * existing AM (BRIN) and builds bloom filter on a column. |
23 | | * |
24 | | * |
25 | | * values vs. hashes |
26 | | * ----------------- |
27 | | * |
28 | | * The original column values are not used directly, but are first hashed |
29 | | * using the regular type-specific hash function, producing a uint32 hash. |
30 | | * And this hash value is then added to the summary - i.e. it's hashed |
31 | | * again and added to the bloom filter. |
32 | | * |
33 | | * This allows the code to treat all data types (byval/byref/...) the same |
34 | | * way, with only minimal space requirements, because we're working with |
35 | | * hashes and not the original values. Everything is uint32. |
36 | | * |
37 | | * Of course, this assumes the built-in hash function is reasonably good, |
38 | | * without too many collisions etc. But that does seem to be the case, at |
39 | | * least based on past experience. After all, the same hash functions are |
40 | | * used for hash indexes, hash partitioning and so on. |
41 | | * |
42 | | * |
43 | | * hashing scheme |
44 | | * -------------- |
45 | | * |
46 | | * Bloom filters require a number of independent hash functions. There are |
47 | | * different schemes how to construct them - for example we might use |
48 | | * hash_uint32_extended with random seeds, but that seems fairly expensive. |
49 | | * We use a scheme requiring only two functions described in this paper: |
50 | | * |
51 | | * Less Hashing, Same Performance:Building a Better Bloom Filter |
52 | | * Adam Kirsch, Michael Mitzenmacher, Harvard School of Engineering and |
53 | | * Applied Sciences, Cambridge, Massachusetts [DOI 10.1002/rsa.20208] |
54 | | * |
55 | | * The two hash functions h1 and h2 are calculated using hard-coded seeds, |
56 | | * and then combined using (h1 + i * h2) to generate the hash functions. |
57 | | * |
58 | | * |
59 | | * sizing the bloom filter |
60 | | * ----------------------- |
61 | | * |
62 | | * Size of a bloom filter depends on the number of distinct values we will |
63 | | * store in it, and the desired false positive rate. The higher the number |
64 | | * of distinct values and/or the lower the false positive rate, the larger |
65 | | * the bloom filter. On the other hand, we want to keep the index as small |
66 | | * as possible - that's one of the basic advantages of BRIN indexes. |
67 | | * |
68 | | * Although the number of distinct elements (in a page range) depends on |
69 | | * the data, we can consider it fixed. This simplifies the trade-off to |
70 | | * just false positive rate vs. size. |
71 | | * |
72 | | * At the page range level, false positive rate is a probability the bloom |
73 | | * filter matches a random value. For the whole index (with sufficiently |
74 | | * many page ranges) it represents the fraction of the index ranges (and |
75 | | * thus fraction of the table to be scanned) matching the random value. |
76 | | * |
77 | | * Furthermore, the size of the bloom filter is subject to implementation |
78 | | * limits - it has to fit onto a single index page (8kB by default). As |
79 | | * the bitmap is inherently random (when "full" about half the bits is set |
80 | | * to 1, randomly), compression can't help very much. |
81 | | * |
82 | | * To reduce the size of a filter (to fit to a page), we have to either |
83 | | * accept higher false positive rate (undesirable), or reduce the number |
84 | | * of distinct items to be stored in the filter. We can't alter the input |
85 | | * data, of course, but we may make the BRIN page ranges smaller - instead |
86 | | * of the default 128 pages (1MB) we may build index with 16-page ranges, |
87 | | * or something like that. This should reduce the number of distinct values |
88 | | * in the page range, making the filter smaller (with fixed false positive |
89 | | * rate). Even for random data sets this should help, as the number of rows |
90 | | * per heap page is limited (to ~290 with very narrow tables, likely ~20 |
91 | | * in practice). |
92 | | * |
93 | | * Of course, good sizing decisions depend on having the necessary data, |
94 | | * i.e. number of distinct values in a page range (of a given size) and |
95 | | * table size (to estimate cost change due to change in false positive |
96 | | * rate due to having larger index vs. scanning larger indexes). We may |
97 | | * not have that data - for example when building an index on empty table |
98 | | * it's not really possible. And for some data we only have estimates for |
99 | | * the whole table and we can only estimate per-range values (ndistinct). |
100 | | * |
101 | | * Another challenge is that while the bloom filter is per-column, it's |
102 | | * the whole index tuple that has to fit into a page. And for multi-column |
103 | | * indexes that may include pieces we have no control over (not necessarily |
104 | | * bloom filters, the other columns may use other BRIN opclasses). So it's |
105 | | * not entirely clear how to distribute the space between those columns. |
106 | | * |
107 | | * The current logic, implemented in brin_bloom_get_ndistinct, attempts to |
108 | | * make some basic sizing decisions, based on the size of BRIN ranges, and |
109 | | * the maximum number of rows per range. |
110 | | * |
111 | | * |
112 | | * IDENTIFICATION |
113 | | * src/backend/access/brin/brin_bloom.c |
114 | | */ |
115 | | #include "postgres.h" |
116 | | |
117 | | #include <math.h> |
118 | | |
119 | | #include "access/brin.h" |
120 | | #include "access/brin_internal.h" |
121 | | #include "access/brin_page.h" |
122 | | #include "access/brin_tuple.h" |
123 | | #include "access/genam.h" |
124 | | #include "access/htup_details.h" |
125 | | #include "access/reloptions.h" |
126 | | #include "catalog/pg_am.h" |
127 | | #include "catalog/pg_type.h" |
128 | | #include "common/hashfn.h" |
129 | | #include "utils/fmgrprotos.h" |
130 | | #include "utils/rel.h" |
131 | | |
132 | 0 | #define BloomEqualStrategyNumber 1 |
133 | | |
134 | | /* |
135 | | * Additional SQL level support functions. We only have one, which is |
136 | | * used to calculate hash of the input value. |
137 | | * |
138 | | * Procedure numbers must not use values reserved for BRIN itself; see |
139 | | * brin_internal.h. |
140 | | */ |
141 | | #define BLOOM_MAX_PROCNUMS 1 /* maximum support procs we need */ |
142 | 0 | #define PROCNUM_HASH 11 /* required */ |
143 | | |
144 | | /* |
145 | | * Subtract this from procnum to obtain index in BloomOpaque arrays |
146 | | * (Must be equal to minimum of private procnums). |
147 | | */ |
148 | 0 | #define PROCNUM_BASE 11 |
149 | | |
150 | | /* |
151 | | * Storage type for BRIN's reloptions. |
152 | | */ |
153 | | typedef struct BloomOptions |
154 | | { |
155 | | int32 vl_len_; /* varlena header (do not touch directly!) */ |
156 | | double nDistinctPerRange; /* number of distinct values per range */ |
157 | | double falsePositiveRate; /* false positive for bloom filter */ |
158 | | } BloomOptions; |
159 | | |
160 | | /* |
161 | | * The current min value (16) is somewhat arbitrary, but it's based |
162 | | * on the fact that the filter header is ~20B alone, which is about |
163 | | * the same as the filter bitmap for 16 distinct items with 1% false |
164 | | * positive rate. So by allowing lower values we'd not gain much. In |
165 | | * any case, the min should not be larger than MaxHeapTuplesPerPage |
166 | | * (~290), which is the theoretical maximum for single-page ranges. |
167 | | */ |
168 | | #define BLOOM_MIN_NDISTINCT_PER_RANGE 16 |
169 | | |
170 | | /* |
171 | | * Used to determine number of distinct items, based on the number of rows |
172 | | * in a page range. The 10% is somewhat similar to what estimate_num_groups |
173 | | * does, so we use the same factor here. |
174 | | */ |
175 | 0 | #define BLOOM_DEFAULT_NDISTINCT_PER_RANGE -0.1 /* 10% of values */ |
176 | | |
177 | | /* |
178 | | * Allowed range and default value for the false positive range. The exact |
179 | | * values are somewhat arbitrary, but were chosen considering the various |
180 | | * parameters (size of filter vs. page size, etc.). |
181 | | * |
182 | | * The lower the false-positive rate, the more accurate the filter is, but |
183 | | * it also gets larger - at some point this eliminates the main advantage |
184 | | * of BRIN indexes, which is the tiny size. At 0.01% the index is about |
185 | | * 10% of the table (assuming 290 distinct values per 8kB page). |
186 | | * |
187 | | * On the other hand, as the false-positive rate increases, larger part of |
188 | | * the table has to be scanned due to mismatches - at 25% we're probably |
189 | | * close to sequential scan being cheaper. |
190 | | */ |
191 | 0 | #define BLOOM_MIN_FALSE_POSITIVE_RATE 0.0001 /* 0.01% fp rate */ |
192 | 0 | #define BLOOM_MAX_FALSE_POSITIVE_RATE 0.25 /* 25% fp rate */ |
193 | 0 | #define BLOOM_DEFAULT_FALSE_POSITIVE_RATE 0.01 /* 1% fp rate */ |
194 | | |
195 | | #define BloomGetNDistinctPerRange(opts) \ |
196 | 0 | ((opts) && (((BloomOptions *) (opts))->nDistinctPerRange != 0) ? \ |
197 | 0 | (((BloomOptions *) (opts))->nDistinctPerRange) : \ |
198 | 0 | BLOOM_DEFAULT_NDISTINCT_PER_RANGE) |
199 | | |
200 | | #define BloomGetFalsePositiveRate(opts) \ |
201 | 0 | ((opts) && (((BloomOptions *) (opts))->falsePositiveRate != 0.0) ? \ |
202 | 0 | (((BloomOptions *) (opts))->falsePositiveRate) : \ |
203 | 0 | BLOOM_DEFAULT_FALSE_POSITIVE_RATE) |
204 | | |
205 | | /* |
206 | | * And estimate of the largest bloom we can fit onto a page. This is not |
207 | | * a perfect guarantee, for a couple of reasons. For example, the row may |
208 | | * be larger because the index has multiple columns. |
209 | | */ |
210 | | #define BloomMaxFilterSize \ |
211 | 0 | MAXALIGN_DOWN(BLCKSZ - \ |
212 | | (MAXALIGN(SizeOfPageHeaderData + \ |
213 | | sizeof(ItemIdData)) + \ |
214 | | MAXALIGN(sizeof(BrinSpecialSpace)) + \ |
215 | | SizeOfBrinTuple)) |
216 | | |
217 | | /* |
218 | | * Seeds used to calculate two hash functions h1 and h2, which are then used |
219 | | * to generate k hashes using the (h1 + i * h2) scheme. |
220 | | */ |
221 | 0 | #define BLOOM_SEED_1 0x71d924af |
222 | 0 | #define BLOOM_SEED_2 0xba48b314 |
223 | | |
224 | | /* |
225 | | * Bloom Filter |
226 | | * |
227 | | * Represents a bloom filter, built on hashes of the indexed values. That is, |
228 | | * we compute a uint32 hash of the value, and then store this hash into the |
229 | | * bloom filter (and compute additional hashes on it). |
230 | | * |
231 | | * XXX We could implement "sparse" bloom filters, keeping only the bytes that |
232 | | * are not entirely 0. But while indexes don't support TOAST, the varlena can |
233 | | * still be compressed. So this seems unnecessary, because the compression |
234 | | * should do the same job. |
235 | | * |
236 | | * XXX We can also watch the number of bits set in the bloom filter, and then |
237 | | * stop using it (and not store the bitmap, to save space) when the false |
238 | | * positive rate gets too high. But even if the false positive rate exceeds the |
239 | | * desired value, it still can eliminate some page ranges. |
240 | | */ |
241 | | typedef struct BloomFilter |
242 | | { |
243 | | /* varlena header (do not touch directly!) */ |
244 | | int32 vl_len_; |
245 | | |
246 | | /* space for various flags (unused for now) */ |
247 | | uint16 flags; |
248 | | |
249 | | /* fields for the HASHED phase */ |
250 | | uint8 nhashes; /* number of hash functions */ |
251 | | uint32 nbits; /* number of bits in the bitmap (size) */ |
252 | | uint32 nbits_set; /* number of bits set to 1 */ |
253 | | |
254 | | /* data of the bloom filter */ |
255 | | char data[FLEXIBLE_ARRAY_MEMBER]; |
256 | | } BloomFilter; |
257 | | |
258 | | /* |
259 | | * bloom_filter_size |
260 | | * Calculate Bloom filter parameters (nbits, nbytes, nhashes). |
261 | | * |
262 | | * Given expected number of distinct values and desired false positive rate, |
263 | | * calculates the optimal parameters of the Bloom filter. |
264 | | * |
265 | | * The resulting parameters are returned through nbytesp (number of bytes), |
266 | | * nbitsp (number of bits) and nhashesp (number of hash functions). If a |
267 | | * pointer is NULL, the parameter is not returned. |
268 | | */ |
269 | | static void |
270 | | bloom_filter_size(int ndistinct, double false_positive_rate, |
271 | | int *nbytesp, int *nbitsp, int *nhashesp) |
272 | 0 | { |
273 | 0 | double k; |
274 | 0 | int nbits, |
275 | 0 | nbytes; |
276 | | |
277 | | /* sizing bloom filter: -(n * ln(p)) / (ln(2))^2 */ |
278 | 0 | nbits = ceil(-(ndistinct * log(false_positive_rate)) / pow(log(2.0), 2)); |
279 | | |
280 | | /* round m to whole bytes */ |
281 | 0 | nbytes = ((nbits + 7) / 8); |
282 | 0 | nbits = nbytes * 8; |
283 | | |
284 | | /* |
285 | | * round(log(2.0) * m / ndistinct), but assume round() may not be |
286 | | * available on Windows |
287 | | */ |
288 | 0 | k = log(2.0) * nbits / ndistinct; |
289 | 0 | k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k); |
290 | |
|
291 | 0 | if (nbytesp) |
292 | 0 | *nbytesp = nbytes; |
293 | |
|
294 | 0 | if (nbitsp) |
295 | 0 | *nbitsp = nbits; |
296 | |
|
297 | 0 | if (nhashesp) |
298 | 0 | *nhashesp = (int) k; |
299 | 0 | } |
300 | | |
301 | | /* |
302 | | * bloom_init |
303 | | * Initialize the Bloom Filter, allocate all the memory. |
304 | | * |
305 | | * The filter is initialized with optimal size for ndistinct expected values |
306 | | * and the requested false positive rate. The filter is stored as varlena. |
307 | | */ |
308 | | static BloomFilter * |
309 | | bloom_init(int ndistinct, double false_positive_rate) |
310 | 0 | { |
311 | 0 | Size len; |
312 | 0 | BloomFilter *filter; |
313 | |
|
314 | 0 | int nbits; /* size of filter / number of bits */ |
315 | 0 | int nbytes; /* size of filter / number of bytes */ |
316 | 0 | int nhashes; /* number of hash functions */ |
317 | |
|
318 | 0 | Assert(ndistinct > 0); |
319 | 0 | Assert(false_positive_rate > 0 && false_positive_rate < 1); |
320 | | |
321 | | /* calculate bloom filter size / parameters */ |
322 | 0 | bloom_filter_size(ndistinct, false_positive_rate, |
323 | 0 | &nbytes, &nbits, &nhashes); |
324 | | |
325 | | /* |
326 | | * Reject filters that are obviously too large to store on a page. |
327 | | * |
328 | | * Initially the bloom filter is just zeroes and so very compressible, but |
329 | | * as we add values it gets more and more random, and so less and less |
330 | | * compressible. So initially everything fits on the page, but we might |
331 | | * get surprising failures later - we want to prevent that, so we reject |
332 | | * bloom filter that are obviously too large. |
333 | | * |
334 | | * XXX It's not uncommon to oversize the bloom filter a bit, to defend |
335 | | * against unexpected data anomalies (parts of table with more distinct |
336 | | * values per range etc.). But we still need to make sure even the |
337 | | * oversized filter fits on page, if such need arises. |
338 | | * |
339 | | * XXX This check is not perfect, because the index may have multiple |
340 | | * filters that are small individually, but too large when combined. |
341 | | */ |
342 | 0 | if (nbytes > BloomMaxFilterSize) |
343 | 0 | elog(ERROR, "the bloom filter is too large (%d > %zu)", nbytes, |
344 | 0 | BloomMaxFilterSize); |
345 | | |
346 | | /* |
347 | | * We allocate the whole filter. Most of it is going to be 0 bits, so the |
348 | | * varlena is easy to compress. |
349 | | */ |
350 | 0 | len = offsetof(BloomFilter, data) + nbytes; |
351 | |
|
352 | 0 | filter = (BloomFilter *) palloc0(len); |
353 | |
|
354 | 0 | filter->flags = 0; |
355 | 0 | filter->nhashes = nhashes; |
356 | 0 | filter->nbits = nbits; |
357 | |
|
358 | 0 | SET_VARSIZE(filter, len); |
359 | |
|
360 | 0 | return filter; |
361 | 0 | } |
362 | | |
363 | | |
364 | | /* |
365 | | * bloom_add_value |
366 | | * Add value to the bloom filter. |
367 | | */ |
368 | | static BloomFilter * |
369 | | bloom_add_value(BloomFilter *filter, uint32 value, bool *updated) |
370 | 0 | { |
371 | 0 | int i; |
372 | 0 | uint64 h1, |
373 | 0 | h2; |
374 | | |
375 | | /* compute the hashes, used for the bloom filter */ |
376 | 0 | h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits; |
377 | 0 | h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits; |
378 | | |
379 | | /* compute the requested number of hashes */ |
380 | 0 | for (i = 0; i < filter->nhashes; i++) |
381 | 0 | { |
382 | | /* h1 + h2 + f(i) */ |
383 | 0 | uint32 h = (h1 + i * h2) % filter->nbits; |
384 | 0 | uint32 byte = (h / 8); |
385 | 0 | uint32 bit = (h % 8); |
386 | | |
387 | | /* if the bit is not set, set it and remember we did that */ |
388 | 0 | if (!(filter->data[byte] & (0x01 << bit))) |
389 | 0 | { |
390 | 0 | filter->data[byte] |= (0x01 << bit); |
391 | 0 | filter->nbits_set++; |
392 | 0 | if (updated) |
393 | 0 | *updated = true; |
394 | 0 | } |
395 | 0 | } |
396 | |
|
397 | 0 | return filter; |
398 | 0 | } |
399 | | |
400 | | |
401 | | /* |
402 | | * bloom_contains_value |
403 | | * Check if the bloom filter contains a particular value. |
404 | | */ |
405 | | static bool |
406 | | bloom_contains_value(BloomFilter *filter, uint32 value) |
407 | 0 | { |
408 | 0 | int i; |
409 | 0 | uint64 h1, |
410 | 0 | h2; |
411 | | |
412 | | /* calculate the two hashes */ |
413 | 0 | h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits; |
414 | 0 | h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits; |
415 | | |
416 | | /* compute the requested number of hashes */ |
417 | 0 | for (i = 0; i < filter->nhashes; i++) |
418 | 0 | { |
419 | | /* h1 + h2 + f(i) */ |
420 | 0 | uint32 h = (h1 + i * h2) % filter->nbits; |
421 | 0 | uint32 byte = (h / 8); |
422 | 0 | uint32 bit = (h % 8); |
423 | | |
424 | | /* if the bit is not set, the value is not there */ |
425 | 0 | if (!(filter->data[byte] & (0x01 << bit))) |
426 | 0 | return false; |
427 | 0 | } |
428 | | |
429 | | /* all hashes found in bloom filter */ |
430 | 0 | return true; |
431 | 0 | } |
432 | | |
433 | | typedef struct BloomOpaque |
434 | | { |
435 | | /* |
436 | | * XXX At this point we only need a single proc (to compute the hash), but |
437 | | * let's keep the array just like inclusion and minmax opclasses, for |
438 | | * consistency. We may need additional procs in the future. |
439 | | */ |
440 | | FmgrInfo extra_procinfos[BLOOM_MAX_PROCNUMS]; |
441 | | } BloomOpaque; |
442 | | |
443 | | static FmgrInfo *bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, |
444 | | uint16 procnum); |
445 | | |
446 | | |
447 | | Datum |
448 | | brin_bloom_opcinfo(PG_FUNCTION_ARGS) |
449 | 0 | { |
450 | 0 | BrinOpcInfo *result; |
451 | | |
452 | | /* |
453 | | * opaque->strategy_procinfos is initialized lazily; here it is set to |
454 | | * all-uninitialized by palloc0 which sets fn_oid to InvalidOid. |
455 | | * |
456 | | * bloom indexes only store the filter as a single BYTEA column |
457 | | */ |
458 | |
|
459 | 0 | result = palloc0(MAXALIGN(SizeofBrinOpcInfo(1)) + |
460 | 0 | sizeof(BloomOpaque)); |
461 | 0 | result->oi_nstored = 1; |
462 | 0 | result->oi_regular_nulls = true; |
463 | 0 | result->oi_opaque = (BloomOpaque *) |
464 | 0 | MAXALIGN((char *) result + SizeofBrinOpcInfo(1)); |
465 | 0 | result->oi_typcache[0] = lookup_type_cache(PG_BRIN_BLOOM_SUMMARYOID, 0); |
466 | |
|
467 | 0 | PG_RETURN_POINTER(result); |
468 | 0 | } |
469 | | |
470 | | /* |
471 | | * brin_bloom_get_ndistinct |
472 | | * Determine the ndistinct value used to size bloom filter. |
473 | | * |
474 | | * Adjust the ndistinct value based on the pagesPerRange value. First, |
475 | | * if it's negative, it's assumed to be relative to maximum number of |
476 | | * tuples in the range (assuming each page gets MaxHeapTuplesPerPage |
477 | | * tuples, which is likely a significant over-estimate). We also clamp |
478 | | * the value, not to over-size the bloom filter unnecessarily. |
479 | | * |
480 | | * XXX We can only do this when the pagesPerRange value was supplied. |
481 | | * If it wasn't, it has to be a read-only access to the index, in which |
482 | | * case we don't really care. But perhaps we should fall-back to the |
483 | | * default pagesPerRange value? |
484 | | * |
485 | | * XXX We might also fetch info about ndistinct estimate for the column, |
486 | | * and compute the expected number of distinct values in a range. But |
487 | | * that may be tricky due to data being sorted in various ways, so it |
488 | | * seems better to rely on the upper estimate. |
489 | | * |
490 | | * XXX We might also calculate a better estimate of rows per BRIN range, |
491 | | * instead of using MaxHeapTuplesPerPage (which probably produces values |
492 | | * much higher than reality). |
493 | | */ |
494 | | static int |
495 | | brin_bloom_get_ndistinct(BrinDesc *bdesc, BloomOptions *opts) |
496 | 0 | { |
497 | 0 | double ndistinct; |
498 | 0 | double maxtuples; |
499 | 0 | BlockNumber pagesPerRange; |
500 | |
|
501 | 0 | pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index); |
502 | 0 | ndistinct = BloomGetNDistinctPerRange(opts); |
503 | |
|
504 | 0 | Assert(BlockNumberIsValid(pagesPerRange)); |
505 | |
|
506 | 0 | maxtuples = MaxHeapTuplesPerPage * pagesPerRange; |
507 | | |
508 | | /* |
509 | | * Similarly to n_distinct, negative values are relative - in this case to |
510 | | * maximum number of tuples in the page range (maxtuples). |
511 | | */ |
512 | 0 | if (ndistinct < 0) |
513 | 0 | ndistinct = (-ndistinct) * maxtuples; |
514 | | |
515 | | /* |
516 | | * Positive values are to be used directly, but we still apply a couple of |
517 | | * safeties to avoid using unreasonably small bloom filters. |
518 | | */ |
519 | 0 | ndistinct = Max(ndistinct, BLOOM_MIN_NDISTINCT_PER_RANGE); |
520 | | |
521 | | /* |
522 | | * And don't use more than the maximum possible number of tuples, in the |
523 | | * range, which would be entirely wasteful. |
524 | | */ |
525 | 0 | ndistinct = Min(ndistinct, maxtuples); |
526 | |
|
527 | 0 | return (int) ndistinct; |
528 | 0 | } |
529 | | |
530 | | /* |
531 | | * Examine the given index tuple (which contains partial status of a certain |
532 | | * page range) by comparing it to the given value that comes from another heap |
533 | | * tuple. If the new value is outside the bloom filter specified by the |
534 | | * existing tuple values, update the index tuple and return true. Otherwise, |
535 | | * return false and do not modify in this case. |
536 | | */ |
537 | | Datum |
538 | | brin_bloom_add_value(PG_FUNCTION_ARGS) |
539 | 0 | { |
540 | 0 | BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); |
541 | 0 | BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); |
542 | 0 | Datum newval = PG_GETARG_DATUM(2); |
543 | 0 | bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3); |
544 | 0 | BloomOptions *opts = (BloomOptions *) PG_GET_OPCLASS_OPTIONS(); |
545 | 0 | Oid colloid = PG_GET_COLLATION(); |
546 | 0 | FmgrInfo *hashFn; |
547 | 0 | uint32 hashValue; |
548 | 0 | bool updated = false; |
549 | 0 | AttrNumber attno; |
550 | 0 | BloomFilter *filter; |
551 | |
|
552 | 0 | Assert(!isnull); |
553 | |
|
554 | 0 | attno = column->bv_attno; |
555 | | |
556 | | /* |
557 | | * If this is the first non-null value, we need to initialize the bloom |
558 | | * filter. Otherwise just extract the existing bloom filter from |
559 | | * BrinValues. |
560 | | */ |
561 | 0 | if (column->bv_allnulls) |
562 | 0 | { |
563 | 0 | filter = bloom_init(brin_bloom_get_ndistinct(bdesc, opts), |
564 | 0 | BloomGetFalsePositiveRate(opts)); |
565 | 0 | column->bv_values[0] = PointerGetDatum(filter); |
566 | 0 | column->bv_allnulls = false; |
567 | 0 | updated = true; |
568 | 0 | } |
569 | 0 | else |
570 | 0 | filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]); |
571 | | |
572 | | /* |
573 | | * Compute the hash of the new value, using the supplied hash function, |
574 | | * and then add the hash value to the bloom filter. |
575 | | */ |
576 | 0 | hashFn = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH); |
577 | |
|
578 | 0 | hashValue = DatumGetUInt32(FunctionCall1Coll(hashFn, colloid, newval)); |
579 | |
|
580 | 0 | filter = bloom_add_value(filter, hashValue, &updated); |
581 | |
|
582 | 0 | column->bv_values[0] = PointerGetDatum(filter); |
583 | |
|
584 | 0 | PG_RETURN_BOOL(updated); |
585 | 0 | } |
586 | | |
587 | | /* |
588 | | * Given an index tuple corresponding to a certain page range and a scan key, |
589 | | * return whether the scan key is consistent with the index tuple's bloom |
590 | | * filter. Return true if so, false otherwise. |
591 | | */ |
592 | | Datum |
593 | | brin_bloom_consistent(PG_FUNCTION_ARGS) |
594 | 0 | { |
595 | 0 | BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); |
596 | 0 | BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); |
597 | 0 | ScanKey *keys = (ScanKey *) PG_GETARG_POINTER(2); |
598 | 0 | int nkeys = PG_GETARG_INT32(3); |
599 | 0 | Oid colloid = PG_GET_COLLATION(); |
600 | 0 | AttrNumber attno; |
601 | 0 | Datum value; |
602 | 0 | bool matches; |
603 | 0 | FmgrInfo *finfo; |
604 | 0 | uint32 hashValue; |
605 | 0 | BloomFilter *filter; |
606 | 0 | int keyno; |
607 | |
|
608 | 0 | filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]); |
609 | |
|
610 | 0 | Assert(filter); |
611 | | |
612 | | /* |
613 | | * Assume all scan keys match. We'll be searching for a scan key |
614 | | * eliminating the page range (we can stop on the first such key). |
615 | | */ |
616 | 0 | matches = true; |
617 | |
|
618 | 0 | for (keyno = 0; keyno < nkeys; keyno++) |
619 | 0 | { |
620 | 0 | ScanKey key = keys[keyno]; |
621 | | |
622 | | /* NULL keys are handled and filtered-out in bringetbitmap */ |
623 | 0 | Assert(!(key->sk_flags & SK_ISNULL)); |
624 | |
|
625 | 0 | attno = key->sk_attno; |
626 | 0 | value = key->sk_argument; |
627 | |
|
628 | 0 | switch (key->sk_strategy) |
629 | 0 | { |
630 | 0 | case BloomEqualStrategyNumber: |
631 | | |
632 | | /* |
633 | | * We want to return the current page range if the bloom |
634 | | * filter seems to contain the value. |
635 | | */ |
636 | 0 | finfo = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH); |
637 | |
|
638 | 0 | hashValue = DatumGetUInt32(FunctionCall1Coll(finfo, colloid, value)); |
639 | 0 | matches &= bloom_contains_value(filter, hashValue); |
640 | |
|
641 | 0 | break; |
642 | 0 | default: |
643 | | /* shouldn't happen */ |
644 | 0 | elog(ERROR, "invalid strategy number %d", key->sk_strategy); |
645 | 0 | matches = false; |
646 | 0 | break; |
647 | 0 | } |
648 | | |
649 | 0 | if (!matches) |
650 | 0 | break; |
651 | 0 | } |
652 | | |
653 | 0 | PG_RETURN_BOOL(matches); |
654 | 0 | } |
655 | | |
656 | | /* |
657 | | * Given two BrinValues, update the first of them as a union of the summary |
658 | | * values contained in both. The second one is untouched. |
659 | | * |
660 | | * XXX We assume the bloom filters have the same parameters for now. In the |
661 | | * future we should have 'can union' function, to decide if we can combine |
662 | | * two particular bloom filters. |
663 | | */ |
664 | | Datum |
665 | | brin_bloom_union(PG_FUNCTION_ARGS) |
666 | 0 | { |
667 | 0 | int i; |
668 | 0 | int nbytes; |
669 | 0 | BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1); |
670 | 0 | BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2); |
671 | 0 | BloomFilter *filter_a; |
672 | 0 | BloomFilter *filter_b; |
673 | |
|
674 | 0 | Assert(col_a->bv_attno == col_b->bv_attno); |
675 | 0 | Assert(!col_a->bv_allnulls && !col_b->bv_allnulls); |
676 | |
|
677 | 0 | filter_a = (BloomFilter *) PG_DETOAST_DATUM(col_a->bv_values[0]); |
678 | 0 | filter_b = (BloomFilter *) PG_DETOAST_DATUM(col_b->bv_values[0]); |
679 | | |
680 | | /* make sure the filters use the same parameters */ |
681 | 0 | Assert(filter_a && filter_b); |
682 | 0 | Assert(filter_a->nbits == filter_b->nbits); |
683 | 0 | Assert(filter_a->nhashes == filter_b->nhashes); |
684 | 0 | Assert((filter_a->nbits > 0) && (filter_a->nbits % 8 == 0)); |
685 | |
|
686 | 0 | nbytes = (filter_a->nbits) / 8; |
687 | | |
688 | | /* simply OR the bitmaps */ |
689 | 0 | for (i = 0; i < nbytes; i++) |
690 | 0 | filter_a->data[i] |= filter_b->data[i]; |
691 | | |
692 | | /* update the number of bits set in the filter */ |
693 | 0 | filter_a->nbits_set = pg_popcount((const char *) filter_a->data, nbytes); |
694 | | |
695 | | /* if we decompressed filter_a, update the summary */ |
696 | 0 | if (PointerGetDatum(filter_a) != col_a->bv_values[0]) |
697 | 0 | { |
698 | 0 | pfree(DatumGetPointer(col_a->bv_values[0])); |
699 | 0 | col_a->bv_values[0] = PointerGetDatum(filter_a); |
700 | 0 | } |
701 | | |
702 | | /* also free filter_b, if it was decompressed */ |
703 | 0 | if (PointerGetDatum(filter_b) != col_b->bv_values[0]) |
704 | 0 | pfree(filter_b); |
705 | |
|
706 | 0 | PG_RETURN_VOID(); |
707 | 0 | } |
708 | | |
709 | | /* |
710 | | * Cache and return inclusion opclass support procedure |
711 | | * |
712 | | * Return the procedure corresponding to the given function support number |
713 | | * or null if it does not exist. |
714 | | */ |
715 | | static FmgrInfo * |
716 | | bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum) |
717 | 0 | { |
718 | 0 | BloomOpaque *opaque; |
719 | 0 | uint16 basenum = procnum - PROCNUM_BASE; |
720 | | |
721 | | /* |
722 | | * We cache these in the opaque struct, to avoid repetitive syscache |
723 | | * lookups. |
724 | | */ |
725 | 0 | opaque = (BloomOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; |
726 | |
|
727 | 0 | if (opaque->extra_procinfos[basenum].fn_oid == InvalidOid) |
728 | 0 | { |
729 | 0 | if (RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno, |
730 | 0 | procnum))) |
731 | 0 | fmgr_info_copy(&opaque->extra_procinfos[basenum], |
732 | 0 | index_getprocinfo(bdesc->bd_index, attno, procnum), |
733 | 0 | bdesc->bd_context); |
734 | 0 | else |
735 | 0 | ereport(ERROR, |
736 | 0 | errcode(ERRCODE_INVALID_OBJECT_DEFINITION), |
737 | 0 | errmsg_internal("invalid opclass definition"), |
738 | 0 | errdetail_internal("The operator class is missing support function %d for column %d.", |
739 | 0 | procnum, attno)); |
740 | 0 | } |
741 | | |
742 | 0 | return &opaque->extra_procinfos[basenum]; |
743 | 0 | } |
744 | | |
745 | | Datum |
746 | | brin_bloom_options(PG_FUNCTION_ARGS) |
747 | 0 | { |
748 | 0 | local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0); |
749 | |
|
750 | 0 | init_local_reloptions(relopts, sizeof(BloomOptions)); |
751 | |
|
752 | 0 | add_local_real_reloption(relopts, "n_distinct_per_range", |
753 | 0 | "number of distinct items expected in a BRIN page range", |
754 | 0 | BLOOM_DEFAULT_NDISTINCT_PER_RANGE, |
755 | 0 | -1.0, INT_MAX, offsetof(BloomOptions, nDistinctPerRange)); |
756 | |
|
757 | 0 | add_local_real_reloption(relopts, "false_positive_rate", |
758 | 0 | "desired false-positive rate for the bloom filters", |
759 | 0 | BLOOM_DEFAULT_FALSE_POSITIVE_RATE, |
760 | 0 | BLOOM_MIN_FALSE_POSITIVE_RATE, |
761 | 0 | BLOOM_MAX_FALSE_POSITIVE_RATE, |
762 | 0 | offsetof(BloomOptions, falsePositiveRate)); |
763 | |
|
764 | 0 | PG_RETURN_VOID(); |
765 | 0 | } |
766 | | |
767 | | /* |
768 | | * brin_bloom_summary_in |
769 | | * - input routine for type brin_bloom_summary. |
770 | | * |
771 | | * brin_bloom_summary is only used internally to represent summaries |
772 | | * in BRIN bloom indexes, so it has no operations of its own, and we |
773 | | * disallow input too. |
774 | | */ |
775 | | Datum |
776 | | brin_bloom_summary_in(PG_FUNCTION_ARGS) |
777 | 0 | { |
778 | | /* |
779 | | * brin_bloom_summary stores the data in binary form and parsing text |
780 | | * input is not needed, so disallow this. |
781 | | */ |
782 | 0 | ereport(ERROR, |
783 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
784 | 0 | errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary"))); |
785 | | |
786 | 0 | PG_RETURN_VOID(); /* keep compiler quiet */ |
787 | 0 | } |
788 | | |
789 | | |
790 | | /* |
791 | | * brin_bloom_summary_out |
792 | | * - output routine for type brin_bloom_summary. |
793 | | * |
794 | | * BRIN bloom summaries are serialized into a bytea value, but we want |
795 | | * to output something nicer humans can understand. |
796 | | */ |
797 | | Datum |
798 | | brin_bloom_summary_out(PG_FUNCTION_ARGS) |
799 | 0 | { |
800 | 0 | BloomFilter *filter; |
801 | 0 | StringInfoData str; |
802 | | |
803 | | /* detoast the data to get value with a full 4B header */ |
804 | 0 | filter = (BloomFilter *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); |
805 | |
|
806 | 0 | initStringInfo(&str); |
807 | 0 | appendStringInfoChar(&str, '{'); |
808 | |
|
809 | 0 | appendStringInfo(&str, "mode: hashed nhashes: %u nbits: %u nbits_set: %u", |
810 | 0 | filter->nhashes, filter->nbits, filter->nbits_set); |
811 | |
|
812 | 0 | appendStringInfoChar(&str, '}'); |
813 | |
|
814 | 0 | PG_RETURN_CSTRING(str.data); |
815 | 0 | } |
816 | | |
817 | | /* |
818 | | * brin_bloom_summary_recv |
819 | | * - binary input routine for type brin_bloom_summary. |
820 | | */ |
821 | | Datum |
822 | | brin_bloom_summary_recv(PG_FUNCTION_ARGS) |
823 | 0 | { |
824 | 0 | ereport(ERROR, |
825 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
826 | 0 | errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary"))); |
827 | | |
828 | 0 | PG_RETURN_VOID(); /* keep compiler quiet */ |
829 | 0 | } |
830 | | |
831 | | /* |
832 | | * brin_bloom_summary_send |
833 | | * - binary output routine for type brin_bloom_summary. |
834 | | * |
835 | | * BRIN bloom summaries are serialized in a bytea value (although the |
836 | | * type is named differently), so let's just send that. |
837 | | */ |
838 | | Datum |
839 | | brin_bloom_summary_send(PG_FUNCTION_ARGS) |
840 | 0 | { |
841 | 0 | return byteasend(fcinfo); |
842 | 0 | } |