Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "math"
11 : "sync"
12 : "unsafe"
13 :
14 : "github.com/cockroachdb/pebble/internal/base"
15 : "github.com/cockroachdb/pebble/internal/rangekey"
16 : )
17 :
18 : // Block properties are an optional user-facing feature that can be used to
19 : // filter data blocks (and whole sstables) from an Iterator before they are
20 : // loaded. They do not apply to range delete blocks. These are expected to
21 : // very concisely represent a set of some attribute value contained within the
22 : // key or value, such that the set includes all the attribute values in the
23 : // block. This has some similarities with OLAP pruning approaches that
24 : // maintain min-max attribute values for some column (which concisely
25 : // represent a set), that is then used to prune at query time. In Pebble's
26 : // case, data blocks are small, typically 25-50KB, so these properties should
27 : // reduce their precision in order to be concise -- a good rule of thumb is to
28 : // not consume more than 50-100 bytes across all properties maintained for a
29 : // block, i.e., a 500x reduction compared to loading the data block.
30 : //
31 : // A block property must be assigned a unique name, which is encoded and
32 : // stored in the sstable. This name must be unique among all user-properties
33 : // encoded in an sstable.
34 : //
35 : // A property is represented as a []byte. A nil value or empty byte slice are
36 : // considered semantically identical. The caller is free to choose the
37 : // semantics of an empty byte slice e.g. they could use it to represent the
38 : // empty set or the universal set, whichever they think is more common and
39 : // therefore better to encode more concisely. The serialization of the
40 : // property for the various Finish*() calls in a BlockPropertyCollector
41 : // implementation should be identical, since the corresponding
42 : // BlockPropertyFilter implementation is not told the context in which it is
43 : // deserializing the property.
44 : //
45 : // Block properties are more general than table properties and should be
46 : // preferred over using table properties. A BlockPropertyCollector can achieve
47 : // identical behavior to table properties by returning the nil slice from
48 : // FinishDataBlock and FinishIndexBlock, and interpret them as the universal
49 : // set in BlockPropertyFilter, and return a non-universal set in FinishTable.
50 : //
51 : // Block property filtering is nondeterministic because the separation of keys
52 : // into blocks is nondeterministic. Clients use block-property filters to
53 : // implement efficient application of a filter F that applies to key-value pairs
54 : // (abbreviated as kv-filter). Consider correctness defined as surfacing exactly
55 : // the same key-value pairs that would be surfaced if one applied the filter F
56 : // above normal iteration. With this correctness definition, block property
57 : // filtering may introduce two kinds of errors:
58 : //
59 : // a) Block property filtering that uses a kv-filter may produce additional
60 : // key-value pairs that don't satisfy the filter because of the separation
61 : // of keys into blocks. Clients may remove these extra key-value pairs by
62 : // re-applying the kv filter while reading results back from Pebble.
63 : //
64 : // b) Block property filtering may surface deleted key-value pairs if the
65 : // kv filter is not a strict function of the key's user key. A block
66 : // containing k.DEL may be filtered, while a block containing the deleted
67 : // key k.SET may not be filtered, if the kv filter applies to one but not
68 : // the other.
69 : //
70 : // This error may be avoided trivially by using a kv filter that is a pure
71 : // function of the user key. A filter that examines values or key kinds
72 : // requires care to ensure F(k.SET, <value>) = F(k.DEL) = F(k.SINGLEDEL).
73 : //
74 : // The combination of range deletions and filtering by table-level properties
75 : // add another opportunity for deleted point keys to be surfaced. The pebble
76 : // Iterator stack takes care to correctly apply filtered tables' range deletions
77 : // to lower tables, preventing this form of nondeterministic error.
78 : //
79 : // In addition to the non-determinism discussed in (b), which limits the use
80 : // of properties over values, we now have support for values that are not
81 : // stored together with the key, and may not even be retrieved during
82 : // compactions. If Pebble is configured with such value separation, block
83 : // properties must only apply to the key, and will be provided a nil value.
84 :
85 : // BlockPropertyCollector is used when writing a sstable.
86 : //
87 : // - All calls to Add are included in the next FinishDataBlock, after which
88 : // the next data block is expected to start.
89 : //
90 : // - The index entry generated for the data block, which contains the return
91 : // value from FinishDataBlock, is not immediately included in the current
92 : // index block. It is included when AddPrevDataBlockToIndexBlock is called.
93 : // An alternative would be to return an opaque handle from FinishDataBlock
94 : // and pass it to a new AddToIndexBlock method, which requires more
95 : // plumbing, and passing of an interface{} results in a undesirable heap
96 : // allocation. AddPrevDataBlockToIndexBlock must be called before keys are
97 : // added to the new data block.
98 : type BlockPropertyCollector interface {
99 : // Name returns the name of the block property collector.
100 : Name() string
101 : // Add is called with each new entry added to a data block in the sstable.
102 : // The callee can assume that these are in sorted order.
103 : Add(key InternalKey, value []byte) error
104 : // FinishDataBlock is called when all the entries have been added to a
105 : // data block. Subsequent Add calls will be for the next data block. It
106 : // returns the property value for the finished block.
107 : FinishDataBlock(buf []byte) ([]byte, error)
108 : // AddPrevDataBlockToIndexBlock adds the entry corresponding to the
109 : // previous FinishDataBlock to the current index block.
110 : AddPrevDataBlockToIndexBlock()
111 : // FinishIndexBlock is called when an index block, containing all the
112 : // key-value pairs since the last FinishIndexBlock, will no longer see new
113 : // entries. It returns the property value for the index block.
114 : FinishIndexBlock(buf []byte) ([]byte, error)
115 : // FinishTable is called when the sstable is finished, and returns the
116 : // property value for the sstable.
117 : FinishTable(buf []byte) ([]byte, error)
118 : }
119 :
120 : // SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector
121 : // interface that allows a block property collector to indicate that it supports
122 : // being *updated* during suffix replacement, i.e. when an existing SST in which
123 : // all keys have the same key suffix is updated to have a new suffix.
124 : //
125 : // A collector which supports being updated in such cases must be able to derive
126 : // its updated value from its old value and the change being made to the suffix,
127 : // without needing to be passed each updated K/V.
128 : //
129 : // For example, a collector that only inspects values would can simply copy its
130 : // previously computed property as-is, since key-suffix replacement does not
131 : // change values, while a collector that depends only on key suffixes, like one
132 : // which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just
133 : // set its new bounds from the new suffix, as it is common to all keys, without
134 : // needing to recompute it from every key.
135 : //
136 : // An implementation of DataBlockIntervalCollector can also implement this
137 : // interface, in which case the BlockPropertyCollector returned by passing it to
138 : // NewBlockIntervalCollector will also implement this interface automatically.
139 : type SuffixReplaceableBlockCollector interface {
140 : // UpdateKeySuffixes is called when a block is updated to change the suffix of
141 : // all keys in the block, and is passed the old value for that prop, if any,
142 : // for that block as well as the old and new suffix.
143 : UpdateKeySuffixes(oldProp []byte, oldSuffix, newSuffix []byte) error
144 : }
145 :
146 : // BlockPropertyFilter is used in an Iterator to filter sstables and blocks
147 : // within the sstable. It should not maintain any per-sstable state, and must
148 : // be thread-safe.
149 : type BlockPropertyFilter = base.BlockPropertyFilter
150 :
151 : // BoundLimitedBlockPropertyFilter implements the block-property filter but
152 : // imposes an additional constraint on its usage, requiring that only blocks
153 : // containing exclusively keys between its lower and upper bounds may be
154 : // filtered. The bounds may be change during iteration, so the filter doesn't
155 : // expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods
156 : // for performing bound comparisons.
157 : //
158 : // To be used, a BoundLimitedBlockPropertyFilter must be supplied directly
159 : // through NewBlockPropertiesFilterer's dedicated parameter. If supplied through
160 : // the ordinary slice of block property filters, this filter's bounds will be
161 : // ignored.
162 : //
163 : // The current [lower,upper) bounds of the filter are unknown, because they may
164 : // be changing. During forward iteration the lower bound is externally
165 : // guaranteed, meaning Intersects only returns false if the sstable iterator is
166 : // already known to be positioned at a key ≥ lower. The sstable iterator is then
167 : // only responsible for ensuring filtered blocks also meet the upper bound, and
168 : // should only allow a block to be filtered if all its keys are < upper. The
169 : // sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check,
170 : // where key is an inclusive upper bound on the block's keys.
171 : //
172 : // During backward iteration the upper bound is externally guaranteed, and
173 : // Intersects only returns false if the sstable iterator is already known to be
174 : // positioned at a key < upper. The sstable iterator is responsible for ensuring
175 : // filtered blocks also meet the lower bound, enforcing that a block is only
176 : // filtered if all its keys are ≥ lower. This check is made through passing the
177 : // block's inclusive lower bound to KeyIsWithinLowerBound.
178 : //
179 : // Implementations may become active or inactive through implementing Intersects
180 : // to return true whenever the filter is disabled.
181 : //
182 : // Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers
183 : // should not implement this interface directly. This interface is an internal
184 : // detail in the implementation of block-property range-key masking.
185 : type BoundLimitedBlockPropertyFilter interface {
186 : BlockPropertyFilter
187 :
188 : // KeyIsWithinLowerBound tests whether the provided internal key falls
189 : // within the current lower bound of the filter. A true return value
190 : // indicates that the filter may be used to filter blocks that exclusively
191 : // contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper
192 : // bound.
193 : KeyIsWithinLowerBound(key []byte) bool
194 : // KeyIsWithinUpperBound tests whether the provided internal key falls
195 : // within the current upper bound of the filter. A true return value
196 : // indicates that the filter may be used to filter blocks that exclusively
197 : // contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower
198 : // bound.
199 : KeyIsWithinUpperBound(key []byte) bool
200 : }
201 :
202 : // BlockIntervalCollector is a helper implementation of BlockPropertyCollector
203 : // for users who want to represent a set of the form [lower,upper) where both
204 : // lower and upper are uint64, and lower <= upper.
205 : //
206 : // The set is encoded as:
207 : // - Two varint integers, (lower,upper-lower), when upper-lower > 0
208 : // - Nil, when upper-lower=0
209 : //
210 : // Users must not expect this to preserve differences between empty sets --
211 : // they will all get turned into the semantically equivalent [0,0).
212 : //
213 : // A BlockIntervalCollector that collects over point and range keys needs to
214 : // have both the point and range DataBlockIntervalCollector specified, since
215 : // point and range keys are fed to the BlockIntervalCollector in an interleaved
216 : // fashion, independently of one another. This also implies that the
217 : // DataBlockIntervalCollectors for point and range keys should be references to
218 : // independent instances, rather than references to the same collector, as point
219 : // and range keys are tracked independently.
220 : type BlockIntervalCollector struct {
221 : name string
222 : points DataBlockIntervalCollector
223 : ranges DataBlockIntervalCollector
224 :
225 : blockInterval interval
226 : indexInterval interval
227 : tableInterval interval
228 : }
229 :
230 : var _ BlockPropertyCollector = &BlockIntervalCollector{}
231 :
232 : // DataBlockIntervalCollector is the interface used by BlockIntervalCollector
233 : // that contains the actual logic pertaining to the property. It only
234 : // maintains state for the current data block, and resets that state in
235 : // FinishDataBlock. This interface can be used to reduce parsing costs.
236 : type DataBlockIntervalCollector interface {
237 : // Add is called with each new entry added to a data block in the sstable.
238 : // The callee can assume that these are in sorted order.
239 : Add(key InternalKey, value []byte) error
240 : // FinishDataBlock is called when all the entries have been added to a
241 : // data block. Subsequent Add calls will be for the next data block. It
242 : // returns the [lower, upper) for the finished block.
243 : FinishDataBlock() (lower uint64, upper uint64, err error)
244 : }
245 :
246 : // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given
247 : // name. The BlockIntervalCollector makes use of the given point and range key
248 : // DataBlockIntervalCollectors when encountering point and range keys,
249 : // respectively.
250 : //
251 : // The caller may pass a nil DataBlockIntervalCollector for one of the point or
252 : // range key collectors, in which case keys of those types will be ignored. This
253 : // allows for flexible construction of BlockIntervalCollectors that operate on
254 : // just point keys, just range keys, or both point and range keys.
255 : //
256 : // If both point and range keys are to be tracked, two independent collectors
257 : // should be provided, rather than the same collector passed in twice (see the
258 : // comment on BlockIntervalCollector for more detail)
259 : func NewBlockIntervalCollector(
260 : name string, pointCollector, rangeCollector DataBlockIntervalCollector,
261 1 : ) BlockPropertyCollector {
262 1 : if pointCollector == nil && rangeCollector == nil {
263 0 : panic("sstable: at least one interval collector must be provided")
264 : }
265 1 : bic := BlockIntervalCollector{
266 1 : name: name,
267 1 : points: pointCollector,
268 1 : ranges: rangeCollector,
269 1 : }
270 1 : if _, ok := pointCollector.(SuffixReplaceableBlockCollector); ok {
271 0 : return &suffixReplacementBlockCollectorWrapper{bic}
272 0 : }
273 1 : return &bic
274 : }
275 :
276 : // Name implements the BlockPropertyCollector interface.
277 1 : func (b *BlockIntervalCollector) Name() string {
278 1 : return b.name
279 1 : }
280 :
281 : // Add implements the BlockPropertyCollector interface.
282 1 : func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error {
283 1 : if rangekey.IsRangeKey(key.Kind()) {
284 1 : if b.ranges != nil {
285 0 : return b.ranges.Add(key, value)
286 0 : }
287 1 : } else if b.points != nil {
288 1 : return b.points.Add(key, value)
289 1 : }
290 1 : return nil
291 : }
292 :
293 : // FinishDataBlock implements the BlockPropertyCollector interface.
294 1 : func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) {
295 1 : if b.points == nil {
296 0 : return buf, nil
297 0 : }
298 1 : var err error
299 1 : b.blockInterval.lower, b.blockInterval.upper, err = b.points.FinishDataBlock()
300 1 : if err != nil {
301 0 : return buf, err
302 0 : }
303 1 : buf = b.blockInterval.encode(buf)
304 1 : b.tableInterval.union(b.blockInterval)
305 1 : return buf, nil
306 : }
307 :
308 : // AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector
309 : // interface.
310 1 : func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() {
311 1 : b.indexInterval.union(b.blockInterval)
312 1 : b.blockInterval = interval{}
313 1 : }
314 :
315 : // FinishIndexBlock implements the BlockPropertyCollector interface.
316 1 : func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
317 1 : buf = b.indexInterval.encode(buf)
318 1 : b.indexInterval = interval{}
319 1 : return buf, nil
320 1 : }
321 :
322 : // FinishTable implements the BlockPropertyCollector interface.
323 1 : func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) {
324 1 : // If the collector is tracking range keys, the range key interval is union-ed
325 1 : // with the point key interval for the table.
326 1 : if b.ranges != nil {
327 0 : var rangeInterval interval
328 0 : var err error
329 0 : rangeInterval.lower, rangeInterval.upper, err = b.ranges.FinishDataBlock()
330 0 : if err != nil {
331 0 : return buf, err
332 0 : }
333 0 : b.tableInterval.union(rangeInterval)
334 : }
335 1 : return b.tableInterval.encode(buf), nil
336 : }
337 :
338 : type interval struct {
339 : lower uint64
340 : upper uint64
341 : }
342 :
343 1 : func (i interval) encode(buf []byte) []byte {
344 1 : if i.lower < i.upper {
345 1 : var encoded [binary.MaxVarintLen64 * 2]byte
346 1 : n := binary.PutUvarint(encoded[:], i.lower)
347 1 : n += binary.PutUvarint(encoded[n:], i.upper-i.lower)
348 1 : buf = append(buf, encoded[:n]...)
349 1 : }
350 1 : return buf
351 : }
352 :
353 1 : func (i *interval) decode(buf []byte) error {
354 1 : if len(buf) == 0 {
355 1 : *i = interval{}
356 1 : return nil
357 1 : }
358 1 : var n int
359 1 : i.lower, n = binary.Uvarint(buf)
360 1 : if n <= 0 || n >= len(buf) {
361 0 : return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
362 0 : }
363 1 : pos := n
364 1 : i.upper, n = binary.Uvarint(buf[pos:])
365 1 : pos += n
366 1 : if pos != len(buf) || n <= 0 {
367 0 : return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
368 0 : }
369 : // Delta decode.
370 1 : i.upper += i.lower
371 1 : if i.upper < i.lower {
372 0 : return base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.upper, i.lower)
373 0 : }
374 1 : return nil
375 : }
376 :
377 1 : func (i *interval) union(x interval) {
378 1 : if x.lower >= x.upper {
379 1 : // x is the empty set.
380 1 : return
381 1 : }
382 1 : if i.lower >= i.upper {
383 1 : // i is the empty set.
384 1 : *i = x
385 1 : return
386 1 : }
387 : // Both sets are non-empty.
388 1 : if x.lower < i.lower {
389 1 : i.lower = x.lower
390 1 : }
391 1 : if x.upper > i.upper {
392 1 : i.upper = x.upper
393 1 : }
394 : }
395 :
396 1 : func (i interval) intersects(x interval) bool {
397 1 : if i.lower >= i.upper || x.lower >= x.upper {
398 1 : // At least one of the sets is empty.
399 1 : return false
400 1 : }
401 : // Neither set is empty.
402 1 : return i.upper > x.lower && i.lower < x.upper
403 : }
404 :
405 : type suffixReplacementBlockCollectorWrapper struct {
406 : BlockIntervalCollector
407 : }
408 :
409 : // UpdateKeySuffixes implements the SuffixReplaceableBlockCollector interface.
410 : func (w *suffixReplacementBlockCollectorWrapper) UpdateKeySuffixes(
411 : oldProp []byte, from, to []byte,
412 0 : ) error {
413 0 : return w.BlockIntervalCollector.points.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProp, from, to)
414 0 : }
415 :
416 : // BlockIntervalFilter is an implementation of BlockPropertyFilter when the
417 : // corresponding collector is a BlockIntervalCollector. That is, the set is of
418 : // the form [lower, upper).
419 : type BlockIntervalFilter struct {
420 : name string
421 : filterInterval interval
422 : }
423 :
424 : var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil)
425 :
426 : // NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks
427 : // based on an interval property collected by BlockIntervalCollector and the
428 : // given [lower, upper) bounds. The given name specifies the
429 : // BlockIntervalCollector's properties to read.
430 1 : func NewBlockIntervalFilter(name string, lower uint64, upper uint64) *BlockIntervalFilter {
431 1 : b := new(BlockIntervalFilter)
432 1 : b.Init(name, lower, upper)
433 1 : return b
434 1 : }
435 :
436 : // Init initializes (or re-initializes, clearing previous state) an existing
437 : // BLockPropertyFilter to filter blocks based on an interval property collected
438 : // by BlockIntervalCollector and the given [lower, upper) bounds. The given name
439 : // specifies the BlockIntervalCollector's properties to read.
440 1 : func (b *BlockIntervalFilter) Init(name string, lower, upper uint64) {
441 1 : *b = BlockIntervalFilter{
442 1 : name: name,
443 1 : filterInterval: interval{lower: lower, upper: upper},
444 1 : }
445 1 : }
446 :
447 : // Name implements the BlockPropertyFilter interface.
448 1 : func (b *BlockIntervalFilter) Name() string {
449 1 : return b.name
450 1 : }
451 :
452 : // Intersects implements the BlockPropertyFilter interface.
453 1 : func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) {
454 1 : var i interval
455 1 : if err := i.decode(prop); err != nil {
456 0 : return false, err
457 0 : }
458 1 : return i.intersects(b.filterInterval), nil
459 : }
460 :
461 : // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not
462 : // generally safe to alter the filter while it's in use, except as part of the
463 : // implementation of BlockPropertyFilterMask.SetSuffix used for range-key
464 : // masking.
465 1 : func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) {
466 1 : b.filterInterval = interval{lower: lower, upper: upper}
467 1 : }
468 :
469 : // When encoding block properties for each block, we cannot afford to encode
470 : // the name. Instead, the name is mapped to a shortID, in the scope of that
471 : // sstable, and the shortID is encoded. Since we use a uint8, there is a limit
472 : // of 256 block property collectors per sstable.
473 : type shortID uint8
474 :
475 : type blockPropertiesEncoder struct {
476 : propsBuf []byte
477 : scratch []byte
478 : }
479 :
480 1 : func (e *blockPropertiesEncoder) getScratchForProp() []byte {
481 1 : return e.scratch[:0]
482 1 : }
483 :
484 1 : func (e *blockPropertiesEncoder) resetProps() {
485 1 : e.propsBuf = e.propsBuf[:0]
486 1 : }
487 :
488 1 : func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) {
489 1 : const lenID = 1
490 1 : lenProp := uvarintLen(uint32(len(scratch)))
491 1 : n := lenID + lenProp + len(scratch)
492 1 : if cap(e.propsBuf)-len(e.propsBuf) < n {
493 1 : size := len(e.propsBuf) + 2*n
494 1 : if size < 2*cap(e.propsBuf) {
495 1 : size = 2 * cap(e.propsBuf)
496 1 : }
497 1 : buf := make([]byte, len(e.propsBuf), size)
498 1 : copy(buf, e.propsBuf)
499 1 : e.propsBuf = buf
500 : }
501 1 : pos := len(e.propsBuf)
502 1 : b := e.propsBuf[pos : pos+lenID]
503 1 : b[0] = byte(id)
504 1 : pos += lenID
505 1 : b = e.propsBuf[pos : pos+lenProp]
506 1 : n = binary.PutUvarint(b, uint64(len(scratch)))
507 1 : pos += n
508 1 : b = e.propsBuf[pos : pos+len(scratch)]
509 1 : pos += len(scratch)
510 1 : copy(b, scratch)
511 1 : e.propsBuf = e.propsBuf[0:pos]
512 1 : e.scratch = scratch
513 : }
514 :
515 1 : func (e *blockPropertiesEncoder) unsafeProps() []byte {
516 1 : return e.propsBuf
517 1 : }
518 :
519 1 : func (e *blockPropertiesEncoder) props() []byte {
520 1 : buf := make([]byte, len(e.propsBuf))
521 1 : copy(buf, e.propsBuf)
522 1 : return buf
523 1 : }
524 :
525 : type blockPropertiesDecoder struct {
526 : props []byte
527 : }
528 :
529 1 : func (d *blockPropertiesDecoder) done() bool {
530 1 : return len(d.props) == 0
531 1 : }
532 :
533 : // REQUIRES: !done()
534 1 : func (d *blockPropertiesDecoder) next() (id shortID, prop []byte, err error) {
535 1 : const lenID = 1
536 1 : id = shortID(d.props[0])
537 1 : propLen, m := binary.Uvarint(d.props[lenID:])
538 1 : n := lenID + m
539 1 : if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) {
540 0 : return 0, nil, base.CorruptionErrorf("corrupt block property length")
541 0 : }
542 1 : prop = d.props[n : n+int(propLen)]
543 1 : d.props = d.props[n+int(propLen):]
544 1 : return id, prop, nil
545 : }
546 :
547 : // BlockPropertiesFilterer provides filtering support when reading an sstable
548 : // in the context of an iterator that has a slice of BlockPropertyFilters.
549 : // After the call to NewBlockPropertiesFilterer, the caller must call
550 : // IntersectsUserPropsAndFinishInit to check if the sstable intersects with
551 : // the filters. If it does intersect, this function also finishes initializing
552 : // the BlockPropertiesFilterer using the shortIDs for the relevant filters.
553 : // Subsequent checks for relevance of a block should use the intersects
554 : // method.
555 : type BlockPropertiesFilterer struct {
556 : filters []BlockPropertyFilter
557 : // Maps shortID => index in filters. This can be sparse, and shortIDs for
558 : // which there is no filter are represented with an index of -1. The
559 : // length of this can be shorter than the shortIDs allocated in the
560 : // sstable. e.g. if the sstable used shortIDs 0, 1, 2, 3, and the iterator
561 : // has two filters, corresponding to shortIDs 2, 0, this would be:
562 : // len(shortIDToFiltersIndex)==3, 0=>1, 1=>-1, 2=>0.
563 : shortIDToFiltersIndex []int
564 :
565 : // boundLimitedFilter, if non-nil, holds a single block-property filter with
566 : // additional constraints on its filtering. A boundLimitedFilter may only
567 : // filter blocks that are wholly contained within its bounds. During forward
568 : // iteration the lower bound (and during backward iteration the upper bound)
569 : // must be externally guaranteed, with Intersects only returning false if
570 : // that bound is met. The opposite bound is verified during iteration by the
571 : // sstable iterator.
572 : //
573 : // boundLimitedFilter is permitted to be defined on a property (`Name()`)
574 : // for which another filter exists in filters. In this case both filters
575 : // will be consulted, and either filter may exclude block(s). Only a single
576 : // bound-limited block-property filter may be set.
577 : //
578 : // The boundLimitedShortID field contains the shortID of the filter's
579 : // property within the sstable. It's set to -1 if the property was not
580 : // collected when the table was built.
581 : boundLimitedFilter BoundLimitedBlockPropertyFilter
582 : boundLimitedShortID int
583 : }
584 :
585 : var blockPropertiesFiltererPool = sync.Pool{
586 1 : New: func() interface{} {
587 1 : return &BlockPropertiesFilterer{}
588 1 : },
589 : }
590 :
591 : // newBlockPropertiesFilterer returns a partially initialized filterer. To complete
592 : // initialization, call IntersectsUserPropsAndFinishInit.
593 : func newBlockPropertiesFilterer(
594 : filters []BlockPropertyFilter, limited BoundLimitedBlockPropertyFilter,
595 1 : ) *BlockPropertiesFilterer {
596 1 : filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer)
597 1 : *filterer = BlockPropertiesFilterer{
598 1 : filters: filters,
599 1 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
600 1 : boundLimitedFilter: limited,
601 1 : boundLimitedShortID: -1,
602 1 : }
603 1 : return filterer
604 1 : }
605 :
606 1 : func releaseBlockPropertiesFilterer(filterer *BlockPropertiesFilterer) {
607 1 : *filterer = BlockPropertiesFilterer{
608 1 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
609 1 : }
610 1 : blockPropertiesFiltererPool.Put(filterer)
611 1 : }
612 :
613 : // IntersectsTable evaluates the provided block-property filter against the
614 : // provided set of table-level properties. If there is no intersection between
615 : // the filters and the table or an error is encountered, IntersectsTable returns
616 : // a nil filterer (and possibly an error). If there is an intersection,
617 : // IntersectsTable returns a non-nil filterer that may be used by an iterator
618 : // reading the table.
619 : func IntersectsTable(
620 : filters []BlockPropertyFilter,
621 : limited BoundLimitedBlockPropertyFilter,
622 : userProperties map[string]string,
623 1 : ) (*BlockPropertiesFilterer, error) {
624 1 : f := newBlockPropertiesFilterer(filters, limited)
625 1 : ok, err := f.intersectsUserPropsAndFinishInit(userProperties)
626 1 : if !ok || err != nil {
627 1 : releaseBlockPropertiesFilterer(f)
628 1 : return nil, err
629 1 : }
630 1 : return f, nil
631 : }
632 :
633 : // intersectsUserPropsAndFinishInit is called with the user properties map for
634 : // the sstable and returns whether the sstable intersects the filters. It
635 : // additionally initializes the shortIDToFiltersIndex for the filters that are
636 : // relevant to this sstable.
637 : func (f *BlockPropertiesFilterer) intersectsUserPropsAndFinishInit(
638 : userProperties map[string]string,
639 1 : ) (bool, error) {
640 1 : for i := range f.filters {
641 1 : props, ok := userProperties[f.filters[i].Name()]
642 1 : if !ok {
643 1 : // Collector was not used when writing this file, so it is
644 1 : // considered intersecting.
645 1 : continue
646 : }
647 1 : if len(props) < 1 {
648 0 : return false, base.CorruptionErrorf(
649 0 : "block properties for %s is corrupted", f.filters[i].Name())
650 0 : }
651 1 : shortID := shortID(props[0])
652 1 : {
653 1 : // Use an unsafe conversion to avoid allocating. Intersects() is not
654 1 : // supposed to modify the given slice.
655 1 : // Note that unsafe.StringData only works if the string is not empty
656 1 : // (which we already checked).
657 1 : byteProps := unsafe.Slice(unsafe.StringData(props), len(props))
658 1 : intersects, err := f.filters[i].Intersects(byteProps[1:])
659 1 : if err != nil || !intersects {
660 1 : return false, err
661 1 : }
662 : }
663 : // Intersects the sstable, so need to use this filter when
664 : // deciding whether to read blocks.
665 1 : n := len(f.shortIDToFiltersIndex)
666 1 : if n <= int(shortID) {
667 1 : if cap(f.shortIDToFiltersIndex) <= int(shortID) {
668 1 : index := make([]int, shortID+1, 2*(shortID+1))
669 1 : copy(index, f.shortIDToFiltersIndex)
670 1 : f.shortIDToFiltersIndex = index
671 1 : } else {
672 1 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:shortID+1]
673 1 : }
674 1 : for j := n; j < int(shortID); j++ {
675 1 : f.shortIDToFiltersIndex[j] = -1
676 1 : }
677 : }
678 1 : f.shortIDToFiltersIndex[shortID] = i
679 : }
680 1 : if f.boundLimitedFilter == nil {
681 1 : return true, nil
682 1 : }
683 :
684 : // There's a bound-limited filter. Find its shortID. It's possible that
685 : // there's an existing filter in f.filters on the same property. That's
686 : // okay. Both filters will be consulted whenever a relevant prop is decoded.
687 1 : props, ok := userProperties[f.boundLimitedFilter.Name()]
688 1 : if !ok {
689 1 : // The collector was not used when writing this file, so it's
690 1 : // intersecting. We leave f.boundLimitedShortID=-1, so the filter will
691 1 : // be unused within this file.
692 1 : return true, nil
693 1 : }
694 1 : if len(props) < 1 {
695 0 : return false, base.CorruptionErrorf(
696 0 : "block properties for %s is corrupted", f.boundLimitedFilter.Name())
697 0 : }
698 1 : f.boundLimitedShortID = int(props[0])
699 1 :
700 1 : // We don't check for table-level intersection for the bound-limited filter.
701 1 : // The bound-limited filter is treated as vacuously intersecting.
702 1 : //
703 1 : // NB: If a block-property filter needs to be toggled inactive/active, it
704 1 : // should be implemented within the Intersects implementation.
705 1 : //
706 1 : // TODO(jackson): We could filter at the table-level by threading the table
707 1 : // smallest and largest bounds here.
708 1 :
709 1 : // The bound-limited filter isn't included in shortIDToFiltersIndex.
710 1 : //
711 1 : // When determining intersection, we decode props only up to the shortID
712 1 : // len(shortIDToFiltersIndex). If f.limitedShortID is greater than any of
713 1 : // the existing filters' shortIDs, we need to grow shortIDToFiltersIndex.
714 1 : // Growing the index with -1s ensures we're able to consult the index
715 1 : // without length checks.
716 1 : if n := len(f.shortIDToFiltersIndex); n <= f.boundLimitedShortID {
717 1 : if cap(f.shortIDToFiltersIndex) <= f.boundLimitedShortID {
718 1 : index := make([]int, f.boundLimitedShortID+1)
719 1 : copy(index, f.shortIDToFiltersIndex)
720 1 : f.shortIDToFiltersIndex = index
721 1 : } else {
722 1 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:f.boundLimitedShortID+1]
723 1 : }
724 1 : for j := n; j <= f.boundLimitedShortID; j++ {
725 1 : f.shortIDToFiltersIndex[j] = -1
726 1 : }
727 : }
728 1 : return true, nil
729 : }
730 :
731 : type intersectsResult int8
732 :
733 : const (
734 : blockIntersects intersectsResult = iota
735 : blockExcluded
736 : // blockMaybeExcluded is returned by BlockPropertiesFilterer.intersects when
737 : // no filters unconditionally exclude the block, but the bound-limited block
738 : // property filter will exclude it if the block's bounds fall within the
739 : // filter's current bounds. See the reader's
740 : // {single,two}LevelIterator.resolveMaybeExcluded methods.
741 : blockMaybeExcluded
742 : )
743 :
744 1 : func (f *BlockPropertiesFilterer) intersects(props []byte) (ret intersectsResult, err error) {
745 1 : i := 0
746 1 : decoder := blockPropertiesDecoder{props: props}
747 1 : ret = blockIntersects
748 1 : for i < len(f.shortIDToFiltersIndex) {
749 1 : var id int
750 1 : var prop []byte
751 1 : if !decoder.done() {
752 1 : var shortID shortID
753 1 : var err error
754 1 : shortID, prop, err = decoder.next()
755 1 : if err != nil {
756 0 : return ret, err
757 0 : }
758 1 : id = int(shortID)
759 1 : } else {
760 1 : id = math.MaxUint8 + 1
761 1 : }
762 1 : for i < len(f.shortIDToFiltersIndex) && id > i {
763 1 : // The property for this id is not encoded for this block, but there
764 1 : // may still be a filter for this id.
765 1 : if intersects, err := f.intersectsFilter(i, nil); err != nil {
766 0 : return ret, err
767 1 : } else if intersects == blockExcluded {
768 0 : return blockExcluded, nil
769 1 : } else if intersects == blockMaybeExcluded {
770 1 : ret = blockMaybeExcluded
771 1 : }
772 1 : i++
773 : }
774 1 : if i >= len(f.shortIDToFiltersIndex) {
775 1 : return ret, nil
776 1 : }
777 : // INVARIANT: id <= i. And since i is always incremented by 1, id==i.
778 1 : if id != i {
779 0 : panic(fmt.Sprintf("%d != %d", id, i))
780 : }
781 1 : if intersects, err := f.intersectsFilter(i, prop); err != nil {
782 0 : return ret, err
783 1 : } else if intersects == blockExcluded {
784 1 : return blockExcluded, nil
785 1 : } else if intersects == blockMaybeExcluded {
786 1 : ret = blockMaybeExcluded
787 1 : }
788 1 : i++
789 : }
790 : // ret == blockIntersects || ret == blockMaybeExcluded
791 1 : return ret, nil
792 : }
793 :
794 1 : func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersectsResult, error) {
795 1 : if f.shortIDToFiltersIndex[i] >= 0 {
796 1 : intersects, err := f.filters[f.shortIDToFiltersIndex[i]].Intersects(prop)
797 1 : if err != nil {
798 0 : return blockIntersects, err
799 0 : }
800 1 : if !intersects {
801 1 : return blockExcluded, nil
802 1 : }
803 : }
804 1 : if i == f.boundLimitedShortID {
805 1 : // The bound-limited filter uses this id.
806 1 : //
807 1 : // The bound-limited filter only applies within a keyspan interval. We
808 1 : // expect the Intersects call to be cheaper than bounds checks. If
809 1 : // Intersects determines that there is no intersection, we return
810 1 : // `blockMaybeExcluded` if no other bpf unconditionally excludes the
811 1 : // block.
812 1 : intersects, err := f.boundLimitedFilter.Intersects(prop)
813 1 : if err != nil {
814 0 : return blockIntersects, err
815 1 : } else if !intersects {
816 1 : return blockMaybeExcluded, nil
817 1 : }
818 : }
819 1 : return blockIntersects, nil
820 : }
|