Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "math"
11 : "sync"
12 :
13 : "github.com/cockroachdb/pebble/internal/base"
14 : "github.com/cockroachdb/pebble/internal/rangekey"
15 : )
16 :
17 : // Block properties are an optional user-facing feature that can be used to
18 : // filter data blocks (and whole sstables) from an Iterator before they are
19 : // loaded. They do not apply to range delete blocks. These are expected to
20 : // very concisely represent a set of some attribute value contained within the
21 : // key or value, such that the set includes all the attribute values in the
22 : // block. This has some similarities with OLAP pruning approaches that
23 : // maintain min-max attribute values for some column (which concisely
24 : // represent a set), that is then used to prune at query time. In Pebble's
25 : // case, data blocks are small, typically 25-50KB, so these properties should
26 : // reduce their precision in order to be concise -- a good rule of thumb is to
27 : // not consume more than 50-100 bytes across all properties maintained for a
28 : // block, i.e., a 500x reduction compared to loading the data block.
29 : //
30 : // A block property must be assigned a unique name, which is encoded and
31 : // stored in the sstable. This name must be unique among all user-properties
32 : // encoded in an sstable.
33 : //
34 : // A property is represented as a []byte. A nil value or empty byte slice are
35 : // considered semantically identical. The caller is free to choose the
36 : // semantics of an empty byte slice e.g. they could use it to represent the
37 : // empty set or the universal set, whichever they think is more common and
38 : // therefore better to encode more concisely. The serialization of the
39 : // property for the various Finish*() calls in a BlockPropertyCollector
40 : // implementation should be identical, since the corresponding
41 : // BlockPropertyFilter implementation is not told the context in which it is
42 : // deserializing the property.
43 : //
44 : // Block properties are more general than table properties and should be
45 : // preferred over using table properties. A BlockPropertyCollector can achieve
46 : // identical behavior to table properties by returning the nil slice from
47 : // FinishDataBlock and FinishIndexBlock, and interpret them as the universal
48 : // set in BlockPropertyFilter, and return a non-universal set in FinishTable.
49 : //
50 : // Block property filtering is nondeterministic because the separation of keys
51 : // into blocks is nondeterministic. Clients use block-property filters to
52 : // implement efficient application of a filter F that applies to key-value pairs
53 : // (abbreviated as kv-filter). Consider correctness defined as surfacing exactly
54 : // the same key-value pairs that would be surfaced if one applied the filter F
55 : // above normal iteration. With this correctness definition, block property
56 : // filtering may introduce two kinds of errors:
57 : //
58 : // a) Block property filtering that uses a kv-filter may produce additional
59 : // key-value pairs that don't satisfy the filter because of the separation
60 : // of keys into blocks. Clients may remove these extra key-value pairs by
61 : // re-applying the kv filter while reading results back from Pebble.
62 : //
63 : // b) Block property filtering may surface deleted key-value pairs if the
64 : // kv filter is not a strict function of the key's user key. A block
65 : // containing k.DEL may be filtered, while a block containing the deleted
66 : // key k.SET may not be filtered, if the kv filter applies to one but not
67 : // the other.
68 : //
69 : // This error may be avoided trivially by using a kv filter that is a pure
70 : // function of the user key. A filter that examines values or key kinds
71 : // requires care to ensure F(k.SET, <value>) = F(k.DEL) = F(k.SINGLEDEL).
72 : //
73 : // The combination of range deletions and filtering by table-level properties
74 : // add another opportunity for deleted point keys to be surfaced. The pebble
75 : // Iterator stack takes care to correctly apply filtered tables' range deletions
76 : // to lower tables, preventing this form of nondeterministic error.
77 : //
78 : // In addition to the non-determinism discussed in (b), which limits the use
79 : // of properties over values, we now have support for values that are not
80 : // stored together with the key, and may not even be retrieved during
81 : // compactions. If Pebble is configured with such value separation, block
82 : // properties must only apply to the key, and will be provided a nil value.
83 :
84 : // BlockPropertyCollector is used when writing a sstable.
85 : //
86 : // - All calls to Add are included in the next FinishDataBlock, after which
87 : // the next data block is expected to start.
88 : //
89 : // - The index entry generated for the data block, which contains the return
90 : // value from FinishDataBlock, is not immediately included in the current
91 : // index block. It is included when AddPrevDataBlockToIndexBlock is called.
92 : // An alternative would be to return an opaque handle from FinishDataBlock
93 : // and pass it to a new AddToIndexBlock method, which requires more
94 : // plumbing, and passing of an interface{} results in a undesirable heap
95 : // allocation. AddPrevDataBlockToIndexBlock must be called before keys are
96 : // added to the new data block.
97 : type BlockPropertyCollector interface {
98 : // Name returns the name of the block property collector.
99 : Name() string
100 : // Add is called with each new entry added to a data block in the sstable.
101 : // The callee can assume that these are in sorted order.
102 : Add(key InternalKey, value []byte) error
103 : // FinishDataBlock is called when all the entries have been added to a
104 : // data block. Subsequent Add calls will be for the next data block. It
105 : // returns the property value for the finished block.
106 : FinishDataBlock(buf []byte) ([]byte, error)
107 : // AddPrevDataBlockToIndexBlock adds the entry corresponding to the
108 : // previous FinishDataBlock to the current index block.
109 : AddPrevDataBlockToIndexBlock()
110 : // FinishIndexBlock is called when an index block, containing all the
111 : // key-value pairs since the last FinishIndexBlock, will no longer see new
112 : // entries. It returns the property value for the index block.
113 : FinishIndexBlock(buf []byte) ([]byte, error)
114 : // FinishTable is called when the sstable is finished, and returns the
115 : // property value for the sstable.
116 : FinishTable(buf []byte) ([]byte, error)
117 : }
118 :
119 : // SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector
120 : // interface that allows a block property collector to indicate that it supports
121 : // being *updated* during suffix replacement, i.e. when an existing SST in which
122 : // all keys have the same key suffix is updated to have a new suffix.
123 : //
124 : // A collector which supports being updated in such cases must be able to derive
125 : // its updated value from its old value and the change being made to the suffix,
126 : // without needing to be passed each updated K/V.
127 : //
128 : // For example, a collector that only inspects values would can simply copy its
129 : // previously computed property as-is, since key-suffix replacement does not
130 : // change values, while a collector that depends only on key suffixes, like one
131 : // which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just
132 : // set its new bounds from the new suffix, as it is common to all keys, without
133 : // needing to recompute it from every key.
134 : //
135 : // An implementation of DataBlockIntervalCollector can also implement this
136 : // interface, in which case the BlockPropertyCollector returned by passing it to
137 : // NewBlockIntervalCollector will also implement this interface automatically.
138 : type SuffixReplaceableBlockCollector interface {
139 : // UpdateKeySuffixes is called when a block is updated to change the suffix of
140 : // all keys in the block, and is passed the old value for that prop, if any,
141 : // for that block as well as the old and new suffix.
142 : UpdateKeySuffixes(oldProp []byte, oldSuffix, newSuffix []byte) error
143 : }
144 :
145 : // BlockPropertyFilter is used in an Iterator to filter sstables and blocks
146 : // within the sstable. It should not maintain any per-sstable state, and must
147 : // be thread-safe.
148 : type BlockPropertyFilter = base.BlockPropertyFilter
149 :
150 : // BoundLimitedBlockPropertyFilter implements the block-property filter but
151 : // imposes an additional constraint on its usage, requiring that only blocks
152 : // containing exclusively keys between its lower and upper bounds may be
153 : // filtered. The bounds may be change during iteration, so the filter doesn't
154 : // expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods
155 : // for performing bound comparisons.
156 : //
157 : // To be used, a BoundLimitedBlockPropertyFilter must be supplied directly
158 : // through NewBlockPropertiesFilterer's dedicated parameter. If supplied through
159 : // the ordinary slice of block property filters, this filter's bounds will be
160 : // ignored.
161 : //
162 : // The current [lower,upper) bounds of the filter are unknown, because they may
163 : // be changing. During forward iteration the lower bound is externally
164 : // guaranteed, meaning Intersects only returns false if the sstable iterator is
165 : // already known to be positioned at a key ≥ lower. The sstable iterator is then
166 : // only responsible for ensuring filtered blocks also meet the upper bound, and
167 : // should only allow a block to be filtered if all its keys are < upper. The
168 : // sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check,
169 : // where key is an inclusive upper bound on the block's keys.
170 : //
171 : // During backward iteration the upper bound is externally guaranteed, and
172 : // Intersects only returns false if the sstable iterator is already known to be
173 : // positioned at a key < upper. The sstable iterator is responsible for ensuring
174 : // filtered blocks also meet the lower bound, enforcing that a block is only
175 : // filtered if all its keys are ≥ lower. This check is made through passing the
176 : // block's inclusive lower bound to KeyIsWithinLowerBound.
177 : //
178 : // Implementations may become active or inactive through implementing Intersects
179 : // to return true whenever the filter is disabled.
180 : //
181 : // Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers
182 : // should not implement this interface directly. This interface is an internal
183 : // detail in the implementation of block-property range-key masking.
184 : type BoundLimitedBlockPropertyFilter interface {
185 : BlockPropertyFilter
186 :
187 : // KeyIsWithinLowerBound tests whether the provided internal key falls
188 : // within the current lower bound of the filter. A true return value
189 : // indicates that the filter may be used to filter blocks that exclusively
190 : // contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper
191 : // bound.
192 : KeyIsWithinLowerBound(key []byte) bool
193 : // KeyIsWithinUpperBound tests whether the provided internal key falls
194 : // within the current upper bound of the filter. A true return value
195 : // indicates that the filter may be used to filter blocks that exclusively
196 : // contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower
197 : // bound.
198 : KeyIsWithinUpperBound(key []byte) bool
199 : }
200 :
201 : // BlockIntervalCollector is a helper implementation of BlockPropertyCollector
202 : // for users who want to represent a set of the form [lower,upper) where both
203 : // lower and upper are uint64, and lower <= upper.
204 : //
205 : // The set is encoded as:
206 : // - Two varint integers, (lower,upper-lower), when upper-lower > 0
207 : // - Nil, when upper-lower=0
208 : //
209 : // Users must not expect this to preserve differences between empty sets --
210 : // they will all get turned into the semantically equivalent [0,0).
211 : //
212 : // A BlockIntervalCollector that collects over point and range keys needs to
213 : // have both the point and range DataBlockIntervalCollector specified, since
214 : // point and range keys are fed to the BlockIntervalCollector in an interleaved
215 : // fashion, independently of one another. This also implies that the
216 : // DataBlockIntervalCollectors for point and range keys should be references to
217 : // independent instances, rather than references to the same collector, as point
218 : // and range keys are tracked independently.
219 : type BlockIntervalCollector struct {
220 : name string
221 : points DataBlockIntervalCollector
222 : ranges DataBlockIntervalCollector
223 :
224 : blockInterval interval
225 : indexInterval interval
226 : tableInterval interval
227 : }
228 :
229 : var _ BlockPropertyCollector = &BlockIntervalCollector{}
230 :
231 : // DataBlockIntervalCollector is the interface used by BlockIntervalCollector
232 : // that contains the actual logic pertaining to the property. It only
233 : // maintains state for the current data block, and resets that state in
234 : // FinishDataBlock. This interface can be used to reduce parsing costs.
235 : type DataBlockIntervalCollector interface {
236 : // Add is called with each new entry added to a data block in the sstable.
237 : // The callee can assume that these are in sorted order.
238 : Add(key InternalKey, value []byte) error
239 : // FinishDataBlock is called when all the entries have been added to a
240 : // data block. Subsequent Add calls will be for the next data block. It
241 : // returns the [lower, upper) for the finished block.
242 : FinishDataBlock() (lower uint64, upper uint64, err error)
243 : }
244 :
245 : // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given
246 : // name. The BlockIntervalCollector makes use of the given point and range key
247 : // DataBlockIntervalCollectors when encountering point and range keys,
248 : // respectively.
249 : //
250 : // The caller may pass a nil DataBlockIntervalCollector for one of the point or
251 : // range key collectors, in which case keys of those types will be ignored. This
252 : // allows for flexible construction of BlockIntervalCollectors that operate on
253 : // just point keys, just range keys, or both point and range keys.
254 : //
255 : // If both point and range keys are to be tracked, two independent collectors
256 : // should be provided, rather than the same collector passed in twice (see the
257 : // comment on BlockIntervalCollector for more detail)
258 : func NewBlockIntervalCollector(
259 : name string, pointCollector, rangeCollector DataBlockIntervalCollector,
260 1 : ) BlockPropertyCollector {
261 1 : if pointCollector == nil && rangeCollector == nil {
262 1 : panic("sstable: at least one interval collector must be provided")
263 : }
264 1 : bic := BlockIntervalCollector{
265 1 : name: name,
266 1 : points: pointCollector,
267 1 : ranges: rangeCollector,
268 1 : }
269 1 : if _, ok := pointCollector.(SuffixReplaceableBlockCollector); ok {
270 1 : return &suffixReplacementBlockCollectorWrapper{bic}
271 1 : }
272 1 : return &bic
273 : }
274 :
275 : // Name implements the BlockPropertyCollector interface.
276 1 : func (b *BlockIntervalCollector) Name() string {
277 1 : return b.name
278 1 : }
279 :
280 : // Add implements the BlockPropertyCollector interface.
281 1 : func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error {
282 1 : if rangekey.IsRangeKey(key.Kind()) {
283 1 : if b.ranges != nil {
284 1 : return b.ranges.Add(key, value)
285 1 : }
286 1 : } else if b.points != nil {
287 1 : return b.points.Add(key, value)
288 1 : }
289 1 : return nil
290 : }
291 :
292 : // FinishDataBlock implements the BlockPropertyCollector interface.
293 1 : func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) {
294 1 : if b.points == nil {
295 1 : return buf, nil
296 1 : }
297 1 : var err error
298 1 : b.blockInterval.lower, b.blockInterval.upper, err = b.points.FinishDataBlock()
299 1 : if err != nil {
300 0 : return buf, err
301 0 : }
302 1 : buf = b.blockInterval.encode(buf)
303 1 : b.tableInterval.union(b.blockInterval)
304 1 : return buf, nil
305 : }
306 :
307 : // AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector
308 : // interface.
309 1 : func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() {
310 1 : b.indexInterval.union(b.blockInterval)
311 1 : b.blockInterval = interval{}
312 1 : }
313 :
314 : // FinishIndexBlock implements the BlockPropertyCollector interface.
315 1 : func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
316 1 : buf = b.indexInterval.encode(buf)
317 1 : b.indexInterval = interval{}
318 1 : return buf, nil
319 1 : }
320 :
321 : // FinishTable implements the BlockPropertyCollector interface.
322 1 : func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) {
323 1 : // If the collector is tracking range keys, the range key interval is union-ed
324 1 : // with the point key interval for the table.
325 1 : if b.ranges != nil {
326 1 : var rangeInterval interval
327 1 : var err error
328 1 : rangeInterval.lower, rangeInterval.upper, err = b.ranges.FinishDataBlock()
329 1 : if err != nil {
330 0 : return buf, err
331 0 : }
332 1 : b.tableInterval.union(rangeInterval)
333 : }
334 1 : return b.tableInterval.encode(buf), nil
335 : }
336 :
337 : type interval struct {
338 : lower uint64
339 : upper uint64
340 : }
341 :
342 1 : func (i interval) encode(buf []byte) []byte {
343 1 : if i.lower < i.upper {
344 1 : var encoded [binary.MaxVarintLen64 * 2]byte
345 1 : n := binary.PutUvarint(encoded[:], i.lower)
346 1 : n += binary.PutUvarint(encoded[n:], i.upper-i.lower)
347 1 : buf = append(buf, encoded[:n]...)
348 1 : }
349 1 : return buf
350 : }
351 :
352 1 : func (i *interval) decode(buf []byte) error {
353 1 : if len(buf) == 0 {
354 1 : *i = interval{}
355 1 : return nil
356 1 : }
357 1 : var n int
358 1 : i.lower, n = binary.Uvarint(buf)
359 1 : if n <= 0 || n >= len(buf) {
360 0 : return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
361 0 : }
362 1 : pos := n
363 1 : i.upper, n = binary.Uvarint(buf[pos:])
364 1 : pos += n
365 1 : if pos != len(buf) || n <= 0 {
366 0 : return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
367 0 : }
368 : // Delta decode.
369 1 : i.upper += i.lower
370 1 : if i.upper < i.lower {
371 0 : return base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.upper, i.lower)
372 0 : }
373 1 : return nil
374 : }
375 :
376 1 : func (i *interval) union(x interval) {
377 1 : if x.lower >= x.upper {
378 1 : // x is the empty set.
379 1 : return
380 1 : }
381 1 : if i.lower >= i.upper {
382 1 : // i is the empty set.
383 1 : *i = x
384 1 : return
385 1 : }
386 : // Both sets are non-empty.
387 1 : if x.lower < i.lower {
388 1 : i.lower = x.lower
389 1 : }
390 1 : if x.upper > i.upper {
391 1 : i.upper = x.upper
392 1 : }
393 : }
394 :
395 1 : func (i interval) intersects(x interval) bool {
396 1 : if i.lower >= i.upper || x.lower >= x.upper {
397 1 : // At least one of the sets is empty.
398 1 : return false
399 1 : }
400 : // Neither set is empty.
401 1 : return i.upper > x.lower && i.lower < x.upper
402 : }
403 :
404 : type suffixReplacementBlockCollectorWrapper struct {
405 : BlockIntervalCollector
406 : }
407 :
408 : // UpdateKeySuffixes implements the SuffixReplaceableBlockCollector interface.
409 : func (w *suffixReplacementBlockCollectorWrapper) UpdateKeySuffixes(
410 : oldProp []byte, from, to []byte,
411 1 : ) error {
412 1 : return w.BlockIntervalCollector.points.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProp, from, to)
413 1 : }
414 :
415 : // BlockIntervalFilter is an implementation of BlockPropertyFilter when the
416 : // corresponding collector is a BlockIntervalCollector. That is, the set is of
417 : // the form [lower, upper).
418 : type BlockIntervalFilter struct {
419 : name string
420 : filterInterval interval
421 : }
422 :
423 : var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil)
424 :
425 : // NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks
426 : // based on an interval property collected by BlockIntervalCollector and the
427 : // given [lower, upper) bounds. The given name specifies the
428 : // BlockIntervalCollector's properties to read.
429 1 : func NewBlockIntervalFilter(name string, lower uint64, upper uint64) *BlockIntervalFilter {
430 1 : b := new(BlockIntervalFilter)
431 1 : b.Init(name, lower, upper)
432 1 : return b
433 1 : }
434 :
435 : // Init initializes (or re-initializes, clearing previous state) an existing
436 : // BLockPropertyFilter to filter blocks based on an interval property collected
437 : // by BlockIntervalCollector and the given [lower, upper) bounds. The given name
438 : // specifies the BlockIntervalCollector's properties to read.
439 1 : func (b *BlockIntervalFilter) Init(name string, lower, upper uint64) {
440 1 : *b = BlockIntervalFilter{
441 1 : name: name,
442 1 : filterInterval: interval{lower: lower, upper: upper},
443 1 : }
444 1 : }
445 :
446 : // Name implements the BlockPropertyFilter interface.
447 1 : func (b *BlockIntervalFilter) Name() string {
448 1 : return b.name
449 1 : }
450 :
451 : // Intersects implements the BlockPropertyFilter interface.
452 1 : func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) {
453 1 : var i interval
454 1 : if err := i.decode(prop); err != nil {
455 0 : return false, err
456 0 : }
457 1 : return i.intersects(b.filterInterval), nil
458 : }
459 :
460 : // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not
461 : // generally safe to alter the filter while it's in use, except as part of the
462 : // implementation of BlockPropertyFilterMask.SetSuffix used for range-key
463 : // masking.
464 1 : func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) {
465 1 : b.filterInterval = interval{lower: lower, upper: upper}
466 1 : }
467 :
468 : // When encoding block properties for each block, we cannot afford to encode
469 : // the name. Instead, the name is mapped to a shortID, in the scope of that
470 : // sstable, and the shortID is encoded. Since we use a uint8, there is a limit
471 : // of 256 block property collectors per sstable.
472 : type shortID uint8
473 :
474 : type blockPropertiesEncoder struct {
475 : propsBuf []byte
476 : scratch []byte
477 : }
478 :
479 1 : func (e *blockPropertiesEncoder) getScratchForProp() []byte {
480 1 : return e.scratch[:0]
481 1 : }
482 :
483 1 : func (e *blockPropertiesEncoder) resetProps() {
484 1 : e.propsBuf = e.propsBuf[:0]
485 1 : }
486 :
487 1 : func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) {
488 1 : const lenID = 1
489 1 : lenProp := uvarintLen(uint32(len(scratch)))
490 1 : n := lenID + lenProp + len(scratch)
491 1 : if cap(e.propsBuf)-len(e.propsBuf) < n {
492 1 : size := len(e.propsBuf) + 2*n
493 1 : if size < 2*cap(e.propsBuf) {
494 1 : size = 2 * cap(e.propsBuf)
495 1 : }
496 1 : buf := make([]byte, len(e.propsBuf), size)
497 1 : copy(buf, e.propsBuf)
498 1 : e.propsBuf = buf
499 : }
500 1 : pos := len(e.propsBuf)
501 1 : b := e.propsBuf[pos : pos+lenID]
502 1 : b[0] = byte(id)
503 1 : pos += lenID
504 1 : b = e.propsBuf[pos : pos+lenProp]
505 1 : n = binary.PutUvarint(b, uint64(len(scratch)))
506 1 : pos += n
507 1 : b = e.propsBuf[pos : pos+len(scratch)]
508 1 : pos += len(scratch)
509 1 : copy(b, scratch)
510 1 : e.propsBuf = e.propsBuf[0:pos]
511 1 : e.scratch = scratch
512 : }
513 :
514 1 : func (e *blockPropertiesEncoder) unsafeProps() []byte {
515 1 : return e.propsBuf
516 1 : }
517 :
518 1 : func (e *blockPropertiesEncoder) props() []byte {
519 1 : buf := make([]byte, len(e.propsBuf))
520 1 : copy(buf, e.propsBuf)
521 1 : return buf
522 1 : }
523 :
524 : type blockPropertiesDecoder struct {
525 : props []byte
526 : }
527 :
528 1 : func (d *blockPropertiesDecoder) done() bool {
529 1 : return len(d.props) == 0
530 1 : }
531 :
532 : // REQUIRES: !done()
533 1 : func (d *blockPropertiesDecoder) next() (id shortID, prop []byte, err error) {
534 1 : const lenID = 1
535 1 : id = shortID(d.props[0])
536 1 : propLen, m := binary.Uvarint(d.props[lenID:])
537 1 : n := lenID + m
538 1 : if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) {
539 0 : return 0, nil, base.CorruptionErrorf("corrupt block property length")
540 0 : }
541 1 : prop = d.props[n : n+int(propLen)]
542 1 : d.props = d.props[n+int(propLen):]
543 1 : return id, prop, nil
544 : }
545 :
546 : // BlockPropertiesFilterer provides filtering support when reading an sstable
547 : // in the context of an iterator that has a slice of BlockPropertyFilters.
548 : // After the call to NewBlockPropertiesFilterer, the caller must call
549 : // IntersectsUserPropsAndFinishInit to check if the sstable intersects with
550 : // the filters. If it does intersect, this function also finishes initializing
551 : // the BlockPropertiesFilterer using the shortIDs for the relevant filters.
552 : // Subsequent checks for relevance of a block should use the intersects
553 : // method.
554 : type BlockPropertiesFilterer struct {
555 : filters []BlockPropertyFilter
556 : // Maps shortID => index in filters. This can be sparse, and shortIDs for
557 : // which there is no filter are represented with an index of -1. The
558 : // length of this can be shorter than the shortIDs allocated in the
559 : // sstable. e.g. if the sstable used shortIDs 0, 1, 2, 3, and the iterator
560 : // has two filters, corresponding to shortIDs 2, 0, this would be:
561 : // len(shortIDToFiltersIndex)==3, 0=>1, 1=>-1, 2=>0.
562 : shortIDToFiltersIndex []int
563 :
564 : // boundLimitedFilter, if non-nil, holds a single block-property filter with
565 : // additional constraints on its filtering. A boundLimitedFilter may only
566 : // filter blocks that are wholly contained within its bounds. During forward
567 : // iteration the lower bound (and during backward iteration the upper bound)
568 : // must be externally guaranteed, with Intersects only returning false if
569 : // that bound is met. The opposite bound is verified during iteration by the
570 : // sstable iterator.
571 : //
572 : // boundLimitedFilter is permitted to be defined on a property (`Name()`)
573 : // for which another filter exists in filters. In this case both filters
574 : // will be consulted, and either filter may exclude block(s). Only a single
575 : // bound-limited block-property filter may be set.
576 : //
577 : // The boundLimitedShortID field contains the shortID of the filter's
578 : // property within the sstable. It's set to -1 if the property was not
579 : // collected when the table was built.
580 : boundLimitedFilter BoundLimitedBlockPropertyFilter
581 : boundLimitedShortID int
582 : }
583 :
584 : var blockPropertiesFiltererPool = sync.Pool{
585 1 : New: func() interface{} {
586 1 : return &BlockPropertiesFilterer{}
587 1 : },
588 : }
589 :
590 : // newBlockPropertiesFilterer returns a partially initialized filterer. To complete
591 : // initialization, call IntersectsUserPropsAndFinishInit.
592 : func newBlockPropertiesFilterer(
593 : filters []BlockPropertyFilter, limited BoundLimitedBlockPropertyFilter,
594 1 : ) *BlockPropertiesFilterer {
595 1 : filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer)
596 1 : *filterer = BlockPropertiesFilterer{
597 1 : filters: filters,
598 1 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
599 1 : boundLimitedFilter: limited,
600 1 : boundLimitedShortID: -1,
601 1 : }
602 1 : return filterer
603 1 : }
604 :
605 1 : func releaseBlockPropertiesFilterer(filterer *BlockPropertiesFilterer) {
606 1 : *filterer = BlockPropertiesFilterer{
607 1 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
608 1 : }
609 1 : blockPropertiesFiltererPool.Put(filterer)
610 1 : }
611 :
612 : // IntersectsTable evaluates the provided block-property filter against the
613 : // provided set of table-level properties. If there is no intersection between
614 : // the filters and the table or an error is encountered, IntersectsTable returns
615 : // a nil filterer (and possibly an error). If there is an intersection,
616 : // IntersectsTable returns a non-nil filterer that may be used by an iterator
617 : // reading the table.
618 : func IntersectsTable(
619 : filters []BlockPropertyFilter,
620 : limited BoundLimitedBlockPropertyFilter,
621 : userProperties map[string]string,
622 1 : ) (*BlockPropertiesFilterer, error) {
623 1 : f := newBlockPropertiesFilterer(filters, limited)
624 1 : ok, err := f.intersectsUserPropsAndFinishInit(userProperties)
625 1 : if !ok || err != nil {
626 1 : releaseBlockPropertiesFilterer(f)
627 1 : return nil, err
628 1 : }
629 1 : return f, nil
630 : }
631 :
632 : // intersectsUserPropsAndFinishInit is called with the user properties map for
633 : // the sstable and returns whether the sstable intersects the filters. It
634 : // additionally initializes the shortIDToFiltersIndex for the filters that are
635 : // relevant to this sstable.
636 : func (f *BlockPropertiesFilterer) intersectsUserPropsAndFinishInit(
637 : userProperties map[string]string,
638 1 : ) (bool, error) {
639 1 : for i := range f.filters {
640 1 : props, ok := userProperties[f.filters[i].Name()]
641 1 : if !ok {
642 1 : // Collector was not used when writing this file, so it is
643 1 : // considered intersecting.
644 1 : continue
645 : }
646 1 : byteProps := []byte(props)
647 1 : if len(byteProps) < 1 {
648 0 : return false, base.CorruptionErrorf(
649 0 : "block properties for %s is corrupted", f.filters[i].Name())
650 0 : }
651 1 : shortID := shortID(byteProps[0])
652 1 : intersects, err := f.filters[i].Intersects(byteProps[1:])
653 1 : if err != nil || !intersects {
654 1 : return false, err
655 1 : }
656 : // Intersects the sstable, so need to use this filter when
657 : // deciding whether to read blocks.
658 1 : n := len(f.shortIDToFiltersIndex)
659 1 : if n <= int(shortID) {
660 1 : if cap(f.shortIDToFiltersIndex) <= int(shortID) {
661 1 : index := make([]int, shortID+1, 2*(shortID+1))
662 1 : copy(index, f.shortIDToFiltersIndex)
663 1 : f.shortIDToFiltersIndex = index
664 1 : } else {
665 1 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:shortID+1]
666 1 : }
667 1 : for j := n; j < int(shortID); j++ {
668 1 : f.shortIDToFiltersIndex[j] = -1
669 1 : }
670 : }
671 1 : f.shortIDToFiltersIndex[shortID] = i
672 : }
673 1 : if f.boundLimitedFilter == nil {
674 1 : return true, nil
675 1 : }
676 :
677 : // There's a bound-limited filter. Find its shortID. It's possible that
678 : // there's an existing filter in f.filters on the same property. That's
679 : // okay. Both filters will be consulted whenever a relevant prop is decoded.
680 1 : props, ok := userProperties[f.boundLimitedFilter.Name()]
681 1 : if !ok {
682 1 : // The collector was not used when writing this file, so it's
683 1 : // intersecting. We leave f.boundLimitedShortID=-1, so the filter will
684 1 : // be unused within this file.
685 1 : return true, nil
686 1 : }
687 1 : byteProps := []byte(props)
688 1 : if len(byteProps) < 1 {
689 0 : return false, base.CorruptionErrorf(
690 0 : "block properties for %s is corrupted", f.boundLimitedFilter.Name())
691 0 : }
692 1 : f.boundLimitedShortID = int(byteProps[0])
693 1 :
694 1 : // We don't check for table-level intersection for the bound-limited filter.
695 1 : // The bound-limited filter is treated as vacuously intersecting.
696 1 : //
697 1 : // NB: If a block-property filter needs to be toggled inactive/active, it
698 1 : // should be implemented within the Intersects implementation.
699 1 : //
700 1 : // TODO(jackson): We could filter at the table-level by threading the table
701 1 : // smallest and largest bounds here.
702 1 :
703 1 : // The bound-limited filter isn't included in shortIDToFiltersIndex.
704 1 : //
705 1 : // When determining intersection, we decode props only up to the shortID
706 1 : // len(shortIDToFiltersIndex). If f.limitedShortID is greater than any of
707 1 : // the existing filters' shortIDs, we need to grow shortIDToFiltersIndex.
708 1 : // Growing the index with -1s ensures we're able to consult the index
709 1 : // without length checks.
710 1 : if n := len(f.shortIDToFiltersIndex); n <= f.boundLimitedShortID {
711 1 : if cap(f.shortIDToFiltersIndex) <= f.boundLimitedShortID {
712 1 : index := make([]int, f.boundLimitedShortID+1)
713 1 : copy(index, f.shortIDToFiltersIndex)
714 1 : f.shortIDToFiltersIndex = index
715 1 : } else {
716 1 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:f.boundLimitedShortID+1]
717 1 : }
718 1 : for j := n; j <= f.boundLimitedShortID; j++ {
719 1 : f.shortIDToFiltersIndex[j] = -1
720 1 : }
721 : }
722 1 : return true, nil
723 : }
724 :
725 : type intersectsResult int8
726 :
727 : const (
728 : blockIntersects intersectsResult = iota
729 : blockExcluded
730 : // blockMaybeExcluded is returned by BlockPropertiesFilterer.intersects when
731 : // no filters unconditionally exclude the block, but the bound-limited block
732 : // property filter will exclude it if the block's bounds fall within the
733 : // filter's current bounds. See the reader's
734 : // {single,two}LevelIterator.resolveMaybeExcluded methods.
735 : blockMaybeExcluded
736 : )
737 :
738 1 : func (f *BlockPropertiesFilterer) intersects(props []byte) (ret intersectsResult, err error) {
739 1 : i := 0
740 1 : decoder := blockPropertiesDecoder{props: props}
741 1 : ret = blockIntersects
742 1 : for i < len(f.shortIDToFiltersIndex) {
743 1 : var id int
744 1 : var prop []byte
745 1 : if !decoder.done() {
746 1 : var shortID shortID
747 1 : var err error
748 1 : shortID, prop, err = decoder.next()
749 1 : if err != nil {
750 0 : return ret, err
751 0 : }
752 1 : id = int(shortID)
753 1 : } else {
754 1 : id = math.MaxUint8 + 1
755 1 : }
756 1 : for i < len(f.shortIDToFiltersIndex) && id > i {
757 1 : // The property for this id is not encoded for this block, but there
758 1 : // may still be a filter for this id.
759 1 : if intersects, err := f.intersectsFilter(i, nil); err != nil {
760 0 : return ret, err
761 1 : } else if intersects == blockExcluded {
762 1 : return blockExcluded, nil
763 1 : } else if intersects == blockMaybeExcluded {
764 0 : ret = blockMaybeExcluded
765 0 : }
766 1 : i++
767 : }
768 1 : if i >= len(f.shortIDToFiltersIndex) {
769 1 : return ret, nil
770 1 : }
771 : // INVARIANT: id <= i. And since i is always incremented by 1, id==i.
772 1 : if id != i {
773 0 : panic(fmt.Sprintf("%d != %d", id, i))
774 : }
775 1 : if intersects, err := f.intersectsFilter(i, prop); err != nil {
776 0 : return ret, err
777 1 : } else if intersects == blockExcluded {
778 1 : return blockExcluded, nil
779 1 : } else if intersects == blockMaybeExcluded {
780 1 : ret = blockMaybeExcluded
781 1 : }
782 1 : i++
783 : }
784 : // ret == blockIntersects || ret == blockMaybeExcluded
785 1 : return ret, nil
786 : }
787 :
788 1 : func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersectsResult, error) {
789 1 : if f.shortIDToFiltersIndex[i] >= 0 {
790 1 : intersects, err := f.filters[f.shortIDToFiltersIndex[i]].Intersects(prop)
791 1 : if err != nil {
792 0 : return blockIntersects, err
793 0 : }
794 1 : if !intersects {
795 1 : return blockExcluded, nil
796 1 : }
797 : }
798 1 : if i == f.boundLimitedShortID {
799 1 : // The bound-limited filter uses this id.
800 1 : //
801 1 : // The bound-limited filter only applies within a keyspan interval. We
802 1 : // expect the Intersects call to be cheaper than bounds checks. If
803 1 : // Intersects determines that there is no intersection, we return
804 1 : // `blockMaybeExcluded` if no other bpf unconditionally excludes the
805 1 : // block.
806 1 : intersects, err := f.boundLimitedFilter.Intersects(prop)
807 1 : if err != nil {
808 0 : return blockIntersects, err
809 1 : } else if !intersects {
810 1 : return blockMaybeExcluded, nil
811 1 : }
812 : }
813 1 : return blockIntersects, nil
814 : }
|