Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "math"
11 : "sync"
12 : "unsafe"
13 :
14 : "github.com/cockroachdb/errors"
15 : "github.com/cockroachdb/pebble/internal/base"
16 : "github.com/cockroachdb/pebble/internal/rangekey"
17 : )
18 :
19 : // Block properties are an optional user-facing feature that can be used to
20 : // filter data blocks (and whole sstables) from an Iterator before they are
21 : // loaded. They do not apply to range delete blocks. These are expected to
22 : // very concisely represent a set of some attribute value contained within the
23 : // key or value, such that the set includes all the attribute values in the
24 : // block. This has some similarities with OLAP pruning approaches that
25 : // maintain min-max attribute values for some column (which concisely
26 : // represent a set), that is then used to prune at query time. In Pebble's
27 : // case, data blocks are small, typically 25-50KB, so these properties should
28 : // reduce their precision in order to be concise -- a good rule of thumb is to
29 : // not consume more than 50-100 bytes across all properties maintained for a
30 : // block, i.e., a 500x reduction compared to loading the data block.
31 : //
32 : // A block property must be assigned a unique name, which is encoded and
33 : // stored in the sstable. This name must be unique among all user-properties
34 : // encoded in an sstable.
35 : //
36 : // A property is represented as a []byte. A nil value or empty byte slice are
37 : // considered semantically identical. The caller is free to choose the
38 : // semantics of an empty byte slice e.g. they could use it to represent the
39 : // empty set or the universal set, whichever they think is more common and
40 : // therefore better to encode more concisely. The serialization of the
41 : // property for the various Finish*() calls in a BlockPropertyCollector
42 : // implementation should be identical, since the corresponding
43 : // BlockPropertyFilter implementation is not told the context in which it is
44 : // deserializing the property.
45 : //
46 : // Block properties are more general than table properties and should be
47 : // preferred over using table properties. A BlockPropertyCollector can achieve
48 : // identical behavior to table properties by returning the nil slice from
49 : // FinishDataBlock and FinishIndexBlock, and interpret them as the universal
50 : // set in BlockPropertyFilter, and return a non-universal set in FinishTable.
51 : //
52 : // Block property filtering is nondeterministic because the separation of keys
53 : // into blocks is nondeterministic. Clients use block-property filters to
54 : // implement efficient application of a filter F that applies to key-value pairs
55 : // (abbreviated as kv-filter). Consider correctness defined as surfacing exactly
56 : // the same key-value pairs that would be surfaced if one applied the filter F
57 : // above normal iteration. With this correctness definition, block property
58 : // filtering may introduce two kinds of errors:
59 : //
60 : // a) Block property filtering that uses a kv-filter may produce additional
61 : // key-value pairs that don't satisfy the filter because of the separation
62 : // of keys into blocks. Clients may remove these extra key-value pairs by
63 : // re-applying the kv filter while reading results back from Pebble.
64 : //
65 : // b) Block property filtering may surface deleted key-value pairs if the
66 : // kv filter is not a strict function of the key's user key. A block
67 : // containing k.DEL may be filtered, while a block containing the deleted
68 : // key k.SET may not be filtered, if the kv filter applies to one but not
69 : // the other.
70 : //
71 : // This error may be avoided trivially by using a kv filter that is a pure
72 : // function of the user key. A filter that examines values or key kinds
73 : // requires care to ensure F(k.SET, <value>) = F(k.DEL) = F(k.SINGLEDEL).
74 : //
75 : // The combination of range deletions and filtering by table-level properties
76 : // add another opportunity for deleted point keys to be surfaced. The pebble
77 : // Iterator stack takes care to correctly apply filtered tables' range deletions
78 : // to lower tables, preventing this form of nondeterministic error.
79 : //
80 : // In addition to the non-determinism discussed in (b), which limits the use
81 : // of properties over values, we now have support for values that are not
82 : // stored together with the key, and may not even be retrieved during
83 : // compactions. If Pebble is configured with such value separation, block
84 : // properties must only apply to the key, and will be provided a nil value.
85 :
86 : // BlockPropertyCollector is used when writing a sstable.
87 : //
88 : // - All calls to Add are included in the next FinishDataBlock, after which
89 : // the next data block is expected to start.
90 : //
91 : // - The index entry generated for the data block, which contains the return
92 : // value from FinishDataBlock, is not immediately included in the current
93 : // index block. It is included when AddPrevDataBlockToIndexBlock is called.
94 : // An alternative would be to return an opaque handle from FinishDataBlock
95 : // and pass it to a new AddToIndexBlock method, which requires more
96 : // plumbing, and passing of an interface{} results in a undesirable heap
97 : // allocation. AddPrevDataBlockToIndexBlock must be called before keys are
98 : // added to the new data block.
99 : type BlockPropertyCollector interface {
100 : // Name returns the name of the block property collector.
101 : Name() string
102 : // Add is called with each new entry added to a data block in the sstable.
103 : // The callee can assume that these are in sorted order.
104 : Add(key InternalKey, value []byte) error
105 : // FinishDataBlock is called when all the entries have been added to a
106 : // data block. Subsequent Add calls will be for the next data block. It
107 : // returns the property value for the finished block.
108 : FinishDataBlock(buf []byte) ([]byte, error)
109 : // AddPrevDataBlockToIndexBlock adds the entry corresponding to the
110 : // previous FinishDataBlock to the current index block.
111 : AddPrevDataBlockToIndexBlock()
112 : // FinishIndexBlock is called when an index block, containing all the
113 : // key-value pairs since the last FinishIndexBlock, will no longer see new
114 : // entries. It returns the property value for the index block.
115 : FinishIndexBlock(buf []byte) ([]byte, error)
116 : // FinishTable is called when the sstable is finished, and returns the
117 : // property value for the sstable.
118 : FinishTable(buf []byte) ([]byte, error)
119 : }
120 :
121 : // SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector
122 : // interface that allows a block property collector to indicate that it supports
123 : // being *updated* during suffix replacement, i.e. when an existing SST in which
124 : // all keys have the same key suffix is updated to have a new suffix.
125 : //
126 : // A collector which supports being updated in such cases must be able to derive
127 : // its updated value from its old value and the change being made to the suffix,
128 : // without needing to be passed each updated K/V.
129 : //
130 : // For example, a collector that only inspects values would can simply copy its
131 : // previously computed property as-is, since key-suffix replacement does not
132 : // change values, while a collector that depends only on key suffixes, like one
133 : // which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just
134 : // set its new bounds from the new suffix, as it is common to all keys, without
135 : // needing to recompute it from every key.
136 : //
137 : // An implementation of DataBlockIntervalCollector can also implement this
138 : // interface, in which case the BlockPropertyCollector returned by passing it to
139 : // NewBlockIntervalCollector will also implement this interface automatically.
140 : type SuffixReplaceableBlockCollector interface {
141 : // UpdateKeySuffixes is called when a block is updated to change the suffix of
142 : // all keys in the block, and is passed the old value for that prop, if any,
143 : // for that block as well as the old and new suffix.
144 : UpdateKeySuffixes(oldProp []byte, oldSuffix, newSuffix []byte) error
145 : }
146 :
147 : // BlockPropertyFilter is used in an Iterator to filter sstables and blocks
148 : // within the sstable. It should not maintain any per-sstable state, and must
149 : // be thread-safe.
150 : type BlockPropertyFilter = base.BlockPropertyFilter
151 :
152 : // BoundLimitedBlockPropertyFilter implements the block-property filter but
153 : // imposes an additional constraint on its usage, requiring that only blocks
154 : // containing exclusively keys between its lower and upper bounds may be
155 : // filtered. The bounds may be change during iteration, so the filter doesn't
156 : // expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods
157 : // for performing bound comparisons.
158 : //
159 : // To be used, a BoundLimitedBlockPropertyFilter must be supplied directly
160 : // through NewBlockPropertiesFilterer's dedicated parameter. If supplied through
161 : // the ordinary slice of block property filters, this filter's bounds will be
162 : // ignored.
163 : //
164 : // The current [lower,upper) bounds of the filter are unknown, because they may
165 : // be changing. During forward iteration the lower bound is externally
166 : // guaranteed, meaning Intersects only returns false if the sstable iterator is
167 : // already known to be positioned at a key ≥ lower. The sstable iterator is then
168 : // only responsible for ensuring filtered blocks also meet the upper bound, and
169 : // should only allow a block to be filtered if all its keys are < upper. The
170 : // sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check,
171 : // where key is an inclusive upper bound on the block's keys.
172 : //
173 : // During backward iteration the upper bound is externally guaranteed, and
174 : // Intersects only returns false if the sstable iterator is already known to be
175 : // positioned at a key < upper. The sstable iterator is responsible for ensuring
176 : // filtered blocks also meet the lower bound, enforcing that a block is only
177 : // filtered if all its keys are ≥ lower. This check is made through passing the
178 : // block's inclusive lower bound to KeyIsWithinLowerBound.
179 : //
180 : // Implementations may become active or inactive through implementing Intersects
181 : // to return true whenever the filter is disabled.
182 : //
183 : // Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers
184 : // should not implement this interface directly. This interface is an internal
185 : // detail in the implementation of block-property range-key masking.
186 : type BoundLimitedBlockPropertyFilter interface {
187 : BlockPropertyFilter
188 :
189 : // KeyIsWithinLowerBound tests whether the provided internal key falls
190 : // within the current lower bound of the filter. A true return value
191 : // indicates that the filter may be used to filter blocks that exclusively
192 : // contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper
193 : // bound.
194 : KeyIsWithinLowerBound(key []byte) bool
195 : // KeyIsWithinUpperBound tests whether the provided internal key falls
196 : // within the current upper bound of the filter. A true return value
197 : // indicates that the filter may be used to filter blocks that exclusively
198 : // contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower
199 : // bound.
200 : KeyIsWithinUpperBound(key []byte) bool
201 : }
202 :
203 : // BlockIntervalCollector is a helper implementation of BlockPropertyCollector
204 : // for users who want to represent a set of the form [lower,upper) where both
205 : // lower and upper are uint64, and lower <= upper.
206 : //
207 : // The set is encoded as:
208 : // - Two varint integers, (lower,upper-lower), when upper-lower > 0
209 : // - Nil, when upper-lower=0
210 : //
211 : // Users must not expect this to preserve differences between empty sets --
212 : // they will all get turned into the semantically equivalent [0,0).
213 : //
214 : // A BlockIntervalCollector that collects over point and range keys needs to
215 : // have both the point and range DataBlockIntervalCollector specified, since
216 : // point and range keys are fed to the BlockIntervalCollector in an interleaved
217 : // fashion, independently of one another. This also implies that the
218 : // DataBlockIntervalCollectors for point and range keys should be references to
219 : // independent instances, rather than references to the same collector, as point
220 : // and range keys are tracked independently.
221 : type BlockIntervalCollector struct {
222 : name string
223 : points DataBlockIntervalCollector
224 : ranges DataBlockIntervalCollector
225 :
226 : blockInterval interval
227 : indexInterval interval
228 : tableInterval interval
229 : }
230 :
231 : var _ BlockPropertyCollector = &BlockIntervalCollector{}
232 :
233 : // DataBlockIntervalCollector is the interface used by BlockIntervalCollector
234 : // that contains the actual logic pertaining to the property. It only
235 : // maintains state for the current data block, and resets that state in
236 : // FinishDataBlock. This interface can be used to reduce parsing costs.
237 : type DataBlockIntervalCollector interface {
238 : // Add is called with each new entry added to a data block in the sstable.
239 : // The callee can assume that these are in sorted order.
240 : Add(key InternalKey, value []byte) error
241 : // FinishDataBlock is called when all the entries have been added to a
242 : // data block. Subsequent Add calls will be for the next data block. It
243 : // returns the [lower, upper) for the finished block.
244 : FinishDataBlock() (lower uint64, upper uint64, err error)
245 : }
246 :
247 : // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given
248 : // name. The BlockIntervalCollector makes use of the given point and range key
249 : // DataBlockIntervalCollectors when encountering point and range keys,
250 : // respectively.
251 : //
252 : // The caller may pass a nil DataBlockIntervalCollector for one of the point or
253 : // range key collectors, in which case keys of those types will be ignored. This
254 : // allows for flexible construction of BlockIntervalCollectors that operate on
255 : // just point keys, just range keys, or both point and range keys.
256 : //
257 : // If both point and range keys are to be tracked, two independent collectors
258 : // should be provided, rather than the same collector passed in twice (see the
259 : // comment on BlockIntervalCollector for more detail)
260 : func NewBlockIntervalCollector(
261 : name string, pointCollector, rangeCollector DataBlockIntervalCollector,
262 1 : ) BlockPropertyCollector {
263 1 : if pointCollector == nil && rangeCollector == nil {
264 0 : panic("sstable: at least one interval collector must be provided")
265 : }
266 1 : bic := BlockIntervalCollector{
267 1 : name: name,
268 1 : points: pointCollector,
269 1 : ranges: rangeCollector,
270 1 : }
271 1 : if _, ok := pointCollector.(SuffixReplaceableBlockCollector); ok {
272 0 : return &suffixReplacementBlockCollectorWrapper{bic}
273 0 : }
274 1 : return &bic
275 : }
276 :
277 : // Name implements the BlockPropertyCollector interface.
278 1 : func (b *BlockIntervalCollector) Name() string {
279 1 : return b.name
280 1 : }
281 :
282 : // Add implements the BlockPropertyCollector interface.
283 1 : func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error {
284 1 : if rangekey.IsRangeKey(key.Kind()) {
285 1 : if b.ranges != nil {
286 0 : return b.ranges.Add(key, value)
287 0 : }
288 1 : } else if b.points != nil {
289 1 : return b.points.Add(key, value)
290 1 : }
291 1 : return nil
292 : }
293 :
294 : // FinishDataBlock implements the BlockPropertyCollector interface.
295 1 : func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) {
296 1 : if b.points == nil {
297 0 : return buf, nil
298 0 : }
299 1 : var err error
300 1 : b.blockInterval.lower, b.blockInterval.upper, err = b.points.FinishDataBlock()
301 1 : if err != nil {
302 0 : return buf, err
303 0 : }
304 1 : buf = b.blockInterval.encode(buf)
305 1 : b.tableInterval.union(b.blockInterval)
306 1 : return buf, nil
307 : }
308 :
309 : // AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector
310 : // interface.
311 1 : func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() {
312 1 : b.indexInterval.union(b.blockInterval)
313 1 : b.blockInterval = interval{}
314 1 : }
315 :
316 : // FinishIndexBlock implements the BlockPropertyCollector interface.
317 1 : func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
318 1 : buf = b.indexInterval.encode(buf)
319 1 : b.indexInterval = interval{}
320 1 : return buf, nil
321 1 : }
322 :
323 : // FinishTable implements the BlockPropertyCollector interface.
324 1 : func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) {
325 1 : // If the collector is tracking range keys, the range key interval is union-ed
326 1 : // with the point key interval for the table.
327 1 : if b.ranges != nil {
328 0 : var rangeInterval interval
329 0 : var err error
330 0 : rangeInterval.lower, rangeInterval.upper, err = b.ranges.FinishDataBlock()
331 0 : if err != nil {
332 0 : return buf, err
333 0 : }
334 0 : b.tableInterval.union(rangeInterval)
335 : }
336 1 : return b.tableInterval.encode(buf), nil
337 : }
338 :
339 : type interval struct {
340 : lower uint64
341 : upper uint64
342 : }
343 :
344 1 : func (i interval) encode(buf []byte) []byte {
345 1 : if i.lower < i.upper {
346 1 : var encoded [binary.MaxVarintLen64 * 2]byte
347 1 : n := binary.PutUvarint(encoded[:], i.lower)
348 1 : n += binary.PutUvarint(encoded[n:], i.upper-i.lower)
349 1 : buf = append(buf, encoded[:n]...)
350 1 : }
351 1 : return buf
352 : }
353 :
354 1 : func (i *interval) decode(buf []byte) error {
355 1 : if len(buf) == 0 {
356 1 : *i = interval{}
357 1 : return nil
358 1 : }
359 1 : var n int
360 1 : i.lower, n = binary.Uvarint(buf)
361 1 : if n <= 0 || n >= len(buf) {
362 0 : return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
363 0 : }
364 1 : pos := n
365 1 : i.upper, n = binary.Uvarint(buf[pos:])
366 1 : pos += n
367 1 : if pos != len(buf) || n <= 0 {
368 0 : return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
369 0 : }
370 : // Delta decode.
371 1 : i.upper += i.lower
372 1 : if i.upper < i.lower {
373 0 : return base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.upper, i.lower)
374 0 : }
375 1 : return nil
376 : }
377 :
378 1 : func (i *interval) union(x interval) {
379 1 : if x.lower >= x.upper {
380 1 : // x is the empty set.
381 1 : return
382 1 : }
383 1 : if i.lower >= i.upper {
384 1 : // i is the empty set.
385 1 : *i = x
386 1 : return
387 1 : }
388 : // Both sets are non-empty.
389 1 : if x.lower < i.lower {
390 1 : i.lower = x.lower
391 1 : }
392 1 : if x.upper > i.upper {
393 1 : i.upper = x.upper
394 1 : }
395 : }
396 :
397 1 : func (i interval) intersects(x interval) bool {
398 1 : if i.lower >= i.upper || x.lower >= x.upper {
399 1 : // At least one of the sets is empty.
400 1 : return false
401 1 : }
402 : // Neither set is empty.
403 1 : return i.upper > x.lower && i.lower < x.upper
404 : }
405 :
406 : type suffixReplacementBlockCollectorWrapper struct {
407 : BlockIntervalCollector
408 : }
409 :
410 : // UpdateKeySuffixes implements the SuffixReplaceableBlockCollector interface.
411 : func (w *suffixReplacementBlockCollectorWrapper) UpdateKeySuffixes(
412 : oldProp []byte, from, to []byte,
413 0 : ) error {
414 0 : return w.BlockIntervalCollector.points.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProp, from, to)
415 0 : }
416 :
417 : // BlockIntervalSyntheticReplacer provides methods to conduct just in time
418 : // adjustments of a passed in block prop interval before filtering.
419 : type BlockIntervalSyntheticReplacer interface {
420 : // AdjustIntervalWithSyntheticSuffix adjusts the [lower, upper) range for a data
421 : // block (specifically, the range returned by the corresponding
422 : // DataBlockIntervalCollector.FinishDataBlock) to what it would have been if
423 : // all the input keys had the given synthetic suffix.
424 : AdjustIntervalWithSyntheticSuffix(lower uint64, upper uint64, suffix []byte) (adjustedLower uint64, adjustedUpper uint64, err error)
425 : }
426 :
427 : // BlockIntervalFilter is an implementation of BlockPropertyFilter when the
428 : // corresponding collector is a BlockIntervalCollector. That is, the set is of
429 : // the form [lower, upper).
430 : type BlockIntervalFilter struct {
431 : name string
432 : filterInterval interval
433 : syntheticReplacer BlockIntervalSyntheticReplacer
434 : }
435 :
436 : var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil)
437 :
438 : // NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks
439 : // based on an interval property collected by BlockIntervalCollector and the
440 : // given [lower, upper) bounds. The given name specifies the
441 : // BlockIntervalCollector's properties to read.
442 : func NewBlockIntervalFilter(
443 : name string, lower uint64, upper uint64, syntheticReplacer BlockIntervalSyntheticReplacer,
444 1 : ) *BlockIntervalFilter {
445 1 : b := new(BlockIntervalFilter)
446 1 : b.Init(name, lower, upper, syntheticReplacer)
447 1 : return b
448 1 : }
449 :
450 : // Init initializes (or re-initializes, clearing previous state) an existing
451 : // BLockPropertyFilter to filter blocks based on an interval property collected
452 : // by BlockIntervalCollector and the given [lower, upper) bounds. The given name
453 : // specifies the BlockIntervalCollector's properties to read.
454 : func (b *BlockIntervalFilter) Init(
455 : name string, lower, upper uint64, syntheticReplacer BlockIntervalSyntheticReplacer,
456 1 : ) {
457 1 : *b = BlockIntervalFilter{
458 1 : name: name,
459 1 : filterInterval: interval{lower: lower, upper: upper},
460 1 : syntheticReplacer: syntheticReplacer,
461 1 : }
462 1 : }
463 :
464 : // Name implements the BlockPropertyFilter interface.
465 1 : func (b *BlockIntervalFilter) Name() string {
466 1 : return b.name
467 1 : }
468 :
469 : // Intersects implements the BlockPropertyFilter interface.
470 1 : func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) {
471 1 : var i interval
472 1 : if err := i.decode(prop); err != nil {
473 0 : return false, err
474 0 : }
475 1 : return i.intersects(b.filterInterval), nil
476 : }
477 :
478 : // SyntheticSuffixIntersects implements the BlockPropertyFilter interface.
479 1 : func (b *BlockIntervalFilter) SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) {
480 1 : if b.syntheticReplacer == nil {
481 0 : return false, errors.AssertionFailedf("missing SyntheticReplacer for SyntheticSuffixIntersects()")
482 0 : }
483 1 : var i interval
484 1 : if err := i.decode(prop); err != nil {
485 0 : return false, err
486 0 : }
487 :
488 1 : newLower, newUpper, err := b.syntheticReplacer.AdjustIntervalWithSyntheticSuffix(i.lower, i.upper, suffix)
489 1 : if err != nil {
490 0 : return false, err
491 0 : }
492 1 : newInterval := interval{lower: newLower, upper: newUpper}
493 1 : return newInterval.intersects(b.filterInterval), nil
494 : }
495 :
496 : // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not
497 : // generally safe to alter the filter while it's in use, except as part of the
498 : // implementation of BlockPropertyFilterMask.SetSuffix used for range-key
499 : // masking.
500 1 : func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) {
501 1 : b.filterInterval = interval{lower: lower, upper: upper}
502 1 : }
503 :
504 : // When encoding block properties for each block, we cannot afford to encode
505 : // the name. Instead, the name is mapped to a shortID, in the scope of that
506 : // sstable, and the shortID is encoded. Since we use a uint8, there is a limit
507 : // of 256 block property collectors per sstable.
508 : type shortID uint8
509 :
510 : type blockPropertiesEncoder struct {
511 : propsBuf []byte
512 : scratch []byte
513 : }
514 :
515 1 : func (e *blockPropertiesEncoder) getScratchForProp() []byte {
516 1 : return e.scratch[:0]
517 1 : }
518 :
519 1 : func (e *blockPropertiesEncoder) resetProps() {
520 1 : e.propsBuf = e.propsBuf[:0]
521 1 : }
522 :
523 1 : func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) {
524 1 : const lenID = 1
525 1 : lenProp := uvarintLen(uint32(len(scratch)))
526 1 : n := lenID + lenProp + len(scratch)
527 1 : if cap(e.propsBuf)-len(e.propsBuf) < n {
528 1 : size := len(e.propsBuf) + 2*n
529 1 : if size < 2*cap(e.propsBuf) {
530 1 : size = 2 * cap(e.propsBuf)
531 1 : }
532 1 : buf := make([]byte, len(e.propsBuf), size)
533 1 : copy(buf, e.propsBuf)
534 1 : e.propsBuf = buf
535 : }
536 1 : pos := len(e.propsBuf)
537 1 : b := e.propsBuf[pos : pos+lenID]
538 1 : b[0] = byte(id)
539 1 : pos += lenID
540 1 : b = e.propsBuf[pos : pos+lenProp]
541 1 : n = binary.PutUvarint(b, uint64(len(scratch)))
542 1 : pos += n
543 1 : b = e.propsBuf[pos : pos+len(scratch)]
544 1 : pos += len(scratch)
545 1 : copy(b, scratch)
546 1 : e.propsBuf = e.propsBuf[0:pos]
547 1 : e.scratch = scratch
548 : }
549 :
550 1 : func (e *blockPropertiesEncoder) unsafeProps() []byte {
551 1 : return e.propsBuf
552 1 : }
553 :
554 1 : func (e *blockPropertiesEncoder) props() []byte {
555 1 : buf := make([]byte, len(e.propsBuf))
556 1 : copy(buf, e.propsBuf)
557 1 : return buf
558 1 : }
559 :
560 : type blockPropertiesDecoder struct {
561 : props []byte
562 : }
563 :
564 1 : func (d *blockPropertiesDecoder) done() bool {
565 1 : return len(d.props) == 0
566 1 : }
567 :
568 : // REQUIRES: !done()
569 1 : func (d *blockPropertiesDecoder) next() (id shortID, prop []byte, err error) {
570 1 : const lenID = 1
571 1 : id = shortID(d.props[0])
572 1 : propLen, m := binary.Uvarint(d.props[lenID:])
573 1 : n := lenID + m
574 1 : if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) {
575 0 : return 0, nil, base.CorruptionErrorf("corrupt block property length")
576 0 : }
577 1 : prop = d.props[n : n+int(propLen)]
578 1 : d.props = d.props[n+int(propLen):]
579 1 : return id, prop, nil
580 : }
581 :
582 : // BlockPropertiesFilterer provides filtering support when reading an sstable
583 : // in the context of an iterator that has a slice of BlockPropertyFilters.
584 : // After the call to NewBlockPropertiesFilterer, the caller must call
585 : // IntersectsUserPropsAndFinishInit to check if the sstable intersects with
586 : // the filters. If it does intersect, this function also finishes initializing
587 : // the BlockPropertiesFilterer using the shortIDs for the relevant filters.
588 : // Subsequent checks for relevance of a block should use the intersects
589 : // method.
590 : type BlockPropertiesFilterer struct {
591 : filters []BlockPropertyFilter
592 : // Maps shortID => index in filters. This can be sparse, and shortIDs for
593 : // which there is no filter are represented with an index of -1. The
594 : // length of this can be shorter than the shortIDs allocated in the
595 : // sstable. e.g. if the sstable used shortIDs 0, 1, 2, 3, and the iterator
596 : // has two filters, corresponding to shortIDs 2, 0, this would be:
597 : // len(shortIDToFiltersIndex)==3, 0=>1, 1=>-1, 2=>0.
598 : shortIDToFiltersIndex []int
599 :
600 : // boundLimitedFilter, if non-nil, holds a single block-property filter with
601 : // additional constraints on its filtering. A boundLimitedFilter may only
602 : // filter blocks that are wholly contained within its bounds. During forward
603 : // iteration the lower bound (and during backward iteration the upper bound)
604 : // must be externally guaranteed, with Intersects only returning false if
605 : // that bound is met. The opposite bound is verified during iteration by the
606 : // sstable iterator.
607 : //
608 : // boundLimitedFilter is permitted to be defined on a property (`Name()`)
609 : // for which another filter exists in filters. In this case both filters
610 : // will be consulted, and either filter may exclude block(s). Only a single
611 : // bound-limited block-property filter may be set.
612 : //
613 : // The boundLimitedShortID field contains the shortID of the filter's
614 : // property within the sstable. It's set to -1 if the property was not
615 : // collected when the table was built.
616 : boundLimitedFilter BoundLimitedBlockPropertyFilter
617 : boundLimitedShortID int
618 :
619 : syntheticSuffix SyntheticSuffix
620 : }
621 :
622 : var blockPropertiesFiltererPool = sync.Pool{
623 1 : New: func() interface{} {
624 1 : return &BlockPropertiesFilterer{}
625 1 : },
626 : }
627 :
628 : // newBlockPropertiesFilterer returns a partially initialized filterer. To complete
629 : // initialization, call IntersectsUserPropsAndFinishInit.
630 : func newBlockPropertiesFilterer(
631 : filters []BlockPropertyFilter,
632 : limited BoundLimitedBlockPropertyFilter,
633 : syntheticSuffix SyntheticSuffix,
634 1 : ) *BlockPropertiesFilterer {
635 1 : filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer)
636 1 : *filterer = BlockPropertiesFilterer{
637 1 : filters: filters,
638 1 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
639 1 : boundLimitedFilter: limited,
640 1 : boundLimitedShortID: -1,
641 1 : syntheticSuffix: syntheticSuffix,
642 1 : }
643 1 : return filterer
644 1 : }
645 :
646 1 : func releaseBlockPropertiesFilterer(filterer *BlockPropertiesFilterer) {
647 1 : *filterer = BlockPropertiesFilterer{
648 1 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
649 1 : }
650 1 : blockPropertiesFiltererPool.Put(filterer)
651 1 : }
652 :
653 : // IntersectsTable evaluates the provided block-property filter against the
654 : // provided set of table-level properties. If there is no intersection between
655 : // the filters and the table or an error is encountered, IntersectsTable returns
656 : // a nil filterer (and possibly an error). If there is an intersection,
657 : // IntersectsTable returns a non-nil filterer that may be used by an iterator
658 : // reading the table.
659 : func IntersectsTable(
660 : filters []BlockPropertyFilter,
661 : limited BoundLimitedBlockPropertyFilter,
662 : userProperties map[string]string,
663 : syntheticSuffix SyntheticSuffix,
664 1 : ) (*BlockPropertiesFilterer, error) {
665 1 : f := newBlockPropertiesFilterer(filters, limited, syntheticSuffix)
666 1 : ok, err := f.intersectsUserPropsAndFinishInit(userProperties)
667 1 : if !ok || err != nil {
668 1 : releaseBlockPropertiesFilterer(f)
669 1 : return nil, err
670 1 : }
671 1 : return f, nil
672 : }
673 :
674 : // intersectsUserPropsAndFinishInit is called with the user properties map for
675 : // the sstable and returns whether the sstable intersects the filters. It
676 : // additionally initializes the shortIDToFiltersIndex for the filters that are
677 : // relevant to this sstable.
678 : func (f *BlockPropertiesFilterer) intersectsUserPropsAndFinishInit(
679 : userProperties map[string]string,
680 1 : ) (bool, error) {
681 1 : for i := range f.filters {
682 1 : props, ok := userProperties[f.filters[i].Name()]
683 1 : if !ok {
684 1 : // Collector was not used when writing this file, so it is
685 1 : // considered intersecting.
686 1 : continue
687 : }
688 1 : if len(props) < 1 {
689 0 : return false, base.CorruptionErrorf(
690 0 : "block properties for %s is corrupted", f.filters[i].Name())
691 0 : }
692 1 : shortID := shortID(props[0])
693 1 : {
694 1 : // Use an unsafe conversion to avoid allocating. Intersects() is not
695 1 : // supposed to modify the given slice.
696 1 : // Note that unsafe.StringData only works if the string is not empty
697 1 : // (which we already checked).
698 1 : byteProps := unsafe.Slice(unsafe.StringData(props), len(props))
699 1 : var intersects bool
700 1 : var err error
701 1 : if len(f.syntheticSuffix) == 0 {
702 1 : intersects, err = f.filters[i].Intersects(byteProps[1:])
703 1 : } else {
704 1 : intersects, err = f.filters[i].SyntheticSuffixIntersects(byteProps[1:], f.syntheticSuffix)
705 1 : }
706 1 : if err != nil || !intersects {
707 1 : return false, err
708 1 : }
709 : }
710 : // Intersects the sstable, so need to use this filter when
711 : // deciding whether to read blocks.
712 1 : n := len(f.shortIDToFiltersIndex)
713 1 : if n <= int(shortID) {
714 1 : if cap(f.shortIDToFiltersIndex) <= int(shortID) {
715 1 : index := make([]int, shortID+1, 2*(shortID+1))
716 1 : copy(index, f.shortIDToFiltersIndex)
717 1 : f.shortIDToFiltersIndex = index
718 1 : } else {
719 1 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:shortID+1]
720 1 : }
721 1 : for j := n; j < int(shortID); j++ {
722 1 : f.shortIDToFiltersIndex[j] = -1
723 1 : }
724 : }
725 1 : f.shortIDToFiltersIndex[shortID] = i
726 : }
727 1 : if f.boundLimitedFilter == nil {
728 1 : return true, nil
729 1 : }
730 :
731 : // There's a bound-limited filter. Find its shortID. It's possible that
732 : // there's an existing filter in f.filters on the same property. That's
733 : // okay. Both filters will be consulted whenever a relevant prop is decoded.
734 1 : props, ok := userProperties[f.boundLimitedFilter.Name()]
735 1 : if !ok {
736 1 : // The collector was not used when writing this file, so it's
737 1 : // intersecting. We leave f.boundLimitedShortID=-1, so the filter will
738 1 : // be unused within this file.
739 1 : return true, nil
740 1 : }
741 1 : if len(props) < 1 {
742 0 : return false, base.CorruptionErrorf(
743 0 : "block properties for %s is corrupted", f.boundLimitedFilter.Name())
744 0 : }
745 1 : f.boundLimitedShortID = int(props[0])
746 1 :
747 1 : // We don't check for table-level intersection for the bound-limited filter.
748 1 : // The bound-limited filter is treated as vacuously intersecting.
749 1 : //
750 1 : // NB: If a block-property filter needs to be toggled inactive/active, it
751 1 : // should be implemented within the Intersects implementation.
752 1 : //
753 1 : // TODO(jackson): We could filter at the table-level by threading the table
754 1 : // smallest and largest bounds here.
755 1 :
756 1 : // The bound-limited filter isn't included in shortIDToFiltersIndex.
757 1 : //
758 1 : // When determining intersection, we decode props only up to the shortID
759 1 : // len(shortIDToFiltersIndex). If f.limitedShortID is greater than any of
760 1 : // the existing filters' shortIDs, we need to grow shortIDToFiltersIndex.
761 1 : // Growing the index with -1s ensures we're able to consult the index
762 1 : // without length checks.
763 1 : if n := len(f.shortIDToFiltersIndex); n <= f.boundLimitedShortID {
764 1 : if cap(f.shortIDToFiltersIndex) <= f.boundLimitedShortID {
765 1 : index := make([]int, f.boundLimitedShortID+1)
766 1 : copy(index, f.shortIDToFiltersIndex)
767 1 : f.shortIDToFiltersIndex = index
768 1 : } else {
769 1 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:f.boundLimitedShortID+1]
770 1 : }
771 1 : for j := n; j <= f.boundLimitedShortID; j++ {
772 1 : f.shortIDToFiltersIndex[j] = -1
773 1 : }
774 : }
775 1 : return true, nil
776 : }
777 :
778 : type intersectsResult int8
779 :
780 : const (
781 : blockIntersects intersectsResult = iota
782 : blockExcluded
783 : // blockMaybeExcluded is returned by BlockPropertiesFilterer.intersects when
784 : // no filters unconditionally exclude the block, but the bound-limited block
785 : // property filter will exclude it if the block's bounds fall within the
786 : // filter's current bounds. See the reader's
787 : // {single,two}LevelIterator.resolveMaybeExcluded methods.
788 : blockMaybeExcluded
789 : )
790 :
791 1 : func (f *BlockPropertiesFilterer) intersects(props []byte) (ret intersectsResult, err error) {
792 1 : i := 0
793 1 : decoder := blockPropertiesDecoder{props: props}
794 1 : ret = blockIntersects
795 1 : for i < len(f.shortIDToFiltersIndex) {
796 1 : var id int
797 1 : var prop []byte
798 1 : if !decoder.done() {
799 1 : var shortID shortID
800 1 : var err error
801 1 : shortID, prop, err = decoder.next()
802 1 : if err != nil {
803 0 : return ret, err
804 0 : }
805 1 : id = int(shortID)
806 1 : } else {
807 1 : id = math.MaxUint8 + 1
808 1 : }
809 1 : for i < len(f.shortIDToFiltersIndex) && id > i {
810 1 : // The property for this id is not encoded for this block, but there
811 1 : // may still be a filter for this id.
812 1 : if intersects, err := f.intersectsFilter(i, nil); err != nil {
813 0 : return ret, err
814 1 : } else if intersects == blockExcluded {
815 0 : return blockExcluded, nil
816 1 : } else if intersects == blockMaybeExcluded {
817 1 : ret = blockMaybeExcluded
818 1 : }
819 1 : i++
820 : }
821 1 : if i >= len(f.shortIDToFiltersIndex) {
822 1 : return ret, nil
823 1 : }
824 : // INVARIANT: id <= i. And since i is always incremented by 1, id==i.
825 1 : if id != i {
826 0 : panic(fmt.Sprintf("%d != %d", id, i))
827 : }
828 1 : if intersects, err := f.intersectsFilter(i, prop); err != nil {
829 0 : return ret, err
830 1 : } else if intersects == blockExcluded {
831 1 : return blockExcluded, nil
832 1 : } else if intersects == blockMaybeExcluded {
833 1 : ret = blockMaybeExcluded
834 1 : }
835 1 : i++
836 : }
837 : // ret == blockIntersects || ret == blockMaybeExcluded
838 1 : return ret, nil
839 : }
840 :
841 1 : func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersectsResult, error) {
842 1 : var intersects bool
843 1 : var err error
844 1 : if f.shortIDToFiltersIndex[i] >= 0 {
845 1 : if len(f.syntheticSuffix) == 0 {
846 1 : intersects, err = f.filters[f.shortIDToFiltersIndex[i]].Intersects(prop)
847 1 : } else {
848 1 : intersects, err = f.filters[f.shortIDToFiltersIndex[i]].SyntheticSuffixIntersects(prop, f.syntheticSuffix)
849 1 : }
850 1 : if err != nil {
851 0 : return blockIntersects, err
852 0 : }
853 1 : if !intersects {
854 1 : return blockExcluded, nil
855 1 : }
856 : }
857 1 : if i == f.boundLimitedShortID {
858 1 : // The bound-limited filter uses this id.
859 1 : //
860 1 : // The bound-limited filter only applies within a keyspan interval. We
861 1 : // expect the Intersects call to be cheaper than bounds checks. If
862 1 : // Intersects determines that there is no intersection, we return
863 1 : // `blockMaybeExcluded` if no other bpf unconditionally excludes the
864 1 : // block.
865 1 : if len(f.syntheticSuffix) == 0 {
866 1 : intersects, err = f.boundLimitedFilter.Intersects(prop)
867 1 : } else {
868 1 : intersects, err = f.boundLimitedFilter.SyntheticSuffixIntersects(prop, f.syntheticSuffix)
869 1 : }
870 1 : if err != nil {
871 0 : return blockIntersects, err
872 1 : } else if !intersects {
873 1 : return blockMaybeExcluded, nil
874 1 : }
875 : }
876 1 : return blockIntersects, nil
877 : }
878 :
879 1 : func uvarintLen(v uint32) int {
880 1 : i := 0
881 1 : for v >= 0x80 {
882 0 : v >>= 7
883 0 : i++
884 0 : }
885 1 : return i + 1
886 : }
|