Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "math"
11 : "sync"
12 : "unsafe"
13 :
14 : "github.com/cockroachdb/pebble/internal/base"
15 : "github.com/cockroachdb/pebble/internal/invariants"
16 : "github.com/cockroachdb/pebble/internal/rangekey"
17 : )
18 :
19 : // Block properties are an optional user-facing feature that can be used to
20 : // filter data blocks (and whole sstables) from an Iterator before they are
21 : // loaded. They do not apply to range delete blocks. These are expected to
22 : // very concisely represent a set of some attribute value contained within the
23 : // key or value, such that the set includes all the attribute values in the
24 : // block. This has some similarities with OLAP pruning approaches that
25 : // maintain min-max attribute values for some column (which concisely
26 : // represent a set), that is then used to prune at query time. In Pebble's
27 : // case, data blocks are small, typically 25-50KB, so these properties should
28 : // reduce their precision in order to be concise -- a good rule of thumb is to
29 : // not consume more than 50-100 bytes across all properties maintained for a
30 : // block, i.e., a 500x reduction compared to loading the data block.
31 : //
32 : // A block property must be assigned a unique name, which is encoded and
33 : // stored in the sstable. This name must be unique among all user-properties
34 : // encoded in an sstable.
35 : //
36 : // A property is represented as a []byte. A nil value or empty byte slice are
37 : // considered semantically identical. The caller is free to choose the
38 : // semantics of an empty byte slice e.g. they could use it to represent the
39 : // empty set or the universal set, whichever they think is more common and
40 : // therefore better to encode more concisely. The serialization of the
41 : // property for the various Finish*() calls in a BlockPropertyCollector
42 : // implementation should be identical, since the corresponding
43 : // BlockPropertyFilter implementation is not told the context in which it is
44 : // deserializing the property.
45 : //
46 : // Block properties are more general than table properties and should be
47 : // preferred over using table properties. A BlockPropertyCollector can achieve
48 : // identical behavior to table properties by returning the nil slice from
49 : // FinishDataBlock and FinishIndexBlock, and interpret them as the universal
50 : // set in BlockPropertyFilter, and return a non-universal set in FinishTable.
51 : //
52 : // Block property filtering is nondeterministic because the separation of keys
53 : // into blocks is nondeterministic. Clients use block-property filters to
54 : // implement efficient application of a filter F that applies to key-value pairs
55 : // (abbreviated as kv-filter). Consider correctness defined as surfacing exactly
56 : // the same key-value pairs that would be surfaced if one applied the filter F
57 : // above normal iteration. With this correctness definition, block property
58 : // filtering may introduce two kinds of errors:
59 : //
60 : // a) Block property filtering that uses a kv-filter may produce additional
61 : // key-value pairs that don't satisfy the filter because of the separation
62 : // of keys into blocks. Clients may remove these extra key-value pairs by
63 : // re-applying the kv filter while reading results back from Pebble.
64 : //
65 : // b) Block property filtering may surface deleted key-value pairs if the
66 : // kv filter is not a strict function of the key's user key. A block
67 : // containing k.DEL may be filtered, while a block containing the deleted
68 : // key k.SET may not be filtered, if the kv filter applies to one but not
69 : // the other.
70 : //
71 : // This error may be avoided trivially by using a kv filter that is a pure
72 : // function of the user key. A filter that examines values or key kinds
73 : // requires care to ensure F(k.SET, <value>) = F(k.DEL) = F(k.SINGLEDEL).
74 : //
75 : // The combination of range deletions and filtering by table-level properties
76 : // add another opportunity for deleted point keys to be surfaced. The pebble
77 : // Iterator stack takes care to correctly apply filtered tables' range deletions
78 : // to lower tables, preventing this form of nondeterministic error.
79 : //
80 : // In addition to the non-determinism discussed in (b), which limits the use
81 : // of properties over values, we now have support for values that are not
82 : // stored together with the key, and may not even be retrieved during
83 : // compactions. If Pebble is configured with such value separation, block
84 : // properties must only apply to the key, and will be provided a nil value.
85 :
86 : // BlockPropertyCollector is used when writing a sstable.
87 : //
88 : // - All calls to Add are included in the next FinishDataBlock, after which
89 : // the next data block is expected to start.
90 : //
91 : // - The index entry generated for the data block, which contains the return
92 : // value from FinishDataBlock, is not immediately included in the current
93 : // index block. It is included when AddPrevDataBlockToIndexBlock is called.
94 : // An alternative would be to return an opaque handle from FinishDataBlock
95 : // and pass it to a new AddToIndexBlock method, which requires more
96 : // plumbing, and passing of an interface{} results in a undesirable heap
97 : // allocation. AddPrevDataBlockToIndexBlock must be called before keys are
98 : // added to the new data block.
99 : type BlockPropertyCollector interface {
100 : // Name returns the name of the block property collector.
101 : Name() string
102 :
103 : // Add is called with each new entry added to a data block in the sstable.
104 : // The callee can assume that these are in sorted order.
105 : Add(key InternalKey, value []byte) error
106 :
107 : // AddCollectedWithSuffixReplacement adds previously collected property data
108 : // and updates it to reflect a change of suffix on all keys: the old property
109 : // data is assumed to be constructed from keys that all have the same
110 : // oldSuffix and is recalculated to reflect the same keys but with newSuffix.
111 : //
112 : // A collector which supports this method must be able to derive its updated
113 : // value from its old value and the change being made to the suffix, without
114 : // needing to be passed each updated K/V.
115 : //
116 : // For example, a collector that only inspects values can simply copy its
117 : // previously computed property as-is, since key-suffix replacement does not
118 : // change values, while a collector that depends only on key suffixes, like
119 : // one which collected mvcc-timestamp bounds from timestamp-suffixed keys, can
120 : // just set its new bounds from the new suffix, as it is common to all keys,
121 : // without needing to recompute it from every key.
122 : //
123 : // This method is optional (if it is not implemented, it always returns an
124 : // error). SupportsSuffixReplacement() can be used to check if this method is
125 : // implemented.
126 : AddCollectedWithSuffixReplacement(oldProp []byte, oldSuffix, newSuffix []byte) error
127 :
128 : // SupportsSuffixReplacement returns whether the collector supports the
129 : // AddCollectedWithSuffixReplacement method.
130 : SupportsSuffixReplacement() bool
131 :
132 : // FinishDataBlock is called when all the entries have been added to a
133 : // data block. Subsequent Add calls will be for the next data block. It
134 : // returns the property value for the finished block.
135 : FinishDataBlock(buf []byte) ([]byte, error)
136 :
137 : // AddPrevDataBlockToIndexBlock adds the entry corresponding to the
138 : // previous FinishDataBlock to the current index block.
139 : AddPrevDataBlockToIndexBlock()
140 :
141 : // FinishIndexBlock is called when an index block, containing all the
142 : // key-value pairs since the last FinishIndexBlock, will no longer see new
143 : // entries. It returns the property value for the index block.
144 : FinishIndexBlock(buf []byte) ([]byte, error)
145 :
146 : // FinishTable is called when the sstable is finished, and returns the
147 : // property value for the sstable.
148 : FinishTable(buf []byte) ([]byte, error)
149 : }
150 :
151 : // BlockPropertyFilter is used in an Iterator to filter sstables and blocks
152 : // within the sstable. It should not maintain any per-sstable state, and must
153 : // be thread-safe.
154 : type BlockPropertyFilter = base.BlockPropertyFilter
155 :
156 : // BoundLimitedBlockPropertyFilter implements the block-property filter but
157 : // imposes an additional constraint on its usage, requiring that only blocks
158 : // containing exclusively keys between its lower and upper bounds may be
159 : // filtered. The bounds may be change during iteration, so the filter doesn't
160 : // expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods
161 : // for performing bound comparisons.
162 : //
163 : // To be used, a BoundLimitedBlockPropertyFilter must be supplied directly
164 : // through NewBlockPropertiesFilterer's dedicated parameter. If supplied through
165 : // the ordinary slice of block property filters, this filter's bounds will be
166 : // ignored.
167 : //
168 : // The current [lower,upper) bounds of the filter are unknown, because they may
169 : // be changing. During forward iteration the lower bound is externally
170 : // guaranteed, meaning Intersects only returns false if the sstable iterator is
171 : // already known to be positioned at a key ≥ lower. The sstable iterator is then
172 : // only responsible for ensuring filtered blocks also meet the upper bound, and
173 : // should only allow a block to be filtered if all its keys are < upper. The
174 : // sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check,
175 : // where key is an inclusive upper bound on the block's keys.
176 : //
177 : // During backward iteration the upper bound is externally guaranteed, and
178 : // Intersects only returns false if the sstable iterator is already known to be
179 : // positioned at a key < upper. The sstable iterator is responsible for ensuring
180 : // filtered blocks also meet the lower bound, enforcing that a block is only
181 : // filtered if all its keys are ≥ lower. This check is made through passing the
182 : // block's inclusive lower bound to KeyIsWithinLowerBound.
183 : //
184 : // Implementations may become active or inactive through implementing Intersects
185 : // to return true whenever the filter is disabled.
186 : //
187 : // Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers
188 : // should not implement this interface directly. This interface is an internal
189 : // detail in the implementation of block-property range-key masking.
190 : type BoundLimitedBlockPropertyFilter interface {
191 : BlockPropertyFilter
192 :
193 : // KeyIsWithinLowerBound tests whether the provided internal key falls
194 : // within the current lower bound of the filter. A true return value
195 : // indicates that the filter may be used to filter blocks that exclusively
196 : // contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper
197 : // bound.
198 : KeyIsWithinLowerBound(key []byte) bool
199 : // KeyIsWithinUpperBound tests whether the provided internal key falls
200 : // within the current upper bound of the filter. A true return value
201 : // indicates that the filter may be used to filter blocks that exclusively
202 : // contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower
203 : // bound.
204 : KeyIsWithinUpperBound(key []byte) bool
205 : }
206 :
207 : // BlockIntervalCollector is a helper implementation of BlockPropertyCollector
208 : // for users who want to represent a set of the form [lower,upper) where both
209 : // lower and upper are uint64, and lower <= upper.
210 : //
211 : // The set is encoded as:
212 : // - Two varint integers, (lower,upper-lower), when upper-lower > 0
213 : // - Nil, when upper-lower=0
214 : //
215 : // Users must not expect this to preserve differences between empty sets --
216 : // they will all get turned into the semantically equivalent [0,0).
217 : //
218 : // A BlockIntervalCollector that collects over point and range keys needs to
219 : // have both the point and range DataBlockIntervalCollector specified, since
220 : // point and range keys are fed to the BlockIntervalCollector in an interleaved
221 : // fashion, independently of one another. This also implies that the
222 : // DataBlockIntervalCollectors for point and range keys should be references to
223 : // independent instances, rather than references to the same collector, as point
224 : // and range keys are tracked independently.
225 : type BlockIntervalCollector struct {
226 : name string
227 : points DataBlockIntervalCollector
228 : ranges DataBlockIntervalCollector
229 :
230 : blockInterval interval
231 : indexInterval interval
232 : tableInterval interval
233 : }
234 :
235 : var _ BlockPropertyCollector = &BlockIntervalCollector{}
236 :
237 : // DataBlockIntervalCollector is the interface used by BlockIntervalCollector
238 : // that contains the actual logic pertaining to the property. It only
239 : // maintains state for the current data block, and resets that state in
240 : // FinishDataBlock. This interface can be used to reduce parsing costs.
241 : type DataBlockIntervalCollector interface {
242 : // Add is called with each new entry added to a data block in the sstable.
243 : // The callee can assume that these are in sorted order.
244 : Add(key InternalKey, value []byte) error
245 :
246 : // AddCollectedWithSuffixReplacement extends the interval with a given
247 : // interval after updating it to reflect a change of suffix on all keys: the
248 : // [oldLower, oldUpper) interval is assumed to be constructed from keys that
249 : // all have the same oldSuffix and is recalculated to reflect the same keys but
250 : // with newSuffix.
251 : //
252 : // This method is optional (if it is not implemented, it always returns an
253 : // error). SupportsSuffixReplacement() can be used to check if this method is
254 : // implemented.
255 : //
256 : // See BlockPropertyCollector.AddCollectedWithSuffixReplacement for more
257 : // information.
258 : AddCollectedWithSuffixReplacement(oldLower, oldUpper uint64, oldSuffix, newSuffix []byte) error
259 :
260 : // SupportsSuffixReplacement returns whether the collector supports the
261 : // AddCollectedWithSuffixReplacement method.
262 : SupportsSuffixReplacement() bool
263 :
264 : // FinishDataBlock is called when all the entries have been added to a
265 : // data block. Subsequent Add calls will be for the next data block. It
266 : // returns the [lower, upper) for the finished block.
267 : FinishDataBlock() (lower uint64, upper uint64, err error)
268 : }
269 :
270 : // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given
271 : // name. The BlockIntervalCollector makes use of the given point and range key
272 : // DataBlockIntervalCollectors when encountering point and range keys,
273 : // respectively.
274 : //
275 : // The caller may pass a nil DataBlockIntervalCollector for one of the point or
276 : // range key collectors, in which case keys of those types will be ignored. This
277 : // allows for flexible construction of BlockIntervalCollectors that operate on
278 : // just point keys, just range keys, or both point and range keys.
279 : //
280 : // If both point and range keys are to be tracked, two independent collectors
281 : // should be provided, rather than the same collector passed in twice (see the
282 : // comment on BlockIntervalCollector for more detail)
283 : func NewBlockIntervalCollector(
284 : name string, pointCollector, rangeCollector DataBlockIntervalCollector,
285 2 : ) BlockPropertyCollector {
286 2 : if pointCollector == nil && rangeCollector == nil {
287 1 : panic("sstable: at least one interval collector must be provided")
288 : }
289 2 : return &BlockIntervalCollector{
290 2 : name: name,
291 2 : points: pointCollector,
292 2 : ranges: rangeCollector,
293 2 : }
294 : }
295 :
296 : // Name is part of the BlockPropertyCollector interface.
297 2 : func (b *BlockIntervalCollector) Name() string {
298 2 : return b.name
299 2 : }
300 :
301 : // Add is part of the BlockPropertyCollector interface.
302 2 : func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error {
303 2 : if rangekey.IsRangeKey(key.Kind()) {
304 2 : if b.ranges != nil {
305 1 : return b.ranges.Add(key, value)
306 1 : }
307 2 : } else if b.points != nil {
308 2 : return b.points.Add(key, value)
309 2 : }
310 2 : return nil
311 : }
312 :
313 : // AddCollectedWithSuffixReplacement is part of the BlockPropertyCollector interface.
314 : func (b *BlockIntervalCollector) AddCollectedWithSuffixReplacement(
315 : oldProp []byte, oldSuffix, newSuffix []byte,
316 1 : ) error {
317 1 : var i interval
318 1 : if err := i.decode(oldProp); err != nil {
319 0 : return err
320 0 : }
321 1 : return b.points.AddCollectedWithSuffixReplacement(i.lower, i.upper, oldSuffix, newSuffix)
322 : }
323 :
324 : // SupportsSuffixReplacement is part of the BlockPropertyCollector interface.
325 1 : func (b *BlockIntervalCollector) SupportsSuffixReplacement() bool {
326 1 : return b.points.SupportsSuffixReplacement()
327 1 : }
328 :
329 : // FinishDataBlock is part of the BlockPropertyCollector interface.
330 2 : func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) {
331 2 : if b.points == nil {
332 1 : return buf, nil
333 1 : }
334 2 : var err error
335 2 : b.blockInterval.lower, b.blockInterval.upper, err = b.points.FinishDataBlock()
336 2 : if err != nil {
337 0 : return buf, err
338 0 : }
339 2 : buf = b.blockInterval.encode(buf)
340 2 : b.tableInterval.union(b.blockInterval)
341 2 : return buf, nil
342 : }
343 :
344 : // AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector
345 : // interface.
346 2 : func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() {
347 2 : b.indexInterval.union(b.blockInterval)
348 2 : b.blockInterval = interval{}
349 2 : }
350 :
351 : // FinishIndexBlock implements the BlockPropertyCollector interface.
352 2 : func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
353 2 : buf = b.indexInterval.encode(buf)
354 2 : b.indexInterval = interval{}
355 2 : return buf, nil
356 2 : }
357 :
358 : // FinishTable implements the BlockPropertyCollector interface.
359 2 : func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) {
360 2 : // If the collector is tracking range keys, the range key interval is union-ed
361 2 : // with the point key interval for the table.
362 2 : if b.ranges != nil {
363 1 : var rangeInterval interval
364 1 : var err error
365 1 : rangeInterval.lower, rangeInterval.upper, err = b.ranges.FinishDataBlock()
366 1 : if err != nil {
367 0 : return buf, err
368 0 : }
369 1 : b.tableInterval.union(rangeInterval)
370 : }
371 2 : return b.tableInterval.encode(buf), nil
372 : }
373 :
374 : type interval struct {
375 : lower uint64
376 : upper uint64
377 : }
378 :
379 2 : func (i interval) encode(buf []byte) []byte {
380 2 : if i.lower < i.upper {
381 2 : var encoded [binary.MaxVarintLen64 * 2]byte
382 2 : n := binary.PutUvarint(encoded[:], i.lower)
383 2 : n += binary.PutUvarint(encoded[n:], i.upper-i.lower)
384 2 : buf = append(buf, encoded[:n]...)
385 2 : }
386 2 : return buf
387 : }
388 :
389 2 : func (i *interval) decode(buf []byte) error {
390 2 : if len(buf) == 0 {
391 2 : *i = interval{}
392 2 : return nil
393 2 : }
394 2 : var n int
395 2 : i.lower, n = binary.Uvarint(buf)
396 2 : if n <= 0 || n >= len(buf) {
397 0 : return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
398 0 : }
399 2 : pos := n
400 2 : i.upper, n = binary.Uvarint(buf[pos:])
401 2 : pos += n
402 2 : if pos != len(buf) || n <= 0 {
403 0 : return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
404 0 : }
405 : // Delta decode.
406 2 : i.upper += i.lower
407 2 : if i.upper < i.lower {
408 0 : return base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.upper, i.lower)
409 0 : }
410 2 : return nil
411 : }
412 :
413 2 : func (i *interval) union(x interval) {
414 2 : if x.lower >= x.upper {
415 2 : // x is the empty set.
416 2 : return
417 2 : }
418 2 : if i.lower >= i.upper {
419 2 : // i is the empty set.
420 2 : *i = x
421 2 : return
422 2 : }
423 : // Both sets are non-empty.
424 2 : if x.lower < i.lower {
425 2 : i.lower = x.lower
426 2 : }
427 2 : if x.upper > i.upper {
428 2 : i.upper = x.upper
429 2 : }
430 : }
431 :
432 2 : func (i interval) intersects(x interval) bool {
433 2 : if i.lower >= i.upper || x.lower >= x.upper {
434 2 : // At least one of the sets is empty.
435 2 : return false
436 2 : }
437 : // Neither set is empty.
438 2 : return i.upper > x.lower && i.lower < x.upper
439 : }
440 :
441 : // BlockIntervalSyntheticReplacer provides methods to conduct just in time
442 : // adjustments of a passed in block prop interval before filtering.
443 : type BlockIntervalSyntheticReplacer interface {
444 : // AdjustIntervalWithSyntheticSuffix adjusts the [lower, upper) range for a data
445 : // block (specifically, the range returned by the corresponding
446 : // DataBlockIntervalCollector.FinishDataBlock) to what it would have been if
447 : // all the input keys had the given synthetic suffix.
448 : AdjustIntervalWithSyntheticSuffix(lower uint64, upper uint64, suffix []byte) (adjustedLower uint64, adjustedUpper uint64, err error)
449 : }
450 :
451 : // BlockIntervalFilter is an implementation of BlockPropertyFilter when the
452 : // corresponding collector is a BlockIntervalCollector. That is, the set is of
453 : // the form [lower, upper).
454 : type BlockIntervalFilter struct {
455 : name string
456 : filterInterval interval
457 : syntheticReplacer BlockIntervalSyntheticReplacer
458 : }
459 :
460 : var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil)
461 :
462 : // NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks
463 : // based on an interval property collected by BlockIntervalCollector and the
464 : // given [lower, upper) bounds. The given name specifies the
465 : // BlockIntervalCollector's properties to read.
466 : func NewBlockIntervalFilter(
467 : name string, lower uint64, upper uint64, syntheticReplacer BlockIntervalSyntheticReplacer,
468 2 : ) *BlockIntervalFilter {
469 2 : b := new(BlockIntervalFilter)
470 2 : b.Init(name, lower, upper, syntheticReplacer)
471 2 : return b
472 2 : }
473 :
474 : // Init initializes (or re-initializes, clearing previous state) an existing
475 : // BLockPropertyFilter to filter blocks based on an interval property collected
476 : // by BlockIntervalCollector and the given [lower, upper) bounds. The given name
477 : // specifies the BlockIntervalCollector's properties to read.
478 : func (b *BlockIntervalFilter) Init(
479 : name string, lower, upper uint64, syntheticReplacer BlockIntervalSyntheticReplacer,
480 2 : ) {
481 2 : *b = BlockIntervalFilter{
482 2 : name: name,
483 2 : filterInterval: interval{lower: lower, upper: upper},
484 2 : syntheticReplacer: syntheticReplacer,
485 2 : }
486 2 : }
487 :
488 : // Name implements the BlockPropertyFilter interface.
489 2 : func (b *BlockIntervalFilter) Name() string {
490 2 : return b.name
491 2 : }
492 :
493 : // Intersects implements the BlockPropertyFilter interface.
494 2 : func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) {
495 2 : var i interval
496 2 : if err := i.decode(prop); err != nil {
497 0 : return false, err
498 0 : }
499 2 : return i.intersects(b.filterInterval), nil
500 : }
501 :
502 : // SyntheticSuffixIntersects implements the BlockPropertyFilter interface.
503 2 : func (b *BlockIntervalFilter) SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) {
504 2 : if b.syntheticReplacer == nil {
505 0 : return false, base.AssertionFailedf("missing SyntheticReplacer for SyntheticSuffixIntersects()")
506 0 : }
507 2 : var i interval
508 2 : if err := i.decode(prop); err != nil {
509 0 : return false, err
510 0 : }
511 :
512 2 : newLower, newUpper, err := b.syntheticReplacer.AdjustIntervalWithSyntheticSuffix(i.lower, i.upper, suffix)
513 2 : if err != nil {
514 0 : return false, err
515 0 : }
516 2 : newInterval := interval{lower: newLower, upper: newUpper}
517 2 : return newInterval.intersects(b.filterInterval), nil
518 : }
519 :
520 : // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not
521 : // generally safe to alter the filter while it's in use, except as part of the
522 : // implementation of BlockPropertyFilterMask.SetSuffix used for range-key
523 : // masking.
524 2 : func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) {
525 2 : b.filterInterval = interval{lower: lower, upper: upper}
526 2 : }
527 :
528 : // When encoding block properties for each block, we cannot afford to encode the
529 : // name. Instead, the name is mapped to a shortID, in the scope of that sstable,
530 : // and the shortID is encoded as a single byte (which imposes a limit of of 256
531 : // block property collectors per sstable).
532 : // Note that the in-memory type is int16 to avoid overflows (e.g. in loops) and
533 : // to allow special values like -1 in code.
534 : type shortID int16
535 :
536 : const invalidShortID shortID = -1
537 : const maxShortID shortID = math.MaxUint8
538 : const maxPropertyCollectors = int(maxShortID) + 1
539 :
540 2 : func (id shortID) IsValid() bool {
541 2 : return id >= 0 && id <= maxShortID
542 2 : }
543 :
544 2 : func (id shortID) ToByte() byte {
545 2 : if invariants.Enabled && !id.IsValid() {
546 0 : panic(fmt.Sprintf("inavlid id %d", id))
547 : }
548 2 : return byte(id)
549 : }
550 :
551 : type blockPropertiesEncoder struct {
552 : propsBuf []byte
553 : scratch []byte
554 : }
555 :
556 2 : func (e *blockPropertiesEncoder) getScratchForProp() []byte {
557 2 : return e.scratch[:0]
558 2 : }
559 :
560 2 : func (e *blockPropertiesEncoder) resetProps() {
561 2 : e.propsBuf = e.propsBuf[:0]
562 2 : }
563 :
564 2 : func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) {
565 2 : if len(scratch) == 0 {
566 2 : // We omit empty properties. The decoder will know that any missing IDs had
567 2 : // empty values.
568 2 : return
569 2 : }
570 2 : const lenID = 1
571 2 : lenProp := uvarintLen(uint32(len(scratch)))
572 2 : n := lenID + lenProp + len(scratch)
573 2 : if cap(e.propsBuf)-len(e.propsBuf) < n {
574 2 : size := len(e.propsBuf) + 2*n
575 2 : if size < 2*cap(e.propsBuf) {
576 2 : size = 2 * cap(e.propsBuf)
577 2 : }
578 2 : buf := make([]byte, len(e.propsBuf), size)
579 2 : copy(buf, e.propsBuf)
580 2 : e.propsBuf = buf
581 : }
582 2 : pos := len(e.propsBuf)
583 2 : b := e.propsBuf[pos : pos+lenID]
584 2 : b[0] = id.ToByte()
585 2 : pos += lenID
586 2 : b = e.propsBuf[pos : pos+lenProp]
587 2 : n = binary.PutUvarint(b, uint64(len(scratch)))
588 2 : pos += n
589 2 : b = e.propsBuf[pos : pos+len(scratch)]
590 2 : pos += len(scratch)
591 2 : copy(b, scratch)
592 2 : e.propsBuf = e.propsBuf[0:pos]
593 2 : e.scratch = scratch
594 : }
595 :
596 2 : func (e *blockPropertiesEncoder) unsafeProps() []byte {
597 2 : return e.propsBuf
598 2 : }
599 :
600 2 : func (e *blockPropertiesEncoder) props() []byte {
601 2 : buf := make([]byte, len(e.propsBuf))
602 2 : copy(buf, e.propsBuf)
603 2 : return buf
604 2 : }
605 :
606 : type blockPropertiesDecoder struct {
607 : props []byte
608 :
609 : // numCollectedProps is the number of collectors that were used when writing
610 : // these properties. The encoded properties contain values for shortIDs 0
611 : // through numCollectedProps-1, in order (with empty properties omitted).
612 : numCollectedProps int
613 : nextID shortID
614 : }
615 :
616 2 : func makeBlockPropertiesDecoder(numCollectedProps int, propsBuf []byte) blockPropertiesDecoder {
617 2 : return blockPropertiesDecoder{
618 2 : props: propsBuf,
619 2 : numCollectedProps: numCollectedProps,
620 2 : }
621 2 : }
622 :
623 2 : func (d *blockPropertiesDecoder) Done() bool {
624 2 : return int(d.nextID) >= d.numCollectedProps
625 2 : }
626 :
627 : // Next returns the property for each shortID between 0 and numCollectedProps-1, in order.
628 : // Note that some properties might be empty.
629 : // REQUIRES: !Done()
630 2 : func (d *blockPropertiesDecoder) Next() (id shortID, prop []byte, err error) {
631 2 : id = d.nextID
632 2 : d.nextID++
633 2 :
634 2 : if len(d.props) == 0 || shortID(d.props[0]) != id {
635 2 : if invariants.Enabled && len(d.props) > 0 && shortID(d.props[0]) < id {
636 0 : panic("shortIDs are not in order")
637 : }
638 : // This property was omitted because it was empty.
639 2 : return id, nil, nil
640 : }
641 :
642 2 : const lenID = 1
643 2 : propLen, m := binary.Uvarint(d.props[lenID:])
644 2 : n := lenID + m
645 2 : if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) {
646 0 : return 0, nil, base.CorruptionErrorf("corrupt block property length")
647 0 : }
648 2 : prop = d.props[n : n+int(propLen)]
649 2 : d.props = d.props[n+int(propLen):]
650 2 : return id, prop, nil
651 : }
652 :
653 : // BlockPropertiesFilterer provides filtering support when reading an sstable
654 : // in the context of an iterator that has a slice of BlockPropertyFilters.
655 : // After the call to NewBlockPropertiesFilterer, the caller must call
656 : // IntersectsUserPropsAndFinishInit to check if the sstable intersects with
657 : // the filters. If it does intersect, this function also finishes initializing
658 : // the BlockPropertiesFilterer using the shortIDs for the relevant filters.
659 : // Subsequent checks for relevance of a block should use the intersects
660 : // method.
661 : type BlockPropertiesFilterer struct {
662 : filters []BlockPropertyFilter
663 : // Maps shortID => index in filters. This can be sparse, and shortIDs for
664 : // which there is no filter are represented with an index of -1. The
665 : // length of this can be shorter than the shortIDs allocated in the
666 : // sstable. e.g. if the sstable used shortIDs 0, 1, 2, 3, and the iterator
667 : // has two filters, corresponding to shortIDs 2, 0, this would be:
668 : // len(shortIDToFiltersIndex)==3, 0=>1, 1=>-1, 2=>0.
669 : shortIDToFiltersIndex []int
670 :
671 : // boundLimitedFilter, if non-nil, holds a single block-property filter with
672 : // additional constraints on its filtering. A boundLimitedFilter may only
673 : // filter blocks that are wholly contained within its bounds. During forward
674 : // iteration the lower bound (and during backward iteration the upper bound)
675 : // must be externally guaranteed, with Intersects only returning false if
676 : // that bound is met. The opposite bound is verified during iteration by the
677 : // sstable iterator.
678 : //
679 : // boundLimitedFilter is permitted to be defined on a property (`Name()`)
680 : // for which another filter exists in filters. In this case both filters
681 : // will be consulted, and either filter may exclude block(s). Only a single
682 : // bound-limited block-property filter may be set.
683 : //
684 : // The boundLimitedShortID field contains the shortID of the filter's
685 : // property within the sstable. It's set to -1 if the property was not
686 : // collected when the table was built.
687 : boundLimitedFilter BoundLimitedBlockPropertyFilter
688 : boundLimitedShortID int
689 :
690 : syntheticSuffix SyntheticSuffix
691 : }
692 :
693 : var blockPropertiesFiltererPool = sync.Pool{
694 2 : New: func() interface{} {
695 2 : return &BlockPropertiesFilterer{}
696 2 : },
697 : }
698 :
699 : // newBlockPropertiesFilterer returns a partially initialized filterer. To complete
700 : // initialization, call IntersectsUserPropsAndFinishInit.
701 : func newBlockPropertiesFilterer(
702 : filters []BlockPropertyFilter,
703 : limited BoundLimitedBlockPropertyFilter,
704 : syntheticSuffix SyntheticSuffix,
705 2 : ) *BlockPropertiesFilterer {
706 2 : filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer)
707 2 : *filterer = BlockPropertiesFilterer{
708 2 : filters: filters,
709 2 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
710 2 : boundLimitedFilter: limited,
711 2 : boundLimitedShortID: -1,
712 2 : syntheticSuffix: syntheticSuffix,
713 2 : }
714 2 : return filterer
715 2 : }
716 :
717 2 : func releaseBlockPropertiesFilterer(filterer *BlockPropertiesFilterer) {
718 2 : *filterer = BlockPropertiesFilterer{
719 2 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
720 2 : }
721 2 : blockPropertiesFiltererPool.Put(filterer)
722 2 : }
723 :
724 : // IntersectsTable evaluates the provided block-property filter against the
725 : // provided set of table-level properties. If there is no intersection between
726 : // the filters and the table or an error is encountered, IntersectsTable returns
727 : // a nil filterer (and possibly an error). If there is an intersection,
728 : // IntersectsTable returns a non-nil filterer that may be used by an iterator
729 : // reading the table.
730 : func IntersectsTable(
731 : filters []BlockPropertyFilter,
732 : limited BoundLimitedBlockPropertyFilter,
733 : userProperties map[string]string,
734 : syntheticSuffix SyntheticSuffix,
735 2 : ) (*BlockPropertiesFilterer, error) {
736 2 : f := newBlockPropertiesFilterer(filters, limited, syntheticSuffix)
737 2 : ok, err := f.intersectsUserPropsAndFinishInit(userProperties)
738 2 : if !ok || err != nil {
739 2 : releaseBlockPropertiesFilterer(f)
740 2 : return nil, err
741 2 : }
742 2 : return f, nil
743 : }
744 :
745 : // intersectsUserPropsAndFinishInit is called with the user properties map for
746 : // the sstable and returns whether the sstable intersects the filters. It
747 : // additionally initializes the shortIDToFiltersIndex for the filters that are
748 : // relevant to this sstable.
749 : func (f *BlockPropertiesFilterer) intersectsUserPropsAndFinishInit(
750 : userProperties map[string]string,
751 2 : ) (bool, error) {
752 2 : for i := range f.filters {
753 2 : props, ok := userProperties[f.filters[i].Name()]
754 2 : if !ok {
755 2 : // Collector was not used when writing this file, so it is
756 2 : // considered intersecting.
757 2 : continue
758 : }
759 2 : if len(props) < 1 {
760 0 : return false, base.CorruptionErrorf(
761 0 : "block properties for %s is corrupted", f.filters[i].Name())
762 0 : }
763 2 : shortID := shortID(props[0])
764 2 : {
765 2 : // Use an unsafe conversion to avoid allocating. Intersects() is not
766 2 : // supposed to modify the given slice.
767 2 : // Note that unsafe.StringData only works if the string is not empty
768 2 : // (which we already checked).
769 2 : byteProps := unsafe.Slice(unsafe.StringData(props), len(props))
770 2 : var intersects bool
771 2 : var err error
772 2 : if len(f.syntheticSuffix) == 0 {
773 2 : intersects, err = f.filters[i].Intersects(byteProps[1:])
774 2 : } else {
775 2 : intersects, err = f.filters[i].SyntheticSuffixIntersects(byteProps[1:], f.syntheticSuffix)
776 2 : }
777 2 : if err != nil || !intersects {
778 2 : return false, err
779 2 : }
780 : }
781 : // Intersects the sstable, so need to use this filter when
782 : // deciding whether to read blocks.
783 2 : n := len(f.shortIDToFiltersIndex)
784 2 : if n <= int(shortID) {
785 2 : if cap(f.shortIDToFiltersIndex) <= int(shortID) {
786 2 : index := make([]int, shortID+1, 2*(shortID+1))
787 2 : copy(index, f.shortIDToFiltersIndex)
788 2 : f.shortIDToFiltersIndex = index
789 2 : } else {
790 2 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:shortID+1]
791 2 : }
792 2 : for j := n; j < int(shortID); j++ {
793 2 : f.shortIDToFiltersIndex[j] = -1
794 2 : }
795 : }
796 2 : f.shortIDToFiltersIndex[shortID] = i
797 : }
798 2 : if f.boundLimitedFilter == nil {
799 2 : return true, nil
800 2 : }
801 :
802 : // There's a bound-limited filter. Find its shortID. It's possible that
803 : // there's an existing filter in f.filters on the same property. That's
804 : // okay. Both filters will be consulted whenever a relevant prop is decoded.
805 2 : props, ok := userProperties[f.boundLimitedFilter.Name()]
806 2 : if !ok {
807 2 : // The collector was not used when writing this file, so it's
808 2 : // intersecting. We leave f.boundLimitedShortID=-1, so the filter will
809 2 : // be unused within this file.
810 2 : return true, nil
811 2 : }
812 2 : if len(props) < 1 {
813 0 : return false, base.CorruptionErrorf(
814 0 : "block properties for %s is corrupted", f.boundLimitedFilter.Name())
815 0 : }
816 2 : f.boundLimitedShortID = int(props[0])
817 2 :
818 2 : // We don't check for table-level intersection for the bound-limited filter.
819 2 : // The bound-limited filter is treated as vacuously intersecting.
820 2 : //
821 2 : // NB: If a block-property filter needs to be toggled inactive/active, it
822 2 : // should be implemented within the Intersects implementation.
823 2 : //
824 2 : // TODO(jackson): We could filter at the table-level by threading the table
825 2 : // smallest and largest bounds here.
826 2 :
827 2 : // The bound-limited filter isn't included in shortIDToFiltersIndex.
828 2 : //
829 2 : // When determining intersection, we decode props only up to the shortID
830 2 : // len(shortIDToFiltersIndex). If f.limitedShortID is greater than any of
831 2 : // the existing filters' shortIDs, we need to grow shortIDToFiltersIndex.
832 2 : // Growing the index with -1s ensures we're able to consult the index
833 2 : // without length checks.
834 2 : if n := len(f.shortIDToFiltersIndex); n <= f.boundLimitedShortID {
835 2 : if cap(f.shortIDToFiltersIndex) <= f.boundLimitedShortID {
836 2 : index := make([]int, f.boundLimitedShortID+1)
837 2 : copy(index, f.shortIDToFiltersIndex)
838 2 : f.shortIDToFiltersIndex = index
839 2 : } else {
840 2 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:f.boundLimitedShortID+1]
841 2 : }
842 2 : for j := n; j <= f.boundLimitedShortID; j++ {
843 2 : f.shortIDToFiltersIndex[j] = -1
844 2 : }
845 : }
846 2 : return true, nil
847 : }
848 :
849 : type intersectsResult int8
850 :
851 : const (
852 : blockIntersects intersectsResult = iota
853 : blockExcluded
854 : // blockMaybeExcluded is returned by BlockPropertiesFilterer.intersects when
855 : // no filters unconditionally exclude the block, but the bound-limited block
856 : // property filter will exclude it if the block's bounds fall within the
857 : // filter's current bounds. See the reader's
858 : // {single,two}LevelIterator.resolveMaybeExcluded methods.
859 : blockMaybeExcluded
860 : )
861 :
862 2 : func (f *BlockPropertiesFilterer) intersects(props []byte) (intersectsResult, error) {
863 2 : decoder := makeBlockPropertiesDecoder(len(f.shortIDToFiltersIndex), props)
864 2 : ret := blockIntersects
865 2 : for !decoder.Done() {
866 2 : id, prop, err := decoder.Next()
867 2 : if err != nil {
868 0 : return ret, err
869 0 : }
870 2 : intersects, err := f.intersectsFilter(id, prop)
871 2 : if err != nil {
872 0 : return ret, err
873 0 : }
874 2 : if intersects == blockExcluded {
875 2 : return blockExcluded, nil
876 2 : }
877 2 : if intersects == blockMaybeExcluded {
878 2 : ret = blockMaybeExcluded
879 2 : }
880 : }
881 : // ret is either blockIntersects or blockMaybeExcluded.
882 2 : return ret, nil
883 : }
884 :
885 : func (f *BlockPropertiesFilterer) intersectsFilter(
886 : id shortID, prop []byte,
887 2 : ) (intersectsResult, error) {
888 2 : var intersects bool
889 2 : var err error
890 2 : if filterIdx := f.shortIDToFiltersIndex[id]; filterIdx >= 0 {
891 2 : if !f.syntheticSuffix.IsSet() {
892 2 : intersects, err = f.filters[filterIdx].Intersects(prop)
893 2 : } else {
894 2 : intersects, err = f.filters[filterIdx].SyntheticSuffixIntersects(prop, f.syntheticSuffix)
895 2 : }
896 2 : if err != nil {
897 0 : return blockIntersects, err
898 0 : }
899 2 : if !intersects {
900 2 : return blockExcluded, nil
901 2 : }
902 : }
903 2 : if int(id) == f.boundLimitedShortID {
904 2 : // The bound-limited filter uses this id.
905 2 : //
906 2 : // The bound-limited filter only applies within a keyspan interval. We
907 2 : // expect the Intersects call to be cheaper than bounds checks. If
908 2 : // Intersects determines that there is no intersection, we return
909 2 : // `blockMaybeExcluded` if no other bpf unconditionally excludes the
910 2 : // block.
911 2 : if !f.syntheticSuffix.IsSet() {
912 2 : intersects, err = f.boundLimitedFilter.Intersects(prop)
913 2 : } else {
914 2 : intersects, err = f.boundLimitedFilter.SyntheticSuffixIntersects(prop, f.syntheticSuffix)
915 2 : }
916 2 : if err != nil {
917 0 : return blockIntersects, err
918 2 : } else if !intersects {
919 2 : return blockMaybeExcluded, nil
920 2 : }
921 : }
922 2 : return blockIntersects, nil
923 : }
924 :
925 2 : func uvarintLen(v uint32) int {
926 2 : i := 0
927 2 : for v >= 0x80 {
928 1 : v >>= 7
929 1 : i++
930 1 : }
931 2 : return i + 1
932 : }
|