Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "math"
11 : "sync"
12 : "unsafe"
13 :
14 : "github.com/cockroachdb/pebble/internal/base"
15 : "github.com/cockroachdb/pebble/internal/invariants"
16 : "github.com/cockroachdb/pebble/internal/keyspan"
17 : )
18 :
19 : // Block properties are an optional user-facing feature that can be used to
20 : // filter data blocks (and whole sstables) from an Iterator before they are
21 : // loaded. They do not apply to range delete blocks. These are expected to
22 : // very concisely represent a set of some attribute value contained within the
23 : // key or value, such that the set includes all the attribute values in the
24 : // block. This has some similarities with OLAP pruning approaches that
25 : // maintain min-max attribute values for some column (which concisely
26 : // represent a set), that is then used to prune at query time. In Pebble's
27 : // case, data blocks are small, typically 25-50KB, so these properties should
28 : // reduce their precision in order to be concise -- a good rule of thumb is to
29 : // not consume more than 50-100 bytes across all properties maintained for a
30 : // block, i.e., a 500x reduction compared to loading the data block.
31 : //
32 : // A block property must be assigned a unique name, which is encoded and
33 : // stored in the sstable. This name must be unique among all user-properties
34 : // encoded in an sstable.
35 : //
36 : // A property is represented as a []byte. A nil value or empty byte slice are
37 : // considered semantically identical. The caller is free to choose the
38 : // semantics of an empty byte slice e.g. they could use it to represent the
39 : // empty set or the universal set, whichever they think is more common and
40 : // therefore better to encode more concisely. The serialization of the
41 : // property for the various Finish*() calls in a BlockPropertyCollector
42 : // implementation should be identical, since the corresponding
43 : // BlockPropertyFilter implementation is not told the context in which it is
44 : // deserializing the property.
45 : //
46 : // Block properties are more general than table properties and should be
47 : // preferred over using table properties. A BlockPropertyCollector can achieve
48 : // identical behavior to table properties by returning the nil slice from
49 : // FinishDataBlock and FinishIndexBlock, and interpret them as the universal
50 : // set in BlockPropertyFilter, and return a non-universal set in FinishTable.
51 : //
52 : // Block property filtering is nondeterministic because the separation of keys
53 : // into blocks is nondeterministic. Clients use block-property filters to
54 : // implement efficient application of a filter F that applies to key-value pairs
55 : // (abbreviated as kv-filter). Consider correctness defined as surfacing exactly
56 : // the same key-value pairs that would be surfaced if one applied the filter F
57 : // above normal iteration. With this correctness definition, block property
58 : // filtering may introduce two kinds of errors:
59 : //
60 : // a) Block property filtering that uses a kv-filter may produce additional
61 : // key-value pairs that don't satisfy the filter because of the separation
62 : // of keys into blocks. Clients may remove these extra key-value pairs by
63 : // re-applying the kv filter while reading results back from Pebble.
64 : //
65 : // b) Block property filtering may surface deleted key-value pairs if the
66 : // kv filter is not a strict function of the key's user key. A block
67 : // containing k.DEL may be filtered, while a block containing the deleted
68 : // key k.SET may not be filtered, if the kv filter applies to one but not
69 : // the other.
70 : //
71 : // This error may be avoided trivially by using a kv filter that is a pure
72 : // function of the user key. A filter that examines values or key kinds
73 : // requires care to ensure F(k.SET, <value>) = F(k.DEL) = F(k.SINGLEDEL).
74 : //
75 : // The combination of range deletions and filtering by table-level properties
76 : // add another opportunity for deleted point keys to be surfaced. The pebble
77 : // Iterator stack takes care to correctly apply filtered tables' range deletions
78 : // to lower tables, preventing this form of nondeterministic error.
79 : //
80 : // In addition to the non-determinism discussed in (b), which limits the use
81 : // of properties over values, we now have support for values that are not
82 : // stored together with the key, and may not even be retrieved during
83 : // compactions. If Pebble is configured with such value separation, block
84 : // properties must only apply to the key, and will be provided a nil value.
85 :
86 : // BlockPropertyCollector is used when writing a sstable.
87 : //
88 : // - All calls to Add are included in the next FinishDataBlock, after which
89 : // the next data block is expected to start.
90 : //
91 : // - The index entry generated for the data block, which contains the return
92 : // value from FinishDataBlock, is not immediately included in the current
93 : // index block. It is included when AddPrevDataBlockToIndexBlock is called.
94 : // An alternative would be to return an opaque handle from FinishDataBlock
95 : // and pass it to a new AddToIndexBlock method, which requires more
96 : // plumbing, and passing of an interface{} results in a undesirable heap
97 : // allocation. AddPrevDataBlockToIndexBlock must be called before keys are
98 : // added to the new data block.
99 : type BlockPropertyCollector interface {
100 : // Name returns the name of the block property collector.
101 : Name() string
102 :
103 : // AddPointKey is called with each new key added to a data block in the
104 : // sstable. The callee can assume that these are in sorted order.
105 : AddPointKey(key InternalKey, value []byte) error
106 :
107 : // AddRangeKeys is called for each range span added to the sstable. The range
108 : // key properties are stored separately and don't contribute to data block
109 : // properties. They are only used when FinishTable is called.
110 : // TODO(radu): clean up this subtle semantic.
111 : AddRangeKeys(span keyspan.Span) error
112 :
113 : // AddCollectedWithSuffixReplacement adds previously collected property data
114 : // and updates it to reflect a change of suffix on all keys: the old property
115 : // data is assumed to be constructed from keys that all have the same
116 : // oldSuffix and is recalculated to reflect the same keys but with newSuffix.
117 : //
118 : // A collector which supports this method must be able to derive its updated
119 : // value from its old value and the change being made to the suffix, without
120 : // needing to be passed each updated K/V.
121 : //
122 : // For example, a collector that only inspects values can simply copy its
123 : // previously computed property as-is, since key-suffix replacement does not
124 : // change values, while a collector that depends only on key suffixes, like
125 : // one which collected mvcc-timestamp bounds from timestamp-suffixed keys, can
126 : // just set its new bounds from the new suffix, as it is common to all keys,
127 : // without needing to recompute it from every key.
128 : //
129 : // This method is optional (if it is not implemented, it always returns an
130 : // error). SupportsSuffixReplacement() can be used to check if this method is
131 : // implemented.
132 : AddCollectedWithSuffixReplacement(oldProp []byte, oldSuffix, newSuffix []byte) error
133 :
134 : // SupportsSuffixReplacement returns whether the collector supports the
135 : // AddCollectedWithSuffixReplacement method.
136 : SupportsSuffixReplacement() bool
137 :
138 : // FinishDataBlock is called when all the entries have been added to a
139 : // data block. Subsequent Add calls will be for the next data block. It
140 : // returns the property value for the finished block.
141 : FinishDataBlock(buf []byte) ([]byte, error)
142 :
143 : // AddPrevDataBlockToIndexBlock adds the entry corresponding to the
144 : // previous FinishDataBlock to the current index block.
145 : AddPrevDataBlockToIndexBlock()
146 :
147 : // FinishIndexBlock is called when an index block, containing all the
148 : // key-value pairs since the last FinishIndexBlock, will no longer see new
149 : // entries. It returns the property value for the index block.
150 : FinishIndexBlock(buf []byte) ([]byte, error)
151 :
152 : // FinishTable is called when the sstable is finished, and returns the
153 : // property value for the sstable.
154 : FinishTable(buf []byte) ([]byte, error)
155 : }
156 :
157 : // BlockPropertyFilter is used in an Iterator to filter sstables and blocks
158 : // within the sstable. It should not maintain any per-sstable state, and must
159 : // be thread-safe.
160 : type BlockPropertyFilter = base.BlockPropertyFilter
161 :
162 : // BoundLimitedBlockPropertyFilter implements the block-property filter but
163 : // imposes an additional constraint on its usage, requiring that only blocks
164 : // containing exclusively keys between its lower and upper bounds may be
165 : // filtered. The bounds may be change during iteration, so the filter doesn't
166 : // expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods
167 : // for performing bound comparisons.
168 : //
169 : // To be used, a BoundLimitedBlockPropertyFilter must be supplied directly
170 : // through NewBlockPropertiesFilterer's dedicated parameter. If supplied through
171 : // the ordinary slice of block property filters, this filter's bounds will be
172 : // ignored.
173 : //
174 : // The current [lower,upper) bounds of the filter are unknown, because they may
175 : // be changing. During forward iteration the lower bound is externally
176 : // guaranteed, meaning Intersects only returns false if the sstable iterator is
177 : // already known to be positioned at a key ≥ lower. The sstable iterator is then
178 : // only responsible for ensuring filtered blocks also meet the upper bound, and
179 : // should only allow a block to be filtered if all its keys are < upper. The
180 : // sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check,
181 : // where key is an inclusive upper bound on the block's keys.
182 : //
183 : // During backward iteration the upper bound is externally guaranteed, and
184 : // Intersects only returns false if the sstable iterator is already known to be
185 : // positioned at a key < upper. The sstable iterator is responsible for ensuring
186 : // filtered blocks also meet the lower bound, enforcing that a block is only
187 : // filtered if all its keys are ≥ lower. This check is made through passing the
188 : // block's inclusive lower bound to KeyIsWithinLowerBound.
189 : //
190 : // Implementations may become active or inactive through implementing Intersects
191 : // to return true whenever the filter is disabled.
192 : //
193 : // Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers
194 : // should not implement this interface directly. This interface is an internal
195 : // detail in the implementation of block-property range-key masking.
196 : type BoundLimitedBlockPropertyFilter interface {
197 : BlockPropertyFilter
198 :
199 : // KeyIsWithinLowerBound tests whether the provided internal key falls
200 : // within the current lower bound of the filter. A true return value
201 : // indicates that the filter may be used to filter blocks that exclusively
202 : // contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper
203 : // bound.
204 : KeyIsWithinLowerBound(key []byte) bool
205 : // KeyIsWithinUpperBound tests whether the provided internal key falls
206 : // within the current upper bound of the filter. A true return value
207 : // indicates that the filter may be used to filter blocks that exclusively
208 : // contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower
209 : // bound.
210 : KeyIsWithinUpperBound(key []byte) bool
211 : }
212 :
213 : // BlockIntervalCollector is a helper implementation of BlockPropertyCollector
214 : // for users who want to represent a set of the form [lower,upper) where both
215 : // lower and upper are uint64, and lower <= upper.
216 : //
217 : // The set is encoded as:
218 : // - Two varint integers, (lower,upper-lower), when upper-lower > 0
219 : // - Nil, when upper-lower=0
220 : //
221 : // Users must not expect this to preserve differences between empty sets --
222 : // they will all get turned into the semantically equivalent [0,0).
223 : //
224 : // A BlockIntervalCollector that collects over point and range keys needs to
225 : // have both the point and range DataBlockIntervalCollector specified, since
226 : // point and range keys are fed to the BlockIntervalCollector in an interleaved
227 : // fashion, independently of one another. This also implies that the
228 : // DataBlockIntervalCollectors for point and range keys should be references to
229 : // independent instances, rather than references to the same collector, as point
230 : // and range keys are tracked independently.
231 : type BlockIntervalCollector struct {
232 : name string
233 : mapper IntervalMapper
234 : suffixReplacer BlockIntervalSuffixReplacer
235 :
236 : blockInterval BlockInterval
237 : indexInterval BlockInterval
238 : tableInterval BlockInterval
239 : }
240 :
241 : var _ BlockPropertyCollector = &BlockIntervalCollector{}
242 :
243 : // IntervalMapper is an interface through which a user can define the mapping
244 : // between keys and intervals. The interval for any collection of keys (e.g. a
245 : // data block, a table) is the union of intervals for all keys.
246 : type IntervalMapper interface {
247 : // MapPointKey maps a point key to an interval. The interval can be empty, which
248 : // means that this key will effectively be ignored.
249 : MapPointKey(key InternalKey, value []byte) (BlockInterval, error)
250 :
251 : // MapRangeKeys maps a range key span to an interval. The interval can be
252 : // empty, which means that this span will effectively be ignored.
253 : MapRangeKeys(span Span) (BlockInterval, error)
254 : }
255 :
256 : // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given
257 : // name. The BlockIntervalCollector makes use of the given point and range key
258 : // DataBlockIntervalCollectors when encountering point and range keys,
259 : // respectively.
260 : //
261 : // The caller may pass a nil DataBlockIntervalCollector for one of the point or
262 : // range key collectors, in which case keys of those types will be ignored. This
263 : // allows for flexible construction of BlockIntervalCollectors that operate on
264 : // just point keys, just range keys, or both point and range keys.
265 : //
266 : // If both point and range keys are to be tracked, two independent collectors
267 : // should be provided, rather than the same collector passed in twice (see the
268 : // comment on BlockIntervalCollector for more detail)
269 : // XXX update
270 : func NewBlockIntervalCollector(
271 : name string, mapper IntervalMapper, suffixReplacer BlockIntervalSuffixReplacer,
272 1 : ) BlockPropertyCollector {
273 1 : if mapper == nil {
274 1 : panic("mapper must be provided")
275 : }
276 1 : return &BlockIntervalCollector{
277 1 : name: name,
278 1 : mapper: mapper,
279 1 : suffixReplacer: suffixReplacer,
280 1 : }
281 : }
282 :
283 : // Name is part of the BlockPropertyCollector interface.
284 1 : func (b *BlockIntervalCollector) Name() string {
285 1 : return b.name
286 1 : }
287 :
288 : // AddPointKey is part of the BlockPropertyCollector interface.
289 1 : func (b *BlockIntervalCollector) AddPointKey(key InternalKey, value []byte) error {
290 1 : interval, err := b.mapper.MapPointKey(key, value)
291 1 : if err != nil {
292 0 : return err
293 0 : }
294 1 : b.blockInterval.UnionWith(interval)
295 1 : return nil
296 : }
297 :
298 : // AddRangeKeys is part of the BlockPropertyCollector interface.
299 1 : func (b *BlockIntervalCollector) AddRangeKeys(span Span) error {
300 1 : if span.Empty() {
301 0 : return nil
302 0 : }
303 1 : interval, err := b.mapper.MapRangeKeys(span)
304 1 : if err != nil {
305 0 : return err
306 0 : }
307 : // Range keys are not included in block or index intervals; they just apply
308 : // directly to the table interval.
309 1 : b.tableInterval.UnionWith(interval)
310 1 : return nil
311 : }
312 :
313 : // AddCollectedWithSuffixReplacement is part of the BlockPropertyCollector interface.
314 : func (b *BlockIntervalCollector) AddCollectedWithSuffixReplacement(
315 : oldProp []byte, oldSuffix, newSuffix []byte,
316 0 : ) error {
317 0 : i, err := decodeBlockInterval(oldProp)
318 0 : if err != nil {
319 0 : return err
320 0 : }
321 0 : i, err = b.suffixReplacer.ApplySuffixReplacement(i, newSuffix)
322 0 : if err != nil {
323 0 : return err
324 0 : }
325 0 : b.blockInterval.UnionWith(i)
326 0 : return nil
327 : }
328 :
329 : // SupportsSuffixReplacement is part of the BlockPropertyCollector interface.
330 0 : func (b *BlockIntervalCollector) SupportsSuffixReplacement() bool {
331 0 : return b.suffixReplacer != nil
332 0 : }
333 :
334 : // FinishDataBlock is part of the BlockPropertyCollector interface.
335 1 : func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) {
336 1 : buf = encodeBlockInterval(b.blockInterval, buf)
337 1 : b.tableInterval.UnionWith(b.blockInterval)
338 1 : return buf, nil
339 1 : }
340 :
341 : // AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector
342 : // interface.
343 1 : func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() {
344 1 : b.indexInterval.UnionWith(b.blockInterval)
345 1 : b.blockInterval = BlockInterval{}
346 1 : }
347 :
348 : // FinishIndexBlock implements the BlockPropertyCollector interface.
349 1 : func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
350 1 : buf = encodeBlockInterval(b.indexInterval, buf)
351 1 : b.indexInterval = BlockInterval{}
352 1 : return buf, nil
353 1 : }
354 :
355 : // FinishTable implements the BlockPropertyCollector interface.
356 1 : func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) {
357 1 : return encodeBlockInterval(b.tableInterval, buf), nil
358 1 : }
359 :
360 : // BlockInterval represents the [Lower, Upper) interval of 64-bit values
361 : // corresponding to a set of keys. The meaning of the values themselves is
362 : // opaque to the BlockIntervalCollector.
363 : //
364 : // If Lower >= Upper, the interval is the empty set.
365 : type BlockInterval struct {
366 : Lower uint64
367 : Upper uint64
368 : }
369 :
370 : // IsEmpty returns true if the interval is empty.
371 1 : func (i BlockInterval) IsEmpty() bool {
372 1 : return i.Lower >= i.Upper
373 1 : }
374 :
375 : // Intersects returns true if the two intervals intersect.
376 1 : func (i BlockInterval) Intersects(other BlockInterval) bool {
377 1 : return !i.IsEmpty() && !other.IsEmpty() && i.Upper > other.Lower && i.Lower < other.Upper
378 1 : }
379 :
380 : // UnionWith extends the receiver to include another interval.
381 1 : func (i *BlockInterval) UnionWith(other BlockInterval) {
382 1 : switch {
383 1 : case other.IsEmpty():
384 1 : case i.IsEmpty():
385 1 : *i = other
386 1 : default:
387 1 : i.Lower = min(i.Lower, other.Lower)
388 1 : i.Upper = max(i.Upper, other.Upper)
389 : }
390 : }
391 :
392 1 : func encodeBlockInterval(i BlockInterval, buf []byte) []byte {
393 1 : if i.IsEmpty() {
394 1 : return buf
395 1 : }
396 :
397 1 : var encoded [binary.MaxVarintLen64 * 2]byte
398 1 : n := binary.PutUvarint(encoded[:], i.Lower)
399 1 : n += binary.PutUvarint(encoded[n:], i.Upper-i.Lower)
400 1 : return append(buf, encoded[:n]...)
401 : }
402 :
403 1 : func decodeBlockInterval(buf []byte) (BlockInterval, error) {
404 1 : if len(buf) == 0 {
405 1 : return BlockInterval{}, nil
406 1 : }
407 1 : var i BlockInterval
408 1 : var n int
409 1 : i.Lower, n = binary.Uvarint(buf)
410 1 : if n <= 0 || n >= len(buf) {
411 0 : return BlockInterval{}, base.CorruptionErrorf("cannot decode interval from buf %x", buf)
412 0 : }
413 1 : pos := n
414 1 : i.Upper, n = binary.Uvarint(buf[pos:])
415 1 : pos += n
416 1 : if pos != len(buf) || n <= 0 {
417 0 : return BlockInterval{}, base.CorruptionErrorf("cannot decode interval from buf %x", buf)
418 0 : }
419 : // Delta decode.
420 1 : i.Upper += i.Lower
421 1 : if i.Upper < i.Lower {
422 0 : return BlockInterval{}, base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.Upper, i.Lower)
423 0 : }
424 1 : return i, nil
425 : }
426 :
427 : // BlockIntervalSuffixReplacer provides methods to conduct just in time
428 : // adjustments of a passed in block prop interval before filtering.
429 : type BlockIntervalSuffixReplacer interface {
430 : // ApplySuffixReplacement recalculates a previously calculated interval (which
431 : // corresponds to an arbitrary collection of keys) under the assumption
432 : // that those keys are rewritten with a new prefix.
433 : //
434 : // Such a transformation is possible when the intervals depend only on the
435 : // suffixes.
436 : ApplySuffixReplacement(interval BlockInterval, newSuffix []byte) (BlockInterval, error)
437 : }
438 :
439 : // BlockIntervalFilter is an implementation of BlockPropertyFilter when the
440 : // corresponding collector is a BlockIntervalCollector. That is, the set is of
441 : // the form [lower, upper).
442 : type BlockIntervalFilter struct {
443 : name string
444 : filterInterval BlockInterval
445 : suffixReplacer BlockIntervalSuffixReplacer
446 : }
447 :
448 : var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil)
449 :
450 : // NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks
451 : // based on an interval property collected by BlockIntervalCollector and the
452 : // given [lower, upper) bounds. The given name specifies the
453 : // BlockIntervalCollector's properties to read.
454 : func NewBlockIntervalFilter(
455 : name string, lower uint64, upper uint64, suffixReplacer BlockIntervalSuffixReplacer,
456 1 : ) *BlockIntervalFilter {
457 1 : b := new(BlockIntervalFilter)
458 1 : b.Init(name, lower, upper, suffixReplacer)
459 1 : return b
460 1 : }
461 :
462 : // Init initializes (or re-initializes, clearing previous state) an existing
463 : // BLockPropertyFilter to filter blocks based on an interval property collected
464 : // by BlockIntervalCollector and the given [lower, upper) bounds. The given name
465 : // specifies the BlockIntervalCollector's properties to read.
466 : func (b *BlockIntervalFilter) Init(
467 : name string, lower, upper uint64, suffixReplacer BlockIntervalSuffixReplacer,
468 1 : ) {
469 1 : *b = BlockIntervalFilter{
470 1 : name: name,
471 1 : filterInterval: BlockInterval{Lower: lower, Upper: upper},
472 1 : suffixReplacer: suffixReplacer,
473 1 : }
474 1 : }
475 :
476 : // Name implements the BlockPropertyFilter interface.
477 1 : func (b *BlockIntervalFilter) Name() string {
478 1 : return b.name
479 1 : }
480 :
481 : // Intersects implements the BlockPropertyFilter interface.
482 1 : func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) {
483 1 : i, err := decodeBlockInterval(prop)
484 1 : if err != nil {
485 0 : return false, err
486 0 : }
487 1 : return i.Intersects(b.filterInterval), nil
488 : }
489 :
490 : // SyntheticSuffixIntersects implements the BlockPropertyFilter interface.
491 1 : func (b *BlockIntervalFilter) SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) {
492 1 : if b.suffixReplacer == nil {
493 0 : return false, base.AssertionFailedf("missing SuffixReplacer for SyntheticSuffixIntersects()")
494 0 : }
495 1 : i, err := decodeBlockInterval(prop)
496 1 : if err != nil {
497 0 : return false, err
498 0 : }
499 :
500 1 : newInterval, err := b.suffixReplacer.ApplySuffixReplacement(i, suffix)
501 1 : if err != nil {
502 0 : return false, err
503 0 : }
504 1 : return newInterval.Intersects(b.filterInterval), nil
505 : }
506 :
507 : // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not
508 : // generally safe to alter the filter while it's in use, except as part of the
509 : // implementation of BlockPropertyFilterMask.SetSuffix used for range-key
510 : // masking.
511 1 : func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) {
512 1 : b.filterInterval = BlockInterval{Lower: lower, Upper: upper}
513 1 : }
514 :
515 : // When encoding block properties for each block, we cannot afford to encode the
516 : // name. Instead, the name is mapped to a shortID, in the scope of that sstable,
517 : // and the shortID is encoded as a single byte (which imposes a limit of of 256
518 : // block property collectors per sstable).
519 : // Note that the in-memory type is int16 to avoid overflows (e.g. in loops) and
520 : // to allow special values like -1 in code.
521 : type shortID int16
522 :
523 : const invalidShortID shortID = -1
524 : const maxShortID shortID = math.MaxUint8
525 : const maxPropertyCollectors = int(maxShortID) + 1
526 :
527 1 : func (id shortID) IsValid() bool {
528 1 : return id >= 0 && id <= maxShortID
529 1 : }
530 :
531 1 : func (id shortID) ToByte() byte {
532 1 : if invariants.Enabled && !id.IsValid() {
533 0 : panic(fmt.Sprintf("inavlid id %d", id))
534 : }
535 1 : return byte(id)
536 : }
537 :
538 : type blockPropertiesEncoder struct {
539 : propsBuf []byte
540 : scratch []byte
541 : }
542 :
543 1 : func (e *blockPropertiesEncoder) getScratchForProp() []byte {
544 1 : return e.scratch[:0]
545 1 : }
546 :
547 1 : func (e *blockPropertiesEncoder) resetProps() {
548 1 : e.propsBuf = e.propsBuf[:0]
549 1 : }
550 :
551 1 : func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) {
552 1 : if len(scratch) == 0 {
553 1 : // We omit empty properties. The decoder will know that any missing IDs had
554 1 : // empty values.
555 1 : return
556 1 : }
557 1 : const lenID = 1
558 1 : lenProp := uvarintLen(uint32(len(scratch)))
559 1 : n := lenID + lenProp + len(scratch)
560 1 : if cap(e.propsBuf)-len(e.propsBuf) < n {
561 1 : size := len(e.propsBuf) + 2*n
562 1 : if size < 2*cap(e.propsBuf) {
563 1 : size = 2 * cap(e.propsBuf)
564 1 : }
565 1 : buf := make([]byte, len(e.propsBuf), size)
566 1 : copy(buf, e.propsBuf)
567 1 : e.propsBuf = buf
568 : }
569 1 : pos := len(e.propsBuf)
570 1 : b := e.propsBuf[pos : pos+lenID]
571 1 : b[0] = id.ToByte()
572 1 : pos += lenID
573 1 : b = e.propsBuf[pos : pos+lenProp]
574 1 : n = binary.PutUvarint(b, uint64(len(scratch)))
575 1 : pos += n
576 1 : b = e.propsBuf[pos : pos+len(scratch)]
577 1 : pos += len(scratch)
578 1 : copy(b, scratch)
579 1 : e.propsBuf = e.propsBuf[0:pos]
580 1 : e.scratch = scratch
581 : }
582 :
583 1 : func (e *blockPropertiesEncoder) unsafeProps() []byte {
584 1 : return e.propsBuf
585 1 : }
586 :
587 1 : func (e *blockPropertiesEncoder) props() []byte {
588 1 : buf := make([]byte, len(e.propsBuf))
589 1 : copy(buf, e.propsBuf)
590 1 : return buf
591 1 : }
592 :
593 : type blockPropertiesDecoder struct {
594 : props []byte
595 :
596 : // numCollectedProps is the number of collectors that were used when writing
597 : // these properties. The encoded properties contain values for shortIDs 0
598 : // through numCollectedProps-1, in order (with empty properties omitted).
599 : numCollectedProps int
600 : nextID shortID
601 : }
602 :
603 1 : func makeBlockPropertiesDecoder(numCollectedProps int, propsBuf []byte) blockPropertiesDecoder {
604 1 : return blockPropertiesDecoder{
605 1 : props: propsBuf,
606 1 : numCollectedProps: numCollectedProps,
607 1 : }
608 1 : }
609 :
610 1 : func (d *blockPropertiesDecoder) Done() bool {
611 1 : return int(d.nextID) >= d.numCollectedProps
612 1 : }
613 :
614 : // Next returns the property for each shortID between 0 and numCollectedProps-1, in order.
615 : // Note that some properties might be empty.
616 : // REQUIRES: !Done()
617 1 : func (d *blockPropertiesDecoder) Next() (id shortID, prop []byte, err error) {
618 1 : id = d.nextID
619 1 : d.nextID++
620 1 :
621 1 : if len(d.props) == 0 || shortID(d.props[0]) != id {
622 1 : if invariants.Enabled && len(d.props) > 0 && shortID(d.props[0]) < id {
623 0 : panic("shortIDs are not in order")
624 : }
625 : // This property was omitted because it was empty.
626 1 : return id, nil, nil
627 : }
628 :
629 1 : const lenID = 1
630 1 : propLen, m := binary.Uvarint(d.props[lenID:])
631 1 : n := lenID + m
632 1 : if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) {
633 0 : return 0, nil, base.CorruptionErrorf("corrupt block property length")
634 0 : }
635 1 : prop = d.props[n : n+int(propLen)]
636 1 : d.props = d.props[n+int(propLen):]
637 1 : return id, prop, nil
638 : }
639 :
640 : // BlockPropertiesFilterer provides filtering support when reading an sstable
641 : // in the context of an iterator that has a slice of BlockPropertyFilters.
642 : // After the call to NewBlockPropertiesFilterer, the caller must call
643 : // IntersectsUserPropsAndFinishInit to check if the sstable intersects with
644 : // the filters. If it does intersect, this function also finishes initializing
645 : // the BlockPropertiesFilterer using the shortIDs for the relevant filters.
646 : // Subsequent checks for relevance of a block should use the intersects
647 : // method.
648 : type BlockPropertiesFilterer struct {
649 : filters []BlockPropertyFilter
650 : // Maps shortID => index in filters. This can be sparse, and shortIDs for
651 : // which there is no filter are represented with an index of -1. The
652 : // length of this can be shorter than the shortIDs allocated in the
653 : // sstable. e.g. if the sstable used shortIDs 0, 1, 2, 3, and the iterator
654 : // has two filters, corresponding to shortIDs 2, 0, this would be:
655 : // len(shortIDToFiltersIndex)==3, 0=>1, 1=>-1, 2=>0.
656 : shortIDToFiltersIndex []int
657 :
658 : // boundLimitedFilter, if non-nil, holds a single block-property filter with
659 : // additional constraints on its filtering. A boundLimitedFilter may only
660 : // filter blocks that are wholly contained within its bounds. During forward
661 : // iteration the lower bound (and during backward iteration the upper bound)
662 : // must be externally guaranteed, with Intersects only returning false if
663 : // that bound is met. The opposite bound is verified during iteration by the
664 : // sstable iterator.
665 : //
666 : // boundLimitedFilter is permitted to be defined on a property (`Name()`)
667 : // for which another filter exists in filters. In this case both filters
668 : // will be consulted, and either filter may exclude block(s). Only a single
669 : // bound-limited block-property filter may be set.
670 : //
671 : // The boundLimitedShortID field contains the shortID of the filter's
672 : // property within the sstable. It's set to -1 if the property was not
673 : // collected when the table was built.
674 : boundLimitedFilter BoundLimitedBlockPropertyFilter
675 : boundLimitedShortID int
676 :
677 : syntheticSuffix SyntheticSuffix
678 : }
679 :
680 : var blockPropertiesFiltererPool = sync.Pool{
681 1 : New: func() interface{} {
682 1 : return &BlockPropertiesFilterer{}
683 1 : },
684 : }
685 :
686 : // newBlockPropertiesFilterer returns a partially initialized filterer. To complete
687 : // initialization, call IntersectsUserPropsAndFinishInit.
688 : func newBlockPropertiesFilterer(
689 : filters []BlockPropertyFilter,
690 : limited BoundLimitedBlockPropertyFilter,
691 : syntheticSuffix SyntheticSuffix,
692 1 : ) *BlockPropertiesFilterer {
693 1 : filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer)
694 1 : *filterer = BlockPropertiesFilterer{
695 1 : filters: filters,
696 1 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
697 1 : boundLimitedFilter: limited,
698 1 : boundLimitedShortID: -1,
699 1 : syntheticSuffix: syntheticSuffix,
700 1 : }
701 1 : return filterer
702 1 : }
703 :
704 1 : func releaseBlockPropertiesFilterer(filterer *BlockPropertiesFilterer) {
705 1 : *filterer = BlockPropertiesFilterer{
706 1 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
707 1 : }
708 1 : blockPropertiesFiltererPool.Put(filterer)
709 1 : }
710 :
711 : // IntersectsTable evaluates the provided block-property filter against the
712 : // provided set of table-level properties. If there is no intersection between
713 : // the filters and the table or an error is encountered, IntersectsTable returns
714 : // a nil filterer (and possibly an error). If there is an intersection,
715 : // IntersectsTable returns a non-nil filterer that may be used by an iterator
716 : // reading the table.
717 : func IntersectsTable(
718 : filters []BlockPropertyFilter,
719 : limited BoundLimitedBlockPropertyFilter,
720 : userProperties map[string]string,
721 : syntheticSuffix SyntheticSuffix,
722 1 : ) (*BlockPropertiesFilterer, error) {
723 1 : f := newBlockPropertiesFilterer(filters, limited, syntheticSuffix)
724 1 : ok, err := f.intersectsUserPropsAndFinishInit(userProperties)
725 1 : if !ok || err != nil {
726 1 : releaseBlockPropertiesFilterer(f)
727 1 : return nil, err
728 1 : }
729 1 : return f, nil
730 : }
731 :
732 : // intersectsUserPropsAndFinishInit is called with the user properties map for
733 : // the sstable and returns whether the sstable intersects the filters. It
734 : // additionally initializes the shortIDToFiltersIndex for the filters that are
735 : // relevant to this sstable.
736 : func (f *BlockPropertiesFilterer) intersectsUserPropsAndFinishInit(
737 : userProperties map[string]string,
738 1 : ) (bool, error) {
739 1 : for i := range f.filters {
740 1 : props, ok := userProperties[f.filters[i].Name()]
741 1 : if !ok {
742 1 : // Collector was not used when writing this file, so it is
743 1 : // considered intersecting.
744 1 : continue
745 : }
746 1 : if len(props) < 1 {
747 0 : return false, base.CorruptionErrorf(
748 0 : "block properties for %s is corrupted", f.filters[i].Name())
749 0 : }
750 1 : shortID := shortID(props[0])
751 1 : {
752 1 : // Use an unsafe conversion to avoid allocating. Intersects() is not
753 1 : // supposed to modify the given slice.
754 1 : // Note that unsafe.StringData only works if the string is not empty
755 1 : // (which we already checked).
756 1 : byteProps := unsafe.Slice(unsafe.StringData(props), len(props))
757 1 : var intersects bool
758 1 : var err error
759 1 : if len(f.syntheticSuffix) == 0 {
760 1 : intersects, err = f.filters[i].Intersects(byteProps[1:])
761 1 : } else {
762 1 : intersects, err = f.filters[i].SyntheticSuffixIntersects(byteProps[1:], f.syntheticSuffix)
763 1 : }
764 1 : if err != nil || !intersects {
765 1 : return false, err
766 1 : }
767 : }
768 : // Intersects the sstable, so need to use this filter when
769 : // deciding whether to read blocks.
770 1 : n := len(f.shortIDToFiltersIndex)
771 1 : if n <= int(shortID) {
772 1 : if cap(f.shortIDToFiltersIndex) <= int(shortID) {
773 1 : index := make([]int, shortID+1, 2*(shortID+1))
774 1 : copy(index, f.shortIDToFiltersIndex)
775 1 : f.shortIDToFiltersIndex = index
776 1 : } else {
777 1 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:shortID+1]
778 1 : }
779 1 : for j := n; j < int(shortID); j++ {
780 1 : f.shortIDToFiltersIndex[j] = -1
781 1 : }
782 : }
783 1 : f.shortIDToFiltersIndex[shortID] = i
784 : }
785 1 : if f.boundLimitedFilter == nil {
786 1 : return true, nil
787 1 : }
788 :
789 : // There's a bound-limited filter. Find its shortID. It's possible that
790 : // there's an existing filter in f.filters on the same property. That's
791 : // okay. Both filters will be consulted whenever a relevant prop is decoded.
792 1 : props, ok := userProperties[f.boundLimitedFilter.Name()]
793 1 : if !ok {
794 1 : // The collector was not used when writing this file, so it's
795 1 : // intersecting. We leave f.boundLimitedShortID=-1, so the filter will
796 1 : // be unused within this file.
797 1 : return true, nil
798 1 : }
799 1 : if len(props) < 1 {
800 0 : return false, base.CorruptionErrorf(
801 0 : "block properties for %s is corrupted", f.boundLimitedFilter.Name())
802 0 : }
803 1 : f.boundLimitedShortID = int(props[0])
804 1 :
805 1 : // We don't check for table-level intersection for the bound-limited filter.
806 1 : // The bound-limited filter is treated as vacuously intersecting.
807 1 : //
808 1 : // NB: If a block-property filter needs to be toggled inactive/active, it
809 1 : // should be implemented within the Intersects implementation.
810 1 : //
811 1 : // TODO(jackson): We could filter at the table-level by threading the table
812 1 : // smallest and largest bounds here.
813 1 :
814 1 : // The bound-limited filter isn't included in shortIDToFiltersIndex.
815 1 : //
816 1 : // When determining intersection, we decode props only up to the shortID
817 1 : // len(shortIDToFiltersIndex). If f.limitedShortID is greater than any of
818 1 : // the existing filters' shortIDs, we need to grow shortIDToFiltersIndex.
819 1 : // Growing the index with -1s ensures we're able to consult the index
820 1 : // without length checks.
821 1 : if n := len(f.shortIDToFiltersIndex); n <= f.boundLimitedShortID {
822 1 : if cap(f.shortIDToFiltersIndex) <= f.boundLimitedShortID {
823 1 : index := make([]int, f.boundLimitedShortID+1)
824 1 : copy(index, f.shortIDToFiltersIndex)
825 1 : f.shortIDToFiltersIndex = index
826 1 : } else {
827 1 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:f.boundLimitedShortID+1]
828 1 : }
829 1 : for j := n; j <= f.boundLimitedShortID; j++ {
830 1 : f.shortIDToFiltersIndex[j] = -1
831 1 : }
832 : }
833 1 : return true, nil
834 : }
835 :
836 : type intersectsResult int8
837 :
838 : const (
839 : blockIntersects intersectsResult = iota
840 : blockExcluded
841 : // blockMaybeExcluded is returned by BlockPropertiesFilterer.intersects when
842 : // no filters unconditionally exclude the block, but the bound-limited block
843 : // property filter will exclude it if the block's bounds fall within the
844 : // filter's current bounds. See the reader's
845 : // {single,two}LevelIterator.resolveMaybeExcluded methods.
846 : blockMaybeExcluded
847 : )
848 :
849 1 : func (f *BlockPropertiesFilterer) intersects(props []byte) (intersectsResult, error) {
850 1 : decoder := makeBlockPropertiesDecoder(len(f.shortIDToFiltersIndex), props)
851 1 : ret := blockIntersects
852 1 : for !decoder.Done() {
853 1 : id, prop, err := decoder.Next()
854 1 : if err != nil {
855 0 : return ret, err
856 0 : }
857 1 : intersects, err := f.intersectsFilter(id, prop)
858 1 : if err != nil {
859 0 : return ret, err
860 0 : }
861 1 : if intersects == blockExcluded {
862 1 : return blockExcluded, nil
863 1 : }
864 1 : if intersects == blockMaybeExcluded {
865 1 : ret = blockMaybeExcluded
866 1 : }
867 : }
868 : // ret is either blockIntersects or blockMaybeExcluded.
869 1 : return ret, nil
870 : }
871 :
872 : func (f *BlockPropertiesFilterer) intersectsFilter(
873 : id shortID, prop []byte,
874 1 : ) (intersectsResult, error) {
875 1 : var intersects bool
876 1 : var err error
877 1 : if filterIdx := f.shortIDToFiltersIndex[id]; filterIdx >= 0 {
878 1 : if !f.syntheticSuffix.IsSet() {
879 1 : intersects, err = f.filters[filterIdx].Intersects(prop)
880 1 : } else {
881 1 : intersects, err = f.filters[filterIdx].SyntheticSuffixIntersects(prop, f.syntheticSuffix)
882 1 : }
883 1 : if err != nil {
884 0 : return blockIntersects, err
885 0 : }
886 1 : if !intersects {
887 1 : return blockExcluded, nil
888 1 : }
889 : }
890 1 : if int(id) == f.boundLimitedShortID {
891 1 : // The bound-limited filter uses this id.
892 1 : //
893 1 : // The bound-limited filter only applies within a keyspan interval. We
894 1 : // expect the Intersects call to be cheaper than bounds checks. If
895 1 : // Intersects determines that there is no intersection, we return
896 1 : // `blockMaybeExcluded` if no other bpf unconditionally excludes the
897 1 : // block.
898 1 : if !f.syntheticSuffix.IsSet() {
899 1 : intersects, err = f.boundLimitedFilter.Intersects(prop)
900 1 : } else {
901 1 : intersects, err = f.boundLimitedFilter.SyntheticSuffixIntersects(prop, f.syntheticSuffix)
902 1 : }
903 1 : if err != nil {
904 0 : return blockIntersects, err
905 1 : } else if !intersects {
906 1 : return blockMaybeExcluded, nil
907 1 : }
908 : }
909 1 : return blockIntersects, nil
910 : }
911 :
912 1 : func uvarintLen(v uint32) int {
913 1 : i := 0
914 1 : for v >= 0x80 {
915 1 : v >>= 7
916 1 : i++
917 1 : }
918 1 : return i + 1
919 : }
|