Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "math"
11 : "sync"
12 : "unsafe"
13 :
14 : "github.com/cockroachdb/pebble/internal/base"
15 : "github.com/cockroachdb/pebble/internal/invariants"
16 : "github.com/cockroachdb/pebble/internal/keyspan"
17 : )
18 :
19 : // Block properties are an optional user-facing feature that can be used to
20 : // filter data blocks (and whole sstables) from an Iterator before they are
21 : // loaded. They do not apply to range delete blocks. These are expected to
22 : // very concisely represent a set of some attribute value contained within the
23 : // key or value, such that the set includes all the attribute values in the
24 : // block. This has some similarities with OLAP pruning approaches that
25 : // maintain min-max attribute values for some column (which concisely
26 : // represent a set), that is then used to prune at query time. In Pebble's
27 : // case, data blocks are small, typically 25-50KB, so these properties should
28 : // reduce their precision in order to be concise -- a good rule of thumb is to
29 : // not consume more than 50-100 bytes across all properties maintained for a
30 : // block, i.e., a 500x reduction compared to loading the data block.
31 : //
32 : // A block property must be assigned a unique name, which is encoded and
33 : // stored in the sstable. This name must be unique among all user-properties
34 : // encoded in an sstable.
35 : //
36 : // A property is represented as a []byte. A nil value or empty byte slice are
37 : // considered semantically identical. The caller is free to choose the
38 : // semantics of an empty byte slice e.g. they could use it to represent the
39 : // empty set or the universal set, whichever they think is more common and
40 : // therefore better to encode more concisely. The serialization of the
41 : // property for the various Finish*() calls in a BlockPropertyCollector
42 : // implementation should be identical, since the corresponding
43 : // BlockPropertyFilter implementation is not told the context in which it is
44 : // deserializing the property.
45 : //
46 : // Block properties are more general than table properties and should be
47 : // preferred over using table properties. A BlockPropertyCollector can achieve
48 : // identical behavior to table properties by returning the nil slice from
49 : // FinishDataBlock and FinishIndexBlock, and interpret them as the universal
50 : // set in BlockPropertyFilter, and return a non-universal set in FinishTable.
51 : //
52 : // Block property filtering is nondeterministic because the separation of keys
53 : // into blocks is nondeterministic. Clients use block-property filters to
54 : // implement efficient application of a filter F that applies to key-value pairs
55 : // (abbreviated as kv-filter). Consider correctness defined as surfacing exactly
56 : // the same key-value pairs that would be surfaced if one applied the filter F
57 : // above normal iteration. With this correctness definition, block property
58 : // filtering may introduce two kinds of errors:
59 : //
60 : // a) Block property filtering that uses a kv-filter may produce additional
61 : // key-value pairs that don't satisfy the filter because of the separation
62 : // of keys into blocks. Clients may remove these extra key-value pairs by
63 : // re-applying the kv filter while reading results back from Pebble.
64 : //
65 : // b) Block property filtering may surface deleted key-value pairs if the
66 : // kv filter is not a strict function of the key's user key. A block
67 : // containing k.DEL may be filtered, while a block containing the deleted
68 : // key k.SET may not be filtered, if the kv filter applies to one but not
69 : // the other.
70 : //
71 : // This error may be avoided trivially by using a kv filter that is a pure
72 : // function of the user key. A filter that examines values or key kinds
73 : // requires care to ensure F(k.SET, <value>) = F(k.DEL) = F(k.SINGLEDEL).
74 : //
75 : // The combination of range deletions and filtering by table-level properties
76 : // add another opportunity for deleted point keys to be surfaced. The pebble
77 : // Iterator stack takes care to correctly apply filtered tables' range deletions
78 : // to lower tables, preventing this form of nondeterministic error.
79 : //
80 : // In addition to the non-determinism discussed in (b), which limits the use
81 : // of properties over values, we now have support for values that are not
82 : // stored together with the key, and may not even be retrieved during
83 : // compactions. If Pebble is configured with such value separation, block
84 : // properties must only apply to the key, and will be provided a nil value.
85 :
86 : // BlockPropertyCollector is used when writing a sstable.
87 : //
88 : // - All calls to Add are included in the next FinishDataBlock, after which
89 : // the next data block is expected to start.
90 : //
91 : // - The index entry generated for the data block, which contains the return
92 : // value from FinishDataBlock, is not immediately included in the current
93 : // index block. It is included when AddPrevDataBlockToIndexBlock is called.
94 : // An alternative would be to return an opaque handle from FinishDataBlock
95 : // and pass it to a new AddToIndexBlock method, which requires more
96 : // plumbing, and passing of an interface{} results in a undesirable heap
97 : // allocation. AddPrevDataBlockToIndexBlock must be called before keys are
98 : // added to the new data block.
99 : type BlockPropertyCollector interface {
100 : // Name returns the name of the block property collector.
101 : Name() string
102 :
103 : // AddPointKey is called with each new key added to a data block in the
104 : // sstable. The callee can assume that these are in sorted order.
105 : AddPointKey(key InternalKey, value []byte) error
106 :
107 : // AddRangeKeys is called for each range span added to the sstable. The range
108 : // key properties are stored separately and don't contribute to data block
109 : // properties. They are only used when FinishTable is called.
110 : // TODO(radu): clean up this subtle semantic.
111 : AddRangeKeys(span keyspan.Span) error
112 :
113 : // AddCollectedWithSuffixReplacement adds previously collected property data
114 : // and updates it to reflect a change of suffix on all keys: the old property
115 : // data is assumed to be constructed from keys that all have the same
116 : // oldSuffix and is recalculated to reflect the same keys but with newSuffix.
117 : //
118 : // A collector which supports this method must be able to derive its updated
119 : // value from its old value and the change being made to the suffix, without
120 : // needing to be passed each updated K/V.
121 : //
122 : // For example, a collector that only inspects values can simply copy its
123 : // previously computed property as-is, since key-suffix replacement does not
124 : // change values, while a collector that depends only on key suffixes, like
125 : // one which collected mvcc-timestamp bounds from timestamp-suffixed keys, can
126 : // just set its new bounds from the new suffix, as it is common to all keys,
127 : // without needing to recompute it from every key.
128 : //
129 : // This method is optional (if it is not implemented, it always returns an
130 : // error). SupportsSuffixReplacement() can be used to check if this method is
131 : // implemented.
132 : AddCollectedWithSuffixReplacement(oldProp []byte, oldSuffix, newSuffix []byte) error
133 :
134 : // SupportsSuffixReplacement returns whether the collector supports the
135 : // AddCollectedWithSuffixReplacement method.
136 : SupportsSuffixReplacement() bool
137 :
138 : // FinishDataBlock is called when all the entries have been added to a
139 : // data block. Subsequent Add calls will be for the next data block. It
140 : // returns the property value for the finished block.
141 : FinishDataBlock(buf []byte) ([]byte, error)
142 :
143 : // AddPrevDataBlockToIndexBlock adds the entry corresponding to the
144 : // previous FinishDataBlock to the current index block.
145 : AddPrevDataBlockToIndexBlock()
146 :
147 : // FinishIndexBlock is called when an index block, containing all the
148 : // key-value pairs since the last FinishIndexBlock, will no longer see new
149 : // entries. It returns the property value for the index block.
150 : FinishIndexBlock(buf []byte) ([]byte, error)
151 :
152 : // FinishTable is called when the sstable is finished, and returns the
153 : // property value for the sstable.
154 : FinishTable(buf []byte) ([]byte, error)
155 : }
156 :
157 : // BlockPropertyFilter is used in an Iterator to filter sstables and blocks
158 : // within the sstable. It should not maintain any per-sstable state, and must
159 : // be thread-safe.
160 : type BlockPropertyFilter = base.BlockPropertyFilter
161 :
162 : // BoundLimitedBlockPropertyFilter implements the block-property filter but
163 : // imposes an additional constraint on its usage, requiring that only blocks
164 : // containing exclusively keys between its lower and upper bounds may be
165 : // filtered. The bounds may be change during iteration, so the filter doesn't
166 : // expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods
167 : // for performing bound comparisons.
168 : //
169 : // To be used, a BoundLimitedBlockPropertyFilter must be supplied directly
170 : // through NewBlockPropertiesFilterer's dedicated parameter. If supplied through
171 : // the ordinary slice of block property filters, this filter's bounds will be
172 : // ignored.
173 : //
174 : // The current [lower,upper) bounds of the filter are unknown, because they may
175 : // be changing. During forward iteration the lower bound is externally
176 : // guaranteed, meaning Intersects only returns false if the sstable iterator is
177 : // already known to be positioned at a key ≥ lower. The sstable iterator is then
178 : // only responsible for ensuring filtered blocks also meet the upper bound, and
179 : // should only allow a block to be filtered if all its keys are < upper. The
180 : // sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check,
181 : // where key is an inclusive upper bound on the block's keys.
182 : //
183 : // During backward iteration the upper bound is externally guaranteed, and
184 : // Intersects only returns false if the sstable iterator is already known to be
185 : // positioned at a key < upper. The sstable iterator is responsible for ensuring
186 : // filtered blocks also meet the lower bound, enforcing that a block is only
187 : // filtered if all its keys are ≥ lower. This check is made through passing the
188 : // block's inclusive lower bound to KeyIsWithinLowerBound.
189 : //
190 : // Implementations may become active or inactive through implementing Intersects
191 : // to return true whenever the filter is disabled.
192 : //
193 : // Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers
194 : // should not implement this interface directly. This interface is an internal
195 : // detail in the implementation of block-property range-key masking.
196 : type BoundLimitedBlockPropertyFilter interface {
197 : BlockPropertyFilter
198 :
199 : // KeyIsWithinLowerBound tests whether the provided internal key falls
200 : // within the current lower bound of the filter. A true return value
201 : // indicates that the filter may be used to filter blocks that exclusively
202 : // contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper
203 : // bound.
204 : KeyIsWithinLowerBound(key []byte) bool
205 : // KeyIsWithinUpperBound tests whether the provided internal key falls
206 : // within the current upper bound of the filter. A true return value
207 : // indicates that the filter may be used to filter blocks that exclusively
208 : // contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower
209 : // bound.
210 : KeyIsWithinUpperBound(key []byte) bool
211 : }
212 :
213 : // BlockIntervalCollector is a helper implementation of BlockPropertyCollector
214 : // for users who want to represent a set of the form [lower,upper) where both
215 : // lower and upper are uint64, and lower <= upper.
216 : //
217 : // The set is encoded as:
218 : // - Two varint integers, (lower,upper-lower), when upper-lower > 0
219 : // - Nil, when upper-lower=0
220 : //
221 : // Users must not expect this to preserve differences between empty sets --
222 : // they will all get turned into the semantically equivalent [0,0).
223 : //
224 : // A BlockIntervalCollector that collects over point and range keys needs to
225 : // have both the point and range DataBlockIntervalCollector specified, since
226 : // point and range keys are fed to the BlockIntervalCollector in an interleaved
227 : // fashion, independently of one another. This also implies that the
228 : // DataBlockIntervalCollectors for point and range keys should be references to
229 : // independent instances, rather than references to the same collector, as point
230 : // and range keys are tracked independently.
231 : type BlockIntervalCollector struct {
232 : name string
233 : mapper IntervalMapper
234 : suffixReplacer BlockIntervalSuffixReplacer
235 :
236 : blockInterval BlockInterval
237 : indexInterval BlockInterval
238 : tableInterval BlockInterval
239 : }
240 :
241 : var _ BlockPropertyCollector = &BlockIntervalCollector{}
242 :
243 : // IntervalMapper is an interface through which a user can define the mapping
244 : // between keys and intervals. The interval for any collection of keys (e.g. a
245 : // data block, a table) is the union of intervals for all keys.
246 : type IntervalMapper interface {
247 : // MapPointKey maps a point key to an interval. The interval can be empty, which
248 : // means that this key will effectively be ignored.
249 : MapPointKey(key InternalKey, value []byte) (BlockInterval, error)
250 :
251 : // MapRangeKeys maps a range key span to an interval. The interval can be
252 : // empty, which means that this span will effectively be ignored.
253 : MapRangeKeys(span Span) (BlockInterval, error)
254 : }
255 :
256 : // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given
257 : // name. The BlockIntervalCollector makes use of the given point and range key
258 : // DataBlockIntervalCollectors when encountering point and range keys,
259 : // respectively.
260 : //
261 : // The caller may pass a nil DataBlockIntervalCollector for one of the point or
262 : // range key collectors, in which case keys of those types will be ignored. This
263 : // allows for flexible construction of BlockIntervalCollectors that operate on
264 : // just point keys, just range keys, or both point and range keys.
265 : //
266 : // If both point and range keys are to be tracked, two independent collectors
267 : // should be provided, rather than the same collector passed in twice (see the
268 : // comment on BlockIntervalCollector for more detail)
269 : // XXX update
270 : func NewBlockIntervalCollector(
271 : name string, mapper IntervalMapper, suffixReplacer BlockIntervalSuffixReplacer,
272 2 : ) BlockPropertyCollector {
273 2 : if mapper == nil {
274 1 : panic("mapper must be provided")
275 : }
276 2 : return &BlockIntervalCollector{
277 2 : name: name,
278 2 : mapper: mapper,
279 2 : suffixReplacer: suffixReplacer,
280 2 : }
281 : }
282 :
283 : // Name is part of the BlockPropertyCollector interface.
284 2 : func (b *BlockIntervalCollector) Name() string {
285 2 : return b.name
286 2 : }
287 :
288 : // AddPointKey is part of the BlockPropertyCollector interface.
289 2 : func (b *BlockIntervalCollector) AddPointKey(key InternalKey, value []byte) error {
290 2 : interval, err := b.mapper.MapPointKey(key, value)
291 2 : if err != nil {
292 0 : return err
293 0 : }
294 2 : b.blockInterval.UnionWith(interval)
295 2 : return nil
296 : }
297 :
298 : // AddRangeKeys is part of the BlockPropertyCollector interface.
299 2 : func (b *BlockIntervalCollector) AddRangeKeys(span Span) error {
300 2 : if span.Empty() {
301 0 : return nil
302 0 : }
303 2 : interval, err := b.mapper.MapRangeKeys(span)
304 2 : if err != nil {
305 0 : return err
306 0 : }
307 : // Range keys are not included in block or index intervals; they just apply
308 : // directly to the table interval.
309 2 : b.tableInterval.UnionWith(interval)
310 2 : return nil
311 : }
312 :
313 : // AddCollectedWithSuffixReplacement is part of the BlockPropertyCollector interface.
314 : func (b *BlockIntervalCollector) AddCollectedWithSuffixReplacement(
315 : oldProp []byte, oldSuffix, newSuffix []byte,
316 0 : ) error {
317 0 : i, err := decodeBlockInterval(oldProp)
318 0 : if err != nil {
319 0 : return err
320 0 : }
321 0 : i, err = b.suffixReplacer.ApplySuffixReplacement(i, newSuffix)
322 0 : if err != nil {
323 0 : return err
324 0 : }
325 0 : b.blockInterval.UnionWith(i)
326 0 : return nil
327 : }
328 :
329 : // SupportsSuffixReplacement is part of the BlockPropertyCollector interface.
330 0 : func (b *BlockIntervalCollector) SupportsSuffixReplacement() bool {
331 0 : return b.suffixReplacer != nil
332 0 : }
333 :
334 : // FinishDataBlock is part of the BlockPropertyCollector interface.
335 2 : func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) {
336 2 : buf = encodeBlockInterval(b.blockInterval, buf)
337 2 : b.tableInterval.UnionWith(b.blockInterval)
338 2 : return buf, nil
339 2 : }
340 :
341 : // AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector
342 : // interface.
343 2 : func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() {
344 2 : b.indexInterval.UnionWith(b.blockInterval)
345 2 : b.blockInterval = BlockInterval{}
346 2 : }
347 :
348 : // FinishIndexBlock implements the BlockPropertyCollector interface.
349 2 : func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
350 2 : buf = encodeBlockInterval(b.indexInterval, buf)
351 2 : b.indexInterval = BlockInterval{}
352 2 : return buf, nil
353 2 : }
354 :
355 : // FinishTable implements the BlockPropertyCollector interface.
356 2 : func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) {
357 2 : return encodeBlockInterval(b.tableInterval, buf), nil
358 2 : }
359 :
360 : // BlockInterval represents the [Lower, Upper) interval of 64-bit values
361 : // corresponding to a set of keys. The meaning of the values themselves is
362 : // opaque to the BlockIntervalCollector.
363 : //
364 : // If Lower >= Upper, the interval is the empty set.
365 : type BlockInterval struct {
366 : Lower uint64
367 : Upper uint64
368 : }
369 :
370 : // IsEmpty returns true if the interval is empty.
371 2 : func (i BlockInterval) IsEmpty() bool {
372 2 : return i.Lower >= i.Upper
373 2 : }
374 :
375 : // Intersects returns true if the two intervals intersect.
376 2 : func (i BlockInterval) Intersects(other BlockInterval) bool {
377 2 : return !i.IsEmpty() && !other.IsEmpty() && i.Upper > other.Lower && i.Lower < other.Upper
378 2 : }
379 :
380 : // UnionWith extends the receiver to include another interval.
381 2 : func (i *BlockInterval) UnionWith(other BlockInterval) {
382 2 : switch {
383 2 : case other.IsEmpty():
384 2 : case i.IsEmpty():
385 2 : *i = other
386 2 : default:
387 2 : i.Lower = min(i.Lower, other.Lower)
388 2 : i.Upper = max(i.Upper, other.Upper)
389 : }
390 : }
391 :
392 2 : func encodeBlockInterval(i BlockInterval, buf []byte) []byte {
393 2 : if i.IsEmpty() {
394 2 : return buf
395 2 : }
396 :
397 2 : var encoded [binary.MaxVarintLen64 * 2]byte
398 2 : n := binary.PutUvarint(encoded[:], i.Lower)
399 2 : n += binary.PutUvarint(encoded[n:], i.Upper-i.Lower)
400 2 : return append(buf, encoded[:n]...)
401 : }
402 :
403 2 : func decodeBlockInterval(buf []byte) (BlockInterval, error) {
404 2 : if len(buf) == 0 {
405 2 : return BlockInterval{}, nil
406 2 : }
407 2 : var i BlockInterval
408 2 : var n int
409 2 : i.Lower, n = binary.Uvarint(buf)
410 2 : if n <= 0 || n >= len(buf) {
411 0 : return BlockInterval{}, base.CorruptionErrorf("cannot decode interval from buf %x", buf)
412 0 : }
413 2 : pos := n
414 2 : i.Upper, n = binary.Uvarint(buf[pos:])
415 2 : pos += n
416 2 : if pos != len(buf) || n <= 0 {
417 0 : return BlockInterval{}, base.CorruptionErrorf("cannot decode interval from buf %x", buf)
418 0 : }
419 : // Delta decode.
420 2 : i.Upper += i.Lower
421 2 : if i.Upper < i.Lower {
422 0 : return BlockInterval{}, base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.Upper, i.Lower)
423 0 : }
424 2 : return i, nil
425 : }
426 :
427 : // BlockIntervalSuffixReplacer provides methods to conduct just in time
428 : // adjustments of a passed in block prop interval before filtering.
429 : type BlockIntervalSuffixReplacer interface {
430 : // ApplySuffixReplacement recalculates a previously calculated interval (which
431 : // corresponds to an arbitrary collection of keys) under the assumption
432 : // that those keys are rewritten with a new prefix.
433 : //
434 : // Such a transformation is possible when the intervals depend only on the
435 : // suffixes.
436 : ApplySuffixReplacement(interval BlockInterval, newSuffix []byte) (BlockInterval, error)
437 : }
438 :
439 : // BlockIntervalFilter is an implementation of BlockPropertyFilter when the
440 : // corresponding collector is a BlockIntervalCollector. That is, the set is of
441 : // the form [lower, upper).
442 : type BlockIntervalFilter struct {
443 : name string
444 : filterInterval BlockInterval
445 : suffixReplacer BlockIntervalSuffixReplacer
446 : }
447 :
448 : var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil)
449 :
450 : // NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks
451 : // based on an interval property collected by BlockIntervalCollector and the
452 : // given [lower, upper) bounds. The given name specifies the
453 : // BlockIntervalCollector's properties to read.
454 : func NewBlockIntervalFilter(
455 : name string, lower uint64, upper uint64, suffixReplacer BlockIntervalSuffixReplacer,
456 2 : ) *BlockIntervalFilter {
457 2 : b := new(BlockIntervalFilter)
458 2 : b.Init(name, lower, upper, suffixReplacer)
459 2 : return b
460 2 : }
461 :
462 : // Init initializes (or re-initializes, clearing previous state) an existing
463 : // BLockPropertyFilter to filter blocks based on an interval property collected
464 : // by BlockIntervalCollector and the given [lower, upper) bounds. The given name
465 : // specifies the BlockIntervalCollector's properties to read.
466 : func (b *BlockIntervalFilter) Init(
467 : name string, lower, upper uint64, suffixReplacer BlockIntervalSuffixReplacer,
468 2 : ) {
469 2 : *b = BlockIntervalFilter{
470 2 : name: name,
471 2 : filterInterval: BlockInterval{Lower: lower, Upper: upper},
472 2 : suffixReplacer: suffixReplacer,
473 2 : }
474 2 : }
475 :
476 : // Name implements the BlockPropertyFilter interface.
477 2 : func (b *BlockIntervalFilter) Name() string {
478 2 : return b.name
479 2 : }
480 :
481 : // Intersects implements the BlockPropertyFilter interface.
482 2 : func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) {
483 2 : i, err := decodeBlockInterval(prop)
484 2 : if err != nil {
485 0 : return false, err
486 0 : }
487 2 : return i.Intersects(b.filterInterval), nil
488 : }
489 :
490 : // SyntheticSuffixIntersects implements the BlockPropertyFilter interface.
491 2 : func (b *BlockIntervalFilter) SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) {
492 2 : if b.suffixReplacer == nil {
493 0 : return false, base.AssertionFailedf("missing SuffixReplacer for SyntheticSuffixIntersects()")
494 0 : }
495 2 : i, err := decodeBlockInterval(prop)
496 2 : if err != nil {
497 0 : return false, err
498 0 : }
499 :
500 2 : newInterval, err := b.suffixReplacer.ApplySuffixReplacement(i, suffix)
501 2 : if err != nil {
502 0 : return false, err
503 0 : }
504 2 : return newInterval.Intersects(b.filterInterval), nil
505 : }
506 :
507 : // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not
508 : // generally safe to alter the filter while it's in use, except as part of the
509 : // implementation of BlockPropertyFilterMask.SetSuffix used for range-key
510 : // masking.
511 2 : func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) {
512 2 : b.filterInterval = BlockInterval{Lower: lower, Upper: upper}
513 2 : }
514 :
515 : // When encoding block properties for each block, we cannot afford to encode the
516 : // name. Instead, the name is mapped to a shortID, in the scope of that sstable,
517 : // and the shortID is encoded as a single byte (which imposes a limit of of 256
518 : // block property collectors per sstable).
519 : // Note that the in-memory type is int16 to avoid overflows (e.g. in loops) and
520 : // to allow special values like -1 in code.
521 : type shortID int16
522 :
523 : const invalidShortID shortID = -1
524 : const maxShortID shortID = math.MaxUint8
525 : const maxPropertyCollectors = int(maxShortID) + 1
526 :
527 2 : func (id shortID) IsValid() bool {
528 2 : return id >= 0 && id <= maxShortID
529 2 : }
530 :
531 2 : func (id shortID) ToByte() byte {
532 2 : if invariants.Enabled && !id.IsValid() {
533 0 : panic(fmt.Sprintf("inavlid id %d", id))
534 : }
535 2 : return byte(id)
536 : }
537 :
538 : type blockPropertiesEncoder struct {
539 : propsBuf []byte
540 : scratch []byte
541 : }
542 :
543 2 : func (e *blockPropertiesEncoder) getScratchForProp() []byte {
544 2 : return e.scratch[:0]
545 2 : }
546 :
547 2 : func (e *blockPropertiesEncoder) resetProps() {
548 2 : e.propsBuf = e.propsBuf[:0]
549 2 : }
550 :
551 2 : func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) {
552 2 : if len(scratch) == 0 {
553 2 : // We omit empty properties. The decoder will know that any missing IDs had
554 2 : // empty values.
555 2 : return
556 2 : }
557 2 : const lenID = 1
558 2 : lenProp := uvarintLen(uint32(len(scratch)))
559 2 : n := lenID + lenProp + len(scratch)
560 2 : if cap(e.propsBuf)-len(e.propsBuf) < n {
561 2 : size := len(e.propsBuf) + 2*n
562 2 : if size < 2*cap(e.propsBuf) {
563 1 : size = 2 * cap(e.propsBuf)
564 1 : }
565 2 : buf := make([]byte, len(e.propsBuf), size)
566 2 : copy(buf, e.propsBuf)
567 2 : e.propsBuf = buf
568 : }
569 2 : pos := len(e.propsBuf)
570 2 : b := e.propsBuf[pos : pos+lenID]
571 2 : b[0] = id.ToByte()
572 2 : pos += lenID
573 2 : b = e.propsBuf[pos : pos+lenProp]
574 2 : n = binary.PutUvarint(b, uint64(len(scratch)))
575 2 : pos += n
576 2 : b = e.propsBuf[pos : pos+len(scratch)]
577 2 : pos += len(scratch)
578 2 : copy(b, scratch)
579 2 : e.propsBuf = e.propsBuf[0:pos]
580 2 : e.scratch = scratch
581 : }
582 :
583 2 : func (e *blockPropertiesEncoder) unsafeProps() []byte {
584 2 : return e.propsBuf
585 2 : }
586 :
587 2 : func (e *blockPropertiesEncoder) props() []byte {
588 2 : buf := make([]byte, len(e.propsBuf))
589 2 : copy(buf, e.propsBuf)
590 2 : return buf
591 2 : }
592 :
593 : type blockPropertiesDecoder struct {
594 : props []byte
595 :
596 : // numCollectedProps is the number of collectors that were used when writing
597 : // these properties. The encoded properties contain values for shortIDs 0
598 : // through numCollectedProps-1, in order (with empty properties omitted).
599 : numCollectedProps int
600 : nextID shortID
601 : }
602 :
603 2 : func makeBlockPropertiesDecoder(numCollectedProps int, propsBuf []byte) blockPropertiesDecoder {
604 2 : return blockPropertiesDecoder{
605 2 : props: propsBuf,
606 2 : numCollectedProps: numCollectedProps,
607 2 : }
608 2 : }
609 :
610 2 : func (d *blockPropertiesDecoder) Done() bool {
611 2 : return int(d.nextID) >= d.numCollectedProps
612 2 : }
613 :
614 : // Next returns the property for each shortID between 0 and numCollectedProps-1, in order.
615 : // Note that some properties might be empty.
616 : // REQUIRES: !Done()
617 2 : func (d *blockPropertiesDecoder) Next() (id shortID, prop []byte, err error) {
618 2 : id = d.nextID
619 2 : d.nextID++
620 2 :
621 2 : if len(d.props) == 0 || shortID(d.props[0]) != id {
622 2 : if invariants.Enabled && len(d.props) > 0 && shortID(d.props[0]) < id {
623 0 : panic("shortIDs are not in order")
624 : }
625 : // This property was omitted because it was empty.
626 2 : return id, nil, nil
627 : }
628 :
629 2 : const lenID = 1
630 2 : propLen, m := binary.Uvarint(d.props[lenID:])
631 2 : n := lenID + m
632 2 : if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) {
633 0 : return 0, nil, base.CorruptionErrorf("corrupt block property length")
634 0 : }
635 2 : prop = d.props[n : n+int(propLen)]
636 2 : d.props = d.props[n+int(propLen):]
637 2 : return id, prop, nil
638 : }
639 :
640 : // BlockPropertiesFilterer provides filtering support when reading an sstable
641 : // in the context of an iterator that has a slice of BlockPropertyFilters.
642 : // After the call to NewBlockPropertiesFilterer, the caller must call
643 : // IntersectsUserPropsAndFinishInit to check if the sstable intersects with
644 : // the filters. If it does intersect, this function also finishes initializing
645 : // the BlockPropertiesFilterer using the shortIDs for the relevant filters.
646 : // Subsequent checks for relevance of a block should use the intersects
647 : // method.
648 : type BlockPropertiesFilterer struct {
649 : filters []BlockPropertyFilter
650 : // Maps shortID => index in filters. This can be sparse, and shortIDs for
651 : // which there is no filter are represented with an index of -1. The
652 : // length of this can be shorter than the shortIDs allocated in the
653 : // sstable. e.g. if the sstable used shortIDs 0, 1, 2, 3, and the iterator
654 : // has two filters, corresponding to shortIDs 2, 0, this would be:
655 : // len(shortIDToFiltersIndex)==3, 0=>1, 1=>-1, 2=>0.
656 : shortIDToFiltersIndex []int
657 :
658 : // boundLimitedFilter, if non-nil, holds a single block-property filter with
659 : // additional constraints on its filtering. A boundLimitedFilter may only
660 : // filter blocks that are wholly contained within its bounds. During forward
661 : // iteration the lower bound (and during backward iteration the upper bound)
662 : // must be externally guaranteed, with Intersects only returning false if
663 : // that bound is met. The opposite bound is verified during iteration by the
664 : // sstable iterator.
665 : //
666 : // boundLimitedFilter is permitted to be defined on a property (`Name()`)
667 : // for which another filter exists in filters. In this case both filters
668 : // will be consulted, and either filter may exclude block(s). Only a single
669 : // bound-limited block-property filter may be set.
670 : //
671 : // The boundLimitedShortID field contains the shortID of the filter's
672 : // property within the sstable. It's set to -1 if the property was not
673 : // collected when the table was built.
674 : boundLimitedFilter BoundLimitedBlockPropertyFilter
675 : boundLimitedShortID int
676 :
677 : syntheticSuffix SyntheticSuffix
678 : }
679 :
680 : var blockPropertiesFiltererPool = sync.Pool{
681 2 : New: func() interface{} {
682 2 : return &BlockPropertiesFilterer{}
683 2 : },
684 : }
685 :
686 : // newBlockPropertiesFilterer returns a partially initialized filterer. To complete
687 : // initialization, call IntersectsUserPropsAndFinishInit.
688 : func newBlockPropertiesFilterer(
689 : filters []BlockPropertyFilter,
690 : limited BoundLimitedBlockPropertyFilter,
691 : syntheticSuffix SyntheticSuffix,
692 2 : ) *BlockPropertiesFilterer {
693 2 : filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer)
694 2 : *filterer = BlockPropertiesFilterer{
695 2 : filters: filters,
696 2 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
697 2 : boundLimitedFilter: limited,
698 2 : boundLimitedShortID: -1,
699 2 : syntheticSuffix: syntheticSuffix,
700 2 : }
701 2 : return filterer
702 2 : }
703 :
704 2 : func releaseBlockPropertiesFilterer(filterer *BlockPropertiesFilterer) {
705 2 : *filterer = BlockPropertiesFilterer{
706 2 : shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
707 2 : }
708 2 : blockPropertiesFiltererPool.Put(filterer)
709 2 : }
710 :
711 : // IntersectsTable evaluates the provided block-property filter against the
712 : // provided set of table-level properties. If there is no intersection between
713 : // the filters and the table or an error is encountered, IntersectsTable returns
714 : // a nil filterer (and possibly an error). If there is an intersection,
715 : // IntersectsTable returns a non-nil filterer that may be used by an iterator
716 : // reading the table.
717 : func IntersectsTable(
718 : filters []BlockPropertyFilter,
719 : limited BoundLimitedBlockPropertyFilter,
720 : userProperties map[string]string,
721 : syntheticSuffix SyntheticSuffix,
722 2 : ) (*BlockPropertiesFilterer, error) {
723 2 : f := newBlockPropertiesFilterer(filters, limited, syntheticSuffix)
724 2 : ok, err := f.intersectsUserPropsAndFinishInit(userProperties)
725 2 : if !ok || err != nil {
726 2 : releaseBlockPropertiesFilterer(f)
727 2 : return nil, err
728 2 : }
729 2 : return f, nil
730 : }
731 :
732 : // intersectsUserPropsAndFinishInit is called with the user properties map for
733 : // the sstable and returns whether the sstable intersects the filters. It
734 : // additionally initializes the shortIDToFiltersIndex for the filters that are
735 : // relevant to this sstable.
736 : func (f *BlockPropertiesFilterer) intersectsUserPropsAndFinishInit(
737 : userProperties map[string]string,
738 2 : ) (bool, error) {
739 2 : for i := range f.filters {
740 2 : props, ok := userProperties[f.filters[i].Name()]
741 2 : if !ok {
742 2 : // Collector was not used when writing this file, so it is
743 2 : // considered intersecting.
744 2 : continue
745 : }
746 2 : if len(props) < 1 {
747 0 : return false, base.CorruptionErrorf(
748 0 : "block properties for %s is corrupted", f.filters[i].Name())
749 0 : }
750 2 : shortID := shortID(props[0])
751 2 : {
752 2 : // Use an unsafe conversion to avoid allocating. Intersects() is not
753 2 : // supposed to modify the given slice.
754 2 : // Note that unsafe.StringData only works if the string is not empty
755 2 : // (which we already checked).
756 2 : byteProps := unsafe.Slice(unsafe.StringData(props), len(props))
757 2 : var intersects bool
758 2 : var err error
759 2 : if len(f.syntheticSuffix) == 0 {
760 2 : intersects, err = f.filters[i].Intersects(byteProps[1:])
761 2 : } else {
762 2 : intersects, err = f.filters[i].SyntheticSuffixIntersects(byteProps[1:], f.syntheticSuffix)
763 2 : }
764 2 : if err != nil || !intersects {
765 2 : return false, err
766 2 : }
767 : }
768 : // Intersects the sstable, so need to use this filter when
769 : // deciding whether to read blocks.
770 2 : n := len(f.shortIDToFiltersIndex)
771 2 : if n <= int(shortID) {
772 2 : if cap(f.shortIDToFiltersIndex) <= int(shortID) {
773 2 : index := make([]int, shortID+1, 2*(shortID+1))
774 2 : copy(index, f.shortIDToFiltersIndex)
775 2 : f.shortIDToFiltersIndex = index
776 2 : } else {
777 2 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:shortID+1]
778 2 : }
779 2 : for j := n; j < int(shortID); j++ {
780 2 : f.shortIDToFiltersIndex[j] = -1
781 2 : }
782 : }
783 2 : f.shortIDToFiltersIndex[shortID] = i
784 : }
785 2 : if f.boundLimitedFilter == nil {
786 2 : return true, nil
787 2 : }
788 :
789 : // There's a bound-limited filter. Find its shortID. It's possible that
790 : // there's an existing filter in f.filters on the same property. That's
791 : // okay. Both filters will be consulted whenever a relevant prop is decoded.
792 2 : props, ok := userProperties[f.boundLimitedFilter.Name()]
793 2 : if !ok {
794 2 : // The collector was not used when writing this file, so it's
795 2 : // intersecting. We leave f.boundLimitedShortID=-1, so the filter will
796 2 : // be unused within this file.
797 2 : return true, nil
798 2 : }
799 2 : if len(props) < 1 {
800 0 : return false, base.CorruptionErrorf(
801 0 : "block properties for %s is corrupted", f.boundLimitedFilter.Name())
802 0 : }
803 2 : f.boundLimitedShortID = int(props[0])
804 2 :
805 2 : // We don't check for table-level intersection for the bound-limited filter.
806 2 : // The bound-limited filter is treated as vacuously intersecting.
807 2 : //
808 2 : // NB: If a block-property filter needs to be toggled inactive/active, it
809 2 : // should be implemented within the Intersects implementation.
810 2 : //
811 2 : // TODO(jackson): We could filter at the table-level by threading the table
812 2 : // smallest and largest bounds here.
813 2 :
814 2 : // The bound-limited filter isn't included in shortIDToFiltersIndex.
815 2 : //
816 2 : // When determining intersection, we decode props only up to the shortID
817 2 : // len(shortIDToFiltersIndex). If f.limitedShortID is greater than any of
818 2 : // the existing filters' shortIDs, we need to grow shortIDToFiltersIndex.
819 2 : // Growing the index with -1s ensures we're able to consult the index
820 2 : // without length checks.
821 2 : if n := len(f.shortIDToFiltersIndex); n <= f.boundLimitedShortID {
822 2 : if cap(f.shortIDToFiltersIndex) <= f.boundLimitedShortID {
823 2 : index := make([]int, f.boundLimitedShortID+1)
824 2 : copy(index, f.shortIDToFiltersIndex)
825 2 : f.shortIDToFiltersIndex = index
826 2 : } else {
827 2 : f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:f.boundLimitedShortID+1]
828 2 : }
829 2 : for j := n; j <= f.boundLimitedShortID; j++ {
830 2 : f.shortIDToFiltersIndex[j] = -1
831 2 : }
832 : }
833 2 : return true, nil
834 : }
835 :
836 : type intersectsResult int8
837 :
838 : const (
839 : blockIntersects intersectsResult = iota
840 : blockExcluded
841 : // blockMaybeExcluded is returned by BlockPropertiesFilterer.intersects when
842 : // no filters unconditionally exclude the block, but the bound-limited block
843 : // property filter will exclude it if the block's bounds fall within the
844 : // filter's current bounds. See the reader's
845 : // {single,two}LevelIterator.resolveMaybeExcluded methods.
846 : blockMaybeExcluded
847 : )
848 :
849 2 : func (f *BlockPropertiesFilterer) intersects(props []byte) (intersectsResult, error) {
850 2 : decoder := makeBlockPropertiesDecoder(len(f.shortIDToFiltersIndex), props)
851 2 : ret := blockIntersects
852 2 : for !decoder.Done() {
853 2 : id, prop, err := decoder.Next()
854 2 : if err != nil {
855 0 : return ret, err
856 0 : }
857 2 : intersects, err := f.intersectsFilter(id, prop)
858 2 : if err != nil {
859 0 : return ret, err
860 0 : }
861 2 : if intersects == blockExcluded {
862 2 : return blockExcluded, nil
863 2 : }
864 2 : if intersects == blockMaybeExcluded {
865 2 : ret = blockMaybeExcluded
866 2 : }
867 : }
868 : // ret is either blockIntersects or blockMaybeExcluded.
869 2 : return ret, nil
870 : }
871 :
872 : func (f *BlockPropertiesFilterer) intersectsFilter(
873 : id shortID, prop []byte,
874 2 : ) (intersectsResult, error) {
875 2 : var intersects bool
876 2 : var err error
877 2 : if filterIdx := f.shortIDToFiltersIndex[id]; filterIdx >= 0 {
878 2 : if !f.syntheticSuffix.IsSet() {
879 2 : intersects, err = f.filters[filterIdx].Intersects(prop)
880 2 : } else {
881 2 : intersects, err = f.filters[filterIdx].SyntheticSuffixIntersects(prop, f.syntheticSuffix)
882 2 : }
883 2 : if err != nil {
884 0 : return blockIntersects, err
885 0 : }
886 2 : if !intersects {
887 2 : return blockExcluded, nil
888 2 : }
889 : }
890 2 : if int(id) == f.boundLimitedShortID {
891 2 : // The bound-limited filter uses this id.
892 2 : //
893 2 : // The bound-limited filter only applies within a keyspan interval. We
894 2 : // expect the Intersects call to be cheaper than bounds checks. If
895 2 : // Intersects determines that there is no intersection, we return
896 2 : // `blockMaybeExcluded` if no other bpf unconditionally excludes the
897 2 : // block.
898 2 : if !f.syntheticSuffix.IsSet() {
899 2 : intersects, err = f.boundLimitedFilter.Intersects(prop)
900 2 : } else {
901 2 : intersects, err = f.boundLimitedFilter.SyntheticSuffixIntersects(prop, f.syntheticSuffix)
902 2 : }
903 2 : if err != nil {
904 0 : return blockIntersects, err
905 2 : } else if !intersects {
906 2 : return blockMaybeExcluded, nil
907 2 : }
908 : }
909 2 : return blockIntersects, nil
910 : }
911 :
912 2 : func uvarintLen(v uint32) int {
913 2 : i := 0
914 2 : for v >= 0x80 {
915 1 : v >>= 7
916 1 : i++
917 1 : }
918 2 : return i + 1
919 : }
|