Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "context"
9 :
10 : "github.com/cockroachdb/errors"
11 : "github.com/cockroachdb/pebble/internal/base"
12 : "github.com/cockroachdb/pebble/internal/invariants"
13 : "github.com/cockroachdb/pebble/internal/keyspan"
14 : "github.com/cockroachdb/pebble/internal/manifest"
15 : "github.com/cockroachdb/pebble/internal/treeprinter"
16 : "github.com/cockroachdb/pebble/sstable"
17 : )
18 :
19 : // constructRangeKeyIter constructs the range-key iterator stack, populating
20 : // i.rangeKey.rangeKeyIter with the resulting iterator.
21 1 : func (i *Iterator) constructRangeKeyIter() {
22 1 : i.rangeKey.rangeKeyIter = i.rangeKey.iterConfig.Init(
23 1 : &i.comparer, i.seqNum, i.opts.LowerBound, i.opts.UpperBound,
24 1 : &i.hasPrefix, &i.prefixOrFullSeekKey, false /* internalKeys */, &i.rangeKey.rangeKeyBuffers.internal)
25 1 :
26 1 : if i.opts.DebugRangeKeyStack {
27 0 : // The default logger is preferable to i.opts.getLogger(), at least in the
28 0 : // metamorphic test.
29 0 : i.rangeKey.rangeKeyIter = keyspan.InjectLogging(i.rangeKey.rangeKeyIter, base.DefaultLogger)
30 0 : }
31 :
32 : // If there's an indexed batch with range keys, include it.
33 1 : if i.batch != nil {
34 1 : if i.batch.index == nil {
35 0 : // This isn't an indexed batch. We shouldn't have gotten this far.
36 0 : panic(errors.AssertionFailedf("creating an iterator over an unindexed batch"))
37 1 : } else {
38 1 : // Only include the batch's range key iterator if it has any keys.
39 1 : // NB: This can force reconstruction of the rangekey iterator stack
40 1 : // in SetOptions if subsequently range keys are added. See
41 1 : // SetOptions.
42 1 : if i.batch.countRangeKeys > 0 {
43 1 : i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, i.batchSeqNum)
44 1 : i.rangeKey.iterConfig.AddLevel(&i.batchRangeKeyIter)
45 1 : }
46 : }
47 : }
48 :
49 1 : if !i.batchOnlyIter {
50 1 : // Next are the flushables: memtables and large batches.
51 1 : if i.readState != nil {
52 1 : for j := len(i.readState.memtables) - 1; j >= 0; j-- {
53 1 : mem := i.readState.memtables[j]
54 1 : // We only need to read from memtables which contain sequence numbers older
55 1 : // than seqNum.
56 1 : if logSeqNum := mem.logSeqNum; logSeqNum >= i.seqNum {
57 1 : continue
58 : }
59 1 : if rki := mem.newRangeKeyIter(&i.opts); rki != nil {
60 1 : i.rangeKey.iterConfig.AddLevel(rki)
61 1 : }
62 : }
63 : }
64 :
65 1 : current := i.version
66 1 : if current == nil {
67 1 : current = i.readState.current
68 1 : }
69 : // Next are the file levels: L0 sub-levels followed by lower levels.
70 :
71 : // Add file-specific iterators for L0 files containing range keys. We
72 : // maintain a separate manifest.LevelMetadata for each level containing only
73 : // files that contain range keys, however we don't compute a separate
74 : // L0Sublevels data structure too.
75 : //
76 : // We first use L0's LevelMetadata to peek and see whether L0 contains any
77 : // range keys at all. If it does, we create a range key level iterator per
78 : // level that contains range keys using the information from L0Sublevels.
79 : // Some sublevels may not contain any range keys, and we need to iterate
80 : // through the fileMetadata to determine that. Since L0's file count should
81 : // not significantly exceed ~1000 files (see L0CompactionFileThreshold),
82 : // this should be okay.
83 1 : if !current.RangeKeyLevels[0].Empty() {
84 1 : // L0 contains at least 1 file containing range keys.
85 1 : // Add level iterators for the L0 sublevels, iterating from newest to
86 1 : // oldest.
87 1 : for j := len(current.L0SublevelFiles) - 1; j >= 0; j-- {
88 1 : iter := current.L0SublevelFiles[j].Iter()
89 1 : if !containsAnyRangeKeys(iter) {
90 1 : continue
91 : }
92 :
93 1 : li := i.rangeKey.iterConfig.NewLevelIter()
94 1 : li.Init(
95 1 : i.ctx,
96 1 : i.opts.SpanIterOptions(),
97 1 : i.cmp,
98 1 : i.newIterRangeKey,
99 1 : iter.Filter(manifest.KeyTypeRange),
100 1 : manifest.L0Sublevel(j),
101 1 : manifest.KeyTypeRange,
102 1 : )
103 1 : i.rangeKey.iterConfig.AddLevel(li)
104 : }
105 : }
106 :
107 : // Add level iterators for the non-empty non-L0 levels.
108 1 : for level := 1; level < len(current.RangeKeyLevels); level++ {
109 1 : if current.RangeKeyLevels[level].Empty() {
110 1 : continue
111 : }
112 1 : li := i.rangeKey.iterConfig.NewLevelIter()
113 1 : spanIterOpts := i.opts.SpanIterOptions()
114 1 : li.Init(i.ctx, spanIterOpts, i.cmp, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(),
115 1 : manifest.Level(level), manifest.KeyTypeRange)
116 1 : i.rangeKey.iterConfig.AddLevel(li)
117 : }
118 : }
119 : }
120 :
121 1 : func containsAnyRangeKeys(iter manifest.LevelIterator) bool {
122 1 : for f := iter.First(); f != nil; f = iter.Next() {
123 1 : if f.HasRangeKeys {
124 1 : return true
125 1 : }
126 : }
127 1 : return false
128 : }
129 :
130 : // Range key masking
131 : //
132 : // Pebble iterators may be configured such that range keys with suffixes mask
133 : // point keys with lower suffixes. The intended use is implementing a MVCC
134 : // delete range operation using range keys, when suffixes are MVCC timestamps.
135 : //
136 : // To enable masking, the user populates the IterOptions's RangeKeyMasking
137 : // field. The Suffix field configures which range keys act as masks. The
138 : // intended use is to hold a MVCC read timestamp. When implementing a MVCC
139 : // delete range operation, only range keys that are visible at the read
140 : // timestamp should be visible. If a range key has a suffix ≤
141 : // RangeKeyMasking.Suffix, it acts as a mask.
142 : //
143 : // Range key masking is facilitated by the keyspan.InterleavingIter. The
144 : // interleaving iterator interleaves range keys and point keys during combined
145 : // iteration. During user iteration, the interleaving iterator is configured
146 : // with a keyspan.SpanMask, implemented by the rangeKeyMasking struct below.
147 : // The SpanMask interface defines two methods: SpanChanged and SkipPoint.
148 : //
149 : // SpanChanged is used to keep the current mask up-to-date. Whenever the point
150 : // iterator has stepped into or out of the bounds of a range key, the
151 : // interleaving iterator invokes SpanChanged passing the current covering range
152 : // key. The below rangeKeyMasking implementation scans the range keys looking
153 : // for the range key with the largest suffix that's still ≤ the suffix supplied
154 : // to IterOptions.RangeKeyMasking.Suffix (the "read timestamp"). If it finds a
155 : // range key that meets the condition, the range key should act as a mask. The
156 : // span and the relevant range key's suffix are saved.
157 : //
158 : // The above ensures that `rangeKeyMasking.maskActiveSuffix` always contains the
159 : // current masking suffix such that any point keys with lower suffixes should be
160 : // skipped.
161 : //
162 : // There are two ways in which masked point keys are skipped.
163 : //
164 : // 1. Interleaving iterator SkipPoint
165 : //
166 : // Whenever the interleaving iterator encounters a point key that falls within
167 : // the bounds of a range key, it invokes SkipPoint. The interleaving iterator
168 : // guarantees that the SpanChanged method described above has already been
169 : // invoked with the covering range key. The below rangeKeyMasking implementation
170 : // of SkipPoint splits the key into prefix and suffix, compares the suffix to
171 : // the `maskActiveSuffix` updated by SpanChanged and returns true if
172 : // suffix(point) < maskActiveSuffix.
173 : //
174 : // The SkipPoint logic is sufficient to ensure that the Pebble iterator filters
175 : // out all masked point keys. However, it requires the iterator read each masked
176 : // point key. For broad range keys that mask many points, this may be expensive.
177 : //
178 : // 2. Block property filter
179 : //
180 : // For more efficient handling of braad range keys that mask many points, the
181 : // IterOptions.RangeKeyMasking field has an optional Filter option. This Filter
182 : // field takes a superset of the block-property filter interface, adding a
183 : // method to dynamically configure the filter's filtering criteria.
184 : //
185 : // To make use of the Filter option, the user is required to define and
186 : // configure a block-property collector that collects a property containing at
187 : // least the maximum suffix of a key within a block.
188 : //
189 : // When the SpanChanged method described above is invoked, rangeKeyMasking also
190 : // reconfigures the user-provided filter. It invokes a SetSuffix method,
191 : // providing the `maskActiveSuffix`, requesting that from now on the
192 : // block-property filter return Intersects()=false for any properties indicating
193 : // that a block contains exclusively keys with suffixes greater than the
194 : // provided suffix.
195 : //
196 : // Note that unlike other block-property filters, the filter used for masking
197 : // must not apply across the entire keyspace. It must only filter blocks that
198 : // lie within the bounds of the range key that set the mask suffix. To
199 : // accommodate this, rangeKeyMasking implements a special interface:
200 : // sstable.BoundLimitedBlockPropertyFilter. This interface extends the block
201 : // property filter interface with two new methods: KeyIsWithinLowerBound and
202 : // KeyIsWithinUpperBound. The rangeKeyMasking type wraps the user-provided block
203 : // property filter, implementing these two methods and overriding Intersects to
204 : // always return true if there is no active mask.
205 : //
206 : // The logic to ensure that a mask block-property filter is only applied within
207 : // the bounds of the masking range key is subtle. The interleaving iterator
208 : // guarantees that it never invokes SpanChanged until the point iterator is
209 : // positioned within the range key. During forward iteration, this guarantees
210 : // that any block that a sstable reader might attempt to load contains only keys
211 : // greater than or equal to the range key's lower bound. During backward
212 : // iteration, it provides the analagous guarantee on the range key's upper
213 : // bound.
214 : //
215 : // The above ensures that an sstable reader only needs to verify that a block
216 : // that it skips meets the opposite bound. This is where the
217 : // KeyIsWithinLowerBound and KeyIsWithinUpperBound methods are used. When an
218 : // sstable iterator is configured with a BoundLimitedBlockPropertyFilter, it
219 : // checks for intersection with the block-property filter before every block
220 : // load, like ordinary block-property filters. However, if the bound-limited
221 : // block property filter indicates that it does NOT intersect, the filter's
222 : // relevant KeyIsWithin{Lower,Upper}Bound method is queried, using a block
223 : // index separator as the bound. If the method indicates that the provided index
224 : // separator does not fall within the range key bounds, the no-intersection
225 : // result is ignored, and the block is read.
226 :
227 : type rangeKeyMasking struct {
228 : cmp base.Compare
229 : suffixCmp base.CompareRangeSuffixes
230 : split base.Split
231 : filter BlockPropertyFilterMask
232 : // maskActiveSuffix holds the suffix of a range key currently acting as a
233 : // mask, hiding point keys with suffixes greater than it. maskActiveSuffix
234 : // is only ever non-nil if IterOptions.RangeKeyMasking.Suffix is non-nil.
235 : // maskActiveSuffix is updated whenever the iterator passes over a new range
236 : // key. The maskActiveSuffix should only be used if maskSpan is non-nil.
237 : //
238 : // See SpanChanged.
239 : maskActiveSuffix []byte
240 : // maskSpan holds the span from which the active mask suffix was extracted.
241 : // The span is used for bounds comparisons, to ensure that a range-key mask
242 : // is not applied beyond the bounds of the range key.
243 : maskSpan *keyspan.Span
244 : parent *Iterator
245 : }
246 :
247 1 : func (m *rangeKeyMasking) init(parent *Iterator, c *base.Comparer) {
248 1 : m.cmp = c.Compare
249 1 : m.suffixCmp = c.CompareRangeSuffixes
250 1 : m.split = c.Split
251 1 : if parent.opts.RangeKeyMasking.Filter != nil {
252 1 : m.filter = parent.opts.RangeKeyMasking.Filter()
253 1 : }
254 1 : m.parent = parent
255 : }
256 :
257 : // SpanChanged implements the keyspan.SpanMask interface, used during range key
258 : // iteration.
259 1 : func (m *rangeKeyMasking) SpanChanged(s *keyspan.Span) {
260 1 : if s == nil && m.maskSpan == nil {
261 1 : return
262 1 : }
263 1 : m.maskSpan = nil
264 1 : m.maskActiveSuffix = m.maskActiveSuffix[:0]
265 1 :
266 1 : // Find the smallest suffix of a range key contained within the Span,
267 1 : // excluding suffixes less than m.opts.RangeKeyMasking.Suffix.
268 1 : if s != nil {
269 1 : m.parent.rangeKey.stale = true
270 1 : if m.parent.opts.RangeKeyMasking.Suffix != nil {
271 1 : for j := range s.Keys {
272 1 : if s.Keys[j].Suffix == nil {
273 1 : continue
274 : }
275 1 : if m.suffixCmp(s.Keys[j].Suffix, m.parent.opts.RangeKeyMasking.Suffix) < 0 {
276 1 : continue
277 : }
278 1 : if len(m.maskActiveSuffix) == 0 || m.suffixCmp(m.maskActiveSuffix, s.Keys[j].Suffix) > 0 {
279 1 : m.maskSpan = s
280 1 : m.maskActiveSuffix = append(m.maskActiveSuffix[:0], s.Keys[j].Suffix...)
281 1 : }
282 : }
283 : }
284 : }
285 :
286 1 : if m.maskSpan != nil && m.parent.opts.RangeKeyMasking.Filter != nil {
287 1 : // Update the block-property filter to filter point keys with suffixes
288 1 : // greater than m.maskActiveSuffix.
289 1 : err := m.filter.SetSuffix(m.maskActiveSuffix)
290 1 : if err != nil {
291 0 : m.parent.err = err
292 0 : }
293 : }
294 : // If no span is active, we leave the inner block-property filter configured
295 : // with its existing suffix. That's okay, because Intersects calls are first
296 : // evaluated by iteratorRangeKeyState.Intersects, which considers all blocks
297 : // as intersecting if there's no active mask.
298 : }
299 :
300 : // SkipPoint implements the keyspan.SpanMask interface, used during range key
301 : // iteration. Whenever a point key is covered by a non-empty Span, the
302 : // interleaving iterator invokes SkipPoint. This function is responsible for
303 : // performing range key masking.
304 : //
305 : // If a non-nil IterOptions.RangeKeyMasking.Suffix is set, range key masking is
306 : // enabled. Masking hides point keys, transparently skipping over the keys.
307 : // Whether or not a point key is masked is determined by comparing the point
308 : // key's suffix, the overlapping span's keys' suffixes, and the user-configured
309 : // IterOption's RangeKeyMasking.Suffix. When configured with a masking threshold
310 : // _t_, and there exists a span with suffix _r_ covering a point key with suffix
311 : // _p_, and
312 : //
313 : // _t_ ≤ _r_ < _p_
314 : //
315 : // then the point key is elided. Consider the following rendering, where using
316 : // integer suffixes with higher integers sort before suffixes with lower
317 : // integers, (for example @7 ≤ @6 < @5):
318 : //
319 : // ^
320 : // @9 | •―――――――――――――――○ [e,m)@9
321 : // s 8 | • l@8
322 : // u 7 |------------------------------------ @7 RangeKeyMasking.Suffix
323 : // f 6 | [h,q)@6 •―――――――――――――――――○ (threshold)
324 : // f 5 | • h@5
325 : // f 4 | • n@4
326 : // i 3 | •―――――――――――○ [f,l)@3
327 : // x 2 | • b@2
328 : // 1 |
329 : // 0 |___________________________________
330 : // a b c d e f g h i j k l m n o p q
331 : //
332 : // An iterator scanning the entire keyspace with the masking threshold set to @7
333 : // will observe point keys b@2 and l@8. The span keys [h,q)@6 and [f,l)@3 serve
334 : // as masks, because cmp(@6,@7) ≥ 0 and cmp(@3,@7) ≥ 0. The span key [e,m)@9
335 : // does not serve as a mask, because cmp(@9,@7) < 0.
336 : //
337 : // Although point l@8 falls within the user key bounds of [e,m)@9, [e,m)@9 is
338 : // non-masking due to its suffix. The point key l@8 also falls within the user
339 : // key bounds of [h,q)@6, but since cmp(@6,@8) ≥ 0, l@8 is unmasked.
340 : //
341 : // Invariant: The userKey is within the user key bounds of the span most
342 : // recently provided to `SpanChanged`.
343 1 : func (m *rangeKeyMasking) SkipPoint(userKey []byte) bool {
344 1 : m.parent.stats.RangeKeyStats.ContainedPoints++
345 1 : if m.maskSpan == nil {
346 1 : // No range key is currently acting as a mask, so don't skip.
347 1 : return false
348 1 : }
349 : // Range key masking is enabled and the current span includes a range key
350 : // that is being used as a mask. (NB: SpanChanged already verified that the
351 : // range key's suffix is ≥ RangeKeyMasking.Suffix).
352 : //
353 : // This point key falls within the bounds of the range key (guaranteed by
354 : // the InterleavingIter). Skip the point key if the range key's suffix is
355 : // greater than the point key's suffix.
356 1 : pointSuffix := userKey[m.split(userKey):]
357 1 : if len(pointSuffix) > 0 && m.suffixCmp(m.maskActiveSuffix, pointSuffix) < 0 {
358 1 : m.parent.stats.RangeKeyStats.SkippedPoints++
359 1 : return true
360 1 : }
361 1 : return false
362 : }
363 :
364 : // The iteratorRangeKeyState type implements the sstable package's
365 : // BoundLimitedBlockPropertyFilter interface in order to use block property
366 : // filters for range key masking. The iteratorRangeKeyState implementation wraps
367 : // the block-property filter provided in Options.RangeKeyMasking.Filter.
368 : //
369 : // Using a block-property filter for range-key masking requires limiting the
370 : // filter's effect to the bounds of the range key currently acting as a mask.
371 : // Consider the range key [a,m)@10, and an iterator positioned just before the
372 : // below block, bounded by index separators `c` and `z`:
373 : //
374 : // c z
375 : // x | c@9 c@5 c@1 d@7 e@4 y@4 | ...
376 : // iter pos
377 : //
378 : // The next block cannot be skipped, despite the range key suffix @10 is greater
379 : // than all the block's keys' suffixes, because it contains a key (y@4) outside
380 : // the bounds of the range key.
381 : //
382 : // This extended BoundLimitedBlockPropertyFilter interface adds two new methods,
383 : // KeyIsWithinLowerBound and KeyIsWithinUpperBound, for testing whether a
384 : // particular block is within bounds.
385 : //
386 : // The iteratorRangeKeyState implements these new methods by first checking if
387 : // the iterator is currently positioned within a range key. If not, the provided
388 : // key is considered out-of-bounds. If the iterator is positioned within a range
389 : // key, it compares the corresponding range key bound.
390 : var _ sstable.BoundLimitedBlockPropertyFilter = (*rangeKeyMasking)(nil)
391 :
392 : // Name implements the limitedBlockPropertyFilter interface defined in the
393 : // sstable package by passing through to the user-defined block property filter.
394 1 : func (m *rangeKeyMasking) Name() string {
395 1 : return m.filter.Name()
396 1 : }
397 :
398 : // Intersects implements the limitedBlockPropertyFilter interface defined in the
399 : // sstable package by passing the intersection decision to the user-provided
400 : // block property filter only if a range key is covering the current iterator
401 : // position.
402 1 : func (m *rangeKeyMasking) Intersects(prop []byte) (bool, error) {
403 1 : if m.maskSpan == nil {
404 1 : // No span is actively masking.
405 1 : return true, nil
406 1 : }
407 1 : return m.filter.Intersects(prop)
408 : }
409 :
410 1 : func (m *rangeKeyMasking) SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) {
411 1 : if m.maskSpan == nil {
412 1 : // No span is actively masking.
413 1 : return true, nil
414 1 : }
415 1 : return m.filter.SyntheticSuffixIntersects(prop, suffix)
416 : }
417 :
418 : // KeyIsWithinLowerBound implements the limitedBlockPropertyFilter interface
419 : // defined in the sstable package. It's used to restrict the masking block
420 : // property filter to only applying within the bounds of the active range key.
421 1 : func (m *rangeKeyMasking) KeyIsWithinLowerBound(key []byte) bool {
422 1 : // Invariant: m.maskSpan != nil
423 1 : //
424 1 : // The provided `key` is an inclusive lower bound of the block we're
425 1 : // considering skipping.
426 1 : return m.cmp(m.maskSpan.Start, key) <= 0
427 1 : }
428 :
429 : // KeyIsWithinUpperBound implements the limitedBlockPropertyFilter interface
430 : // defined in the sstable package. It's used to restrict the masking block
431 : // property filter to only applying within the bounds of the active range key.
432 1 : func (m *rangeKeyMasking) KeyIsWithinUpperBound(key []byte) bool {
433 1 : // Invariant: m.maskSpan != nil
434 1 : //
435 1 : // The provided `key` is an *inclusive* upper bound of the block we're
436 1 : // considering skipping, so the range key's end must be strictly greater
437 1 : // than the block bound for the block to be within bounds.
438 1 : return m.cmp(m.maskSpan.End, key) > 0
439 1 : }
440 :
441 : // lazyCombinedIter implements the internalIterator interface, wrapping a
442 : // pointIter. It requires the pointIter's the levelIters be configured with
443 : // pointers to its combinedIterState. When the levelIter observes a file
444 : // containing a range key, the lazyCombinedIter constructs the combined
445 : // range+point key iterator stack and switches to it.
446 : type lazyCombinedIter struct {
447 : // parent holds a pointer to the root *pebble.Iterator containing this
448 : // iterator. It's used to mutate the internalIterator in use when switching
449 : // to combined iteration.
450 : parent *Iterator
451 : pointIter internalIterator
452 : combinedIterState combinedIterState
453 : }
454 :
455 : // combinedIterState encapsulates the current state of combined iteration.
456 : // Various low-level iterators (mergingIter, leveliter) hold pointers to the
457 : // *pebble.Iterator's combinedIterState. This allows them to check whether or
458 : // not they must monitor for files containing range keys (!initialized), or not.
459 : //
460 : // When !initialized, low-level iterators watch for files containing range keys.
461 : // When one is discovered, they set triggered=true and key to the smallest
462 : // (forward direction) or largest (reverse direction) range key that's been
463 : // observed.
464 : type combinedIterState struct {
465 : // key holds the smallest (forward direction) or largest (backward
466 : // direction) user key from a range key bound discovered during the iterator
467 : // operation that triggered the switch to combined iteration.
468 : //
469 : // Slices stored here must be stable. This is possible because callers pass
470 : // a Smallest/Largest bound from a fileMetadata, which are immutable. A key
471 : // slice's bytes must not be overwritten.
472 : key []byte
473 : triggered bool
474 : initialized bool
475 : }
476 :
477 : // Assert that *lazyCombinedIter implements internalIterator.
478 : var _ internalIterator = (*lazyCombinedIter)(nil)
479 :
480 : // initCombinedIteration is invoked after a pointIter positioning operation
481 : // resulted in i.combinedIterState.triggered=true.
482 : //
483 : // The `dir` parameter is `+1` or `-1` indicating forward iteration or backward
484 : // iteration respectively.
485 : //
486 : // The `pointKey` and `pointValue` parameters provide the new point key-value
487 : // pair that the iterator was just positioned to. The combined iterator should
488 : // be seeded with this point key-value pair and return the smaller (forward
489 : // iteration) or largest (backward iteration) of the two.
490 : //
491 : // The `seekKey` parameter is non-nil only if the iterator operation that
492 : // triggered the switch to combined iteration was a SeekGE, SeekPrefixGE or
493 : // SeekLT. It provides the seek key supplied and is used to seek the range-key
494 : // iterator using the same key. This is necessary for SeekGE/SeekPrefixGE
495 : // operations that land in the middle of a range key and must truncate to the
496 : // user-provided seek key.
497 : func (i *lazyCombinedIter) initCombinedIteration(
498 : dir int8, pointKV *base.InternalKV, seekKey []byte,
499 1 : ) *base.InternalKV {
500 1 : // Invariant: i.parent.rangeKey is nil.
501 1 : // Invariant: !i.combinedIterState.initialized.
502 1 : if invariants.Enabled {
503 1 : if i.combinedIterState.initialized {
504 0 : panic("pebble: combined iterator already initialized")
505 : }
506 1 : if i.parent.rangeKey != nil {
507 0 : panic("pebble: iterator already has a range-key iterator stack")
508 : }
509 : }
510 :
511 : // We need to determine the key to seek the range key iterator to. If
512 : // seekKey is not nil, the user-initiated operation that triggered the
513 : // switch to combined iteration was itself a seek, and we can use that key.
514 : // Otherwise, a First/Last or relative positioning operation triggered the
515 : // switch to combined iteration.
516 : //
517 : // The levelIter that observed a file containing range keys populated
518 : // combinedIterState.key with the smallest (forward) or largest (backward)
519 : // range key it observed. If multiple levelIters observed files with range
520 : // keys during the same operation on the mergingIter, combinedIterState.key
521 : // is the smallest [during forward iteration; largest in reverse iteration]
522 : // such key.
523 1 : if seekKey == nil {
524 1 : // Use the levelIter-populated key.
525 1 : seekKey = i.combinedIterState.key
526 1 :
527 1 : // We may need to adjust the levelIter-populated seek key to the
528 1 : // surfaced point key. If the key observed is beyond [in the iteration
529 1 : // direction] the current point key, there may still exist a range key
530 1 : // at an earlier key. Consider the following example:
531 1 : //
532 1 : // L5: 000003:[bar.DEL.5, foo.RANGEKEYSET.9]
533 1 : // L6: 000001:[bar.SET.2] 000002:[bax.RANGEKEYSET.8]
534 1 : //
535 1 : // A call to First() seeks the levels to files L5.000003 and L6.000001.
536 1 : // The L5 levelIter observes that L5.000003 contains the range key with
537 1 : // start key `foo`, and triggers a switch to combined iteration, setting
538 1 : // `combinedIterState.key` = `foo`.
539 1 : //
540 1 : // The L6 levelIter did not observe the true first range key
541 1 : // (bax.RANGEKEYSET.8), because it appears in a later sstable. When the
542 1 : // combined iterator is initialized, the range key iterator must be
543 1 : // seeked to a key that will find `bax`. To accomplish this, we seek the
544 1 : // key instead to `bar`. It is guaranteed that no range key exists
545 1 : // earlier than `bar`, otherwise a levelIter would've observed it and
546 1 : // set `combinedIterState.key` to its start key.
547 1 : if pointKV != nil {
548 1 : if dir == +1 && i.parent.cmp(i.combinedIterState.key, pointKV.K.UserKey) > 0 {
549 1 : seekKey = pointKV.K.UserKey
550 1 : } else if dir == -1 && i.parent.cmp(seekKey, pointKV.K.UserKey) < 0 {
551 1 : seekKey = pointKV.K.UserKey
552 1 : }
553 : }
554 : }
555 :
556 : // An operation on the point iterator observed a file containing range keys,
557 : // so we must switch to combined interleaving iteration. First, construct
558 : // the range key iterator stack. It must not exist, otherwise we'd already
559 : // be performing combined iteration.
560 1 : i.parent.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState)
561 1 : i.parent.constructRangeKeyIter()
562 1 :
563 1 : // Initialize the Iterator's interleaving iterator.
564 1 : i.parent.rangeKey.iiter.Init(
565 1 : &i.parent.comparer, i.parent.pointIter, i.parent.rangeKey.rangeKeyIter,
566 1 : keyspan.InterleavingIterOpts{
567 1 : Mask: &i.parent.rangeKeyMasking,
568 1 : LowerBound: i.parent.opts.LowerBound,
569 1 : UpperBound: i.parent.opts.UpperBound,
570 1 : })
571 1 :
572 1 : // Set the parent's primary iterator to point to the combined, interleaving
573 1 : // iterator that's now initialized with our current state.
574 1 : i.parent.iter = &i.parent.rangeKey.iiter
575 1 : i.combinedIterState.initialized = true
576 1 : i.combinedIterState.key = nil
577 1 :
578 1 : // All future iterator operations will go directly through the combined
579 1 : // iterator.
580 1 : //
581 1 : // Initialize the interleaving iterator. We pass the point key-value pair so
582 1 : // that the interleaving iterator knows where the point iterator is
583 1 : // positioned. Additionally, we pass the seek key to which the range-key
584 1 : // iterator should be seeked in order to initialize its position.
585 1 : //
586 1 : // In the forward direction (invert for backwards), the seek key is a key
587 1 : // guaranteed to find the smallest range key that's greater than the last
588 1 : // key the iterator returned. The range key may be less than pointKV, in
589 1 : // which case the range key will be interleaved next instead of the point
590 1 : // key.
591 1 : if dir == +1 {
592 1 : var prefix []byte
593 1 : if i.parent.hasPrefix {
594 1 : prefix = i.parent.prefixOrFullSeekKey
595 1 : }
596 1 : return i.parent.rangeKey.iiter.InitSeekGE(prefix, seekKey, pointKV)
597 : }
598 1 : return i.parent.rangeKey.iiter.InitSeekLT(seekKey, pointKV)
599 : }
600 :
601 1 : func (i *lazyCombinedIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
602 1 : if i.combinedIterState.initialized {
603 0 : return i.parent.rangeKey.iiter.SeekGE(key, flags)
604 0 : }
605 1 : kv := i.pointIter.SeekGE(key, flags)
606 1 : if i.combinedIterState.triggered {
607 1 : return i.initCombinedIteration(+1, kv, key)
608 1 : }
609 1 : return kv
610 : }
611 :
612 : func (i *lazyCombinedIter) SeekPrefixGE(
613 : prefix, key []byte, flags base.SeekGEFlags,
614 1 : ) *base.InternalKV {
615 1 : if i.combinedIterState.initialized {
616 0 : return i.parent.rangeKey.iiter.SeekPrefixGE(prefix, key, flags)
617 0 : }
618 1 : kv := i.pointIter.SeekPrefixGE(prefix, key, flags)
619 1 : if i.combinedIterState.triggered {
620 1 : return i.initCombinedIteration(+1, kv, key)
621 1 : }
622 1 : return kv
623 : }
624 :
625 1 : func (i *lazyCombinedIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
626 1 : if i.combinedIterState.initialized {
627 0 : return i.parent.rangeKey.iiter.SeekLT(key, flags)
628 0 : }
629 1 : kv := i.pointIter.SeekLT(key, flags)
630 1 : if i.combinedIterState.triggered {
631 1 : return i.initCombinedIteration(-1, kv, key)
632 1 : }
633 1 : return kv
634 : }
635 :
636 1 : func (i *lazyCombinedIter) First() *base.InternalKV {
637 1 : if i.combinedIterState.initialized {
638 0 : return i.parent.rangeKey.iiter.First()
639 0 : }
640 1 : kv := i.pointIter.First()
641 1 : if i.combinedIterState.triggered {
642 1 : return i.initCombinedIteration(+1, kv, nil)
643 1 : }
644 1 : return kv
645 : }
646 :
647 1 : func (i *lazyCombinedIter) Last() *base.InternalKV {
648 1 : if i.combinedIterState.initialized {
649 0 : return i.parent.rangeKey.iiter.Last()
650 0 : }
651 1 : kv := i.pointIter.Last()
652 1 : if i.combinedIterState.triggered {
653 1 : return i.initCombinedIteration(-1, kv, nil)
654 1 : }
655 1 : return kv
656 : }
657 :
658 1 : func (i *lazyCombinedIter) Next() *base.InternalKV {
659 1 : if i.combinedIterState.initialized {
660 0 : return i.parent.rangeKey.iiter.Next()
661 0 : }
662 1 : kv := i.pointIter.Next()
663 1 : if i.combinedIterState.triggered {
664 1 : return i.initCombinedIteration(+1, kv, nil)
665 1 : }
666 1 : return kv
667 : }
668 :
669 1 : func (i *lazyCombinedIter) NextPrefix(succKey []byte) *base.InternalKV {
670 1 : if i.combinedIterState.initialized {
671 0 : return i.parent.rangeKey.iiter.NextPrefix(succKey)
672 0 : }
673 1 : kv := i.pointIter.NextPrefix(succKey)
674 1 : if i.combinedIterState.triggered {
675 0 : return i.initCombinedIteration(+1, kv, nil)
676 0 : }
677 1 : return kv
678 : }
679 :
680 1 : func (i *lazyCombinedIter) Prev() *base.InternalKV {
681 1 : if i.combinedIterState.initialized {
682 0 : return i.parent.rangeKey.iiter.Prev()
683 0 : }
684 1 : kv := i.pointIter.Prev()
685 1 : if i.combinedIterState.triggered {
686 1 : return i.initCombinedIteration(-1, kv, nil)
687 1 : }
688 1 : return kv
689 : }
690 :
691 1 : func (i *lazyCombinedIter) Error() error {
692 1 : if i.combinedIterState.initialized {
693 0 : return i.parent.rangeKey.iiter.Error()
694 0 : }
695 1 : return i.pointIter.Error()
696 : }
697 :
698 1 : func (i *lazyCombinedIter) Close() error {
699 1 : if i.combinedIterState.initialized {
700 0 : return i.parent.rangeKey.iiter.Close()
701 0 : }
702 1 : return i.pointIter.Close()
703 : }
704 :
705 1 : func (i *lazyCombinedIter) SetBounds(lower, upper []byte) {
706 1 : if i.combinedIterState.initialized {
707 0 : i.parent.rangeKey.iiter.SetBounds(lower, upper)
708 0 : return
709 0 : }
710 1 : i.pointIter.SetBounds(lower, upper)
711 : }
712 :
713 0 : func (i *lazyCombinedIter) SetContext(ctx context.Context) {
714 0 : if i.combinedIterState.initialized {
715 0 : i.parent.rangeKey.iiter.SetContext(ctx)
716 0 : return
717 0 : }
718 0 : i.pointIter.SetContext(ctx)
719 : }
720 :
721 : // DebugTree is part of the InternalIterator interface.
722 0 : func (i *lazyCombinedIter) DebugTree(tp treeprinter.Node) {
723 0 : n := tp.Childf("%T(%p)", i, i)
724 0 : if i.combinedIterState.initialized {
725 0 : i.parent.rangeKey.iiter.DebugTree(n)
726 0 : } else {
727 0 : i.pointIter.DebugTree(n)
728 0 : }
729 : }
730 :
731 0 : func (i *lazyCombinedIter) String() string {
732 0 : if i.combinedIterState.initialized {
733 0 : return i.parent.rangeKey.iiter.String()
734 0 : }
735 0 : return i.pointIter.String()
736 : }
|