Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "context"
9 : "fmt"
10 : "runtime/debug"
11 :
12 : "github.com/cockroachdb/errors"
13 : "github.com/cockroachdb/pebble/internal/base"
14 : "github.com/cockroachdb/pebble/internal/invariants"
15 : "github.com/cockroachdb/pebble/internal/keyspan"
16 : "github.com/cockroachdb/pebble/internal/manifest"
17 : "github.com/cockroachdb/pebble/internal/treeprinter"
18 : )
19 :
20 : // levelIter provides a merged view of the sstables in a level.
21 : //
22 : // levelIter is used during compaction and as part of the Iterator
23 : // implementation. When used as part of the Iterator implementation, level
24 : // iteration needs to "pause" at range deletion boundaries if file contains
25 : // range deletions. In this case, the levelIter uses a keyspan.InterleavingIter
26 : // to materialize InternalKVs at start and end boundaries of range deletions.
27 : // This prevents mergingIter from advancing past the sstable until the sstable
28 : // contains the smallest (or largest for reverse iteration) key in the merged
29 : // heap. Note that mergingIter treats a range deletion tombstone returned by the
30 : // point iterator as a no-op.
31 : type levelIter struct {
32 : // The context is stored here since (a) iterators are expected to be
33 : // short-lived (since they pin sstables), (b) plumbing a context into every
34 : // method is very painful, (c) they do not (yet) respect context
35 : // cancellation and are only used for tracing.
36 : ctx context.Context
37 : logger Logger
38 : comparer *Comparer
39 : // The lower/upper bounds for iteration as specified at creation or the most
40 : // recent call to SetBounds.
41 : lower []byte
42 : upper []byte
43 : // prefix holds the iteration prefix when the most recent absolute
44 : // positioning method was a SeekPrefixGE.
45 : prefix []byte
46 : // The iterator options for the currently open table. If
47 : // tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary
48 : // does not lie within the table bounds.
49 : tableOpts IterOptions
50 : // The layer this levelIter is initialized for. This can be either
51 : // a level L1+, an L0 sublevel, or a flushable ingests layer.
52 : layer manifest.Layer
53 : // combinedIterState may be set when a levelIter is used during user
54 : // iteration. Although levelIter only iterates over point keys, it's also
55 : // responsible for lazily constructing the combined range & point iterator
56 : // when it observes a file containing range keys. If the combined iter
57 : // state's initialized field is true, the iterator is already using combined
58 : // iterator, OR the iterator is not configured to use combined iteration. If
59 : // it's false, the levelIter must set the `triggered` and `key` fields when
60 : // the levelIter passes over a file containing range keys. See the
61 : // lazyCombinedIter for more details.
62 : combinedIterState *combinedIterState
63 : // The iter for the current file. It is nil under any of the following conditions:
64 : // - files.Current() == nil
65 : // - err != nil
66 : // - some other constraint, like the bounds in opts, caused the file at index to not
67 : // be relevant to the iteration.
68 : iter internalIterator
69 : // iterFile holds the current file. It is always equal to l.files.Current().
70 : iterFile *manifest.TableMetadata
71 : newIters tableNewIters
72 : files manifest.LevelIterator
73 : err error
74 :
75 : // When rangeDelIterSetter != nil, the caller requires that this function
76 : // gets called with a range deletion iterator whenever the current file
77 : // changes. The iterator is relinquished to the caller which is responsible
78 : // for closing it.
79 : //
80 : // When rangeDelIterSetter != nil, the levelIter will also interleave the
81 : // boundaries of range deletions among point keys.
82 : rangeDelIterSetter rangeDelIterSetter
83 :
84 : // interleaving is used when rangeDelIterFn != nil to interleave the
85 : // boundaries of range deletions among point keys. When the leve iterator is
86 : // used by a merging iterator, this ensures that we don't advance to a new
87 : // file until the range deletions are no longer needed by other levels.
88 : interleaving keyspan.InterleavingIter
89 :
90 : // internalOpts holds the internal iterator options to pass to the table
91 : // cache when constructing new table iterators.
92 : internalOpts internalIterOpts
93 :
94 : // Scratch space for the obsolete keys filter, when there are no other block
95 : // property filters specified. See the performance note where
96 : // IterOptions.PointKeyFilters is declared.
97 : filtersBuf [1]BlockPropertyFilter
98 :
99 : // exhaustedDir is set to +1 or -1 when the levelIter has been exhausted in
100 : // the forward or backward direction respectively. It is set when the
101 : // underlying data is exhausted or when iteration has reached the upper or
102 : // lower boundary and interleaved a synthetic iterator bound key. When the
103 : // iterator is exhausted and Next or Prev is called, the levelIter uses
104 : // exhaustedDir to determine whether the iterator should step on to the
105 : // first or last key within iteration bounds.
106 : exhaustedDir int8
107 :
108 : // Disable invariant checks even if they are otherwise enabled. Used by tests
109 : // which construct "impossible" situations (e.g. seeking to a key before the
110 : // lower bound).
111 : disableInvariants bool
112 : }
113 :
114 : type rangeDelIterSetter interface {
115 : setRangeDelIter(rangeDelIter keyspan.FragmentIterator)
116 : }
117 :
118 : // levelIter implements the base.InternalIterator interface.
119 : var _ base.InternalIterator = (*levelIter)(nil)
120 :
121 : // newLevelIter returns a levelIter. It is permissible to pass a nil split
122 : // parameter if the caller is never going to call SeekPrefixGE.
123 : func newLevelIter(
124 : ctx context.Context,
125 : opts IterOptions,
126 : comparer *Comparer,
127 : newIters tableNewIters,
128 : files manifest.LevelIterator,
129 : layer manifest.Layer,
130 : internalOpts internalIterOpts,
131 2 : ) *levelIter {
132 2 : l := &levelIter{}
133 2 : l.init(ctx, opts, comparer, newIters, files, layer, internalOpts)
134 2 : return l
135 2 : }
136 :
137 : func (l *levelIter) init(
138 : ctx context.Context,
139 : opts IterOptions,
140 : comparer *Comparer,
141 : newIters tableNewIters,
142 : files manifest.LevelIterator,
143 : layer manifest.Layer,
144 : internalOpts internalIterOpts,
145 2 : ) {
146 2 : l.ctx = ctx
147 2 : l.err = nil
148 2 : l.layer = layer
149 2 : l.logger = opts.getLogger()
150 2 : l.prefix = nil
151 2 : l.lower = opts.LowerBound
152 2 : l.upper = opts.UpperBound
153 2 : l.tableOpts.PointKeyFilters = opts.PointKeyFilters
154 2 : if len(opts.PointKeyFilters) == 0 {
155 2 : l.tableOpts.PointKeyFilters = l.filtersBuf[:0:1]
156 2 : }
157 2 : l.tableOpts.MaximumSuffixProperty = opts.MaximumSuffixProperty
158 2 : l.tableOpts.UseL6Filters = opts.UseL6Filters
159 2 : l.tableOpts.Category = opts.Category
160 2 : l.tableOpts.layer = l.layer
161 2 : l.tableOpts.snapshotForHideObsoletePoints = opts.snapshotForHideObsoletePoints
162 2 : l.comparer = comparer
163 2 : l.iterFile = nil
164 2 : l.newIters = newIters
165 2 : l.files = files
166 2 : l.exhaustedDir = 0
167 2 : l.internalOpts = internalOpts
168 : }
169 :
170 : // initRangeDel puts the level iterator into a mode where it interleaves range
171 : // deletion boundaries with point keys and provides a range deletion iterator
172 : // (through rangeDelIterFn) whenever the current file changes.
173 : //
174 : // The range deletion iterator passed to rangeDelIterFn is relinquished to the
175 : // implementor who is responsible for closing it.
176 2 : func (l *levelIter) initRangeDel(rangeDelSetter rangeDelIterSetter) {
177 2 : l.rangeDelIterSetter = rangeDelSetter
178 2 : }
179 :
180 2 : func (l *levelIter) initCombinedIterState(state *combinedIterState) {
181 2 : l.combinedIterState = state
182 2 : }
183 :
184 2 : func (l *levelIter) maybeTriggerCombinedIteration(file *manifest.TableMetadata, dir int) {
185 2 : // If we encounter a file that contains range keys, we may need to
186 2 : // trigger a switch to combined range-key and point-key iteration,
187 2 : // if the *pebble.Iterator is configured for it. This switch is done
188 2 : // lazily because range keys are intended to be rare, and
189 2 : // constructing the range-key iterator substantially adds to the
190 2 : // cost of iterator construction and seeking.
191 2 : if file == nil || !file.HasRangeKeys {
192 2 : return
193 2 : }
194 :
195 : // If l.combinedIterState.initialized is already true, either the
196 : // iterator is already using combined iteration or the iterator is not
197 : // configured to observe range keys. Either way, there's nothing to do.
198 : // If false, trigger the switch to combined iteration, using the
199 : // file's bounds to seek the range-key iterator appropriately.
200 2 : if l.combinedIterState == nil || l.combinedIterState.initialized {
201 2 : return
202 2 : }
203 :
204 2 : if l.upper != nil && l.comparer.Compare(file.RangeKeyBounds.SmallestUserKey(), l.upper) >= 0 {
205 2 : // Range key bounds are above the upper iteration bound.
206 2 : return
207 2 : }
208 2 : if l.lower != nil && l.comparer.Compare(file.RangeKeyBounds.LargestUserKey(), l.lower) <= 0 {
209 2 : // Range key bounds are below the lower iteration bound.
210 2 : return
211 2 : }
212 2 : if props, ok := file.TableBacking.Properties(); ok && props.NumRangeKeySets == 0 {
213 2 : // We only need to trigger combined iteration if the file contains
214 2 : // RangeKeySets: if there are only Unsets and Dels, the user will observe no
215 2 : // range keys regardless. If this file has table stats available, they'll
216 2 : // tell us whether the file has any RangeKeySets. Otherwise, we must
217 2 : // fallback to assuming it does (given that HasRangeKeys=true).
218 2 : return
219 2 : }
220 :
221 : // The file contains range keys, and we're not using combined iteration yet.
222 : // Trigger a switch to combined iteration. It's possible that a switch has
223 : // already been triggered if multiple levels encounter files containing
224 : // range keys while executing a single mergingIter operation. In this case,
225 : // we need to compare the existing key recorded to l.combinedIterState.key,
226 : // adjusting it if our key is smaller (forward iteration) or larger
227 : // (backward iteration) than the existing key.
228 : //
229 : // These key comparisons are only required during a single high-level
230 : // iterator operation. When the high-level iter op completes,
231 : // iinitialized will be true, and future calls to this function will be
232 : // no-ops.
233 2 : switch dir {
234 2 : case +1:
235 2 : if !l.combinedIterState.triggered {
236 2 : l.combinedIterState.triggered = true
237 2 : l.combinedIterState.key = file.RangeKeyBounds.SmallestUserKey()
238 2 : } else if l.comparer.Compare(l.combinedIterState.key, file.RangeKeyBounds.SmallestUserKey()) > 0 {
239 2 : l.combinedIterState.key = file.RangeKeyBounds.SmallestUserKey()
240 2 : }
241 2 : case -1:
242 2 : if !l.combinedIterState.triggered {
243 2 : l.combinedIterState.triggered = true
244 2 : l.combinedIterState.key = file.RangeKeyBounds.LargestUserKey()
245 2 : } else if l.comparer.Compare(l.combinedIterState.key, file.RangeKeyBounds.LargestUserKey()) < 0 {
246 2 : l.combinedIterState.key = file.RangeKeyBounds.LargestUserKey()
247 2 : }
248 : }
249 : }
250 :
251 2 : func (l *levelIter) findFileGE(key []byte, flags base.SeekGEFlags) *manifest.TableMetadata {
252 2 : // Find the earliest file whose largest key is >= key.
253 2 :
254 2 : // NB: if flags.TrySeekUsingNext()=true, the levelIter must respect it. If
255 2 : // the levelIter is positioned at the key P, it must return a key ≥ P. If
256 2 : // used within a merging iterator, the merging iterator will depend on the
257 2 : // levelIter only moving forward to maintain heap invariants.
258 2 :
259 2 : // Ordinarily we seek the LevelIterator using SeekGE. In some instances, we
260 2 : // Next instead. In other instances, we try Next-ing first, falling back to
261 2 : // seek:
262 2 : // a) flags.TrySeekUsingNext(): The top-level Iterator knows we're seeking
263 2 : // to a key later than the current iterator position. We don't know how
264 2 : // much later the seek key is, so it's possible there are many sstables
265 2 : // between the current position and the seek key. However in most real-
266 2 : // world use cases, the seek key is likely to be nearby. Rather than
267 2 : // performing a log(N) seek through the table metadata, we next a few
268 2 : // times from our existing location. If we don't find a file whose
269 2 : // largest is >= key within a few nexts, we fall back to seeking.
270 2 : //
271 2 : // Note that in this case, the file returned by findFileGE may be
272 2 : // different than the file returned by a raw binary search (eg, when
273 2 : // TrySeekUsingNext=false). This is possible because the most recent
274 2 : // positioning operation may have already determined that previous
275 2 : // files' keys that are ≥ key are all deleted. This information is
276 2 : // encoded within the iterator's current iterator position and is
277 2 : // unavailable to a fresh binary search.
278 2 : //
279 2 : // b) flags.RelativeSeek(): The merging iterator decided to re-seek this
280 2 : // level according to a range tombstone. When lazy combined iteration
281 2 : // is enabled, the level iterator is responsible for watching for
282 2 : // files containing range keys and triggering the switch to combined
283 2 : // iteration when such a file is observed. If a range deletion was
284 2 : // observed in a higher level causing the merging iterator to seek the
285 2 : // level to the range deletion's end key, we need to check whether all
286 2 : // of the files between the old position and the new position contain
287 2 : // any range keys.
288 2 : //
289 2 : // In this scenario, we don't seek the LevelIterator and instead we
290 2 : // Next it, one file at a time, checking each for range keys. The
291 2 : // merging iterator sets this flag to inform us that we're moving
292 2 : // forward relative to the existing position and that we must examine
293 2 : // each intermediate sstable's metadata for lazy-combined iteration.
294 2 : // In this case, we only Next and never Seek. We set nextsUntilSeek=-1
295 2 : // to signal this intention.
296 2 : //
297 2 : // NB: At most one of flags.RelativeSeek() and flags.TrySeekUsingNext() may
298 2 : // be set, because the merging iterator re-seeks relative seeks with
299 2 : // explicitly only the RelativeSeek flag set.
300 2 : var nextsUntilSeek int
301 2 : var nextInsteadOfSeek bool
302 2 : if flags.TrySeekUsingNext() {
303 2 : nextInsteadOfSeek = true
304 2 : nextsUntilSeek = 4 // arbitrary
305 2 : }
306 2 : if flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized {
307 2 : nextInsteadOfSeek = true
308 2 : nextsUntilSeek = -1
309 2 : }
310 :
311 2 : var m *manifest.TableMetadata
312 2 : if nextInsteadOfSeek {
313 2 : m = l.iterFile
314 2 : } else {
315 2 : m = l.files.SeekGE(l.comparer.Compare, key)
316 2 : }
317 : // The below loop has a bit of an unusual organization. There are several
318 : // conditions under which we need to Next to a later file. If none of those
319 : // conditions are met, the file in `m` is okay to return. The loop body is
320 : // structured with a series of if statements, each of which may continue the
321 : // loop to the next file. If none of the statements are met, the end of the
322 : // loop body is a break.
323 2 : for m != nil {
324 2 : if m.HasRangeKeys {
325 2 : l.maybeTriggerCombinedIteration(m, +1)
326 2 :
327 2 : // Some files may only contain range keys, which we can skip.
328 2 : // NB: HasPointKeys=true if the file contains any points or range
329 2 : // deletions (which delete points).
330 2 : if !m.HasPointKeys {
331 2 : m = l.files.Next()
332 2 : continue
333 : }
334 : }
335 :
336 : // This file has point keys.
337 : //
338 : // However, there are a couple reasons why `m` may not be positioned ≥
339 : // `key` yet:
340 : //
341 : // 1. If SeekGE(key) landed on a file containing range keys, the file
342 : // may contain range keys ≥ `key` but no point keys ≥ `key`.
343 : // 2. When nexting instead of seeking, we must check to see whether
344 : // we've nexted sufficiently far, or we need to next again.
345 : //
346 : // If the file does not contain point keys ≥ `key`, next to continue
347 : // looking for a file that does.
348 2 : if (m.HasRangeKeys || nextInsteadOfSeek) && l.comparer.Compare(m.PointKeyBounds.LargestUserKey(), key) < 0 {
349 2 : // If nextInsteadOfSeek is set and nextsUntilSeek is non-negative,
350 2 : // the iterator has been nexting hoping to discover the relevant
351 2 : // file without seeking. It's exhausted the allotted nextsUntilSeek
352 2 : // and should seek to the sought key.
353 2 : if nextInsteadOfSeek && nextsUntilSeek == 0 {
354 1 : nextInsteadOfSeek = false
355 1 : m = l.files.SeekGE(l.comparer.Compare, key)
356 1 : continue
357 2 : } else if nextsUntilSeek > 0 {
358 2 : nextsUntilSeek--
359 2 : }
360 2 : m = l.files.Next()
361 2 : continue
362 : }
363 :
364 : // This file has a point key bound ≥ `key`. But the largest point key
365 : // bound may still be a range deletion sentinel, which is exclusive. In
366 : // this case, the file doesn't actually contain any point keys equal to
367 : // `key`. We next to keep searching for a file that actually contains
368 : // point keys ≥ key.
369 : //
370 : // Additionally, this prevents loading untruncated range deletions from
371 : // a table which can't possibly contain the target key and is required
372 : // for correctness by mergingIter.SeekGE (see the comment in that
373 : // function).
374 2 : if m.PointKeyBounds.Largest().IsExclusiveSentinel() && l.comparer.Compare(m.PointKeyBounds.LargestUserKey(), key) == 0 {
375 2 : m = l.files.Next()
376 2 : continue
377 : }
378 :
379 : // This file contains point keys ≥ `key`. Break and return it.
380 2 : break
381 : }
382 2 : return m
383 : }
384 :
385 2 : func (l *levelIter) findFileLT(key []byte, flags base.SeekLTFlags) *manifest.TableMetadata {
386 2 : // Find the last file whose smallest key is < ikey.
387 2 :
388 2 : // Ordinarily we seek the LevelIterator using SeekLT.
389 2 : //
390 2 : // When lazy combined iteration is enabled, there's a complication. The
391 2 : // level iterator is responsible for watching for files containing range
392 2 : // keys and triggering the switch to combined iteration when such a file is
393 2 : // observed. If a range deletion was observed in a higher level causing the
394 2 : // merging iterator to seek the level to the range deletion's start key, we
395 2 : // need to check whether all of the files between the old position and the
396 2 : // new position contain any range keys.
397 2 : //
398 2 : // In this scenario, we don't seek the LevelIterator and instead we Prev it,
399 2 : // one file at a time, checking each for range keys.
400 2 : prevInsteadOfSeek := flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized
401 2 :
402 2 : var m *manifest.TableMetadata
403 2 : if prevInsteadOfSeek {
404 2 : m = l.iterFile
405 2 : } else {
406 2 : m = l.files.SeekLT(l.comparer.Compare, key)
407 2 : }
408 : // The below loop has a bit of an unusual organization. There are several
409 : // conditions under which we need to Prev to a previous file. If none of
410 : // those conditions are met, the file in `m` is okay to return. The loop
411 : // body is structured with a series of if statements, each of which may
412 : // continue the loop to the previous file. If none of the statements are
413 : // met, the end of the loop body is a break.
414 2 : for m != nil {
415 2 : if m.HasRangeKeys {
416 2 : l.maybeTriggerCombinedIteration(m, -1)
417 2 :
418 2 : // Some files may only contain range keys, which we can skip.
419 2 : // NB: HasPointKeys=true if the file contains any points or range
420 2 : // deletions (which delete points).
421 2 : if !m.HasPointKeys {
422 2 : m = l.files.Prev()
423 2 : continue
424 : }
425 : }
426 :
427 : // This file has point keys.
428 : //
429 : // However, there are a couple reasons why `m` may not be positioned <
430 : // `key` yet:
431 : //
432 : // 1. If SeekLT(key) landed on a file containing range keys, the file
433 : // may contain range keys < `key` but no point keys < `key`.
434 : // 2. When preving instead of seeking, we must check to see whether
435 : // we've preved sufficiently far, or we need to prev again.
436 : //
437 : // If the file does not contain point keys < `key`, prev to continue
438 : // looking for a file that does.
439 2 : if (m.HasRangeKeys || prevInsteadOfSeek) && l.comparer.Compare(m.PointKeyBounds.SmallestUserKey(), key) >= 0 {
440 2 : m = l.files.Prev()
441 2 : continue
442 : }
443 :
444 : // This file contains point keys < `key`. Break and return it.
445 2 : break
446 : }
447 2 : return m
448 : }
449 :
450 : // Init the iteration bounds for the current table. Returns -1 if the table
451 : // lies fully before the lower bound, +1 if the table lies fully after the
452 : // upper bound, and 0 if the table overlaps the iteration bounds.
453 2 : func (l *levelIter) initTableBounds(f *manifest.TableMetadata) int {
454 2 : l.tableOpts.LowerBound = l.lower
455 2 : if l.tableOpts.LowerBound != nil {
456 2 : if l.comparer.Compare(f.PointKeyBounds.LargestUserKey(), l.tableOpts.LowerBound) < 0 {
457 2 : // The largest key in the sstable is smaller than the lower bound.
458 2 : return -1
459 2 : }
460 2 : if l.comparer.Compare(l.tableOpts.LowerBound, f.PointKeyBounds.SmallestUserKey()) <= 0 {
461 2 : // The lower bound is smaller or equal to the smallest key in the
462 2 : // table. Iteration within the table does not need to check the lower
463 2 : // bound.
464 2 : l.tableOpts.LowerBound = nil
465 2 : }
466 : }
467 2 : l.tableOpts.UpperBound = l.upper
468 2 : if l.tableOpts.UpperBound != nil {
469 2 : if l.comparer.Compare(f.PointKeyBounds.SmallestUserKey(), l.tableOpts.UpperBound) >= 0 {
470 2 : // The smallest key in the sstable is greater than or equal to the upper
471 2 : // bound.
472 2 : return 1
473 2 : }
474 2 : if l.comparer.Compare(l.tableOpts.UpperBound, f.PointKeyBounds.LargestUserKey()) > 0 {
475 2 : // The upper bound is greater than the largest key in the
476 2 : // table. Iteration within the table does not need to check the upper
477 2 : // bound. NB: tableOpts.UpperBound is exclusive and f.PointKeyBounds.Largest() is
478 2 : // inclusive.
479 2 : l.tableOpts.UpperBound = nil
480 2 : }
481 : }
482 2 : return 0
483 : }
484 :
485 : type loadFileReturnIndicator int8
486 :
487 : const (
488 : noFileLoaded loadFileReturnIndicator = iota
489 : fileAlreadyLoaded
490 : newFileLoaded
491 : )
492 :
493 2 : func (l *levelIter) loadFile(file *manifest.TableMetadata, dir int) loadFileReturnIndicator {
494 2 : if l.iterFile == file {
495 2 : if l.err != nil {
496 0 : return noFileLoaded
497 0 : }
498 2 : if l.iter != nil {
499 2 : // We don't bother comparing the file bounds with the iteration bounds when we have
500 2 : // an already open iterator. It is possible that the iter may not be relevant given the
501 2 : // current iteration bounds, but it knows those bounds, so it will enforce them.
502 2 :
503 2 : // There are a few reasons we might not have triggered combined
504 2 : // iteration yet, even though we already had `file` open.
505 2 : // 1. If the bounds changed, we might have previously avoided
506 2 : // switching to combined iteration because the bounds excluded
507 2 : // the range keys contained in this file.
508 2 : // 2. If an existing iterator was reconfigured to iterate over range
509 2 : // keys (eg, using SetOptions), then we wouldn't have triggered
510 2 : // the switch to combined iteration yet.
511 2 : l.maybeTriggerCombinedIteration(file, dir)
512 2 : return fileAlreadyLoaded
513 2 : }
514 : // We were already at file, but don't have an iterator, probably because the file was
515 : // beyond the iteration bounds. It may still be, but it is also possible that the bounds
516 : // have changed. We handle that below.
517 : }
518 :
519 : // Close iter and send a nil iterator through rangeDelIterFn.rangeDelIterFn.
520 2 : if err := l.Close(); err != nil {
521 1 : return noFileLoaded
522 1 : }
523 :
524 2 : for {
525 2 : l.iterFile = file
526 2 : if file == nil {
527 2 : return noFileLoaded
528 2 : }
529 :
530 2 : l.maybeTriggerCombinedIteration(file, dir)
531 2 : if !file.HasPointKeys {
532 2 : switch dir {
533 2 : case +1:
534 2 : file = l.files.Next()
535 2 : continue
536 2 : case -1:
537 2 : file = l.files.Prev()
538 2 : continue
539 : }
540 : }
541 :
542 2 : switch l.initTableBounds(file) {
543 2 : case -1:
544 2 : // The largest key in the sstable is smaller than the lower bound.
545 2 : if dir < 0 {
546 2 : return noFileLoaded
547 2 : }
548 0 : file = l.files.Next()
549 0 : continue
550 2 : case +1:
551 2 : // The smallest key in the sstable is greater than or equal to the upper
552 2 : // bound.
553 2 : if dir > 0 {
554 2 : return noFileLoaded
555 2 : }
556 0 : file = l.files.Prev()
557 0 : continue
558 : }
559 : // If we're in prefix iteration, it's possible this file's smallest
560 : // boundary is large enough to prove the file cannot possibly contain
561 : // any keys within the iteration prefix. Loading the next file is
562 : // unnecessary. This has been observed in practice on slow shared
563 : // storage. See #3575.
564 2 : if l.prefix != nil && l.comparer.Compare(l.comparer.Split.Prefix(file.PointKeyBounds.SmallestUserKey()), l.prefix) > 0 {
565 2 : // Note that because l.iter is nil, a subsequent call to
566 2 : // SeekPrefixGE with TrySeekUsingNext()=true will load the file
567 2 : // (returning newFileLoaded) and disable TrySeekUsingNext before
568 2 : // performing a seek in the file.
569 2 : return noFileLoaded
570 2 : }
571 :
572 2 : iterKinds := iterPointKeys
573 2 : if l.rangeDelIterSetter != nil {
574 2 : iterKinds |= iterRangeDeletions
575 2 : }
576 :
577 2 : var iters iterSet
578 2 : iters, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts, iterKinds)
579 2 : if l.err != nil {
580 1 : if l.rangeDelIterSetter != nil {
581 1 : l.rangeDelIterSetter.setRangeDelIter(nil)
582 1 : }
583 1 : return noFileLoaded
584 : }
585 2 : l.iter = iters.Point()
586 2 : if l.rangeDelIterSetter != nil && iters.rangeDeletion != nil {
587 2 : // If this file has range deletions, interleave the bounds of the
588 2 : // range deletions among the point keys. When used with a
589 2 : // mergingIter, this ensures we don't move beyond a file with range
590 2 : // deletions until its range deletions are no longer relevant.
591 2 : //
592 2 : // For now, we open a second range deletion iterator. Future work
593 2 : // will avoid the need to open a second range deletion iterator, and
594 2 : // avoid surfacing the file's range deletion iterator via rangeDelIterFn.
595 2 : itersForBounds, err := l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts, iterRangeDeletions)
596 2 : if err != nil {
597 1 : l.iter = nil
598 1 : l.err = errors.CombineErrors(err, iters.CloseAll())
599 1 : return noFileLoaded
600 1 : }
601 2 : l.interleaving.Init(l.comparer, l.iter, itersForBounds.RangeDeletion(), keyspan.InterleavingIterOpts{
602 2 : LowerBound: l.tableOpts.LowerBound,
603 2 : UpperBound: l.tableOpts.UpperBound,
604 2 : InterleaveEndKeys: true,
605 2 : })
606 2 : l.iter = &l.interleaving
607 2 :
608 2 : // Relinquish iters.rangeDeletion to the caller.
609 2 : l.rangeDelIterSetter.setRangeDelIter(iters.rangeDeletion)
610 : }
611 2 : return newFileLoaded
612 : }
613 : }
614 :
615 : // In race builds we verify that the keys returned by levelIter lie within
616 : // [lower,upper).
617 2 : func (l *levelIter) verify(kv *base.InternalKV) *base.InternalKV {
618 2 : // Note that invariants.Enabled is a compile time constant, which means the
619 2 : // block of code will be compiled out of normal builds making this method
620 2 : // eligible for inlining. Do not change this to use a variable.
621 2 : if invariants.Enabled && !l.disableInvariants && kv != nil {
622 2 : // We allow returning a boundary key that is outside of the lower/upper
623 2 : // bounds as such keys are always range tombstones which will be skipped
624 2 : // by the Iterator.
625 2 : if l.lower != nil && kv != nil && !kv.K.IsExclusiveSentinel() && l.comparer.Compare(kv.K.UserKey, l.lower) < 0 {
626 0 : l.logger.Fatalf("levelIter %s: lower bound violation: %s < %s\n%s", l.layer, kv, l.lower, debug.Stack())
627 0 : }
628 2 : if l.upper != nil && kv != nil && !kv.K.IsExclusiveSentinel() && l.comparer.Compare(kv.K.UserKey, l.upper) > 0 {
629 0 : l.logger.Fatalf("levelIter %s: upper bound violation: %s > %s\n%s", l.layer, kv, l.upper, debug.Stack())
630 0 : }
631 : }
632 2 : return kv
633 : }
634 :
635 2 : func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
636 2 : if invariants.Enabled && l.lower != nil && l.comparer.Compare(key, l.lower) < 0 {
637 0 : panic(errors.AssertionFailedf("levelIter SeekGE to key %q violates lower bound %q", key, l.lower))
638 : }
639 :
640 2 : l.err = nil // clear cached iteration error
641 2 : l.exhaustedDir = 0
642 2 : l.prefix = nil
643 2 : // NB: the top-level Iterator has already adjusted key based on
644 2 : // IterOptions.LowerBound.
645 2 : loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1)
646 2 : if loadFileIndicator == noFileLoaded {
647 2 : l.exhaustedForward()
648 2 : return nil
649 2 : }
650 2 : if loadFileIndicator == newFileLoaded {
651 2 : // File changed, so l.iter has changed, and that iterator is not
652 2 : // positioned appropriately.
653 2 : flags = flags.DisableTrySeekUsingNext()
654 2 : }
655 2 : if kv := l.iter.SeekGE(key, flags); kv != nil {
656 2 : return l.verify(kv)
657 2 : }
658 2 : return l.verify(l.skipEmptyFileForward())
659 : }
660 :
661 2 : func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
662 2 : if invariants.Enabled && l.lower != nil && l.comparer.Compare(key, l.lower) < 0 {
663 0 : panic(errors.AssertionFailedf("levelIter SeekGE to key %q violates lower bound %q", key, l.lower))
664 : }
665 2 : l.err = nil // clear cached iteration error
666 2 : l.exhaustedDir = 0
667 2 : l.prefix = prefix
668 2 :
669 2 : // NB: the top-level Iterator has already adjusted key based on
670 2 : // IterOptions.LowerBound.
671 2 : loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1)
672 2 : if loadFileIndicator == noFileLoaded {
673 2 : l.exhaustedForward()
674 2 : return nil
675 2 : }
676 2 : if loadFileIndicator == newFileLoaded {
677 2 : // File changed, so l.iter has changed, and that iterator is not
678 2 : // positioned appropriately.
679 2 : flags = flags.DisableTrySeekUsingNext()
680 2 : }
681 2 : if kv := l.iter.SeekPrefixGE(prefix, key, flags); kv != nil {
682 2 : return l.verify(kv)
683 2 : }
684 2 : if err := l.iter.Error(); err != nil {
685 1 : return nil
686 1 : }
687 2 : return l.verify(l.skipEmptyFileForward())
688 : }
689 :
690 2 : func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
691 2 : if invariants.Enabled && l.upper != nil && l.comparer.Compare(key, l.upper) > 0 {
692 0 : panic(errors.AssertionFailedf("levelIter SeekLT to key %q violates upper bound %q", key, l.upper))
693 : }
694 :
695 2 : l.err = nil // clear cached iteration error
696 2 : l.exhaustedDir = 0
697 2 : l.prefix = nil
698 2 :
699 2 : // NB: the top-level Iterator has already adjusted key based on
700 2 : // IterOptions.UpperBound.
701 2 : if l.loadFile(l.findFileLT(key, flags), -1) == noFileLoaded {
702 2 : l.exhaustedBackward()
703 2 : return nil
704 2 : }
705 2 : if kv := l.iter.SeekLT(key, flags); kv != nil {
706 2 : return l.verify(kv)
707 2 : }
708 2 : return l.verify(l.skipEmptyFileBackward())
709 : }
710 :
711 2 : func (l *levelIter) First() *base.InternalKV {
712 2 : if invariants.Enabled && l.lower != nil {
713 0 : panic(errors.AssertionFailedf("levelIter First called while lower bound %q is set", l.lower))
714 : }
715 :
716 2 : l.err = nil // clear cached iteration error
717 2 : l.exhaustedDir = 0
718 2 : l.prefix = nil
719 2 :
720 2 : // NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is
721 2 : // set.
722 2 : if l.loadFile(l.files.First(), +1) == noFileLoaded {
723 2 : l.exhaustedForward()
724 2 : return nil
725 2 : }
726 2 : if kv := l.iter.First(); kv != nil {
727 2 : return l.verify(kv)
728 2 : }
729 2 : return l.verify(l.skipEmptyFileForward())
730 : }
731 :
732 2 : func (l *levelIter) Last() *base.InternalKV {
733 2 : if invariants.Enabled && l.upper != nil {
734 0 : panic(errors.AssertionFailedf("levelIter Last called while upper bound %q is set", l.upper))
735 : }
736 :
737 2 : l.err = nil // clear cached iteration error
738 2 : l.exhaustedDir = 0
739 2 : l.prefix = nil
740 2 :
741 2 : // NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is
742 2 : // set.
743 2 : if l.loadFile(l.files.Last(), -1) == noFileLoaded {
744 2 : l.exhaustedBackward()
745 2 : return nil
746 2 : }
747 2 : if kv := l.iter.Last(); kv != nil {
748 2 : return l.verify(kv)
749 2 : }
750 2 : return l.verify(l.skipEmptyFileBackward())
751 : }
752 :
753 2 : func (l *levelIter) Next() *base.InternalKV {
754 2 : if l.exhaustedDir == -1 {
755 2 : if l.lower != nil {
756 2 : return l.SeekGE(l.lower, base.SeekGEFlagsNone)
757 2 : }
758 2 : return l.First()
759 : }
760 2 : if l.err != nil || l.iter == nil {
761 1 : return nil
762 1 : }
763 2 : if kv := l.iter.Next(); kv != nil {
764 2 : return l.verify(kv)
765 2 : }
766 2 : return l.verify(l.skipEmptyFileForward())
767 : }
768 :
769 2 : func (l *levelIter) NextPrefix(succKey []byte) *base.InternalKV {
770 2 : if l.err != nil || l.iter == nil {
771 0 : return nil
772 0 : }
773 :
774 2 : if kv := l.iter.NextPrefix(succKey); kv != nil {
775 2 : return l.verify(kv)
776 2 : }
777 2 : if l.iter.Error() != nil {
778 1 : return nil
779 1 : }
780 2 : if l.tableOpts.UpperBound != nil {
781 1 : // The UpperBound was within this file, so don't load the next file.
782 1 : l.exhaustedForward()
783 1 : return nil
784 1 : }
785 :
786 : // Seek the manifest level iterator using TrySeekUsingNext=true and
787 : // RelativeSeek=true so that we take advantage of the knowledge that
788 : // `succKey` can only be contained in later files.
789 2 : metadataSeekFlags := base.SeekGEFlagsNone.EnableTrySeekUsingNext().EnableRelativeSeek()
790 2 : if l.loadFile(l.findFileGE(succKey, metadataSeekFlags), +1) != noFileLoaded {
791 2 : // NB: The SeekGE on the file's iterator must not set TrySeekUsingNext,
792 2 : // because l.iter is unpositioned.
793 2 : if kv := l.iter.SeekGE(succKey, base.SeekGEFlagsNone); kv != nil {
794 2 : return l.verify(kv)
795 2 : }
796 1 : return l.verify(l.skipEmptyFileForward())
797 : }
798 2 : l.exhaustedForward()
799 2 : return nil
800 : }
801 :
802 2 : func (l *levelIter) Prev() *base.InternalKV {
803 2 : if l.exhaustedDir == +1 {
804 2 : if l.upper != nil {
805 2 : return l.SeekLT(l.upper, base.SeekLTFlagsNone)
806 2 : }
807 2 : return l.Last()
808 : }
809 2 : if l.err != nil || l.iter == nil {
810 1 : return nil
811 1 : }
812 2 : if kv := l.iter.Prev(); kv != nil {
813 2 : return l.verify(kv)
814 2 : }
815 2 : return l.verify(l.skipEmptyFileBackward())
816 : }
817 :
818 2 : func (l *levelIter) skipEmptyFileForward() *base.InternalKV {
819 2 : var kv *base.InternalKV
820 2 : // The first iteration of this loop starts with an already exhausted l.iter.
821 2 : // The reason for the exhaustion is either that we iterated to the end of
822 2 : // the sstable, or our iteration was terminated early due to the presence of
823 2 : // an upper-bound or the use of SeekPrefixGE.
824 2 : //
825 2 : // Subsequent iterations will examine consecutive files such that the first
826 2 : // file that does not have an exhausted iterator causes the code to return
827 2 : // that key.
828 2 : for ; kv == nil; kv = l.iter.First() {
829 2 : if l.iter.Error() != nil {
830 1 : return nil
831 1 : }
832 : // If an upper bound is present and the upper bound lies within the
833 : // current sstable, then we will have reached the upper bound rather
834 : // than the end of the sstable.
835 2 : if l.tableOpts.UpperBound != nil {
836 2 : l.exhaustedForward()
837 2 : return nil
838 2 : }
839 :
840 : // If the iterator is in prefix iteration mode, it's possible that we
841 : // are here because bloom filter matching failed. In that case it is
842 : // likely that all keys matching the prefix are wholly within the
843 : // current file and cannot be in a subsequent file. In that case we
844 : // don't want to go to the next file, since loading and seeking in there
845 : // has some cost.
846 : //
847 : // This is not just an optimization. We must not advance to the next
848 : // file if the current file might possibly contain keys relevant to any
849 : // prefix greater than our current iteration prefix. If we did, a
850 : // subsequent SeekPrefixGE with TrySeekUsingNext could mistakenly skip
851 : // the file's relevant keys.
852 2 : if l.prefix != nil {
853 2 : if l.comparer.Compare(l.comparer.Split.Prefix(l.iterFile.PointKeyBounds.LargestUserKey()), l.prefix) > 0 {
854 2 : l.exhaustedForward()
855 2 : return nil
856 2 : }
857 : }
858 :
859 : // Current file was exhausted. Move to the next file.
860 2 : if l.loadFile(l.files.Next(), +1) == noFileLoaded {
861 2 : l.exhaustedForward()
862 2 : return nil
863 2 : }
864 : }
865 2 : return kv
866 : }
867 :
868 2 : func (l *levelIter) skipEmptyFileBackward() *base.InternalKV {
869 2 : var kv *base.InternalKV
870 2 : // The first iteration of this loop starts with an already exhausted
871 2 : // l.iter. The reason for the exhaustion is either that we iterated to the
872 2 : // end of the sstable, or our iteration was terminated early due to the
873 2 : // presence of a lower-bound.
874 2 : //
875 2 : // Subsequent iterations will examine consecutive files such that the first
876 2 : // file that does not have an exhausted iterator causes the code to return
877 2 : // that key.
878 2 : for ; kv == nil; kv = l.iter.Last() {
879 2 : if l.iter.Error() != nil {
880 1 : return nil
881 1 : }
882 : // If a lower bound is present and the lower bound lies within the
883 : // current sstable, then we will have reached the lowerr bound rather
884 : // than the end of the sstable.
885 2 : if l.tableOpts.LowerBound != nil {
886 2 : l.exhaustedBackward()
887 2 : return nil
888 2 : }
889 : // Current file was exhausted. Move to the previous file.
890 2 : if l.loadFile(l.files.Prev(), -1) == noFileLoaded {
891 2 : l.exhaustedBackward()
892 2 : return nil
893 2 : }
894 : }
895 2 : return kv
896 : }
897 :
898 2 : func (l *levelIter) exhaustedForward() {
899 2 : l.exhaustedDir = +1
900 2 : }
901 :
902 2 : func (l *levelIter) exhaustedBackward() {
903 2 : l.exhaustedDir = -1
904 2 : }
905 :
906 2 : func (l *levelIter) Error() error {
907 2 : if l.err != nil || l.iter == nil {
908 2 : return l.err
909 2 : }
910 2 : return l.iter.Error()
911 : }
912 :
913 2 : func (l *levelIter) Close() error {
914 2 : if l.iter != nil {
915 2 : l.err = l.iter.Close()
916 2 : l.iter = nil
917 2 : }
918 2 : if l.rangeDelIterSetter != nil {
919 2 : l.rangeDelIterSetter.setRangeDelIter(nil)
920 2 : }
921 2 : return l.err
922 : }
923 :
924 2 : func (l *levelIter) SetBounds(lower, upper []byte) {
925 2 : l.lower = lower
926 2 : l.upper = upper
927 2 :
928 2 : if l.iter == nil {
929 2 : return
930 2 : }
931 :
932 : // Update tableOpts.{Lower,Upper}Bound in case the new boundaries fall within
933 : // the boundaries of the current table.
934 2 : if l.initTableBounds(l.iterFile) != 0 {
935 2 : // The table does not overlap the bounds. Close() will set levelIter.err if
936 2 : // an error occurs.
937 2 : _ = l.Close()
938 2 : return
939 2 : }
940 :
941 2 : l.iter.SetBounds(l.tableOpts.LowerBound, l.tableOpts.UpperBound)
942 : }
943 :
944 1 : func (l *levelIter) SetContext(ctx context.Context) {
945 1 : l.ctx = ctx
946 1 : if l.iter != nil {
947 0 : // TODO(sumeer): this is losing the ctx = objiotracing.WithLevel(ctx,
948 0 : // manifest.LevelToInt(opts.level)) that happens in table_cache.go.
949 0 : l.iter.SetContext(ctx)
950 0 : }
951 : }
952 :
953 : // DebugTree is part of the InternalIterator interface.
954 0 : func (l *levelIter) DebugTree(tp treeprinter.Node) {
955 0 : n := tp.Childf("%T(%p) %s", l, l, l.String())
956 0 : if l.iter != nil {
957 0 : l.iter.DebugTree(n)
958 0 : }
959 : }
960 :
961 2 : func (l *levelIter) String() string {
962 2 : if l.iterFile != nil {
963 2 : return fmt.Sprintf("%s: fileNum=%s", l.layer, l.iterFile.TableNum.String())
964 2 : }
965 0 : return fmt.Sprintf("%s: fileNum=<nil>", l.layer)
966 : }
967 :
968 : var _ internalIterator = &levelIter{}
|