Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "context"
9 : "fmt"
10 : "runtime/debug"
11 :
12 : "github.com/cockroachdb/pebble/internal/base"
13 : "github.com/cockroachdb/pebble/internal/invariants"
14 : "github.com/cockroachdb/pebble/internal/keyspan"
15 : "github.com/cockroachdb/pebble/internal/manifest"
16 : "github.com/cockroachdb/pebble/sstable"
17 : )
18 :
19 : // tableNewIters creates a new point and range-del iterator for the given file
20 : // number.
21 : //
22 : // On success, the internalIterator is not-nil and must be closed; the
23 : // FragmentIterator can be nil.
24 : // TODO(radu): always return a non-nil FragmentIterator.
25 : //
26 : // On error, the iterators are nil.
27 : //
28 : // The only (non-test) implementation of tableNewIters is tableCacheContainer.newIters().
29 : type tableNewIters func(
30 : ctx context.Context,
31 : file *manifest.FileMetadata,
32 : opts *IterOptions,
33 : internalOpts internalIterOpts,
34 : ) (internalIterator, keyspan.FragmentIterator, error)
35 :
36 : // tableNewRangeDelIter takes a tableNewIters and returns a TableNewSpanIter
37 : // for the rangedel iterator returned by tableNewIters.
38 1 : func tableNewRangeDelIter(ctx context.Context, newIters tableNewIters) keyspan.TableNewSpanIter {
39 1 : return func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) {
40 1 : iter, rangeDelIter, err := newIters(ctx, file, nil, internalIterOpts{})
41 1 : if iter != nil {
42 1 : _ = iter.Close()
43 1 : }
44 1 : if rangeDelIter == nil {
45 1 : rangeDelIter = emptyKeyspanIter
46 1 : }
47 1 : return rangeDelIter, err
48 : }
49 : }
50 :
51 : type internalIterOpts struct {
52 : bytesIterated *uint64
53 : bufferPool *sstable.BufferPool
54 : stats *base.InternalIteratorStats
55 : boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter
56 : }
57 :
58 : // levelIter provides a merged view of the sstables in a level.
59 : //
60 : // levelIter is used during compaction and as part of the Iterator
61 : // implementation. When used as part of the Iterator implementation, level
62 : // iteration needs to "pause" at sstable boundaries if a range deletion
63 : // tombstone is the source of that boundary. We know if a range tombstone is
64 : // the smallest or largest key in a file because the kind will be
65 : // InternalKeyKindRangeDeletion. If the boundary key is a range deletion
66 : // tombstone, we materialize a fake entry to return from levelIter. This
67 : // prevents mergingIter from advancing past the sstable until the sstable
68 : // contains the smallest (or largest for reverse iteration) key in the merged
69 : // heap. Note that mergingIter treats a range deletion tombstone returned by
70 : // the point iterator as a no-op.
71 : //
72 : // SeekPrefixGE presents the need for a second type of pausing. If an sstable
73 : // iterator returns "not found" for a SeekPrefixGE operation, we don't want to
74 : // advance to the next sstable as the "not found" does not indicate that all of
75 : // the keys in the sstable are less than the search key. Advancing to the next
76 : // sstable would cause us to skip over range tombstones, violating
77 : // correctness. Instead, SeekPrefixGE creates a synthetic boundary key with the
78 : // kind InternalKeyKindRangeDeletion which will be used to pause the levelIter
79 : // at the sstable until the mergingIter is ready to advance past it.
80 : type levelIter struct {
81 : // The context is stored here since (a) iterators are expected to be
82 : // short-lived (since they pin sstables), (b) plumbing a context into every
83 : // method is very painful, (c) they do not (yet) respect context
84 : // cancellation and are only used for tracing.
85 : ctx context.Context
86 : logger Logger
87 : comparer *Comparer
88 : cmp Compare
89 : split Split
90 : // The lower/upper bounds for iteration as specified at creation or the most
91 : // recent call to SetBounds.
92 : lower []byte
93 : upper []byte
94 : // The iterator options for the currently open table. If
95 : // tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary
96 : // does not lie within the table bounds.
97 : tableOpts IterOptions
98 : // The LSM level this levelIter is initialized for.
99 : level manifest.Level
100 : // The keys to return when iterating past an sstable boundary and that
101 : // boundary is a range deletion tombstone. The boundary could be smallest
102 : // (i.e. arrived at with Prev), or largest (arrived at with Next).
103 : smallestBoundary *InternalKey
104 : largestBoundary *InternalKey
105 : // combinedIterState may be set when a levelIter is used during user
106 : // iteration. Although levelIter only iterates over point keys, it's also
107 : // responsible for lazily constructing the combined range & point iterator
108 : // when it observes a file containing range keys. If the combined iter
109 : // state's initialized field is true, the iterator is already using combined
110 : // iterator, OR the iterator is not configured to use combined iteration. If
111 : // it's false, the levelIter must set the `triggered` and `key` fields when
112 : // the levelIter passes over a file containing range keys. See the
113 : // lazyCombinedIter for more details.
114 : combinedIterState *combinedIterState
115 : // A synthetic boundary key to return when SeekPrefixGE finds an sstable
116 : // which doesn't contain the search key, but which does contain range
117 : // tombstones.
118 : syntheticBoundary InternalKey
119 : // The iter for the current file. It is nil under any of the following conditions:
120 : // - files.Current() == nil
121 : // - err != nil
122 : // - some other constraint, like the bounds in opts, caused the file at index to not
123 : // be relevant to the iteration.
124 : iter internalIterator
125 : // iterFile holds the current file. It is always equal to l.files.Current().
126 : iterFile *fileMetadata
127 : // filteredIter is an optional interface that may be implemented by internal
128 : // iterators that perform filtering of keys. When a new file's iterator is
129 : // opened, it's tested to see if it implements filteredIter. If it does,
130 : // it's stored here to allow the level iterator to recognize when keys were
131 : // omitted from iteration results due to filtering. This is important when a
132 : // file contains range deletions that may delete keys from other files. The
133 : // levelIter must not advance to the next file until the mergingIter has
134 : // advanced beyond the file's bounds. See
135 : // levelIterBoundaryContext.isIgnorableBoundaryKey.
136 : filteredIter filteredIter
137 : newIters tableNewIters
138 : // When rangeDelIterPtr != nil, the caller requires that *rangeDelIterPtr must
139 : // point to a range del iterator corresponding to the current file. When this
140 : // iterator returns nil, *rangeDelIterPtr should also be set to nil. Whenever
141 : // a non-nil internalIterator is placed in rangeDelIterPtr, a copy is placed
142 : // in rangeDelIterCopy. This is done for the following special case:
143 : // when this iterator returns nil because of exceeding the bounds, we don't
144 : // close iter and *rangeDelIterPtr since we could reuse it in the next seek. But
145 : // we need to set *rangeDelIterPtr to nil because of the aforementioned contract.
146 : // This copy is used to revive the *rangeDelIterPtr in the case of reuse.
147 : rangeDelIterPtr *keyspan.FragmentIterator
148 : rangeDelIterCopy keyspan.FragmentIterator
149 : files manifest.LevelIterator
150 : err error
151 :
152 : // Pointer into this level's entry in `mergingIterLevel::levelIterBoundaryContext`.
153 : // We populate it with the corresponding bounds for the currently opened file. It is used for
154 : // two purposes (described for forward iteration. The explanation for backward iteration is
155 : // similar.)
156 : // - To limit the optimization that seeks lower-level iterators past keys shadowed by a range
157 : // tombstone. Limiting this seek to the file largestUserKey is necessary since
158 : // range tombstones are stored untruncated, while they only apply to keys within their
159 : // containing file's boundaries. For a detailed example, see comment above `mergingIter`.
160 : // - To constrain the tombstone to act-within the bounds of the sstable when checking
161 : // containment. For forward iteration we need the smallestUserKey.
162 : //
163 : // An example is sstable bounds [c#8, g#12] containing a tombstone [b, i)#7.
164 : // - When doing a SeekGE to user key X, the levelIter is at this sstable because X is either within
165 : // the sstable bounds or earlier than the start of the sstable (and there is no sstable in
166 : // between at this level). If X >= smallestUserKey, and the tombstone [b, i) contains X,
167 : // it is correct to SeekGE the sstables at lower levels to min(g, i) (i.e., min of
168 : // largestUserKey, tombstone.End) since any user key preceding min(g, i) must be covered by this
169 : // tombstone (since it cannot have a version younger than this tombstone as it is at a lower
170 : // level). And even if X = smallestUserKey or equal to the start user key of the tombstone,
171 : // if the above conditions are satisfied we know that the internal keys corresponding to X at
172 : // lower levels must have a version smaller than that in this file (again because of the level
173 : // argument). So we don't need to use sequence numbers for this comparison.
174 : // - When checking whether this tombstone deletes internal key X we know that the levelIter is at this
175 : // sstable so (repeating the above) X.UserKey is either within the sstable bounds or earlier than the
176 : // start of the sstable (and there is no sstable in between at this level).
177 : // - X is at at a lower level. If X.UserKey >= smallestUserKey, and the tombstone contains
178 : // X.UserKey, we know X is deleted. This argument also works when X is a user key (we use
179 : // it when seeking to test whether a user key is deleted).
180 : // - X is at the same level. X must be within the sstable bounds of the tombstone so the
181 : // X.UserKey >= smallestUserKey comparison is trivially true. In addition to the tombstone containing
182 : // X we need to compare the sequence number of X and the tombstone (we don't need to look
183 : // at how this tombstone is truncated to act-within the file bounds, which are InternalKeys,
184 : // since X and the tombstone are from the same file).
185 : //
186 : // Iterating backwards has one more complication when checking whether a tombstone deletes
187 : // internal key X at a lower level (the construction we do here also works for a user key X).
188 : // Consider sstable bounds [c#8, g#InternalRangeDelSentinel] containing a tombstone [b, i)#7.
189 : // If we are positioned at key g#10 at a lower sstable, the tombstone we will see is [b, i)#7,
190 : // since the higher sstable is positioned at a key <= g#10. We should not use this tombstone
191 : // to delete g#10. This requires knowing that the largestUserKey is a range delete sentinel,
192 : // which we set in a separate bool below.
193 : //
194 : // These fields differs from the `*Boundary` fields in a few ways:
195 : // - `*Boundary` is only populated when the iterator is positioned exactly on the sentinel key.
196 : // - `*Boundary` can hold either the lower- or upper-bound, depending on the iterator direction.
197 : // - `*Boundary` is not exposed to the next higher-level iterator, i.e., `mergingIter`.
198 : boundaryContext *levelIterBoundaryContext
199 :
200 : // internalOpts holds the internal iterator options to pass to the table
201 : // cache when constructing new table iterators.
202 : internalOpts internalIterOpts
203 :
204 : // Scratch space for the obsolete keys filter, when there are no other block
205 : // property filters specified. See the performance note where
206 : // IterOptions.PointKeyFilters is declared.
207 : filtersBuf [1]BlockPropertyFilter
208 :
209 : // Disable invariant checks even if they are otherwise enabled. Used by tests
210 : // which construct "impossible" situations (e.g. seeking to a key before the
211 : // lower bound).
212 : disableInvariants bool
213 : }
214 :
215 : // filteredIter is an additional interface implemented by iterators that may
216 : // skip over point keys during iteration. The sstable.Iterator implements this
217 : // interface.
218 : type filteredIter interface {
219 : // MaybeFilteredKeys may be called when an iterator is exhausted, indicating
220 : // whether or not the iterator's last positioning method may have skipped
221 : // any keys due to low-level filters.
222 : //
223 : // When an iterator is configured to use block-property filters, the
224 : // low-level iterator may skip over blocks or whole sstables of keys.
225 : // Implementations that implement skipping must implement this interface.
226 : // Higher-level iterators require it to preserve invariants (eg, a levelIter
227 : // used in a mergingIter must keep the file's range-del iterator open until
228 : // the mergingIter has moved past the file's bounds, even if all of the
229 : // file's point keys were filtered).
230 : //
231 : // MaybeFilteredKeys may always return false positives, that is it may
232 : // return true when no keys were filtered. It should only be called when the
233 : // iterator is exhausted. It must never return false negatives when the
234 : // iterator is exhausted.
235 : MaybeFilteredKeys() bool
236 : }
237 :
238 : // levelIter implements the base.InternalIterator interface.
239 : var _ base.InternalIterator = (*levelIter)(nil)
240 :
241 : // newLevelIter returns a levelIter. It is permissible to pass a nil split
242 : // parameter if the caller is never going to call SeekPrefixGE.
243 : func newLevelIter(
244 : ctx context.Context,
245 : opts IterOptions,
246 : comparer *Comparer,
247 : newIters tableNewIters,
248 : files manifest.LevelIterator,
249 : level manifest.Level,
250 : internalOpts internalIterOpts,
251 1 : ) *levelIter {
252 1 : l := &levelIter{}
253 1 : l.init(ctx, opts, comparer, newIters, files, level, internalOpts)
254 1 : return l
255 1 : }
256 :
257 : func (l *levelIter) init(
258 : ctx context.Context,
259 : opts IterOptions,
260 : comparer *Comparer,
261 : newIters tableNewIters,
262 : files manifest.LevelIterator,
263 : level manifest.Level,
264 : internalOpts internalIterOpts,
265 1 : ) {
266 1 : l.ctx = ctx
267 1 : l.err = nil
268 1 : l.level = level
269 1 : l.logger = opts.getLogger()
270 1 : l.lower = opts.LowerBound
271 1 : l.upper = opts.UpperBound
272 1 : l.tableOpts.TableFilter = opts.TableFilter
273 1 : l.tableOpts.PointKeyFilters = opts.PointKeyFilters
274 1 : if len(opts.PointKeyFilters) == 0 {
275 1 : l.tableOpts.PointKeyFilters = l.filtersBuf[:0:1]
276 1 : }
277 1 : l.tableOpts.UseL6Filters = opts.UseL6Filters
278 1 : l.tableOpts.CategoryAndQoS = opts.CategoryAndQoS
279 1 : l.tableOpts.level = l.level
280 1 : l.tableOpts.snapshotForHideObsoletePoints = opts.snapshotForHideObsoletePoints
281 1 : l.comparer = comparer
282 1 : l.cmp = comparer.Compare
283 1 : l.split = comparer.Split
284 1 : l.iterFile = nil
285 1 : l.newIters = newIters
286 1 : l.files = files
287 1 : l.internalOpts = internalOpts
288 : }
289 :
290 1 : func (l *levelIter) initRangeDel(rangeDelIter *keyspan.FragmentIterator) {
291 1 : l.rangeDelIterPtr = rangeDelIter
292 1 : }
293 :
294 1 : func (l *levelIter) initBoundaryContext(context *levelIterBoundaryContext) {
295 1 : l.boundaryContext = context
296 1 : }
297 :
298 1 : func (l *levelIter) initCombinedIterState(state *combinedIterState) {
299 1 : l.combinedIterState = state
300 1 : }
301 :
302 1 : func (l *levelIter) maybeTriggerCombinedIteration(file *fileMetadata, dir int) {
303 1 : // If we encounter a file that contains range keys, we may need to
304 1 : // trigger a switch to combined range-key and point-key iteration,
305 1 : // if the *pebble.Iterator is configured for it. This switch is done
306 1 : // lazily because range keys are intended to be rare, and
307 1 : // constructing the range-key iterator substantially adds to the
308 1 : // cost of iterator construction and seeking.
309 1 : //
310 1 : // If l.combinedIterState.initialized is already true, either the
311 1 : // iterator is already using combined iteration or the iterator is not
312 1 : // configured to observe range keys. Either way, there's nothing to do.
313 1 : // If false, trigger the switch to combined iteration, using the the
314 1 : // file's bounds to seek the range-key iterator appropriately.
315 1 : //
316 1 : // We only need to trigger combined iteration if the file contains
317 1 : // RangeKeySets: if there are only Unsets and Dels, the user will observe no
318 1 : // range keys regardless. If this file has table stats available, they'll
319 1 : // tell us whether the file has any RangeKeySets. Otherwise, we must
320 1 : // fallback to assuming it does if HasRangeKeys=true.
321 1 : if file != nil && file.HasRangeKeys && l.combinedIterState != nil && !l.combinedIterState.initialized &&
322 1 : (l.upper == nil || l.cmp(file.SmallestRangeKey.UserKey, l.upper) < 0) &&
323 1 : (l.lower == nil || l.cmp(file.LargestRangeKey.UserKey, l.lower) > 0) &&
324 1 : (!file.StatsValid() || file.Stats.NumRangeKeySets > 0) {
325 1 : // The file contains range keys, and we're not using combined iteration yet.
326 1 : // Trigger a switch to combined iteration. It's possible that a switch has
327 1 : // already been triggered if multiple levels encounter files containing
328 1 : // range keys while executing a single mergingIter operation. In this case,
329 1 : // we need to compare the existing key recorded to l.combinedIterState.key,
330 1 : // adjusting it if our key is smaller (forward iteration) or larger
331 1 : // (backward iteration) than the existing key.
332 1 : //
333 1 : // These key comparisons are only required during a single high-level
334 1 : // iterator operation. When the high-level iter op completes,
335 1 : // iinitialized will be true, and future calls to this function will be
336 1 : // no-ops.
337 1 : switch dir {
338 1 : case +1:
339 1 : if !l.combinedIterState.triggered {
340 1 : l.combinedIterState.triggered = true
341 1 : l.combinedIterState.key = file.SmallestRangeKey.UserKey
342 1 : } else if l.cmp(l.combinedIterState.key, file.SmallestRangeKey.UserKey) > 0 {
343 1 : l.combinedIterState.key = file.SmallestRangeKey.UserKey
344 1 : }
345 1 : case -1:
346 1 : if !l.combinedIterState.triggered {
347 1 : l.combinedIterState.triggered = true
348 1 : l.combinedIterState.key = file.LargestRangeKey.UserKey
349 1 : } else if l.cmp(l.combinedIterState.key, file.LargestRangeKey.UserKey) < 0 {
350 1 : l.combinedIterState.key = file.LargestRangeKey.UserKey
351 1 : }
352 : }
353 : }
354 : }
355 :
356 1 : func (l *levelIter) findFileGE(key []byte, flags base.SeekGEFlags) *fileMetadata {
357 1 : // Find the earliest file whose largest key is >= key.
358 1 :
359 1 : // NB: if flags.TrySeekUsingNext()=true, the levelIter must respect it. If
360 1 : // the levelIter is positioned at the key P, it must return a key ≥ P. If
361 1 : // used within a merging iterator, the merging iterator will depend on the
362 1 : // levelIter only moving forward to maintain heap invariants.
363 1 :
364 1 : // Ordinarily we seek the LevelIterator using SeekGE. In some instances, we
365 1 : // Next instead. In other instances, we try Next-ing first, falling back to
366 1 : // seek:
367 1 : // a) flags.TrySeekUsingNext(): The top-level Iterator knows we're seeking
368 1 : // to a key later than the current iterator position. We don't know how
369 1 : // much later the seek key is, so it's possible there are many sstables
370 1 : // between the current position and the seek key. However in most real-
371 1 : // world use cases, the seek key is likely to be nearby. Rather than
372 1 : // performing a log(N) seek through the file metadata, we next a few
373 1 : // times from from our existing location. If we don't find a file whose
374 1 : // largest is >= key within a few nexts, we fall back to seeking.
375 1 : //
376 1 : // Note that in this case, the file returned by findFileGE may be
377 1 : // different than the file returned by a raw binary search (eg, when
378 1 : // TrySeekUsingNext=false). This is possible because the most recent
379 1 : // positioning operation may have already determined that previous
380 1 : // files' keys that are ≥ key are all deleted. This information is
381 1 : // encoded within the iterator's current iterator position and is
382 1 : // unavailable to a fresh binary search.
383 1 : //
384 1 : // b) flags.RelativeSeek(): The merging iterator decided to re-seek this
385 1 : // level according to a range tombstone. When lazy combined iteration
386 1 : // is enabled, the level iterator is responsible for watching for
387 1 : // files containing range keys and triggering the switch to combined
388 1 : // iteration when such a file is observed. If a range deletion was
389 1 : // observed in a higher level causing the merging iterator to seek the
390 1 : // level to the range deletion's end key, we need to check whether all
391 1 : // of the files between the old position and the new position contain
392 1 : // any range keys.
393 1 : //
394 1 : // In this scenario, we don't seek the LevelIterator and instead we
395 1 : // Next it, one file at a time, checking each for range keys. The
396 1 : // merging iterator sets this flag to inform us that we're moving
397 1 : // forward relative to the existing position and that we must examine
398 1 : // each intermediate sstable's metadata for lazy-combined iteration.
399 1 : // In this case, we only Next and never Seek. We set nextsUntilSeek=-1
400 1 : // to signal this intention.
401 1 : //
402 1 : // NB: At most one of flags.RelativeSeek() and flags.TrySeekUsingNext() may
403 1 : // be set, because the merging iterator re-seeks relative seeks with
404 1 : // explicitly only the RelativeSeek flag set.
405 1 : var nextsUntilSeek int
406 1 : var nextInsteadOfSeek bool
407 1 : if flags.TrySeekUsingNext() {
408 1 : nextInsteadOfSeek = true
409 1 : nextsUntilSeek = 4 // arbitrary
410 1 : }
411 1 : if flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized {
412 1 : nextInsteadOfSeek = true
413 1 : nextsUntilSeek = -1
414 1 : }
415 :
416 1 : var m *fileMetadata
417 1 : if nextInsteadOfSeek {
418 1 : m = l.iterFile
419 1 : } else {
420 1 : m = l.files.SeekGE(l.cmp, key)
421 1 : }
422 : // The below loop has a bit of an unusual organization. There are several
423 : // conditions under which we need to Next to a later file. If none of those
424 : // conditions are met, the file in `m` is okay to return. The loop body is
425 : // structured with a series of if statements, each of which may continue the
426 : // loop to the next file. If none of the statements are met, the end of the
427 : // loop body is a break.
428 1 : for m != nil {
429 1 : if m.HasRangeKeys {
430 1 : l.maybeTriggerCombinedIteration(m, +1)
431 1 :
432 1 : // Some files may only contain range keys, which we can skip.
433 1 : // NB: HasPointKeys=true if the file contains any points or range
434 1 : // deletions (which delete points).
435 1 : if !m.HasPointKeys {
436 1 : m = l.files.Next()
437 1 : continue
438 : }
439 : }
440 :
441 : // This file has point keys.
442 : //
443 : // However, there are a couple reasons why `m` may not be positioned ≥
444 : // `key` yet:
445 : //
446 : // 1. If SeekGE(key) landed on a file containing range keys, the file
447 : // may contain range keys ≥ `key` but no point keys ≥ `key`.
448 : // 2. When nexting instead of seeking, we must check to see whether
449 : // we've nexted sufficiently far, or we need to next again.
450 : //
451 : // If the file does not contain point keys ≥ `key`, next to continue
452 : // looking for a file that does.
453 1 : if (m.HasRangeKeys || nextInsteadOfSeek) && l.cmp(m.LargestPointKey.UserKey, key) < 0 {
454 1 : // If nextInsteadOfSeek is set and nextsUntilSeek is non-negative,
455 1 : // the iterator has been nexting hoping to discover the relevant
456 1 : // file without seeking. It's exhausted the allotted nextsUntilSeek
457 1 : // and should seek to the sought key.
458 1 : if nextInsteadOfSeek && nextsUntilSeek == 0 {
459 0 : nextInsteadOfSeek = false
460 0 : m = l.files.SeekGE(l.cmp, key)
461 0 : continue
462 1 : } else if nextsUntilSeek > 0 {
463 1 : nextsUntilSeek--
464 1 : }
465 1 : m = l.files.Next()
466 1 : continue
467 : }
468 :
469 : // This file has a point key bound ≥ `key`. But the largest point key
470 : // bound may still be a range deletion sentinel, which is exclusive. In
471 : // this case, the file doesn't actually contain any point keys equal to
472 : // `key`. We next to keep searching for a file that actually contains
473 : // point keys ≥ key.
474 : //
475 : // Additionally, this prevents loading untruncated range deletions from
476 : // a table which can't possibly contain the target key and is required
477 : // for correctness by mergingIter.SeekGE (see the comment in that
478 : // function).
479 1 : if m.LargestPointKey.IsExclusiveSentinel() && l.cmp(m.LargestPointKey.UserKey, key) == 0 {
480 1 : m = l.files.Next()
481 1 : continue
482 : }
483 :
484 : // This file contains point keys ≥ `key`. Break and return it.
485 1 : break
486 : }
487 1 : return m
488 : }
489 :
490 1 : func (l *levelIter) findFileLT(key []byte, flags base.SeekLTFlags) *fileMetadata {
491 1 : // Find the last file whose smallest key is < ikey.
492 1 :
493 1 : // Ordinarily we seek the LevelIterator using SeekLT.
494 1 : //
495 1 : // When lazy combined iteration is enabled, there's a complication. The
496 1 : // level iterator is responsible for watching for files containing range
497 1 : // keys and triggering the switch to combined iteration when such a file is
498 1 : // observed. If a range deletion was observed in a higher level causing the
499 1 : // merging iterator to seek the level to the range deletion's start key, we
500 1 : // need to check whether all of the files between the old position and the
501 1 : // new position contain any range keys.
502 1 : //
503 1 : // In this scenario, we don't seek the LevelIterator and instead we Prev it,
504 1 : // one file at a time, checking each for range keys.
505 1 : prevInsteadOfSeek := flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized
506 1 :
507 1 : var m *fileMetadata
508 1 : if prevInsteadOfSeek {
509 1 : m = l.iterFile
510 1 : } else {
511 1 : m = l.files.SeekLT(l.cmp, key)
512 1 : }
513 : // The below loop has a bit of an unusual organization. There are several
514 : // conditions under which we need to Prev to a previous file. If none of
515 : // those conditions are met, the file in `m` is okay to return. The loop
516 : // body is structured with a series of if statements, each of which may
517 : // continue the loop to the previous file. If none of the statements are
518 : // met, the end of the loop body is a break.
519 1 : for m != nil {
520 1 : if m.HasRangeKeys {
521 1 : l.maybeTriggerCombinedIteration(m, -1)
522 1 :
523 1 : // Some files may only contain range keys, which we can skip.
524 1 : // NB: HasPointKeys=true if the file contains any points or range
525 1 : // deletions (which delete points).
526 1 : if !m.HasPointKeys {
527 1 : m = l.files.Prev()
528 1 : continue
529 : }
530 : }
531 :
532 : // This file has point keys.
533 : //
534 : // However, there are a couple reasons why `m` may not be positioned <
535 : // `key` yet:
536 : //
537 : // 1. If SeekLT(key) landed on a file containing range keys, the file
538 : // may contain range keys < `key` but no point keys < `key`.
539 : // 2. When preving instead of seeking, we must check to see whether
540 : // we've preved sufficiently far, or we need to prev again.
541 : //
542 : // If the file does not contain point keys < `key`, prev to continue
543 : // looking for a file that does.
544 1 : if (m.HasRangeKeys || prevInsteadOfSeek) && l.cmp(m.SmallestPointKey.UserKey, key) >= 0 {
545 1 : m = l.files.Prev()
546 1 : continue
547 : }
548 :
549 : // This file contains point keys < `key`. Break and return it.
550 1 : break
551 : }
552 1 : return m
553 : }
554 :
555 : // Init the iteration bounds for the current table. Returns -1 if the table
556 : // lies fully before the lower bound, +1 if the table lies fully after the
557 : // upper bound, and 0 if the table overlaps the iteration bounds.
558 1 : func (l *levelIter) initTableBounds(f *fileMetadata) int {
559 1 : l.tableOpts.LowerBound = l.lower
560 1 : if l.tableOpts.LowerBound != nil {
561 1 : if l.cmp(f.LargestPointKey.UserKey, l.tableOpts.LowerBound) < 0 {
562 1 : // The largest key in the sstable is smaller than the lower bound.
563 1 : return -1
564 1 : }
565 1 : if l.cmp(l.tableOpts.LowerBound, f.SmallestPointKey.UserKey) <= 0 {
566 1 : // The lower bound is smaller or equal to the smallest key in the
567 1 : // table. Iteration within the table does not need to check the lower
568 1 : // bound.
569 1 : l.tableOpts.LowerBound = nil
570 1 : }
571 : }
572 1 : l.tableOpts.UpperBound = l.upper
573 1 : if l.tableOpts.UpperBound != nil {
574 1 : if l.cmp(f.SmallestPointKey.UserKey, l.tableOpts.UpperBound) >= 0 {
575 1 : // The smallest key in the sstable is greater than or equal to the upper
576 1 : // bound.
577 1 : return 1
578 1 : }
579 1 : if l.cmp(l.tableOpts.UpperBound, f.LargestPointKey.UserKey) > 0 {
580 1 : // The upper bound is greater than the largest key in the
581 1 : // table. Iteration within the table does not need to check the upper
582 1 : // bound. NB: tableOpts.UpperBound is exclusive and f.LargestPointKey is
583 1 : // inclusive.
584 1 : l.tableOpts.UpperBound = nil
585 1 : }
586 : }
587 1 : return 0
588 : }
589 :
590 : type loadFileReturnIndicator int8
591 :
592 : const (
593 : noFileLoaded loadFileReturnIndicator = iota
594 : fileAlreadyLoaded
595 : newFileLoaded
596 : )
597 :
598 1 : func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicator {
599 1 : l.smallestBoundary = nil
600 1 : l.largestBoundary = nil
601 1 : if l.boundaryContext != nil {
602 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
603 1 : l.boundaryContext.isIgnorableBoundaryKey = false
604 1 : }
605 1 : if l.iterFile == file {
606 1 : if l.err != nil {
607 0 : return noFileLoaded
608 0 : }
609 1 : if l.iter != nil {
610 1 : // We don't bother comparing the file bounds with the iteration bounds when we have
611 1 : // an already open iterator. It is possible that the iter may not be relevant given the
612 1 : // current iteration bounds, but it knows those bounds, so it will enforce them.
613 1 : if l.rangeDelIterPtr != nil {
614 1 : *l.rangeDelIterPtr = l.rangeDelIterCopy
615 1 : }
616 :
617 : // There are a few reasons we might not have triggered combined
618 : // iteration yet, even though we already had `file` open.
619 : // 1. If the bounds changed, we might have previously avoided
620 : // switching to combined iteration because the bounds excluded
621 : // the range keys contained in this file.
622 : // 2. If an existing iterator was reconfigured to iterate over range
623 : // keys (eg, using SetOptions), then we wouldn't have triggered
624 : // the switch to combined iteration yet.
625 1 : l.maybeTriggerCombinedIteration(file, dir)
626 1 : return fileAlreadyLoaded
627 : }
628 : // We were already at file, but don't have an iterator, probably because the file was
629 : // beyond the iteration bounds. It may still be, but it is also possible that the bounds
630 : // have changed. We handle that below.
631 : }
632 :
633 : // Close both iter and rangeDelIterPtr. While mergingIter knows about
634 : // rangeDelIterPtr, it can't call Close() on it because it does not know
635 : // when the levelIter will switch it. Note that levelIter.Close() can be
636 : // called multiple times.
637 1 : if err := l.Close(); err != nil {
638 1 : return noFileLoaded
639 1 : }
640 :
641 1 : for {
642 1 : l.iterFile = file
643 1 : if file == nil {
644 1 : return noFileLoaded
645 1 : }
646 :
647 1 : l.maybeTriggerCombinedIteration(file, dir)
648 1 : if !file.HasPointKeys {
649 1 : switch dir {
650 1 : case +1:
651 1 : file = l.files.Next()
652 1 : continue
653 1 : case -1:
654 1 : file = l.files.Prev()
655 1 : continue
656 : }
657 : }
658 :
659 1 : switch l.initTableBounds(file) {
660 1 : case -1:
661 1 : // The largest key in the sstable is smaller than the lower bound.
662 1 : if dir < 0 {
663 1 : return noFileLoaded
664 1 : }
665 1 : file = l.files.Next()
666 1 : continue
667 1 : case +1:
668 1 : // The smallest key in the sstable is greater than or equal to the upper
669 1 : // bound.
670 1 : if dir > 0 {
671 1 : return noFileLoaded
672 1 : }
673 1 : file = l.files.Prev()
674 1 : continue
675 : }
676 :
677 1 : var rangeDelIter keyspan.FragmentIterator
678 1 : var iter internalIterator
679 1 : iter, rangeDelIter, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts)
680 1 : l.iter = iter
681 1 : if l.err != nil {
682 1 : return noFileLoaded
683 1 : }
684 1 : if rangeDelIter != nil {
685 1 : if fi, ok := iter.(filteredIter); ok {
686 1 : l.filteredIter = fi
687 1 : } else {
688 0 : l.filteredIter = nil
689 0 : }
690 1 : } else {
691 1 : l.filteredIter = nil
692 1 : }
693 1 : if l.rangeDelIterPtr != nil {
694 1 : *l.rangeDelIterPtr = rangeDelIter
695 1 : l.rangeDelIterCopy = rangeDelIter
696 1 : } else if rangeDelIter != nil {
697 1 : rangeDelIter.Close()
698 1 : }
699 1 : if l.boundaryContext != nil {
700 1 : l.boundaryContext.smallestUserKey = file.Smallest.UserKey
701 1 : l.boundaryContext.largestUserKey = file.Largest.UserKey
702 1 : l.boundaryContext.isLargestUserKeyExclusive = file.Largest.IsExclusiveSentinel()
703 1 : }
704 1 : return newFileLoaded
705 : }
706 : }
707 :
708 : // In race builds we verify that the keys returned by levelIter lie within
709 : // [lower,upper).
710 1 : func (l *levelIter) verify(key *InternalKey, val base.LazyValue) (*InternalKey, base.LazyValue) {
711 1 : // Note that invariants.Enabled is a compile time constant, which means the
712 1 : // block of code will be compiled out of normal builds making this method
713 1 : // eligible for inlining. Do not change this to use a variable.
714 1 : if invariants.Enabled && !l.disableInvariants && key != nil {
715 1 : // We allow returning a boundary key that is outside of the lower/upper
716 1 : // bounds as such keys are always range tombstones which will be skipped by
717 1 : // the Iterator.
718 1 : if l.lower != nil && key != l.smallestBoundary && l.cmp(key.UserKey, l.lower) < 0 {
719 0 : l.logger.Fatalf("levelIter %s: lower bound violation: %s < %s\n%s", l.level, key, l.lower, debug.Stack())
720 0 : }
721 1 : if l.upper != nil && key != l.largestBoundary && l.cmp(key.UserKey, l.upper) > 0 {
722 0 : l.logger.Fatalf("levelIter %s: upper bound violation: %s > %s\n%s", l.level, key, l.upper, debug.Stack())
723 0 : }
724 : }
725 1 : return key, val
726 : }
727 :
728 1 : func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
729 1 : l.err = nil // clear cached iteration error
730 1 : if l.boundaryContext != nil {
731 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
732 1 : l.boundaryContext.isIgnorableBoundaryKey = false
733 1 : }
734 : // NB: the top-level Iterator has already adjusted key based on
735 : // IterOptions.LowerBound.
736 1 : loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1)
737 1 : if loadFileIndicator == noFileLoaded {
738 1 : return nil, base.LazyValue{}
739 1 : }
740 1 : if loadFileIndicator == newFileLoaded {
741 1 : // File changed, so l.iter has changed, and that iterator is not
742 1 : // positioned appropriately.
743 1 : flags = flags.DisableTrySeekUsingNext()
744 1 : }
745 1 : if ikey, val := l.iter.SeekGE(key, flags); ikey != nil {
746 1 : return l.verify(ikey, val)
747 1 : }
748 1 : return l.verify(l.skipEmptyFileForward())
749 : }
750 :
751 : func (l *levelIter) SeekPrefixGE(
752 : prefix, key []byte, flags base.SeekGEFlags,
753 1 : ) (*base.InternalKey, base.LazyValue) {
754 1 : l.err = nil // clear cached iteration error
755 1 : if l.boundaryContext != nil {
756 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
757 1 : l.boundaryContext.isIgnorableBoundaryKey = false
758 1 : }
759 :
760 : // NB: the top-level Iterator has already adjusted key based on
761 : // IterOptions.LowerBound.
762 1 : loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1)
763 1 : if loadFileIndicator == noFileLoaded {
764 1 : return nil, base.LazyValue{}
765 1 : }
766 1 : if loadFileIndicator == newFileLoaded {
767 1 : // File changed, so l.iter has changed, and that iterator is not
768 1 : // positioned appropriately.
769 1 : flags = flags.DisableTrySeekUsingNext()
770 1 : }
771 1 : if key, val := l.iter.SeekPrefixGE(prefix, key, flags); key != nil {
772 1 : return l.verify(key, val)
773 1 : }
774 : // When SeekPrefixGE returns nil, we have not necessarily reached the end of
775 : // the sstable. All we know is that a key with prefix does not exist in the
776 : // current sstable. We do know that the key lies within the bounds of the
777 : // table as findFileGE found the table where key <= meta.Largest. We return
778 : // the table's bound with isIgnorableBoundaryKey set.
779 1 : if l.rangeDelIterPtr != nil && *l.rangeDelIterPtr != nil {
780 1 : if l.tableOpts.UpperBound != nil {
781 1 : l.syntheticBoundary.UserKey = l.tableOpts.UpperBound
782 1 : l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel
783 1 : l.largestBoundary = &l.syntheticBoundary
784 1 : if l.boundaryContext != nil {
785 1 : l.boundaryContext.isSyntheticIterBoundsKey = true
786 1 : l.boundaryContext.isIgnorableBoundaryKey = false
787 1 : }
788 1 : return l.verify(l.largestBoundary, base.LazyValue{})
789 : }
790 : // Return the file's largest bound, ensuring this file stays open until
791 : // the mergingIter advances beyond the file's bounds. We set
792 : // isIgnorableBoundaryKey to signal that the actual key returned should
793 : // be ignored, and does not represent a real key in the database.
794 1 : l.largestBoundary = &l.iterFile.LargestPointKey
795 1 : if l.boundaryContext != nil {
796 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
797 1 : l.boundaryContext.isIgnorableBoundaryKey = true
798 1 : }
799 1 : return l.verify(l.largestBoundary, base.LazyValue{})
800 : }
801 : // It is possible that we are here because bloom filter matching failed. In
802 : // that case it is likely that all keys matching the prefix are wholly
803 : // within the current file and cannot be in the subsequent file. In that
804 : // case we don't want to go to the next file, since loading and seeking in
805 : // there has some cost. Additionally, for sparse key spaces, loading the
806 : // next file will defeat the optimization for the next SeekPrefixGE that is
807 : // called with flags.TrySeekUsingNext(), since for sparse key spaces it is
808 : // likely that the next key will also be contained in the current file.
809 1 : var n int
810 1 : if l.split != nil {
811 1 : // If the split function is specified, calculate the prefix length accordingly.
812 1 : n = l.split(l.iterFile.LargestPointKey.UserKey)
813 1 : } else {
814 0 : // If the split function is not specified, the entire key is used as the
815 0 : // prefix. This case can occur when getIter uses SeekPrefixGE.
816 0 : n = len(l.iterFile.LargestPointKey.UserKey)
817 0 : }
818 1 : if l.cmp(prefix, l.iterFile.LargestPointKey.UserKey[:n]) < 0 {
819 1 : return nil, base.LazyValue{}
820 1 : }
821 0 : return l.verify(l.skipEmptyFileForward())
822 : }
823 :
824 1 : func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
825 1 : l.err = nil // clear cached iteration error
826 1 : if l.boundaryContext != nil {
827 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
828 1 : l.boundaryContext.isIgnorableBoundaryKey = false
829 1 : }
830 :
831 : // NB: the top-level Iterator has already adjusted key based on
832 : // IterOptions.UpperBound.
833 1 : if l.loadFile(l.findFileLT(key, flags), -1) == noFileLoaded {
834 1 : return nil, base.LazyValue{}
835 1 : }
836 1 : if key, val := l.iter.SeekLT(key, flags); key != nil {
837 1 : return l.verify(key, val)
838 1 : }
839 1 : return l.verify(l.skipEmptyFileBackward())
840 : }
841 :
842 1 : func (l *levelIter) First() (*InternalKey, base.LazyValue) {
843 1 : l.err = nil // clear cached iteration error
844 1 : if l.boundaryContext != nil {
845 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
846 1 : l.boundaryContext.isIgnorableBoundaryKey = false
847 1 : }
848 :
849 : // NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is
850 : // set.
851 1 : if l.loadFile(l.files.First(), +1) == noFileLoaded {
852 1 : return nil, base.LazyValue{}
853 1 : }
854 1 : if key, val := l.iter.First(); key != nil {
855 1 : return l.verify(key, val)
856 1 : }
857 1 : return l.verify(l.skipEmptyFileForward())
858 : }
859 :
860 1 : func (l *levelIter) Last() (*InternalKey, base.LazyValue) {
861 1 : l.err = nil // clear cached iteration error
862 1 : if l.boundaryContext != nil {
863 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
864 1 : l.boundaryContext.isIgnorableBoundaryKey = false
865 1 : }
866 :
867 : // NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is
868 : // set.
869 1 : if l.loadFile(l.files.Last(), -1) == noFileLoaded {
870 1 : return nil, base.LazyValue{}
871 1 : }
872 1 : if key, val := l.iter.Last(); key != nil {
873 1 : return l.verify(key, val)
874 1 : }
875 1 : return l.verify(l.skipEmptyFileBackward())
876 : }
877 :
878 1 : func (l *levelIter) Next() (*InternalKey, base.LazyValue) {
879 1 : if l.err != nil || l.iter == nil {
880 1 : return nil, base.LazyValue{}
881 1 : }
882 1 : if l.boundaryContext != nil {
883 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
884 1 : l.boundaryContext.isIgnorableBoundaryKey = false
885 1 : }
886 :
887 1 : switch {
888 1 : case l.largestBoundary != nil:
889 1 : if l.tableOpts.UpperBound != nil {
890 1 : // The UpperBound was within this file, so don't load the next
891 1 : // file. We leave the largestBoundary unchanged so that subsequent
892 1 : // calls to Next() stay at this file. If a Seek/First/Last call is
893 1 : // made and this file continues to be relevant, loadFile() will
894 1 : // set the largestBoundary to nil.
895 1 : if l.rangeDelIterPtr != nil {
896 1 : *l.rangeDelIterPtr = nil
897 1 : }
898 1 : return nil, base.LazyValue{}
899 : }
900 : // We're stepping past the boundary key, so now we can load the next file.
901 1 : if l.loadFile(l.files.Next(), +1) != noFileLoaded {
902 1 : if key, val := l.iter.First(); key != nil {
903 1 : return l.verify(key, val)
904 1 : }
905 1 : return l.verify(l.skipEmptyFileForward())
906 : }
907 1 : return nil, base.LazyValue{}
908 :
909 1 : default:
910 1 : // Reset the smallest boundary since we're moving away from it.
911 1 : l.smallestBoundary = nil
912 1 : if key, val := l.iter.Next(); key != nil {
913 1 : return l.verify(key, val)
914 1 : }
915 : }
916 1 : return l.verify(l.skipEmptyFileForward())
917 : }
918 :
919 1 : func (l *levelIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
920 1 : if l.err != nil || l.iter == nil {
921 0 : return nil, base.LazyValue{}
922 0 : }
923 1 : if l.boundaryContext != nil {
924 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
925 1 : l.boundaryContext.isIgnorableBoundaryKey = false
926 1 : }
927 :
928 1 : switch {
929 0 : case l.largestBoundary != nil:
930 0 : if l.tableOpts.UpperBound != nil {
931 0 : // The UpperBound was within this file, so don't load the next
932 0 : // file. We leave the largestBoundary unchanged so that subsequent
933 0 : // calls to Next() stay at this file. If a Seek/First/Last call is
934 0 : // made and this file continues to be relevant, loadFile() will
935 0 : // set the largestBoundary to nil.
936 0 : if l.rangeDelIterPtr != nil {
937 0 : *l.rangeDelIterPtr = nil
938 0 : }
939 0 : return nil, base.LazyValue{}
940 : }
941 : // We're stepping past the boundary key, so we need to load a later
942 : // file.
943 :
944 1 : default:
945 1 : // Reset the smallest boundary since we're moving away from it.
946 1 : l.smallestBoundary = nil
947 1 :
948 1 : if key, val := l.iter.NextPrefix(succKey); key != nil {
949 1 : return l.verify(key, val)
950 1 : }
951 : // Fall through to seeking.
952 : }
953 :
954 : // Seek the manifest level iterator using TrySeekUsingNext=true and
955 : // RelativeSeek=true so that we take advantage of the knowledge that
956 : // `succKey` can only be contained in later files.
957 1 : metadataSeekFlags := base.SeekGEFlagsNone.EnableTrySeekUsingNext().EnableRelativeSeek()
958 1 : if l.loadFile(l.findFileGE(succKey, metadataSeekFlags), +1) != noFileLoaded {
959 1 : // NB: The SeekGE on the file's iterator must not set TrySeekUsingNext,
960 1 : // because l.iter is unpositioned.
961 1 : if key, val := l.iter.SeekGE(succKey, base.SeekGEFlagsNone); key != nil {
962 1 : return l.verify(key, val)
963 1 : }
964 0 : return l.verify(l.skipEmptyFileForward())
965 : }
966 0 : return nil, base.LazyValue{}
967 : }
968 :
969 1 : func (l *levelIter) Prev() (*InternalKey, base.LazyValue) {
970 1 : if l.err != nil || l.iter == nil {
971 1 : return nil, base.LazyValue{}
972 1 : }
973 1 : if l.boundaryContext != nil {
974 1 : l.boundaryContext.isSyntheticIterBoundsKey = false
975 1 : l.boundaryContext.isIgnorableBoundaryKey = false
976 1 : }
977 :
978 1 : switch {
979 1 : case l.smallestBoundary != nil:
980 1 : if l.tableOpts.LowerBound != nil {
981 1 : // The LowerBound was within this file, so don't load the previous
982 1 : // file. We leave the smallestBoundary unchanged so that
983 1 : // subsequent calls to Prev() stay at this file. If a
984 1 : // Seek/First/Last call is made and this file continues to be
985 1 : // relevant, loadFile() will set the smallestBoundary to nil.
986 1 : if l.rangeDelIterPtr != nil {
987 1 : *l.rangeDelIterPtr = nil
988 1 : }
989 1 : return nil, base.LazyValue{}
990 : }
991 : // We're stepping past the boundary key, so now we can load the prev file.
992 1 : if l.loadFile(l.files.Prev(), -1) != noFileLoaded {
993 1 : if key, val := l.iter.Last(); key != nil {
994 1 : return l.verify(key, val)
995 1 : }
996 1 : return l.verify(l.skipEmptyFileBackward())
997 : }
998 1 : return nil, base.LazyValue{}
999 :
1000 1 : default:
1001 1 : // Reset the largest boundary since we're moving away from it.
1002 1 : l.largestBoundary = nil
1003 1 : if key, val := l.iter.Prev(); key != nil {
1004 1 : return l.verify(key, val)
1005 1 : }
1006 : }
1007 1 : return l.verify(l.skipEmptyFileBackward())
1008 : }
1009 :
1010 1 : func (l *levelIter) skipEmptyFileForward() (*InternalKey, base.LazyValue) {
1011 1 : var key *InternalKey
1012 1 : var val base.LazyValue
1013 1 : // The first iteration of this loop starts with an already exhausted
1014 1 : // l.iter. The reason for the exhaustion is either that we iterated to the
1015 1 : // end of the sstable, or our iteration was terminated early due to the
1016 1 : // presence of an upper-bound or the use of SeekPrefixGE. If
1017 1 : // l.rangeDelIterPtr is non-nil, we may need to pretend the iterator is
1018 1 : // not exhausted to allow for the merging to finish consuming the
1019 1 : // l.rangeDelIterPtr before levelIter switches the rangeDelIter from
1020 1 : // under it. This pretense is done by either generating a synthetic
1021 1 : // boundary key or returning the largest key of the file, depending on the
1022 1 : // exhaustion reason.
1023 1 :
1024 1 : // Subsequent iterations will examine consecutive files such that the first
1025 1 : // file that does not have an exhausted iterator causes the code to return
1026 1 : // that key, else the behavior described above if there is a corresponding
1027 1 : // rangeDelIterPtr.
1028 1 : for ; key == nil; key, val = l.iter.First() {
1029 1 : if l.rangeDelIterPtr != nil {
1030 1 : // We're being used as part of a mergingIter and we've exhausted the
1031 1 : // current sstable. If an upper bound is present and the upper bound lies
1032 1 : // within the current sstable, then we will have reached the upper bound
1033 1 : // rather than the end of the sstable. We need to return a synthetic
1034 1 : // boundary key so that mergingIter can use the range tombstone iterator
1035 1 : // until the other levels have reached this boundary.
1036 1 : //
1037 1 : // It is safe to set the boundary key to the UpperBound user key
1038 1 : // with the RANGEDEL sentinel since it is the smallest InternalKey
1039 1 : // that matches the exclusive upper bound, and does not represent
1040 1 : // a real key.
1041 1 : if l.tableOpts.UpperBound != nil {
1042 1 : if *l.rangeDelIterPtr != nil {
1043 1 : l.syntheticBoundary.UserKey = l.tableOpts.UpperBound
1044 1 : l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel
1045 1 : l.largestBoundary = &l.syntheticBoundary
1046 1 : if l.boundaryContext != nil {
1047 1 : l.boundaryContext.isSyntheticIterBoundsKey = true
1048 1 : }
1049 1 : return l.largestBoundary, base.LazyValue{}
1050 : }
1051 : // Else there are no range deletions in this sstable. This
1052 : // helps with performance when many levels are populated with
1053 : // sstables and most don't have any actual keys within the
1054 : // bounds.
1055 1 : return nil, base.LazyValue{}
1056 : }
1057 : // If the boundary is a range deletion tombstone, return that key.
1058 1 : if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete {
1059 1 : l.largestBoundary = &l.iterFile.LargestPointKey
1060 1 : if l.boundaryContext != nil {
1061 1 : l.boundaryContext.isIgnorableBoundaryKey = true
1062 1 : }
1063 1 : return l.largestBoundary, base.LazyValue{}
1064 : }
1065 : // If the last point iterator positioning op might've skipped keys,
1066 : // it's possible the file's range deletions are still relevant to
1067 : // other levels. Return the largest boundary as a special ignorable
1068 : // marker to avoid advancing to the next file.
1069 : //
1070 : // The sstable iterator cannot guarantee that keys were skipped. A
1071 : // SeekGE that lands on a index separator k only knows that the
1072 : // block at the index entry contains keys ≤ k. We can't know whether
1073 : // there were actually keys between the seek key and the index
1074 : // separator key. If the block is then excluded due to block
1075 : // property filters, the iterator does not know whether keys were
1076 : // actually skipped by the block's exclusion.
1077 : //
1078 : // Since MaybeFilteredKeys cannot guarantee that keys were skipped,
1079 : // it's possible l.iterFile.Largest was already returned. Returning
1080 : // l.iterFile.Largest again is a violation of the strict
1081 : // monotonicity normally provided. The mergingIter's heap can
1082 : // tolerate this repeat key and in this case will keep the level at
1083 : // the top of the heap and immediately skip the entry, advancing to
1084 : // the next file.
1085 1 : if *l.rangeDelIterPtr != nil && l.filteredIter != nil &&
1086 1 : l.filteredIter.MaybeFilteredKeys() {
1087 0 : l.largestBoundary = &l.iterFile.Largest
1088 0 : if l.boundaryContext != nil {
1089 0 : l.boundaryContext.isIgnorableBoundaryKey = true
1090 0 : }
1091 0 : return l.largestBoundary, base.LazyValue{}
1092 : }
1093 : }
1094 :
1095 : // Current file was exhausted. Move to the next file.
1096 1 : if l.loadFile(l.files.Next(), +1) == noFileLoaded {
1097 1 : return nil, base.LazyValue{}
1098 1 : }
1099 : }
1100 1 : return key, val
1101 : }
1102 :
1103 1 : func (l *levelIter) skipEmptyFileBackward() (*InternalKey, base.LazyValue) {
1104 1 : var key *InternalKey
1105 1 : var val base.LazyValue
1106 1 : // The first iteration of this loop starts with an already exhausted
1107 1 : // l.iter. The reason for the exhaustion is either that we iterated to the
1108 1 : // end of the sstable, or our iteration was terminated early due to the
1109 1 : // presence of a lower-bound. If l.rangeDelIterPtr is non-nil, we may need
1110 1 : // to pretend the iterator is not exhausted to allow for the merging to
1111 1 : // finish consuming the l.rangeDelIterPtr before levelIter switches the
1112 1 : // rangeDelIter from under it. This pretense is done by either generating
1113 1 : // a synthetic boundary key or returning the smallest key of the file,
1114 1 : // depending on the exhaustion reason.
1115 1 :
1116 1 : // Subsequent iterations will examine consecutive files such that the first
1117 1 : // file that does not have an exhausted iterator causes the code to return
1118 1 : // that key, else the behavior described above if there is a corresponding
1119 1 : // rangeDelIterPtr.
1120 1 : for ; key == nil; key, val = l.iter.Last() {
1121 1 : if l.rangeDelIterPtr != nil {
1122 1 : // We're being used as part of a mergingIter and we've exhausted the
1123 1 : // current sstable. If a lower bound is present and the lower bound lies
1124 1 : // within the current sstable, then we will have reached the lower bound
1125 1 : // rather than the beginning of the sstable. We need to return a
1126 1 : // synthetic boundary key so that mergingIter can use the range tombstone
1127 1 : // iterator until the other levels have reached this boundary.
1128 1 : //
1129 1 : // It is safe to set the boundary key to the LowerBound user key
1130 1 : // with the RANGEDEL sentinel since it is the smallest InternalKey
1131 1 : // that is within the inclusive lower bound, and does not
1132 1 : // represent a real key.
1133 1 : if l.tableOpts.LowerBound != nil {
1134 1 : if *l.rangeDelIterPtr != nil {
1135 1 : l.syntheticBoundary.UserKey = l.tableOpts.LowerBound
1136 1 : l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel
1137 1 : l.smallestBoundary = &l.syntheticBoundary
1138 1 : if l.boundaryContext != nil {
1139 1 : l.boundaryContext.isSyntheticIterBoundsKey = true
1140 1 : }
1141 1 : return l.smallestBoundary, base.LazyValue{}
1142 : }
1143 : // Else there are no range deletions in this sstable. This
1144 : // helps with performance when many levels are populated with
1145 : // sstables and most don't have any actual keys within the
1146 : // bounds.
1147 1 : return nil, base.LazyValue{}
1148 : }
1149 : // If the boundary is a range deletion tombstone, return that key.
1150 1 : if l.iterFile.SmallestPointKey.Kind() == InternalKeyKindRangeDelete {
1151 1 : l.smallestBoundary = &l.iterFile.SmallestPointKey
1152 1 : if l.boundaryContext != nil {
1153 1 : l.boundaryContext.isIgnorableBoundaryKey = true
1154 1 : }
1155 1 : return l.smallestBoundary, base.LazyValue{}
1156 : }
1157 : // If the last point iterator positioning op skipped keys, it's
1158 : // possible the file's range deletions are still relevant to other
1159 : // levels. Return the smallest boundary as a special ignorable key
1160 : // to avoid advancing to the next file.
1161 : //
1162 : // The sstable iterator cannot guarantee that keys were skipped. A
1163 : // SeekGE that lands on a index separator k only knows that the
1164 : // block at the index entry contains keys ≤ k. We can't know whether
1165 : // there were actually keys between the seek key and the index
1166 : // separator key. If the block is then excluded due to block
1167 : // property filters, the iterator does not know whether keys were
1168 : // actually skipped by the block's exclusion.
1169 : //
1170 : // Since MaybeFilteredKeys cannot guarantee that keys were skipped,
1171 : // it's possible l.iterFile.Smallest was already returned. Returning
1172 : // l.iterFile.Smallest again is a violation of the strict
1173 : // monotonicity normally provided. The mergingIter's heap can
1174 : // tolerate this repeat key and in this case will keep the level at
1175 : // the top of the heap and immediately skip the entry, advancing to
1176 : // the next file.
1177 1 : if *l.rangeDelIterPtr != nil && l.filteredIter != nil && l.filteredIter.MaybeFilteredKeys() {
1178 0 : l.smallestBoundary = &l.iterFile.Smallest
1179 0 : if l.boundaryContext != nil {
1180 0 : l.boundaryContext.isIgnorableBoundaryKey = true
1181 0 : }
1182 0 : return l.smallestBoundary, base.LazyValue{}
1183 : }
1184 : }
1185 :
1186 : // Current file was exhausted. Move to the previous file.
1187 1 : if l.loadFile(l.files.Prev(), -1) == noFileLoaded {
1188 1 : return nil, base.LazyValue{}
1189 1 : }
1190 : }
1191 1 : return key, val
1192 : }
1193 :
1194 1 : func (l *levelIter) Error() error {
1195 1 : if l.err != nil || l.iter == nil {
1196 1 : return l.err
1197 1 : }
1198 1 : return l.iter.Error()
1199 : }
1200 :
1201 1 : func (l *levelIter) Close() error {
1202 1 : if l.iter != nil {
1203 1 : l.err = l.iter.Close()
1204 1 : l.iter = nil
1205 1 : }
1206 1 : if l.rangeDelIterPtr != nil {
1207 1 : if t := l.rangeDelIterCopy; t != nil {
1208 1 : l.err = firstError(l.err, t.Close())
1209 1 : }
1210 1 : *l.rangeDelIterPtr = nil
1211 1 : l.rangeDelIterCopy = nil
1212 : }
1213 1 : return l.err
1214 : }
1215 :
1216 1 : func (l *levelIter) SetBounds(lower, upper []byte) {
1217 1 : l.lower = lower
1218 1 : l.upper = upper
1219 1 :
1220 1 : if l.iter == nil {
1221 1 : return
1222 1 : }
1223 :
1224 : // Update tableOpts.{Lower,Upper}Bound in case the new boundaries fall within
1225 : // the boundaries of the current table.
1226 1 : if l.initTableBounds(l.iterFile) != 0 {
1227 1 : // The table does not overlap the bounds. Close() will set levelIter.err if
1228 1 : // an error occurs.
1229 1 : _ = l.Close()
1230 1 : return
1231 1 : }
1232 :
1233 1 : l.iter.SetBounds(l.tableOpts.LowerBound, l.tableOpts.UpperBound)
1234 : }
1235 :
1236 1 : func (l *levelIter) SetContext(ctx context.Context) {
1237 1 : l.ctx = ctx
1238 1 : if l.iter != nil {
1239 0 : // TODO(sumeer): this is losing the ctx = objiotracing.WithLevel(ctx,
1240 0 : // manifest.LevelToInt(opts.level)) that happens in table_cache.go.
1241 0 : l.iter.SetContext(ctx)
1242 0 : }
1243 : }
1244 :
1245 1 : func (l *levelIter) String() string {
1246 1 : if l.iterFile != nil {
1247 1 : return fmt.Sprintf("%s: fileNum=%s", l.level, l.iter.String())
1248 1 : }
1249 0 : return fmt.Sprintf("%s: fileNum=<nil>", l.level)
1250 : }
1251 :
1252 : var _ internalIterator = &levelIter{}
|