Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "io"
11 : "sync"
12 : "unsafe"
13 :
14 : "github.com/cockroachdb/errors"
15 : "github.com/cockroachdb/pebble/internal/base"
16 : "github.com/cockroachdb/pebble/internal/bytealloc"
17 : "github.com/cockroachdb/pebble/internal/fastrand"
18 : "github.com/cockroachdb/pebble/internal/humanize"
19 : "github.com/cockroachdb/pebble/internal/invariants"
20 : "github.com/cockroachdb/pebble/internal/keyspan"
21 : "github.com/cockroachdb/pebble/internal/manifest"
22 : "github.com/cockroachdb/pebble/internal/rangekey"
23 : "github.com/cockroachdb/pebble/sstable"
24 : "github.com/cockroachdb/redact"
25 : )
26 :
27 : // iterPos describes the state of the internal iterator, in terms of whether it
28 : // is at the position returned to the user (cur), one ahead of the position
29 : // returned (next for forward iteration and prev for reverse iteration). The cur
30 : // position is split into two states, for forward and reverse iteration, since
31 : // we need to differentiate for switching directions.
32 : //
33 : // There is subtlety in what is considered the current position of the Iterator.
34 : // The internal iterator exposes a sequence of internal keys. There is not
35 : // always a single internalIterator position corresponding to the position
36 : // returned to the user. Consider the example:
37 : //
38 : // a.MERGE.9 a.MERGE.8 a.MERGE.7 a.SET.6 b.DELETE.9 b.DELETE.5 b.SET.4
39 : // \ /
40 : // \ Iterator.Key() = 'a' /
41 : //
42 : // The Iterator exposes one valid position at user key 'a' and the two exhausted
43 : // positions at the beginning and end of iteration. The underlying
44 : // internalIterator contains 7 valid positions and 2 exhausted positions.
45 : //
46 : // Iterator positioning methods must set iterPos to iterPosCur{Foward,Backward}
47 : // iff the user key at the current internalIterator position equals the
48 : // Iterator.Key returned to the user. This guarantees that a call to nextUserKey
49 : // or prevUserKey will advance to the next or previous iterator position.
50 : // iterPosCur{Forward,Backward} does not make any guarantee about the internal
51 : // iterator position among internal keys with matching user keys, and it will
52 : // vary subtly depending on the particular key kinds encountered. In the above
53 : // example, the iterator returning 'a' may set iterPosCurForward if the internal
54 : // iterator is positioned at any of a.MERGE.9, a.MERGE.8, a.MERGE.7 or a.SET.6.
55 : //
56 : // When setting iterPos to iterPosNext or iterPosPrev, the internal iterator
57 : // must be advanced to the first internalIterator position at a user key greater
58 : // (iterPosNext) or less (iterPosPrev) than the key returned to the user. An
59 : // internalIterator position that's !Valid() must also be considered greater or
60 : // less—depending on the direction of iteration—than the last valid Iterator
61 : // position.
62 : type iterPos int8
63 :
64 : const (
65 : iterPosCurForward iterPos = 0
66 : iterPosNext iterPos = 1
67 : iterPosPrev iterPos = -1
68 : iterPosCurReverse iterPos = -2
69 :
70 : // For limited iteration. When the iterator is at iterPosCurForwardPaused
71 : // - Next*() call should behave as if the internal iterator is already
72 : // at next (akin to iterPosNext).
73 : // - Prev*() call should behave as if the internal iterator is at the
74 : // current key (akin to iterPosCurForward).
75 : //
76 : // Similar semantics apply to CurReversePaused.
77 : iterPosCurForwardPaused iterPos = 2
78 : iterPosCurReversePaused iterPos = -3
79 : )
80 :
81 : // Approximate gap in bytes between samples of data read during iteration.
82 : // This is multiplied with a default ReadSamplingMultiplier of 1 << 4 to yield
83 : // 1 << 20 (1MB). The 1MB factor comes from:
84 : // https://github.com/cockroachdb/pebble/issues/29#issuecomment-494477985
85 : const readBytesPeriod uint64 = 1 << 16
86 :
87 : var errReversePrefixIteration = errors.New("pebble: unsupported reverse prefix iteration")
88 :
89 : // IteratorMetrics holds per-iterator metrics. These do not change over the
90 : // lifetime of the iterator.
91 : type IteratorMetrics struct {
92 : // The read amplification experienced by this iterator. This is the sum of
93 : // the memtables, the L0 sublevels and the non-empty Ln levels. Higher read
94 : // amplification generally results in slower reads, though allowing higher
95 : // read amplification can also result in faster writes.
96 : ReadAmp int
97 : }
98 :
99 : // IteratorStatsKind describes the two kind of iterator stats.
100 : type IteratorStatsKind int8
101 :
102 : const (
103 : // InterfaceCall represents calls to Iterator.
104 : InterfaceCall IteratorStatsKind = iota
105 : // InternalIterCall represents calls by Iterator to its internalIterator.
106 : InternalIterCall
107 : // NumStatsKind is the number of kinds, and is used for array sizing.
108 : NumStatsKind
109 : )
110 :
111 : // IteratorStats contains iteration stats.
112 : type IteratorStats struct {
113 : // ForwardSeekCount includes SeekGE, SeekPrefixGE, First.
114 : ForwardSeekCount [NumStatsKind]int
115 : // ReverseSeek includes SeekLT, Last.
116 : ReverseSeekCount [NumStatsKind]int
117 : // ForwardStepCount includes Next.
118 : ForwardStepCount [NumStatsKind]int
119 : // ReverseStepCount includes Prev.
120 : ReverseStepCount [NumStatsKind]int
121 : InternalStats InternalIteratorStats
122 : RangeKeyStats RangeKeyIteratorStats
123 : }
124 :
125 : var _ redact.SafeFormatter = &IteratorStats{}
126 :
127 : // InternalIteratorStats contains miscellaneous stats produced by internal
128 : // iterators.
129 : type InternalIteratorStats = base.InternalIteratorStats
130 :
131 : // RangeKeyIteratorStats contains miscellaneous stats about range keys
132 : // encountered by the iterator.
133 : type RangeKeyIteratorStats struct {
134 : // Count records the number of range keys encountered during
135 : // iteration. Range keys may be counted multiple times if the iterator
136 : // leaves a range key's bounds and then returns.
137 : Count int
138 : // ContainedPoints records the number of point keys encountered within the
139 : // bounds of a range key. Note that this includes point keys with suffixes
140 : // that sort both above and below the covering range key's suffix.
141 : ContainedPoints int
142 : // SkippedPoints records the count of the subset of ContainedPoints point
143 : // keys that were skipped during iteration due to range-key masking. It does
144 : // not include point keys that were never loaded because a
145 : // RangeKeyMasking.Filter excluded the entire containing block.
146 : SkippedPoints int
147 : }
148 :
149 : // Merge adds all of the argument's statistics to the receiver. It may be used
150 : // to accumulate stats across multiple iterators.
151 0 : func (s *RangeKeyIteratorStats) Merge(o RangeKeyIteratorStats) {
152 0 : s.Count += o.Count
153 0 : s.ContainedPoints += o.ContainedPoints
154 0 : s.SkippedPoints += o.SkippedPoints
155 0 : }
156 :
157 : // LazyValue is a lazy value. See the long comment in base.LazyValue.
158 : type LazyValue = base.LazyValue
159 :
160 : // Iterator iterates over a DB's key/value pairs in key order.
161 : //
162 : // An iterator must be closed after use, but it is not necessary to read an
163 : // iterator until exhaustion.
164 : //
165 : // An iterator is not goroutine-safe, but it is safe to use multiple iterators
166 : // concurrently, with each in a dedicated goroutine.
167 : //
168 : // It is also safe to use an iterator concurrently with modifying its
169 : // underlying DB, if that DB permits modification. However, the resultant
170 : // key/value pairs are not guaranteed to be a consistent snapshot of that DB
171 : // at a particular point in time.
172 : //
173 : // If an iterator encounters an error during any operation, it is stored by
174 : // the Iterator and surfaced through the Error method. All absolute
175 : // positioning methods (eg, SeekLT, SeekGT, First, Last, etc) reset any
176 : // accumulated error before positioning. All relative positioning methods (eg,
177 : // Next, Prev) return without advancing if the iterator has an accumulated
178 : // error.
179 : type Iterator struct {
180 : // The context is stored here since (a) Iterators are expected to be
181 : // short-lived (since they pin memtables and sstables), (b) plumbing a
182 : // context into every method is very painful, (c) they do not (yet) respect
183 : // context cancellation and are only used for tracing.
184 : ctx context.Context
185 : opts IterOptions
186 : merge Merge
187 : comparer base.Comparer
188 : iter internalIterator
189 : pointIter internalIterator
190 : // Either readState or version is set, but not both.
191 : readState *readState
192 : version *version
193 : // rangeKey holds iteration state specific to iteration over range keys.
194 : // The range key field may be nil if the Iterator has never been configured
195 : // to iterate over range keys. Its non-nilness cannot be used to determine
196 : // if the Iterator is currently iterating over range keys: For that, consult
197 : // the IterOptions using opts.rangeKeys(). If non-nil, its rangeKeyIter
198 : // field is guaranteed to be non-nil too.
199 : rangeKey *iteratorRangeKeyState
200 : // rangeKeyMasking holds state for range-key masking of point keys.
201 : rangeKeyMasking rangeKeyMasking
202 : err error
203 : // When iterValidityState=IterValid, key represents the current key, which
204 : // is backed by keyBuf.
205 : key []byte
206 : keyBuf []byte
207 : value LazyValue
208 : // For use in LazyValue.Clone.
209 : valueBuf []byte
210 : fetcher base.LazyFetcher
211 : // For use in LazyValue.Value.
212 : lazyValueBuf []byte
213 : valueCloser io.Closer
214 : // boundsBuf holds two buffers used to store the lower and upper bounds.
215 : // Whenever the Iterator's bounds change, the new bounds are copied into
216 : // boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
217 : // allocations. opts.LowerBound and opts.UpperBound point into this slice.
218 : boundsBuf [2][]byte
219 : boundsBufIdx int
220 : // iterKey, iterValue reflect the latest position of iter, except when
221 : // SetBounds is called. In that case, these are explicitly set to nil.
222 : iterKey *InternalKey
223 : iterValue LazyValue
224 : alloc *iterAlloc
225 : getIterAlloc *getIterAlloc
226 : prefixOrFullSeekKey []byte
227 : readSampling readSampling
228 : stats IteratorStats
229 : externalReaders [][]*sstable.Reader
230 :
231 : // Following fields used when constructing an iterator stack, eg, in Clone
232 : // and SetOptions or when re-fragmenting a batch's range keys/range dels.
233 : // Non-nil if this Iterator includes a Batch.
234 : batch *Batch
235 : newIters tableNewIters
236 : newIterRangeKey keyspan.TableNewSpanIter
237 : lazyCombinedIter lazyCombinedIter
238 : seqNum uint64
239 : // batchSeqNum is used by Iterators over indexed batches to detect when the
240 : // underlying batch has been mutated. The batch beneath an indexed batch may
241 : // be mutated while the Iterator is open, but new keys are not surfaced
242 : // until the next call to SetOptions.
243 : batchSeqNum uint64
244 : // batch{PointIter,RangeDelIter,RangeKeyIter} are used when the Iterator is
245 : // configured to read through an indexed batch. If a batch is set, these
246 : // iterators will be included within the iterator stack regardless of
247 : // whether the batch currently contains any keys of their kind. These
248 : // pointers are used during a call to SetOptions to refresh the Iterator's
249 : // view of its indexed batch.
250 : batchPointIter batchIter
251 : batchRangeDelIter keyspan.Iter
252 : batchRangeKeyIter keyspan.Iter
253 : // merging is a pointer to this iterator's point merging iterator. It
254 : // appears here because key visibility is handled by the merging iterator.
255 : // During SetOptions on an iterator over an indexed batch, this field is
256 : // used to update the merging iterator's batch snapshot.
257 : merging *mergingIter
258 :
259 : // Keeping the bools here after all the 8 byte aligned fields shrinks the
260 : // sizeof this struct by 24 bytes.
261 :
262 : // INVARIANT:
263 : // iterValidityState==IterAtLimit <=>
264 : // pos==iterPosCurForwardPaused || pos==iterPosCurReversePaused
265 : iterValidityState IterValidityState
266 : // Set to true by SetBounds, SetOptions. Causes the Iterator to appear
267 : // exhausted externally, while preserving the correct iterValidityState for
268 : // the iterator's internal state. Preserving the correct internal validity
269 : // is used for SeekPrefixGE(..., trySeekUsingNext), and SeekGE/SeekLT
270 : // optimizations after "no-op" calls to SetBounds and SetOptions.
271 : requiresReposition bool
272 : // The position of iter. When this is iterPos{Prev,Next} the iter has been
273 : // moved past the current key-value, which can only happen if
274 : // iterValidityState=IterValid, i.e., there is something to return to the
275 : // client for the current position.
276 : pos iterPos
277 : // Relates to the prefixOrFullSeekKey field above.
278 : hasPrefix bool
279 : // Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext),
280 : // and SeekGE/SeekLT optimizations
281 : lastPositioningOp lastPositioningOpKind
282 : // Used for determining when it's safe to perform SeekGE optimizations that
283 : // reuse the iterator state to avoid the cost of a full seek if the iterator
284 : // is already positioned in the correct place. If the iterator's view of its
285 : // indexed batch was just refreshed, some optimizations cannot be applied on
286 : // the first seek after the refresh:
287 : // - SeekGE has a no-op optimization that does not seek on the internal
288 : // iterator at all if the iterator is already in the correct place.
289 : // This optimization cannot be performed if the internal iterator was
290 : // last positioned when the iterator had a different view of an
291 : // underlying batch.
292 : // - Seek[Prefix]GE set flags.TrySeekUsingNext()=true when the seek key is
293 : // greater than the previous operation's seek key, under the expectation
294 : // that the various internal iterators can use their current position to
295 : // avoid a full expensive re-seek. This applies to the batchIter as well.
296 : // However, if the view of the batch was just refreshed, the batchIter's
297 : // position is not useful because it may already be beyond new keys less
298 : // than the seek key. To prevent the use of this optimization in
299 : // batchIter, Seek[Prefix]GE set flags.BatchJustRefreshed()=true if this
300 : // bit is enabled.
301 : batchJustRefreshed bool
302 : // Used for an optimization in external iterators to reduce the number of
303 : // merging levels.
304 : forwardOnly bool
305 : // closePointIterOnce is set to true if this point iter can only be Close()d
306 : // once, _and_ closing i.iter and then i.pointIter would close i.pointIter
307 : // twice. This is necessary to track if the point iter is an internal iterator
308 : // that could release its resources to a pool on Close(), making it harder for
309 : // that iterator to make its own closes idempotent.
310 : //
311 : // TODO(bilal): Update SetOptions to always close out point key iterators when
312 : // they won't be used, so that Close() doesn't need to default to closing
313 : // point iterators twice.
314 : closePointIterOnce bool
315 : // Used in some tests to disable the random disabling of seek optimizations.
316 : forceEnableSeekOpt bool
317 : // Set to true if NextPrefix is not currently permitted. Defaults to false
318 : // in case an iterator never had any bounds.
319 : nextPrefixNotPermittedByUpperBound bool
320 : }
321 :
322 : // cmp is a convenience shorthand for the i.comparer.Compare function.
323 1 : func (i *Iterator) cmp(a, b []byte) int {
324 1 : return i.comparer.Compare(a, b)
325 1 : }
326 :
327 : // split is a convenience shorthand for the i.comparer.Split function.
328 1 : func (i *Iterator) split(a []byte) int {
329 1 : return i.comparer.Split(a)
330 1 : }
331 :
332 : // equal is a convenience shorthand for the i.comparer.Equal function.
333 1 : func (i *Iterator) equal(a, b []byte) bool {
334 1 : return i.comparer.Equal(a, b)
335 1 : }
336 :
337 : // iteratorRangeKeyState holds an iterator's range key iteration state.
338 : type iteratorRangeKeyState struct {
339 : opts *IterOptions
340 : cmp base.Compare
341 : split base.Split
342 : // rangeKeyIter holds the range key iterator stack that iterates over the
343 : // merged spans across the entirety of the LSM.
344 : rangeKeyIter keyspan.FragmentIterator
345 : iiter keyspan.InterleavingIter
346 : // stale is set to true when the range key state recorded here (in start,
347 : // end and keys) may not be in sync with the current range key at the
348 : // interleaving iterator's current position.
349 : //
350 : // When the interelaving iterator passes over a new span, it invokes the
351 : // SpanChanged hook defined on the `rangeKeyMasking` type, which sets stale
352 : // to true if the span is non-nil.
353 : //
354 : // The parent iterator may not be positioned over the interleaving
355 : // iterator's current position (eg, i.iterPos = iterPos{Next,Prev}), so
356 : // {keys,start,end} are only updated to the new range key during a call to
357 : // Iterator.saveRangeKey.
358 : stale bool
359 : // updated is used to signal to the Iterator client whether the state of
360 : // range keys has changed since the previous iterator position through the
361 : // `RangeKeyChanged` method. It's set to true during an Iterator positioning
362 : // operation that changes the state of the current range key. Each Iterator
363 : // positioning operation sets it back to false before executing.
364 : //
365 : // TODO(jackson): The lifecycle of {stale,updated,prevPosHadRangeKey} is
366 : // intricate and confusing. Try to refactor to reduce complexity.
367 : updated bool
368 : // prevPosHadRangeKey records whether the previous Iterator position had a
369 : // range key (HasPointAndRage() = (_, true)). It's updated at the beginning
370 : // of each new Iterator positioning operation. It's required by saveRangeKey to
371 : // to set `updated` appropriately: Without this record of the previous iterator
372 : // state, it's ambiguous whether an iterator only temporarily stepped onto a
373 : // position without a range key.
374 : prevPosHadRangeKey bool
375 : // rangeKeyOnly is set to true if at the current iterator position there is
376 : // no point key, only a range key start boundary.
377 : rangeKeyOnly bool
378 : // hasRangeKey is true when the current iterator position has a covering
379 : // range key (eg, a range key with bounds [<lower>,<upper>) such that
380 : // <lower> ≤ Key() < <upper>).
381 : hasRangeKey bool
382 : // start and end are the [start, end) boundaries of the current range keys.
383 : start []byte
384 : end []byte
385 :
386 : rangeKeyBuffers
387 :
388 : // iterConfig holds fields that are used for the construction of the
389 : // iterator stack, but do not need to be directly accessed during iteration.
390 : // This struct is bundled within the iteratorRangeKeyState struct to reduce
391 : // allocations.
392 : iterConfig rangekey.UserIteratorConfig
393 : }
394 :
395 : type rangeKeyBuffers struct {
396 : // keys is sorted by Suffix ascending.
397 : keys []RangeKeyData
398 : // buf is used to save range-key data before moving the range-key iterator.
399 : // Start and end boundaries, suffixes and values are all copied into buf.
400 : buf bytealloc.A
401 : // internal holds buffers used by the range key internal iterators.
402 : internal rangekey.Buffers
403 : }
404 :
405 1 : func (b *rangeKeyBuffers) PrepareForReuse() {
406 1 : const maxKeysReuse = 100
407 1 : if len(b.keys) > maxKeysReuse {
408 0 : b.keys = nil
409 0 : }
410 : // Avoid caching the key buf if it is overly large. The constant is
411 : // fairly arbitrary.
412 1 : if cap(b.buf) >= maxKeyBufCacheSize {
413 1 : b.buf = nil
414 1 : } else {
415 1 : b.buf = b.buf[:0]
416 1 : }
417 1 : b.internal.PrepareForReuse()
418 : }
419 :
420 1 : func (i *iteratorRangeKeyState) init(cmp base.Compare, split base.Split, opts *IterOptions) {
421 1 : i.cmp = cmp
422 1 : i.split = split
423 1 : i.opts = opts
424 1 : }
425 :
426 : var iterRangeKeyStateAllocPool = sync.Pool{
427 1 : New: func() interface{} {
428 1 : return &iteratorRangeKeyState{}
429 1 : },
430 : }
431 :
432 : // isEphemeralPosition returns true iff the current iterator position is
433 : // ephemeral, and won't be visited during subsequent relative positioning
434 : // operations.
435 : //
436 : // The iterator position resulting from a SeekGE or SeekPrefixGE that lands on a
437 : // straddling range key without a coincident point key is such a position.
438 1 : func (i *Iterator) isEphemeralPosition() bool {
439 1 : return i.opts.rangeKeys() && i.rangeKey != nil && i.rangeKey.rangeKeyOnly &&
440 1 : !i.equal(i.rangeKey.start, i.key)
441 1 : }
442 :
443 : type lastPositioningOpKind int8
444 :
445 : const (
446 : unknownLastPositionOp lastPositioningOpKind = iota
447 : seekPrefixGELastPositioningOp
448 : seekGELastPositioningOp
449 : seekLTLastPositioningOp
450 : // invalidatedLastPositionOp is similar to unknownLastPositionOp and the
451 : // only reason to distinguish this is for the wider set of SeekGE
452 : // optimizations we permit for the external iterator Iterator.forwardOnly
453 : // case. Most code predicates should be doing equality comparisons with one
454 : // of the seek* enum values, so this duplication should not result in code
455 : // of the form:
456 : // if unknownLastPositionOp || invalidLastPositionOp
457 : invalidatedLastPositionOp
458 : )
459 :
460 : // Limited iteration mode. Not for use with prefix iteration.
461 : //
462 : // SeekGE, SeekLT, Prev, Next have WithLimit variants, that pause the iterator
463 : // at the limit in a best-effort manner. The client should behave correctly
464 : // even if the limits are ignored. These limits are not "deep", in that they
465 : // are not passed down to the underlying collection of internalIterators. This
466 : // is because the limits are transient, and apply only until the next
467 : // iteration call. They serve mainly as a way to bound the amount of work when
468 : // two (or more) Iterators are being coordinated at a higher level.
469 : //
470 : // In limited iteration mode:
471 : // - Avoid using Iterator.Valid if the last call was to a *WithLimit() method.
472 : // The return value from the *WithLimit() method provides a more precise
473 : // disposition.
474 : // - The limit is exclusive for forward and inclusive for reverse.
475 : //
476 : //
477 : // Limited iteration mode & range keys
478 : //
479 : // Limited iteration interacts with range-key iteration. When range key
480 : // iteration is enabled, range keys are interleaved at their start boundaries.
481 : // Limited iteration must ensure that if a range key exists within the limit,
482 : // the iterator visits the range key.
483 : //
484 : // During forward limited iteration, this is trivial: An overlapping range key
485 : // must have a start boundary less than the limit, and the range key's start
486 : // boundary will be interleaved and found to be within the limit.
487 : //
488 : // During reverse limited iteration, the tail of the range key may fall within
489 : // the limit. The range key must be surfaced even if the range key's start
490 : // boundary is less than the limit, and if there are no point keys between the
491 : // current iterator position and the limit. To provide this guarantee, reverse
492 : // limited iteration ignores the limit as long as there is a range key
493 : // overlapping the iteration position.
494 :
495 : // IterValidityState captures the state of the Iterator.
496 : type IterValidityState int8
497 :
498 : const (
499 : // IterExhausted represents an Iterator that is exhausted.
500 : IterExhausted IterValidityState = iota
501 : // IterValid represents an Iterator that is valid.
502 : IterValid
503 : // IterAtLimit represents an Iterator that has a non-exhausted
504 : // internalIterator, but has reached a limit without any key for the
505 : // caller.
506 : IterAtLimit
507 : )
508 :
509 : // readSampling stores variables used to sample a read to trigger a read
510 : // compaction
511 : type readSampling struct {
512 : bytesUntilReadSampling uint64
513 : initialSamplePassed bool
514 : pendingCompactions readCompactionQueue
515 : // forceReadSampling is used for testing purposes to force a read sample on every
516 : // call to Iterator.maybeSampleRead()
517 : forceReadSampling bool
518 : }
519 :
520 1 : func (i *Iterator) findNextEntry(limit []byte) {
521 1 : i.iterValidityState = IterExhausted
522 1 : i.pos = iterPosCurForward
523 1 : if i.opts.rangeKeys() && i.rangeKey != nil {
524 1 : i.rangeKey.rangeKeyOnly = false
525 1 : }
526 :
527 : // Close the closer for the current value if one was open.
528 1 : if i.closeValueCloser() != nil {
529 0 : return
530 0 : }
531 :
532 1 : for i.iterKey != nil {
533 1 : key := *i.iterKey
534 1 :
535 1 : if i.hasPrefix {
536 1 : if n := i.split(key.UserKey); !i.equal(i.prefixOrFullSeekKey, key.UserKey[:n]) {
537 1 : return
538 1 : }
539 : }
540 : // Compare with limit every time we start at a different user key.
541 : // Note that given the best-effort contract of limit, we could avoid a
542 : // comparison in the common case by doing this only after
543 : // i.nextUserKey is called for the deletes below. However that makes
544 : // the behavior non-deterministic (since the behavior will vary based
545 : // on what has been compacted), which makes it hard to test with the
546 : // metamorphic test. So we forego that performance optimization.
547 1 : if limit != nil && i.cmp(limit, i.iterKey.UserKey) <= 0 {
548 1 : i.iterValidityState = IterAtLimit
549 1 : i.pos = iterPosCurForwardPaused
550 1 : return
551 1 : }
552 :
553 1 : switch key.Kind() {
554 1 : case InternalKeyKindRangeKeySet:
555 1 : // Save the current key.
556 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
557 1 : i.key = i.keyBuf
558 1 : i.value = LazyValue{}
559 1 : // There may also be a live point key at this userkey that we have
560 1 : // not yet read. We need to find the next entry with this user key
561 1 : // to find it. Save the range key so we don't lose it when we Next
562 1 : // the underlying iterator.
563 1 : i.saveRangeKey()
564 1 : pointKeyExists := i.nextPointCurrentUserKey()
565 1 : if i.err != nil {
566 0 : i.iterValidityState = IterExhausted
567 0 : return
568 0 : }
569 1 : i.rangeKey.rangeKeyOnly = !pointKeyExists
570 1 : i.iterValidityState = IterValid
571 1 : return
572 :
573 1 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
574 1 : // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
575 1 : // only simpler, but is also necessary for correctness due to
576 1 : // InternalKeyKindSSTableInternalObsoleteBit.
577 1 : i.nextUserKey()
578 1 : continue
579 :
580 1 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
581 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
582 1 : i.key = i.keyBuf
583 1 : i.value = i.iterValue
584 1 : i.iterValidityState = IterValid
585 1 : i.saveRangeKey()
586 1 : return
587 :
588 1 : case InternalKeyKindMerge:
589 1 : // Resolving the merge may advance us to the next point key, which
590 1 : // may be covered by a different set of range keys. Save the range
591 1 : // key state so we don't lose it.
592 1 : i.saveRangeKey()
593 1 : if i.mergeForward(key) {
594 1 : i.iterValidityState = IterValid
595 1 : return
596 1 : }
597 :
598 : // The merge didn't yield a valid key, either because the value
599 : // merger indicated it should be deleted, or because an error was
600 : // encountered.
601 0 : i.iterValidityState = IterExhausted
602 0 : if i.err != nil {
603 0 : return
604 0 : }
605 0 : if i.pos != iterPosNext {
606 0 : i.nextUserKey()
607 0 : }
608 0 : if i.closeValueCloser() != nil {
609 0 : return
610 0 : }
611 0 : i.pos = iterPosCurForward
612 :
613 0 : default:
614 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
615 0 : i.iterValidityState = IterExhausted
616 0 : return
617 : }
618 : }
619 : }
620 :
621 1 : func (i *Iterator) nextPointCurrentUserKey() bool {
622 1 : i.pos = iterPosCurForward
623 1 :
624 1 : i.iterKey, i.iterValue = i.iter.Next()
625 1 : i.stats.ForwardStepCount[InternalIterCall]++
626 1 : if i.iterKey == nil || !i.equal(i.key, i.iterKey.UserKey) {
627 1 : i.pos = iterPosNext
628 1 : return false
629 1 : }
630 :
631 1 : key := *i.iterKey
632 1 : switch key.Kind() {
633 0 : case InternalKeyKindRangeKeySet:
634 0 : // RangeKeySets must always be interleaved as the first internal key
635 0 : // for a user key.
636 0 : i.err = base.CorruptionErrorf("pebble: unexpected range key set mid-user key")
637 0 : return false
638 :
639 1 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
640 1 : // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
641 1 : // only simpler, but is also necessary for correctness due to
642 1 : // InternalKeyKindSSTableInternalObsoleteBit.
643 1 : return false
644 :
645 1 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
646 1 : i.value = i.iterValue
647 1 : return true
648 :
649 1 : case InternalKeyKindMerge:
650 1 : return i.mergeForward(key)
651 :
652 0 : default:
653 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
654 0 : return false
655 : }
656 : }
657 :
658 : // mergeForward resolves a MERGE key, advancing the underlying iterator forward
659 : // to merge with subsequent keys with the same userkey. mergeForward returns a
660 : // boolean indicating whether or not the merge yielded a valid key. A merge may
661 : // not yield a valid key if an error occurred, in which case i.err is non-nil,
662 : // or the user's value merger specified the key to be deleted.
663 : //
664 : // mergeForward does not update iterValidityState.
665 1 : func (i *Iterator) mergeForward(key base.InternalKey) (valid bool) {
666 1 : var iterValue []byte
667 1 : iterValue, _, i.err = i.iterValue.Value(nil)
668 1 : if i.err != nil {
669 0 : return false
670 0 : }
671 1 : var valueMerger ValueMerger
672 1 : valueMerger, i.err = i.merge(key.UserKey, iterValue)
673 1 : if i.err != nil {
674 0 : return false
675 0 : }
676 :
677 1 : i.mergeNext(key, valueMerger)
678 1 : if i.err != nil {
679 0 : return false
680 0 : }
681 :
682 1 : var needDelete bool
683 1 : var value []byte
684 1 : value, needDelete, i.valueCloser, i.err = finishValueMerger(
685 1 : valueMerger, true /* includesBase */)
686 1 : i.value = base.MakeInPlaceValue(value)
687 1 : if i.err != nil {
688 0 : return false
689 0 : }
690 1 : if needDelete {
691 0 : _ = i.closeValueCloser()
692 0 : return false
693 0 : }
694 1 : return true
695 : }
696 :
697 1 : func (i *Iterator) closeValueCloser() error {
698 1 : if i.valueCloser != nil {
699 0 : i.err = i.valueCloser.Close()
700 0 : i.valueCloser = nil
701 0 : }
702 1 : return i.err
703 : }
704 :
705 1 : func (i *Iterator) nextUserKey() {
706 1 : if i.iterKey == nil {
707 1 : return
708 1 : }
709 1 : trailer := i.iterKey.Trailer
710 1 : done := i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
711 1 : if i.iterValidityState != IterValid {
712 1 : i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
713 1 : i.key = i.keyBuf
714 1 : }
715 1 : for {
716 1 : i.iterKey, i.iterValue = i.iter.Next()
717 1 : i.stats.ForwardStepCount[InternalIterCall]++
718 1 : // NB: We're guaranteed to be on the next user key if the previous key
719 1 : // had a zero sequence number (`done`), or the new key has a trailer
720 1 : // greater or equal to the previous key's trailer. This is true because
721 1 : // internal keys with the same user key are sorted by Trailer in
722 1 : // strictly monotonically descending order. We expect the trailer
723 1 : // optimization to trigger around 50% of the time with randomly
724 1 : // distributed writes. We expect it to trigger very frequently when
725 1 : // iterating through ingested sstables, which contain keys that all have
726 1 : // the same sequence number.
727 1 : if done || i.iterKey == nil || i.iterKey.Trailer >= trailer {
728 1 : break
729 : }
730 1 : if !i.equal(i.key, i.iterKey.UserKey) {
731 1 : break
732 : }
733 1 : done = i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
734 1 : trailer = i.iterKey.Trailer
735 : }
736 : }
737 :
738 1 : func (i *Iterator) maybeSampleRead() {
739 1 : // This method is only called when a public method of Iterator is
740 1 : // returning, and below we exclude the case were the iterator is paused at
741 1 : // a limit. The effect of these choices is that keys that are deleted, but
742 1 : // are encountered during iteration, are not accounted for in the read
743 1 : // sampling and will not cause read driven compactions, even though we are
744 1 : // incurring cost in iterating over them. And this issue is not limited to
745 1 : // Iterator, which does not see the effect of range deletes, which may be
746 1 : // causing iteration work in mergingIter. It is not clear at this time
747 1 : // whether this is a deficiency worth addressing.
748 1 : if i.iterValidityState != IterValid {
749 1 : return
750 1 : }
751 1 : if i.readState == nil {
752 1 : return
753 1 : }
754 1 : if i.readSampling.forceReadSampling {
755 0 : i.sampleRead()
756 0 : return
757 0 : }
758 1 : samplingPeriod := int32(int64(readBytesPeriod) * i.readState.db.opts.Experimental.ReadSamplingMultiplier)
759 1 : if samplingPeriod <= 0 {
760 0 : return
761 0 : }
762 1 : bytesRead := uint64(len(i.key) + i.value.Len())
763 1 : for i.readSampling.bytesUntilReadSampling < bytesRead {
764 1 : i.readSampling.bytesUntilReadSampling += uint64(fastrand.Uint32n(2 * uint32(samplingPeriod)))
765 1 : // The block below tries to adjust for the case where this is the
766 1 : // first read in a newly-opened iterator. As bytesUntilReadSampling
767 1 : // starts off at zero, we don't want to sample the first read of
768 1 : // every newly-opened iterator, but we do want to sample some of them.
769 1 : if !i.readSampling.initialSamplePassed {
770 1 : i.readSampling.initialSamplePassed = true
771 1 : if fastrand.Uint32n(uint32(i.readSampling.bytesUntilReadSampling)) > uint32(bytesRead) {
772 1 : continue
773 : }
774 : }
775 1 : i.sampleRead()
776 : }
777 1 : i.readSampling.bytesUntilReadSampling -= bytesRead
778 : }
779 :
780 1 : func (i *Iterator) sampleRead() {
781 1 : var topFile *manifest.FileMetadata
782 1 : topLevel, numOverlappingLevels := numLevels, 0
783 1 : mi := i.merging
784 1 : if mi == nil {
785 1 : return
786 1 : }
787 1 : if len(mi.levels) > 1 {
788 1 : mi.ForEachLevelIter(func(li *levelIter) bool {
789 1 : l := manifest.LevelToInt(li.level)
790 1 : if f := li.iterFile; f != nil {
791 1 : var containsKey bool
792 1 : if i.pos == iterPosNext || i.pos == iterPosCurForward ||
793 1 : i.pos == iterPosCurForwardPaused {
794 1 : containsKey = i.cmp(f.SmallestPointKey.UserKey, i.key) <= 0
795 1 : } else if i.pos == iterPosPrev || i.pos == iterPosCurReverse ||
796 1 : i.pos == iterPosCurReversePaused {
797 1 : containsKey = i.cmp(f.LargestPointKey.UserKey, i.key) >= 0
798 1 : }
799 : // Do nothing if the current key is not contained in f's
800 : // bounds. We could seek the LevelIterator at this level
801 : // to find the right file, but the performance impacts of
802 : // doing that are significant enough to negate the benefits
803 : // of read sampling in the first place. See the discussion
804 : // at:
805 : // https://github.com/cockroachdb/pebble/pull/1041#issuecomment-763226492
806 1 : if containsKey {
807 1 : numOverlappingLevels++
808 1 : if numOverlappingLevels >= 2 {
809 1 : // Terminate the loop early if at least 2 overlapping levels are found.
810 1 : return true
811 1 : }
812 1 : topLevel = l
813 1 : topFile = f
814 : }
815 : }
816 1 : return false
817 : })
818 : }
819 1 : if topFile == nil || topLevel >= numLevels {
820 1 : return
821 1 : }
822 1 : if numOverlappingLevels >= 2 {
823 1 : allowedSeeks := topFile.AllowedSeeks.Add(-1)
824 1 : if allowedSeeks == 0 {
825 0 :
826 0 : // Since the compaction queue can handle duplicates, we can keep
827 0 : // adding to the queue even once allowedSeeks hits 0.
828 0 : // In fact, we NEED to keep adding to the queue, because the queue
829 0 : // is small and evicts older and possibly useful compactions.
830 0 : topFile.AllowedSeeks.Add(topFile.InitAllowedSeeks)
831 0 :
832 0 : read := readCompaction{
833 0 : start: topFile.SmallestPointKey.UserKey,
834 0 : end: topFile.LargestPointKey.UserKey,
835 0 : level: topLevel,
836 0 : fileNum: topFile.FileNum,
837 0 : }
838 0 : i.readSampling.pendingCompactions.add(&read, i.cmp)
839 0 : }
840 : }
841 : }
842 :
843 1 : func (i *Iterator) findPrevEntry(limit []byte) {
844 1 : i.iterValidityState = IterExhausted
845 1 : i.pos = iterPosCurReverse
846 1 : if i.opts.rangeKeys() && i.rangeKey != nil {
847 1 : i.rangeKey.rangeKeyOnly = false
848 1 : }
849 :
850 : // Close the closer for the current value if one was open.
851 1 : if i.valueCloser != nil {
852 0 : i.err = i.valueCloser.Close()
853 0 : i.valueCloser = nil
854 0 : if i.err != nil {
855 0 : i.iterValidityState = IterExhausted
856 0 : return
857 0 : }
858 : }
859 :
860 1 : var valueMerger ValueMerger
861 1 : firstLoopIter := true
862 1 : rangeKeyBoundary := false
863 1 : // The code below compares with limit in multiple places. As documented in
864 1 : // findNextEntry, this is being done to make the behavior of limit
865 1 : // deterministic to allow for metamorphic testing. It is not required by
866 1 : // the best-effort contract of limit.
867 1 : for i.iterKey != nil {
868 1 : key := *i.iterKey
869 1 :
870 1 : // NB: We cannot pause if the current key is covered by a range key.
871 1 : // Otherwise, the user might not ever learn of a range key that covers
872 1 : // the key space being iterated over in which there are no point keys.
873 1 : // Since limits are best effort, ignoring the limit in this case is
874 1 : // allowed by the contract of limit.
875 1 : if firstLoopIter && limit != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
876 1 : i.iterValidityState = IterAtLimit
877 1 : i.pos = iterPosCurReversePaused
878 1 : return
879 1 : }
880 1 : firstLoopIter = false
881 1 :
882 1 : if i.iterValidityState == IterValid {
883 1 : if !i.equal(key.UserKey, i.key) {
884 1 : // We've iterated to the previous user key.
885 1 : i.pos = iterPosPrev
886 1 : if valueMerger != nil {
887 1 : var needDelete bool
888 1 : var value []byte
889 1 : value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
890 1 : i.value = base.MakeInPlaceValue(value)
891 1 : if i.err == nil && needDelete {
892 0 : // The point key at this key is deleted. If we also have
893 0 : // a range key boundary at this key, we still want to
894 0 : // return. Otherwise, we need to continue looking for
895 0 : // a live key.
896 0 : i.value = LazyValue{}
897 0 : if rangeKeyBoundary {
898 0 : i.rangeKey.rangeKeyOnly = true
899 0 : } else {
900 0 : i.iterValidityState = IterExhausted
901 0 : if i.closeValueCloser() == nil {
902 0 : continue
903 : }
904 : }
905 : }
906 : }
907 1 : if i.err != nil {
908 0 : i.iterValidityState = IterExhausted
909 0 : }
910 1 : return
911 : }
912 : }
913 :
914 1 : switch key.Kind() {
915 1 : case InternalKeyKindRangeKeySet:
916 1 : // Range key start boundary markers are interleaved with the maximum
917 1 : // sequence number, so if there's a point key also at this key, we
918 1 : // must've already iterated over it.
919 1 : // This is the final entry at this user key, so we may return
920 1 : i.rangeKey.rangeKeyOnly = i.iterValidityState != IterValid
921 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
922 1 : i.key = i.keyBuf
923 1 : i.iterValidityState = IterValid
924 1 : i.saveRangeKey()
925 1 : // In all other cases, previous iteration requires advancing to
926 1 : // iterPosPrev in order to determine if the key is live and
927 1 : // unshadowed by another key at the same user key. In this case,
928 1 : // because range key start boundary markers are always interleaved
929 1 : // at the maximum sequence number, we know that there aren't any
930 1 : // additional keys with the same user key in the backward direction.
931 1 : //
932 1 : // We Prev the underlying iterator once anyways for consistency, so
933 1 : // that we can maintain the invariant during backward iteration that
934 1 : // i.iterPos = iterPosPrev.
935 1 : i.stats.ReverseStepCount[InternalIterCall]++
936 1 : i.iterKey, i.iterValue = i.iter.Prev()
937 1 :
938 1 : // Set rangeKeyBoundary so that on the next iteration, we know to
939 1 : // return the key even if the MERGE point key is deleted.
940 1 : rangeKeyBoundary = true
941 :
942 1 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
943 1 : i.value = LazyValue{}
944 1 : i.iterValidityState = IterExhausted
945 1 : valueMerger = nil
946 1 : i.iterKey, i.iterValue = i.iter.Prev()
947 1 : i.stats.ReverseStepCount[InternalIterCall]++
948 1 : // Compare with the limit. We could optimize by only checking when
949 1 : // we step to the previous user key, but detecting that requires a
950 1 : // comparison too. Note that this position may already passed a
951 1 : // number of versions of this user key, but they are all deleted,
952 1 : // so the fact that a subsequent Prev*() call will not see them is
953 1 : // harmless. Also note that this is the only place in the loop,
954 1 : // other than the firstLoopIter case above, where we could step
955 1 : // to a different user key and start processing it for returning
956 1 : // to the caller.
957 1 : if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
958 1 : i.iterValidityState = IterAtLimit
959 1 : i.pos = iterPosCurReversePaused
960 1 : return
961 1 : }
962 1 : continue
963 :
964 1 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
965 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
966 1 : i.key = i.keyBuf
967 1 : // iterValue is owned by i.iter and could change after the Prev()
968 1 : // call, so use valueBuf instead. Note that valueBuf is only used
969 1 : // in this one instance; everywhere else (eg. in findNextEntry),
970 1 : // we just point i.value to the unsafe i.iter-owned value buffer.
971 1 : i.value, i.valueBuf = i.iterValue.Clone(i.valueBuf[:0], &i.fetcher)
972 1 : i.saveRangeKey()
973 1 : i.iterValidityState = IterValid
974 1 : i.iterKey, i.iterValue = i.iter.Prev()
975 1 : i.stats.ReverseStepCount[InternalIterCall]++
976 1 : valueMerger = nil
977 1 : continue
978 :
979 1 : case InternalKeyKindMerge:
980 1 : if i.iterValidityState == IterExhausted {
981 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
982 1 : i.key = i.keyBuf
983 1 : i.saveRangeKey()
984 1 : var iterValue []byte
985 1 : iterValue, _, i.err = i.iterValue.Value(nil)
986 1 : if i.err != nil {
987 0 : return
988 0 : }
989 1 : valueMerger, i.err = i.merge(i.key, iterValue)
990 1 : if i.err != nil {
991 0 : return
992 0 : }
993 1 : i.iterValidityState = IterValid
994 1 : } else if valueMerger == nil {
995 1 : // Extract value before iterValue since we use value before iterValue
996 1 : // and the underlying iterator is not required to provide backing
997 1 : // memory for both simultaneously.
998 1 : var value []byte
999 1 : var callerOwned bool
1000 1 : value, callerOwned, i.err = i.value.Value(i.lazyValueBuf)
1001 1 : if callerOwned {
1002 0 : i.lazyValueBuf = value[:0]
1003 0 : }
1004 1 : if i.err != nil {
1005 0 : return
1006 0 : }
1007 1 : valueMerger, i.err = i.merge(i.key, value)
1008 1 : var iterValue []byte
1009 1 : iterValue, _, i.err = i.iterValue.Value(nil)
1010 1 : if i.err != nil {
1011 0 : return
1012 0 : }
1013 1 : if i.err == nil {
1014 1 : i.err = valueMerger.MergeNewer(iterValue)
1015 1 : }
1016 1 : if i.err != nil {
1017 0 : i.iterValidityState = IterExhausted
1018 0 : return
1019 0 : }
1020 1 : } else {
1021 1 : var iterValue []byte
1022 1 : iterValue, _, i.err = i.iterValue.Value(nil)
1023 1 : if i.err != nil {
1024 0 : return
1025 0 : }
1026 1 : i.err = valueMerger.MergeNewer(iterValue)
1027 1 : if i.err != nil {
1028 0 : i.iterValidityState = IterExhausted
1029 0 : return
1030 0 : }
1031 : }
1032 1 : i.iterKey, i.iterValue = i.iter.Prev()
1033 1 : i.stats.ReverseStepCount[InternalIterCall]++
1034 1 : continue
1035 :
1036 0 : default:
1037 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
1038 0 : i.iterValidityState = IterExhausted
1039 0 : return
1040 : }
1041 : }
1042 :
1043 : // i.iterKey == nil, so broke out of the preceding loop.
1044 1 : if i.iterValidityState == IterValid {
1045 1 : i.pos = iterPosPrev
1046 1 : if valueMerger != nil {
1047 1 : var needDelete bool
1048 1 : var value []byte
1049 1 : value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
1050 1 : i.value = base.MakeInPlaceValue(value)
1051 1 : if i.err == nil && needDelete {
1052 0 : i.key = nil
1053 0 : i.value = LazyValue{}
1054 0 : i.iterValidityState = IterExhausted
1055 0 : }
1056 : }
1057 1 : if i.err != nil {
1058 0 : i.iterValidityState = IterExhausted
1059 0 : }
1060 : }
1061 : }
1062 :
1063 1 : func (i *Iterator) prevUserKey() {
1064 1 : if i.iterKey == nil {
1065 1 : return
1066 1 : }
1067 1 : if i.iterValidityState != IterValid {
1068 1 : // If we're going to compare against the prev key, we need to save the
1069 1 : // current key.
1070 1 : i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
1071 1 : i.key = i.keyBuf
1072 1 : }
1073 1 : for {
1074 1 : i.iterKey, i.iterValue = i.iter.Prev()
1075 1 : i.stats.ReverseStepCount[InternalIterCall]++
1076 1 : if i.iterKey == nil {
1077 1 : break
1078 : }
1079 1 : if !i.equal(i.key, i.iterKey.UserKey) {
1080 1 : break
1081 : }
1082 : }
1083 : }
1084 :
1085 1 : func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) {
1086 1 : // Save the current key.
1087 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
1088 1 : i.key = i.keyBuf
1089 1 :
1090 1 : // Loop looking for older values for this key and merging them.
1091 1 : for {
1092 1 : i.iterKey, i.iterValue = i.iter.Next()
1093 1 : i.stats.ForwardStepCount[InternalIterCall]++
1094 1 : if i.iterKey == nil {
1095 1 : i.pos = iterPosNext
1096 1 : return
1097 1 : }
1098 1 : key = *i.iterKey
1099 1 : if !i.equal(i.key, key.UserKey) {
1100 1 : // We've advanced to the next key.
1101 1 : i.pos = iterPosNext
1102 1 : return
1103 1 : }
1104 1 : switch key.Kind() {
1105 1 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
1106 1 : // We've hit a deletion tombstone. Return everything up to this
1107 1 : // point.
1108 1 : //
1109 1 : // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
1110 1 : // only simpler, but is also necessary for correctness due to
1111 1 : // InternalKeyKindSSTableInternalObsoleteBit.
1112 1 : return
1113 :
1114 1 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
1115 1 : // We've hit a Set value. Merge with the existing value and return.
1116 1 : var iterValue []byte
1117 1 : iterValue, _, i.err = i.iterValue.Value(nil)
1118 1 : if i.err != nil {
1119 0 : return
1120 0 : }
1121 1 : i.err = valueMerger.MergeOlder(iterValue)
1122 1 : return
1123 :
1124 1 : case InternalKeyKindMerge:
1125 1 : // We've hit another Merge value. Merge with the existing value and
1126 1 : // continue looping.
1127 1 : var iterValue []byte
1128 1 : iterValue, _, i.err = i.iterValue.Value(nil)
1129 1 : if i.err != nil {
1130 0 : return
1131 0 : }
1132 1 : i.err = valueMerger.MergeOlder(iterValue)
1133 1 : if i.err != nil {
1134 0 : return
1135 0 : }
1136 1 : continue
1137 :
1138 0 : case InternalKeyKindRangeKeySet:
1139 0 : // The RANGEKEYSET marker must sort before a MERGE at the same user key.
1140 0 : i.err = base.CorruptionErrorf("pebble: out of order range key marker")
1141 0 : return
1142 :
1143 0 : default:
1144 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
1145 0 : return
1146 : }
1147 : }
1148 : }
1149 :
1150 : // SeekGE moves the iterator to the first key/value pair whose key is greater
1151 : // than or equal to the given key. Returns true if the iterator is pointing at
1152 : // a valid entry and false otherwise.
1153 1 : func (i *Iterator) SeekGE(key []byte) bool {
1154 1 : return i.SeekGEWithLimit(key, nil) == IterValid
1155 1 : }
1156 :
1157 : // SeekGEWithLimit moves the iterator to the first key/value pair whose key is
1158 : // greater than or equal to the given key.
1159 : //
1160 : // If limit is provided, it serves as a best-effort exclusive limit. If the
1161 : // first key greater than or equal to the given search key is also greater than
1162 : // or equal to limit, the Iterator may pause and return IterAtLimit. Because
1163 : // limits are best-effort, SeekGEWithLimit may return a key beyond limit.
1164 : //
1165 : // If the Iterator is configured to iterate over range keys, SeekGEWithLimit
1166 : // guarantees it will surface any range keys with bounds overlapping the
1167 : // keyspace [key, limit).
1168 1 : func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState {
1169 1 : if i.rangeKey != nil {
1170 1 : // NB: Check Valid() before clearing requiresReposition.
1171 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1172 1 : // If we have a range key but did not expose it at the previous iterator
1173 1 : // position (because the iterator was not at a valid position), updated
1174 1 : // must be true. This ensures that after an iterator op sequence like:
1175 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1176 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1177 1 : // - SeekGE(...) → (IterValid, RangeBounds() = [a,b))
1178 1 : // the iterator returns RangeKeyChanged()=true.
1179 1 : //
1180 1 : // The remainder of this function will only update i.rangeKey.updated if
1181 1 : // the iterator moves into a new range key, or out of the current range
1182 1 : // key.
1183 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1184 1 : }
1185 1 : lastPositioningOp := i.lastPositioningOp
1186 1 : hasPrefix := i.hasPrefix
1187 1 : // Set it to unknown, since this operation may not succeed, in which case
1188 1 : // the SeekGE following this should not make any assumption about iterator
1189 1 : // position.
1190 1 : i.lastPositioningOp = unknownLastPositionOp
1191 1 : i.requiresReposition = false
1192 1 : i.err = nil // clear cached iteration error
1193 1 : i.hasPrefix = false
1194 1 : i.stats.ForwardSeekCount[InterfaceCall]++
1195 1 : if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1196 1 : key = lowerBound
1197 1 : } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1198 1 : key = upperBound
1199 1 : }
1200 1 : seekInternalIter := true
1201 1 :
1202 1 : var flags base.SeekGEFlags
1203 1 : if i.batchJustRefreshed {
1204 0 : i.batchJustRefreshed = false
1205 0 : flags = flags.EnableBatchJustRefreshed()
1206 0 : }
1207 1 : if lastPositioningOp == seekGELastPositioningOp {
1208 1 : cmp := i.cmp(i.prefixOrFullSeekKey, key)
1209 1 : // If this seek is to the same or later key, and the iterator is
1210 1 : // already positioned there, this is a noop. This can be helpful for
1211 1 : // sparse key spaces that have many deleted keys, where one can avoid
1212 1 : // the overhead of iterating past them again and again.
1213 1 : if cmp <= 0 {
1214 1 : if !flags.BatchJustRefreshed() &&
1215 1 : (i.iterValidityState == IterExhausted ||
1216 1 : (i.iterValidityState == IterValid && i.cmp(key, i.key) <= 0 &&
1217 1 : (limit == nil || i.cmp(i.key, limit) < 0))) {
1218 1 : // Noop
1219 1 : if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) || i.forceEnableSeekOpt {
1220 1 : i.lastPositioningOp = seekGELastPositioningOp
1221 1 : return i.iterValidityState
1222 1 : }
1223 : }
1224 : // cmp == 0 is not safe to optimize since
1225 : // - i.pos could be at iterPosNext, due to a merge.
1226 : // - Even if i.pos were at iterPosCurForward, we could have a DELETE,
1227 : // SET pair for a key, and the iterator would have moved past DELETE
1228 : // but stayed at iterPosCurForward. A similar situation occurs for a
1229 : // MERGE, SET pair where the MERGE is consumed and the iterator is
1230 : // at the SET.
1231 : // We also leverage the IterAtLimit <=> i.pos invariant defined in the
1232 : // comment on iterValidityState, to exclude any cases where i.pos
1233 : // is iterPosCur{Forward,Reverse}Paused. This avoids the need to
1234 : // special-case those iterator positions and their interactions with
1235 : // TrySeekUsingNext, as the main uses for TrySeekUsingNext in CockroachDB
1236 : // do not use limited Seeks in the first place.
1237 1 : if cmp < 0 && i.iterValidityState != IterAtLimit && limit == nil {
1238 1 : flags = flags.EnableTrySeekUsingNext()
1239 1 : }
1240 1 : if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1241 1 : flags = flags.DisableTrySeekUsingNext()
1242 1 : }
1243 1 : if !flags.BatchJustRefreshed() && i.pos == iterPosCurForwardPaused && i.cmp(key, i.iterKey.UserKey) <= 0 {
1244 1 : // Have some work to do, but don't need to seek, and we can
1245 1 : // start doing findNextEntry from i.iterKey.
1246 1 : seekInternalIter = false
1247 1 : }
1248 : }
1249 : }
1250 : // Check for another TrySeekUsingNext optimization opportunity, currently
1251 : // specifically tailored to external iterators. This case is intended to
1252 : // trigger in instances of Seek-ing with monotonically increasing keys with
1253 : // Nexts interspersed. At the time of writing, this is the case for
1254 : // CockroachDB scans. This optimization is important for external iterators
1255 : // to avoid re-seeking within an already-exhausted sstable. It is not always
1256 : // a performance win more generally, so we restrict it to external iterators
1257 : // that are configured to only use forward positioning operations.
1258 : //
1259 : // TODO(jackson): This optimization should be obsolete once we introduce and
1260 : // use the NextPrefix iterator positioning operation.
1261 1 : if seekInternalIter && i.forwardOnly && lastPositioningOp != invalidatedLastPositionOp &&
1262 1 : i.pos == iterPosCurForward && !hasPrefix && i.iterValidityState == IterValid &&
1263 1 : i.cmp(key, i.iterKey.UserKey) > 0 {
1264 0 : flags = flags.EnableTrySeekUsingNext()
1265 0 : if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1266 0 : flags = flags.DisableTrySeekUsingNext()
1267 0 : }
1268 : }
1269 1 : if seekInternalIter {
1270 1 : i.iterKey, i.iterValue = i.iter.SeekGE(key, flags)
1271 1 : i.stats.ForwardSeekCount[InternalIterCall]++
1272 1 : }
1273 1 : i.findNextEntry(limit)
1274 1 : i.maybeSampleRead()
1275 1 : if i.Error() == nil {
1276 1 : // Prepare state for a future noop optimization.
1277 1 : i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
1278 1 : i.lastPositioningOp = seekGELastPositioningOp
1279 1 : }
1280 1 : return i.iterValidityState
1281 : }
1282 :
1283 : // SeekPrefixGE moves the iterator to the first key/value pair whose key is
1284 : // greater than or equal to the given key and which has the same "prefix" as
1285 : // the given key. The prefix for a key is determined by the user-defined
1286 : // Comparer.Split function. The iterator will not observe keys not matching the
1287 : // "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix
1288 : // iteration mode. The iterator remains in prefix iteration until a subsequent
1289 : // call to another absolute positioning method (SeekGE, SeekLT, First,
1290 : // Last). Reverse iteration (Prev) is not supported when an iterator is in
1291 : // prefix iteration mode. Returns true if the iterator is pointing at a valid
1292 : // entry and false otherwise.
1293 : //
1294 : // The semantics of SeekPrefixGE are slightly unusual and designed for
1295 : // iteration to be able to take advantage of bloom filters that have been
1296 : // created on the "prefix". If you're not using bloom filters, there is no
1297 : // reason to use SeekPrefixGE.
1298 : //
1299 : // An example Split function may separate a timestamp suffix from the prefix of
1300 : // the key.
1301 : //
1302 : // Split(<key>@<timestamp>) -> <key>
1303 : //
1304 : // Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys
1305 : // are "a", and "aa". Note that despite "a" and "aa" sharing a prefix by the
1306 : // usual definition, those prefixes differ by the definition of the Split
1307 : // function. To see how this works, consider the following set of calls on this
1308 : // data set:
1309 : //
1310 : // SeekPrefixGE("a@0") -> "a@1"
1311 : // Next() -> "a@2"
1312 : // Next() -> EOF
1313 : //
1314 : // If you're just looking to iterate over keys with a shared prefix, as
1315 : // defined by the configured comparer, set iterator bounds instead:
1316 : //
1317 : // iter := db.NewIter(&pebble.IterOptions{
1318 : // LowerBound: []byte("prefix"),
1319 : // UpperBound: []byte("prefiy"),
1320 : // })
1321 : // for iter.First(); iter.Valid(); iter.Next() {
1322 : // // Only keys beginning with "prefix" will be visited.
1323 : // }
1324 : //
1325 : // See ExampleIterator_SeekPrefixGE for a working example.
1326 : //
1327 : // When iterating with range keys enabled, all range keys encountered are
1328 : // truncated to the seek key's prefix's bounds. The truncation of the upper
1329 : // bound requires that the database's Comparer is configured with a
1330 : // ImmediateSuccessor method. For example, a SeekPrefixGE("a@9") call with the
1331 : // prefix "a" will truncate range key bounds to [a,ImmediateSuccessor(a)].
1332 1 : func (i *Iterator) SeekPrefixGE(key []byte) bool {
1333 1 : if i.rangeKey != nil {
1334 1 : // NB: Check Valid() before clearing requiresReposition.
1335 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1336 1 : // If we have a range key but did not expose it at the previous iterator
1337 1 : // position (because the iterator was not at a valid position), updated
1338 1 : // must be true. This ensures that after an iterator op sequence like:
1339 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1340 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1341 1 : // - SeekPrefixGE(...) → (IterValid, RangeBounds() = [a,b))
1342 1 : // the iterator returns RangeKeyChanged()=true.
1343 1 : //
1344 1 : // The remainder of this function will only update i.rangeKey.updated if
1345 1 : // the iterator moves into a new range key, or out of the current range
1346 1 : // key.
1347 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1348 1 : }
1349 1 : lastPositioningOp := i.lastPositioningOp
1350 1 : // Set it to unknown, since this operation may not succeed, in which case
1351 1 : // the SeekPrefixGE following this should not make any assumption about
1352 1 : // iterator position.
1353 1 : i.lastPositioningOp = unknownLastPositionOp
1354 1 : i.requiresReposition = false
1355 1 : i.err = nil // clear cached iteration error
1356 1 : i.stats.ForwardSeekCount[InterfaceCall]++
1357 1 : if i.comparer.Split == nil {
1358 0 : panic("pebble: split must be provided for SeekPrefixGE")
1359 : }
1360 1 : if i.comparer.ImmediateSuccessor == nil && i.opts.KeyTypes != IterKeyTypePointsOnly {
1361 0 : panic("pebble: ImmediateSuccessor must be provided for SeekPrefixGE with range keys")
1362 : }
1363 1 : prefixLen := i.split(key)
1364 1 : keyPrefix := key[:prefixLen]
1365 1 : var flags base.SeekGEFlags
1366 1 : if i.batchJustRefreshed {
1367 0 : flags = flags.EnableBatchJustRefreshed()
1368 0 : i.batchJustRefreshed = false
1369 0 : }
1370 1 : if lastPositioningOp == seekPrefixGELastPositioningOp {
1371 1 : if !i.hasPrefix {
1372 0 : panic("lastPositioningOpsIsSeekPrefixGE is true, but hasPrefix is false")
1373 : }
1374 : // The iterator has not been repositioned after the last SeekPrefixGE.
1375 : // See if we are seeking to a larger key, since then we can optimize
1376 : // the seek by using next. Note that we could also optimize if Next
1377 : // has been called, if the iterator is not exhausted and the current
1378 : // position is <= the seek key. We are keeping this limited for now
1379 : // since such optimizations require care for correctness, and to not
1380 : // become de-optimizations (if one usually has to do all the next
1381 : // calls and then the seek). This SeekPrefixGE optimization
1382 : // specifically benefits CockroachDB.
1383 1 : cmp := i.cmp(i.prefixOrFullSeekKey, keyPrefix)
1384 1 : // cmp == 0 is not safe to optimize since
1385 1 : // - i.pos could be at iterPosNext, due to a merge.
1386 1 : // - Even if i.pos were at iterPosCurForward, we could have a DELETE,
1387 1 : // SET pair for a key, and the iterator would have moved past DELETE
1388 1 : // but stayed at iterPosCurForward. A similar situation occurs for a
1389 1 : // MERGE, SET pair where the MERGE is consumed and the iterator is
1390 1 : // at the SET.
1391 1 : // In general some versions of i.prefix could have been consumed by
1392 1 : // the iterator, so we only optimize for cmp < 0.
1393 1 : if cmp < 0 {
1394 1 : flags = flags.EnableTrySeekUsingNext()
1395 1 : }
1396 1 : if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1397 1 : flags = flags.DisableTrySeekUsingNext()
1398 1 : }
1399 : }
1400 : // Make a copy of the prefix so that modifications to the key after
1401 : // SeekPrefixGE returns does not affect the stored prefix.
1402 1 : if cap(i.prefixOrFullSeekKey) < prefixLen {
1403 1 : i.prefixOrFullSeekKey = make([]byte, prefixLen)
1404 1 : } else {
1405 1 : i.prefixOrFullSeekKey = i.prefixOrFullSeekKey[:prefixLen]
1406 1 : }
1407 1 : i.hasPrefix = true
1408 1 : copy(i.prefixOrFullSeekKey, keyPrefix)
1409 1 :
1410 1 : if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1411 1 : if n := i.split(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, lowerBound[:n]) {
1412 1 : i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of lower bound")
1413 1 : i.iterValidityState = IterExhausted
1414 1 : return false
1415 1 : }
1416 1 : key = lowerBound
1417 1 : } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1418 1 : if n := i.split(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, upperBound[:n]) {
1419 1 : i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of upper bound")
1420 1 : i.iterValidityState = IterExhausted
1421 1 : return false
1422 1 : }
1423 1 : key = upperBound
1424 : }
1425 1 : i.iterKey, i.iterValue = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags)
1426 1 : i.stats.ForwardSeekCount[InternalIterCall]++
1427 1 : i.findNextEntry(nil)
1428 1 : i.maybeSampleRead()
1429 1 : if i.Error() == nil {
1430 1 : i.lastPositioningOp = seekPrefixGELastPositioningOp
1431 1 : }
1432 1 : return i.iterValidityState == IterValid
1433 : }
1434 :
1435 : // Deterministic disabling of the seek optimizations. It uses the iterator
1436 : // pointer, since we want diversity in iterator behavior for the same key. Used
1437 : // for tests.
1438 1 : func disableSeekOpt(key []byte, ptr uintptr) bool {
1439 1 : // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
1440 1 : simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
1441 1 : return key != nil && key[0]&byte(1) == 0 && simpleHash == 0
1442 1 : }
1443 :
1444 : // SeekLT moves the iterator to the last key/value pair whose key is less than
1445 : // the given key. Returns true if the iterator is pointing at a valid entry and
1446 : // false otherwise.
1447 1 : func (i *Iterator) SeekLT(key []byte) bool {
1448 1 : return i.SeekLTWithLimit(key, nil) == IterValid
1449 1 : }
1450 :
1451 : // SeekLTWithLimit moves the iterator to the last key/value pair whose key is
1452 : // less than the given key.
1453 : //
1454 : // If limit is provided, it serves as a best-effort inclusive limit. If the last
1455 : // key less than the given search key is also less than limit, the Iterator may
1456 : // pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit
1457 : // may return a key beyond limit.
1458 : //
1459 : // If the Iterator is configured to iterate over range keys, SeekLTWithLimit
1460 : // guarantees it will surface any range keys with bounds overlapping the
1461 : // keyspace up to limit.
1462 1 : func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState {
1463 1 : if i.rangeKey != nil {
1464 1 : // NB: Check Valid() before clearing requiresReposition.
1465 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1466 1 : // If we have a range key but did not expose it at the previous iterator
1467 1 : // position (because the iterator was not at a valid position), updated
1468 1 : // must be true. This ensures that after an iterator op sequence like:
1469 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1470 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1471 1 : // - SeekLTWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1472 1 : // the iterator returns RangeKeyChanged()=true.
1473 1 : //
1474 1 : // The remainder of this function will only update i.rangeKey.updated if
1475 1 : // the iterator moves into a new range key, or out of the current range
1476 1 : // key.
1477 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1478 1 : }
1479 1 : lastPositioningOp := i.lastPositioningOp
1480 1 : // Set it to unknown, since this operation may not succeed, in which case
1481 1 : // the SeekLT following this should not make any assumption about iterator
1482 1 : // position.
1483 1 : i.lastPositioningOp = unknownLastPositionOp
1484 1 : i.batchJustRefreshed = false
1485 1 : i.requiresReposition = false
1486 1 : i.err = nil // clear cached iteration error
1487 1 : i.stats.ReverseSeekCount[InterfaceCall]++
1488 1 : if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1489 1 : key = upperBound
1490 1 : } else if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1491 1 : key = lowerBound
1492 1 : }
1493 1 : i.hasPrefix = false
1494 1 : seekInternalIter := true
1495 1 : // The following noop optimization only applies when i.batch == nil, since
1496 1 : // an iterator over a batch is iterating over mutable data, that may have
1497 1 : // changed since the last seek.
1498 1 : if lastPositioningOp == seekLTLastPositioningOp && i.batch == nil {
1499 1 : cmp := i.cmp(key, i.prefixOrFullSeekKey)
1500 1 : // If this seek is to the same or earlier key, and the iterator is
1501 1 : // already positioned there, this is a noop. This can be helpful for
1502 1 : // sparse key spaces that have many deleted keys, where one can avoid
1503 1 : // the overhead of iterating past them again and again.
1504 1 : if cmp <= 0 {
1505 1 : // NB: when pos != iterPosCurReversePaused, the invariant
1506 1 : // documented earlier implies that iterValidityState !=
1507 1 : // IterAtLimit.
1508 1 : if i.iterValidityState == IterExhausted ||
1509 1 : (i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 &&
1510 1 : (limit == nil || i.cmp(limit, i.key) <= 0)) {
1511 1 : if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1512 1 : i.lastPositioningOp = seekLTLastPositioningOp
1513 1 : return i.iterValidityState
1514 1 : }
1515 : }
1516 1 : if i.pos == iterPosCurReversePaused && i.cmp(i.iterKey.UserKey, key) < 0 {
1517 1 : // Have some work to do, but don't need to seek, and we can
1518 1 : // start doing findPrevEntry from i.iterKey.
1519 1 : seekInternalIter = false
1520 1 : }
1521 : }
1522 : }
1523 1 : if seekInternalIter {
1524 1 : i.iterKey, i.iterValue = i.iter.SeekLT(key, base.SeekLTFlagsNone)
1525 1 : i.stats.ReverseSeekCount[InternalIterCall]++
1526 1 : }
1527 1 : i.findPrevEntry(limit)
1528 1 : i.maybeSampleRead()
1529 1 : if i.Error() == nil && i.batch == nil {
1530 1 : // Prepare state for a future noop optimization.
1531 1 : i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
1532 1 : i.lastPositioningOp = seekLTLastPositioningOp
1533 1 : }
1534 1 : return i.iterValidityState
1535 : }
1536 :
1537 : // First moves the iterator the the first key/value pair. Returns true if the
1538 : // iterator is pointing at a valid entry and false otherwise.
1539 1 : func (i *Iterator) First() bool {
1540 1 : if i.rangeKey != nil {
1541 1 : // NB: Check Valid() before clearing requiresReposition.
1542 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1543 1 : // If we have a range key but did not expose it at the previous iterator
1544 1 : // position (because the iterator was not at a valid position), updated
1545 1 : // must be true. This ensures that after an iterator op sequence like:
1546 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1547 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1548 1 : // - First(...) → (IterValid, RangeBounds() = [a,b))
1549 1 : // the iterator returns RangeKeyChanged()=true.
1550 1 : //
1551 1 : // The remainder of this function will only update i.rangeKey.updated if
1552 1 : // the iterator moves into a new range key, or out of the current range
1553 1 : // key.
1554 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1555 1 : }
1556 1 : i.err = nil // clear cached iteration error
1557 1 : i.hasPrefix = false
1558 1 : i.batchJustRefreshed = false
1559 1 : i.lastPositioningOp = unknownLastPositionOp
1560 1 : i.requiresReposition = false
1561 1 : i.stats.ForwardSeekCount[InterfaceCall]++
1562 1 :
1563 1 : i.iterFirstWithinBounds()
1564 1 : i.findNextEntry(nil)
1565 1 : i.maybeSampleRead()
1566 1 : return i.iterValidityState == IterValid
1567 : }
1568 :
1569 : // Last moves the iterator the the last key/value pair. Returns true if the
1570 : // iterator is pointing at a valid entry and false otherwise.
1571 1 : func (i *Iterator) Last() bool {
1572 1 : if i.rangeKey != nil {
1573 1 : // NB: Check Valid() before clearing requiresReposition.
1574 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1575 1 : // If we have a range key but did not expose it at the previous iterator
1576 1 : // position (because the iterator was not at a valid position), updated
1577 1 : // must be true. This ensures that after an iterator op sequence like:
1578 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1579 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1580 1 : // - Last(...) → (IterValid, RangeBounds() = [a,b))
1581 1 : // the iterator returns RangeKeyChanged()=true.
1582 1 : //
1583 1 : // The remainder of this function will only update i.rangeKey.updated if
1584 1 : // the iterator moves into a new range key, or out of the current range
1585 1 : // key.
1586 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1587 1 : }
1588 1 : i.err = nil // clear cached iteration error
1589 1 : i.hasPrefix = false
1590 1 : i.batchJustRefreshed = false
1591 1 : i.lastPositioningOp = unknownLastPositionOp
1592 1 : i.requiresReposition = false
1593 1 : i.stats.ReverseSeekCount[InterfaceCall]++
1594 1 :
1595 1 : i.iterLastWithinBounds()
1596 1 : i.findPrevEntry(nil)
1597 1 : i.maybeSampleRead()
1598 1 : return i.iterValidityState == IterValid
1599 : }
1600 :
1601 : // Next moves the iterator to the next key/value pair. Returns true if the
1602 : // iterator is pointing at a valid entry and false otherwise.
1603 1 : func (i *Iterator) Next() bool {
1604 1 : return i.nextWithLimit(nil) == IterValid
1605 1 : }
1606 :
1607 : // NextWithLimit moves the iterator to the next key/value pair.
1608 : //
1609 : // If limit is provided, it serves as a best-effort exclusive limit. If the next
1610 : // key is greater than or equal to limit, the Iterator may pause and return
1611 : // IterAtLimit. Because limits are best-effort, NextWithLimit may return a key
1612 : // beyond limit.
1613 : //
1614 : // If the Iterator is configured to iterate over range keys, NextWithLimit
1615 : // guarantees it will surface any range keys with bounds overlapping the
1616 : // keyspace up to limit.
1617 1 : func (i *Iterator) NextWithLimit(limit []byte) IterValidityState {
1618 1 : return i.nextWithLimit(limit)
1619 1 : }
1620 :
1621 : // NextPrefix moves the iterator to the next key/value pair with a key
1622 : // containing a different prefix than the current key. Prefixes are determined
1623 : // by Comparer.Split. Exhausts the iterator if invoked while in prefix-iteration
1624 : // mode.
1625 : //
1626 : // It is not permitted to invoke NextPrefix while at a IterAtLimit position.
1627 : // When called in this condition, NextPrefix has non-deterministic behavior.
1628 : //
1629 : // It is not permitted to invoke NextPrefix when the Iterator has an
1630 : // upper-bound that is a versioned MVCC key (see the comment for
1631 : // Comparer.Split). It returns an error in this case.
1632 1 : func (i *Iterator) NextPrefix() bool {
1633 1 : if i.nextPrefixNotPermittedByUpperBound {
1634 1 : i.lastPositioningOp = unknownLastPositionOp
1635 1 : i.requiresReposition = false
1636 1 : i.err = errors.Errorf("NextPrefix not permitted with upper bound %s",
1637 1 : i.comparer.FormatKey(i.opts.UpperBound))
1638 1 : i.iterValidityState = IterExhausted
1639 1 : return false
1640 1 : }
1641 1 : if i.hasPrefix {
1642 1 : i.iterValidityState = IterExhausted
1643 1 : return false
1644 1 : }
1645 1 : return i.nextPrefix() == IterValid
1646 : }
1647 :
1648 1 : func (i *Iterator) nextPrefix() IterValidityState {
1649 1 : if i.rangeKey != nil {
1650 1 : // NB: Check Valid() before clearing requiresReposition.
1651 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1652 1 : // If we have a range key but did not expose it at the previous iterator
1653 1 : // position (because the iterator was not at a valid position), updated
1654 1 : // must be true. This ensures that after an iterator op sequence like:
1655 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1656 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1657 1 : // - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1658 1 : // the iterator returns RangeKeyChanged()=true.
1659 1 : //
1660 1 : // The remainder of this function will only update i.rangeKey.updated if
1661 1 : // the iterator moves into a new range key, or out of the current range
1662 1 : // key.
1663 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1664 1 : }
1665 :
1666 : // Although NextPrefix documents that behavior at IterAtLimit is undefined,
1667 : // this function handles these cases as a simple prefix-agnostic Next. This
1668 : // is done for deterministic behavior in the metamorphic tests.
1669 : //
1670 : // TODO(jackson): If the metamorphic test operation generator is adjusted to
1671 : // make generation of some operations conditional on the previous
1672 : // operations, then we can remove this behavior and explicitly error.
1673 :
1674 1 : i.lastPositioningOp = unknownLastPositionOp
1675 1 : i.requiresReposition = false
1676 1 : switch i.pos {
1677 1 : case iterPosCurForward:
1678 1 : // Positioned on the current key. Advance to the next prefix.
1679 1 : i.internalNextPrefix(i.split(i.key))
1680 1 : case iterPosCurForwardPaused:
1681 : // Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
1682 : // up above. The iterator is already positioned at the next key.
1683 1 : case iterPosCurReverse:
1684 1 : // Switching directions.
1685 1 : // Unless the iterator was exhausted, reverse iteration needs to
1686 1 : // position the iterator at iterPosPrev.
1687 1 : if i.iterKey != nil {
1688 0 : i.err = errors.New("switching from reverse to forward but iter is not at prev")
1689 0 : i.iterValidityState = IterExhausted
1690 0 : return i.iterValidityState
1691 0 : }
1692 : // The Iterator is exhausted and i.iter is positioned before the first
1693 : // key. Reposition to point to the first internal key.
1694 1 : i.iterFirstWithinBounds()
1695 1 : case iterPosCurReversePaused:
1696 1 : // Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
1697 1 : // up above.
1698 1 : //
1699 1 : // Switching directions; The iterator must not be exhausted since it
1700 1 : // paused.
1701 1 : if i.iterKey == nil {
1702 0 : i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
1703 0 : i.iterValidityState = IterExhausted
1704 0 : return i.iterValidityState
1705 0 : }
1706 1 : i.nextUserKey()
1707 1 : case iterPosPrev:
1708 1 : // The underlying iterator is pointed to the previous key (this can
1709 1 : // only happen when switching iteration directions).
1710 1 : if i.iterKey == nil {
1711 1 : // We're positioned before the first key. Need to reposition to point to
1712 1 : // the first key.
1713 1 : i.iterFirstWithinBounds()
1714 1 : } else {
1715 1 : // Move the internal iterator back onto the user key stored in
1716 1 : // i.key. iterPosPrev guarantees that it's positioned at the last
1717 1 : // key with the user key less than i.key, so we're guaranteed to
1718 1 : // land on the correct key with a single Next.
1719 1 : i.iterKey, i.iterValue = i.iter.Next()
1720 1 : if invariants.Enabled && !i.equal(i.iterKey.UserKey, i.key) {
1721 0 : i.opts.logger.Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q",
1722 0 : i.iterKey.UserKey, i.key)
1723 0 : }
1724 : }
1725 : // The internal iterator is now positioned at i.key. Advance to the next
1726 : // prefix.
1727 1 : i.internalNextPrefix(i.split(i.key))
1728 1 : case iterPosNext:
1729 1 : // Already positioned on the next key. Only call nextPrefixKey if the
1730 1 : // next key shares the same prefix.
1731 1 : if i.iterKey != nil {
1732 1 : currKeyPrefixLen := i.split(i.key)
1733 1 : iterKeyPrefixLen := i.split(i.iterKey.UserKey)
1734 1 : if bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
1735 1 : i.internalNextPrefix(currKeyPrefixLen)
1736 1 : }
1737 : }
1738 : }
1739 :
1740 1 : i.stats.ForwardStepCount[InterfaceCall]++
1741 1 : i.findNextEntry(nil /* limit */)
1742 1 : i.maybeSampleRead()
1743 1 : return i.iterValidityState
1744 : }
1745 :
1746 1 : func (i *Iterator) internalNextPrefix(currKeyPrefixLen int) {
1747 1 : if i.iterKey == nil {
1748 1 : return
1749 1 : }
1750 : // The Next "fast-path" is not really a fast-path when there is more than
1751 : // one version. However, even with TableFormatPebblev3, there is a small
1752 : // slowdown (~10%) for one version if we remove it and only call NextPrefix.
1753 : // When there are two versions, only calling NextPrefix is ~30% faster.
1754 1 : i.stats.ForwardStepCount[InternalIterCall]++
1755 1 : if i.iterKey, i.iterValue = i.iter.Next(); i.iterKey == nil {
1756 1 : return
1757 1 : }
1758 1 : iterKeyPrefixLen := i.split(i.iterKey.UserKey)
1759 1 : if !bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
1760 1 : return
1761 1 : }
1762 1 : i.stats.ForwardStepCount[InternalIterCall]++
1763 1 : i.prefixOrFullSeekKey = i.comparer.ImmediateSuccessor(i.prefixOrFullSeekKey[:0], i.key[:currKeyPrefixLen])
1764 1 : i.iterKey, i.iterValue = i.iter.NextPrefix(i.prefixOrFullSeekKey)
1765 1 : if invariants.Enabled && i.iterKey != nil {
1766 1 : if iterKeyPrefixLen := i.split(i.iterKey.UserKey); i.cmp(i.iterKey.UserKey[:iterKeyPrefixLen], i.prefixOrFullSeekKey) < 0 {
1767 0 : panic(errors.AssertionFailedf("pebble: iter.NextPrefix did not advance beyond the current prefix: now at %q; expected to be geq %q",
1768 0 : i.iterKey, i.prefixOrFullSeekKey))
1769 : }
1770 : }
1771 : }
1772 :
1773 1 : func (i *Iterator) nextWithLimit(limit []byte) IterValidityState {
1774 1 : i.stats.ForwardStepCount[InterfaceCall]++
1775 1 : if i.hasPrefix {
1776 1 : if limit != nil {
1777 1 : i.err = errors.New("cannot use limit with prefix iteration")
1778 1 : i.iterValidityState = IterExhausted
1779 1 : return i.iterValidityState
1780 1 : } else if i.iterValidityState == IterExhausted {
1781 1 : // No-op, already exhasuted. We avoid executing the Next because it
1782 1 : // can break invariants: Specifically, a file that fails the bloom
1783 1 : // filter test may result in its level being removed from the
1784 1 : // merging iterator. The level's removal can cause a lazy combined
1785 1 : // iterator to miss range keys and trigger a switch to combined
1786 1 : // iteration at a larger key, breaking keyspan invariants.
1787 1 : return i.iterValidityState
1788 1 : }
1789 : }
1790 1 : if i.err != nil {
1791 1 : return i.iterValidityState
1792 1 : }
1793 1 : if i.rangeKey != nil {
1794 1 : // NB: Check Valid() before clearing requiresReposition.
1795 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1796 1 : // If we have a range key but did not expose it at the previous iterator
1797 1 : // position (because the iterator was not at a valid position), updated
1798 1 : // must be true. This ensures that after an iterator op sequence like:
1799 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1800 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1801 1 : // - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1802 1 : // the iterator returns RangeKeyChanged()=true.
1803 1 : //
1804 1 : // The remainder of this function will only update i.rangeKey.updated if
1805 1 : // the iterator moves into a new range key, or out of the current range
1806 1 : // key.
1807 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1808 1 : }
1809 1 : i.lastPositioningOp = unknownLastPositionOp
1810 1 : i.requiresReposition = false
1811 1 : switch i.pos {
1812 1 : case iterPosCurForward:
1813 1 : i.nextUserKey()
1814 1 : case iterPosCurForwardPaused:
1815 : // Already at the right place.
1816 1 : case iterPosCurReverse:
1817 1 : // Switching directions.
1818 1 : // Unless the iterator was exhausted, reverse iteration needs to
1819 1 : // position the iterator at iterPosPrev.
1820 1 : if i.iterKey != nil {
1821 0 : i.err = errors.New("switching from reverse to forward but iter is not at prev")
1822 0 : i.iterValidityState = IterExhausted
1823 0 : return i.iterValidityState
1824 0 : }
1825 : // We're positioned before the first key. Need to reposition to point to
1826 : // the first key.
1827 1 : i.iterFirstWithinBounds()
1828 1 : case iterPosCurReversePaused:
1829 1 : // Switching directions.
1830 1 : // The iterator must not be exhausted since it paused.
1831 1 : if i.iterKey == nil {
1832 0 : i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
1833 0 : i.iterValidityState = IterExhausted
1834 0 : return i.iterValidityState
1835 0 : }
1836 1 : i.nextUserKey()
1837 1 : case iterPosPrev:
1838 1 : // The underlying iterator is pointed to the previous key (this can
1839 1 : // only happen when switching iteration directions). We set
1840 1 : // i.iterValidityState to IterExhausted here to force the calls to
1841 1 : // nextUserKey to save the current key i.iter is pointing at in order
1842 1 : // to determine when the next user-key is reached.
1843 1 : i.iterValidityState = IterExhausted
1844 1 : if i.iterKey == nil {
1845 1 : // We're positioned before the first key. Need to reposition to point to
1846 1 : // the first key.
1847 1 : i.iterFirstWithinBounds()
1848 1 : } else {
1849 1 : i.nextUserKey()
1850 1 : }
1851 1 : i.nextUserKey()
1852 1 : case iterPosNext:
1853 : // Already at the right place.
1854 : }
1855 1 : i.findNextEntry(limit)
1856 1 : i.maybeSampleRead()
1857 1 : return i.iterValidityState
1858 : }
1859 :
1860 : // Prev moves the iterator to the previous key/value pair. Returns true if the
1861 : // iterator is pointing at a valid entry and false otherwise.
1862 1 : func (i *Iterator) Prev() bool {
1863 1 : return i.PrevWithLimit(nil) == IterValid
1864 1 : }
1865 :
1866 : // PrevWithLimit moves the iterator to the previous key/value pair.
1867 : //
1868 : // If limit is provided, it serves as a best-effort inclusive limit. If the
1869 : // previous key is less than limit, the Iterator may pause and return
1870 : // IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key
1871 : // beyond limit.
1872 : //
1873 : // If the Iterator is configured to iterate over range keys, PrevWithLimit
1874 : // guarantees it will surface any range keys with bounds overlapping the
1875 : // keyspace up to limit.
1876 1 : func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState {
1877 1 : i.stats.ReverseStepCount[InterfaceCall]++
1878 1 : if i.err != nil {
1879 1 : return i.iterValidityState
1880 1 : }
1881 1 : if i.rangeKey != nil {
1882 1 : // NB: Check Valid() before clearing requiresReposition.
1883 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1884 1 : // If we have a range key but did not expose it at the previous iterator
1885 1 : // position (because the iterator was not at a valid position), updated
1886 1 : // must be true. This ensures that after an iterator op sequence like:
1887 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1888 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1889 1 : // - PrevWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1890 1 : // the iterator returns RangeKeyChanged()=true.
1891 1 : //
1892 1 : // The remainder of this function will only update i.rangeKey.updated if
1893 1 : // the iterator moves into a new range key, or out of the current range
1894 1 : // key.
1895 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1896 1 : }
1897 1 : i.lastPositioningOp = unknownLastPositionOp
1898 1 : i.requiresReposition = false
1899 1 : if i.hasPrefix {
1900 1 : i.err = errReversePrefixIteration
1901 1 : i.iterValidityState = IterExhausted
1902 1 : return i.iterValidityState
1903 1 : }
1904 1 : switch i.pos {
1905 1 : case iterPosCurForward:
1906 : // Switching directions, and will handle this below.
1907 1 : case iterPosCurForwardPaused:
1908 : // Switching directions, and will handle this below.
1909 1 : case iterPosCurReverse:
1910 1 : i.prevUserKey()
1911 1 : case iterPosCurReversePaused:
1912 : // Already at the right place.
1913 1 : case iterPosNext:
1914 : // The underlying iterator is pointed to the next key (this can only happen
1915 : // when switching iteration directions). We will handle this below.
1916 1 : case iterPosPrev:
1917 : // Already at the right place.
1918 : }
1919 1 : if i.pos == iterPosCurForward || i.pos == iterPosNext || i.pos == iterPosCurForwardPaused {
1920 1 : // Switching direction.
1921 1 : stepAgain := i.pos == iterPosNext
1922 1 :
1923 1 : // Synthetic range key markers are a special case. Consider SeekGE(b)
1924 1 : // which finds a range key [a, c). To ensure the user observes the range
1925 1 : // key, the Iterator pauses at Key() = b. The iterator must advance the
1926 1 : // internal iterator to see if there's also a coincident point key at
1927 1 : // 'b', leaving the iterator at iterPosNext if there's not.
1928 1 : //
1929 1 : // This is a problem: Synthetic range key markers are only interleaved
1930 1 : // during the original seek. A subsequent Prev() of i.iter will not move
1931 1 : // back onto the synthetic range key marker. In this case where the
1932 1 : // previous iterator position was a synthetic range key start boundary,
1933 1 : // we must not step a second time.
1934 1 : if i.isEphemeralPosition() {
1935 1 : stepAgain = false
1936 1 : }
1937 :
1938 : // We set i.iterValidityState to IterExhausted here to force the calls
1939 : // to prevUserKey to save the current key i.iter is pointing at in
1940 : // order to determine when the prev user-key is reached.
1941 1 : i.iterValidityState = IterExhausted
1942 1 : if i.iterKey == nil {
1943 1 : // We're positioned after the last key. Need to reposition to point to
1944 1 : // the last key.
1945 1 : i.iterLastWithinBounds()
1946 1 : } else {
1947 1 : i.prevUserKey()
1948 1 : }
1949 1 : if stepAgain {
1950 1 : i.prevUserKey()
1951 1 : }
1952 : }
1953 1 : i.findPrevEntry(limit)
1954 1 : i.maybeSampleRead()
1955 1 : return i.iterValidityState
1956 : }
1957 :
1958 : // iterFirstWithinBounds moves the internal iterator to the first key,
1959 : // respecting bounds.
1960 1 : func (i *Iterator) iterFirstWithinBounds() {
1961 1 : i.stats.ForwardSeekCount[InternalIterCall]++
1962 1 : if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
1963 1 : i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
1964 1 : } else {
1965 1 : i.iterKey, i.iterValue = i.iter.First()
1966 1 : }
1967 : }
1968 :
1969 : // iterLastWithinBounds moves the internal iterator to the last key, respecting
1970 : // bounds.
1971 1 : func (i *Iterator) iterLastWithinBounds() {
1972 1 : i.stats.ReverseSeekCount[InternalIterCall]++
1973 1 : if upperBound := i.opts.GetUpperBound(); upperBound != nil {
1974 1 : i.iterKey, i.iterValue = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone)
1975 1 : } else {
1976 1 : i.iterKey, i.iterValue = i.iter.Last()
1977 1 : }
1978 : }
1979 :
1980 : // RangeKeyData describes a range key's data, set through RangeKeySet. The key
1981 : // boundaries of the range key is provided by Iterator.RangeBounds.
1982 : type RangeKeyData struct {
1983 : Suffix []byte
1984 : Value []byte
1985 : }
1986 :
1987 : // rangeKeyWithinLimit is called during limited reverse iteration when
1988 : // positioned over a key beyond the limit. If there exists a range key that lies
1989 : // within the limit, the iterator must not pause in order to ensure the user has
1990 : // an opportunity to observe the range key within limit.
1991 : //
1992 : // It would be valid to ignore the limit whenever there's a range key covering
1993 : // the key, but that would introduce nondeterminism. To preserve determinism for
1994 : // testing, the iterator ignores the limit only if the covering range key does
1995 : // cover the keyspace within the limit.
1996 : //
1997 : // This awkwardness exists because range keys are interleaved at their inclusive
1998 : // start positions. Note that limit is inclusive.
1999 1 : func (i *Iterator) rangeKeyWithinLimit(limit []byte) bool {
2000 1 : if i.rangeKey == nil || !i.opts.rangeKeys() {
2001 1 : return false
2002 1 : }
2003 1 : s := i.rangeKey.iiter.Span()
2004 1 : // If the range key ends beyond the limit, then the range key does not cover
2005 1 : // any portion of the keyspace within the limit and it is safe to pause.
2006 1 : return s != nil && i.cmp(s.End, limit) > 0
2007 : }
2008 :
2009 : // saveRangeKey saves the current range key to the underlying iterator's current
2010 : // range key state. If the range key has not changed, saveRangeKey is a no-op.
2011 : // If there is a new range key, saveRangeKey copies all of the key, value and
2012 : // suffixes into Iterator-managed buffers.
2013 1 : func (i *Iterator) saveRangeKey() {
2014 1 : if i.rangeKey == nil || i.opts.KeyTypes == IterKeyTypePointsOnly {
2015 1 : return
2016 1 : }
2017 :
2018 1 : s := i.rangeKey.iiter.Span()
2019 1 : if s == nil {
2020 1 : i.rangeKey.hasRangeKey = false
2021 1 : i.rangeKey.updated = i.rangeKey.prevPosHadRangeKey
2022 1 : return
2023 1 : } else if !i.rangeKey.stale {
2024 1 : // The range key `s` is identical to the one currently saved. No-op.
2025 1 : return
2026 1 : }
2027 :
2028 1 : if s.KeysOrder != keyspan.BySuffixAsc {
2029 0 : panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
2030 : }
2031 :
2032 : // Although `i.rangeKey.stale` is true, the span s may still be identical
2033 : // to the currently saved span. This is possible when seeking the iterator,
2034 : // which may land back on the same range key. If we previously had a range
2035 : // key and the new one has an identical start key, then it must be the same
2036 : // range key and we can avoid copying and keep `i.rangeKey.updated=false`.
2037 : //
2038 : // TODO(jackson): These key comparisons could be avoidable during relative
2039 : // positioning operations continuing in the same direction, because these
2040 : // ops will never encounter the previous position's range key while
2041 : // stale=true. However, threading whether the current op is a seek or step
2042 : // maybe isn't worth it. This key comparison is only necessary once when we
2043 : // step onto a new range key, which should be relatively rare.
2044 1 : if i.rangeKey.prevPosHadRangeKey && i.equal(i.rangeKey.start, s.Start) &&
2045 1 : i.equal(i.rangeKey.end, s.End) {
2046 1 : i.rangeKey.updated = false
2047 1 : i.rangeKey.stale = false
2048 1 : i.rangeKey.hasRangeKey = true
2049 1 : return
2050 1 : }
2051 1 : i.stats.RangeKeyStats.Count += len(s.Keys)
2052 1 : i.rangeKey.buf.Reset()
2053 1 : i.rangeKey.hasRangeKey = true
2054 1 : i.rangeKey.updated = true
2055 1 : i.rangeKey.stale = false
2056 1 : i.rangeKey.buf, i.rangeKey.start = i.rangeKey.buf.Copy(s.Start)
2057 1 : i.rangeKey.buf, i.rangeKey.end = i.rangeKey.buf.Copy(s.End)
2058 1 : i.rangeKey.keys = i.rangeKey.keys[:0]
2059 1 : for j := 0; j < len(s.Keys); j++ {
2060 1 : if invariants.Enabled {
2061 1 : if s.Keys[j].Kind() != base.InternalKeyKindRangeKeySet {
2062 0 : panic("pebble: user iteration encountered non-RangeKeySet key kind")
2063 1 : } else if j > 0 && i.cmp(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 {
2064 0 : panic("pebble: user iteration encountered range keys not in suffix order")
2065 : }
2066 : }
2067 1 : var rkd RangeKeyData
2068 1 : i.rangeKey.buf, rkd.Suffix = i.rangeKey.buf.Copy(s.Keys[j].Suffix)
2069 1 : i.rangeKey.buf, rkd.Value = i.rangeKey.buf.Copy(s.Keys[j].Value)
2070 1 : i.rangeKey.keys = append(i.rangeKey.keys, rkd)
2071 : }
2072 : }
2073 :
2074 : // RangeKeyChanged indicates whether the most recent iterator positioning
2075 : // operation resulted in the iterator stepping into or out of a new range key.
2076 : // If true, previously returned range key bounds and data has been invalidated.
2077 : // If false, previously obtained range key bounds, suffix and value slices are
2078 : // still valid and may continue to be read.
2079 : //
2080 : // Invalid iterator positions are considered to not hold range keys, meaning
2081 : // that if an iterator steps from an IterExhausted or IterAtLimit position onto
2082 : // a position with a range key, RangeKeyChanged will yield true.
2083 1 : func (i *Iterator) RangeKeyChanged() bool {
2084 1 : return i.iterValidityState == IterValid && i.rangeKey != nil && i.rangeKey.updated
2085 1 : }
2086 :
2087 : // HasPointAndRange indicates whether there exists a point key, a range key or
2088 : // both at the current iterator position.
2089 1 : func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool) {
2090 1 : if i.iterValidityState != IterValid || i.requiresReposition {
2091 1 : return false, false
2092 1 : }
2093 1 : if i.opts.KeyTypes == IterKeyTypePointsOnly {
2094 1 : return true, false
2095 1 : }
2096 1 : return i.rangeKey == nil || !i.rangeKey.rangeKeyOnly, i.rangeKey != nil && i.rangeKey.hasRangeKey
2097 : }
2098 :
2099 : // RangeBounds returns the start (inclusive) and end (exclusive) bounds of the
2100 : // range key covering the current iterator position. RangeBounds returns nil
2101 : // bounds if there is no range key covering the current iterator position, or
2102 : // the iterator is not configured to surface range keys.
2103 : //
2104 : // If valid, the returned start bound is less than or equal to Key() and the
2105 : // returned end bound is greater than Key().
2106 1 : func (i *Iterator) RangeBounds() (start, end []byte) {
2107 1 : if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
2108 0 : return nil, nil
2109 0 : }
2110 1 : return i.rangeKey.start, i.rangeKey.end
2111 : }
2112 :
2113 : // Key returns the key of the current key/value pair, or nil if done. The
2114 : // caller should not modify the contents of the returned slice, and its
2115 : // contents may change on the next call to Next.
2116 : //
2117 : // If positioned at an iterator position that only holds a range key, Key()
2118 : // always returns the start bound of the range key. Otherwise, it returns the
2119 : // point key's key.
2120 1 : func (i *Iterator) Key() []byte {
2121 1 : return i.key
2122 1 : }
2123 :
2124 : // Value returns the value of the current key/value pair, or nil if done. The
2125 : // caller should not modify the contents of the returned slice, and its
2126 : // contents may change on the next call to Next.
2127 : //
2128 : // Only valid if HasPointAndRange() returns true for hasPoint.
2129 : // Deprecated: use ValueAndErr instead.
2130 1 : func (i *Iterator) Value() []byte {
2131 1 : val, _ := i.ValueAndErr()
2132 1 : return val
2133 1 : }
2134 :
2135 : // ValueAndErr returns the value, and any error encountered in extracting the value.
2136 : // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
2137 : //
2138 : // The caller should not modify the contents of the returned slice, and its
2139 : // contents may change on the next call to Next.
2140 1 : func (i *Iterator) ValueAndErr() ([]byte, error) {
2141 1 : val, callerOwned, err := i.value.Value(i.lazyValueBuf)
2142 1 : if err != nil {
2143 0 : i.err = err
2144 0 : }
2145 1 : if callerOwned {
2146 1 : i.lazyValueBuf = val[:0]
2147 1 : }
2148 1 : return val, err
2149 : }
2150 :
2151 : // LazyValue returns the LazyValue. Only for advanced use cases.
2152 : // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
2153 0 : func (i *Iterator) LazyValue() LazyValue {
2154 0 : return i.value
2155 0 : }
2156 :
2157 : // RangeKeys returns the range key values and their suffixes covering the
2158 : // current iterator position. The range bounds may be retrieved separately
2159 : // through Iterator.RangeBounds().
2160 1 : func (i *Iterator) RangeKeys() []RangeKeyData {
2161 1 : if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
2162 0 : return nil
2163 0 : }
2164 1 : return i.rangeKey.keys
2165 : }
2166 :
2167 : // Valid returns true if the iterator is positioned at a valid key/value pair
2168 : // and false otherwise.
2169 1 : func (i *Iterator) Valid() bool {
2170 1 : return i.iterValidityState == IterValid && !i.requiresReposition
2171 1 : }
2172 :
2173 : // Error returns any accumulated error.
2174 1 : func (i *Iterator) Error() error {
2175 1 : if i.iter != nil {
2176 1 : return firstError(i.err, i.iter.Error())
2177 1 : }
2178 0 : return i.err
2179 : }
2180 :
2181 : const maxKeyBufCacheSize = 4 << 10 // 4 KB
2182 :
2183 : // Close closes the iterator and returns any accumulated error. Exhausting
2184 : // all the key/value pairs in a table is not considered to be an error.
2185 : // It is not valid to call any method, including Close, after the iterator
2186 : // has been closed.
2187 1 : func (i *Iterator) Close() error {
2188 1 : // Close the child iterator before releasing the readState because when the
2189 1 : // readState is released sstables referenced by the readState may be deleted
2190 1 : // which will fail on Windows if the sstables are still open by the child
2191 1 : // iterator.
2192 1 : if i.iter != nil {
2193 1 : i.err = firstError(i.err, i.iter.Close())
2194 1 :
2195 1 : // Closing i.iter did not necessarily close the point and range key
2196 1 : // iterators. Calls to SetOptions may have 'disconnected' either one
2197 1 : // from i.iter if iteration key types were changed. Both point and range
2198 1 : // key iterators are preserved in case the iterator needs to switch key
2199 1 : // types again. We explicitly close both of these iterators here.
2200 1 : //
2201 1 : // NB: If the iterators were still connected to i.iter, they may be
2202 1 : // closed, but calling Close on a closed internal iterator or fragment
2203 1 : // iterator is allowed.
2204 1 : if i.pointIter != nil && !i.closePointIterOnce {
2205 1 : i.err = firstError(i.err, i.pointIter.Close())
2206 1 : }
2207 1 : if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil {
2208 1 : i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
2209 1 : }
2210 : }
2211 1 : err := i.err
2212 1 :
2213 1 : if i.readState != nil {
2214 1 : if i.readSampling.pendingCompactions.size > 0 {
2215 0 : // Copy pending read compactions using db.mu.Lock()
2216 0 : i.readState.db.mu.Lock()
2217 0 : i.readState.db.mu.compact.readCompactions.combine(&i.readSampling.pendingCompactions, i.cmp)
2218 0 : reschedule := i.readState.db.mu.compact.rescheduleReadCompaction
2219 0 : i.readState.db.mu.compact.rescheduleReadCompaction = false
2220 0 : concurrentCompactions := i.readState.db.mu.compact.compactingCount
2221 0 : i.readState.db.mu.Unlock()
2222 0 :
2223 0 : if reschedule && concurrentCompactions == 0 {
2224 0 : // In a read heavy workload, flushes may not happen frequently enough to
2225 0 : // schedule compactions.
2226 0 : i.readState.db.compactionSchedulers.Add(1)
2227 0 : go i.readState.db.maybeScheduleCompactionAsync()
2228 0 : }
2229 : }
2230 :
2231 1 : i.readState.unref()
2232 1 : i.readState = nil
2233 : }
2234 :
2235 1 : if i.version != nil {
2236 1 : i.version.Unref()
2237 1 : }
2238 :
2239 1 : for _, readers := range i.externalReaders {
2240 0 : for _, r := range readers {
2241 0 : err = firstError(err, r.Close())
2242 0 : }
2243 : }
2244 :
2245 : // Close the closer for the current value if one was open.
2246 1 : if i.valueCloser != nil {
2247 0 : err = firstError(err, i.valueCloser.Close())
2248 0 : i.valueCloser = nil
2249 0 : }
2250 :
2251 1 : if i.rangeKey != nil {
2252 1 :
2253 1 : i.rangeKey.rangeKeyBuffers.PrepareForReuse()
2254 1 : *i.rangeKey = iteratorRangeKeyState{
2255 1 : rangeKeyBuffers: i.rangeKey.rangeKeyBuffers,
2256 1 : }
2257 1 : iterRangeKeyStateAllocPool.Put(i.rangeKey)
2258 1 : i.rangeKey = nil
2259 1 : }
2260 1 : if alloc := i.alloc; alloc != nil {
2261 1 : // Avoid caching the key buf if it is overly large. The constant is fairly
2262 1 : // arbitrary.
2263 1 : if cap(i.keyBuf) >= maxKeyBufCacheSize {
2264 0 : alloc.keyBuf = nil
2265 1 : } else {
2266 1 : alloc.keyBuf = i.keyBuf
2267 1 : }
2268 1 : if cap(i.prefixOrFullSeekKey) >= maxKeyBufCacheSize {
2269 0 : alloc.prefixOrFullSeekKey = nil
2270 1 : } else {
2271 1 : alloc.prefixOrFullSeekKey = i.prefixOrFullSeekKey
2272 1 : }
2273 1 : for j := range i.boundsBuf {
2274 1 : if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize {
2275 0 : alloc.boundsBuf[j] = nil
2276 1 : } else {
2277 1 : alloc.boundsBuf[j] = i.boundsBuf[j]
2278 1 : }
2279 : }
2280 1 : *alloc = iterAlloc{
2281 1 : keyBuf: alloc.keyBuf,
2282 1 : boundsBuf: alloc.boundsBuf,
2283 1 : prefixOrFullSeekKey: alloc.prefixOrFullSeekKey,
2284 1 : }
2285 1 : iterAllocPool.Put(alloc)
2286 1 : } else if alloc := i.getIterAlloc; alloc != nil {
2287 1 : if cap(i.keyBuf) >= maxKeyBufCacheSize {
2288 0 : alloc.keyBuf = nil
2289 1 : } else {
2290 1 : alloc.keyBuf = i.keyBuf
2291 1 : }
2292 1 : *alloc = getIterAlloc{
2293 1 : keyBuf: alloc.keyBuf,
2294 1 : }
2295 1 : getIterAllocPool.Put(alloc)
2296 : }
2297 1 : return err
2298 : }
2299 :
2300 : // SetBounds sets the lower and upper bounds for the iterator. Once SetBounds
2301 : // returns, the caller is free to mutate the provided slices.
2302 : //
2303 : // The iterator will always be invalidated and must be repositioned with a call
2304 : // to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
2305 1 : func (i *Iterator) SetBounds(lower, upper []byte) {
2306 1 : // Ensure that the Iterator appears exhausted, regardless of whether we
2307 1 : // actually have to invalidate the internal iterator. Optimizations that
2308 1 : // avoid exhaustion are an internal implementation detail that shouldn't
2309 1 : // leak through the interface. The caller should still call an absolute
2310 1 : // positioning method to reposition the iterator.
2311 1 : i.requiresReposition = true
2312 1 :
2313 1 : if ((i.opts.LowerBound == nil) == (lower == nil)) &&
2314 1 : ((i.opts.UpperBound == nil) == (upper == nil)) &&
2315 1 : i.equal(i.opts.LowerBound, lower) &&
2316 1 : i.equal(i.opts.UpperBound, upper) {
2317 1 : // Unchanged, noop.
2318 1 : return
2319 1 : }
2320 :
2321 : // Copy the user-provided bounds into an Iterator-owned buffer, and set them
2322 : // on i.opts.{Lower,Upper}Bound.
2323 1 : i.processBounds(lower, upper)
2324 1 :
2325 1 : i.iter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2326 1 : // If the iterator has an open point iterator that's not currently being
2327 1 : // used, propagate the new bounds to it.
2328 1 : if i.pointIter != nil && !i.opts.pointKeys() {
2329 1 : i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2330 1 : }
2331 : // If the iterator has a range key iterator, propagate bounds to it. The
2332 : // top-level SetBounds on the interleaving iterator (i.iter) won't propagate
2333 : // bounds to the range key iterator stack, because the FragmentIterator
2334 : // interface doesn't define a SetBounds method. We need to directly inform
2335 : // the iterConfig stack.
2336 1 : if i.rangeKey != nil {
2337 1 : i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2338 1 : }
2339 :
2340 : // Even though this is not a positioning operation, the alteration of the
2341 : // bounds means we cannot optimize Seeks by using Next.
2342 1 : i.invalidate()
2343 : }
2344 :
2345 : // Initialization and changing of the bounds must call processBounds.
2346 : // processBounds saves the bounds and computes derived state from those
2347 : // bounds.
2348 1 : func (i *Iterator) processBounds(lower, upper []byte) {
2349 1 : // Copy the user-provided bounds into an Iterator-owned buffer. We can't
2350 1 : // overwrite the current bounds, because some internal iterators compare old
2351 1 : // and new bounds for optimizations.
2352 1 :
2353 1 : buf := i.boundsBuf[i.boundsBufIdx][:0]
2354 1 : if lower != nil {
2355 1 : buf = append(buf, lower...)
2356 1 : i.opts.LowerBound = buf
2357 1 : } else {
2358 1 : i.opts.LowerBound = nil
2359 1 : }
2360 1 : i.nextPrefixNotPermittedByUpperBound = false
2361 1 : if upper != nil {
2362 1 : buf = append(buf, upper...)
2363 1 : i.opts.UpperBound = buf[len(buf)-len(upper):]
2364 1 : if i.comparer.Split != nil {
2365 1 : if i.comparer.Split(i.opts.UpperBound) != len(i.opts.UpperBound) {
2366 1 : // Setting an upper bound that is a versioned MVCC key. This means
2367 1 : // that a key can have some MVCC versions before the upper bound and
2368 1 : // some after. This causes significant complications for NextPrefix,
2369 1 : // so we bar the user of NextPrefix.
2370 1 : i.nextPrefixNotPermittedByUpperBound = true
2371 1 : }
2372 : }
2373 1 : } else {
2374 1 : i.opts.UpperBound = nil
2375 1 : }
2376 1 : i.boundsBuf[i.boundsBufIdx] = buf
2377 1 : i.boundsBufIdx = 1 - i.boundsBufIdx
2378 : }
2379 :
2380 : // SetOptions sets new iterator options for the iterator. Note that the lower
2381 : // and upper bounds applied here will supersede any bounds set by previous calls
2382 : // to SetBounds.
2383 : //
2384 : // Note that the slices provided in this SetOptions must not be changed by the
2385 : // caller until the iterator is closed, or a subsequent SetBounds or SetOptions
2386 : // has returned. This is because comparisons between the existing and new bounds
2387 : // are sometimes used to optimize seeking. See the extended commentary on
2388 : // SetBounds.
2389 : //
2390 : // If the iterator was created over an indexed mutable batch, the iterator's
2391 : // view of the mutable batch is refreshed.
2392 : //
2393 : // The iterator will always be invalidated and must be repositioned with a call
2394 : // to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
2395 : //
2396 : // If only lower and upper bounds need to be modified, prefer SetBounds.
2397 1 : func (i *Iterator) SetOptions(o *IterOptions) {
2398 1 : if i.externalReaders != nil {
2399 0 : if err := validateExternalIterOpts(o); err != nil {
2400 0 : panic(err)
2401 : }
2402 : }
2403 :
2404 : // Ensure that the Iterator appears exhausted, regardless of whether we
2405 : // actually have to invalidate the internal iterator. Optimizations that
2406 : // avoid exhaustion are an internal implementation detail that shouldn't
2407 : // leak through the interface. The caller should still call an absolute
2408 : // positioning method to reposition the iterator.
2409 1 : i.requiresReposition = true
2410 1 :
2411 1 : // Check if global state requires we close all internal iterators.
2412 1 : //
2413 1 : // If the Iterator is in an error state, invalidate the existing iterators
2414 1 : // so that we reconstruct an iterator state from scratch.
2415 1 : //
2416 1 : // If OnlyReadGuaranteedDurable changed, the iterator stacks are incorrect,
2417 1 : // improperly including or excluding memtables. Invalidate them so that
2418 1 : // finishInitializingIter will reconstruct them.
2419 1 : //
2420 1 : // If either the original options or the new options specify a table filter,
2421 1 : // we need to reconstruct the iterator stacks. If they both supply a table
2422 1 : // filter, we can't be certain that it's the same filter since we have no
2423 1 : // mechanism to compare the filter closures.
2424 1 : closeBoth := i.err != nil ||
2425 1 : o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable ||
2426 1 : o.TableFilter != nil || i.opts.TableFilter != nil
2427 1 :
2428 1 : // If either options specify block property filters for an iterator stack,
2429 1 : // reconstruct it.
2430 1 : if i.pointIter != nil && (closeBoth || len(o.PointKeyFilters) > 0 || len(i.opts.PointKeyFilters) > 0 ||
2431 1 : o.RangeKeyMasking.Filter != nil || i.opts.RangeKeyMasking.Filter != nil) {
2432 1 : i.err = firstError(i.err, i.pointIter.Close())
2433 1 : i.pointIter = nil
2434 1 : }
2435 1 : if i.rangeKey != nil {
2436 1 : if closeBoth || len(o.RangeKeyFilters) > 0 || len(i.opts.RangeKeyFilters) > 0 {
2437 1 : i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
2438 1 : i.rangeKey = nil
2439 1 : } else {
2440 1 : // If there's still a range key iterator stack, invalidate the
2441 1 : // iterator. This ensures RangeKeyChanged() returns true if a
2442 1 : // subsequent positioning operation discovers a range key. It also
2443 1 : // prevents seek no-op optimizations.
2444 1 : i.invalidate()
2445 1 : }
2446 : }
2447 :
2448 : // If the iterator is backed by a batch that's been mutated, refresh its
2449 : // existing point and range-key iterators, and invalidate the iterator to
2450 : // prevent seek-using-next optimizations. If we don't yet have a point-key
2451 : // iterator or range-key iterator but we require one, it'll be created in
2452 : // the slow path that reconstructs the iterator in finishInitializingIter.
2453 1 : if i.batch != nil {
2454 1 : nextBatchSeqNum := (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch)
2455 1 : if nextBatchSeqNum != i.batchSeqNum {
2456 1 : i.batchSeqNum = nextBatchSeqNum
2457 1 : if i.merging != nil {
2458 1 : i.merging.batchSnapshot = nextBatchSeqNum
2459 1 : }
2460 : // Prevent a no-op seek optimization on the next seek. We won't be
2461 : // able to reuse the top-level Iterator state, because it may be
2462 : // incorrect after the inclusion of new batch mutations.
2463 1 : i.batchJustRefreshed = true
2464 1 : if i.pointIter != nil && i.batch.countRangeDels > 0 {
2465 1 : if i.batchRangeDelIter.Count() == 0 {
2466 0 : // When we constructed this iterator, there were no
2467 0 : // rangedels in the batch. Iterator construction will
2468 0 : // have excluded the batch rangedel iterator from the
2469 0 : // point iterator stack. We need to reconstruct the
2470 0 : // point iterator to add i.batchRangeDelIter into the
2471 0 : // iterator stack.
2472 0 : i.err = firstError(i.err, i.pointIter.Close())
2473 0 : i.pointIter = nil
2474 1 : } else {
2475 1 : // There are range deletions in the batch and we already
2476 1 : // have a batch rangedel iterator. We can update the
2477 1 : // batch rangedel iterator in place.
2478 1 : //
2479 1 : // NB: There may or may not be new range deletions. We
2480 1 : // can't tell based on i.batchRangeDelIter.Count(),
2481 1 : // which is the count of fragmented range deletions, NOT
2482 1 : // the number of range deletions written to the batch
2483 1 : // [i.batch.countRangeDels].
2484 1 : i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, nextBatchSeqNum)
2485 1 : }
2486 : }
2487 1 : if i.rangeKey != nil && i.batch.countRangeKeys > 0 {
2488 1 : if i.batchRangeKeyIter.Count() == 0 {
2489 1 : // When we constructed this iterator, there were no range
2490 1 : // keys in the batch. Iterator construction will have
2491 1 : // excluded the batch rangekey iterator from the range key
2492 1 : // iterator stack. We need to reconstruct the range key
2493 1 : // iterator to add i.batchRangeKeyIter into the iterator
2494 1 : // stack.
2495 1 : i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
2496 1 : i.rangeKey = nil
2497 1 : } else {
2498 0 : // There are range keys in the batch and we already
2499 0 : // have a batch rangekey iterator. We can update the batch
2500 0 : // rangekey iterator in place.
2501 0 : //
2502 0 : // NB: There may or may not be new range keys. We can't
2503 0 : // tell based on i.batchRangeKeyIter.Count(), which is the
2504 0 : // count of fragmented range keys, NOT the number of
2505 0 : // range keys written to the batch [i.batch.countRangeKeys].
2506 0 : i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, nextBatchSeqNum)
2507 0 : i.invalidate()
2508 0 : }
2509 : }
2510 : }
2511 : }
2512 :
2513 : // Reset combinedIterState.initialized in case the iterator key types
2514 : // changed. If there's already a range key iterator stack, the combined
2515 : // iterator is already initialized. Additionally, if the iterator is not
2516 : // configured to include range keys, mark it as initialized to signal that
2517 : // lower level iterators should not trigger a switch to combined iteration.
2518 1 : i.lazyCombinedIter.combinedIterState = combinedIterState{
2519 1 : initialized: i.rangeKey != nil || !i.opts.rangeKeys(),
2520 1 : }
2521 1 :
2522 1 : boundsEqual := ((i.opts.LowerBound == nil) == (o.LowerBound == nil)) &&
2523 1 : ((i.opts.UpperBound == nil) == (o.UpperBound == nil)) &&
2524 1 : i.equal(i.opts.LowerBound, o.LowerBound) &&
2525 1 : i.equal(i.opts.UpperBound, o.UpperBound)
2526 1 :
2527 1 : if boundsEqual && o.KeyTypes == i.opts.KeyTypes &&
2528 1 : (i.pointIter != nil || !i.opts.pointKeys()) &&
2529 1 : (i.rangeKey != nil || !i.opts.rangeKeys() || i.opts.KeyTypes == IterKeyTypePointsAndRanges) &&
2530 1 : i.equal(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) &&
2531 1 : o.UseL6Filters == i.opts.UseL6Filters {
2532 1 : // The options are identical, so we can likely use the fast path. In
2533 1 : // addition to all the above constraints, we cannot use the fast path if
2534 1 : // configured to perform lazy combined iteration but an indexed batch
2535 1 : // used by the iterator now contains range keys. Lazy combined iteration
2536 1 : // is not compatible with batch range keys because we always need to
2537 1 : // merge the batch's range keys into iteration.
2538 1 : if i.rangeKey != nil || !i.opts.rangeKeys() || i.batch == nil || i.batch.countRangeKeys == 0 {
2539 1 : // Fast path. This preserves the Seek-using-Next optimizations as
2540 1 : // long as the iterator wasn't already invalidated up above.
2541 1 : return
2542 1 : }
2543 : }
2544 : // Slow path.
2545 :
2546 : // The options changed. Save the new ones to i.opts.
2547 1 : if boundsEqual {
2548 1 : // Copying the options into i.opts will overwrite LowerBound and
2549 1 : // UpperBound fields with the user-provided slices. We need to hold on
2550 1 : // to the Pebble-owned slices, so save them and re-set them after the
2551 1 : // copy.
2552 1 : lower, upper := i.opts.LowerBound, i.opts.UpperBound
2553 1 : i.opts = *o
2554 1 : i.opts.LowerBound, i.opts.UpperBound = lower, upper
2555 1 : } else {
2556 1 : i.opts = *o
2557 1 : i.processBounds(o.LowerBound, o.UpperBound)
2558 1 : // Propagate the changed bounds to the existing point iterator.
2559 1 : // NB: We propagate i.opts.{Lower,Upper}Bound, not o.{Lower,Upper}Bound
2560 1 : // because i.opts now point to buffers owned by Pebble.
2561 1 : if i.pointIter != nil {
2562 1 : i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2563 1 : }
2564 1 : if i.rangeKey != nil {
2565 1 : i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2566 1 : }
2567 : }
2568 :
2569 : // Even though this is not a positioning operation, the invalidation of the
2570 : // iterator stack means we cannot optimize Seeks by using Next.
2571 1 : i.invalidate()
2572 1 :
2573 1 : // Iterators created through NewExternalIter have a different iterator
2574 1 : // initialization process.
2575 1 : if i.externalReaders != nil {
2576 0 : finishInitializingExternal(i.ctx, i)
2577 0 : return
2578 0 : }
2579 1 : finishInitializingIter(i.ctx, i.alloc)
2580 : }
2581 :
2582 1 : func (i *Iterator) invalidate() {
2583 1 : i.lastPositioningOp = invalidatedLastPositionOp
2584 1 : i.hasPrefix = false
2585 1 : i.iterKey = nil
2586 1 : i.iterValue = LazyValue{}
2587 1 : i.err = nil
2588 1 : // This switch statement isn't necessary for correctness since callers
2589 1 : // should call a repositioning method. We could have arbitrarily set i.pos
2590 1 : // to one of the values. But it results in more intuitive behavior in
2591 1 : // tests, which do not always reposition.
2592 1 : switch i.pos {
2593 1 : case iterPosCurForward, iterPosNext, iterPosCurForwardPaused:
2594 1 : i.pos = iterPosCurForward
2595 1 : case iterPosCurReverse, iterPosPrev, iterPosCurReversePaused:
2596 1 : i.pos = iterPosCurReverse
2597 : }
2598 1 : i.iterValidityState = IterExhausted
2599 1 : if i.rangeKey != nil {
2600 1 : i.rangeKey.iiter.Invalidate()
2601 1 : i.rangeKey.prevPosHadRangeKey = false
2602 1 : }
2603 : }
2604 :
2605 : // Metrics returns per-iterator metrics.
2606 0 : func (i *Iterator) Metrics() IteratorMetrics {
2607 0 : m := IteratorMetrics{
2608 0 : ReadAmp: 1,
2609 0 : }
2610 0 : if mi, ok := i.iter.(*mergingIter); ok {
2611 0 : m.ReadAmp = len(mi.levels)
2612 0 : }
2613 0 : return m
2614 : }
2615 :
2616 : // ResetStats resets the stats to 0.
2617 0 : func (i *Iterator) ResetStats() {
2618 0 : i.stats = IteratorStats{}
2619 0 : }
2620 :
2621 : // Stats returns the current stats.
2622 0 : func (i *Iterator) Stats() IteratorStats {
2623 0 : return i.stats
2624 0 : }
2625 :
2626 : // CloneOptions configures an iterator constructed through Iterator.Clone.
2627 : type CloneOptions struct {
2628 : // IterOptions, if non-nil, define the iterator options to configure a
2629 : // cloned iterator. If nil, the clone adopts the same IterOptions as the
2630 : // iterator being cloned.
2631 : IterOptions *IterOptions
2632 : // RefreshBatchView may be set to true when cloning an Iterator over an
2633 : // indexed batch. When false, the clone adopts the same (possibly stale)
2634 : // view of the indexed batch as the cloned Iterator. When true, the clone is
2635 : // constructed with a refreshed view of the batch, observing all of the
2636 : // batch's mutations at the time of the Clone. If the cloned iterator was
2637 : // not constructed to read over an indexed batch, RefreshVatchView has no
2638 : // effect.
2639 : RefreshBatchView bool
2640 : }
2641 :
2642 : // Clone creates a new Iterator over the same underlying data, i.e., over the
2643 : // same {batch, memtables, sstables}). The resulting iterator is not positioned.
2644 : // It starts with the same IterOptions, unless opts.IterOptions is set.
2645 : //
2646 : // When called on an Iterator over an indexed batch, the clone's visibility of
2647 : // the indexed batch is determined by CloneOptions.RefreshBatchView. If false,
2648 : // the clone inherits the iterator's current (possibly stale) view of the batch,
2649 : // and callers may call SetOptions to subsequently refresh the clone's view to
2650 : // include all batch mutations. If true, the clone is constructed with a
2651 : // complete view of the indexed batch's mutations at the time of the Clone.
2652 : //
2653 : // Callers can use Clone if they need multiple iterators that need to see
2654 : // exactly the same underlying state of the DB. This should not be used to
2655 : // extend the lifetime of the data backing the original Iterator since that
2656 : // will cause an increase in memory and disk usage (use NewSnapshot for that
2657 : // purpose).
2658 1 : func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error) {
2659 1 : return i.CloneWithContext(context.Background(), opts)
2660 1 : }
2661 :
2662 : // CloneWithContext is like Clone, and additionally accepts a context for
2663 : // tracing.
2664 1 : func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*Iterator, error) {
2665 1 : if opts.IterOptions == nil {
2666 1 : opts.IterOptions = &i.opts
2667 1 : }
2668 :
2669 1 : readState := i.readState
2670 1 : vers := i.version
2671 1 : if readState == nil && vers == nil {
2672 0 : return nil, errors.Errorf("cannot Clone a closed Iterator")
2673 0 : }
2674 : // i is already holding a ref, so there is no race with unref here.
2675 : //
2676 : // TODO(bilal): If the underlying iterator was created on a snapshot, we could
2677 : // grab a reference to the current readState instead of reffing the original
2678 : // readState. This allows us to release references to some zombie sstables
2679 : // and memtables.
2680 1 : if readState != nil {
2681 1 : readState.ref()
2682 1 : }
2683 1 : if vers != nil {
2684 1 : vers.Ref()
2685 1 : }
2686 : // Bundle various structures under a single umbrella in order to allocate
2687 : // them together.
2688 1 : buf := iterAllocPool.Get().(*iterAlloc)
2689 1 : dbi := &buf.dbi
2690 1 : *dbi = Iterator{
2691 1 : ctx: ctx,
2692 1 : opts: *opts.IterOptions,
2693 1 : alloc: buf,
2694 1 : merge: i.merge,
2695 1 : comparer: i.comparer,
2696 1 : readState: readState,
2697 1 : version: vers,
2698 1 : keyBuf: buf.keyBuf,
2699 1 : prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
2700 1 : boundsBuf: buf.boundsBuf,
2701 1 : batch: i.batch,
2702 1 : batchSeqNum: i.batchSeqNum,
2703 1 : newIters: i.newIters,
2704 1 : newIterRangeKey: i.newIterRangeKey,
2705 1 : seqNum: i.seqNum,
2706 1 : }
2707 1 : dbi.processBounds(dbi.opts.LowerBound, dbi.opts.UpperBound)
2708 1 :
2709 1 : // If the caller requested the clone have a current view of the indexed
2710 1 : // batch, set the clone's batch sequence number appropriately.
2711 1 : if i.batch != nil && opts.RefreshBatchView {
2712 0 : dbi.batchSeqNum = (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch)
2713 0 : }
2714 :
2715 1 : return finishInitializingIter(ctx, buf), nil
2716 : }
2717 :
2718 : // Merge adds all of the argument's statistics to the receiver. It may be used
2719 : // to accumulate stats across multiple iterators.
2720 0 : func (stats *IteratorStats) Merge(o IteratorStats) {
2721 0 : for i := InterfaceCall; i < NumStatsKind; i++ {
2722 0 : stats.ForwardSeekCount[i] += o.ForwardSeekCount[i]
2723 0 : stats.ReverseSeekCount[i] += o.ReverseSeekCount[i]
2724 0 : stats.ForwardStepCount[i] += o.ForwardStepCount[i]
2725 0 : stats.ReverseStepCount[i] += o.ReverseStepCount[i]
2726 0 : }
2727 0 : stats.InternalStats.Merge(o.InternalStats)
2728 0 : stats.RangeKeyStats.Merge(o.RangeKeyStats)
2729 : }
2730 :
2731 0 : func (stats *IteratorStats) String() string {
2732 0 : return redact.StringWithoutMarkers(stats)
2733 0 : }
2734 :
2735 : // SafeFormat implements the redact.SafeFormatter interface.
2736 0 : func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) {
2737 0 : for i := range stats.ForwardStepCount {
2738 0 : switch IteratorStatsKind(i) {
2739 0 : case InterfaceCall:
2740 0 : s.SafeString("(interface (dir, seek, step): ")
2741 0 : case InternalIterCall:
2742 0 : s.SafeString(", (internal (dir, seek, step): ")
2743 : }
2744 0 : s.Printf("(fwd, %d, %d), (rev, %d, %d))",
2745 0 : redact.Safe(stats.ForwardSeekCount[i]), redact.Safe(stats.ForwardStepCount[i]),
2746 0 : redact.Safe(stats.ReverseSeekCount[i]), redact.Safe(stats.ReverseStepCount[i]))
2747 : }
2748 0 : if stats.InternalStats != (InternalIteratorStats{}) {
2749 0 : s.SafeString(",\n(internal-stats: ")
2750 0 : s.Printf("(block-bytes: (total %s, cached %s, read-time %s)), "+
2751 0 : "(points: (count %s, key-bytes %s, value-bytes %s, tombstoned %s))",
2752 0 : humanize.Bytes.Uint64(stats.InternalStats.BlockBytes),
2753 0 : humanize.Bytes.Uint64(stats.InternalStats.BlockBytesInCache),
2754 0 : humanize.FormattedString(stats.InternalStats.BlockReadDuration.String()),
2755 0 : humanize.Count.Uint64(stats.InternalStats.PointCount),
2756 0 : humanize.Bytes.Uint64(stats.InternalStats.KeyBytes),
2757 0 : humanize.Bytes.Uint64(stats.InternalStats.ValueBytes),
2758 0 : humanize.Count.Uint64(stats.InternalStats.PointsCoveredByRangeTombstones),
2759 0 : )
2760 0 : if stats.InternalStats.SeparatedPointValue.Count != 0 {
2761 0 : s.Printf(", (separated: (count %s, bytes %s, fetched %s)))",
2762 0 : humanize.Count.Uint64(stats.InternalStats.SeparatedPointValue.Count),
2763 0 : humanize.Bytes.Uint64(stats.InternalStats.SeparatedPointValue.ValueBytes),
2764 0 : humanize.Bytes.Uint64(stats.InternalStats.SeparatedPointValue.ValueBytesFetched))
2765 0 : } else {
2766 0 : s.Printf(")")
2767 0 : }
2768 : }
2769 0 : if stats.RangeKeyStats != (RangeKeyIteratorStats{}) {
2770 0 : s.SafeString(",\n(range-key-stats: ")
2771 0 : s.Printf("(count %d), (contained points: (count %d, skipped %d)))",
2772 0 : stats.RangeKeyStats.Count,
2773 0 : stats.RangeKeyStats.ContainedPoints,
2774 0 : stats.RangeKeyStats.SkippedPoints)
2775 0 : }
2776 : }
|