Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "io"
11 : "math/rand/v2"
12 : "sync"
13 : "unsafe"
14 :
15 : "github.com/cockroachdb/errors"
16 : "github.com/cockroachdb/pebble/internal/base"
17 : "github.com/cockroachdb/pebble/internal/bytealloc"
18 : "github.com/cockroachdb/pebble/internal/humanize"
19 : "github.com/cockroachdb/pebble/internal/inflight"
20 : "github.com/cockroachdb/pebble/internal/invariants"
21 : "github.com/cockroachdb/pebble/internal/keyspan"
22 : "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl"
23 : "github.com/cockroachdb/pebble/internal/manifest"
24 : "github.com/cockroachdb/pebble/internal/rangekeystack"
25 : "github.com/cockroachdb/pebble/internal/treeprinter"
26 : "github.com/cockroachdb/pebble/sstable/blob"
27 : "github.com/cockroachdb/redact"
28 : )
29 :
30 : // iterPos describes the state of the internal iterator, in terms of whether it
31 : // is at the position returned to the user (cur), one ahead of the position
32 : // returned (next for forward iteration and prev for reverse iteration). The cur
33 : // position is split into two states, for forward and reverse iteration, since
34 : // we need to differentiate for switching directions.
35 : //
36 : // There is subtlety in what is considered the current position of the Iterator.
37 : // The internal iterator exposes a sequence of internal keys. There is not
38 : // always a single internalIterator position corresponding to the position
39 : // returned to the user. Consider the example:
40 : //
41 : // a.MERGE.9 a.MERGE.8 a.MERGE.7 a.SET.6 b.DELETE.9 b.DELETE.5 b.SET.4
42 : // \ /
43 : // \ Iterator.Key() = 'a' /
44 : //
45 : // The Iterator exposes one valid position at user key 'a' and the two exhausted
46 : // positions at the beginning and end of iteration. The underlying
47 : // internalIterator contains 7 valid positions and 2 exhausted positions.
48 : //
49 : // Iterator positioning methods must set iterPos to iterPosCur{Foward,Backward}
50 : // iff the user key at the current internalIterator position equals the
51 : // Iterator.Key returned to the user. This guarantees that a call to nextUserKey
52 : // or prevUserKey will advance to the next or previous iterator position.
53 : // iterPosCur{Forward,Backward} does not make any guarantee about the internal
54 : // iterator position among internal keys with matching user keys, and it will
55 : // vary subtly depending on the particular key kinds encountered. In the above
56 : // example, the iterator returning 'a' may set iterPosCurForward if the internal
57 : // iterator is positioned at any of a.MERGE.9, a.MERGE.8, a.MERGE.7 or a.SET.6.
58 : //
59 : // When setting iterPos to iterPosNext or iterPosPrev, the internal iterator
60 : // must be advanced to the first internalIterator position at a user key greater
61 : // (iterPosNext) or less (iterPosPrev) than the key returned to the user. An
62 : // internalIterator position that's !Valid() must also be considered greater or
63 : // less—depending on the direction of iteration—than the last valid Iterator
64 : // position.
65 : type iterPos int8
66 :
67 : const (
68 : iterPosCurForward iterPos = 0
69 : iterPosNext iterPos = 1
70 : iterPosPrev iterPos = -1
71 : iterPosCurReverse iterPos = -2
72 :
73 : // For limited iteration. When the iterator is at iterPosCurForwardPaused
74 : // - Next*() call should behave as if the internal iterator is already
75 : // at next (akin to iterPosNext).
76 : // - Prev*() call should behave as if the internal iterator is at the
77 : // current key (akin to iterPosCurForward).
78 : //
79 : // Similar semantics apply to CurReversePaused.
80 : iterPosCurForwardPaused iterPos = 2
81 : iterPosCurReversePaused iterPos = -3
82 : )
83 :
84 : // Approximate gap in bytes between samples of data read during iteration.
85 : // This is multiplied with a default ReadSamplingMultiplier of 1 << 4 to yield
86 : // 1 << 20 (1MB). The 1MB factor comes from:
87 : // https://github.com/cockroachdb/pebble/issues/29#issuecomment-494477985
88 : const readBytesPeriod uint64 = 1 << 16
89 :
90 : var errReversePrefixIteration = errors.New("pebble: unsupported reverse prefix iteration")
91 :
92 : // IteratorMetrics holds per-iterator metrics. These do not change over the
93 : // lifetime of the iterator.
94 : type IteratorMetrics struct {
95 : // The read amplification experienced by this iterator. This is the sum of
96 : // the memtables, the L0 sublevels and the non-empty Ln levels. Higher read
97 : // amplification generally results in slower reads, though allowing higher
98 : // read amplification can also result in faster writes.
99 : ReadAmp int
100 : }
101 :
102 : // IteratorStatsKind describes the two kind of iterator stats.
103 : type IteratorStatsKind int8
104 :
105 : const (
106 : // InterfaceCall represents calls to Iterator.
107 : InterfaceCall IteratorStatsKind = iota
108 : // InternalIterCall represents calls by Iterator to its internalIterator.
109 : InternalIterCall
110 : // NumStatsKind is the number of kinds, and is used for array sizing.
111 : NumStatsKind
112 : )
113 :
114 : // IteratorStats contains iteration stats.
115 : type IteratorStats struct {
116 : // ForwardSeekCount includes SeekGE, SeekPrefixGE, First.
117 : ForwardSeekCount [NumStatsKind]int
118 : // ReverseSeek includes SeekLT, Last.
119 : ReverseSeekCount [NumStatsKind]int
120 : // ForwardStepCount includes Next.
121 : ForwardStepCount [NumStatsKind]int
122 : // ReverseStepCount includes Prev.
123 : ReverseStepCount [NumStatsKind]int
124 : InternalStats InternalIteratorStats
125 : RangeKeyStats RangeKeyIteratorStats
126 : }
127 :
128 : var _ redact.SafeFormatter = &IteratorStats{}
129 :
130 : // InternalIteratorStats contains miscellaneous stats produced by internal
131 : // iterators.
132 : type InternalIteratorStats = base.InternalIteratorStats
133 :
134 : // BlockReadStats contains iterators stats about block reads.
135 : type BlockReadStats = base.BlockReadStats
136 :
137 : // RangeKeyIteratorStats contains miscellaneous stats about range keys
138 : // encountered by the iterator.
139 : type RangeKeyIteratorStats struct {
140 : // Count records the number of range keys encountered during
141 : // iteration. Range keys may be counted multiple times if the iterator
142 : // leaves a range key's bounds and then returns.
143 : Count int
144 : // ContainedPoints records the number of point keys encountered within the
145 : // bounds of a range key. Note that this includes point keys with suffixes
146 : // that sort both above and below the covering range key's suffix.
147 : ContainedPoints int
148 : // SkippedPoints records the count of the subset of ContainedPoints point
149 : // keys that were skipped during iteration due to range-key masking. It does
150 : // not include point keys that were never loaded because a
151 : // RangeKeyMasking.Filter excluded the entire containing block.
152 : SkippedPoints int
153 : }
154 :
155 : // Merge adds all of the argument's statistics to the receiver. It may be used
156 : // to accumulate stats across multiple iterators.
157 1 : func (s *RangeKeyIteratorStats) Merge(o RangeKeyIteratorStats) {
158 1 : s.Count += o.Count
159 1 : s.ContainedPoints += o.ContainedPoints
160 1 : s.SkippedPoints += o.SkippedPoints
161 1 : }
162 :
163 0 : func (s *RangeKeyIteratorStats) String() string {
164 0 : return redact.StringWithoutMarkers(s)
165 0 : }
166 :
167 : // SafeFormat implements the redact.SafeFormatter interface.
168 1 : func (s *RangeKeyIteratorStats) SafeFormat(p redact.SafePrinter, verb rune) {
169 1 : p.Printf("range keys: %s, contained points: %s (%s skipped)",
170 1 : humanize.Count.Uint64(uint64(s.Count)),
171 1 : humanize.Count.Uint64(uint64(s.ContainedPoints)),
172 1 : humanize.Count.Uint64(uint64(s.SkippedPoints)))
173 1 : }
174 :
175 : // LazyValue is a lazy value. See the long comment in base.LazyValue.
176 : type LazyValue = base.LazyValue
177 :
178 : // Iterator iterates over a DB's key/value pairs in key order.
179 : //
180 : // An iterator must be closed after use, but it is not necessary to read an
181 : // iterator until exhaustion.
182 : //
183 : // An iterator is not goroutine-safe, but it is safe to use multiple iterators
184 : // concurrently, with each in a dedicated goroutine.
185 : //
186 : // It is also safe to use an iterator concurrently with modifying its
187 : // underlying DB, if that DB permits modification. However, the resultant
188 : // key/value pairs are not guaranteed to be a consistent snapshot of that DB
189 : // at a particular point in time.
190 : //
191 : // If an iterator encounters an error during any operation, it is stored by
192 : // the Iterator and surfaced through the Error method. All absolute
193 : // positioning methods (eg, SeekLT, SeekGT, First, Last, etc) reset any
194 : // accumulated error before positioning. All relative positioning methods (eg,
195 : // Next, Prev) return without advancing if the iterator has an accumulated
196 : // error.
197 : type Iterator struct {
198 : // blobValueFetcher is the ValueFetcher to use when retrieving values stored
199 : // externally in blob files.
200 : blobValueFetcher blob.ValueFetcher
201 :
202 : // All fields below this field are cleared during Iterator.Close before
203 : // returning the Iterator to the pool. Any fields above this field must also
204 : // be cleared, but may be cleared as a part of the body of Iterator.Close
205 : // (eg, if these types have their own Close method that zeroes their
206 : // fields).
207 : clearForReuseBoundary struct{}
208 :
209 : // The context is stored here since (a) Iterators are expected to be
210 : // short-lived (since they pin memtables and sstables), (b) plumbing a
211 : // context into every method is very painful, (c) they do not (yet) respect
212 : // context cancellation and are only used for tracing.
213 : ctx context.Context
214 : // TODO(radu): group fields that are always set from `db` fields (tracker,
215 : // merge, comparer, fc) into a single struct we can point to.
216 : tracker *inflight.Tracker
217 : trackerHandle inflight.Handle
218 : opts IterOptions
219 : merge Merge
220 : comparer *base.Comparer
221 : iter internalIterator
222 : pointIter topLevelIterator
223 : // Either readState or version is set, but not both.
224 : readState *readState
225 : version *manifest.Version
226 : // rangeKey holds iteration state specific to iteration over range keys.
227 : // The range key field may be nil if the Iterator has never been configured
228 : // to iterate over range keys. Its non-nilness cannot be used to determine
229 : // if the Iterator is currently iterating over range keys: For that, consult
230 : // the IterOptions using opts.rangeKeys(). If non-nil, its rangeKeyIter
231 : // field is guaranteed to be non-nil too.
232 : rangeKey *iteratorRangeKeyState
233 : // rangeKeyMasking holds state for range-key masking of point keys.
234 : rangeKeyMasking rangeKeyMasking
235 : err error
236 : // When iterValidityState=IterValid, key represents the current key, which
237 : // is backed by keyBuf.
238 : key []byte
239 : keyBuf []byte
240 : value base.InternalValue
241 : // For use in LazyValue.Clone.
242 : valueBuf []byte
243 : fetcher base.LazyFetcher
244 : // For use in LazyValue.Value.
245 : lazyValueBuf []byte
246 : valueCloser io.Closer
247 : // boundsBuf holds two buffers used to store the lower and upper bounds.
248 : // Whenever the Iterator's bounds change, the new bounds are copied into
249 : // boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
250 : // allocations. opts.LowerBound and opts.UpperBound point into this slice.
251 : boundsBuf [2][]byte
252 : boundsBufIdx int
253 : // iterKV reflects the latest position of iter, except when SetBounds is
254 : // called. In that case, it is explicitly set to nil.
255 : iterKV *base.InternalKV
256 : alloc *iterAlloc
257 : prefixOrFullSeekKey []byte
258 : readSampling readSampling
259 : stats IteratorStats
260 : externalIter *externalIterState
261 : // Following fields used when constructing an iterator stack, eg, in Clone
262 : // and SetOptions or when re-fragmenting a batch's range keys/range dels.
263 : fc *fileCacheHandle
264 : newIters tableNewIters
265 : newIterRangeKey keyspanimpl.TableNewSpanIter
266 : lazyCombinedIter lazyCombinedIter
267 : seqNum base.SeqNum
268 : // batch is non-nil if this Iterator includes an indexed batch. Batch
269 : // contains all the state pertaining to iterating over the indexed batch.
270 : // The iteratorBatchState struct is bundled within the iterAlloc struct to
271 : // reduce allocations.
272 : batch *iteratorBatchState
273 : // merging is a pointer to this iterator's point merging iterator. It
274 : // appears here because key visibility is handled by the merging iterator.
275 : // During SetOptions on an iterator over an indexed batch, this field is
276 : // used to update the merging iterator's batch snapshot.
277 : merging *mergingIter
278 :
279 : // Keeping the bools here after all the 8 byte aligned fields shrinks the
280 : // sizeof this struct by 24 bytes.
281 :
282 : // INVARIANT:
283 : // iterValidityState==IterAtLimit <=>
284 : // pos==iterPosCurForwardPaused || pos==iterPosCurReversePaused
285 : iterValidityState IterValidityState
286 : // Set to true by SetBounds, SetOptions. Causes the Iterator to appear
287 : // exhausted externally, while preserving the correct iterValidityState for
288 : // the iterator's internal state. Preserving the correct internal validity
289 : // is used for SeekPrefixGE(..., trySeekUsingNext), and SeekGE/SeekLT
290 : // optimizations after "no-op" calls to SetBounds and SetOptions.
291 : requiresReposition bool
292 : // The position of iter. When this is iterPos{Prev,Next} the iter has been
293 : // moved past the current key-value, which can only happen if
294 : // iterValidityState=IterValid, i.e., there is something to return to the
295 : // client for the current position.
296 : pos iterPos
297 : // Relates to the prefixOrFullSeekKey field above.
298 : hasPrefix bool
299 : // Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext),
300 : // and SeekGE/SeekLT optimizations
301 : lastPositioningOp lastPositioningOpKind
302 : // Used for determining when it's safe to perform SeekGE optimizations that
303 : // reuse the iterator state to avoid the cost of a full seek if the iterator
304 : // is already positioned in the correct place. If the iterator's view of its
305 : // indexed batch was just refreshed, some optimizations cannot be applied on
306 : // the first seek after the refresh:
307 : // - SeekGE has a no-op optimization that does not seek on the internal
308 : // iterator at all if the iterator is already in the correct place.
309 : // This optimization cannot be performed if the internal iterator was
310 : // last positioned when the iterator had a different view of an
311 : // underlying batch.
312 : // - Seek[Prefix]GE set flags.TrySeekUsingNext()=true when the seek key is
313 : // greater than the previous operation's seek key, under the expectation
314 : // that the various internal iterators can use their current position to
315 : // avoid a full expensive re-seek. This applies to the batchIter as well.
316 : // However, if the view of the batch was just refreshed, the batchIter's
317 : // position is not useful because it may already be beyond new keys less
318 : // than the seek key. To prevent the use of this optimization in
319 : // batchIter, Seek[Prefix]GE set flags.BatchJustRefreshed()=true if this
320 : // bit is enabled.
321 : batchJustRefreshed bool
322 : // batchOnlyIter is set to true for Batch.NewBatchOnlyIter.
323 : batchOnlyIter bool
324 : // Used in some tests to disable the random disabling of seek optimizations.
325 : forceEnableSeekOpt bool
326 : // Set to true if NextPrefix is not currently permitted. Defaults to false
327 : // in case an iterator never had any bounds.
328 : nextPrefixNotPermittedByUpperBound bool
329 : }
330 :
331 : const clearOff = unsafe.Offsetof(Iterator{}.clearForReuseBoundary)
332 : const clearLen = unsafe.Sizeof(Iterator{}) - clearOff
333 :
334 2 : func (i *Iterator) clearForReuse() {
335 2 : *(*[clearLen]byte)(unsafe.Add(unsafe.Pointer(i), clearOff)) = [clearLen]byte{}
336 2 : }
337 :
338 : // cmp is a convenience shorthand for the i.comparer.Compare function.
339 2 : func (i *Iterator) cmp(a, b []byte) int {
340 2 : return i.comparer.Compare(a, b)
341 2 : }
342 :
343 : // equal is a convenience shorthand for the i.comparer.Equal function.
344 2 : func (i *Iterator) equal(a, b []byte) bool {
345 2 : return i.comparer.Equal(a, b)
346 2 : }
347 :
348 : // iteratorBatchState holds state pertaining to iterating over an indexed batch.
349 : // When an iterator is configured to read through an indexed batch, the iterator
350 : // maintains a pointer to this struct. This struct is embedded within the
351 : // iterAlloc struct to reduce allocations.
352 : type iteratorBatchState struct {
353 : batch *Batch
354 : // batchSeqNum is used by Iterators over indexed batches to detect when the
355 : // underlying batch has been mutated. The batch beneath an indexed batch may
356 : // be mutated while the Iterator is open, but new keys are not surfaced
357 : // until the next call to SetOptions.
358 : batchSeqNum base.SeqNum
359 : // batch{PointIter,RangeDelIter,RangeKeyIter} are used when the Iterator is
360 : // configured to read through an indexed batch. If a batch is set, these
361 : // iterators will be included within the iterator stack regardless of
362 : // whether the batch currently contains any keys of their kind. These
363 : // pointers are used during a call to SetOptions to refresh the Iterator's
364 : // view of its indexed batch.
365 : pointIter batchIter
366 : rangeDelIter keyspan.Iter
367 : rangeKeyIter keyspan.Iter
368 : }
369 :
370 : // iteratorRangeKeyState holds an iterator's range key iteration state.
371 : type iteratorRangeKeyState struct {
372 : // rangeKeyIter holds the range key iterator stack that iterates over the
373 : // merged spans across the entirety of the LSM.
374 : rangeKeyIter keyspan.FragmentIterator
375 : iiter keyspan.InterleavingIter
376 : // stale is set to true when the range key state recorded here (in start,
377 : // end and keys) may not be in sync with the current range key at the
378 : // interleaving iterator's current position.
379 : //
380 : // When the interelaving iterator passes over a new span, it invokes the
381 : // SpanChanged hook defined on the `rangeKeyMasking` type, which sets stale
382 : // to true if the span is non-nil.
383 : //
384 : // The parent iterator may not be positioned over the interleaving
385 : // iterator's current position (eg, i.iterPos = iterPos{Next,Prev}), so
386 : // {keys,start,end} are only updated to the new range key during a call to
387 : // Iterator.saveRangeKey.
388 : stale bool
389 : // updated is used to signal to the Iterator client whether the state of
390 : // range keys has changed since the previous iterator position through the
391 : // `RangeKeyChanged` method. It's set to true during an Iterator positioning
392 : // operation that changes the state of the current range key. Each Iterator
393 : // positioning operation sets it back to false before executing.
394 : //
395 : // TODO(jackson): The lifecycle of {stale,updated,prevPosHadRangeKey} is
396 : // intricate and confusing. Try to refactor to reduce complexity.
397 : updated bool
398 : // prevPosHadRangeKey records whether the previous Iterator position had a
399 : // range key (HasPointAndRage() = (_, true)). It's updated at the beginning
400 : // of each new Iterator positioning operation. It's required by saveRangeKey to
401 : // to set `updated` appropriately: Without this record of the previous iterator
402 : // state, it's ambiguous whether an iterator only temporarily stepped onto a
403 : // position without a range key.
404 : prevPosHadRangeKey bool
405 : // rangeKeyOnly is set to true if at the current iterator position there is
406 : // no point key, only a range key start boundary.
407 : rangeKeyOnly bool
408 : // hasRangeKey is true when the current iterator position has a covering
409 : // range key (eg, a range key with bounds [<lower>,<upper>) such that
410 : // <lower> ≤ Key() < <upper>).
411 : hasRangeKey bool
412 : // start and end are the [start, end) boundaries of the current range keys.
413 : start []byte
414 : end []byte
415 :
416 : rangeKeyBuffers
417 :
418 : // iterConfig holds fields that are used for the construction of the
419 : // iterator stack, but do not need to be directly accessed during iteration.
420 : // This struct is bundled within the iteratorRangeKeyState struct to reduce
421 : // allocations.
422 : iterConfig rangekeystack.UserIteratorConfig
423 : }
424 :
425 : type rangeKeyBuffers struct {
426 : // keys is sorted by Suffix ascending.
427 : keys []RangeKeyData
428 : // buf is used to save range-key data before moving the range-key iterator.
429 : // Start and end boundaries, suffixes and values are all copied into buf.
430 : buf bytealloc.A
431 : // internal holds buffers used by the range key internal iterators.
432 : internal rangekeystack.Buffers
433 : }
434 :
435 2 : func (b *rangeKeyBuffers) PrepareForReuse() {
436 2 : const maxKeysReuse = 100
437 2 : if len(b.keys) > maxKeysReuse {
438 0 : b.keys = nil
439 0 : }
440 : // Avoid caching the key buf if it is overly large. The constant is
441 : // fairly arbitrary.
442 2 : if cap(b.buf) >= maxKeyBufCacheSize {
443 1 : b.buf = nil
444 2 : } else {
445 2 : b.buf = b.buf[:0]
446 2 : }
447 2 : b.internal.PrepareForReuse()
448 : }
449 :
450 : var iterRangeKeyStateAllocPool = sync.Pool{
451 2 : New: func() interface{} {
452 2 : return &iteratorRangeKeyState{}
453 2 : },
454 : }
455 :
456 : // isEphemeralPosition returns true iff the current iterator position is
457 : // ephemeral, and won't be visited during subsequent relative positioning
458 : // operations.
459 : //
460 : // The iterator position resulting from a SeekGE or SeekPrefixGE that lands on a
461 : // straddling range key without a coincident point key is such a position.
462 2 : func (i *Iterator) isEphemeralPosition() bool {
463 2 : return i.opts.rangeKeys() && i.rangeKey != nil && i.rangeKey.rangeKeyOnly &&
464 2 : !i.equal(i.rangeKey.start, i.key)
465 2 : }
466 :
467 : type lastPositioningOpKind int8
468 :
469 : const (
470 : unknownLastPositionOp lastPositioningOpKind = iota
471 : seekPrefixGELastPositioningOp
472 : seekGELastPositioningOp
473 : seekLTLastPositioningOp
474 : // internalNextOp is a special internal iterator positioning operation used
475 : // by CanDeterministicallySingleDelete. It exists for enforcing requirements
476 : // around calling CanDeterministicallySingleDelete at most once per external
477 : // iterator position.
478 : internalNextOp
479 : )
480 :
481 : // Limited iteration mode. Not for use with prefix iteration.
482 : //
483 : // SeekGE, SeekLT, Prev, Next have WithLimit variants, that pause the iterator
484 : // at the limit in a best-effort manner. The client should behave correctly
485 : // even if the limits are ignored. These limits are not "deep", in that they
486 : // are not passed down to the underlying collection of internalIterators. This
487 : // is because the limits are transient, and apply only until the next
488 : // iteration call. They serve mainly as a way to bound the amount of work when
489 : // two (or more) Iterators are being coordinated at a higher level.
490 : //
491 : // In limited iteration mode:
492 : // - Avoid using Iterator.Valid if the last call was to a *WithLimit() method.
493 : // The return value from the *WithLimit() method provides a more precise
494 : // disposition.
495 : // - The limit is exclusive for forward and inclusive for reverse.
496 : //
497 : //
498 : // Limited iteration mode & range keys
499 : //
500 : // Limited iteration interacts with range-key iteration. When range key
501 : // iteration is enabled, range keys are interleaved at their start boundaries.
502 : // Limited iteration must ensure that if a range key exists within the limit,
503 : // the iterator visits the range key.
504 : //
505 : // During forward limited iteration, this is trivial: An overlapping range key
506 : // must have a start boundary less than the limit, and the range key's start
507 : // boundary will be interleaved and found to be within the limit.
508 : //
509 : // During reverse limited iteration, the tail of the range key may fall within
510 : // the limit. The range key must be surfaced even if the range key's start
511 : // boundary is less than the limit, and if there are no point keys between the
512 : // current iterator position and the limit. To provide this guarantee, reverse
513 : // limited iteration ignores the limit as long as there is a range key
514 : // overlapping the iteration position.
515 :
516 : // IterValidityState captures the state of the Iterator.
517 : type IterValidityState int8
518 :
519 : const (
520 : // IterExhausted represents an Iterator that is exhausted.
521 : IterExhausted IterValidityState = iota
522 : // IterValid represents an Iterator that is valid.
523 : IterValid
524 : // IterAtLimit represents an Iterator that has a non-exhausted
525 : // internalIterator, but has reached a limit without any key for the
526 : // caller.
527 : IterAtLimit
528 : )
529 :
530 : // readSampling stores variables used to sample a read to trigger a read
531 : // compaction
532 : type readSampling struct {
533 : bytesUntilReadSampling uint64
534 : initialSamplePassed bool
535 : pendingCompactions readCompactionQueue
536 : // forceReadSampling is used for testing purposes to force a read sample on every
537 : // call to Iterator.maybeSampleRead()
538 : forceReadSampling bool
539 : }
540 :
541 2 : func (i *Iterator) findNextEntry(limit []byte) {
542 2 : i.iterValidityState = IterExhausted
543 2 : i.pos = iterPosCurForward
544 2 : if i.opts.rangeKeys() && i.rangeKey != nil {
545 2 : i.rangeKey.rangeKeyOnly = false
546 2 : }
547 :
548 : // Close the closer for the current value if one was open.
549 2 : if i.closeValueCloser() != nil {
550 0 : return
551 0 : }
552 :
553 2 : for i.iterKV != nil {
554 2 : key := i.iterKV.K
555 2 :
556 2 : // The topLevelIterator.StrictSeekPrefixGE contract requires that in
557 2 : // prefix mode [i.hasPrefix=t], every point key returned by the internal
558 2 : // iterator must have the current iteration prefix.
559 2 : if invariants.Enabled && i.hasPrefix {
560 2 : // Range keys are an exception to the contract and may return a different
561 2 : // prefix. This case is explicitly handled in the switch statement below.
562 2 : if key.Kind() != base.InternalKeyKindRangeKeySet {
563 2 : if p := i.comparer.Split.Prefix(key.UserKey); !i.equal(i.prefixOrFullSeekKey, p) {
564 0 : i.opts.logger.Fatalf("pebble: prefix violation: key %q does not have prefix %q\n", key.UserKey, i.prefixOrFullSeekKey)
565 0 : }
566 : }
567 : }
568 :
569 : // Compare with limit every time we start at a different user key.
570 : // Note that given the best-effort contract of limit, we could avoid a
571 : // comparison in the common case by doing this only after
572 : // i.nextUserKey is called for the deletes below. However that makes
573 : // the behavior non-deterministic (since the behavior will vary based
574 : // on what has been compacted), which makes it hard to test with the
575 : // metamorphic test. So we forego that performance optimization.
576 2 : if limit != nil && i.cmp(limit, i.iterKV.K.UserKey) <= 0 {
577 2 : i.iterValidityState = IterAtLimit
578 2 : i.pos = iterPosCurForwardPaused
579 2 : return
580 2 : }
581 :
582 : // If the user has configured a SkipPoint function, invoke it to see
583 : // whether we should skip over the current user key.
584 2 : if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(i.iterKV.K.UserKey) {
585 2 : // NB: We could call nextUserKey, but in some cases the SkipPoint
586 2 : // predicate function might be cheaper than nextUserKey's key copy
587 2 : // and key comparison. This should be the case for MVCC suffix
588 2 : // comparisons, for example. In the future, we could expand the
589 2 : // SkipPoint interface to give the implementor more control over
590 2 : // whether we skip over just the internal key, the user key, or even
591 2 : // the key prefix.
592 2 : i.stats.ForwardStepCount[InternalIterCall]++
593 2 : i.iterKV = i.iter.Next()
594 2 : continue
595 : }
596 :
597 2 : switch key.Kind() {
598 2 : case InternalKeyKindRangeKeySet:
599 2 : if i.hasPrefix {
600 2 : if p := i.comparer.Split.Prefix(key.UserKey); !i.equal(i.prefixOrFullSeekKey, p) {
601 2 : return
602 2 : }
603 : }
604 : // Save the current key.
605 2 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
606 2 : i.key = i.keyBuf
607 2 : i.value = base.InternalValue{}
608 2 : // There may also be a live point key at this userkey that we have
609 2 : // not yet read. We need to find the next entry with this user key
610 2 : // to find it. Save the range key so we don't lose it when we Next
611 2 : // the underlying iterator.
612 2 : i.saveRangeKey()
613 2 : pointKeyExists := i.nextPointCurrentUserKey()
614 2 : if i.err != nil {
615 0 : i.iterValidityState = IterExhausted
616 0 : return
617 0 : }
618 2 : i.rangeKey.rangeKeyOnly = !pointKeyExists
619 2 : i.iterValidityState = IterValid
620 2 : return
621 :
622 2 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
623 2 : // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
624 2 : // only simpler, but is also necessary for correctness due to
625 2 : // InternalKeyKindSSTableInternalObsoleteBit.
626 2 : i.nextUserKey()
627 2 : continue
628 :
629 2 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
630 2 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
631 2 : i.key = i.keyBuf
632 2 : i.value = i.iterKV.V
633 2 : i.iterValidityState = IterValid
634 2 : i.saveRangeKey()
635 2 : return
636 :
637 2 : case InternalKeyKindMerge:
638 2 : // Resolving the merge may advance us to the next point key, which
639 2 : // may be covered by a different set of range keys. Save the range
640 2 : // key state so we don't lose it.
641 2 : i.saveRangeKey()
642 2 : if i.mergeForward(key) {
643 2 : i.iterValidityState = IterValid
644 2 : return
645 2 : }
646 :
647 : // The merge didn't yield a valid key, either because the value
648 : // merger indicated it should be deleted, or because an error was
649 : // encountered.
650 1 : i.iterValidityState = IterExhausted
651 1 : if i.err != nil {
652 1 : return
653 1 : }
654 1 : if i.pos != iterPosNext {
655 0 : i.nextUserKey()
656 0 : }
657 1 : if i.closeValueCloser() != nil {
658 0 : return
659 0 : }
660 1 : i.pos = iterPosCurForward
661 :
662 0 : default:
663 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
664 0 : i.iterValidityState = IterExhausted
665 0 : return
666 : }
667 : }
668 :
669 : // Is iterKey nil due to an error?
670 2 : if err := i.iter.Error(); err != nil {
671 1 : i.err = err
672 1 : i.iterValidityState = IterExhausted
673 1 : }
674 : }
675 :
676 2 : func (i *Iterator) nextPointCurrentUserKey() bool {
677 2 : // If the user has configured a SkipPoint function and the current user key
678 2 : // would be skipped by it, there's no need to step forward looking for a
679 2 : // point key. If we were to find one, it should be skipped anyways.
680 2 : if i.opts.SkipPoint != nil && i.opts.SkipPoint(i.key) {
681 2 : return false
682 2 : }
683 :
684 2 : i.pos = iterPosCurForward
685 2 :
686 2 : i.iterKV = i.iter.Next()
687 2 : i.stats.ForwardStepCount[InternalIterCall]++
688 2 : if i.iterKV == nil {
689 2 : if err := i.iter.Error(); err != nil {
690 0 : i.err = err
691 2 : } else {
692 2 : i.pos = iterPosNext
693 2 : }
694 2 : return false
695 : }
696 2 : if !i.equal(i.key, i.iterKV.K.UserKey) {
697 2 : i.pos = iterPosNext
698 2 : return false
699 2 : }
700 :
701 2 : key := i.iterKV.K
702 2 : switch key.Kind() {
703 0 : case InternalKeyKindRangeKeySet:
704 0 : // RangeKeySets must always be interleaved as the first internal key
705 0 : // for a user key.
706 0 : i.err = base.CorruptionErrorf("pebble: unexpected range key set mid-user key")
707 0 : return false
708 :
709 2 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
710 2 : // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
711 2 : // only simpler, but is also necessary for correctness due to
712 2 : // InternalKeyKindSSTableInternalObsoleteBit.
713 2 : return false
714 :
715 2 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
716 2 : i.value = i.iterKV.V
717 2 : return true
718 :
719 2 : case InternalKeyKindMerge:
720 2 : return i.mergeForward(key)
721 :
722 0 : default:
723 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
724 0 : return false
725 : }
726 : }
727 :
728 : // mergeForward resolves a MERGE key, advancing the underlying iterator forward
729 : // to merge with subsequent keys with the same userkey. mergeForward returns a
730 : // boolean indicating whether or not the merge yielded a valid key. A merge may
731 : // not yield a valid key if an error occurred, in which case i.err is non-nil,
732 : // or the user's value merger specified the key to be deleted.
733 : //
734 : // mergeForward does not update iterValidityState.
735 2 : func (i *Iterator) mergeForward(key base.InternalKey) (valid bool) {
736 2 : var iterValue []byte
737 2 : iterValue, _, i.err = i.iterKV.Value(nil)
738 2 : if i.err != nil {
739 0 : return false
740 0 : }
741 2 : var valueMerger ValueMerger
742 2 : valueMerger, i.err = i.merge(key.UserKey, iterValue)
743 2 : if i.err != nil {
744 0 : return false
745 0 : }
746 :
747 2 : i.mergeNext(key, valueMerger)
748 2 : if i.err != nil {
749 1 : return false
750 1 : }
751 :
752 2 : var needDelete bool
753 2 : var value []byte
754 2 : value, needDelete, i.valueCloser, i.err = finishValueMerger(
755 2 : valueMerger, true /* includesBase */)
756 2 : i.value = base.MakeInPlaceValue(value)
757 2 : if i.err != nil {
758 0 : return false
759 0 : }
760 2 : if needDelete {
761 1 : _ = i.closeValueCloser()
762 1 : return false
763 1 : }
764 2 : return true
765 : }
766 :
767 2 : func (i *Iterator) closeValueCloser() error {
768 2 : if i.valueCloser != nil {
769 0 : i.err = i.valueCloser.Close()
770 0 : i.valueCloser = nil
771 0 : }
772 2 : return i.err
773 : }
774 :
775 2 : func (i *Iterator) nextUserKey() {
776 2 : if i.iterKV == nil {
777 2 : return
778 2 : }
779 2 : trailer := i.iterKV.K.Trailer
780 2 : done := i.iterKV.K.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
781 2 : if i.iterValidityState != IterValid {
782 2 : i.keyBuf = append(i.keyBuf[:0], i.iterKV.K.UserKey...)
783 2 : i.key = i.keyBuf
784 2 : }
785 2 : for {
786 2 : i.stats.ForwardStepCount[InternalIterCall]++
787 2 : i.iterKV = i.iter.Next()
788 2 : if i.iterKV == nil {
789 2 : if err := i.iter.Error(); err != nil {
790 1 : i.err = err
791 1 : return
792 1 : }
793 : }
794 : // NB: We're guaranteed to be on the next user key if the previous key
795 : // had a zero sequence number (`done`), or the new key has a trailer
796 : // greater or equal to the previous key's trailer. This is true because
797 : // internal keys with the same user key are sorted by InternalKeyTrailer in
798 : // strictly monotonically descending order. We expect the trailer
799 : // optimization to trigger around 50% of the time with randomly
800 : // distributed writes. We expect it to trigger very frequently when
801 : // iterating through ingested sstables, which contain keys that all have
802 : // the same sequence number.
803 2 : if done || i.iterKV == nil || i.iterKV.K.Trailer >= trailer {
804 2 : break
805 : }
806 2 : if !i.equal(i.key, i.iterKV.K.UserKey) {
807 2 : break
808 : }
809 2 : done = i.iterKV.K.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
810 2 : trailer = i.iterKV.K.Trailer
811 : }
812 : }
813 :
814 2 : func (i *Iterator) maybeSampleRead() {
815 2 : // This method is only called when a public method of Iterator is
816 2 : // returning, and below we exclude the case were the iterator is paused at
817 2 : // a limit. The effect of these choices is that keys that are deleted, but
818 2 : // are encountered during iteration, are not accounted for in the read
819 2 : // sampling and will not cause read driven compactions, even though we are
820 2 : // incurring cost in iterating over them. And this issue is not limited to
821 2 : // Iterator, which does not see the effect of range deletes, which may be
822 2 : // causing iteration work in mergingIter. It is not clear at this time
823 2 : // whether this is a deficiency worth addressing.
824 2 : if i.iterValidityState != IterValid {
825 2 : return
826 2 : }
827 2 : if i.readState == nil {
828 2 : return
829 2 : }
830 2 : if i.readSampling.forceReadSampling {
831 1 : i.sampleRead()
832 1 : return
833 1 : }
834 2 : samplingPeriod := int32(int64(readBytesPeriod) * i.readState.db.opts.Experimental.ReadSamplingMultiplier)
835 2 : if samplingPeriod <= 0 {
836 0 : return
837 0 : }
838 2 : bytesRead := uint64(len(i.key) + i.value.Len())
839 2 : for i.readSampling.bytesUntilReadSampling < bytesRead {
840 2 : i.readSampling.bytesUntilReadSampling += uint64(rand.Uint32N(2 * uint32(samplingPeriod)))
841 2 : // The block below tries to adjust for the case where this is the
842 2 : // first read in a newly-opened iterator. As bytesUntilReadSampling
843 2 : // starts off at zero, we don't want to sample the first read of
844 2 : // every newly-opened iterator, but we do want to sample some of them.
845 2 : if !i.readSampling.initialSamplePassed {
846 2 : i.readSampling.initialSamplePassed = true
847 2 : if i.readSampling.bytesUntilReadSampling > bytesRead {
848 2 : if rand.Uint64N(i.readSampling.bytesUntilReadSampling) > bytesRead {
849 2 : continue
850 : }
851 : }
852 : }
853 2 : i.sampleRead()
854 : }
855 2 : i.readSampling.bytesUntilReadSampling -= bytesRead
856 : }
857 :
858 2 : func (i *Iterator) sampleRead() {
859 2 : var topFile *manifest.TableMetadata
860 2 : topLevel, numOverlappingLevels := numLevels, 0
861 2 : mi := i.merging
862 2 : if mi == nil {
863 1 : return
864 1 : }
865 2 : if len(mi.levels) > 1 {
866 2 : mi.ForEachLevelIter(func(li *levelIter) (done bool) {
867 2 : if li.layer.IsFlushableIngests() {
868 0 : return false
869 0 : }
870 2 : l := li.layer.Level()
871 2 : if f := li.iterFile; f != nil {
872 2 : var containsKey bool
873 2 : if i.pos == iterPosNext || i.pos == iterPosCurForward ||
874 2 : i.pos == iterPosCurForwardPaused {
875 2 : containsKey = i.cmp(f.PointKeyBounds.SmallestUserKey(), i.key) <= 0
876 2 : } else if i.pos == iterPosPrev || i.pos == iterPosCurReverse ||
877 2 : i.pos == iterPosCurReversePaused {
878 2 : containsKey = i.cmp(f.PointKeyBounds.LargestUserKey(), i.key) >= 0
879 2 : }
880 : // Do nothing if the current key is not contained in f's
881 : // bounds. We could seek the LevelIterator at this level
882 : // to find the right file, but the performance impacts of
883 : // doing that are significant enough to negate the benefits
884 : // of read sampling in the first place. See the discussion
885 : // at:
886 : // https://github.com/cockroachdb/pebble/pull/1041#issuecomment-763226492
887 2 : if containsKey {
888 2 : numOverlappingLevels++
889 2 : if numOverlappingLevels >= 2 {
890 2 : // Terminate the loop early if at least 2 overlapping levels are found.
891 2 : return true
892 2 : }
893 2 : topLevel = l
894 2 : topFile = f
895 : }
896 : }
897 2 : return false
898 : })
899 : }
900 2 : if topFile == nil || topLevel >= numLevels {
901 2 : return
902 2 : }
903 2 : if numOverlappingLevels >= 2 {
904 2 : allowedSeeks := topFile.AllowedSeeks.Add(-1)
905 2 : if allowedSeeks == 0 {
906 1 :
907 1 : // Since the compaction queue can handle duplicates, we can keep
908 1 : // adding to the queue even once allowedSeeks hits 0.
909 1 : // In fact, we NEED to keep adding to the queue, because the queue
910 1 : // is small and evicts older and possibly useful compactions.
911 1 : topFile.AllowedSeeks.Add(topFile.InitAllowedSeeks)
912 1 :
913 1 : read := readCompaction{
914 1 : start: topFile.PointKeyBounds.SmallestUserKey(),
915 1 : end: topFile.PointKeyBounds.LargestUserKey(),
916 1 : level: topLevel,
917 1 : tableNum: topFile.TableNum,
918 1 : }
919 1 : i.readSampling.pendingCompactions.add(&read, i.cmp)
920 1 : }
921 : }
922 : }
923 :
924 2 : func (i *Iterator) findPrevEntry(limit []byte) {
925 2 : i.iterValidityState = IterExhausted
926 2 : i.pos = iterPosCurReverse
927 2 : if i.opts.rangeKeys() && i.rangeKey != nil {
928 2 : i.rangeKey.rangeKeyOnly = false
929 2 : }
930 :
931 : // Close the closer for the current value if one was open.
932 2 : if i.valueCloser != nil {
933 0 : i.err = i.valueCloser.Close()
934 0 : i.valueCloser = nil
935 0 : if i.err != nil {
936 0 : i.iterValidityState = IterExhausted
937 0 : return
938 0 : }
939 : }
940 :
941 2 : var valueMerger ValueMerger
942 2 : firstLoopIter := true
943 2 : rangeKeyBoundary := false
944 2 : // The code below compares with limit in multiple places. As documented in
945 2 : // findNextEntry, this is being done to make the behavior of limit
946 2 : // deterministic to allow for metamorphic testing. It is not required by
947 2 : // the best-effort contract of limit.
948 2 : for i.iterKV != nil {
949 2 : key := i.iterKV.K
950 2 :
951 2 : // NB: We cannot pause if the current key is covered by a range key.
952 2 : // Otherwise, the user might not ever learn of a range key that covers
953 2 : // the key space being iterated over in which there are no point keys.
954 2 : // Since limits are best effort, ignoring the limit in this case is
955 2 : // allowed by the contract of limit.
956 2 : if firstLoopIter && limit != nil && i.cmp(limit, i.iterKV.K.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
957 2 : i.iterValidityState = IterAtLimit
958 2 : i.pos = iterPosCurReversePaused
959 2 : return
960 2 : }
961 2 : firstLoopIter = false
962 2 :
963 2 : if i.iterValidityState == IterValid {
964 2 : if !i.equal(key.UserKey, i.key) {
965 2 : // We've iterated to the previous user key.
966 2 : i.pos = iterPosPrev
967 2 : if valueMerger != nil {
968 2 : var needDelete bool
969 2 : var value []byte
970 2 : value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
971 2 : i.value = base.MakeInPlaceValue(value)
972 2 : if i.err == nil && needDelete {
973 1 : // The point key at this key is deleted. If we also have
974 1 : // a range key boundary at this key, we still want to
975 1 : // return. Otherwise, we need to continue looking for
976 1 : // a live key.
977 1 : i.value = base.InternalValue{}
978 1 : if rangeKeyBoundary {
979 0 : i.rangeKey.rangeKeyOnly = true
980 1 : } else {
981 1 : i.iterValidityState = IterExhausted
982 1 : if i.closeValueCloser() == nil {
983 1 : continue
984 : }
985 : }
986 : }
987 : }
988 2 : if i.err != nil {
989 0 : i.iterValidityState = IterExhausted
990 0 : }
991 2 : return
992 : }
993 : }
994 :
995 : // If the user has configured a SkipPoint function, invoke it to see
996 : // whether we should skip over the current user key.
997 2 : if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(key.UserKey) {
998 2 : // NB: We could call prevUserKey, but in some cases the SkipPoint
999 2 : // predicate function might be cheaper than prevUserKey's key copy
1000 2 : // and key comparison. This should be the case for MVCC suffix
1001 2 : // comparisons, for example. In the future, we could expand the
1002 2 : // SkipPoint interface to give the implementor more control over
1003 2 : // whether we skip over just the internal key, the user key, or even
1004 2 : // the key prefix.
1005 2 : i.stats.ReverseStepCount[InternalIterCall]++
1006 2 : i.iterKV = i.iter.Prev()
1007 2 : if i.iterKV == nil {
1008 2 : if err := i.iter.Error(); err != nil {
1009 0 : i.err = err
1010 0 : i.iterValidityState = IterExhausted
1011 0 : return
1012 0 : }
1013 : }
1014 2 : if limit != nil && i.iterKV != nil && i.cmp(limit, i.iterKV.K.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
1015 2 : i.iterValidityState = IterAtLimit
1016 2 : i.pos = iterPosCurReversePaused
1017 2 : return
1018 2 : }
1019 2 : continue
1020 : }
1021 :
1022 2 : switch key.Kind() {
1023 2 : case InternalKeyKindRangeKeySet:
1024 2 : // Range key start boundary markers are interleaved with the maximum
1025 2 : // sequence number, so if there's a point key also at this key, we
1026 2 : // must've already iterated over it.
1027 2 : // This is the final entry at this user key, so we may return
1028 2 : i.rangeKey.rangeKeyOnly = i.iterValidityState != IterValid
1029 2 : if i.rangeKey.rangeKeyOnly {
1030 2 : // The point iterator is now invalid, so clear the point value.
1031 2 : i.value = base.InternalValue{}
1032 2 : }
1033 2 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
1034 2 : i.key = i.keyBuf
1035 2 : i.iterValidityState = IterValid
1036 2 : i.saveRangeKey()
1037 2 : // In all other cases, previous iteration requires advancing to
1038 2 : // iterPosPrev in order to determine if the key is live and
1039 2 : // unshadowed by another key at the same user key. In this case,
1040 2 : // because range key start boundary markers are always interleaved
1041 2 : // at the maximum sequence number, we know that there aren't any
1042 2 : // additional keys with the same user key in the backward direction.
1043 2 : //
1044 2 : // We Prev the underlying iterator once anyways for consistency, so
1045 2 : // that we can maintain the invariant during backward iteration that
1046 2 : // i.iterPos = iterPosPrev.
1047 2 : i.stats.ReverseStepCount[InternalIterCall]++
1048 2 : i.iterKV = i.iter.Prev()
1049 2 :
1050 2 : // Set rangeKeyBoundary so that on the next iteration, we know to
1051 2 : // return the key even if the MERGE point key is deleted.
1052 2 : rangeKeyBoundary = true
1053 :
1054 2 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
1055 2 : i.value = base.InternalValue{}
1056 2 : i.iterValidityState = IterExhausted
1057 2 : valueMerger = nil
1058 2 : i.stats.ReverseStepCount[InternalIterCall]++
1059 2 : i.iterKV = i.iter.Prev()
1060 2 : // Compare with the limit. We could optimize by only checking when
1061 2 : // we step to the previous user key, but detecting that requires a
1062 2 : // comparison too. Note that this position may already passed a
1063 2 : // number of versions of this user key, but they are all deleted, so
1064 2 : // the fact that a subsequent Prev*() call will not see them is
1065 2 : // harmless. Also note that this is the only place in the loop,
1066 2 : // other than the firstLoopIter and SkipPoint cases above, where we
1067 2 : // could step to a different user key and start processing it for
1068 2 : // returning to the caller.
1069 2 : if limit != nil && i.iterKV != nil && i.cmp(limit, i.iterKV.K.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
1070 2 : i.iterValidityState = IterAtLimit
1071 2 : i.pos = iterPosCurReversePaused
1072 2 : return
1073 2 : }
1074 2 : continue
1075 :
1076 2 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
1077 2 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
1078 2 : i.key = i.keyBuf
1079 2 : // iterValue is owned by i.iter and could change after the Prev()
1080 2 : // call, so use valueBuf instead. Note that valueBuf is only used
1081 2 : // in this one instance; everywhere else (eg. in findNextEntry),
1082 2 : // we just point i.value to the unsafe i.iter-owned value buffer.
1083 2 : i.value, i.valueBuf = i.iterKV.V.Clone(i.valueBuf[:0], &i.fetcher)
1084 2 : i.saveRangeKey()
1085 2 : i.iterValidityState = IterValid
1086 2 : i.iterKV = i.iter.Prev()
1087 2 : i.stats.ReverseStepCount[InternalIterCall]++
1088 2 : valueMerger = nil
1089 2 : continue
1090 :
1091 2 : case InternalKeyKindMerge:
1092 2 : if i.iterValidityState == IterExhausted {
1093 2 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
1094 2 : i.key = i.keyBuf
1095 2 : i.saveRangeKey()
1096 2 : var iterValue []byte
1097 2 : iterValue, _, i.err = i.iterKV.Value(nil)
1098 2 : if i.err != nil {
1099 0 : return
1100 0 : }
1101 2 : valueMerger, i.err = i.merge(i.key, iterValue)
1102 2 : if i.err != nil {
1103 0 : return
1104 0 : }
1105 2 : i.iterValidityState = IterValid
1106 2 : } else if valueMerger == nil {
1107 2 : // Extract value before iterValue since we use value before iterValue
1108 2 : // and the underlying iterator is not required to provide backing
1109 2 : // memory for both simultaneously.
1110 2 : var value []byte
1111 2 : var callerOwned bool
1112 2 : value, callerOwned, i.err = i.value.Value(i.lazyValueBuf)
1113 2 : if callerOwned {
1114 0 : i.lazyValueBuf = value[:0]
1115 0 : }
1116 2 : if i.err != nil {
1117 0 : i.iterValidityState = IterExhausted
1118 0 : return
1119 0 : }
1120 2 : valueMerger, i.err = i.merge(i.key, value)
1121 2 : var iterValue []byte
1122 2 : iterValue, _, i.err = i.iterKV.Value(nil)
1123 2 : if i.err != nil {
1124 0 : i.iterValidityState = IterExhausted
1125 0 : return
1126 0 : }
1127 2 : if i.err == nil {
1128 2 : i.err = valueMerger.MergeNewer(iterValue)
1129 2 : }
1130 2 : if i.err != nil {
1131 0 : i.iterValidityState = IterExhausted
1132 0 : return
1133 0 : }
1134 2 : } else {
1135 2 : var iterValue []byte
1136 2 : iterValue, _, i.err = i.iterKV.Value(nil)
1137 2 : if i.err != nil {
1138 0 : i.iterValidityState = IterExhausted
1139 0 : return
1140 0 : }
1141 2 : i.err = valueMerger.MergeNewer(iterValue)
1142 2 : if i.err != nil {
1143 0 : i.iterValidityState = IterExhausted
1144 0 : return
1145 0 : }
1146 : }
1147 2 : i.iterKV = i.iter.Prev()
1148 2 : i.stats.ReverseStepCount[InternalIterCall]++
1149 2 : continue
1150 :
1151 0 : default:
1152 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
1153 0 : i.iterValidityState = IterExhausted
1154 0 : return
1155 : }
1156 : }
1157 : // i.iterKV == nil, so broke out of the preceding loop.
1158 :
1159 : // Is iterKey nil due to an error?
1160 2 : if i.err = i.iter.Error(); i.err != nil {
1161 1 : i.iterValidityState = IterExhausted
1162 1 : return
1163 1 : }
1164 :
1165 2 : if i.iterValidityState == IterValid {
1166 2 : i.pos = iterPosPrev
1167 2 : if valueMerger != nil {
1168 2 : var needDelete bool
1169 2 : var value []byte
1170 2 : value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
1171 2 : i.value = base.MakeInPlaceValue(value)
1172 2 : if i.err == nil && needDelete {
1173 1 : i.key = nil
1174 1 : i.value = base.InternalValue{}
1175 1 : i.iterValidityState = IterExhausted
1176 1 : }
1177 : }
1178 2 : if i.err != nil {
1179 0 : i.iterValidityState = IterExhausted
1180 0 : }
1181 : }
1182 : }
1183 :
1184 2 : func (i *Iterator) prevUserKey() {
1185 2 : if i.iterKV == nil {
1186 2 : return
1187 2 : }
1188 2 : if i.iterValidityState != IterValid {
1189 2 : // If we're going to compare against the prev key, we need to save the
1190 2 : // current key.
1191 2 : i.keyBuf = append(i.keyBuf[:0], i.iterKV.K.UserKey...)
1192 2 : i.key = i.keyBuf
1193 2 : }
1194 2 : for {
1195 2 : i.iterKV = i.iter.Prev()
1196 2 : i.stats.ReverseStepCount[InternalIterCall]++
1197 2 : if i.iterKV == nil {
1198 2 : if err := i.iter.Error(); err != nil {
1199 1 : i.err = err
1200 1 : i.iterValidityState = IterExhausted
1201 1 : }
1202 2 : break
1203 : }
1204 2 : if !i.equal(i.key, i.iterKV.K.UserKey) {
1205 2 : break
1206 : }
1207 : }
1208 : }
1209 :
1210 2 : func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) {
1211 2 : // Save the current key.
1212 2 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
1213 2 : i.key = i.keyBuf
1214 2 :
1215 2 : // Loop looking for older values for this key and merging them.
1216 2 : for {
1217 2 : i.iterKV = i.iter.Next()
1218 2 : i.stats.ForwardStepCount[InternalIterCall]++
1219 2 : if i.iterKV == nil {
1220 2 : if i.err = i.iter.Error(); i.err != nil {
1221 1 : return
1222 1 : }
1223 2 : i.pos = iterPosNext
1224 2 : return
1225 : }
1226 2 : key = i.iterKV.K
1227 2 : if !i.equal(i.key, key.UserKey) {
1228 2 : // We've advanced to the next key.
1229 2 : i.pos = iterPosNext
1230 2 : return
1231 2 : }
1232 2 : switch key.Kind() {
1233 2 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
1234 2 : // We've hit a deletion tombstone. Return everything up to this
1235 2 : // point.
1236 2 : //
1237 2 : // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
1238 2 : // only simpler, but is also necessary for correctness due to
1239 2 : // InternalKeyKindSSTableInternalObsoleteBit.
1240 2 : return
1241 :
1242 2 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
1243 2 : // We've hit a Set value. Merge with the existing value and return.
1244 2 : var iterValue []byte
1245 2 : iterValue, _, i.err = i.iterKV.Value(nil)
1246 2 : if i.err != nil {
1247 0 : return
1248 0 : }
1249 2 : i.err = valueMerger.MergeOlder(iterValue)
1250 2 : return
1251 :
1252 2 : case InternalKeyKindMerge:
1253 2 : // We've hit another Merge value. Merge with the existing value and
1254 2 : // continue looping.
1255 2 : var iterValue []byte
1256 2 : iterValue, _, i.err = i.iterKV.Value(nil)
1257 2 : if i.err != nil {
1258 0 : return
1259 0 : }
1260 2 : i.err = valueMerger.MergeOlder(iterValue)
1261 2 : if i.err != nil {
1262 0 : return
1263 0 : }
1264 2 : continue
1265 :
1266 0 : case InternalKeyKindRangeKeySet:
1267 0 : // The RANGEKEYSET marker must sort before a MERGE at the same user key.
1268 0 : i.err = base.CorruptionErrorf("pebble: out of order range key marker")
1269 0 : return
1270 :
1271 0 : default:
1272 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
1273 0 : return
1274 : }
1275 : }
1276 : }
1277 :
1278 : // SeekGE moves the iterator to the first key/value pair whose key is greater
1279 : // than or equal to the given key. Returns true if the iterator is pointing at
1280 : // a valid entry and false otherwise.
1281 2 : func (i *Iterator) SeekGE(key []byte) bool {
1282 2 : return i.SeekGEWithLimit(key, nil) == IterValid
1283 2 : }
1284 :
1285 : // SeekGEWithLimit moves the iterator to the first key/value pair whose key is
1286 : // greater than or equal to the given key.
1287 : //
1288 : // If limit is provided, it serves as a best-effort exclusive limit. If the
1289 : // first key greater than or equal to the given search key is also greater than
1290 : // or equal to limit, the Iterator may pause and return IterAtLimit. Because
1291 : // limits are best-effort, SeekGEWithLimit may return a key beyond limit.
1292 : //
1293 : // If the Iterator is configured to iterate over range keys, SeekGEWithLimit
1294 : // guarantees it will surface any range keys with bounds overlapping the
1295 : // keyspace [key, limit).
1296 2 : func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState {
1297 2 : if i.rangeKey != nil {
1298 2 : // NB: Check Valid() before clearing requiresReposition.
1299 2 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1300 2 : // If we have a range key but did not expose it at the previous iterator
1301 2 : // position (because the iterator was not at a valid position), updated
1302 2 : // must be true. This ensures that after an iterator op sequence like:
1303 2 : // - Next() → (IterValid, RangeBounds() = [a,b))
1304 2 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1305 2 : // - SeekGE(...) → (IterValid, RangeBounds() = [a,b))
1306 2 : // the iterator returns RangeKeyChanged()=true.
1307 2 : //
1308 2 : // The remainder of this function will only update i.rangeKey.updated if
1309 2 : // the iterator moves into a new range key, or out of the current range
1310 2 : // key.
1311 2 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1312 2 : }
1313 2 : lastPositioningOp := i.lastPositioningOp
1314 2 : // Set it to unknown, since this operation may not succeed, in which case
1315 2 : // the SeekGE following this should not make any assumption about iterator
1316 2 : // position.
1317 2 : i.lastPositioningOp = unknownLastPositionOp
1318 2 : i.requiresReposition = false
1319 2 : i.err = nil // clear cached iteration error
1320 2 : i.hasPrefix = false
1321 2 : i.stats.ForwardSeekCount[InterfaceCall]++
1322 2 : if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1323 2 : key = lowerBound
1324 2 : } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1325 2 : key = upperBound
1326 2 : }
1327 2 : seekInternalIter := true
1328 2 :
1329 2 : var flags base.SeekGEFlags
1330 2 : if i.batchJustRefreshed {
1331 1 : i.batchJustRefreshed = false
1332 1 : flags = flags.EnableBatchJustRefreshed()
1333 1 : }
1334 2 : if lastPositioningOp == seekGELastPositioningOp {
1335 2 : cmp := i.cmp(i.prefixOrFullSeekKey, key)
1336 2 : // If this seek is to the same or later key, and the iterator is
1337 2 : // already positioned there, this is a noop. This can be helpful for
1338 2 : // sparse key spaces that have many deleted keys, where one can avoid
1339 2 : // the overhead of iterating past them again and again.
1340 2 : if cmp <= 0 {
1341 2 : if !flags.BatchJustRefreshed() &&
1342 2 : (i.iterValidityState == IterExhausted ||
1343 2 : (i.iterValidityState == IterValid && i.cmp(key, i.key) <= 0 &&
1344 2 : (limit == nil || i.cmp(i.key, limit) < 0))) {
1345 2 : // Noop
1346 2 : if i.forceEnableSeekOpt || !testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1347 2 : i.lastPositioningOp = seekGELastPositioningOp
1348 2 : return i.iterValidityState
1349 2 : }
1350 : }
1351 : // cmp == 0 is not safe to optimize since
1352 : // - i.pos could be at iterPosNext, due to a merge.
1353 : // - Even if i.pos were at iterPosCurForward, we could have a DELETE,
1354 : // SET pair for a key, and the iterator would have moved past DELETE
1355 : // but stayed at iterPosCurForward. A similar situation occurs for a
1356 : // MERGE, SET pair where the MERGE is consumed and the iterator is
1357 : // at the SET.
1358 : // We also leverage the IterAtLimit <=> i.pos invariant defined in the
1359 : // comment on iterValidityState, to exclude any cases where i.pos
1360 : // is iterPosCur{Forward,Reverse}Paused. This avoids the need to
1361 : // special-case those iterator positions and their interactions with
1362 : // TrySeekUsingNext, as the main uses for TrySeekUsingNext in CockroachDB
1363 : // do not use limited Seeks in the first place.
1364 2 : if cmp < 0 && i.iterValidityState != IterAtLimit && limit == nil {
1365 2 : flags = flags.EnableTrySeekUsingNext()
1366 2 : }
1367 2 : if testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) && !i.forceEnableSeekOpt {
1368 2 : flags = flags.DisableTrySeekUsingNext()
1369 2 : }
1370 2 : if !flags.BatchJustRefreshed() && i.pos == iterPosCurForwardPaused && i.cmp(key, i.iterKV.K.UserKey) <= 0 {
1371 2 : // Have some work to do, but don't need to seek, and we can
1372 2 : // start doing findNextEntry from i.iterKey.
1373 2 : seekInternalIter = false
1374 2 : }
1375 : }
1376 : }
1377 2 : if seekInternalIter {
1378 2 : i.iterKV = i.iter.SeekGE(key, flags)
1379 2 : i.stats.ForwardSeekCount[InternalIterCall]++
1380 2 : if err := i.iter.Error(); err != nil {
1381 1 : i.err = err
1382 1 : i.iterValidityState = IterExhausted
1383 1 : return i.iterValidityState
1384 1 : }
1385 : }
1386 2 : i.findNextEntry(limit)
1387 2 : i.maybeSampleRead()
1388 2 : if i.Error() == nil {
1389 2 : // Prepare state for a future noop optimization.
1390 2 : i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
1391 2 : i.lastPositioningOp = seekGELastPositioningOp
1392 2 : }
1393 2 : return i.iterValidityState
1394 : }
1395 :
1396 : // SeekPrefixGE moves the iterator to the first key/value pair whose key is
1397 : // greater than or equal to the given key and which has the same "prefix" as
1398 : // the given key. The prefix for a key is determined by the user-defined
1399 : // Comparer.Split function. The iterator will not observe keys not matching the
1400 : // "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix
1401 : // iteration mode. The iterator remains in prefix iteration until a subsequent
1402 : // call to another absolute positioning method (SeekGE, SeekLT, First,
1403 : // Last). Reverse iteration (Prev) is not supported when an iterator is in
1404 : // prefix iteration mode. Returns true if the iterator is pointing at a valid
1405 : // entry and false otherwise.
1406 : //
1407 : // The semantics of SeekPrefixGE are slightly unusual and designed for
1408 : // iteration to be able to take advantage of bloom filters that have been
1409 : // created on the "prefix". If you're not using bloom filters, there is no
1410 : // reason to use SeekPrefixGE.
1411 : //
1412 : // An example Split function may separate a timestamp suffix from the prefix of
1413 : // the key.
1414 : //
1415 : // Split(<key>@<timestamp>) -> <key>
1416 : //
1417 : // Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys
1418 : // are "a", and "aa". Note that despite "a" and "aa" sharing a prefix by the
1419 : // usual definition, those prefixes differ by the definition of the Split
1420 : // function. To see how this works, consider the following set of calls on this
1421 : // data set:
1422 : //
1423 : // SeekPrefixGE("a@0") -> "a@1"
1424 : // Next() -> "a@2"
1425 : // Next() -> EOF
1426 : //
1427 : // If you're just looking to iterate over keys with a shared prefix, as
1428 : // defined by the configured comparer, set iterator bounds instead:
1429 : //
1430 : // iter := db.NewIter(&pebble.IterOptions{
1431 : // LowerBound: []byte("prefix"),
1432 : // UpperBound: []byte("prefiy"),
1433 : // })
1434 : // for iter.First(); iter.Valid(); iter.Next() {
1435 : // // Only keys beginning with "prefix" will be visited.
1436 : // }
1437 : //
1438 : // See ExampleIterator_SeekPrefixGE for a working example.
1439 : //
1440 : // When iterating with range keys enabled, all range keys encountered are
1441 : // truncated to the seek key's prefix's bounds. The truncation of the upper
1442 : // bound requires that the database's Comparer is configured with a
1443 : // ImmediateSuccessor method. For example, a SeekPrefixGE("a@9") call with the
1444 : // prefix "a" will truncate range key bounds to [a,ImmediateSuccessor(a)].
1445 2 : func (i *Iterator) SeekPrefixGE(key []byte) bool {
1446 2 : if i.rangeKey != nil {
1447 2 : // NB: Check Valid() before clearing requiresReposition.
1448 2 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1449 2 : // If we have a range key but did not expose it at the previous iterator
1450 2 : // position (because the iterator was not at a valid position), updated
1451 2 : // must be true. This ensures that after an iterator op sequence like:
1452 2 : // - Next() → (IterValid, RangeBounds() = [a,b))
1453 2 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1454 2 : // - SeekPrefixGE(...) → (IterValid, RangeBounds() = [a,b))
1455 2 : // the iterator returns RangeKeyChanged()=true.
1456 2 : //
1457 2 : // The remainder of this function will only update i.rangeKey.updated if
1458 2 : // the iterator moves into a new range key, or out of the current range
1459 2 : // key.
1460 2 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1461 2 : }
1462 2 : lastPositioningOp := i.lastPositioningOp
1463 2 : // Set it to unknown, since this operation may not succeed, in which case
1464 2 : // the SeekPrefixGE following this should not make any assumption about
1465 2 : // iterator position.
1466 2 : i.lastPositioningOp = unknownLastPositionOp
1467 2 : i.requiresReposition = false
1468 2 : i.err = nil // clear cached iteration error
1469 2 : i.stats.ForwardSeekCount[InterfaceCall]++
1470 2 : if i.comparer.ImmediateSuccessor == nil && i.opts.KeyTypes != IterKeyTypePointsOnly {
1471 0 : panic("pebble: ImmediateSuccessor must be provided for SeekPrefixGE with range keys")
1472 : }
1473 2 : prefixLen := i.comparer.Split(key)
1474 2 : keyPrefix := key[:prefixLen]
1475 2 : var flags base.SeekGEFlags
1476 2 : if i.batchJustRefreshed {
1477 1 : flags = flags.EnableBatchJustRefreshed()
1478 1 : i.batchJustRefreshed = false
1479 1 : }
1480 2 : if lastPositioningOp == seekPrefixGELastPositioningOp {
1481 2 : if !i.hasPrefix {
1482 0 : panic("lastPositioningOpsIsSeekPrefixGE is true, but hasPrefix is false")
1483 : }
1484 : // The iterator has not been repositioned after the last SeekPrefixGE.
1485 : // See if we are seeking to a larger key, since then we can optimize
1486 : // the seek by using next. Note that we could also optimize if Next
1487 : // has been called, if the iterator is not exhausted and the current
1488 : // position is <= the seek key. We are keeping this limited for now
1489 : // since such optimizations require care for correctness, and to not
1490 : // become de-optimizations (if one usually has to do all the next
1491 : // calls and then the seek). This SeekPrefixGE optimization
1492 : // specifically benefits CockroachDB.
1493 2 : cmp := i.cmp(i.prefixOrFullSeekKey, keyPrefix)
1494 2 : // cmp == 0 is not safe to optimize since
1495 2 : // - i.pos could be at iterPosNext, due to a merge.
1496 2 : // - Even if i.pos were at iterPosCurForward, we could have a DELETE,
1497 2 : // SET pair for a key, and the iterator would have moved past DELETE
1498 2 : // but stayed at iterPosCurForward. A similar situation occurs for a
1499 2 : // MERGE, SET pair where the MERGE is consumed and the iterator is
1500 2 : // at the SET.
1501 2 : // In general some versions of i.prefix could have been consumed by
1502 2 : // the iterator, so we only optimize for cmp < 0.
1503 2 : if cmp < 0 {
1504 2 : flags = flags.EnableTrySeekUsingNext()
1505 2 : }
1506 2 : if testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) && !i.forceEnableSeekOpt {
1507 2 : flags = flags.DisableTrySeekUsingNext()
1508 2 : }
1509 : }
1510 : // Make a copy of the prefix so that modifications to the key after
1511 : // SeekPrefixGE returns does not affect the stored prefix.
1512 2 : if cap(i.prefixOrFullSeekKey) < prefixLen {
1513 2 : i.prefixOrFullSeekKey = make([]byte, prefixLen)
1514 2 : } else {
1515 2 : i.prefixOrFullSeekKey = i.prefixOrFullSeekKey[:prefixLen]
1516 2 : }
1517 2 : i.hasPrefix = true
1518 2 : copy(i.prefixOrFullSeekKey, keyPrefix)
1519 2 :
1520 2 : if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1521 2 : if p := i.comparer.Split.Prefix(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, p) {
1522 2 : i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of lower bound")
1523 2 : i.iterValidityState = IterExhausted
1524 2 : return false
1525 2 : }
1526 2 : key = lowerBound
1527 2 : } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1528 2 : if p := i.comparer.Split.Prefix(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, p) {
1529 2 : i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of upper bound")
1530 2 : i.iterValidityState = IterExhausted
1531 2 : return false
1532 2 : }
1533 2 : key = upperBound
1534 : }
1535 2 : i.iterKV = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags)
1536 2 : i.stats.ForwardSeekCount[InternalIterCall]++
1537 2 : i.findNextEntry(nil)
1538 2 : i.maybeSampleRead()
1539 2 : if i.Error() == nil {
1540 2 : i.lastPositioningOp = seekPrefixGELastPositioningOp
1541 2 : }
1542 2 : return i.iterValidityState == IterValid
1543 : }
1544 :
1545 : // Deterministic disabling (in testing mode) of the seek optimizations. It uses
1546 : // the iterator pointer, since we want diversity in iterator behavior for the
1547 : // same key. Used for tests.
1548 2 : func testingDisableSeekOpt(key []byte, ptr uintptr) bool {
1549 2 : if !invariants.Enabled {
1550 0 : return false
1551 0 : }
1552 : // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
1553 2 : simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
1554 2 : return key != nil && key[0]&byte(1) == 0 && simpleHash == 0
1555 : }
1556 :
1557 : // SeekLT moves the iterator to the last key/value pair whose key is less than
1558 : // the given key. Returns true if the iterator is pointing at a valid entry and
1559 : // false otherwise.
1560 2 : func (i *Iterator) SeekLT(key []byte) bool {
1561 2 : return i.SeekLTWithLimit(key, nil) == IterValid
1562 2 : }
1563 :
1564 : // SeekLTWithLimit moves the iterator to the last key/value pair whose key is
1565 : // less than the given key.
1566 : //
1567 : // If limit is provided, it serves as a best-effort inclusive limit. If the last
1568 : // key less than the given search key is also less than limit, the Iterator may
1569 : // pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit
1570 : // may return a key beyond limit.
1571 : //
1572 : // If the Iterator is configured to iterate over range keys, SeekLTWithLimit
1573 : // guarantees it will surface any range keys with bounds overlapping the
1574 : // keyspace up to limit.
1575 2 : func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState {
1576 2 : if i.rangeKey != nil {
1577 2 : // NB: Check Valid() before clearing requiresReposition.
1578 2 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1579 2 : // If we have a range key but did not expose it at the previous iterator
1580 2 : // position (because the iterator was not at a valid position), updated
1581 2 : // must be true. This ensures that after an iterator op sequence like:
1582 2 : // - Next() → (IterValid, RangeBounds() = [a,b))
1583 2 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1584 2 : // - SeekLTWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1585 2 : // the iterator returns RangeKeyChanged()=true.
1586 2 : //
1587 2 : // The remainder of this function will only update i.rangeKey.updated if
1588 2 : // the iterator moves into a new range key, or out of the current range
1589 2 : // key.
1590 2 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1591 2 : }
1592 2 : lastPositioningOp := i.lastPositioningOp
1593 2 : // Set it to unknown, since this operation may not succeed, in which case
1594 2 : // the SeekLT following this should not make any assumption about iterator
1595 2 : // position.
1596 2 : i.lastPositioningOp = unknownLastPositionOp
1597 2 : i.batchJustRefreshed = false
1598 2 : i.requiresReposition = false
1599 2 : i.err = nil // clear cached iteration error
1600 2 : i.stats.ReverseSeekCount[InterfaceCall]++
1601 2 : if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1602 2 : key = upperBound
1603 2 : } else if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1604 2 : key = lowerBound
1605 2 : }
1606 2 : i.hasPrefix = false
1607 2 : seekInternalIter := true
1608 2 : // The following noop optimization only applies when i.batch == nil, since
1609 2 : // an iterator over a batch is iterating over mutable data, that may have
1610 2 : // changed since the last seek.
1611 2 : if lastPositioningOp == seekLTLastPositioningOp && i.batch == nil {
1612 2 : cmp := i.cmp(key, i.prefixOrFullSeekKey)
1613 2 : // If this seek is to the same or earlier key, and the iterator is
1614 2 : // already positioned there, this is a noop. This can be helpful for
1615 2 : // sparse key spaces that have many deleted keys, where one can avoid
1616 2 : // the overhead of iterating past them again and again.
1617 2 : if cmp <= 0 {
1618 2 : // NB: when pos != iterPosCurReversePaused, the invariant
1619 2 : // documented earlier implies that iterValidityState !=
1620 2 : // IterAtLimit.
1621 2 : if i.iterValidityState == IterExhausted ||
1622 2 : (i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 &&
1623 2 : (limit == nil || i.cmp(limit, i.key) <= 0)) {
1624 2 : if !testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1625 2 : i.lastPositioningOp = seekLTLastPositioningOp
1626 2 : return i.iterValidityState
1627 2 : }
1628 : }
1629 2 : if i.pos == iterPosCurReversePaused && i.cmp(i.iterKV.K.UserKey, key) < 0 {
1630 2 : // Have some work to do, but don't need to seek, and we can
1631 2 : // start doing findPrevEntry from i.iterKey.
1632 2 : seekInternalIter = false
1633 2 : }
1634 : }
1635 : }
1636 2 : if seekInternalIter {
1637 2 : i.iterKV = i.iter.SeekLT(key, base.SeekLTFlagsNone)
1638 2 : i.stats.ReverseSeekCount[InternalIterCall]++
1639 2 : if err := i.iter.Error(); err != nil {
1640 1 : i.err = err
1641 1 : i.iterValidityState = IterExhausted
1642 1 : return i.iterValidityState
1643 1 : }
1644 : }
1645 2 : i.findPrevEntry(limit)
1646 2 : i.maybeSampleRead()
1647 2 : if i.Error() == nil && i.batch == nil {
1648 2 : // Prepare state for a future noop optimization.
1649 2 : i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
1650 2 : i.lastPositioningOp = seekLTLastPositioningOp
1651 2 : }
1652 2 : return i.iterValidityState
1653 : }
1654 :
1655 : // First moves the iterator the first key/value pair. Returns true if the
1656 : // iterator is pointing at a valid entry and false otherwise.
1657 2 : func (i *Iterator) First() bool {
1658 2 : if i.rangeKey != nil {
1659 2 : // NB: Check Valid() before clearing requiresReposition.
1660 2 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1661 2 : // If we have a range key but did not expose it at the previous iterator
1662 2 : // position (because the iterator was not at a valid position), updated
1663 2 : // must be true. This ensures that after an iterator op sequence like:
1664 2 : // - Next() → (IterValid, RangeBounds() = [a,b))
1665 2 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1666 2 : // - First(...) → (IterValid, RangeBounds() = [a,b))
1667 2 : // the iterator returns RangeKeyChanged()=true.
1668 2 : //
1669 2 : // The remainder of this function will only update i.rangeKey.updated if
1670 2 : // the iterator moves into a new range key, or out of the current range
1671 2 : // key.
1672 2 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1673 2 : }
1674 2 : i.err = nil // clear cached iteration error
1675 2 : i.hasPrefix = false
1676 2 : i.batchJustRefreshed = false
1677 2 : i.lastPositioningOp = unknownLastPositionOp
1678 2 : i.requiresReposition = false
1679 2 : i.stats.ForwardSeekCount[InterfaceCall]++
1680 2 :
1681 2 : i.err = i.iterFirstWithinBounds()
1682 2 : if i.err != nil {
1683 1 : i.iterValidityState = IterExhausted
1684 1 : return false
1685 1 : }
1686 2 : i.findNextEntry(nil)
1687 2 : i.maybeSampleRead()
1688 2 : return i.iterValidityState == IterValid
1689 : }
1690 :
1691 : // Last moves the iterator the last key/value pair. Returns true if the
1692 : // iterator is pointing at a valid entry and false otherwise.
1693 2 : func (i *Iterator) Last() bool {
1694 2 : if i.rangeKey != nil {
1695 2 : // NB: Check Valid() before clearing requiresReposition.
1696 2 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1697 2 : // If we have a range key but did not expose it at the previous iterator
1698 2 : // position (because the iterator was not at a valid position), updated
1699 2 : // must be true. This ensures that after an iterator op sequence like:
1700 2 : // - Next() → (IterValid, RangeBounds() = [a,b))
1701 2 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1702 2 : // - Last(...) → (IterValid, RangeBounds() = [a,b))
1703 2 : // the iterator returns RangeKeyChanged()=true.
1704 2 : //
1705 2 : // The remainder of this function will only update i.rangeKey.updated if
1706 2 : // the iterator moves into a new range key, or out of the current range
1707 2 : // key.
1708 2 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1709 2 : }
1710 2 : i.err = nil // clear cached iteration error
1711 2 : i.hasPrefix = false
1712 2 : i.batchJustRefreshed = false
1713 2 : i.lastPositioningOp = unknownLastPositionOp
1714 2 : i.requiresReposition = false
1715 2 : i.stats.ReverseSeekCount[InterfaceCall]++
1716 2 :
1717 2 : if i.err = i.iterLastWithinBounds(); i.err != nil {
1718 1 : i.iterValidityState = IterExhausted
1719 1 : return false
1720 1 : }
1721 2 : i.findPrevEntry(nil)
1722 2 : i.maybeSampleRead()
1723 2 : return i.iterValidityState == IterValid
1724 : }
1725 :
1726 : // Next moves the iterator to the next key/value pair. Returns true if the
1727 : // iterator is pointing at a valid entry and false otherwise.
1728 2 : func (i *Iterator) Next() bool {
1729 2 : return i.nextWithLimit(nil) == IterValid
1730 2 : }
1731 :
1732 : // NextWithLimit moves the iterator to the next key/value pair.
1733 : //
1734 : // If limit is provided, it serves as a best-effort exclusive limit. If the next
1735 : // key is greater than or equal to limit, the Iterator may pause and return
1736 : // IterAtLimit. Because limits are best-effort, NextWithLimit may return a key
1737 : // beyond limit.
1738 : //
1739 : // If the Iterator is configured to iterate over range keys, NextWithLimit
1740 : // guarantees it will surface any range keys with bounds overlapping the
1741 : // keyspace up to limit.
1742 2 : func (i *Iterator) NextWithLimit(limit []byte) IterValidityState {
1743 2 : return i.nextWithLimit(limit)
1744 2 : }
1745 :
1746 : // NextPrefix moves the iterator to the next key/value pair with a key
1747 : // containing a different prefix than the current key. Prefixes are determined
1748 : // by Comparer.Split. Exhausts the iterator if invoked while in prefix-iteration
1749 : // mode.
1750 : //
1751 : // It is not permitted to invoke NextPrefix while at a IterAtLimit position.
1752 : // When called in this condition, NextPrefix has non-deterministic behavior.
1753 : //
1754 : // It is not permitted to invoke NextPrefix when the Iterator has an
1755 : // upper-bound that is a versioned MVCC key (see the comment for
1756 : // Comparer.Split). It returns an error in this case.
1757 2 : func (i *Iterator) NextPrefix() bool {
1758 2 : if i.nextPrefixNotPermittedByUpperBound {
1759 2 : i.lastPositioningOp = unknownLastPositionOp
1760 2 : i.requiresReposition = false
1761 2 : i.err = errors.Errorf("NextPrefix not permitted with upper bound %s",
1762 2 : i.comparer.FormatKey(i.opts.UpperBound))
1763 2 : i.iterValidityState = IterExhausted
1764 2 : return false
1765 2 : }
1766 2 : if i.hasPrefix {
1767 2 : i.iterValidityState = IterExhausted
1768 2 : return false
1769 2 : }
1770 2 : if i.Error() != nil {
1771 2 : return false
1772 2 : }
1773 2 : return i.nextPrefix() == IterValid
1774 : }
1775 :
1776 2 : func (i *Iterator) nextPrefix() IterValidityState {
1777 2 : if i.rangeKey != nil {
1778 2 : // NB: Check Valid() before clearing requiresReposition.
1779 2 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1780 2 : // If we have a range key but did not expose it at the previous iterator
1781 2 : // position (because the iterator was not at a valid position), updated
1782 2 : // must be true. This ensures that after an iterator op sequence like:
1783 2 : // - Next() → (IterValid, RangeBounds() = [a,b))
1784 2 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1785 2 : // - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1786 2 : // the iterator returns RangeKeyChanged()=true.
1787 2 : //
1788 2 : // The remainder of this function will only update i.rangeKey.updated if
1789 2 : // the iterator moves into a new range key, or out of the current range
1790 2 : // key.
1791 2 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1792 2 : }
1793 :
1794 : // Although NextPrefix documents that behavior at IterAtLimit is undefined,
1795 : // this function handles these cases as a simple prefix-agnostic Next. This
1796 : // is done for deterministic behavior in the metamorphic tests.
1797 : //
1798 : // TODO(jackson): If the metamorphic test operation generator is adjusted to
1799 : // make generation of some operations conditional on the previous
1800 : // operations, then we can remove this behavior and explicitly error.
1801 :
1802 2 : i.lastPositioningOp = unknownLastPositionOp
1803 2 : i.requiresReposition = false
1804 2 : switch i.pos {
1805 2 : case iterPosCurForward:
1806 2 : // Positioned on the current key. Advance to the next prefix.
1807 2 : i.internalNextPrefix(i.comparer.Split(i.key))
1808 1 : case iterPosCurForwardPaused:
1809 : // Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
1810 : // up above. The iterator is already positioned at the next key.
1811 2 : case iterPosCurReverse:
1812 2 : // Switching directions.
1813 2 : // Unless the iterator was exhausted, reverse iteration needs to
1814 2 : // position the iterator at iterPosPrev.
1815 2 : if i.iterKV != nil {
1816 0 : i.err = errors.New("switching from reverse to forward but iter is not at prev")
1817 0 : i.iterValidityState = IterExhausted
1818 0 : return i.iterValidityState
1819 0 : }
1820 : // The Iterator is exhausted and i.iter is positioned before the first
1821 : // key. Reposition to point to the first internal key.
1822 2 : if i.err = i.iterFirstWithinBounds(); i.err != nil {
1823 0 : i.iterValidityState = IterExhausted
1824 0 : return i.iterValidityState
1825 0 : }
1826 2 : case iterPosCurReversePaused:
1827 2 : // Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
1828 2 : // up above.
1829 2 : //
1830 2 : // Switching directions; The iterator must not be exhausted since it
1831 2 : // paused.
1832 2 : if i.iterKV == nil {
1833 0 : i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
1834 0 : i.iterValidityState = IterExhausted
1835 0 : return i.iterValidityState
1836 0 : }
1837 2 : i.nextUserKey()
1838 2 : case iterPosPrev:
1839 2 : // The underlying iterator is pointed to the previous key (this can
1840 2 : // only happen when switching iteration directions).
1841 2 : if i.iterKV == nil {
1842 2 : // We're positioned before the first key. Need to reposition to point to
1843 2 : // the first key.
1844 2 : i.err = i.iterFirstWithinBounds()
1845 2 : if i.iterKV == nil {
1846 0 : i.iterValidityState = IterExhausted
1847 0 : return i.iterValidityState
1848 0 : }
1849 2 : if invariants.Enabled && !i.equal(i.iterKV.K.UserKey, i.key) {
1850 0 : i.opts.getLogger().Fatalf("pebble: invariant violation: First internal iterator from iterPosPrev landed on %q, not %q",
1851 0 : i.iterKV.K.UserKey, i.key)
1852 0 : }
1853 2 : } else {
1854 2 : // Move the internal iterator back onto the user key stored in
1855 2 : // i.key. iterPosPrev guarantees that it's positioned at the last
1856 2 : // key with the user key less than i.key, so we're guaranteed to
1857 2 : // land on the correct key with a single Next.
1858 2 : i.iterKV = i.iter.Next()
1859 2 : if i.iterKV == nil {
1860 1 : // This should only be possible if i.iter.Next() encountered an
1861 1 : // error.
1862 1 : if i.iter.Error() == nil {
1863 0 : i.opts.getLogger().Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev found nothing")
1864 0 : }
1865 : // NB: Iterator.Error() will return i.iter.Error().
1866 1 : i.iterValidityState = IterExhausted
1867 1 : return i.iterValidityState
1868 : }
1869 2 : if invariants.Enabled && !i.equal(i.iterKV.K.UserKey, i.key) {
1870 0 : i.opts.getLogger().Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q",
1871 0 : i.iterKV.K.UserKey, i.key)
1872 0 : }
1873 : }
1874 : // The internal iterator is now positioned at i.key. Advance to the next
1875 : // prefix.
1876 2 : i.internalNextPrefix(i.comparer.Split(i.key))
1877 2 : case iterPosNext:
1878 2 : // Already positioned on the next key. Only call nextPrefixKey if the
1879 2 : // next key shares the same prefix.
1880 2 : if i.iterKV != nil {
1881 2 : currKeyPrefixLen := i.comparer.Split(i.key)
1882 2 : if bytes.Equal(i.comparer.Split.Prefix(i.iterKV.K.UserKey), i.key[:currKeyPrefixLen]) {
1883 2 : i.internalNextPrefix(currKeyPrefixLen)
1884 2 : }
1885 : }
1886 : }
1887 :
1888 2 : i.stats.ForwardStepCount[InterfaceCall]++
1889 2 : i.findNextEntry(nil /* limit */)
1890 2 : i.maybeSampleRead()
1891 2 : return i.iterValidityState
1892 : }
1893 :
1894 2 : func (i *Iterator) internalNextPrefix(currKeyPrefixLen int) {
1895 2 : if i.iterKV == nil {
1896 2 : return
1897 2 : }
1898 : // The Next "fast-path" is not really a fast-path when there is more than
1899 : // one version. However, even with TableFormatPebblev3, there is a small
1900 : // slowdown (~10%) for one version if we remove it and only call NextPrefix.
1901 : // When there are two versions, only calling NextPrefix is ~30% faster.
1902 2 : i.stats.ForwardStepCount[InternalIterCall]++
1903 2 : if i.iterKV = i.iter.Next(); i.iterKV == nil {
1904 2 : return
1905 2 : }
1906 2 : if !bytes.Equal(i.comparer.Split.Prefix(i.iterKV.K.UserKey), i.key[:currKeyPrefixLen]) {
1907 2 : return
1908 2 : }
1909 2 : i.stats.ForwardStepCount[InternalIterCall]++
1910 2 : i.prefixOrFullSeekKey = i.comparer.ImmediateSuccessor(i.prefixOrFullSeekKey[:0], i.key[:currKeyPrefixLen])
1911 2 : if i.iterKV.K.IsExclusiveSentinel() {
1912 0 : panic(errors.AssertionFailedf("pebble: unexpected exclusive sentinel key: %q", i.iterKV.K))
1913 : }
1914 :
1915 2 : i.iterKV = i.iter.NextPrefix(i.prefixOrFullSeekKey)
1916 2 : if invariants.Enabled && i.iterKV != nil {
1917 2 : if p := i.comparer.Split.Prefix(i.iterKV.K.UserKey); i.cmp(p, i.prefixOrFullSeekKey) < 0 {
1918 0 : panic(errors.AssertionFailedf("pebble: iter.NextPrefix did not advance beyond the current prefix: now at %q; expected to be geq %q",
1919 0 : i.iterKV.K, i.prefixOrFullSeekKey))
1920 : }
1921 : }
1922 : }
1923 :
1924 2 : func (i *Iterator) nextWithLimit(limit []byte) IterValidityState {
1925 2 : i.stats.ForwardStepCount[InterfaceCall]++
1926 2 : if i.hasPrefix {
1927 2 : if limit != nil {
1928 2 : i.err = errors.New("cannot use limit with prefix iteration")
1929 2 : i.iterValidityState = IterExhausted
1930 2 : return i.iterValidityState
1931 2 : } else if i.iterValidityState == IterExhausted {
1932 2 : // No-op, already exhausted. We avoid executing the Next because it
1933 2 : // can break invariants: Specifically, a file that fails the bloom
1934 2 : // filter test may result in its level being removed from the
1935 2 : // merging iterator. The level's removal can cause a lazy combined
1936 2 : // iterator to miss range keys and trigger a switch to combined
1937 2 : // iteration at a larger key, breaking keyspan invariants.
1938 2 : return i.iterValidityState
1939 2 : }
1940 : }
1941 2 : if i.err != nil {
1942 2 : return i.iterValidityState
1943 2 : }
1944 2 : if i.rangeKey != nil {
1945 2 : // NB: Check Valid() before clearing requiresReposition.
1946 2 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1947 2 : // If we have a range key but did not expose it at the previous iterator
1948 2 : // position (because the iterator was not at a valid position), updated
1949 2 : // must be true. This ensures that after an iterator op sequence like:
1950 2 : // - Next() → (IterValid, RangeBounds() = [a,b))
1951 2 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1952 2 : // - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1953 2 : // the iterator returns RangeKeyChanged()=true.
1954 2 : //
1955 2 : // The remainder of this function will only update i.rangeKey.updated if
1956 2 : // the iterator moves into a new range key, or out of the current range
1957 2 : // key.
1958 2 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1959 2 : }
1960 2 : i.lastPositioningOp = unknownLastPositionOp
1961 2 : i.requiresReposition = false
1962 2 : switch i.pos {
1963 2 : case iterPosCurForward:
1964 2 : i.nextUserKey()
1965 2 : case iterPosCurForwardPaused:
1966 : // Already at the right place.
1967 2 : case iterPosCurReverse:
1968 2 : // Switching directions.
1969 2 : // Unless the iterator was exhausted, reverse iteration needs to
1970 2 : // position the iterator at iterPosPrev.
1971 2 : if i.iterKV != nil {
1972 0 : i.err = errors.New("switching from reverse to forward but iter is not at prev")
1973 0 : i.iterValidityState = IterExhausted
1974 0 : return i.iterValidityState
1975 0 : }
1976 : // We're positioned before the first key. Need to reposition to point to
1977 : // the first key.
1978 2 : if i.err = i.iterFirstWithinBounds(); i.err != nil {
1979 0 : i.iterValidityState = IterExhausted
1980 0 : return i.iterValidityState
1981 0 : }
1982 2 : case iterPosCurReversePaused:
1983 2 : // Switching directions.
1984 2 : // The iterator must not be exhausted since it paused.
1985 2 : if i.iterKV == nil {
1986 0 : i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
1987 0 : i.iterValidityState = IterExhausted
1988 0 : return i.iterValidityState
1989 0 : }
1990 2 : i.nextUserKey()
1991 2 : case iterPosPrev:
1992 2 : // The underlying iterator is pointed to the previous key (this can
1993 2 : // only happen when switching iteration directions). We set
1994 2 : // i.iterValidityState to IterExhausted here to force the calls to
1995 2 : // nextUserKey to save the current key i.iter is pointing at in order
1996 2 : // to determine when the next user-key is reached.
1997 2 : i.iterValidityState = IterExhausted
1998 2 : if i.iterKV == nil {
1999 2 : // We're positioned before the first key. Need to reposition to point to
2000 2 : // the first key.
2001 2 : i.err = i.iterFirstWithinBounds()
2002 2 : } else {
2003 2 : i.nextUserKey()
2004 2 : }
2005 2 : if i.err != nil {
2006 1 : i.iterValidityState = IterExhausted
2007 1 : return i.iterValidityState
2008 1 : }
2009 2 : i.nextUserKey()
2010 2 : case iterPosNext:
2011 : // Already at the right place.
2012 : }
2013 2 : i.findNextEntry(limit)
2014 2 : i.maybeSampleRead()
2015 2 : return i.iterValidityState
2016 : }
2017 :
2018 : // Prev moves the iterator to the previous key/value pair. Returns true if the
2019 : // iterator is pointing at a valid entry and false otherwise.
2020 2 : func (i *Iterator) Prev() bool {
2021 2 : return i.PrevWithLimit(nil) == IterValid
2022 2 : }
2023 :
2024 : // PrevWithLimit moves the iterator to the previous key/value pair.
2025 : //
2026 : // If limit is provided, it serves as a best-effort inclusive limit. If the
2027 : // previous key is less than limit, the Iterator may pause and return
2028 : // IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key
2029 : // beyond limit.
2030 : //
2031 : // If the Iterator is configured to iterate over range keys, PrevWithLimit
2032 : // guarantees it will surface any range keys with bounds overlapping the
2033 : // keyspace up to limit.
2034 2 : func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState {
2035 2 : i.stats.ReverseStepCount[InterfaceCall]++
2036 2 : if i.err != nil {
2037 2 : return i.iterValidityState
2038 2 : }
2039 2 : if i.rangeKey != nil {
2040 2 : // NB: Check Valid() before clearing requiresReposition.
2041 2 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
2042 2 : // If we have a range key but did not expose it at the previous iterator
2043 2 : // position (because the iterator was not at a valid position), updated
2044 2 : // must be true. This ensures that after an iterator op sequence like:
2045 2 : // - Next() → (IterValid, RangeBounds() = [a,b))
2046 2 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
2047 2 : // - PrevWithLimit(...) → (IterValid, RangeBounds() = [a,b))
2048 2 : // the iterator returns RangeKeyChanged()=true.
2049 2 : //
2050 2 : // The remainder of this function will only update i.rangeKey.updated if
2051 2 : // the iterator moves into a new range key, or out of the current range
2052 2 : // key.
2053 2 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
2054 2 : }
2055 2 : i.lastPositioningOp = unknownLastPositionOp
2056 2 : i.requiresReposition = false
2057 2 : if i.hasPrefix {
2058 2 : i.err = errReversePrefixIteration
2059 2 : i.iterValidityState = IterExhausted
2060 2 : return i.iterValidityState
2061 2 : }
2062 2 : switch i.pos {
2063 2 : case iterPosCurForward:
2064 : // Switching directions, and will handle this below.
2065 2 : case iterPosCurForwardPaused:
2066 : // Switching directions, and will handle this below.
2067 2 : case iterPosCurReverse:
2068 2 : i.prevUserKey()
2069 2 : case iterPosCurReversePaused:
2070 : // Already at the right place.
2071 2 : case iterPosNext:
2072 : // The underlying iterator is pointed to the next key (this can only happen
2073 : // when switching iteration directions). We will handle this below.
2074 2 : case iterPosPrev:
2075 : // Already at the right place.
2076 : }
2077 2 : if i.pos == iterPosCurForward || i.pos == iterPosNext || i.pos == iterPosCurForwardPaused {
2078 2 : // Switching direction.
2079 2 : stepAgain := i.pos == iterPosNext
2080 2 :
2081 2 : // Synthetic range key markers are a special case. Consider SeekGE(b)
2082 2 : // which finds a range key [a, c). To ensure the user observes the range
2083 2 : // key, the Iterator pauses at Key() = b. The iterator must advance the
2084 2 : // internal iterator to see if there's also a coincident point key at
2085 2 : // 'b', leaving the iterator at iterPosNext if there's not.
2086 2 : //
2087 2 : // This is a problem: Synthetic range key markers are only interleaved
2088 2 : // during the original seek. A subsequent Prev() of i.iter will not move
2089 2 : // back onto the synthetic range key marker. In this case where the
2090 2 : // previous iterator position was a synthetic range key start boundary,
2091 2 : // we must not step a second time.
2092 2 : if i.isEphemeralPosition() {
2093 2 : stepAgain = false
2094 2 : }
2095 :
2096 : // We set i.iterValidityState to IterExhausted here to force the calls
2097 : // to prevUserKey to save the current key i.iter is pointing at in
2098 : // order to determine when the prev user-key is reached.
2099 2 : i.iterValidityState = IterExhausted
2100 2 : if i.iterKV == nil {
2101 2 : // We're positioned after the last key. Need to reposition to point to
2102 2 : // the last key.
2103 2 : i.err = i.iterLastWithinBounds()
2104 2 : } else {
2105 2 : i.prevUserKey()
2106 2 : }
2107 2 : if i.err != nil {
2108 1 : return i.iterValidityState
2109 1 : }
2110 2 : if stepAgain {
2111 2 : i.prevUserKey()
2112 2 : if i.err != nil {
2113 0 : return i.iterValidityState
2114 0 : }
2115 : }
2116 : }
2117 2 : i.findPrevEntry(limit)
2118 2 : i.maybeSampleRead()
2119 2 : return i.iterValidityState
2120 : }
2121 :
2122 : // iterFirstWithinBounds moves the internal iterator to the first key,
2123 : // respecting bounds.
2124 2 : func (i *Iterator) iterFirstWithinBounds() error {
2125 2 : i.stats.ForwardSeekCount[InternalIterCall]++
2126 2 : if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
2127 2 : i.iterKV = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
2128 2 : } else {
2129 2 : i.iterKV = i.iter.First()
2130 2 : }
2131 2 : if i.iterKV == nil {
2132 2 : return i.iter.Error()
2133 2 : }
2134 2 : return nil
2135 : }
2136 :
2137 : // iterLastWithinBounds moves the internal iterator to the last key, respecting
2138 : // bounds.
2139 2 : func (i *Iterator) iterLastWithinBounds() error {
2140 2 : i.stats.ReverseSeekCount[InternalIterCall]++
2141 2 : if upperBound := i.opts.GetUpperBound(); upperBound != nil {
2142 2 : i.iterKV = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone)
2143 2 : } else {
2144 2 : i.iterKV = i.iter.Last()
2145 2 : }
2146 2 : if i.iterKV == nil {
2147 2 : return i.iter.Error()
2148 2 : }
2149 2 : return nil
2150 : }
2151 :
2152 : // RangeKeyData describes a range key's data, set through RangeKeySet. The key
2153 : // boundaries of the range key is provided by Iterator.RangeBounds.
2154 : type RangeKeyData struct {
2155 : Suffix []byte
2156 : Value []byte
2157 : }
2158 :
2159 : // rangeKeyWithinLimit is called during limited reverse iteration when
2160 : // positioned over a key beyond the limit. If there exists a range key that lies
2161 : // within the limit, the iterator must not pause in order to ensure the user has
2162 : // an opportunity to observe the range key within limit.
2163 : //
2164 : // It would be valid to ignore the limit whenever there's a range key covering
2165 : // the key, but that would introduce nondeterminism. To preserve determinism for
2166 : // testing, the iterator ignores the limit only if the covering range key does
2167 : // cover the keyspace within the limit.
2168 : //
2169 : // This awkwardness exists because range keys are interleaved at their inclusive
2170 : // start positions. Note that limit is inclusive.
2171 2 : func (i *Iterator) rangeKeyWithinLimit(limit []byte) bool {
2172 2 : if i.rangeKey == nil || !i.opts.rangeKeys() {
2173 2 : return false
2174 2 : }
2175 2 : s := i.rangeKey.iiter.Span()
2176 2 : // If the range key ends beyond the limit, then the range key does not cover
2177 2 : // any portion of the keyspace within the limit and it is safe to pause.
2178 2 : return s != nil && i.cmp(s.End, limit) > 0
2179 : }
2180 :
2181 : // saveRangeKey saves the current range key to the underlying iterator's current
2182 : // range key state. If the range key has not changed, saveRangeKey is a no-op.
2183 : // If there is a new range key, saveRangeKey copies all of the key, value and
2184 : // suffixes into Iterator-managed buffers.
2185 2 : func (i *Iterator) saveRangeKey() {
2186 2 : if i.rangeKey == nil || i.opts.KeyTypes == IterKeyTypePointsOnly {
2187 2 : return
2188 2 : }
2189 :
2190 2 : s := i.rangeKey.iiter.Span()
2191 2 : if s == nil {
2192 2 : i.rangeKey.hasRangeKey = false
2193 2 : i.rangeKey.updated = i.rangeKey.prevPosHadRangeKey
2194 2 : return
2195 2 : } else if !i.rangeKey.stale {
2196 2 : // The range key `s` is identical to the one currently saved. No-op.
2197 2 : return
2198 2 : }
2199 :
2200 2 : if s.KeysOrder != keyspan.BySuffixAsc {
2201 0 : panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
2202 : }
2203 :
2204 : // Although `i.rangeKey.stale` is true, the span s may still be identical
2205 : // to the currently saved span. This is possible when seeking the iterator,
2206 : // which may land back on the same range key. If we previously had a range
2207 : // key and the new one has an identical start key, then it must be the same
2208 : // range key and we can avoid copying and keep `i.rangeKey.updated=false`.
2209 : //
2210 : // TODO(jackson): These key comparisons could be avoidable during relative
2211 : // positioning operations continuing in the same direction, because these
2212 : // ops will never encounter the previous position's range key while
2213 : // stale=true. However, threading whether the current op is a seek or step
2214 : // maybe isn't worth it. This key comparison is only necessary once when we
2215 : // step onto a new range key, which should be relatively rare.
2216 2 : if i.rangeKey.prevPosHadRangeKey && i.equal(i.rangeKey.start, s.Start) &&
2217 2 : i.equal(i.rangeKey.end, s.End) {
2218 2 : i.rangeKey.updated = false
2219 2 : i.rangeKey.stale = false
2220 2 : i.rangeKey.hasRangeKey = true
2221 2 : return
2222 2 : }
2223 2 : i.stats.RangeKeyStats.Count += len(s.Keys)
2224 2 : i.rangeKey.buf.Reset()
2225 2 : i.rangeKey.hasRangeKey = true
2226 2 : i.rangeKey.updated = true
2227 2 : i.rangeKey.stale = false
2228 2 : i.rangeKey.buf, i.rangeKey.start = i.rangeKey.buf.Copy(s.Start)
2229 2 : i.rangeKey.buf, i.rangeKey.end = i.rangeKey.buf.Copy(s.End)
2230 2 : i.rangeKey.keys = i.rangeKey.keys[:0]
2231 2 : for j := 0; j < len(s.Keys); j++ {
2232 2 : if invariants.Enabled {
2233 2 : if s.Keys[j].Kind() != base.InternalKeyKindRangeKeySet {
2234 0 : panic("pebble: user iteration encountered non-RangeKeySet key kind")
2235 2 : } else if j > 0 && i.comparer.CompareRangeSuffixes(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 {
2236 0 : panic("pebble: user iteration encountered range keys not in suffix order")
2237 : }
2238 : }
2239 2 : var rkd RangeKeyData
2240 2 : i.rangeKey.buf, rkd.Suffix = i.rangeKey.buf.Copy(s.Keys[j].Suffix)
2241 2 : i.rangeKey.buf, rkd.Value = i.rangeKey.buf.Copy(s.Keys[j].Value)
2242 2 : i.rangeKey.keys = append(i.rangeKey.keys, rkd)
2243 : }
2244 : }
2245 :
2246 : // RangeKeyChanged indicates whether the most recent iterator positioning
2247 : // operation resulted in the iterator stepping into or out of a new range key.
2248 : // If true, previously returned range key bounds and data has been invalidated.
2249 : // If false, previously obtained range key bounds, suffix and value slices are
2250 : // still valid and may continue to be read.
2251 : //
2252 : // Invalid iterator positions are considered to not hold range keys, meaning
2253 : // that if an iterator steps from an IterExhausted or IterAtLimit position onto
2254 : // a position with a range key, RangeKeyChanged will yield true.
2255 2 : func (i *Iterator) RangeKeyChanged() bool {
2256 2 : return i.iterValidityState == IterValid && i.rangeKey != nil && i.rangeKey.updated
2257 2 : }
2258 :
2259 : // HasPointAndRange indicates whether there exists a point key, a range key or
2260 : // both at the current iterator position.
2261 2 : func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool) {
2262 2 : if i.iterValidityState != IterValid || i.requiresReposition {
2263 1 : return false, false
2264 1 : }
2265 2 : if i.opts.KeyTypes == IterKeyTypePointsOnly {
2266 2 : return true, false
2267 2 : }
2268 2 : return i.rangeKey == nil || !i.rangeKey.rangeKeyOnly, i.rangeKey != nil && i.rangeKey.hasRangeKey
2269 : }
2270 :
2271 : // RangeBounds returns the start (inclusive) and end (exclusive) bounds of the
2272 : // range key covering the current iterator position. RangeBounds returns nil
2273 : // bounds if there is no range key covering the current iterator position, or
2274 : // the iterator is not configured to surface range keys.
2275 : //
2276 : // If valid, the returned start bound is less than or equal to Key() and the
2277 : // returned end bound is greater than Key().
2278 2 : func (i *Iterator) RangeBounds() (start, end []byte) {
2279 2 : if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
2280 0 : return nil, nil
2281 0 : }
2282 2 : return i.rangeKey.start, i.rangeKey.end
2283 : }
2284 :
2285 : // Key returns the key of the current key/value pair, or nil if done. The
2286 : // caller should not modify the contents of the returned slice, and its
2287 : // contents may change on the next call to Next.
2288 : //
2289 : // If positioned at an iterator position that only holds a range key, Key()
2290 : // always returns the start bound of the range key. Otherwise, it returns the
2291 : // point key's key.
2292 2 : func (i *Iterator) Key() []byte {
2293 2 : return i.key
2294 2 : }
2295 :
2296 : // Value returns the value of the current key/value pair, or nil if done. The
2297 : // caller should not modify the contents of the returned slice, and its
2298 : // contents may change on the next call to Next.
2299 : //
2300 : // Only valid if HasPointAndRange() returns true for hasPoint.
2301 : // Deprecated: use ValueAndErr instead.
2302 2 : func (i *Iterator) Value() []byte {
2303 2 : val, _ := i.ValueAndErr()
2304 2 : return val
2305 2 : }
2306 :
2307 : // ValueAndErr returns the value, and any error encountered in extracting the value.
2308 : // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
2309 : //
2310 : // The caller should not modify the contents of the returned slice, and its
2311 : // contents may change on the next call to Next.
2312 2 : func (i *Iterator) ValueAndErr() ([]byte, error) {
2313 2 : val, callerOwned, err := i.value.Value(i.lazyValueBuf)
2314 2 : if err != nil {
2315 1 : i.err = err
2316 1 : i.iterValidityState = IterExhausted
2317 1 : }
2318 2 : if callerOwned {
2319 2 : i.lazyValueBuf = val[:0]
2320 2 : }
2321 2 : return val, err
2322 : }
2323 :
2324 : // LazyValue returns the LazyValue. Only for advanced use cases.
2325 : // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
2326 0 : func (i *Iterator) LazyValue() LazyValue {
2327 0 : return i.value.LazyValue()
2328 0 : }
2329 :
2330 : // RangeKeys returns the range key values and their suffixes covering the
2331 : // current iterator position. The range bounds may be retrieved separately
2332 : // through Iterator.RangeBounds().
2333 2 : func (i *Iterator) RangeKeys() []RangeKeyData {
2334 2 : if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
2335 0 : return nil
2336 0 : }
2337 2 : return i.rangeKey.keys
2338 : }
2339 :
2340 : // Valid returns true if the iterator is positioned at a valid key/value pair
2341 : // and false otherwise.
2342 2 : func (i *Iterator) Valid() bool {
2343 2 : valid := i.iterValidityState == IterValid && !i.requiresReposition
2344 2 : if invariants.Enabled {
2345 2 : if err := i.Error(); valid && err != nil {
2346 0 : panic(errors.AssertionFailedf("pebble: iterator is valid with non-nil Error: %+v", err))
2347 : }
2348 : }
2349 2 : return valid
2350 : }
2351 :
2352 : // Error returns any accumulated error.
2353 2 : func (i *Iterator) Error() error {
2354 2 : if i.err != nil {
2355 2 : return i.err
2356 2 : }
2357 2 : if i.iter != nil {
2358 2 : return i.iter.Error()
2359 2 : }
2360 0 : return nil
2361 : }
2362 :
2363 : const maxKeyBufCacheSize = 4 << 10 // 4 KB
2364 :
2365 : // Close closes the iterator and returns any accumulated error. Exhausting
2366 : // all the key/value pairs in a table is not considered to be an error.
2367 : // It is not valid to call any method, including Close, after the iterator
2368 : // has been closed.
2369 2 : func (i *Iterator) Close() error {
2370 2 : if i.trackerHandle.IsValid() {
2371 2 : i.tracker.Stop(i.trackerHandle)
2372 2 : i.trackerHandle = 0
2373 2 : }
2374 : // Close the child iterator before releasing the readState because when the
2375 : // readState is released sstables referenced by the readState may be deleted
2376 : // which will fail on Windows if the sstables are still open by the child
2377 : // iterator.
2378 2 : if i.iter != nil {
2379 2 : i.err = firstError(i.err, i.iter.Close())
2380 2 :
2381 2 : // Closing i.iter did not necessarily close the point and range key
2382 2 : // iterators. Calls to SetOptions may have 'disconnected' either one
2383 2 : // from i.iter if iteration key types were changed. Both point and range
2384 2 : // key iterators are preserved in case the iterator needs to switch key
2385 2 : // types again. We explicitly close both of these iterators here.
2386 2 : //
2387 2 : // NB: If the iterators were still connected to i.iter, they may be
2388 2 : // closed, but calling Close on a closed internal iterator or fragment
2389 2 : // iterator is allowed.
2390 2 : if i.pointIter != nil {
2391 2 : i.err = firstError(i.err, i.pointIter.Close())
2392 2 : }
2393 2 : if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil {
2394 2 : i.rangeKey.rangeKeyIter.Close()
2395 2 : }
2396 2 : i.err = firstError(i.err, i.blobValueFetcher.Close())
2397 : }
2398 2 : err := i.err
2399 2 :
2400 2 : if i.readState != nil {
2401 2 : if i.readSampling.pendingCompactions.size > 0 {
2402 1 : // Copy pending read compactions using db.mu.Lock()
2403 1 : i.readState.db.mu.Lock()
2404 1 : i.readState.db.mu.compact.readCompactions.combine(&i.readSampling.pendingCompactions, i.cmp)
2405 1 : reschedule := i.readState.db.mu.compact.rescheduleReadCompaction
2406 1 : i.readState.db.mu.compact.rescheduleReadCompaction = false
2407 1 : concurrentCompactions := i.readState.db.mu.compact.compactingCount
2408 1 : i.readState.db.mu.Unlock()
2409 1 :
2410 1 : if reschedule && concurrentCompactions == 0 {
2411 0 : // In a read heavy workload, flushes may not happen frequently enough to
2412 0 : // schedule compactions.
2413 0 : i.readState.db.compactionSchedulers.Add(1)
2414 0 : go i.readState.db.maybeScheduleCompactionAsync()
2415 0 : }
2416 : }
2417 :
2418 2 : i.readState.unref()
2419 2 : i.readState = nil
2420 : }
2421 :
2422 2 : if i.version != nil {
2423 2 : i.version.Unref()
2424 2 : }
2425 2 : if i.externalIter != nil {
2426 1 : err = firstError(err, i.externalIter.Close())
2427 1 : }
2428 :
2429 : // Close the closer for the current value if one was open.
2430 2 : if i.valueCloser != nil {
2431 1 : err = firstError(err, i.valueCloser.Close())
2432 1 : i.valueCloser = nil
2433 1 : }
2434 :
2435 2 : if i.rangeKey != nil {
2436 2 : i.rangeKey.rangeKeyBuffers.PrepareForReuse()
2437 2 : *i.rangeKey = iteratorRangeKeyState{
2438 2 : rangeKeyBuffers: i.rangeKey.rangeKeyBuffers,
2439 2 : }
2440 2 : iterRangeKeyStateAllocPool.Put(i.rangeKey)
2441 2 : i.rangeKey = nil
2442 2 : }
2443 :
2444 2 : alloc := i.alloc
2445 2 : if alloc == nil {
2446 2 : // NB: When the Iterator is used as a part of a Get(), Close() is called by
2447 2 : // getIterAlloc.Close which handles recycling the appropriate structure and
2448 2 : // fields.
2449 2 : return err
2450 2 : }
2451 :
2452 : // Reset the alloc struct, retaining any buffers within their maximum
2453 : // bounds, and return the iterAlloc struct to the pool.
2454 :
2455 : // Avoid caching the key buf if it is overly large. The constant is fairly
2456 : // arbitrary.
2457 2 : if cap(i.keyBuf) < maxKeyBufCacheSize {
2458 2 : alloc.keyBuf = i.keyBuf
2459 2 : }
2460 2 : if cap(i.prefixOrFullSeekKey) < maxKeyBufCacheSize {
2461 2 : alloc.prefixOrFullSeekKey = i.prefixOrFullSeekKey
2462 2 : }
2463 2 : for j := range i.boundsBuf {
2464 2 : if cap(i.boundsBuf[j]) < maxKeyBufCacheSize {
2465 2 : alloc.boundsBuf[j] = i.boundsBuf[j]
2466 2 : }
2467 : }
2468 2 : mergingIterHeapItems := alloc.merging.heap.items
2469 2 :
2470 2 : // We zero each field piecemeal in part to avoid the use of a stack
2471 2 : // allocated autotmp iterAlloc variable, and in part to make it easier
2472 2 : // to make zeroing conditional on whether the field was actually used.
2473 2 : //
2474 2 : // TODO(jackson,nvanbenschoten): Iterator.Close continues to appear in
2475 2 : // profiles in part because of this zeroing.
2476 2 : if i.batch != nil {
2477 2 : alloc.batchState = iteratorBatchState{}
2478 2 : }
2479 2 : alloc.dbi.clearForReuse()
2480 2 : alloc.merging = mergingIter{}
2481 2 : clear(mergingIterHeapItems[:cap(mergingIterHeapItems)])
2482 2 : alloc.merging.heap.items = mergingIterHeapItems
2483 2 : clear(alloc.mlevels[:])
2484 2 : clear(alloc.levels[:])
2485 2 : clear(alloc.levelsPositioned[:])
2486 2 :
2487 2 : iterAllocPool.Put(alloc)
2488 2 : return err
2489 : }
2490 :
2491 : // SetBounds sets the lower and upper bounds for the iterator. Once SetBounds
2492 : // returns, the caller is free to mutate the provided slices.
2493 : //
2494 : // The iterator will always be invalidated and must be repositioned with a call
2495 : // to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
2496 2 : func (i *Iterator) SetBounds(lower, upper []byte) {
2497 2 : // Ensure that the Iterator appears exhausted, regardless of whether we
2498 2 : // actually have to invalidate the internal iterator. Optimizations that
2499 2 : // avoid exhaustion are an internal implementation detail that shouldn't
2500 2 : // leak through the interface. The caller should still call an absolute
2501 2 : // positioning method to reposition the iterator.
2502 2 : i.requiresReposition = true
2503 2 :
2504 2 : if ((i.opts.LowerBound == nil) == (lower == nil)) &&
2505 2 : ((i.opts.UpperBound == nil) == (upper == nil)) &&
2506 2 : i.equal(i.opts.LowerBound, lower) &&
2507 2 : i.equal(i.opts.UpperBound, upper) {
2508 2 : // Unchanged, noop.
2509 2 : return
2510 2 : }
2511 :
2512 : // Copy the user-provided bounds into an Iterator-owned buffer, and set them
2513 : // on i.opts.{Lower,Upper}Bound.
2514 2 : i.processBounds(lower, upper)
2515 2 :
2516 2 : i.iter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2517 2 : // If the iterator has an open point iterator that's not currently being
2518 2 : // used, propagate the new bounds to it.
2519 2 : if i.pointIter != nil && !i.opts.pointKeys() {
2520 2 : i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2521 2 : }
2522 : // If the iterator has a range key iterator, propagate bounds to it. The
2523 : // top-level SetBounds on the interleaving iterator (i.iter) won't propagate
2524 : // bounds to the range key iterator stack, because the FragmentIterator
2525 : // interface doesn't define a SetBounds method. We need to directly inform
2526 : // the iterConfig stack.
2527 2 : if i.rangeKey != nil {
2528 2 : i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2529 2 : }
2530 :
2531 : // Even though this is not a positioning operation, the alteration of the
2532 : // bounds means we cannot optimize Seeks by using Next.
2533 2 : i.invalidate()
2534 : }
2535 :
2536 : // SetContext replaces the context provided at iterator creation, or the last
2537 : // one provided by SetContext. Even though iterators are expected to be
2538 : // short-lived, there are some cases where either (a) iterators are used far
2539 : // from the code that created them, (b) iterators are reused (while being
2540 : // short-lived) for processing different requests. For such scenarios, we
2541 : // allow the caller to replace the context.
2542 0 : func (i *Iterator) SetContext(ctx context.Context) {
2543 0 : i.ctx = ctx
2544 0 : i.iter.SetContext(ctx)
2545 0 : // If the iterator has an open point iterator that's not currently being
2546 0 : // used, propagate the new context to it.
2547 0 : if i.pointIter != nil && !i.opts.pointKeys() {
2548 0 : i.pointIter.SetContext(i.ctx)
2549 0 : }
2550 : }
2551 :
2552 : // Initialization and changing of the bounds must call processBounds.
2553 : // processBounds saves the bounds and computes derived state from those
2554 : // bounds.
2555 2 : func (i *Iterator) processBounds(lower, upper []byte) {
2556 2 : // Copy the user-provided bounds into an Iterator-owned buffer. We can't
2557 2 : // overwrite the current bounds, because some internal iterators compare old
2558 2 : // and new bounds for optimizations.
2559 2 :
2560 2 : buf := i.boundsBuf[i.boundsBufIdx][:0]
2561 2 : if lower != nil {
2562 2 : buf = append(buf, lower...)
2563 2 : i.opts.LowerBound = buf
2564 2 : } else {
2565 2 : i.opts.LowerBound = nil
2566 2 : }
2567 2 : i.nextPrefixNotPermittedByUpperBound = false
2568 2 : if upper != nil {
2569 2 : buf = append(buf, upper...)
2570 2 : i.opts.UpperBound = buf[len(buf)-len(upper):]
2571 2 : if i.comparer.Split(i.opts.UpperBound) != len(i.opts.UpperBound) {
2572 2 : // Setting an upper bound that is a versioned MVCC key. This means
2573 2 : // that a key can have some MVCC versions before the upper bound and
2574 2 : // some after. This causes significant complications for NextPrefix,
2575 2 : // so we bar the user of NextPrefix.
2576 2 : i.nextPrefixNotPermittedByUpperBound = true
2577 2 : }
2578 2 : } else {
2579 2 : i.opts.UpperBound = nil
2580 2 : }
2581 2 : i.boundsBuf[i.boundsBufIdx] = buf
2582 2 : i.boundsBufIdx = 1 - i.boundsBufIdx
2583 : }
2584 :
2585 : // SetOptions sets new iterator options for the iterator. Note that the lower
2586 : // and upper bounds applied here will supersede any bounds set by previous calls
2587 : // to SetBounds.
2588 : //
2589 : // Note that the slices provided in this SetOptions must not be changed by the
2590 : // caller until the iterator is closed, or a subsequent SetBounds or SetOptions
2591 : // has returned. This is because comparisons between the existing and new bounds
2592 : // are sometimes used to optimize seeking. See the extended commentary on
2593 : // SetBounds.
2594 : //
2595 : // If the iterator was created over an indexed mutable batch, the iterator's
2596 : // view of the mutable batch is refreshed.
2597 : //
2598 : // The iterator will always be invalidated and must be repositioned with a call
2599 : // to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
2600 : //
2601 : // If only lower and upper bounds need to be modified, prefer SetBounds.
2602 2 : func (i *Iterator) SetOptions(o *IterOptions) {
2603 2 : if i.externalIter != nil {
2604 1 : if err := validateExternalIterOpts(o); err != nil {
2605 0 : panic(err)
2606 : }
2607 : }
2608 :
2609 : // Ensure that the Iterator appears exhausted, regardless of whether we
2610 : // actually have to invalidate the internal iterator. Optimizations that
2611 : // avoid exhaustion are an internal implementation detail that shouldn't
2612 : // leak through the interface. The caller should still call an absolute
2613 : // positioning method to reposition the iterator.
2614 2 : i.requiresReposition = true
2615 2 :
2616 2 : // Check if global state requires we close all internal iterators.
2617 2 : //
2618 2 : // If the Iterator is in an error state, invalidate the existing iterators
2619 2 : // so that we reconstruct an iterator state from scratch.
2620 2 : //
2621 2 : // If OnlyReadGuaranteedDurable changed, the iterator stacks are incorrect,
2622 2 : // improperly including or excluding memtables. Invalidate them so that
2623 2 : // finishInitializingIter will reconstruct them.
2624 2 : closeBoth := i.err != nil ||
2625 2 : o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable
2626 2 :
2627 2 : // If either options specify block property filters for an iterator stack,
2628 2 : // reconstruct it.
2629 2 : if i.pointIter != nil && (closeBoth || len(o.PointKeyFilters) > 0 || len(i.opts.PointKeyFilters) > 0 ||
2630 2 : o.RangeKeyMasking.Filter != nil || i.opts.RangeKeyMasking.Filter != nil || o.SkipPoint != nil ||
2631 2 : i.opts.SkipPoint != nil) {
2632 2 : i.err = firstError(i.err, i.pointIter.Close())
2633 2 : i.pointIter = nil
2634 2 : }
2635 2 : if i.rangeKey != nil {
2636 2 : if closeBoth || len(o.RangeKeyFilters) > 0 || len(i.opts.RangeKeyFilters) > 0 {
2637 2 : i.rangeKey.rangeKeyIter.Close()
2638 2 : i.rangeKey = nil
2639 2 : } else {
2640 2 : // If there's still a range key iterator stack, invalidate the
2641 2 : // iterator. This ensures RangeKeyChanged() returns true if a
2642 2 : // subsequent positioning operation discovers a range key. It also
2643 2 : // prevents seek no-op optimizations.
2644 2 : i.invalidate()
2645 2 : }
2646 : }
2647 :
2648 : // If the iterator is backed by a batch that's been mutated, refresh its
2649 : // existing point and range-key iterators, and invalidate the iterator to
2650 : // prevent seek-using-next optimizations. If we don't yet have a point-key
2651 : // iterator or range-key iterator but we require one, it'll be created in
2652 : // the slow path that reconstructs the iterator in finishInitializingIter.
2653 2 : if i.batch != nil {
2654 1 : nextBatchSeqNum := (base.SeqNum(len(i.batch.batch.data)) | base.SeqNumBatchBit)
2655 1 : if nextBatchSeqNum != i.batch.batchSeqNum {
2656 1 : i.batch.batchSeqNum = nextBatchSeqNum
2657 1 : if i.merging != nil {
2658 1 : i.merging.batchSnapshot = nextBatchSeqNum
2659 1 : }
2660 : // Prevent a no-op seek optimization on the next seek. We won't be
2661 : // able to reuse the top-level Iterator state, because it may be
2662 : // incorrect after the inclusion of new batch mutations.
2663 1 : i.batchJustRefreshed = true
2664 1 : if i.pointIter != nil && i.batch.batch.countRangeDels > 0 {
2665 1 : if i.batch.rangeDelIter.Count() == 0 {
2666 1 : // When we constructed this iterator, there were no
2667 1 : // rangedels in the batch. Iterator construction will
2668 1 : // have excluded the batch rangedel iterator from the
2669 1 : // point iterator stack. We need to reconstruct the
2670 1 : // point iterator to add i.batchRangeDelIter into the
2671 1 : // iterator stack.
2672 1 : i.err = firstError(i.err, i.pointIter.Close())
2673 1 : i.pointIter = nil
2674 1 : } else {
2675 1 : // There are range deletions in the batch and we already
2676 1 : // have a batch rangedel iterator. We can update the
2677 1 : // batch rangedel iterator in place.
2678 1 : //
2679 1 : // NB: There may or may not be new range deletions. We
2680 1 : // can't tell based on i.batchRangeDelIter.Count(),
2681 1 : // which is the count of fragmented range deletions, NOT
2682 1 : // the number of range deletions written to the batch
2683 1 : // [i.batch.countRangeDels].
2684 1 : i.batch.batch.initRangeDelIter(&i.opts, &i.batch.rangeDelIter, nextBatchSeqNum)
2685 1 : }
2686 : }
2687 1 : if i.rangeKey != nil && i.batch.batch.countRangeKeys > 0 {
2688 1 : if i.batch.rangeKeyIter.Count() == 0 {
2689 1 : // When we constructed this iterator, there were no range
2690 1 : // keys in the batch. Iterator construction will have
2691 1 : // excluded the batch rangekey iterator from the range key
2692 1 : // iterator stack. We need to reconstruct the range key
2693 1 : // iterator to add i.batchRangeKeyIter into the iterator
2694 1 : // stack.
2695 1 : i.rangeKey.rangeKeyIter.Close()
2696 1 : i.rangeKey = nil
2697 1 : } else {
2698 1 : // There are range keys in the batch and we already
2699 1 : // have a batch rangekey iterator. We can update the batch
2700 1 : // rangekey iterator in place.
2701 1 : //
2702 1 : // NB: There may or may not be new range keys. We can't
2703 1 : // tell based on i.batchRangeKeyIter.Count(), which is the
2704 1 : // count of fragmented range keys, NOT the number of
2705 1 : // range keys written to the batch [i.batch.countRangeKeys].
2706 1 : i.batch.batch.initRangeKeyIter(&i.opts, &i.batch.rangeKeyIter, nextBatchSeqNum)
2707 1 : i.invalidate()
2708 1 : }
2709 : }
2710 : }
2711 : }
2712 :
2713 : // Reset combinedIterState.initialized in case the iterator key types
2714 : // changed. If there's already a range key iterator stack, the combined
2715 : // iterator is already initialized. Additionally, if the iterator is not
2716 : // configured to include range keys, mark it as initialized to signal that
2717 : // lower level iterators should not trigger a switch to combined iteration.
2718 2 : i.lazyCombinedIter.combinedIterState = combinedIterState{
2719 2 : initialized: i.rangeKey != nil || !i.opts.rangeKeys(),
2720 2 : }
2721 2 :
2722 2 : boundsEqual := ((i.opts.LowerBound == nil) == (o.LowerBound == nil)) &&
2723 2 : ((i.opts.UpperBound == nil) == (o.UpperBound == nil)) &&
2724 2 : i.equal(i.opts.LowerBound, o.LowerBound) &&
2725 2 : i.equal(i.opts.UpperBound, o.UpperBound)
2726 2 :
2727 2 : if boundsEqual && o.KeyTypes == i.opts.KeyTypes &&
2728 2 : (i.pointIter != nil || !i.opts.pointKeys()) &&
2729 2 : (i.rangeKey != nil || !i.opts.rangeKeys() || i.opts.KeyTypes == IterKeyTypePointsAndRanges) &&
2730 2 : i.comparer.CompareRangeSuffixes(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) == 0 &&
2731 2 : o.UseL6Filters == i.opts.UseL6Filters {
2732 2 : // The options are identical, so we can likely use the fast path. In
2733 2 : // addition to all the above constraints, we cannot use the fast path if
2734 2 : // configured to perform lazy combined iteration but an indexed batch
2735 2 : // used by the iterator now contains range keys. Lazy combined iteration
2736 2 : // is not compatible with batch range keys because we always need to
2737 2 : // merge the batch's range keys into iteration.
2738 2 : if i.rangeKey != nil || !i.opts.rangeKeys() || i.batch == nil || i.batch.batch.countRangeKeys == 0 {
2739 2 : // Fast path. This preserves the Seek-using-Next optimizations as
2740 2 : // long as the iterator wasn't already invalidated up above.
2741 2 : return
2742 2 : }
2743 : }
2744 : // Slow path.
2745 :
2746 : // The options changed. Save the new ones to i.opts.
2747 2 : if boundsEqual {
2748 2 : // Copying the options into i.opts will overwrite LowerBound and
2749 2 : // UpperBound fields with the user-provided slices. We need to hold on
2750 2 : // to the Pebble-owned slices, so save them and re-set them after the
2751 2 : // copy.
2752 2 : lower, upper := i.opts.LowerBound, i.opts.UpperBound
2753 2 : i.opts = *o
2754 2 : i.opts.LowerBound, i.opts.UpperBound = lower, upper
2755 2 : } else {
2756 2 : i.opts = *o
2757 2 : i.processBounds(o.LowerBound, o.UpperBound)
2758 2 : // Propagate the changed bounds to the existing point iterator.
2759 2 : // NB: We propagate i.opts.{Lower,Upper}Bound, not o.{Lower,Upper}Bound
2760 2 : // because i.opts now point to buffers owned by Pebble.
2761 2 : if i.pointIter != nil {
2762 2 : i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2763 2 : }
2764 2 : if i.rangeKey != nil {
2765 2 : i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2766 2 : }
2767 : }
2768 :
2769 : // Even though this is not a positioning operation, the invalidation of the
2770 : // iterator stack means we cannot optimize Seeks by using Next.
2771 2 : i.invalidate()
2772 2 :
2773 2 : // Iterators created through NewExternalIter have a different iterator
2774 2 : // initialization process.
2775 2 : if i.externalIter != nil {
2776 1 : _ = finishInitializingExternal(i.ctx, i)
2777 1 : return
2778 1 : }
2779 2 : finishInitializingIter(i.ctx, i.alloc)
2780 : }
2781 :
2782 2 : func (i *Iterator) invalidate() {
2783 2 : i.lastPositioningOp = unknownLastPositionOp
2784 2 : i.hasPrefix = false
2785 2 : i.iterKV = nil
2786 2 : i.err = nil
2787 2 : // This switch statement isn't necessary for correctness since callers
2788 2 : // should call a repositioning method. We could have arbitrarily set i.pos
2789 2 : // to one of the values. But it results in more intuitive behavior in
2790 2 : // tests, which do not always reposition.
2791 2 : switch i.pos {
2792 2 : case iterPosCurForward, iterPosNext, iterPosCurForwardPaused:
2793 2 : i.pos = iterPosCurForward
2794 2 : case iterPosCurReverse, iterPosPrev, iterPosCurReversePaused:
2795 2 : i.pos = iterPosCurReverse
2796 : }
2797 2 : i.iterValidityState = IterExhausted
2798 2 : if i.rangeKey != nil {
2799 2 : i.rangeKey.iiter.Invalidate()
2800 2 : i.rangeKey.prevPosHadRangeKey = false
2801 2 : }
2802 : }
2803 :
2804 : // Metrics returns per-iterator metrics.
2805 0 : func (i *Iterator) Metrics() IteratorMetrics {
2806 0 : m := IteratorMetrics{
2807 0 : ReadAmp: 1,
2808 0 : }
2809 0 : if mi, ok := i.iter.(*mergingIter); ok {
2810 0 : m.ReadAmp = len(mi.levels)
2811 0 : }
2812 0 : return m
2813 : }
2814 :
2815 : // ResetStats resets the stats to 0.
2816 0 : func (i *Iterator) ResetStats() {
2817 0 : i.stats = IteratorStats{}
2818 0 : }
2819 :
2820 : // Stats returns the current stats.
2821 1 : func (i *Iterator) Stats() IteratorStats {
2822 1 : return i.stats
2823 1 : }
2824 :
2825 : // CloneOptions configures an iterator constructed through Iterator.Clone.
2826 : type CloneOptions struct {
2827 : // IterOptions, if non-nil, define the iterator options to configure a
2828 : // cloned iterator. If nil, the clone adopts the same IterOptions as the
2829 : // iterator being cloned.
2830 : IterOptions *IterOptions
2831 : // RefreshBatchView may be set to true when cloning an Iterator over an
2832 : // indexed batch. When false, the clone adopts the same (possibly stale)
2833 : // view of the indexed batch as the cloned Iterator. When true, the clone is
2834 : // constructed with a refreshed view of the batch, observing all of the
2835 : // batch's mutations at the time of the Clone. If the cloned iterator was
2836 : // not constructed to read over an indexed batch, RefreshVatchView has no
2837 : // effect.
2838 : RefreshBatchView bool
2839 : }
2840 :
2841 : // Clone creates a new Iterator over the same underlying data, i.e., over the
2842 : // same {batch, memtables, sstables}). The resulting iterator is not positioned.
2843 : // It starts with the same IterOptions, unless opts.IterOptions is set.
2844 : //
2845 : // When called on an Iterator over an indexed batch, the clone's visibility of
2846 : // the indexed batch is determined by CloneOptions.RefreshBatchView. If false,
2847 : // the clone inherits the iterator's current (possibly stale) view of the batch,
2848 : // and callers may call SetOptions to subsequently refresh the clone's view to
2849 : // include all batch mutations. If true, the clone is constructed with a
2850 : // complete view of the indexed batch's mutations at the time of the Clone.
2851 : //
2852 : // Callers can use Clone if they need multiple iterators that need to see
2853 : // exactly the same underlying state of the DB. This should not be used to
2854 : // extend the lifetime of the data backing the original Iterator since that
2855 : // will cause an increase in memory and disk usage (use NewSnapshot for that
2856 : // purpose).
2857 2 : func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error) {
2858 2 : return i.CloneWithContext(context.Background(), opts)
2859 2 : }
2860 :
2861 : // CloneWithContext is like Clone, and additionally accepts a context for
2862 : // tracing.
2863 2 : func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*Iterator, error) {
2864 2 : if opts.IterOptions == nil {
2865 2 : opts.IterOptions = &i.opts
2866 2 : }
2867 2 : if i.batchOnlyIter {
2868 1 : return nil, errors.Errorf("cannot Clone a batch-only Iterator")
2869 1 : }
2870 2 : readState := i.readState
2871 2 : vers := i.version
2872 2 : if readState == nil && vers == nil {
2873 1 : return nil, errors.Errorf("cannot Clone a closed Iterator")
2874 1 : }
2875 : // i is already holding a ref, so there is no race with unref here.
2876 : //
2877 : // TODO(bilal): If the underlying iterator was created on a snapshot, we could
2878 : // grab a reference to the current readState instead of reffing the original
2879 : // readState. This allows us to release references to some zombie sstables
2880 : // and memtables.
2881 2 : if readState != nil {
2882 2 : readState.ref()
2883 2 : }
2884 2 : if vers != nil {
2885 2 : vers.Ref()
2886 2 : }
2887 : // Bundle various structures under a single umbrella in order to allocate
2888 : // them together.
2889 2 : buf := newIterAlloc()
2890 2 : dbi := &buf.dbi
2891 2 : *dbi = Iterator{
2892 2 : ctx: ctx,
2893 2 : opts: *opts.IterOptions,
2894 2 : alloc: buf,
2895 2 : merge: i.merge,
2896 2 : comparer: i.comparer,
2897 2 : readState: readState,
2898 2 : version: vers,
2899 2 : keyBuf: buf.keyBuf,
2900 2 : prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
2901 2 : boundsBuf: buf.boundsBuf,
2902 2 : fc: i.fc,
2903 2 : newIters: i.newIters,
2904 2 : newIterRangeKey: i.newIterRangeKey,
2905 2 : seqNum: i.seqNum,
2906 2 : tracker: i.tracker,
2907 2 : }
2908 2 : if i.tracker != nil && !dbi.opts.ExemptFromTracking {
2909 2 : dbi.trackerHandle = i.tracker.Start()
2910 2 : }
2911 2 : if i.batch != nil {
2912 2 : dbi.batch = &buf.batchState
2913 2 : dbi.batch.batch = i.batch.batch
2914 2 : dbi.batch.batchSeqNum = i.batch.batchSeqNum
2915 2 : // If the caller requested the clone have a current view of the indexed
2916 2 : // batch, set the clone's batch sequence number appropriately.
2917 2 : if opts.RefreshBatchView {
2918 1 : dbi.batch.batchSeqNum = (base.SeqNum(len(i.batch.batch.data)) | base.SeqNumBatchBit)
2919 1 : }
2920 : }
2921 2 : dbi.processBounds(dbi.opts.LowerBound, dbi.opts.UpperBound)
2922 2 : return finishInitializingIter(ctx, buf), nil
2923 : }
2924 :
2925 : // Merge adds all of the argument's statistics to the receiver. It may be used
2926 : // to accumulate stats across multiple iterators.
2927 1 : func (stats *IteratorStats) Merge(o IteratorStats) {
2928 1 : for i := InterfaceCall; i < NumStatsKind; i++ {
2929 1 : stats.ForwardSeekCount[i] += o.ForwardSeekCount[i]
2930 1 : stats.ReverseSeekCount[i] += o.ReverseSeekCount[i]
2931 1 : stats.ForwardStepCount[i] += o.ForwardStepCount[i]
2932 1 : stats.ReverseStepCount[i] += o.ReverseStepCount[i]
2933 1 : }
2934 1 : stats.InternalStats.Merge(o.InternalStats)
2935 1 : stats.RangeKeyStats.Merge(o.RangeKeyStats)
2936 : }
2937 :
2938 1 : func (stats *IteratorStats) String() string {
2939 1 : return redact.StringWithoutMarkers(stats)
2940 1 : }
2941 :
2942 : // SafeFormat implements the redact.SafeFormatter interface.
2943 1 : func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) {
2944 1 : if stats.ReverseSeekCount[InterfaceCall] == 0 && stats.ReverseSeekCount[InternalIterCall] == 0 {
2945 1 : s.Printf("seeked %s times (%s internal)",
2946 1 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InterfaceCall])),
2947 1 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InternalIterCall])),
2948 1 : )
2949 1 : } else {
2950 1 : s.Printf("seeked %s times (%s fwd/%s rev, internal: %s fwd/%s rev)",
2951 1 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InterfaceCall]+stats.ReverseSeekCount[InterfaceCall])),
2952 1 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InterfaceCall])),
2953 1 : humanize.Count.Uint64(uint64(stats.ReverseSeekCount[InterfaceCall])),
2954 1 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InternalIterCall])),
2955 1 : humanize.Count.Uint64(uint64(stats.ReverseSeekCount[InternalIterCall])),
2956 1 : )
2957 1 : }
2958 1 : s.SafeString("; ")
2959 1 :
2960 1 : if stats.ReverseStepCount[InterfaceCall] == 0 && stats.ReverseStepCount[InternalIterCall] == 0 {
2961 1 : s.Printf("stepped %s times (%s internal)",
2962 1 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InterfaceCall])),
2963 1 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InternalIterCall])),
2964 1 : )
2965 1 : } else {
2966 1 : s.Printf("stepped %s times (%s fwd/%s rev, internal: %s fwd/%s rev)",
2967 1 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InterfaceCall]+stats.ReverseStepCount[InterfaceCall])),
2968 1 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InterfaceCall])),
2969 1 : humanize.Count.Uint64(uint64(stats.ReverseStepCount[InterfaceCall])),
2970 1 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InternalIterCall])),
2971 1 : humanize.Count.Uint64(uint64(stats.ReverseStepCount[InternalIterCall])),
2972 1 : )
2973 1 : }
2974 :
2975 1 : if stats.InternalStats != (InternalIteratorStats{}) {
2976 1 : s.SafeString("; ")
2977 1 : stats.InternalStats.SafeFormat(s, verb)
2978 1 : }
2979 1 : if stats.RangeKeyStats != (RangeKeyIteratorStats{}) {
2980 1 : s.SafeString(", ")
2981 1 : stats.RangeKeyStats.SafeFormat(s, verb)
2982 1 : }
2983 : }
2984 :
2985 : // CanDeterministicallySingleDelete takes a valid iterator and examines internal
2986 : // state to determine if a SingleDelete deleting Iterator.Key() would
2987 : // deterministically delete the key. CanDeterministicallySingleDelete requires
2988 : // the iterator to be oriented in the forward direction (eg, the last
2989 : // positioning operation must've been a First, a Seek[Prefix]GE, or a
2990 : // Next[Prefix][WithLimit]).
2991 : //
2992 : // This function does not change the external position of the iterator, and all
2993 : // positioning methods should behave the same as if it was never called. This
2994 : // function will only return a meaningful result the first time it's invoked at
2995 : // an iterator position. This function invalidates the iterator Value's memory,
2996 : // and the caller must not rely on the memory safety of the previous Iterator
2997 : // position.
2998 : //
2999 : // If CanDeterministicallySingleDelete returns true AND the key at the iterator
3000 : // position is not modified between the creation of the Iterator and the commit
3001 : // of a batch containing a SingleDelete over the key, then the caller can be
3002 : // assured that SingleDelete is equivalent to Delete on the local engine, but it
3003 : // may not be true on another engine that received the same writes and with
3004 : // logically equivalent state since this engine may have collapsed multiple SETs
3005 : // into one.
3006 2 : func CanDeterministicallySingleDelete(it *Iterator) (bool, error) {
3007 2 : // This function may only be called once per external iterator position. We
3008 2 : // can validate this by checking the last positioning operation.
3009 2 : if it.lastPositioningOp == internalNextOp {
3010 2 : return false, errors.New("pebble: CanDeterministicallySingleDelete called twice")
3011 2 : }
3012 2 : validity, kind := it.internalNext()
3013 2 : var shadowedBySingleDelete bool
3014 2 : for validity == internalNextValid {
3015 2 : switch kind {
3016 2 : case InternalKeyKindDelete, InternalKeyKindDeleteSized:
3017 2 : // A DEL or DELSIZED tombstone is okay. An internal key
3018 2 : // sequence like SINGLEDEL; SET; DEL; SET can be handled
3019 2 : // deterministically. If there are SETs further down, we
3020 2 : // don't care about them.
3021 2 : return true, nil
3022 2 : case InternalKeyKindSingleDelete:
3023 2 : // A SingleDelete is okay as long as when that SingleDelete was
3024 2 : // written, it was written deterministically (eg, with its own
3025 2 : // CanDeterministicallySingleDelete check). Validate that it was
3026 2 : // written deterministically. We'll allow one set to appear after
3027 2 : // the SingleDelete.
3028 2 : shadowedBySingleDelete = true
3029 2 : validity, kind = it.internalNext()
3030 2 : continue
3031 2 : case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge:
3032 2 : // If we observed a single delete, it's allowed to delete 1 key.
3033 2 : // We'll keep looping to validate that the internal keys beneath the
3034 2 : // already-written single delete are copacetic.
3035 2 : if shadowedBySingleDelete {
3036 2 : shadowedBySingleDelete = false
3037 2 : validity, kind = it.internalNext()
3038 2 : continue
3039 : }
3040 : // We encountered a shadowed SET, SETWITHDEL, MERGE. A SINGLEDEL
3041 : // that deleted the KV at the original iterator position could
3042 : // result in this key becoming visible.
3043 2 : return false, nil
3044 0 : case InternalKeyKindRangeDelete:
3045 0 : // RangeDeletes are handled by the merging iterator and should never
3046 0 : // be observed by the top-level Iterator.
3047 0 : panic(errors.AssertionFailedf("pebble: unexpected range delete"))
3048 0 : case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
3049 0 : // Range keys are interleaved at the maximal sequence number and
3050 0 : // should never be observed within a user key.
3051 0 : panic(errors.AssertionFailedf("pebble: unexpected range key"))
3052 0 : default:
3053 0 : panic(errors.AssertionFailedf("pebble: unexpected key kind: %s", errors.Safe(kind)))
3054 : }
3055 : }
3056 2 : if validity == internalNextError {
3057 2 : return false, it.Error()
3058 2 : }
3059 2 : return true, nil
3060 : }
3061 :
3062 : // internalNextValidity enumerates the potential outcomes of a call to
3063 : // internalNext.
3064 : type internalNextValidity int8
3065 :
3066 : const (
3067 : // internalNextError is returned by internalNext when an error occurred and
3068 : // the caller is responsible for checking iter.Error().
3069 : internalNextError internalNextValidity = iota
3070 : // internalNextExhausted is returned by internalNext when the next internal
3071 : // key is an internal key with a different user key than Iterator.Key().
3072 : internalNextExhausted
3073 : // internalNextValid is returned by internalNext when the internal next
3074 : // found a shadowed internal key with a user key equal to Iterator.Key().
3075 : internalNextValid
3076 : )
3077 :
3078 : // internalNext advances internal Iterator state forward to expose the
3079 : // InternalKeyKind of the next internal key with a user key equal to Key().
3080 : //
3081 : // internalNext is a highly specialized operation and is unlikely to be
3082 : // generally useful. See Iterator.Next for how to reposition the iterator to the
3083 : // next key. internalNext requires the Iterator to be at a valid position in the
3084 : // forward direction (the last positioning operation must've been a First, a
3085 : // Seek[Prefix]GE, or a Next[Prefix][WithLimit] and Valid() must return true).
3086 : //
3087 : // internalNext, unlike all other Iterator methods, exposes internal LSM state.
3088 : // internalNext advances the Iterator's internal iterator to the next shadowed
3089 : // key with a user key equal to Key(). When a key is overwritten or deleted, its
3090 : // removal from the LSM occurs lazily as a part of compactions. internalNext
3091 : // allows the caller to see whether an obsolete internal key exists with the
3092 : // current Key(), and what it's key kind is. Note that the existence of an
3093 : // internal key is nondeterministic and dependent on internal LSM state. These
3094 : // semantics are unlikely to be applicable to almost all use cases.
3095 : //
3096 : // If internalNext finds a key that shares the same user key as Key(), it
3097 : // returns internalNextValid and the internal key's kind. If internalNext
3098 : // encounters an error, it returns internalNextError and the caller is expected
3099 : // to call Iterator.Error() to retrieve it. In all other circumstances,
3100 : // internalNext returns internalNextExhausted, indicating that there are no more
3101 : // additional internal keys with the user key Key().
3102 : //
3103 : // internalNext does not change the external position of the iterator, and a
3104 : // Next operation should behave the same as if internalNext was never called.
3105 : // internalNext does invalidate the iterator Value's memory, and the caller must
3106 : // not rely on the memory safety of the previous Iterator position.
3107 2 : func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) {
3108 2 : i.stats.ForwardStepCount[InterfaceCall]++
3109 2 : if i.err != nil {
3110 2 : return internalNextError, base.InternalKeyKindInvalid
3111 2 : } else if i.iterValidityState != IterValid {
3112 2 : return internalNextExhausted, base.InternalKeyKindInvalid
3113 2 : }
3114 2 : i.lastPositioningOp = internalNextOp
3115 2 :
3116 2 : switch i.pos {
3117 2 : case iterPosCurForward:
3118 2 : i.iterKV = i.iter.Next()
3119 2 : if i.iterKV == nil {
3120 2 : // We check i.iter.Error() here and return an internalNextError enum
3121 2 : // variant so that the caller does not need to check i.iter.Error()
3122 2 : // in the common case that the next internal key has a new user key.
3123 2 : if i.err = i.iter.Error(); i.err != nil {
3124 0 : return internalNextError, base.InternalKeyKindInvalid
3125 0 : }
3126 2 : i.pos = iterPosNext
3127 2 : return internalNextExhausted, base.InternalKeyKindInvalid
3128 2 : } else if i.comparer.Equal(i.iterKV.K.UserKey, i.key) {
3129 2 : return internalNextValid, i.iterKV.Kind()
3130 2 : }
3131 2 : i.pos = iterPosNext
3132 2 : return internalNextExhausted, base.InternalKeyKindInvalid
3133 2 : case iterPosCurReverse, iterPosCurReversePaused, iterPosPrev:
3134 2 : i.err = errors.New("switching from reverse to forward via internalNext is prohibited")
3135 2 : i.iterValidityState = IterExhausted
3136 2 : return internalNextError, base.InternalKeyKindInvalid
3137 2 : case iterPosNext, iterPosCurForwardPaused:
3138 2 : // The previous method already moved onto the next user key. This is
3139 2 : // only possible if
3140 2 : // - the last positioning method was a call to internalNext, and we
3141 2 : // advanced to a new user key.
3142 2 : // - the previous non-internalNext iterator operation encountered a
3143 2 : // range key or merge, forcing an internal Next that found a new
3144 2 : // user key that's not equal to i.Iterator.Key().
3145 2 : return internalNextExhausted, base.InternalKeyKindInvalid
3146 0 : default:
3147 0 : panic("unreachable")
3148 : }
3149 : }
3150 :
3151 : var _ base.IteratorDebug = (*Iterator)(nil)
3152 :
3153 : // DebugTree implements the base.IteratorDebug interface.
3154 0 : func (i *Iterator) DebugTree(tp treeprinter.Node) {
3155 0 : n := tp.Childf("%T(%p)", i, i)
3156 0 : if i.iter != nil {
3157 0 : i.iter.DebugTree(n)
3158 0 : }
3159 0 : if i.pointIter != nil {
3160 0 : i.pointIter.DebugTree(n)
3161 0 : }
3162 : }
|