Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package pebble
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "io"
11 : "math/rand/v2"
12 : "sync"
13 : "unsafe"
14 :
15 : "github.com/cockroachdb/errors"
16 : "github.com/cockroachdb/pebble/internal/base"
17 : "github.com/cockroachdb/pebble/internal/bytealloc"
18 : "github.com/cockroachdb/pebble/internal/humanize"
19 : "github.com/cockroachdb/pebble/internal/invariants"
20 : "github.com/cockroachdb/pebble/internal/keyspan"
21 : "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl"
22 : "github.com/cockroachdb/pebble/internal/manifest"
23 : "github.com/cockroachdb/pebble/internal/rangekeystack"
24 : "github.com/cockroachdb/pebble/internal/treeprinter"
25 : "github.com/cockroachdb/pebble/sstable/blob"
26 : "github.com/cockroachdb/redact"
27 : )
28 :
29 : // iterPos describes the state of the internal iterator, in terms of whether it
30 : // is at the position returned to the user (cur), one ahead of the position
31 : // returned (next for forward iteration and prev for reverse iteration). The cur
32 : // position is split into two states, for forward and reverse iteration, since
33 : // we need to differentiate for switching directions.
34 : //
35 : // There is subtlety in what is considered the current position of the Iterator.
36 : // The internal iterator exposes a sequence of internal keys. There is not
37 : // always a single internalIterator position corresponding to the position
38 : // returned to the user. Consider the example:
39 : //
40 : // a.MERGE.9 a.MERGE.8 a.MERGE.7 a.SET.6 b.DELETE.9 b.DELETE.5 b.SET.4
41 : // \ /
42 : // \ Iterator.Key() = 'a' /
43 : //
44 : // The Iterator exposes one valid position at user key 'a' and the two exhausted
45 : // positions at the beginning and end of iteration. The underlying
46 : // internalIterator contains 7 valid positions and 2 exhausted positions.
47 : //
48 : // Iterator positioning methods must set iterPos to iterPosCur{Foward,Backward}
49 : // iff the user key at the current internalIterator position equals the
50 : // Iterator.Key returned to the user. This guarantees that a call to nextUserKey
51 : // or prevUserKey will advance to the next or previous iterator position.
52 : // iterPosCur{Forward,Backward} does not make any guarantee about the internal
53 : // iterator position among internal keys with matching user keys, and it will
54 : // vary subtly depending on the particular key kinds encountered. In the above
55 : // example, the iterator returning 'a' may set iterPosCurForward if the internal
56 : // iterator is positioned at any of a.MERGE.9, a.MERGE.8, a.MERGE.7 or a.SET.6.
57 : //
58 : // When setting iterPos to iterPosNext or iterPosPrev, the internal iterator
59 : // must be advanced to the first internalIterator position at a user key greater
60 : // (iterPosNext) or less (iterPosPrev) than the key returned to the user. An
61 : // internalIterator position that's !Valid() must also be considered greater or
62 : // less—depending on the direction of iteration—than the last valid Iterator
63 : // position.
64 : type iterPos int8
65 :
66 : const (
67 : iterPosCurForward iterPos = 0
68 : iterPosNext iterPos = 1
69 : iterPosPrev iterPos = -1
70 : iterPosCurReverse iterPos = -2
71 :
72 : // For limited iteration. When the iterator is at iterPosCurForwardPaused
73 : // - Next*() call should behave as if the internal iterator is already
74 : // at next (akin to iterPosNext).
75 : // - Prev*() call should behave as if the internal iterator is at the
76 : // current key (akin to iterPosCurForward).
77 : //
78 : // Similar semantics apply to CurReversePaused.
79 : iterPosCurForwardPaused iterPos = 2
80 : iterPosCurReversePaused iterPos = -3
81 : )
82 :
83 : // Approximate gap in bytes between samples of data read during iteration.
84 : // This is multiplied with a default ReadSamplingMultiplier of 1 << 4 to yield
85 : // 1 << 20 (1MB). The 1MB factor comes from:
86 : // https://github.com/cockroachdb/pebble/issues/29#issuecomment-494477985
87 : const readBytesPeriod uint64 = 1 << 16
88 :
89 : var errReversePrefixIteration = errors.New("pebble: unsupported reverse prefix iteration")
90 :
91 : // IteratorMetrics holds per-iterator metrics. These do not change over the
92 : // lifetime of the iterator.
93 : type IteratorMetrics struct {
94 : // The read amplification experienced by this iterator. This is the sum of
95 : // the memtables, the L0 sublevels and the non-empty Ln levels. Higher read
96 : // amplification generally results in slower reads, though allowing higher
97 : // read amplification can also result in faster writes.
98 : ReadAmp int
99 : }
100 :
101 : // IteratorStatsKind describes the two kind of iterator stats.
102 : type IteratorStatsKind int8
103 :
104 : const (
105 : // InterfaceCall represents calls to Iterator.
106 : InterfaceCall IteratorStatsKind = iota
107 : // InternalIterCall represents calls by Iterator to its internalIterator.
108 : InternalIterCall
109 : // NumStatsKind is the number of kinds, and is used for array sizing.
110 : NumStatsKind
111 : )
112 :
113 : // IteratorStats contains iteration stats.
114 : type IteratorStats struct {
115 : // ForwardSeekCount includes SeekGE, SeekPrefixGE, First.
116 : ForwardSeekCount [NumStatsKind]int
117 : // ReverseSeek includes SeekLT, Last.
118 : ReverseSeekCount [NumStatsKind]int
119 : // ForwardStepCount includes Next.
120 : ForwardStepCount [NumStatsKind]int
121 : // ReverseStepCount includes Prev.
122 : ReverseStepCount [NumStatsKind]int
123 : InternalStats InternalIteratorStats
124 : RangeKeyStats RangeKeyIteratorStats
125 : }
126 :
127 : var _ redact.SafeFormatter = &IteratorStats{}
128 :
129 : // InternalIteratorStats contains miscellaneous stats produced by internal
130 : // iterators.
131 : type InternalIteratorStats = base.InternalIteratorStats
132 :
133 : // BlockReadStats contains iterators stats about block reads.
134 : type BlockReadStats = base.BlockReadStats
135 :
136 : // RangeKeyIteratorStats contains miscellaneous stats about range keys
137 : // encountered by the iterator.
138 : type RangeKeyIteratorStats struct {
139 : // Count records the number of range keys encountered during
140 : // iteration. Range keys may be counted multiple times if the iterator
141 : // leaves a range key's bounds and then returns.
142 : Count int
143 : // ContainedPoints records the number of point keys encountered within the
144 : // bounds of a range key. Note that this includes point keys with suffixes
145 : // that sort both above and below the covering range key's suffix.
146 : ContainedPoints int
147 : // SkippedPoints records the count of the subset of ContainedPoints point
148 : // keys that were skipped during iteration due to range-key masking. It does
149 : // not include point keys that were never loaded because a
150 : // RangeKeyMasking.Filter excluded the entire containing block.
151 : SkippedPoints int
152 : }
153 :
154 : // Merge adds all of the argument's statistics to the receiver. It may be used
155 : // to accumulate stats across multiple iterators.
156 0 : func (s *RangeKeyIteratorStats) Merge(o RangeKeyIteratorStats) {
157 0 : s.Count += o.Count
158 0 : s.ContainedPoints += o.ContainedPoints
159 0 : s.SkippedPoints += o.SkippedPoints
160 0 : }
161 :
162 0 : func (s *RangeKeyIteratorStats) String() string {
163 0 : return redact.StringWithoutMarkers(s)
164 0 : }
165 :
166 : // SafeFormat implements the redact.SafeFormatter interface.
167 0 : func (s *RangeKeyIteratorStats) SafeFormat(p redact.SafePrinter, verb rune) {
168 0 : p.Printf("range keys: %s, contained points: %s (%s skipped)",
169 0 : humanize.Count.Uint64(uint64(s.Count)),
170 0 : humanize.Count.Uint64(uint64(s.ContainedPoints)),
171 0 : humanize.Count.Uint64(uint64(s.SkippedPoints)))
172 0 : }
173 :
174 : // LazyValue is a lazy value. See the long comment in base.LazyValue.
175 : type LazyValue = base.LazyValue
176 :
177 : // Iterator iterates over a DB's key/value pairs in key order.
178 : //
179 : // An iterator must be closed after use, but it is not necessary to read an
180 : // iterator until exhaustion.
181 : //
182 : // An iterator is not goroutine-safe, but it is safe to use multiple iterators
183 : // concurrently, with each in a dedicated goroutine.
184 : //
185 : // It is also safe to use an iterator concurrently with modifying its
186 : // underlying DB, if that DB permits modification. However, the resultant
187 : // key/value pairs are not guaranteed to be a consistent snapshot of that DB
188 : // at a particular point in time.
189 : //
190 : // If an iterator encounters an error during any operation, it is stored by
191 : // the Iterator and surfaced through the Error method. All absolute
192 : // positioning methods (eg, SeekLT, SeekGT, First, Last, etc) reset any
193 : // accumulated error before positioning. All relative positioning methods (eg,
194 : // Next, Prev) return without advancing if the iterator has an accumulated
195 : // error.
196 : type Iterator struct {
197 : // blobValueFetcher is the ValueFetcher to use when retrieving values stored
198 : // externally in blob files.
199 : blobValueFetcher blob.ValueFetcher
200 :
201 : // All fields below this field are cleared during Iterator.Close before
202 : // returning the Iterator to the pool. Any fields above this field must also
203 : // be cleared, but may be cleared as a part of the body of Iterator.Close
204 : // (eg, if these types have their own Close method that zeroes their
205 : // fields).
206 : clearForReuseBoundary struct{}
207 :
208 : // The context is stored here since (a) Iterators are expected to be
209 : // short-lived (since they pin memtables and sstables), (b) plumbing a
210 : // context into every method is very painful, (c) they do not (yet) respect
211 : // context cancellation and are only used for tracing.
212 : ctx context.Context
213 : opts IterOptions
214 : merge Merge
215 : comparer *base.Comparer
216 : iter internalIterator
217 : pointIter topLevelIterator
218 : // Either readState or version is set, but not both.
219 : readState *readState
220 : version *manifest.Version
221 : // rangeKey holds iteration state specific to iteration over range keys.
222 : // The range key field may be nil if the Iterator has never been configured
223 : // to iterate over range keys. Its non-nilness cannot be used to determine
224 : // if the Iterator is currently iterating over range keys: For that, consult
225 : // the IterOptions using opts.rangeKeys(). If non-nil, its rangeKeyIter
226 : // field is guaranteed to be non-nil too.
227 : rangeKey *iteratorRangeKeyState
228 : // rangeKeyMasking holds state for range-key masking of point keys.
229 : rangeKeyMasking rangeKeyMasking
230 : err error
231 : // When iterValidityState=IterValid, key represents the current key, which
232 : // is backed by keyBuf.
233 : key []byte
234 : keyBuf []byte
235 : value base.InternalValue
236 : // For use in LazyValue.Clone.
237 : valueBuf []byte
238 : fetcher base.LazyFetcher
239 : // For use in LazyValue.Value.
240 : lazyValueBuf []byte
241 : valueCloser io.Closer
242 : // boundsBuf holds two buffers used to store the lower and upper bounds.
243 : // Whenever the Iterator's bounds change, the new bounds are copied into
244 : // boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
245 : // allocations. opts.LowerBound and opts.UpperBound point into this slice.
246 : boundsBuf [2][]byte
247 : boundsBufIdx int
248 : // iterKV reflects the latest position of iter, except when SetBounds is
249 : // called. In that case, it is explicitly set to nil.
250 : iterKV *base.InternalKV
251 : alloc *iterAlloc
252 : prefixOrFullSeekKey []byte
253 : readSampling readSampling
254 : stats IteratorStats
255 : externalIter *externalIterState
256 : // Following fields used when constructing an iterator stack, eg, in Clone
257 : // and SetOptions or when re-fragmenting a batch's range keys/range dels.
258 : fc *fileCacheHandle
259 : newIters tableNewIters
260 : newIterRangeKey keyspanimpl.TableNewSpanIter
261 : lazyCombinedIter lazyCombinedIter
262 : seqNum base.SeqNum
263 : // batch is non-nil if this Iterator includes an indexed batch. Batch
264 : // contains all the state pertaining to iterating over the indexed batch.
265 : // The iteratorBatchState struct is bundled within the iterAlloc struct to
266 : // reduce allocations.
267 : batch *iteratorBatchState
268 : // merging is a pointer to this iterator's point merging iterator. It
269 : // appears here because key visibility is handled by the merging iterator.
270 : // During SetOptions on an iterator over an indexed batch, this field is
271 : // used to update the merging iterator's batch snapshot.
272 : merging *mergingIter
273 :
274 : // Keeping the bools here after all the 8 byte aligned fields shrinks the
275 : // sizeof this struct by 24 bytes.
276 :
277 : // INVARIANT:
278 : // iterValidityState==IterAtLimit <=>
279 : // pos==iterPosCurForwardPaused || pos==iterPosCurReversePaused
280 : iterValidityState IterValidityState
281 : // Set to true by SetBounds, SetOptions. Causes the Iterator to appear
282 : // exhausted externally, while preserving the correct iterValidityState for
283 : // the iterator's internal state. Preserving the correct internal validity
284 : // is used for SeekPrefixGE(..., trySeekUsingNext), and SeekGE/SeekLT
285 : // optimizations after "no-op" calls to SetBounds and SetOptions.
286 : requiresReposition bool
287 : // The position of iter. When this is iterPos{Prev,Next} the iter has been
288 : // moved past the current key-value, which can only happen if
289 : // iterValidityState=IterValid, i.e., there is something to return to the
290 : // client for the current position.
291 : pos iterPos
292 : // Relates to the prefixOrFullSeekKey field above.
293 : hasPrefix bool
294 : // Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext),
295 : // and SeekGE/SeekLT optimizations
296 : lastPositioningOp lastPositioningOpKind
297 : // Used for determining when it's safe to perform SeekGE optimizations that
298 : // reuse the iterator state to avoid the cost of a full seek if the iterator
299 : // is already positioned in the correct place. If the iterator's view of its
300 : // indexed batch was just refreshed, some optimizations cannot be applied on
301 : // the first seek after the refresh:
302 : // - SeekGE has a no-op optimization that does not seek on the internal
303 : // iterator at all if the iterator is already in the correct place.
304 : // This optimization cannot be performed if the internal iterator was
305 : // last positioned when the iterator had a different view of an
306 : // underlying batch.
307 : // - Seek[Prefix]GE set flags.TrySeekUsingNext()=true when the seek key is
308 : // greater than the previous operation's seek key, under the expectation
309 : // that the various internal iterators can use their current position to
310 : // avoid a full expensive re-seek. This applies to the batchIter as well.
311 : // However, if the view of the batch was just refreshed, the batchIter's
312 : // position is not useful because it may already be beyond new keys less
313 : // than the seek key. To prevent the use of this optimization in
314 : // batchIter, Seek[Prefix]GE set flags.BatchJustRefreshed()=true if this
315 : // bit is enabled.
316 : batchJustRefreshed bool
317 : // batchOnlyIter is set to true for Batch.NewBatchOnlyIter.
318 : batchOnlyIter bool
319 : // Used in some tests to disable the random disabling of seek optimizations.
320 : forceEnableSeekOpt bool
321 : // Set to true if NextPrefix is not currently permitted. Defaults to false
322 : // in case an iterator never had any bounds.
323 : nextPrefixNotPermittedByUpperBound bool
324 : }
325 :
326 : const clearOff = unsafe.Offsetof(Iterator{}.clearForReuseBoundary)
327 : const clearLen = unsafe.Sizeof(Iterator{}) - clearOff
328 :
329 1 : func (i *Iterator) clearForReuse() {
330 1 : *(*[clearLen]byte)(unsafe.Add(unsafe.Pointer(i), clearOff)) = [clearLen]byte{}
331 1 : }
332 :
333 : // cmp is a convenience shorthand for the i.comparer.Compare function.
334 1 : func (i *Iterator) cmp(a, b []byte) int {
335 1 : return i.comparer.Compare(a, b)
336 1 : }
337 :
338 : // equal is a convenience shorthand for the i.comparer.Equal function.
339 1 : func (i *Iterator) equal(a, b []byte) bool {
340 1 : return i.comparer.Equal(a, b)
341 1 : }
342 :
343 : // iteratorBatchState holds state pertaining to iterating over an indexed batch.
344 : // When an iterator is configured to read through an indexed batch, the iterator
345 : // maintains a pointer to this struct. This struct is embedded within the
346 : // iterAlloc struct to reduce allocations.
347 : type iteratorBatchState struct {
348 : batch *Batch
349 : // batchSeqNum is used by Iterators over indexed batches to detect when the
350 : // underlying batch has been mutated. The batch beneath an indexed batch may
351 : // be mutated while the Iterator is open, but new keys are not surfaced
352 : // until the next call to SetOptions.
353 : batchSeqNum base.SeqNum
354 : // batch{PointIter,RangeDelIter,RangeKeyIter} are used when the Iterator is
355 : // configured to read through an indexed batch. If a batch is set, these
356 : // iterators will be included within the iterator stack regardless of
357 : // whether the batch currently contains any keys of their kind. These
358 : // pointers are used during a call to SetOptions to refresh the Iterator's
359 : // view of its indexed batch.
360 : pointIter batchIter
361 : rangeDelIter keyspan.Iter
362 : rangeKeyIter keyspan.Iter
363 : }
364 :
365 : // iteratorRangeKeyState holds an iterator's range key iteration state.
366 : type iteratorRangeKeyState struct {
367 : // rangeKeyIter holds the range key iterator stack that iterates over the
368 : // merged spans across the entirety of the LSM.
369 : rangeKeyIter keyspan.FragmentIterator
370 : iiter keyspan.InterleavingIter
371 : // stale is set to true when the range key state recorded here (in start,
372 : // end and keys) may not be in sync with the current range key at the
373 : // interleaving iterator's current position.
374 : //
375 : // When the interelaving iterator passes over a new span, it invokes the
376 : // SpanChanged hook defined on the `rangeKeyMasking` type, which sets stale
377 : // to true if the span is non-nil.
378 : //
379 : // The parent iterator may not be positioned over the interleaving
380 : // iterator's current position (eg, i.iterPos = iterPos{Next,Prev}), so
381 : // {keys,start,end} are only updated to the new range key during a call to
382 : // Iterator.saveRangeKey.
383 : stale bool
384 : // updated is used to signal to the Iterator client whether the state of
385 : // range keys has changed since the previous iterator position through the
386 : // `RangeKeyChanged` method. It's set to true during an Iterator positioning
387 : // operation that changes the state of the current range key. Each Iterator
388 : // positioning operation sets it back to false before executing.
389 : //
390 : // TODO(jackson): The lifecycle of {stale,updated,prevPosHadRangeKey} is
391 : // intricate and confusing. Try to refactor to reduce complexity.
392 : updated bool
393 : // prevPosHadRangeKey records whether the previous Iterator position had a
394 : // range key (HasPointAndRage() = (_, true)). It's updated at the beginning
395 : // of each new Iterator positioning operation. It's required by saveRangeKey to
396 : // to set `updated` appropriately: Without this record of the previous iterator
397 : // state, it's ambiguous whether an iterator only temporarily stepped onto a
398 : // position without a range key.
399 : prevPosHadRangeKey bool
400 : // rangeKeyOnly is set to true if at the current iterator position there is
401 : // no point key, only a range key start boundary.
402 : rangeKeyOnly bool
403 : // hasRangeKey is true when the current iterator position has a covering
404 : // range key (eg, a range key with bounds [<lower>,<upper>) such that
405 : // <lower> ≤ Key() < <upper>).
406 : hasRangeKey bool
407 : // start and end are the [start, end) boundaries of the current range keys.
408 : start []byte
409 : end []byte
410 :
411 : rangeKeyBuffers
412 :
413 : // iterConfig holds fields that are used for the construction of the
414 : // iterator stack, but do not need to be directly accessed during iteration.
415 : // This struct is bundled within the iteratorRangeKeyState struct to reduce
416 : // allocations.
417 : iterConfig rangekeystack.UserIteratorConfig
418 : }
419 :
420 : type rangeKeyBuffers struct {
421 : // keys is sorted by Suffix ascending.
422 : keys []RangeKeyData
423 : // buf is used to save range-key data before moving the range-key iterator.
424 : // Start and end boundaries, suffixes and values are all copied into buf.
425 : buf bytealloc.A
426 : // internal holds buffers used by the range key internal iterators.
427 : internal rangekeystack.Buffers
428 : }
429 :
430 1 : func (b *rangeKeyBuffers) PrepareForReuse() {
431 1 : const maxKeysReuse = 100
432 1 : if len(b.keys) > maxKeysReuse {
433 0 : b.keys = nil
434 0 : }
435 : // Avoid caching the key buf if it is overly large. The constant is
436 : // fairly arbitrary.
437 1 : if cap(b.buf) >= maxKeyBufCacheSize {
438 1 : b.buf = nil
439 1 : } else {
440 1 : b.buf = b.buf[:0]
441 1 : }
442 1 : b.internal.PrepareForReuse()
443 : }
444 :
445 : var iterRangeKeyStateAllocPool = sync.Pool{
446 1 : New: func() interface{} {
447 1 : return &iteratorRangeKeyState{}
448 1 : },
449 : }
450 :
451 : // isEphemeralPosition returns true iff the current iterator position is
452 : // ephemeral, and won't be visited during subsequent relative positioning
453 : // operations.
454 : //
455 : // The iterator position resulting from a SeekGE or SeekPrefixGE that lands on a
456 : // straddling range key without a coincident point key is such a position.
457 1 : func (i *Iterator) isEphemeralPosition() bool {
458 1 : return i.opts.rangeKeys() && i.rangeKey != nil && i.rangeKey.rangeKeyOnly &&
459 1 : !i.equal(i.rangeKey.start, i.key)
460 1 : }
461 :
462 : type lastPositioningOpKind int8
463 :
464 : const (
465 : unknownLastPositionOp lastPositioningOpKind = iota
466 : seekPrefixGELastPositioningOp
467 : seekGELastPositioningOp
468 : seekLTLastPositioningOp
469 : // internalNextOp is a special internal iterator positioning operation used
470 : // by CanDeterministicallySingleDelete. It exists for enforcing requirements
471 : // around calling CanDeterministicallySingleDelete at most once per external
472 : // iterator position.
473 : internalNextOp
474 : )
475 :
476 : // Limited iteration mode. Not for use with prefix iteration.
477 : //
478 : // SeekGE, SeekLT, Prev, Next have WithLimit variants, that pause the iterator
479 : // at the limit in a best-effort manner. The client should behave correctly
480 : // even if the limits are ignored. These limits are not "deep", in that they
481 : // are not passed down to the underlying collection of internalIterators. This
482 : // is because the limits are transient, and apply only until the next
483 : // iteration call. They serve mainly as a way to bound the amount of work when
484 : // two (or more) Iterators are being coordinated at a higher level.
485 : //
486 : // In limited iteration mode:
487 : // - Avoid using Iterator.Valid if the last call was to a *WithLimit() method.
488 : // The return value from the *WithLimit() method provides a more precise
489 : // disposition.
490 : // - The limit is exclusive for forward and inclusive for reverse.
491 : //
492 : //
493 : // Limited iteration mode & range keys
494 : //
495 : // Limited iteration interacts with range-key iteration. When range key
496 : // iteration is enabled, range keys are interleaved at their start boundaries.
497 : // Limited iteration must ensure that if a range key exists within the limit,
498 : // the iterator visits the range key.
499 : //
500 : // During forward limited iteration, this is trivial: An overlapping range key
501 : // must have a start boundary less than the limit, and the range key's start
502 : // boundary will be interleaved and found to be within the limit.
503 : //
504 : // During reverse limited iteration, the tail of the range key may fall within
505 : // the limit. The range key must be surfaced even if the range key's start
506 : // boundary is less than the limit, and if there are no point keys between the
507 : // current iterator position and the limit. To provide this guarantee, reverse
508 : // limited iteration ignores the limit as long as there is a range key
509 : // overlapping the iteration position.
510 :
511 : // IterValidityState captures the state of the Iterator.
512 : type IterValidityState int8
513 :
514 : const (
515 : // IterExhausted represents an Iterator that is exhausted.
516 : IterExhausted IterValidityState = iota
517 : // IterValid represents an Iterator that is valid.
518 : IterValid
519 : // IterAtLimit represents an Iterator that has a non-exhausted
520 : // internalIterator, but has reached a limit without any key for the
521 : // caller.
522 : IterAtLimit
523 : )
524 :
525 : // readSampling stores variables used to sample a read to trigger a read
526 : // compaction
527 : type readSampling struct {
528 : bytesUntilReadSampling uint64
529 : initialSamplePassed bool
530 : pendingCompactions readCompactionQueue
531 : // forceReadSampling is used for testing purposes to force a read sample on every
532 : // call to Iterator.maybeSampleRead()
533 : forceReadSampling bool
534 : }
535 :
536 1 : func (i *Iterator) findNextEntry(limit []byte) {
537 1 : i.iterValidityState = IterExhausted
538 1 : i.pos = iterPosCurForward
539 1 : if i.opts.rangeKeys() && i.rangeKey != nil {
540 1 : i.rangeKey.rangeKeyOnly = false
541 1 : }
542 :
543 : // Close the closer for the current value if one was open.
544 1 : if i.closeValueCloser() != nil {
545 0 : return
546 0 : }
547 :
548 1 : for i.iterKV != nil {
549 1 : key := i.iterKV.K
550 1 :
551 1 : // The topLevelIterator.StrictSeekPrefixGE contract requires that in
552 1 : // prefix mode [i.hasPrefix=t], every point key returned by the internal
553 1 : // iterator must have the current iteration prefix.
554 1 : if invariants.Enabled && i.hasPrefix {
555 1 : // Range keys are an exception to the contract and may return a different
556 1 : // prefix. This case is explicitly handled in the switch statement below.
557 1 : if key.Kind() != base.InternalKeyKindRangeKeySet {
558 1 : if p := i.comparer.Split.Prefix(key.UserKey); !i.equal(i.prefixOrFullSeekKey, p) {
559 0 : i.opts.logger.Fatalf("pebble: prefix violation: key %q does not have prefix %q\n", key.UserKey, i.prefixOrFullSeekKey)
560 0 : }
561 : }
562 : }
563 :
564 : // Compare with limit every time we start at a different user key.
565 : // Note that given the best-effort contract of limit, we could avoid a
566 : // comparison in the common case by doing this only after
567 : // i.nextUserKey is called for the deletes below. However that makes
568 : // the behavior non-deterministic (since the behavior will vary based
569 : // on what has been compacted), which makes it hard to test with the
570 : // metamorphic test. So we forego that performance optimization.
571 1 : if limit != nil && i.cmp(limit, i.iterKV.K.UserKey) <= 0 {
572 1 : i.iterValidityState = IterAtLimit
573 1 : i.pos = iterPosCurForwardPaused
574 1 : return
575 1 : }
576 :
577 : // If the user has configured a SkipPoint function, invoke it to see
578 : // whether we should skip over the current user key.
579 1 : if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(i.iterKV.K.UserKey) {
580 1 : // NB: We could call nextUserKey, but in some cases the SkipPoint
581 1 : // predicate function might be cheaper than nextUserKey's key copy
582 1 : // and key comparison. This should be the case for MVCC suffix
583 1 : // comparisons, for example. In the future, we could expand the
584 1 : // SkipPoint interface to give the implementor more control over
585 1 : // whether we skip over just the internal key, the user key, or even
586 1 : // the key prefix.
587 1 : i.stats.ForwardStepCount[InternalIterCall]++
588 1 : i.iterKV = i.iter.Next()
589 1 : continue
590 : }
591 :
592 1 : switch key.Kind() {
593 1 : case InternalKeyKindRangeKeySet:
594 1 : if i.hasPrefix {
595 1 : if p := i.comparer.Split.Prefix(key.UserKey); !i.equal(i.prefixOrFullSeekKey, p) {
596 1 : return
597 1 : }
598 : }
599 : // Save the current key.
600 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
601 1 : i.key = i.keyBuf
602 1 : i.value = base.InternalValue{}
603 1 : // There may also be a live point key at this userkey that we have
604 1 : // not yet read. We need to find the next entry with this user key
605 1 : // to find it. Save the range key so we don't lose it when we Next
606 1 : // the underlying iterator.
607 1 : i.saveRangeKey()
608 1 : pointKeyExists := i.nextPointCurrentUserKey()
609 1 : if i.err != nil {
610 0 : i.iterValidityState = IterExhausted
611 0 : return
612 0 : }
613 1 : i.rangeKey.rangeKeyOnly = !pointKeyExists
614 1 : i.iterValidityState = IterValid
615 1 : return
616 :
617 1 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
618 1 : // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
619 1 : // only simpler, but is also necessary for correctness due to
620 1 : // InternalKeyKindSSTableInternalObsoleteBit.
621 1 : i.nextUserKey()
622 1 : continue
623 :
624 1 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
625 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
626 1 : i.key = i.keyBuf
627 1 : i.value = i.iterKV.V
628 1 : i.iterValidityState = IterValid
629 1 : i.saveRangeKey()
630 1 : return
631 :
632 1 : case InternalKeyKindMerge:
633 1 : // Resolving the merge may advance us to the next point key, which
634 1 : // may be covered by a different set of range keys. Save the range
635 1 : // key state so we don't lose it.
636 1 : i.saveRangeKey()
637 1 : if i.mergeForward(key) {
638 1 : i.iterValidityState = IterValid
639 1 : return
640 1 : }
641 :
642 : // The merge didn't yield a valid key, either because the value
643 : // merger indicated it should be deleted, or because an error was
644 : // encountered.
645 0 : i.iterValidityState = IterExhausted
646 0 : if i.err != nil {
647 0 : return
648 0 : }
649 0 : if i.pos != iterPosNext {
650 0 : i.nextUserKey()
651 0 : }
652 0 : if i.closeValueCloser() != nil {
653 0 : return
654 0 : }
655 0 : i.pos = iterPosCurForward
656 :
657 0 : default:
658 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
659 0 : i.iterValidityState = IterExhausted
660 0 : return
661 : }
662 : }
663 :
664 : // Is iterKey nil due to an error?
665 1 : if err := i.iter.Error(); err != nil {
666 0 : i.err = err
667 0 : i.iterValidityState = IterExhausted
668 0 : }
669 : }
670 :
671 1 : func (i *Iterator) nextPointCurrentUserKey() bool {
672 1 : // If the user has configured a SkipPoint function and the current user key
673 1 : // would be skipped by it, there's no need to step forward looking for a
674 1 : // point key. If we were to find one, it should be skipped anyways.
675 1 : if i.opts.SkipPoint != nil && i.opts.SkipPoint(i.key) {
676 1 : return false
677 1 : }
678 :
679 1 : i.pos = iterPosCurForward
680 1 :
681 1 : i.iterKV = i.iter.Next()
682 1 : i.stats.ForwardStepCount[InternalIterCall]++
683 1 : if i.iterKV == nil {
684 1 : if err := i.iter.Error(); err != nil {
685 0 : i.err = err
686 1 : } else {
687 1 : i.pos = iterPosNext
688 1 : }
689 1 : return false
690 : }
691 1 : if !i.equal(i.key, i.iterKV.K.UserKey) {
692 1 : i.pos = iterPosNext
693 1 : return false
694 1 : }
695 :
696 1 : key := i.iterKV.K
697 1 : switch key.Kind() {
698 0 : case InternalKeyKindRangeKeySet:
699 0 : // RangeKeySets must always be interleaved as the first internal key
700 0 : // for a user key.
701 0 : i.err = base.CorruptionErrorf("pebble: unexpected range key set mid-user key")
702 0 : return false
703 :
704 1 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
705 1 : // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
706 1 : // only simpler, but is also necessary for correctness due to
707 1 : // InternalKeyKindSSTableInternalObsoleteBit.
708 1 : return false
709 :
710 1 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
711 1 : i.value = i.iterKV.V
712 1 : return true
713 :
714 1 : case InternalKeyKindMerge:
715 1 : return i.mergeForward(key)
716 :
717 0 : default:
718 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
719 0 : return false
720 : }
721 : }
722 :
723 : // mergeForward resolves a MERGE key, advancing the underlying iterator forward
724 : // to merge with subsequent keys with the same userkey. mergeForward returns a
725 : // boolean indicating whether or not the merge yielded a valid key. A merge may
726 : // not yield a valid key if an error occurred, in which case i.err is non-nil,
727 : // or the user's value merger specified the key to be deleted.
728 : //
729 : // mergeForward does not update iterValidityState.
730 1 : func (i *Iterator) mergeForward(key base.InternalKey) (valid bool) {
731 1 : var iterValue []byte
732 1 : iterValue, _, i.err = i.iterKV.Value(nil)
733 1 : if i.err != nil {
734 0 : return false
735 0 : }
736 1 : var valueMerger ValueMerger
737 1 : valueMerger, i.err = i.merge(key.UserKey, iterValue)
738 1 : if i.err != nil {
739 0 : return false
740 0 : }
741 :
742 1 : i.mergeNext(key, valueMerger)
743 1 : if i.err != nil {
744 0 : return false
745 0 : }
746 :
747 1 : var needDelete bool
748 1 : var value []byte
749 1 : value, needDelete, i.valueCloser, i.err = finishValueMerger(
750 1 : valueMerger, true /* includesBase */)
751 1 : i.value = base.MakeInPlaceValue(value)
752 1 : if i.err != nil {
753 0 : return false
754 0 : }
755 1 : if needDelete {
756 0 : _ = i.closeValueCloser()
757 0 : return false
758 0 : }
759 1 : return true
760 : }
761 :
762 1 : func (i *Iterator) closeValueCloser() error {
763 1 : if i.valueCloser != nil {
764 0 : i.err = i.valueCloser.Close()
765 0 : i.valueCloser = nil
766 0 : }
767 1 : return i.err
768 : }
769 :
770 1 : func (i *Iterator) nextUserKey() {
771 1 : if i.iterKV == nil {
772 1 : return
773 1 : }
774 1 : trailer := i.iterKV.K.Trailer
775 1 : done := i.iterKV.K.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
776 1 : if i.iterValidityState != IterValid {
777 1 : i.keyBuf = append(i.keyBuf[:0], i.iterKV.K.UserKey...)
778 1 : i.key = i.keyBuf
779 1 : }
780 1 : for {
781 1 : i.stats.ForwardStepCount[InternalIterCall]++
782 1 : i.iterKV = i.iter.Next()
783 1 : if i.iterKV == nil {
784 1 : if err := i.iter.Error(); err != nil {
785 0 : i.err = err
786 0 : return
787 0 : }
788 : }
789 : // NB: We're guaranteed to be on the next user key if the previous key
790 : // had a zero sequence number (`done`), or the new key has a trailer
791 : // greater or equal to the previous key's trailer. This is true because
792 : // internal keys with the same user key are sorted by InternalKeyTrailer in
793 : // strictly monotonically descending order. We expect the trailer
794 : // optimization to trigger around 50% of the time with randomly
795 : // distributed writes. We expect it to trigger very frequently when
796 : // iterating through ingested sstables, which contain keys that all have
797 : // the same sequence number.
798 1 : if done || i.iterKV == nil || i.iterKV.K.Trailer >= trailer {
799 1 : break
800 : }
801 1 : if !i.equal(i.key, i.iterKV.K.UserKey) {
802 1 : break
803 : }
804 1 : done = i.iterKV.K.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
805 1 : trailer = i.iterKV.K.Trailer
806 : }
807 : }
808 :
809 1 : func (i *Iterator) maybeSampleRead() {
810 1 : // This method is only called when a public method of Iterator is
811 1 : // returning, and below we exclude the case were the iterator is paused at
812 1 : // a limit. The effect of these choices is that keys that are deleted, but
813 1 : // are encountered during iteration, are not accounted for in the read
814 1 : // sampling and will not cause read driven compactions, even though we are
815 1 : // incurring cost in iterating over them. And this issue is not limited to
816 1 : // Iterator, which does not see the effect of range deletes, which may be
817 1 : // causing iteration work in mergingIter. It is not clear at this time
818 1 : // whether this is a deficiency worth addressing.
819 1 : if i.iterValidityState != IterValid {
820 1 : return
821 1 : }
822 1 : if i.readState == nil {
823 1 : return
824 1 : }
825 1 : if i.readSampling.forceReadSampling {
826 0 : i.sampleRead()
827 0 : return
828 0 : }
829 1 : samplingPeriod := int32(int64(readBytesPeriod) * i.readState.db.opts.Experimental.ReadSamplingMultiplier)
830 1 : if samplingPeriod <= 0 {
831 0 : return
832 0 : }
833 1 : bytesRead := uint64(len(i.key) + i.value.Len())
834 1 : for i.readSampling.bytesUntilReadSampling < bytesRead {
835 1 : i.readSampling.bytesUntilReadSampling += uint64(rand.Uint32N(2 * uint32(samplingPeriod)))
836 1 : // The block below tries to adjust for the case where this is the
837 1 : // first read in a newly-opened iterator. As bytesUntilReadSampling
838 1 : // starts off at zero, we don't want to sample the first read of
839 1 : // every newly-opened iterator, but we do want to sample some of them.
840 1 : if !i.readSampling.initialSamplePassed {
841 1 : i.readSampling.initialSamplePassed = true
842 1 : if i.readSampling.bytesUntilReadSampling > bytesRead {
843 1 : if rand.Uint64N(i.readSampling.bytesUntilReadSampling) > bytesRead {
844 1 : continue
845 : }
846 : }
847 : }
848 1 : i.sampleRead()
849 : }
850 1 : i.readSampling.bytesUntilReadSampling -= bytesRead
851 : }
852 :
853 1 : func (i *Iterator) sampleRead() {
854 1 : var topFile *manifest.TableMetadata
855 1 : topLevel, numOverlappingLevels := numLevels, 0
856 1 : mi := i.merging
857 1 : if mi == nil {
858 1 : return
859 1 : }
860 1 : if len(mi.levels) > 1 {
861 1 : mi.ForEachLevelIter(func(li *levelIter) (done bool) {
862 1 : if li.layer.IsFlushableIngests() {
863 0 : return false
864 0 : }
865 1 : l := li.layer.Level()
866 1 : if f := li.iterFile; f != nil {
867 1 : var containsKey bool
868 1 : if i.pos == iterPosNext || i.pos == iterPosCurForward ||
869 1 : i.pos == iterPosCurForwardPaused {
870 1 : containsKey = i.cmp(f.PointKeyBounds.SmallestUserKey(), i.key) <= 0
871 1 : } else if i.pos == iterPosPrev || i.pos == iterPosCurReverse ||
872 1 : i.pos == iterPosCurReversePaused {
873 1 : containsKey = i.cmp(f.PointKeyBounds.LargestUserKey(), i.key) >= 0
874 1 : }
875 : // Do nothing if the current key is not contained in f's
876 : // bounds. We could seek the LevelIterator at this level
877 : // to find the right file, but the performance impacts of
878 : // doing that are significant enough to negate the benefits
879 : // of read sampling in the first place. See the discussion
880 : // at:
881 : // https://github.com/cockroachdb/pebble/pull/1041#issuecomment-763226492
882 1 : if containsKey {
883 1 : numOverlappingLevels++
884 1 : if numOverlappingLevels >= 2 {
885 1 : // Terminate the loop early if at least 2 overlapping levels are found.
886 1 : return true
887 1 : }
888 1 : topLevel = l
889 1 : topFile = f
890 : }
891 : }
892 1 : return false
893 : })
894 : }
895 1 : if topFile == nil || topLevel >= numLevels {
896 1 : return
897 1 : }
898 1 : if numOverlappingLevels >= 2 {
899 1 : allowedSeeks := topFile.AllowedSeeks.Add(-1)
900 1 : if allowedSeeks == 0 {
901 0 :
902 0 : // Since the compaction queue can handle duplicates, we can keep
903 0 : // adding to the queue even once allowedSeeks hits 0.
904 0 : // In fact, we NEED to keep adding to the queue, because the queue
905 0 : // is small and evicts older and possibly useful compactions.
906 0 : topFile.AllowedSeeks.Add(topFile.InitAllowedSeeks)
907 0 :
908 0 : read := readCompaction{
909 0 : start: topFile.PointKeyBounds.SmallestUserKey(),
910 0 : end: topFile.PointKeyBounds.LargestUserKey(),
911 0 : level: topLevel,
912 0 : tableNum: topFile.TableNum,
913 0 : }
914 0 : i.readSampling.pendingCompactions.add(&read, i.cmp)
915 0 : }
916 : }
917 : }
918 :
919 1 : func (i *Iterator) findPrevEntry(limit []byte) {
920 1 : i.iterValidityState = IterExhausted
921 1 : i.pos = iterPosCurReverse
922 1 : if i.opts.rangeKeys() && i.rangeKey != nil {
923 1 : i.rangeKey.rangeKeyOnly = false
924 1 : }
925 :
926 : // Close the closer for the current value if one was open.
927 1 : if i.valueCloser != nil {
928 0 : i.err = i.valueCloser.Close()
929 0 : i.valueCloser = nil
930 0 : if i.err != nil {
931 0 : i.iterValidityState = IterExhausted
932 0 : return
933 0 : }
934 : }
935 :
936 1 : var valueMerger ValueMerger
937 1 : firstLoopIter := true
938 1 : rangeKeyBoundary := false
939 1 : // The code below compares with limit in multiple places. As documented in
940 1 : // findNextEntry, this is being done to make the behavior of limit
941 1 : // deterministic to allow for metamorphic testing. It is not required by
942 1 : // the best-effort contract of limit.
943 1 : for i.iterKV != nil {
944 1 : key := i.iterKV.K
945 1 :
946 1 : // NB: We cannot pause if the current key is covered by a range key.
947 1 : // Otherwise, the user might not ever learn of a range key that covers
948 1 : // the key space being iterated over in which there are no point keys.
949 1 : // Since limits are best effort, ignoring the limit in this case is
950 1 : // allowed by the contract of limit.
951 1 : if firstLoopIter && limit != nil && i.cmp(limit, i.iterKV.K.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
952 1 : i.iterValidityState = IterAtLimit
953 1 : i.pos = iterPosCurReversePaused
954 1 : return
955 1 : }
956 1 : firstLoopIter = false
957 1 :
958 1 : if i.iterValidityState == IterValid {
959 1 : if !i.equal(key.UserKey, i.key) {
960 1 : // We've iterated to the previous user key.
961 1 : i.pos = iterPosPrev
962 1 : if valueMerger != nil {
963 1 : var needDelete bool
964 1 : var value []byte
965 1 : value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
966 1 : i.value = base.MakeInPlaceValue(value)
967 1 : if i.err == nil && needDelete {
968 0 : // The point key at this key is deleted. If we also have
969 0 : // a range key boundary at this key, we still want to
970 0 : // return. Otherwise, we need to continue looking for
971 0 : // a live key.
972 0 : i.value = base.InternalValue{}
973 0 : if rangeKeyBoundary {
974 0 : i.rangeKey.rangeKeyOnly = true
975 0 : } else {
976 0 : i.iterValidityState = IterExhausted
977 0 : if i.closeValueCloser() == nil {
978 0 : continue
979 : }
980 : }
981 : }
982 : }
983 1 : if i.err != nil {
984 0 : i.iterValidityState = IterExhausted
985 0 : }
986 1 : return
987 : }
988 : }
989 :
990 : // If the user has configured a SkipPoint function, invoke it to see
991 : // whether we should skip over the current user key.
992 1 : if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(key.UserKey) {
993 1 : // NB: We could call prevUserKey, but in some cases the SkipPoint
994 1 : // predicate function might be cheaper than prevUserKey's key copy
995 1 : // and key comparison. This should be the case for MVCC suffix
996 1 : // comparisons, for example. In the future, we could expand the
997 1 : // SkipPoint interface to give the implementor more control over
998 1 : // whether we skip over just the internal key, the user key, or even
999 1 : // the key prefix.
1000 1 : i.stats.ReverseStepCount[InternalIterCall]++
1001 1 : i.iterKV = i.iter.Prev()
1002 1 : if i.iterKV == nil {
1003 1 : if err := i.iter.Error(); err != nil {
1004 0 : i.err = err
1005 0 : i.iterValidityState = IterExhausted
1006 0 : return
1007 0 : }
1008 : }
1009 1 : if limit != nil && i.iterKV != nil && i.cmp(limit, i.iterKV.K.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
1010 1 : i.iterValidityState = IterAtLimit
1011 1 : i.pos = iterPosCurReversePaused
1012 1 : return
1013 1 : }
1014 1 : continue
1015 : }
1016 :
1017 1 : switch key.Kind() {
1018 1 : case InternalKeyKindRangeKeySet:
1019 1 : // Range key start boundary markers are interleaved with the maximum
1020 1 : // sequence number, so if there's a point key also at this key, we
1021 1 : // must've already iterated over it.
1022 1 : // This is the final entry at this user key, so we may return
1023 1 : i.rangeKey.rangeKeyOnly = i.iterValidityState != IterValid
1024 1 : if i.rangeKey.rangeKeyOnly {
1025 1 : // The point iterator is now invalid, so clear the point value.
1026 1 : i.value = base.InternalValue{}
1027 1 : }
1028 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
1029 1 : i.key = i.keyBuf
1030 1 : i.iterValidityState = IterValid
1031 1 : i.saveRangeKey()
1032 1 : // In all other cases, previous iteration requires advancing to
1033 1 : // iterPosPrev in order to determine if the key is live and
1034 1 : // unshadowed by another key at the same user key. In this case,
1035 1 : // because range key start boundary markers are always interleaved
1036 1 : // at the maximum sequence number, we know that there aren't any
1037 1 : // additional keys with the same user key in the backward direction.
1038 1 : //
1039 1 : // We Prev the underlying iterator once anyways for consistency, so
1040 1 : // that we can maintain the invariant during backward iteration that
1041 1 : // i.iterPos = iterPosPrev.
1042 1 : i.stats.ReverseStepCount[InternalIterCall]++
1043 1 : i.iterKV = i.iter.Prev()
1044 1 :
1045 1 : // Set rangeKeyBoundary so that on the next iteration, we know to
1046 1 : // return the key even if the MERGE point key is deleted.
1047 1 : rangeKeyBoundary = true
1048 :
1049 1 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
1050 1 : i.value = base.InternalValue{}
1051 1 : i.iterValidityState = IterExhausted
1052 1 : valueMerger = nil
1053 1 : i.stats.ReverseStepCount[InternalIterCall]++
1054 1 : i.iterKV = i.iter.Prev()
1055 1 : // Compare with the limit. We could optimize by only checking when
1056 1 : // we step to the previous user key, but detecting that requires a
1057 1 : // comparison too. Note that this position may already passed a
1058 1 : // number of versions of this user key, but they are all deleted, so
1059 1 : // the fact that a subsequent Prev*() call will not see them is
1060 1 : // harmless. Also note that this is the only place in the loop,
1061 1 : // other than the firstLoopIter and SkipPoint cases above, where we
1062 1 : // could step to a different user key and start processing it for
1063 1 : // returning to the caller.
1064 1 : if limit != nil && i.iterKV != nil && i.cmp(limit, i.iterKV.K.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
1065 1 : i.iterValidityState = IterAtLimit
1066 1 : i.pos = iterPosCurReversePaused
1067 1 : return
1068 1 : }
1069 1 : continue
1070 :
1071 1 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
1072 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
1073 1 : i.key = i.keyBuf
1074 1 : // iterValue is owned by i.iter and could change after the Prev()
1075 1 : // call, so use valueBuf instead. Note that valueBuf is only used
1076 1 : // in this one instance; everywhere else (eg. in findNextEntry),
1077 1 : // we just point i.value to the unsafe i.iter-owned value buffer.
1078 1 : i.value, i.valueBuf = i.iterKV.V.Clone(i.valueBuf[:0], &i.fetcher)
1079 1 : i.saveRangeKey()
1080 1 : i.iterValidityState = IterValid
1081 1 : i.iterKV = i.iter.Prev()
1082 1 : i.stats.ReverseStepCount[InternalIterCall]++
1083 1 : valueMerger = nil
1084 1 : continue
1085 :
1086 1 : case InternalKeyKindMerge:
1087 1 : if i.iterValidityState == IterExhausted {
1088 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
1089 1 : i.key = i.keyBuf
1090 1 : i.saveRangeKey()
1091 1 : var iterValue []byte
1092 1 : iterValue, _, i.err = i.iterKV.Value(nil)
1093 1 : if i.err != nil {
1094 0 : return
1095 0 : }
1096 1 : valueMerger, i.err = i.merge(i.key, iterValue)
1097 1 : if i.err != nil {
1098 0 : return
1099 0 : }
1100 1 : i.iterValidityState = IterValid
1101 1 : } else if valueMerger == nil {
1102 1 : // Extract value before iterValue since we use value before iterValue
1103 1 : // and the underlying iterator is not required to provide backing
1104 1 : // memory for both simultaneously.
1105 1 : var value []byte
1106 1 : var callerOwned bool
1107 1 : value, callerOwned, i.err = i.value.Value(i.lazyValueBuf)
1108 1 : if callerOwned {
1109 0 : i.lazyValueBuf = value[:0]
1110 0 : }
1111 1 : if i.err != nil {
1112 0 : i.iterValidityState = IterExhausted
1113 0 : return
1114 0 : }
1115 1 : valueMerger, i.err = i.merge(i.key, value)
1116 1 : var iterValue []byte
1117 1 : iterValue, _, i.err = i.iterKV.Value(nil)
1118 1 : if i.err != nil {
1119 0 : i.iterValidityState = IterExhausted
1120 0 : return
1121 0 : }
1122 1 : if i.err == nil {
1123 1 : i.err = valueMerger.MergeNewer(iterValue)
1124 1 : }
1125 1 : if i.err != nil {
1126 0 : i.iterValidityState = IterExhausted
1127 0 : return
1128 0 : }
1129 1 : } else {
1130 1 : var iterValue []byte
1131 1 : iterValue, _, i.err = i.iterKV.Value(nil)
1132 1 : if i.err != nil {
1133 0 : i.iterValidityState = IterExhausted
1134 0 : return
1135 0 : }
1136 1 : i.err = valueMerger.MergeNewer(iterValue)
1137 1 : if i.err != nil {
1138 0 : i.iterValidityState = IterExhausted
1139 0 : return
1140 0 : }
1141 : }
1142 1 : i.iterKV = i.iter.Prev()
1143 1 : i.stats.ReverseStepCount[InternalIterCall]++
1144 1 : continue
1145 :
1146 0 : default:
1147 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
1148 0 : i.iterValidityState = IterExhausted
1149 0 : return
1150 : }
1151 : }
1152 : // i.iterKV == nil, so broke out of the preceding loop.
1153 :
1154 : // Is iterKey nil due to an error?
1155 1 : if i.err = i.iter.Error(); i.err != nil {
1156 0 : i.iterValidityState = IterExhausted
1157 0 : return
1158 0 : }
1159 :
1160 1 : if i.iterValidityState == IterValid {
1161 1 : i.pos = iterPosPrev
1162 1 : if valueMerger != nil {
1163 1 : var needDelete bool
1164 1 : var value []byte
1165 1 : value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
1166 1 : i.value = base.MakeInPlaceValue(value)
1167 1 : if i.err == nil && needDelete {
1168 0 : i.key = nil
1169 0 : i.value = base.InternalValue{}
1170 0 : i.iterValidityState = IterExhausted
1171 0 : }
1172 : }
1173 1 : if i.err != nil {
1174 0 : i.iterValidityState = IterExhausted
1175 0 : }
1176 : }
1177 : }
1178 :
1179 1 : func (i *Iterator) prevUserKey() {
1180 1 : if i.iterKV == nil {
1181 1 : return
1182 1 : }
1183 1 : if i.iterValidityState != IterValid {
1184 1 : // If we're going to compare against the prev key, we need to save the
1185 1 : // current key.
1186 1 : i.keyBuf = append(i.keyBuf[:0], i.iterKV.K.UserKey...)
1187 1 : i.key = i.keyBuf
1188 1 : }
1189 1 : for {
1190 1 : i.iterKV = i.iter.Prev()
1191 1 : i.stats.ReverseStepCount[InternalIterCall]++
1192 1 : if i.iterKV == nil {
1193 1 : if err := i.iter.Error(); err != nil {
1194 0 : i.err = err
1195 0 : i.iterValidityState = IterExhausted
1196 0 : }
1197 1 : break
1198 : }
1199 1 : if !i.equal(i.key, i.iterKV.K.UserKey) {
1200 1 : break
1201 : }
1202 : }
1203 : }
1204 :
1205 1 : func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) {
1206 1 : // Save the current key.
1207 1 : i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
1208 1 : i.key = i.keyBuf
1209 1 :
1210 1 : // Loop looking for older values for this key and merging them.
1211 1 : for {
1212 1 : i.iterKV = i.iter.Next()
1213 1 : i.stats.ForwardStepCount[InternalIterCall]++
1214 1 : if i.iterKV == nil {
1215 1 : if i.err = i.iter.Error(); i.err != nil {
1216 0 : return
1217 0 : }
1218 1 : i.pos = iterPosNext
1219 1 : return
1220 : }
1221 1 : key = i.iterKV.K
1222 1 : if !i.equal(i.key, key.UserKey) {
1223 1 : // We've advanced to the next key.
1224 1 : i.pos = iterPosNext
1225 1 : return
1226 1 : }
1227 1 : switch key.Kind() {
1228 1 : case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
1229 1 : // We've hit a deletion tombstone. Return everything up to this
1230 1 : // point.
1231 1 : //
1232 1 : // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
1233 1 : // only simpler, but is also necessary for correctness due to
1234 1 : // InternalKeyKindSSTableInternalObsoleteBit.
1235 1 : return
1236 :
1237 1 : case InternalKeyKindSet, InternalKeyKindSetWithDelete:
1238 1 : // We've hit a Set value. Merge with the existing value and return.
1239 1 : var iterValue []byte
1240 1 : iterValue, _, i.err = i.iterKV.Value(nil)
1241 1 : if i.err != nil {
1242 0 : return
1243 0 : }
1244 1 : i.err = valueMerger.MergeOlder(iterValue)
1245 1 : return
1246 :
1247 1 : case InternalKeyKindMerge:
1248 1 : // We've hit another Merge value. Merge with the existing value and
1249 1 : // continue looping.
1250 1 : var iterValue []byte
1251 1 : iterValue, _, i.err = i.iterKV.Value(nil)
1252 1 : if i.err != nil {
1253 0 : return
1254 0 : }
1255 1 : i.err = valueMerger.MergeOlder(iterValue)
1256 1 : if i.err != nil {
1257 0 : return
1258 0 : }
1259 1 : continue
1260 :
1261 0 : case InternalKeyKindRangeKeySet:
1262 0 : // The RANGEKEYSET marker must sort before a MERGE at the same user key.
1263 0 : i.err = base.CorruptionErrorf("pebble: out of order range key marker")
1264 0 : return
1265 :
1266 0 : default:
1267 0 : i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
1268 0 : return
1269 : }
1270 : }
1271 : }
1272 :
1273 : // SeekGE moves the iterator to the first key/value pair whose key is greater
1274 : // than or equal to the given key. Returns true if the iterator is pointing at
1275 : // a valid entry and false otherwise.
1276 1 : func (i *Iterator) SeekGE(key []byte) bool {
1277 1 : return i.SeekGEWithLimit(key, nil) == IterValid
1278 1 : }
1279 :
1280 : // SeekGEWithLimit moves the iterator to the first key/value pair whose key is
1281 : // greater than or equal to the given key.
1282 : //
1283 : // If limit is provided, it serves as a best-effort exclusive limit. If the
1284 : // first key greater than or equal to the given search key is also greater than
1285 : // or equal to limit, the Iterator may pause and return IterAtLimit. Because
1286 : // limits are best-effort, SeekGEWithLimit may return a key beyond limit.
1287 : //
1288 : // If the Iterator is configured to iterate over range keys, SeekGEWithLimit
1289 : // guarantees it will surface any range keys with bounds overlapping the
1290 : // keyspace [key, limit).
1291 1 : func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState {
1292 1 : if i.rangeKey != nil {
1293 1 : // NB: Check Valid() before clearing requiresReposition.
1294 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1295 1 : // If we have a range key but did not expose it at the previous iterator
1296 1 : // position (because the iterator was not at a valid position), updated
1297 1 : // must be true. This ensures that after an iterator op sequence like:
1298 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1299 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1300 1 : // - SeekGE(...) → (IterValid, RangeBounds() = [a,b))
1301 1 : // the iterator returns RangeKeyChanged()=true.
1302 1 : //
1303 1 : // The remainder of this function will only update i.rangeKey.updated if
1304 1 : // the iterator moves into a new range key, or out of the current range
1305 1 : // key.
1306 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1307 1 : }
1308 1 : lastPositioningOp := i.lastPositioningOp
1309 1 : // Set it to unknown, since this operation may not succeed, in which case
1310 1 : // the SeekGE following this should not make any assumption about iterator
1311 1 : // position.
1312 1 : i.lastPositioningOp = unknownLastPositionOp
1313 1 : i.requiresReposition = false
1314 1 : i.err = nil // clear cached iteration error
1315 1 : i.hasPrefix = false
1316 1 : i.stats.ForwardSeekCount[InterfaceCall]++
1317 1 : if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1318 1 : key = lowerBound
1319 1 : } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1320 1 : key = upperBound
1321 1 : }
1322 1 : seekInternalIter := true
1323 1 :
1324 1 : var flags base.SeekGEFlags
1325 1 : if i.batchJustRefreshed {
1326 0 : i.batchJustRefreshed = false
1327 0 : flags = flags.EnableBatchJustRefreshed()
1328 0 : }
1329 1 : if lastPositioningOp == seekGELastPositioningOp {
1330 1 : cmp := i.cmp(i.prefixOrFullSeekKey, key)
1331 1 : // If this seek is to the same or later key, and the iterator is
1332 1 : // already positioned there, this is a noop. This can be helpful for
1333 1 : // sparse key spaces that have many deleted keys, where one can avoid
1334 1 : // the overhead of iterating past them again and again.
1335 1 : if cmp <= 0 {
1336 1 : if !flags.BatchJustRefreshed() &&
1337 1 : (i.iterValidityState == IterExhausted ||
1338 1 : (i.iterValidityState == IterValid && i.cmp(key, i.key) <= 0 &&
1339 1 : (limit == nil || i.cmp(i.key, limit) < 0))) {
1340 1 : // Noop
1341 1 : if i.forceEnableSeekOpt || !testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1342 1 : i.lastPositioningOp = seekGELastPositioningOp
1343 1 : return i.iterValidityState
1344 1 : }
1345 : }
1346 : // cmp == 0 is not safe to optimize since
1347 : // - i.pos could be at iterPosNext, due to a merge.
1348 : // - Even if i.pos were at iterPosCurForward, we could have a DELETE,
1349 : // SET pair for a key, and the iterator would have moved past DELETE
1350 : // but stayed at iterPosCurForward. A similar situation occurs for a
1351 : // MERGE, SET pair where the MERGE is consumed and the iterator is
1352 : // at the SET.
1353 : // We also leverage the IterAtLimit <=> i.pos invariant defined in the
1354 : // comment on iterValidityState, to exclude any cases where i.pos
1355 : // is iterPosCur{Forward,Reverse}Paused. This avoids the need to
1356 : // special-case those iterator positions and their interactions with
1357 : // TrySeekUsingNext, as the main uses for TrySeekUsingNext in CockroachDB
1358 : // do not use limited Seeks in the first place.
1359 1 : if cmp < 0 && i.iterValidityState != IterAtLimit && limit == nil {
1360 1 : flags = flags.EnableTrySeekUsingNext()
1361 1 : }
1362 1 : if testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) && !i.forceEnableSeekOpt {
1363 1 : flags = flags.DisableTrySeekUsingNext()
1364 1 : }
1365 1 : if !flags.BatchJustRefreshed() && i.pos == iterPosCurForwardPaused && i.cmp(key, i.iterKV.K.UserKey) <= 0 {
1366 1 : // Have some work to do, but don't need to seek, and we can
1367 1 : // start doing findNextEntry from i.iterKey.
1368 1 : seekInternalIter = false
1369 1 : }
1370 : }
1371 : }
1372 1 : if seekInternalIter {
1373 1 : i.iterKV = i.iter.SeekGE(key, flags)
1374 1 : i.stats.ForwardSeekCount[InternalIterCall]++
1375 1 : if err := i.iter.Error(); err != nil {
1376 0 : i.err = err
1377 0 : i.iterValidityState = IterExhausted
1378 0 : return i.iterValidityState
1379 0 : }
1380 : }
1381 1 : i.findNextEntry(limit)
1382 1 : i.maybeSampleRead()
1383 1 : if i.Error() == nil {
1384 1 : // Prepare state for a future noop optimization.
1385 1 : i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
1386 1 : i.lastPositioningOp = seekGELastPositioningOp
1387 1 : }
1388 1 : return i.iterValidityState
1389 : }
1390 :
1391 : // SeekPrefixGE moves the iterator to the first key/value pair whose key is
1392 : // greater than or equal to the given key and which has the same "prefix" as
1393 : // the given key. The prefix for a key is determined by the user-defined
1394 : // Comparer.Split function. The iterator will not observe keys not matching the
1395 : // "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix
1396 : // iteration mode. The iterator remains in prefix iteration until a subsequent
1397 : // call to another absolute positioning method (SeekGE, SeekLT, First,
1398 : // Last). Reverse iteration (Prev) is not supported when an iterator is in
1399 : // prefix iteration mode. Returns true if the iterator is pointing at a valid
1400 : // entry and false otherwise.
1401 : //
1402 : // The semantics of SeekPrefixGE are slightly unusual and designed for
1403 : // iteration to be able to take advantage of bloom filters that have been
1404 : // created on the "prefix". If you're not using bloom filters, there is no
1405 : // reason to use SeekPrefixGE.
1406 : //
1407 : // An example Split function may separate a timestamp suffix from the prefix of
1408 : // the key.
1409 : //
1410 : // Split(<key>@<timestamp>) -> <key>
1411 : //
1412 : // Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys
1413 : // are "a", and "aa". Note that despite "a" and "aa" sharing a prefix by the
1414 : // usual definition, those prefixes differ by the definition of the Split
1415 : // function. To see how this works, consider the following set of calls on this
1416 : // data set:
1417 : //
1418 : // SeekPrefixGE("a@0") -> "a@1"
1419 : // Next() -> "a@2"
1420 : // Next() -> EOF
1421 : //
1422 : // If you're just looking to iterate over keys with a shared prefix, as
1423 : // defined by the configured comparer, set iterator bounds instead:
1424 : //
1425 : // iter := db.NewIter(&pebble.IterOptions{
1426 : // LowerBound: []byte("prefix"),
1427 : // UpperBound: []byte("prefiy"),
1428 : // })
1429 : // for iter.First(); iter.Valid(); iter.Next() {
1430 : // // Only keys beginning with "prefix" will be visited.
1431 : // }
1432 : //
1433 : // See ExampleIterator_SeekPrefixGE for a working example.
1434 : //
1435 : // When iterating with range keys enabled, all range keys encountered are
1436 : // truncated to the seek key's prefix's bounds. The truncation of the upper
1437 : // bound requires that the database's Comparer is configured with a
1438 : // ImmediateSuccessor method. For example, a SeekPrefixGE("a@9") call with the
1439 : // prefix "a" will truncate range key bounds to [a,ImmediateSuccessor(a)].
1440 1 : func (i *Iterator) SeekPrefixGE(key []byte) bool {
1441 1 : if i.rangeKey != nil {
1442 1 : // NB: Check Valid() before clearing requiresReposition.
1443 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1444 1 : // If we have a range key but did not expose it at the previous iterator
1445 1 : // position (because the iterator was not at a valid position), updated
1446 1 : // must be true. This ensures that after an iterator op sequence like:
1447 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1448 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1449 1 : // - SeekPrefixGE(...) → (IterValid, RangeBounds() = [a,b))
1450 1 : // the iterator returns RangeKeyChanged()=true.
1451 1 : //
1452 1 : // The remainder of this function will only update i.rangeKey.updated if
1453 1 : // the iterator moves into a new range key, or out of the current range
1454 1 : // key.
1455 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1456 1 : }
1457 1 : lastPositioningOp := i.lastPositioningOp
1458 1 : // Set it to unknown, since this operation may not succeed, in which case
1459 1 : // the SeekPrefixGE following this should not make any assumption about
1460 1 : // iterator position.
1461 1 : i.lastPositioningOp = unknownLastPositionOp
1462 1 : i.requiresReposition = false
1463 1 : i.err = nil // clear cached iteration error
1464 1 : i.stats.ForwardSeekCount[InterfaceCall]++
1465 1 : if i.comparer.ImmediateSuccessor == nil && i.opts.KeyTypes != IterKeyTypePointsOnly {
1466 0 : panic("pebble: ImmediateSuccessor must be provided for SeekPrefixGE with range keys")
1467 : }
1468 1 : prefixLen := i.comparer.Split(key)
1469 1 : keyPrefix := key[:prefixLen]
1470 1 : var flags base.SeekGEFlags
1471 1 : if i.batchJustRefreshed {
1472 0 : flags = flags.EnableBatchJustRefreshed()
1473 0 : i.batchJustRefreshed = false
1474 0 : }
1475 1 : if lastPositioningOp == seekPrefixGELastPositioningOp {
1476 1 : if !i.hasPrefix {
1477 0 : panic("lastPositioningOpsIsSeekPrefixGE is true, but hasPrefix is false")
1478 : }
1479 : // The iterator has not been repositioned after the last SeekPrefixGE.
1480 : // See if we are seeking to a larger key, since then we can optimize
1481 : // the seek by using next. Note that we could also optimize if Next
1482 : // has been called, if the iterator is not exhausted and the current
1483 : // position is <= the seek key. We are keeping this limited for now
1484 : // since such optimizations require care for correctness, and to not
1485 : // become de-optimizations (if one usually has to do all the next
1486 : // calls and then the seek). This SeekPrefixGE optimization
1487 : // specifically benefits CockroachDB.
1488 1 : cmp := i.cmp(i.prefixOrFullSeekKey, keyPrefix)
1489 1 : // cmp == 0 is not safe to optimize since
1490 1 : // - i.pos could be at iterPosNext, due to a merge.
1491 1 : // - Even if i.pos were at iterPosCurForward, we could have a DELETE,
1492 1 : // SET pair for a key, and the iterator would have moved past DELETE
1493 1 : // but stayed at iterPosCurForward. A similar situation occurs for a
1494 1 : // MERGE, SET pair where the MERGE is consumed and the iterator is
1495 1 : // at the SET.
1496 1 : // In general some versions of i.prefix could have been consumed by
1497 1 : // the iterator, so we only optimize for cmp < 0.
1498 1 : if cmp < 0 {
1499 1 : flags = flags.EnableTrySeekUsingNext()
1500 1 : }
1501 1 : if testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) && !i.forceEnableSeekOpt {
1502 1 : flags = flags.DisableTrySeekUsingNext()
1503 1 : }
1504 : }
1505 : // Make a copy of the prefix so that modifications to the key after
1506 : // SeekPrefixGE returns does not affect the stored prefix.
1507 1 : if cap(i.prefixOrFullSeekKey) < prefixLen {
1508 1 : i.prefixOrFullSeekKey = make([]byte, prefixLen)
1509 1 : } else {
1510 1 : i.prefixOrFullSeekKey = i.prefixOrFullSeekKey[:prefixLen]
1511 1 : }
1512 1 : i.hasPrefix = true
1513 1 : copy(i.prefixOrFullSeekKey, keyPrefix)
1514 1 :
1515 1 : if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1516 1 : if p := i.comparer.Split.Prefix(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, p) {
1517 1 : i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of lower bound")
1518 1 : i.iterValidityState = IterExhausted
1519 1 : return false
1520 1 : }
1521 1 : key = lowerBound
1522 1 : } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1523 1 : if p := i.comparer.Split.Prefix(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, p) {
1524 1 : i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of upper bound")
1525 1 : i.iterValidityState = IterExhausted
1526 1 : return false
1527 1 : }
1528 1 : key = upperBound
1529 : }
1530 1 : i.iterKV = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags)
1531 1 : i.stats.ForwardSeekCount[InternalIterCall]++
1532 1 : i.findNextEntry(nil)
1533 1 : i.maybeSampleRead()
1534 1 : if i.Error() == nil {
1535 1 : i.lastPositioningOp = seekPrefixGELastPositioningOp
1536 1 : }
1537 1 : return i.iterValidityState == IterValid
1538 : }
1539 :
1540 : // Deterministic disabling (in testing mode) of the seek optimizations. It uses
1541 : // the iterator pointer, since we want diversity in iterator behavior for the
1542 : // same key. Used for tests.
1543 1 : func testingDisableSeekOpt(key []byte, ptr uintptr) bool {
1544 1 : if !invariants.Enabled {
1545 0 : return false
1546 0 : }
1547 : // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
1548 1 : simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
1549 1 : return key != nil && key[0]&byte(1) == 0 && simpleHash == 0
1550 : }
1551 :
1552 : // SeekLT moves the iterator to the last key/value pair whose key is less than
1553 : // the given key. Returns true if the iterator is pointing at a valid entry and
1554 : // false otherwise.
1555 1 : func (i *Iterator) SeekLT(key []byte) bool {
1556 1 : return i.SeekLTWithLimit(key, nil) == IterValid
1557 1 : }
1558 :
1559 : // SeekLTWithLimit moves the iterator to the last key/value pair whose key is
1560 : // less than the given key.
1561 : //
1562 : // If limit is provided, it serves as a best-effort inclusive limit. If the last
1563 : // key less than the given search key is also less than limit, the Iterator may
1564 : // pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit
1565 : // may return a key beyond limit.
1566 : //
1567 : // If the Iterator is configured to iterate over range keys, SeekLTWithLimit
1568 : // guarantees it will surface any range keys with bounds overlapping the
1569 : // keyspace up to limit.
1570 1 : func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState {
1571 1 : if i.rangeKey != nil {
1572 1 : // NB: Check Valid() before clearing requiresReposition.
1573 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1574 1 : // If we have a range key but did not expose it at the previous iterator
1575 1 : // position (because the iterator was not at a valid position), updated
1576 1 : // must be true. This ensures that after an iterator op sequence like:
1577 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1578 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1579 1 : // - SeekLTWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1580 1 : // the iterator returns RangeKeyChanged()=true.
1581 1 : //
1582 1 : // The remainder of this function will only update i.rangeKey.updated if
1583 1 : // the iterator moves into a new range key, or out of the current range
1584 1 : // key.
1585 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1586 1 : }
1587 1 : lastPositioningOp := i.lastPositioningOp
1588 1 : // Set it to unknown, since this operation may not succeed, in which case
1589 1 : // the SeekLT following this should not make any assumption about iterator
1590 1 : // position.
1591 1 : i.lastPositioningOp = unknownLastPositionOp
1592 1 : i.batchJustRefreshed = false
1593 1 : i.requiresReposition = false
1594 1 : i.err = nil // clear cached iteration error
1595 1 : i.stats.ReverseSeekCount[InterfaceCall]++
1596 1 : if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1597 1 : key = upperBound
1598 1 : } else if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1599 1 : key = lowerBound
1600 1 : }
1601 1 : i.hasPrefix = false
1602 1 : seekInternalIter := true
1603 1 : // The following noop optimization only applies when i.batch == nil, since
1604 1 : // an iterator over a batch is iterating over mutable data, that may have
1605 1 : // changed since the last seek.
1606 1 : if lastPositioningOp == seekLTLastPositioningOp && i.batch == nil {
1607 1 : cmp := i.cmp(key, i.prefixOrFullSeekKey)
1608 1 : // If this seek is to the same or earlier key, and the iterator is
1609 1 : // already positioned there, this is a noop. This can be helpful for
1610 1 : // sparse key spaces that have many deleted keys, where one can avoid
1611 1 : // the overhead of iterating past them again and again.
1612 1 : if cmp <= 0 {
1613 1 : // NB: when pos != iterPosCurReversePaused, the invariant
1614 1 : // documented earlier implies that iterValidityState !=
1615 1 : // IterAtLimit.
1616 1 : if i.iterValidityState == IterExhausted ||
1617 1 : (i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 &&
1618 1 : (limit == nil || i.cmp(limit, i.key) <= 0)) {
1619 1 : if !testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1620 1 : i.lastPositioningOp = seekLTLastPositioningOp
1621 1 : return i.iterValidityState
1622 1 : }
1623 : }
1624 1 : if i.pos == iterPosCurReversePaused && i.cmp(i.iterKV.K.UserKey, key) < 0 {
1625 1 : // Have some work to do, but don't need to seek, and we can
1626 1 : // start doing findPrevEntry from i.iterKey.
1627 1 : seekInternalIter = false
1628 1 : }
1629 : }
1630 : }
1631 1 : if seekInternalIter {
1632 1 : i.iterKV = i.iter.SeekLT(key, base.SeekLTFlagsNone)
1633 1 : i.stats.ReverseSeekCount[InternalIterCall]++
1634 1 : if err := i.iter.Error(); err != nil {
1635 0 : i.err = err
1636 0 : i.iterValidityState = IterExhausted
1637 0 : return i.iterValidityState
1638 0 : }
1639 : }
1640 1 : i.findPrevEntry(limit)
1641 1 : i.maybeSampleRead()
1642 1 : if i.Error() == nil && i.batch == nil {
1643 1 : // Prepare state for a future noop optimization.
1644 1 : i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
1645 1 : i.lastPositioningOp = seekLTLastPositioningOp
1646 1 : }
1647 1 : return i.iterValidityState
1648 : }
1649 :
1650 : // First moves the iterator the first key/value pair. Returns true if the
1651 : // iterator is pointing at a valid entry and false otherwise.
1652 1 : func (i *Iterator) First() bool {
1653 1 : if i.rangeKey != nil {
1654 1 : // NB: Check Valid() before clearing requiresReposition.
1655 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1656 1 : // If we have a range key but did not expose it at the previous iterator
1657 1 : // position (because the iterator was not at a valid position), updated
1658 1 : // must be true. This ensures that after an iterator op sequence like:
1659 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1660 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1661 1 : // - First(...) → (IterValid, RangeBounds() = [a,b))
1662 1 : // the iterator returns RangeKeyChanged()=true.
1663 1 : //
1664 1 : // The remainder of this function will only update i.rangeKey.updated if
1665 1 : // the iterator moves into a new range key, or out of the current range
1666 1 : // key.
1667 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1668 1 : }
1669 1 : i.err = nil // clear cached iteration error
1670 1 : i.hasPrefix = false
1671 1 : i.batchJustRefreshed = false
1672 1 : i.lastPositioningOp = unknownLastPositionOp
1673 1 : i.requiresReposition = false
1674 1 : i.stats.ForwardSeekCount[InterfaceCall]++
1675 1 :
1676 1 : i.err = i.iterFirstWithinBounds()
1677 1 : if i.err != nil {
1678 0 : i.iterValidityState = IterExhausted
1679 0 : return false
1680 0 : }
1681 1 : i.findNextEntry(nil)
1682 1 : i.maybeSampleRead()
1683 1 : return i.iterValidityState == IterValid
1684 : }
1685 :
1686 : // Last moves the iterator the last key/value pair. Returns true if the
1687 : // iterator is pointing at a valid entry and false otherwise.
1688 1 : func (i *Iterator) Last() bool {
1689 1 : if i.rangeKey != nil {
1690 1 : // NB: Check Valid() before clearing requiresReposition.
1691 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1692 1 : // If we have a range key but did not expose it at the previous iterator
1693 1 : // position (because the iterator was not at a valid position), updated
1694 1 : // must be true. This ensures that after an iterator op sequence like:
1695 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1696 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1697 1 : // - Last(...) → (IterValid, RangeBounds() = [a,b))
1698 1 : // the iterator returns RangeKeyChanged()=true.
1699 1 : //
1700 1 : // The remainder of this function will only update i.rangeKey.updated if
1701 1 : // the iterator moves into a new range key, or out of the current range
1702 1 : // key.
1703 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1704 1 : }
1705 1 : i.err = nil // clear cached iteration error
1706 1 : i.hasPrefix = false
1707 1 : i.batchJustRefreshed = false
1708 1 : i.lastPositioningOp = unknownLastPositionOp
1709 1 : i.requiresReposition = false
1710 1 : i.stats.ReverseSeekCount[InterfaceCall]++
1711 1 :
1712 1 : if i.err = i.iterLastWithinBounds(); i.err != nil {
1713 0 : i.iterValidityState = IterExhausted
1714 0 : return false
1715 0 : }
1716 1 : i.findPrevEntry(nil)
1717 1 : i.maybeSampleRead()
1718 1 : return i.iterValidityState == IterValid
1719 : }
1720 :
1721 : // Next moves the iterator to the next key/value pair. Returns true if the
1722 : // iterator is pointing at a valid entry and false otherwise.
1723 1 : func (i *Iterator) Next() bool {
1724 1 : return i.nextWithLimit(nil) == IterValid
1725 1 : }
1726 :
1727 : // NextWithLimit moves the iterator to the next key/value pair.
1728 : //
1729 : // If limit is provided, it serves as a best-effort exclusive limit. If the next
1730 : // key is greater than or equal to limit, the Iterator may pause and return
1731 : // IterAtLimit. Because limits are best-effort, NextWithLimit may return a key
1732 : // beyond limit.
1733 : //
1734 : // If the Iterator is configured to iterate over range keys, NextWithLimit
1735 : // guarantees it will surface any range keys with bounds overlapping the
1736 : // keyspace up to limit.
1737 1 : func (i *Iterator) NextWithLimit(limit []byte) IterValidityState {
1738 1 : return i.nextWithLimit(limit)
1739 1 : }
1740 :
1741 : // NextPrefix moves the iterator to the next key/value pair with a key
1742 : // containing a different prefix than the current key. Prefixes are determined
1743 : // by Comparer.Split. Exhausts the iterator if invoked while in prefix-iteration
1744 : // mode.
1745 : //
1746 : // It is not permitted to invoke NextPrefix while at a IterAtLimit position.
1747 : // When called in this condition, NextPrefix has non-deterministic behavior.
1748 : //
1749 : // It is not permitted to invoke NextPrefix when the Iterator has an
1750 : // upper-bound that is a versioned MVCC key (see the comment for
1751 : // Comparer.Split). It returns an error in this case.
1752 1 : func (i *Iterator) NextPrefix() bool {
1753 1 : if i.nextPrefixNotPermittedByUpperBound {
1754 1 : i.lastPositioningOp = unknownLastPositionOp
1755 1 : i.requiresReposition = false
1756 1 : i.err = errors.Errorf("NextPrefix not permitted with upper bound %s",
1757 1 : i.comparer.FormatKey(i.opts.UpperBound))
1758 1 : i.iterValidityState = IterExhausted
1759 1 : return false
1760 1 : }
1761 1 : if i.hasPrefix {
1762 1 : i.iterValidityState = IterExhausted
1763 1 : return false
1764 1 : }
1765 1 : if i.Error() != nil {
1766 1 : return false
1767 1 : }
1768 1 : return i.nextPrefix() == IterValid
1769 : }
1770 :
1771 1 : func (i *Iterator) nextPrefix() IterValidityState {
1772 1 : if i.rangeKey != nil {
1773 1 : // NB: Check Valid() before clearing requiresReposition.
1774 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1775 1 : // If we have a range key but did not expose it at the previous iterator
1776 1 : // position (because the iterator was not at a valid position), updated
1777 1 : // must be true. This ensures that after an iterator op sequence like:
1778 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1779 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1780 1 : // - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1781 1 : // the iterator returns RangeKeyChanged()=true.
1782 1 : //
1783 1 : // The remainder of this function will only update i.rangeKey.updated if
1784 1 : // the iterator moves into a new range key, or out of the current range
1785 1 : // key.
1786 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1787 1 : }
1788 :
1789 : // Although NextPrefix documents that behavior at IterAtLimit is undefined,
1790 : // this function handles these cases as a simple prefix-agnostic Next. This
1791 : // is done for deterministic behavior in the metamorphic tests.
1792 : //
1793 : // TODO(jackson): If the metamorphic test operation generator is adjusted to
1794 : // make generation of some operations conditional on the previous
1795 : // operations, then we can remove this behavior and explicitly error.
1796 :
1797 1 : i.lastPositioningOp = unknownLastPositionOp
1798 1 : i.requiresReposition = false
1799 1 : switch i.pos {
1800 1 : case iterPosCurForward:
1801 1 : // Positioned on the current key. Advance to the next prefix.
1802 1 : i.internalNextPrefix(i.comparer.Split(i.key))
1803 1 : case iterPosCurForwardPaused:
1804 : // Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
1805 : // up above. The iterator is already positioned at the next key.
1806 1 : case iterPosCurReverse:
1807 1 : // Switching directions.
1808 1 : // Unless the iterator was exhausted, reverse iteration needs to
1809 1 : // position the iterator at iterPosPrev.
1810 1 : if i.iterKV != nil {
1811 0 : i.err = errors.New("switching from reverse to forward but iter is not at prev")
1812 0 : i.iterValidityState = IterExhausted
1813 0 : return i.iterValidityState
1814 0 : }
1815 : // The Iterator is exhausted and i.iter is positioned before the first
1816 : // key. Reposition to point to the first internal key.
1817 1 : if i.err = i.iterFirstWithinBounds(); i.err != nil {
1818 0 : i.iterValidityState = IterExhausted
1819 0 : return i.iterValidityState
1820 0 : }
1821 1 : case iterPosCurReversePaused:
1822 1 : // Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
1823 1 : // up above.
1824 1 : //
1825 1 : // Switching directions; The iterator must not be exhausted since it
1826 1 : // paused.
1827 1 : if i.iterKV == nil {
1828 0 : i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
1829 0 : i.iterValidityState = IterExhausted
1830 0 : return i.iterValidityState
1831 0 : }
1832 1 : i.nextUserKey()
1833 1 : case iterPosPrev:
1834 1 : // The underlying iterator is pointed to the previous key (this can
1835 1 : // only happen when switching iteration directions).
1836 1 : if i.iterKV == nil {
1837 1 : // We're positioned before the first key. Need to reposition to point to
1838 1 : // the first key.
1839 1 : i.err = i.iterFirstWithinBounds()
1840 1 : if i.iterKV == nil {
1841 0 : i.iterValidityState = IterExhausted
1842 0 : return i.iterValidityState
1843 0 : }
1844 1 : if invariants.Enabled && !i.equal(i.iterKV.K.UserKey, i.key) {
1845 0 : i.opts.getLogger().Fatalf("pebble: invariant violation: First internal iterator from iterPosPrev landed on %q, not %q",
1846 0 : i.iterKV.K.UserKey, i.key)
1847 0 : }
1848 1 : } else {
1849 1 : // Move the internal iterator back onto the user key stored in
1850 1 : // i.key. iterPosPrev guarantees that it's positioned at the last
1851 1 : // key with the user key less than i.key, so we're guaranteed to
1852 1 : // land on the correct key with a single Next.
1853 1 : i.iterKV = i.iter.Next()
1854 1 : if i.iterKV == nil {
1855 0 : // This should only be possible if i.iter.Next() encountered an
1856 0 : // error.
1857 0 : if i.iter.Error() == nil {
1858 0 : i.opts.getLogger().Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev found nothing")
1859 0 : }
1860 : // NB: Iterator.Error() will return i.iter.Error().
1861 0 : i.iterValidityState = IterExhausted
1862 0 : return i.iterValidityState
1863 : }
1864 1 : if invariants.Enabled && !i.equal(i.iterKV.K.UserKey, i.key) {
1865 0 : i.opts.getLogger().Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q",
1866 0 : i.iterKV.K.UserKey, i.key)
1867 0 : }
1868 : }
1869 : // The internal iterator is now positioned at i.key. Advance to the next
1870 : // prefix.
1871 1 : i.internalNextPrefix(i.comparer.Split(i.key))
1872 1 : case iterPosNext:
1873 1 : // Already positioned on the next key. Only call nextPrefixKey if the
1874 1 : // next key shares the same prefix.
1875 1 : if i.iterKV != nil {
1876 1 : currKeyPrefixLen := i.comparer.Split(i.key)
1877 1 : if bytes.Equal(i.comparer.Split.Prefix(i.iterKV.K.UserKey), i.key[:currKeyPrefixLen]) {
1878 1 : i.internalNextPrefix(currKeyPrefixLen)
1879 1 : }
1880 : }
1881 : }
1882 :
1883 1 : i.stats.ForwardStepCount[InterfaceCall]++
1884 1 : i.findNextEntry(nil /* limit */)
1885 1 : i.maybeSampleRead()
1886 1 : return i.iterValidityState
1887 : }
1888 :
1889 1 : func (i *Iterator) internalNextPrefix(currKeyPrefixLen int) {
1890 1 : if i.iterKV == nil {
1891 1 : return
1892 1 : }
1893 : // The Next "fast-path" is not really a fast-path when there is more than
1894 : // one version. However, even with TableFormatPebblev3, there is a small
1895 : // slowdown (~10%) for one version if we remove it and only call NextPrefix.
1896 : // When there are two versions, only calling NextPrefix is ~30% faster.
1897 1 : i.stats.ForwardStepCount[InternalIterCall]++
1898 1 : if i.iterKV = i.iter.Next(); i.iterKV == nil {
1899 1 : return
1900 1 : }
1901 1 : if !bytes.Equal(i.comparer.Split.Prefix(i.iterKV.K.UserKey), i.key[:currKeyPrefixLen]) {
1902 1 : return
1903 1 : }
1904 1 : i.stats.ForwardStepCount[InternalIterCall]++
1905 1 : i.prefixOrFullSeekKey = i.comparer.ImmediateSuccessor(i.prefixOrFullSeekKey[:0], i.key[:currKeyPrefixLen])
1906 1 : if i.iterKV.K.IsExclusiveSentinel() {
1907 0 : panic(errors.AssertionFailedf("pebble: unexpected exclusive sentinel key: %q", i.iterKV.K))
1908 : }
1909 :
1910 1 : i.iterKV = i.iter.NextPrefix(i.prefixOrFullSeekKey)
1911 1 : if invariants.Enabled && i.iterKV != nil {
1912 1 : if p := i.comparer.Split.Prefix(i.iterKV.K.UserKey); i.cmp(p, i.prefixOrFullSeekKey) < 0 {
1913 0 : panic(errors.AssertionFailedf("pebble: iter.NextPrefix did not advance beyond the current prefix: now at %q; expected to be geq %q",
1914 0 : i.iterKV.K, i.prefixOrFullSeekKey))
1915 : }
1916 : }
1917 : }
1918 :
1919 1 : func (i *Iterator) nextWithLimit(limit []byte) IterValidityState {
1920 1 : i.stats.ForwardStepCount[InterfaceCall]++
1921 1 : if i.hasPrefix {
1922 1 : if limit != nil {
1923 1 : i.err = errors.New("cannot use limit with prefix iteration")
1924 1 : i.iterValidityState = IterExhausted
1925 1 : return i.iterValidityState
1926 1 : } else if i.iterValidityState == IterExhausted {
1927 1 : // No-op, already exhausted. We avoid executing the Next because it
1928 1 : // can break invariants: Specifically, a file that fails the bloom
1929 1 : // filter test may result in its level being removed from the
1930 1 : // merging iterator. The level's removal can cause a lazy combined
1931 1 : // iterator to miss range keys and trigger a switch to combined
1932 1 : // iteration at a larger key, breaking keyspan invariants.
1933 1 : return i.iterValidityState
1934 1 : }
1935 : }
1936 1 : if i.err != nil {
1937 1 : return i.iterValidityState
1938 1 : }
1939 1 : if i.rangeKey != nil {
1940 1 : // NB: Check Valid() before clearing requiresReposition.
1941 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1942 1 : // If we have a range key but did not expose it at the previous iterator
1943 1 : // position (because the iterator was not at a valid position), updated
1944 1 : // must be true. This ensures that after an iterator op sequence like:
1945 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
1946 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1947 1 : // - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1948 1 : // the iterator returns RangeKeyChanged()=true.
1949 1 : //
1950 1 : // The remainder of this function will only update i.rangeKey.updated if
1951 1 : // the iterator moves into a new range key, or out of the current range
1952 1 : // key.
1953 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1954 1 : }
1955 1 : i.lastPositioningOp = unknownLastPositionOp
1956 1 : i.requiresReposition = false
1957 1 : switch i.pos {
1958 1 : case iterPosCurForward:
1959 1 : i.nextUserKey()
1960 1 : case iterPosCurForwardPaused:
1961 : // Already at the right place.
1962 1 : case iterPosCurReverse:
1963 1 : // Switching directions.
1964 1 : // Unless the iterator was exhausted, reverse iteration needs to
1965 1 : // position the iterator at iterPosPrev.
1966 1 : if i.iterKV != nil {
1967 0 : i.err = errors.New("switching from reverse to forward but iter is not at prev")
1968 0 : i.iterValidityState = IterExhausted
1969 0 : return i.iterValidityState
1970 0 : }
1971 : // We're positioned before the first key. Need to reposition to point to
1972 : // the first key.
1973 1 : if i.err = i.iterFirstWithinBounds(); i.err != nil {
1974 0 : i.iterValidityState = IterExhausted
1975 0 : return i.iterValidityState
1976 0 : }
1977 1 : case iterPosCurReversePaused:
1978 1 : // Switching directions.
1979 1 : // The iterator must not be exhausted since it paused.
1980 1 : if i.iterKV == nil {
1981 0 : i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
1982 0 : i.iterValidityState = IterExhausted
1983 0 : return i.iterValidityState
1984 0 : }
1985 1 : i.nextUserKey()
1986 1 : case iterPosPrev:
1987 1 : // The underlying iterator is pointed to the previous key (this can
1988 1 : // only happen when switching iteration directions). We set
1989 1 : // i.iterValidityState to IterExhausted here to force the calls to
1990 1 : // nextUserKey to save the current key i.iter is pointing at in order
1991 1 : // to determine when the next user-key is reached.
1992 1 : i.iterValidityState = IterExhausted
1993 1 : if i.iterKV == nil {
1994 1 : // We're positioned before the first key. Need to reposition to point to
1995 1 : // the first key.
1996 1 : i.err = i.iterFirstWithinBounds()
1997 1 : } else {
1998 1 : i.nextUserKey()
1999 1 : }
2000 1 : if i.err != nil {
2001 0 : i.iterValidityState = IterExhausted
2002 0 : return i.iterValidityState
2003 0 : }
2004 1 : i.nextUserKey()
2005 1 : case iterPosNext:
2006 : // Already at the right place.
2007 : }
2008 1 : i.findNextEntry(limit)
2009 1 : i.maybeSampleRead()
2010 1 : return i.iterValidityState
2011 : }
2012 :
2013 : // Prev moves the iterator to the previous key/value pair. Returns true if the
2014 : // iterator is pointing at a valid entry and false otherwise.
2015 1 : func (i *Iterator) Prev() bool {
2016 1 : return i.PrevWithLimit(nil) == IterValid
2017 1 : }
2018 :
2019 : // PrevWithLimit moves the iterator to the previous key/value pair.
2020 : //
2021 : // If limit is provided, it serves as a best-effort inclusive limit. If the
2022 : // previous key is less than limit, the Iterator may pause and return
2023 : // IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key
2024 : // beyond limit.
2025 : //
2026 : // If the Iterator is configured to iterate over range keys, PrevWithLimit
2027 : // guarantees it will surface any range keys with bounds overlapping the
2028 : // keyspace up to limit.
2029 1 : func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState {
2030 1 : i.stats.ReverseStepCount[InterfaceCall]++
2031 1 : if i.err != nil {
2032 1 : return i.iterValidityState
2033 1 : }
2034 1 : if i.rangeKey != nil {
2035 1 : // NB: Check Valid() before clearing requiresReposition.
2036 1 : i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
2037 1 : // If we have a range key but did not expose it at the previous iterator
2038 1 : // position (because the iterator was not at a valid position), updated
2039 1 : // must be true. This ensures that after an iterator op sequence like:
2040 1 : // - Next() → (IterValid, RangeBounds() = [a,b))
2041 1 : // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
2042 1 : // - PrevWithLimit(...) → (IterValid, RangeBounds() = [a,b))
2043 1 : // the iterator returns RangeKeyChanged()=true.
2044 1 : //
2045 1 : // The remainder of this function will only update i.rangeKey.updated if
2046 1 : // the iterator moves into a new range key, or out of the current range
2047 1 : // key.
2048 1 : i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
2049 1 : }
2050 1 : i.lastPositioningOp = unknownLastPositionOp
2051 1 : i.requiresReposition = false
2052 1 : if i.hasPrefix {
2053 1 : i.err = errReversePrefixIteration
2054 1 : i.iterValidityState = IterExhausted
2055 1 : return i.iterValidityState
2056 1 : }
2057 1 : switch i.pos {
2058 1 : case iterPosCurForward:
2059 : // Switching directions, and will handle this below.
2060 1 : case iterPosCurForwardPaused:
2061 : // Switching directions, and will handle this below.
2062 1 : case iterPosCurReverse:
2063 1 : i.prevUserKey()
2064 1 : case iterPosCurReversePaused:
2065 : // Already at the right place.
2066 1 : case iterPosNext:
2067 : // The underlying iterator is pointed to the next key (this can only happen
2068 : // when switching iteration directions). We will handle this below.
2069 1 : case iterPosPrev:
2070 : // Already at the right place.
2071 : }
2072 1 : if i.pos == iterPosCurForward || i.pos == iterPosNext || i.pos == iterPosCurForwardPaused {
2073 1 : // Switching direction.
2074 1 : stepAgain := i.pos == iterPosNext
2075 1 :
2076 1 : // Synthetic range key markers are a special case. Consider SeekGE(b)
2077 1 : // which finds a range key [a, c). To ensure the user observes the range
2078 1 : // key, the Iterator pauses at Key() = b. The iterator must advance the
2079 1 : // internal iterator to see if there's also a coincident point key at
2080 1 : // 'b', leaving the iterator at iterPosNext if there's not.
2081 1 : //
2082 1 : // This is a problem: Synthetic range key markers are only interleaved
2083 1 : // during the original seek. A subsequent Prev() of i.iter will not move
2084 1 : // back onto the synthetic range key marker. In this case where the
2085 1 : // previous iterator position was a synthetic range key start boundary,
2086 1 : // we must not step a second time.
2087 1 : if i.isEphemeralPosition() {
2088 1 : stepAgain = false
2089 1 : }
2090 :
2091 : // We set i.iterValidityState to IterExhausted here to force the calls
2092 : // to prevUserKey to save the current key i.iter is pointing at in
2093 : // order to determine when the prev user-key is reached.
2094 1 : i.iterValidityState = IterExhausted
2095 1 : if i.iterKV == nil {
2096 1 : // We're positioned after the last key. Need to reposition to point to
2097 1 : // the last key.
2098 1 : i.err = i.iterLastWithinBounds()
2099 1 : } else {
2100 1 : i.prevUserKey()
2101 1 : }
2102 1 : if i.err != nil {
2103 0 : return i.iterValidityState
2104 0 : }
2105 1 : if stepAgain {
2106 1 : i.prevUserKey()
2107 1 : if i.err != nil {
2108 0 : return i.iterValidityState
2109 0 : }
2110 : }
2111 : }
2112 1 : i.findPrevEntry(limit)
2113 1 : i.maybeSampleRead()
2114 1 : return i.iterValidityState
2115 : }
2116 :
2117 : // iterFirstWithinBounds moves the internal iterator to the first key,
2118 : // respecting bounds.
2119 1 : func (i *Iterator) iterFirstWithinBounds() error {
2120 1 : i.stats.ForwardSeekCount[InternalIterCall]++
2121 1 : if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
2122 1 : i.iterKV = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
2123 1 : } else {
2124 1 : i.iterKV = i.iter.First()
2125 1 : }
2126 1 : if i.iterKV == nil {
2127 1 : return i.iter.Error()
2128 1 : }
2129 1 : return nil
2130 : }
2131 :
2132 : // iterLastWithinBounds moves the internal iterator to the last key, respecting
2133 : // bounds.
2134 1 : func (i *Iterator) iterLastWithinBounds() error {
2135 1 : i.stats.ReverseSeekCount[InternalIterCall]++
2136 1 : if upperBound := i.opts.GetUpperBound(); upperBound != nil {
2137 1 : i.iterKV = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone)
2138 1 : } else {
2139 1 : i.iterKV = i.iter.Last()
2140 1 : }
2141 1 : if i.iterKV == nil {
2142 1 : return i.iter.Error()
2143 1 : }
2144 1 : return nil
2145 : }
2146 :
2147 : // RangeKeyData describes a range key's data, set through RangeKeySet. The key
2148 : // boundaries of the range key is provided by Iterator.RangeBounds.
2149 : type RangeKeyData struct {
2150 : Suffix []byte
2151 : Value []byte
2152 : }
2153 :
2154 : // rangeKeyWithinLimit is called during limited reverse iteration when
2155 : // positioned over a key beyond the limit. If there exists a range key that lies
2156 : // within the limit, the iterator must not pause in order to ensure the user has
2157 : // an opportunity to observe the range key within limit.
2158 : //
2159 : // It would be valid to ignore the limit whenever there's a range key covering
2160 : // the key, but that would introduce nondeterminism. To preserve determinism for
2161 : // testing, the iterator ignores the limit only if the covering range key does
2162 : // cover the keyspace within the limit.
2163 : //
2164 : // This awkwardness exists because range keys are interleaved at their inclusive
2165 : // start positions. Note that limit is inclusive.
2166 1 : func (i *Iterator) rangeKeyWithinLimit(limit []byte) bool {
2167 1 : if i.rangeKey == nil || !i.opts.rangeKeys() {
2168 1 : return false
2169 1 : }
2170 1 : s := i.rangeKey.iiter.Span()
2171 1 : // If the range key ends beyond the limit, then the range key does not cover
2172 1 : // any portion of the keyspace within the limit and it is safe to pause.
2173 1 : return s != nil && i.cmp(s.End, limit) > 0
2174 : }
2175 :
2176 : // saveRangeKey saves the current range key to the underlying iterator's current
2177 : // range key state. If the range key has not changed, saveRangeKey is a no-op.
2178 : // If there is a new range key, saveRangeKey copies all of the key, value and
2179 : // suffixes into Iterator-managed buffers.
2180 1 : func (i *Iterator) saveRangeKey() {
2181 1 : if i.rangeKey == nil || i.opts.KeyTypes == IterKeyTypePointsOnly {
2182 1 : return
2183 1 : }
2184 :
2185 1 : s := i.rangeKey.iiter.Span()
2186 1 : if s == nil {
2187 1 : i.rangeKey.hasRangeKey = false
2188 1 : i.rangeKey.updated = i.rangeKey.prevPosHadRangeKey
2189 1 : return
2190 1 : } else if !i.rangeKey.stale {
2191 1 : // The range key `s` is identical to the one currently saved. No-op.
2192 1 : return
2193 1 : }
2194 :
2195 1 : if s.KeysOrder != keyspan.BySuffixAsc {
2196 0 : panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
2197 : }
2198 :
2199 : // Although `i.rangeKey.stale` is true, the span s may still be identical
2200 : // to the currently saved span. This is possible when seeking the iterator,
2201 : // which may land back on the same range key. If we previously had a range
2202 : // key and the new one has an identical start key, then it must be the same
2203 : // range key and we can avoid copying and keep `i.rangeKey.updated=false`.
2204 : //
2205 : // TODO(jackson): These key comparisons could be avoidable during relative
2206 : // positioning operations continuing in the same direction, because these
2207 : // ops will never encounter the previous position's range key while
2208 : // stale=true. However, threading whether the current op is a seek or step
2209 : // maybe isn't worth it. This key comparison is only necessary once when we
2210 : // step onto a new range key, which should be relatively rare.
2211 1 : if i.rangeKey.prevPosHadRangeKey && i.equal(i.rangeKey.start, s.Start) &&
2212 1 : i.equal(i.rangeKey.end, s.End) {
2213 1 : i.rangeKey.updated = false
2214 1 : i.rangeKey.stale = false
2215 1 : i.rangeKey.hasRangeKey = true
2216 1 : return
2217 1 : }
2218 1 : i.stats.RangeKeyStats.Count += len(s.Keys)
2219 1 : i.rangeKey.buf.Reset()
2220 1 : i.rangeKey.hasRangeKey = true
2221 1 : i.rangeKey.updated = true
2222 1 : i.rangeKey.stale = false
2223 1 : i.rangeKey.buf, i.rangeKey.start = i.rangeKey.buf.Copy(s.Start)
2224 1 : i.rangeKey.buf, i.rangeKey.end = i.rangeKey.buf.Copy(s.End)
2225 1 : i.rangeKey.keys = i.rangeKey.keys[:0]
2226 1 : for j := 0; j < len(s.Keys); j++ {
2227 1 : if invariants.Enabled {
2228 1 : if s.Keys[j].Kind() != base.InternalKeyKindRangeKeySet {
2229 0 : panic("pebble: user iteration encountered non-RangeKeySet key kind")
2230 1 : } else if j > 0 && i.comparer.CompareRangeSuffixes(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 {
2231 0 : panic("pebble: user iteration encountered range keys not in suffix order")
2232 : }
2233 : }
2234 1 : var rkd RangeKeyData
2235 1 : i.rangeKey.buf, rkd.Suffix = i.rangeKey.buf.Copy(s.Keys[j].Suffix)
2236 1 : i.rangeKey.buf, rkd.Value = i.rangeKey.buf.Copy(s.Keys[j].Value)
2237 1 : i.rangeKey.keys = append(i.rangeKey.keys, rkd)
2238 : }
2239 : }
2240 :
2241 : // RangeKeyChanged indicates whether the most recent iterator positioning
2242 : // operation resulted in the iterator stepping into or out of a new range key.
2243 : // If true, previously returned range key bounds and data has been invalidated.
2244 : // If false, previously obtained range key bounds, suffix and value slices are
2245 : // still valid and may continue to be read.
2246 : //
2247 : // Invalid iterator positions are considered to not hold range keys, meaning
2248 : // that if an iterator steps from an IterExhausted or IterAtLimit position onto
2249 : // a position with a range key, RangeKeyChanged will yield true.
2250 1 : func (i *Iterator) RangeKeyChanged() bool {
2251 1 : return i.iterValidityState == IterValid && i.rangeKey != nil && i.rangeKey.updated
2252 1 : }
2253 :
2254 : // HasPointAndRange indicates whether there exists a point key, a range key or
2255 : // both at the current iterator position.
2256 1 : func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool) {
2257 1 : if i.iterValidityState != IterValid || i.requiresReposition {
2258 0 : return false, false
2259 0 : }
2260 1 : if i.opts.KeyTypes == IterKeyTypePointsOnly {
2261 1 : return true, false
2262 1 : }
2263 1 : return i.rangeKey == nil || !i.rangeKey.rangeKeyOnly, i.rangeKey != nil && i.rangeKey.hasRangeKey
2264 : }
2265 :
2266 : // RangeBounds returns the start (inclusive) and end (exclusive) bounds of the
2267 : // range key covering the current iterator position. RangeBounds returns nil
2268 : // bounds if there is no range key covering the current iterator position, or
2269 : // the iterator is not configured to surface range keys.
2270 : //
2271 : // If valid, the returned start bound is less than or equal to Key() and the
2272 : // returned end bound is greater than Key().
2273 1 : func (i *Iterator) RangeBounds() (start, end []byte) {
2274 1 : if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
2275 0 : return nil, nil
2276 0 : }
2277 1 : return i.rangeKey.start, i.rangeKey.end
2278 : }
2279 :
2280 : // Key returns the key of the current key/value pair, or nil if done. The
2281 : // caller should not modify the contents of the returned slice, and its
2282 : // contents may change on the next call to Next.
2283 : //
2284 : // If positioned at an iterator position that only holds a range key, Key()
2285 : // always returns the start bound of the range key. Otherwise, it returns the
2286 : // point key's key.
2287 1 : func (i *Iterator) Key() []byte {
2288 1 : return i.key
2289 1 : }
2290 :
2291 : // Value returns the value of the current key/value pair, or nil if done. The
2292 : // caller should not modify the contents of the returned slice, and its
2293 : // contents may change on the next call to Next.
2294 : //
2295 : // Only valid if HasPointAndRange() returns true for hasPoint.
2296 : // Deprecated: use ValueAndErr instead.
2297 1 : func (i *Iterator) Value() []byte {
2298 1 : val, _ := i.ValueAndErr()
2299 1 : return val
2300 1 : }
2301 :
2302 : // ValueAndErr returns the value, and any error encountered in extracting the value.
2303 : // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
2304 : //
2305 : // The caller should not modify the contents of the returned slice, and its
2306 : // contents may change on the next call to Next.
2307 1 : func (i *Iterator) ValueAndErr() ([]byte, error) {
2308 1 : val, callerOwned, err := i.value.Value(i.lazyValueBuf)
2309 1 : if err != nil {
2310 0 : i.err = err
2311 0 : i.iterValidityState = IterExhausted
2312 0 : }
2313 1 : if callerOwned {
2314 1 : i.lazyValueBuf = val[:0]
2315 1 : }
2316 1 : return val, err
2317 : }
2318 :
2319 : // LazyValue returns the LazyValue. Only for advanced use cases.
2320 : // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
2321 0 : func (i *Iterator) LazyValue() LazyValue {
2322 0 : return i.value.LazyValue()
2323 0 : }
2324 :
2325 : // RangeKeys returns the range key values and their suffixes covering the
2326 : // current iterator position. The range bounds may be retrieved separately
2327 : // through Iterator.RangeBounds().
2328 1 : func (i *Iterator) RangeKeys() []RangeKeyData {
2329 1 : if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
2330 0 : return nil
2331 0 : }
2332 1 : return i.rangeKey.keys
2333 : }
2334 :
2335 : // Valid returns true if the iterator is positioned at a valid key/value pair
2336 : // and false otherwise.
2337 1 : func (i *Iterator) Valid() bool {
2338 1 : valid := i.iterValidityState == IterValid && !i.requiresReposition
2339 1 : if invariants.Enabled {
2340 1 : if err := i.Error(); valid && err != nil {
2341 0 : panic(errors.AssertionFailedf("pebble: iterator is valid with non-nil Error: %+v", err))
2342 : }
2343 : }
2344 1 : return valid
2345 : }
2346 :
2347 : // Error returns any accumulated error.
2348 1 : func (i *Iterator) Error() error {
2349 1 : if i.err != nil {
2350 1 : return i.err
2351 1 : }
2352 1 : if i.iter != nil {
2353 1 : return i.iter.Error()
2354 1 : }
2355 0 : return nil
2356 : }
2357 :
2358 : const maxKeyBufCacheSize = 4 << 10 // 4 KB
2359 :
2360 : // Close closes the iterator and returns any accumulated error. Exhausting
2361 : // all the key/value pairs in a table is not considered to be an error.
2362 : // It is not valid to call any method, including Close, after the iterator
2363 : // has been closed.
2364 1 : func (i *Iterator) Close() error {
2365 1 : // Close the child iterator before releasing the readState because when the
2366 1 : // readState is released sstables referenced by the readState may be deleted
2367 1 : // which will fail on Windows if the sstables are still open by the child
2368 1 : // iterator.
2369 1 : if i.iter != nil {
2370 1 : i.err = firstError(i.err, i.iter.Close())
2371 1 :
2372 1 : // Closing i.iter did not necessarily close the point and range key
2373 1 : // iterators. Calls to SetOptions may have 'disconnected' either one
2374 1 : // from i.iter if iteration key types were changed. Both point and range
2375 1 : // key iterators are preserved in case the iterator needs to switch key
2376 1 : // types again. We explicitly close both of these iterators here.
2377 1 : //
2378 1 : // NB: If the iterators were still connected to i.iter, they may be
2379 1 : // closed, but calling Close on a closed internal iterator or fragment
2380 1 : // iterator is allowed.
2381 1 : if i.pointIter != nil {
2382 1 : i.err = firstError(i.err, i.pointIter.Close())
2383 1 : }
2384 1 : if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil {
2385 1 : i.rangeKey.rangeKeyIter.Close()
2386 1 : }
2387 1 : i.err = firstError(i.err, i.blobValueFetcher.Close())
2388 : }
2389 1 : err := i.err
2390 1 :
2391 1 : if i.readState != nil {
2392 1 : if i.readSampling.pendingCompactions.size > 0 {
2393 0 : // Copy pending read compactions using db.mu.Lock()
2394 0 : i.readState.db.mu.Lock()
2395 0 : i.readState.db.mu.compact.readCompactions.combine(&i.readSampling.pendingCompactions, i.cmp)
2396 0 : reschedule := i.readState.db.mu.compact.rescheduleReadCompaction
2397 0 : i.readState.db.mu.compact.rescheduleReadCompaction = false
2398 0 : concurrentCompactions := i.readState.db.mu.compact.compactingCount
2399 0 : i.readState.db.mu.Unlock()
2400 0 :
2401 0 : if reschedule && concurrentCompactions == 0 {
2402 0 : // In a read heavy workload, flushes may not happen frequently enough to
2403 0 : // schedule compactions.
2404 0 : i.readState.db.compactionSchedulers.Add(1)
2405 0 : go i.readState.db.maybeScheduleCompactionAsync()
2406 0 : }
2407 : }
2408 :
2409 1 : i.readState.unref()
2410 1 : i.readState = nil
2411 : }
2412 :
2413 1 : if i.version != nil {
2414 1 : i.version.Unref()
2415 1 : }
2416 1 : if i.externalIter != nil {
2417 0 : err = firstError(err, i.externalIter.Close())
2418 0 : }
2419 :
2420 : // Close the closer for the current value if one was open.
2421 1 : if i.valueCloser != nil {
2422 0 : err = firstError(err, i.valueCloser.Close())
2423 0 : i.valueCloser = nil
2424 0 : }
2425 :
2426 1 : if i.rangeKey != nil {
2427 1 : i.rangeKey.rangeKeyBuffers.PrepareForReuse()
2428 1 : *i.rangeKey = iteratorRangeKeyState{
2429 1 : rangeKeyBuffers: i.rangeKey.rangeKeyBuffers,
2430 1 : }
2431 1 : iterRangeKeyStateAllocPool.Put(i.rangeKey)
2432 1 : i.rangeKey = nil
2433 1 : }
2434 :
2435 1 : alloc := i.alloc
2436 1 : if alloc == nil {
2437 1 : // NB: When the Iterator is used as a part of a Get(), Close() is called by
2438 1 : // getIterAlloc.Close which handles recycling the appropriate structure and
2439 1 : // fields.
2440 1 : return err
2441 1 : }
2442 :
2443 : // Reset the alloc struct, retaining any buffers within their maximum
2444 : // bounds, and return the iterAlloc struct to the pool.
2445 :
2446 : // Avoid caching the key buf if it is overly large. The constant is fairly
2447 : // arbitrary.
2448 1 : if cap(i.keyBuf) < maxKeyBufCacheSize {
2449 1 : alloc.keyBuf = i.keyBuf
2450 1 : }
2451 1 : if cap(i.prefixOrFullSeekKey) < maxKeyBufCacheSize {
2452 1 : alloc.prefixOrFullSeekKey = i.prefixOrFullSeekKey
2453 1 : }
2454 1 : for j := range i.boundsBuf {
2455 1 : if cap(i.boundsBuf[j]) < maxKeyBufCacheSize {
2456 1 : alloc.boundsBuf[j] = i.boundsBuf[j]
2457 1 : }
2458 : }
2459 1 : mergingIterHeapItems := alloc.merging.heap.items
2460 1 :
2461 1 : // We zero each field piecemeal in part to avoid the use of a stack
2462 1 : // allocated autotmp iterAlloc variable, and in part to make it easier
2463 1 : // to make zeroing conditional on whether the field was actually used.
2464 1 : //
2465 1 : // TODO(jackson,nvanbenschoten): Iterator.Close continues to appear in
2466 1 : // profiles in part because of this zeroing.
2467 1 : if i.batch != nil {
2468 1 : alloc.batchState = iteratorBatchState{}
2469 1 : }
2470 1 : alloc.dbi.clearForReuse()
2471 1 : alloc.merging = mergingIter{}
2472 1 : clear(mergingIterHeapItems[:cap(mergingIterHeapItems)])
2473 1 : alloc.merging.heap.items = mergingIterHeapItems
2474 1 : clear(alloc.mlevels[:])
2475 1 : clear(alloc.levels[:])
2476 1 : clear(alloc.levelsPositioned[:])
2477 1 :
2478 1 : iterAllocPool.Put(alloc)
2479 1 : return err
2480 : }
2481 :
2482 : // SetBounds sets the lower and upper bounds for the iterator. Once SetBounds
2483 : // returns, the caller is free to mutate the provided slices.
2484 : //
2485 : // The iterator will always be invalidated and must be repositioned with a call
2486 : // to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
2487 1 : func (i *Iterator) SetBounds(lower, upper []byte) {
2488 1 : // Ensure that the Iterator appears exhausted, regardless of whether we
2489 1 : // actually have to invalidate the internal iterator. Optimizations that
2490 1 : // avoid exhaustion are an internal implementation detail that shouldn't
2491 1 : // leak through the interface. The caller should still call an absolute
2492 1 : // positioning method to reposition the iterator.
2493 1 : i.requiresReposition = true
2494 1 :
2495 1 : if ((i.opts.LowerBound == nil) == (lower == nil)) &&
2496 1 : ((i.opts.UpperBound == nil) == (upper == nil)) &&
2497 1 : i.equal(i.opts.LowerBound, lower) &&
2498 1 : i.equal(i.opts.UpperBound, upper) {
2499 1 : // Unchanged, noop.
2500 1 : return
2501 1 : }
2502 :
2503 : // Copy the user-provided bounds into an Iterator-owned buffer, and set them
2504 : // on i.opts.{Lower,Upper}Bound.
2505 1 : i.processBounds(lower, upper)
2506 1 :
2507 1 : i.iter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2508 1 : // If the iterator has an open point iterator that's not currently being
2509 1 : // used, propagate the new bounds to it.
2510 1 : if i.pointIter != nil && !i.opts.pointKeys() {
2511 1 : i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2512 1 : }
2513 : // If the iterator has a range key iterator, propagate bounds to it. The
2514 : // top-level SetBounds on the interleaving iterator (i.iter) won't propagate
2515 : // bounds to the range key iterator stack, because the FragmentIterator
2516 : // interface doesn't define a SetBounds method. We need to directly inform
2517 : // the iterConfig stack.
2518 1 : if i.rangeKey != nil {
2519 1 : i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2520 1 : }
2521 :
2522 : // Even though this is not a positioning operation, the alteration of the
2523 : // bounds means we cannot optimize Seeks by using Next.
2524 1 : i.invalidate()
2525 : }
2526 :
2527 : // SetContext replaces the context provided at iterator creation, or the last
2528 : // one provided by SetContext. Even though iterators are expected to be
2529 : // short-lived, there are some cases where either (a) iterators are used far
2530 : // from the code that created them, (b) iterators are reused (while being
2531 : // short-lived) for processing different requests. For such scenarios, we
2532 : // allow the caller to replace the context.
2533 0 : func (i *Iterator) SetContext(ctx context.Context) {
2534 0 : i.ctx = ctx
2535 0 : i.iter.SetContext(ctx)
2536 0 : // If the iterator has an open point iterator that's not currently being
2537 0 : // used, propagate the new context to it.
2538 0 : if i.pointIter != nil && !i.opts.pointKeys() {
2539 0 : i.pointIter.SetContext(i.ctx)
2540 0 : }
2541 : }
2542 :
2543 : // Initialization and changing of the bounds must call processBounds.
2544 : // processBounds saves the bounds and computes derived state from those
2545 : // bounds.
2546 1 : func (i *Iterator) processBounds(lower, upper []byte) {
2547 1 : // Copy the user-provided bounds into an Iterator-owned buffer. We can't
2548 1 : // overwrite the current bounds, because some internal iterators compare old
2549 1 : // and new bounds for optimizations.
2550 1 :
2551 1 : buf := i.boundsBuf[i.boundsBufIdx][:0]
2552 1 : if lower != nil {
2553 1 : buf = append(buf, lower...)
2554 1 : i.opts.LowerBound = buf
2555 1 : } else {
2556 1 : i.opts.LowerBound = nil
2557 1 : }
2558 1 : i.nextPrefixNotPermittedByUpperBound = false
2559 1 : if upper != nil {
2560 1 : buf = append(buf, upper...)
2561 1 : i.opts.UpperBound = buf[len(buf)-len(upper):]
2562 1 : if i.comparer.Split(i.opts.UpperBound) != len(i.opts.UpperBound) {
2563 1 : // Setting an upper bound that is a versioned MVCC key. This means
2564 1 : // that a key can have some MVCC versions before the upper bound and
2565 1 : // some after. This causes significant complications for NextPrefix,
2566 1 : // so we bar the user of NextPrefix.
2567 1 : i.nextPrefixNotPermittedByUpperBound = true
2568 1 : }
2569 1 : } else {
2570 1 : i.opts.UpperBound = nil
2571 1 : }
2572 1 : i.boundsBuf[i.boundsBufIdx] = buf
2573 1 : i.boundsBufIdx = 1 - i.boundsBufIdx
2574 : }
2575 :
2576 : // SetOptions sets new iterator options for the iterator. Note that the lower
2577 : // and upper bounds applied here will supersede any bounds set by previous calls
2578 : // to SetBounds.
2579 : //
2580 : // Note that the slices provided in this SetOptions must not be changed by the
2581 : // caller until the iterator is closed, or a subsequent SetBounds or SetOptions
2582 : // has returned. This is because comparisons between the existing and new bounds
2583 : // are sometimes used to optimize seeking. See the extended commentary on
2584 : // SetBounds.
2585 : //
2586 : // If the iterator was created over an indexed mutable batch, the iterator's
2587 : // view of the mutable batch is refreshed.
2588 : //
2589 : // The iterator will always be invalidated and must be repositioned with a call
2590 : // to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
2591 : //
2592 : // If only lower and upper bounds need to be modified, prefer SetBounds.
2593 1 : func (i *Iterator) SetOptions(o *IterOptions) {
2594 1 : if i.externalIter != nil {
2595 0 : if err := validateExternalIterOpts(o); err != nil {
2596 0 : panic(err)
2597 : }
2598 : }
2599 :
2600 : // Ensure that the Iterator appears exhausted, regardless of whether we
2601 : // actually have to invalidate the internal iterator. Optimizations that
2602 : // avoid exhaustion are an internal implementation detail that shouldn't
2603 : // leak through the interface. The caller should still call an absolute
2604 : // positioning method to reposition the iterator.
2605 1 : i.requiresReposition = true
2606 1 :
2607 1 : // Check if global state requires we close all internal iterators.
2608 1 : //
2609 1 : // If the Iterator is in an error state, invalidate the existing iterators
2610 1 : // so that we reconstruct an iterator state from scratch.
2611 1 : //
2612 1 : // If OnlyReadGuaranteedDurable changed, the iterator stacks are incorrect,
2613 1 : // improperly including or excluding memtables. Invalidate them so that
2614 1 : // finishInitializingIter will reconstruct them.
2615 1 : closeBoth := i.err != nil ||
2616 1 : o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable
2617 1 :
2618 1 : // If either options specify block property filters for an iterator stack,
2619 1 : // reconstruct it.
2620 1 : if i.pointIter != nil && (closeBoth || len(o.PointKeyFilters) > 0 || len(i.opts.PointKeyFilters) > 0 ||
2621 1 : o.RangeKeyMasking.Filter != nil || i.opts.RangeKeyMasking.Filter != nil || o.SkipPoint != nil ||
2622 1 : i.opts.SkipPoint != nil) {
2623 1 : i.err = firstError(i.err, i.pointIter.Close())
2624 1 : i.pointIter = nil
2625 1 : }
2626 1 : if i.rangeKey != nil {
2627 1 : if closeBoth || len(o.RangeKeyFilters) > 0 || len(i.opts.RangeKeyFilters) > 0 {
2628 1 : i.rangeKey.rangeKeyIter.Close()
2629 1 : i.rangeKey = nil
2630 1 : } else {
2631 1 : // If there's still a range key iterator stack, invalidate the
2632 1 : // iterator. This ensures RangeKeyChanged() returns true if a
2633 1 : // subsequent positioning operation discovers a range key. It also
2634 1 : // prevents seek no-op optimizations.
2635 1 : i.invalidate()
2636 1 : }
2637 : }
2638 :
2639 : // If the iterator is backed by a batch that's been mutated, refresh its
2640 : // existing point and range-key iterators, and invalidate the iterator to
2641 : // prevent seek-using-next optimizations. If we don't yet have a point-key
2642 : // iterator or range-key iterator but we require one, it'll be created in
2643 : // the slow path that reconstructs the iterator in finishInitializingIter.
2644 1 : if i.batch != nil {
2645 0 : nextBatchSeqNum := (base.SeqNum(len(i.batch.batch.data)) | base.SeqNumBatchBit)
2646 0 : if nextBatchSeqNum != i.batch.batchSeqNum {
2647 0 : i.batch.batchSeqNum = nextBatchSeqNum
2648 0 : if i.merging != nil {
2649 0 : i.merging.batchSnapshot = nextBatchSeqNum
2650 0 : }
2651 : // Prevent a no-op seek optimization on the next seek. We won't be
2652 : // able to reuse the top-level Iterator state, because it may be
2653 : // incorrect after the inclusion of new batch mutations.
2654 0 : i.batchJustRefreshed = true
2655 0 : if i.pointIter != nil && i.batch.batch.countRangeDels > 0 {
2656 0 : if i.batch.rangeDelIter.Count() == 0 {
2657 0 : // When we constructed this iterator, there were no
2658 0 : // rangedels in the batch. Iterator construction will
2659 0 : // have excluded the batch rangedel iterator from the
2660 0 : // point iterator stack. We need to reconstruct the
2661 0 : // point iterator to add i.batchRangeDelIter into the
2662 0 : // iterator stack.
2663 0 : i.err = firstError(i.err, i.pointIter.Close())
2664 0 : i.pointIter = nil
2665 0 : } else {
2666 0 : // There are range deletions in the batch and we already
2667 0 : // have a batch rangedel iterator. We can update the
2668 0 : // batch rangedel iterator in place.
2669 0 : //
2670 0 : // NB: There may or may not be new range deletions. We
2671 0 : // can't tell based on i.batchRangeDelIter.Count(),
2672 0 : // which is the count of fragmented range deletions, NOT
2673 0 : // the number of range deletions written to the batch
2674 0 : // [i.batch.countRangeDels].
2675 0 : i.batch.batch.initRangeDelIter(&i.opts, &i.batch.rangeDelIter, nextBatchSeqNum)
2676 0 : }
2677 : }
2678 0 : if i.rangeKey != nil && i.batch.batch.countRangeKeys > 0 {
2679 0 : if i.batch.rangeKeyIter.Count() == 0 {
2680 0 : // When we constructed this iterator, there were no range
2681 0 : // keys in the batch. Iterator construction will have
2682 0 : // excluded the batch rangekey iterator from the range key
2683 0 : // iterator stack. We need to reconstruct the range key
2684 0 : // iterator to add i.batchRangeKeyIter into the iterator
2685 0 : // stack.
2686 0 : i.rangeKey.rangeKeyIter.Close()
2687 0 : i.rangeKey = nil
2688 0 : } else {
2689 0 : // There are range keys in the batch and we already
2690 0 : // have a batch rangekey iterator. We can update the batch
2691 0 : // rangekey iterator in place.
2692 0 : //
2693 0 : // NB: There may or may not be new range keys. We can't
2694 0 : // tell based on i.batchRangeKeyIter.Count(), which is the
2695 0 : // count of fragmented range keys, NOT the number of
2696 0 : // range keys written to the batch [i.batch.countRangeKeys].
2697 0 : i.batch.batch.initRangeKeyIter(&i.opts, &i.batch.rangeKeyIter, nextBatchSeqNum)
2698 0 : i.invalidate()
2699 0 : }
2700 : }
2701 : }
2702 : }
2703 :
2704 : // Reset combinedIterState.initialized in case the iterator key types
2705 : // changed. If there's already a range key iterator stack, the combined
2706 : // iterator is already initialized. Additionally, if the iterator is not
2707 : // configured to include range keys, mark it as initialized to signal that
2708 : // lower level iterators should not trigger a switch to combined iteration.
2709 1 : i.lazyCombinedIter.combinedIterState = combinedIterState{
2710 1 : initialized: i.rangeKey != nil || !i.opts.rangeKeys(),
2711 1 : }
2712 1 :
2713 1 : boundsEqual := ((i.opts.LowerBound == nil) == (o.LowerBound == nil)) &&
2714 1 : ((i.opts.UpperBound == nil) == (o.UpperBound == nil)) &&
2715 1 : i.equal(i.opts.LowerBound, o.LowerBound) &&
2716 1 : i.equal(i.opts.UpperBound, o.UpperBound)
2717 1 :
2718 1 : if boundsEqual && o.KeyTypes == i.opts.KeyTypes &&
2719 1 : (i.pointIter != nil || !i.opts.pointKeys()) &&
2720 1 : (i.rangeKey != nil || !i.opts.rangeKeys() || i.opts.KeyTypes == IterKeyTypePointsAndRanges) &&
2721 1 : i.comparer.CompareRangeSuffixes(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) == 0 &&
2722 1 : o.UseL6Filters == i.opts.UseL6Filters {
2723 1 : // The options are identical, so we can likely use the fast path. In
2724 1 : // addition to all the above constraints, we cannot use the fast path if
2725 1 : // configured to perform lazy combined iteration but an indexed batch
2726 1 : // used by the iterator now contains range keys. Lazy combined iteration
2727 1 : // is not compatible with batch range keys because we always need to
2728 1 : // merge the batch's range keys into iteration.
2729 1 : if i.rangeKey != nil || !i.opts.rangeKeys() || i.batch == nil || i.batch.batch.countRangeKeys == 0 {
2730 1 : // Fast path. This preserves the Seek-using-Next optimizations as
2731 1 : // long as the iterator wasn't already invalidated up above.
2732 1 : return
2733 1 : }
2734 : }
2735 : // Slow path.
2736 :
2737 : // The options changed. Save the new ones to i.opts.
2738 1 : if boundsEqual {
2739 1 : // Copying the options into i.opts will overwrite LowerBound and
2740 1 : // UpperBound fields with the user-provided slices. We need to hold on
2741 1 : // to the Pebble-owned slices, so save them and re-set them after the
2742 1 : // copy.
2743 1 : lower, upper := i.opts.LowerBound, i.opts.UpperBound
2744 1 : i.opts = *o
2745 1 : i.opts.LowerBound, i.opts.UpperBound = lower, upper
2746 1 : } else {
2747 1 : i.opts = *o
2748 1 : i.processBounds(o.LowerBound, o.UpperBound)
2749 1 : // Propagate the changed bounds to the existing point iterator.
2750 1 : // NB: We propagate i.opts.{Lower,Upper}Bound, not o.{Lower,Upper}Bound
2751 1 : // because i.opts now point to buffers owned by Pebble.
2752 1 : if i.pointIter != nil {
2753 1 : i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2754 1 : }
2755 1 : if i.rangeKey != nil {
2756 1 : i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2757 1 : }
2758 : }
2759 :
2760 : // Even though this is not a positioning operation, the invalidation of the
2761 : // iterator stack means we cannot optimize Seeks by using Next.
2762 1 : i.invalidate()
2763 1 :
2764 1 : // Iterators created through NewExternalIter have a different iterator
2765 1 : // initialization process.
2766 1 : if i.externalIter != nil {
2767 0 : _ = finishInitializingExternal(i.ctx, i)
2768 0 : return
2769 0 : }
2770 1 : finishInitializingIter(i.ctx, i.alloc)
2771 : }
2772 :
2773 1 : func (i *Iterator) invalidate() {
2774 1 : i.lastPositioningOp = unknownLastPositionOp
2775 1 : i.hasPrefix = false
2776 1 : i.iterKV = nil
2777 1 : i.err = nil
2778 1 : // This switch statement isn't necessary for correctness since callers
2779 1 : // should call a repositioning method. We could have arbitrarily set i.pos
2780 1 : // to one of the values. But it results in more intuitive behavior in
2781 1 : // tests, which do not always reposition.
2782 1 : switch i.pos {
2783 1 : case iterPosCurForward, iterPosNext, iterPosCurForwardPaused:
2784 1 : i.pos = iterPosCurForward
2785 1 : case iterPosCurReverse, iterPosPrev, iterPosCurReversePaused:
2786 1 : i.pos = iterPosCurReverse
2787 : }
2788 1 : i.iterValidityState = IterExhausted
2789 1 : if i.rangeKey != nil {
2790 1 : i.rangeKey.iiter.Invalidate()
2791 1 : i.rangeKey.prevPosHadRangeKey = false
2792 1 : }
2793 : }
2794 :
2795 : // Metrics returns per-iterator metrics.
2796 0 : func (i *Iterator) Metrics() IteratorMetrics {
2797 0 : m := IteratorMetrics{
2798 0 : ReadAmp: 1,
2799 0 : }
2800 0 : if mi, ok := i.iter.(*mergingIter); ok {
2801 0 : m.ReadAmp = len(mi.levels)
2802 0 : }
2803 0 : return m
2804 : }
2805 :
2806 : // ResetStats resets the stats to 0.
2807 0 : func (i *Iterator) ResetStats() {
2808 0 : i.stats = IteratorStats{}
2809 0 : }
2810 :
2811 : // Stats returns the current stats.
2812 0 : func (i *Iterator) Stats() IteratorStats {
2813 0 : return i.stats
2814 0 : }
2815 :
2816 : // CloneOptions configures an iterator constructed through Iterator.Clone.
2817 : type CloneOptions struct {
2818 : // IterOptions, if non-nil, define the iterator options to configure a
2819 : // cloned iterator. If nil, the clone adopts the same IterOptions as the
2820 : // iterator being cloned.
2821 : IterOptions *IterOptions
2822 : // RefreshBatchView may be set to true when cloning an Iterator over an
2823 : // indexed batch. When false, the clone adopts the same (possibly stale)
2824 : // view of the indexed batch as the cloned Iterator. When true, the clone is
2825 : // constructed with a refreshed view of the batch, observing all of the
2826 : // batch's mutations at the time of the Clone. If the cloned iterator was
2827 : // not constructed to read over an indexed batch, RefreshVatchView has no
2828 : // effect.
2829 : RefreshBatchView bool
2830 : }
2831 :
2832 : // Clone creates a new Iterator over the same underlying data, i.e., over the
2833 : // same {batch, memtables, sstables}). The resulting iterator is not positioned.
2834 : // It starts with the same IterOptions, unless opts.IterOptions is set.
2835 : //
2836 : // When called on an Iterator over an indexed batch, the clone's visibility of
2837 : // the indexed batch is determined by CloneOptions.RefreshBatchView. If false,
2838 : // the clone inherits the iterator's current (possibly stale) view of the batch,
2839 : // and callers may call SetOptions to subsequently refresh the clone's view to
2840 : // include all batch mutations. If true, the clone is constructed with a
2841 : // complete view of the indexed batch's mutations at the time of the Clone.
2842 : //
2843 : // Callers can use Clone if they need multiple iterators that need to see
2844 : // exactly the same underlying state of the DB. This should not be used to
2845 : // extend the lifetime of the data backing the original Iterator since that
2846 : // will cause an increase in memory and disk usage (use NewSnapshot for that
2847 : // purpose).
2848 1 : func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error) {
2849 1 : return i.CloneWithContext(context.Background(), opts)
2850 1 : }
2851 :
2852 : // CloneWithContext is like Clone, and additionally accepts a context for
2853 : // tracing.
2854 1 : func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*Iterator, error) {
2855 1 : if opts.IterOptions == nil {
2856 1 : opts.IterOptions = &i.opts
2857 1 : }
2858 1 : if i.batchOnlyIter {
2859 0 : return nil, errors.Errorf("cannot Clone a batch-only Iterator")
2860 0 : }
2861 1 : readState := i.readState
2862 1 : vers := i.version
2863 1 : if readState == nil && vers == nil {
2864 0 : return nil, errors.Errorf("cannot Clone a closed Iterator")
2865 0 : }
2866 : // i is already holding a ref, so there is no race with unref here.
2867 : //
2868 : // TODO(bilal): If the underlying iterator was created on a snapshot, we could
2869 : // grab a reference to the current readState instead of reffing the original
2870 : // readState. This allows us to release references to some zombie sstables
2871 : // and memtables.
2872 1 : if readState != nil {
2873 1 : readState.ref()
2874 1 : }
2875 1 : if vers != nil {
2876 1 : vers.Ref()
2877 1 : }
2878 : // Bundle various structures under a single umbrella in order to allocate
2879 : // them together.
2880 1 : buf := newIterAlloc()
2881 1 : dbi := &buf.dbi
2882 1 : *dbi = Iterator{
2883 1 : ctx: ctx,
2884 1 : opts: *opts.IterOptions,
2885 1 : alloc: buf,
2886 1 : merge: i.merge,
2887 1 : comparer: i.comparer,
2888 1 : readState: readState,
2889 1 : version: vers,
2890 1 : keyBuf: buf.keyBuf,
2891 1 : prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
2892 1 : boundsBuf: buf.boundsBuf,
2893 1 : fc: i.fc,
2894 1 : newIters: i.newIters,
2895 1 : newIterRangeKey: i.newIterRangeKey,
2896 1 : seqNum: i.seqNum,
2897 1 : }
2898 1 : if i.batch != nil {
2899 1 : dbi.batch = &buf.batchState
2900 1 : dbi.batch.batch = i.batch.batch
2901 1 : dbi.batch.batchSeqNum = i.batch.batchSeqNum
2902 1 : // If the caller requested the clone have a current view of the indexed
2903 1 : // batch, set the clone's batch sequence number appropriately.
2904 1 : if opts.RefreshBatchView {
2905 1 : dbi.batch.batchSeqNum = (base.SeqNum(len(i.batch.batch.data)) | base.SeqNumBatchBit)
2906 1 : }
2907 : }
2908 1 : dbi.processBounds(dbi.opts.LowerBound, dbi.opts.UpperBound)
2909 1 : return finishInitializingIter(ctx, buf), nil
2910 : }
2911 :
2912 : // Merge adds all of the argument's statistics to the receiver. It may be used
2913 : // to accumulate stats across multiple iterators.
2914 0 : func (stats *IteratorStats) Merge(o IteratorStats) {
2915 0 : for i := InterfaceCall; i < NumStatsKind; i++ {
2916 0 : stats.ForwardSeekCount[i] += o.ForwardSeekCount[i]
2917 0 : stats.ReverseSeekCount[i] += o.ReverseSeekCount[i]
2918 0 : stats.ForwardStepCount[i] += o.ForwardStepCount[i]
2919 0 : stats.ReverseStepCount[i] += o.ReverseStepCount[i]
2920 0 : }
2921 0 : stats.InternalStats.Merge(o.InternalStats)
2922 0 : stats.RangeKeyStats.Merge(o.RangeKeyStats)
2923 : }
2924 :
2925 0 : func (stats *IteratorStats) String() string {
2926 0 : return redact.StringWithoutMarkers(stats)
2927 0 : }
2928 :
2929 : // SafeFormat implements the redact.SafeFormatter interface.
2930 0 : func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) {
2931 0 : if stats.ReverseSeekCount[InterfaceCall] == 0 && stats.ReverseSeekCount[InternalIterCall] == 0 {
2932 0 : s.Printf("seeked %s times (%s internal)",
2933 0 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InterfaceCall])),
2934 0 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InternalIterCall])),
2935 0 : )
2936 0 : } else {
2937 0 : s.Printf("seeked %s times (%s fwd/%s rev, internal: %s fwd/%s rev)",
2938 0 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InterfaceCall]+stats.ReverseSeekCount[InterfaceCall])),
2939 0 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InterfaceCall])),
2940 0 : humanize.Count.Uint64(uint64(stats.ReverseSeekCount[InterfaceCall])),
2941 0 : humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InternalIterCall])),
2942 0 : humanize.Count.Uint64(uint64(stats.ReverseSeekCount[InternalIterCall])),
2943 0 : )
2944 0 : }
2945 0 : s.SafeString("; ")
2946 0 :
2947 0 : if stats.ReverseStepCount[InterfaceCall] == 0 && stats.ReverseStepCount[InternalIterCall] == 0 {
2948 0 : s.Printf("stepped %s times (%s internal)",
2949 0 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InterfaceCall])),
2950 0 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InternalIterCall])),
2951 0 : )
2952 0 : } else {
2953 0 : s.Printf("stepped %s times (%s fwd/%s rev, internal: %s fwd/%s rev)",
2954 0 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InterfaceCall]+stats.ReverseStepCount[InterfaceCall])),
2955 0 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InterfaceCall])),
2956 0 : humanize.Count.Uint64(uint64(stats.ReverseStepCount[InterfaceCall])),
2957 0 : humanize.Count.Uint64(uint64(stats.ForwardStepCount[InternalIterCall])),
2958 0 : humanize.Count.Uint64(uint64(stats.ReverseStepCount[InternalIterCall])),
2959 0 : )
2960 0 : }
2961 :
2962 0 : if stats.InternalStats != (InternalIteratorStats{}) {
2963 0 : s.SafeString("; ")
2964 0 : stats.InternalStats.SafeFormat(s, verb)
2965 0 : }
2966 0 : if stats.RangeKeyStats != (RangeKeyIteratorStats{}) {
2967 0 : s.SafeString(", ")
2968 0 : stats.RangeKeyStats.SafeFormat(s, verb)
2969 0 : }
2970 : }
2971 :
2972 : // CanDeterministicallySingleDelete takes a valid iterator and examines internal
2973 : // state to determine if a SingleDelete deleting Iterator.Key() would
2974 : // deterministically delete the key. CanDeterministicallySingleDelete requires
2975 : // the iterator to be oriented in the forward direction (eg, the last
2976 : // positioning operation must've been a First, a Seek[Prefix]GE, or a
2977 : // Next[Prefix][WithLimit]).
2978 : //
2979 : // This function does not change the external position of the iterator, and all
2980 : // positioning methods should behave the same as if it was never called. This
2981 : // function will only return a meaningful result the first time it's invoked at
2982 : // an iterator position. This function invalidates the iterator Value's memory,
2983 : // and the caller must not rely on the memory safety of the previous Iterator
2984 : // position.
2985 : //
2986 : // If CanDeterministicallySingleDelete returns true AND the key at the iterator
2987 : // position is not modified between the creation of the Iterator and the commit
2988 : // of a batch containing a SingleDelete over the key, then the caller can be
2989 : // assured that SingleDelete is equivalent to Delete on the local engine, but it
2990 : // may not be true on another engine that received the same writes and with
2991 : // logically equivalent state since this engine may have collapsed multiple SETs
2992 : // into one.
2993 1 : func CanDeterministicallySingleDelete(it *Iterator) (bool, error) {
2994 1 : // This function may only be called once per external iterator position. We
2995 1 : // can validate this by checking the last positioning operation.
2996 1 : if it.lastPositioningOp == internalNextOp {
2997 1 : return false, errors.New("pebble: CanDeterministicallySingleDelete called twice")
2998 1 : }
2999 1 : validity, kind := it.internalNext()
3000 1 : var shadowedBySingleDelete bool
3001 1 : for validity == internalNextValid {
3002 1 : switch kind {
3003 1 : case InternalKeyKindDelete, InternalKeyKindDeleteSized:
3004 1 : // A DEL or DELSIZED tombstone is okay. An internal key
3005 1 : // sequence like SINGLEDEL; SET; DEL; SET can be handled
3006 1 : // deterministically. If there are SETs further down, we
3007 1 : // don't care about them.
3008 1 : return true, nil
3009 0 : case InternalKeyKindSingleDelete:
3010 0 : // A SingleDelete is okay as long as when that SingleDelete was
3011 0 : // written, it was written deterministically (eg, with its own
3012 0 : // CanDeterministicallySingleDelete check). Validate that it was
3013 0 : // written deterministically. We'll allow one set to appear after
3014 0 : // the SingleDelete.
3015 0 : shadowedBySingleDelete = true
3016 0 : validity, kind = it.internalNext()
3017 0 : continue
3018 1 : case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge:
3019 1 : // If we observed a single delete, it's allowed to delete 1 key.
3020 1 : // We'll keep looping to validate that the internal keys beneath the
3021 1 : // already-written single delete are copacetic.
3022 1 : if shadowedBySingleDelete {
3023 0 : shadowedBySingleDelete = false
3024 0 : validity, kind = it.internalNext()
3025 0 : continue
3026 : }
3027 : // We encountered a shadowed SET, SETWITHDEL, MERGE. A SINGLEDEL
3028 : // that deleted the KV at the original iterator position could
3029 : // result in this key becoming visible.
3030 1 : return false, nil
3031 0 : case InternalKeyKindRangeDelete:
3032 0 : // RangeDeletes are handled by the merging iterator and should never
3033 0 : // be observed by the top-level Iterator.
3034 0 : panic(errors.AssertionFailedf("pebble: unexpected range delete"))
3035 0 : case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
3036 0 : // Range keys are interleaved at the maximal sequence number and
3037 0 : // should never be observed within a user key.
3038 0 : panic(errors.AssertionFailedf("pebble: unexpected range key"))
3039 0 : default:
3040 0 : panic(errors.AssertionFailedf("pebble: unexpected key kind: %s", errors.Safe(kind)))
3041 : }
3042 : }
3043 1 : if validity == internalNextError {
3044 1 : return false, it.Error()
3045 1 : }
3046 1 : return true, nil
3047 : }
3048 :
3049 : // internalNextValidity enumerates the potential outcomes of a call to
3050 : // internalNext.
3051 : type internalNextValidity int8
3052 :
3053 : const (
3054 : // internalNextError is returned by internalNext when an error occurred and
3055 : // the caller is responsible for checking iter.Error().
3056 : internalNextError internalNextValidity = iota
3057 : // internalNextExhausted is returned by internalNext when the next internal
3058 : // key is an internal key with a different user key than Iterator.Key().
3059 : internalNextExhausted
3060 : // internalNextValid is returned by internalNext when the internal next
3061 : // found a shadowed internal key with a user key equal to Iterator.Key().
3062 : internalNextValid
3063 : )
3064 :
3065 : // internalNext advances internal Iterator state forward to expose the
3066 : // InternalKeyKind of the next internal key with a user key equal to Key().
3067 : //
3068 : // internalNext is a highly specialized operation and is unlikely to be
3069 : // generally useful. See Iterator.Next for how to reposition the iterator to the
3070 : // next key. internalNext requires the Iterator to be at a valid position in the
3071 : // forward direction (the last positioning operation must've been a First, a
3072 : // Seek[Prefix]GE, or a Next[Prefix][WithLimit] and Valid() must return true).
3073 : //
3074 : // internalNext, unlike all other Iterator methods, exposes internal LSM state.
3075 : // internalNext advances the Iterator's internal iterator to the next shadowed
3076 : // key with a user key equal to Key(). When a key is overwritten or deleted, its
3077 : // removal from the LSM occurs lazily as a part of compactions. internalNext
3078 : // allows the caller to see whether an obsolete internal key exists with the
3079 : // current Key(), and what it's key kind is. Note that the existence of an
3080 : // internal key is nondeterministic and dependent on internal LSM state. These
3081 : // semantics are unlikely to be applicable to almost all use cases.
3082 : //
3083 : // If internalNext finds a key that shares the same user key as Key(), it
3084 : // returns internalNextValid and the internal key's kind. If internalNext
3085 : // encounters an error, it returns internalNextError and the caller is expected
3086 : // to call Iterator.Error() to retrieve it. In all other circumstances,
3087 : // internalNext returns internalNextExhausted, indicating that there are no more
3088 : // additional internal keys with the user key Key().
3089 : //
3090 : // internalNext does not change the external position of the iterator, and a
3091 : // Next operation should behave the same as if internalNext was never called.
3092 : // internalNext does invalidate the iterator Value's memory, and the caller must
3093 : // not rely on the memory safety of the previous Iterator position.
3094 1 : func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) {
3095 1 : i.stats.ForwardStepCount[InterfaceCall]++
3096 1 : if i.err != nil {
3097 1 : return internalNextError, base.InternalKeyKindInvalid
3098 1 : } else if i.iterValidityState != IterValid {
3099 1 : return internalNextExhausted, base.InternalKeyKindInvalid
3100 1 : }
3101 1 : i.lastPositioningOp = internalNextOp
3102 1 :
3103 1 : switch i.pos {
3104 1 : case iterPosCurForward:
3105 1 : i.iterKV = i.iter.Next()
3106 1 : if i.iterKV == nil {
3107 1 : // We check i.iter.Error() here and return an internalNextError enum
3108 1 : // variant so that the caller does not need to check i.iter.Error()
3109 1 : // in the common case that the next internal key has a new user key.
3110 1 : if i.err = i.iter.Error(); i.err != nil {
3111 0 : return internalNextError, base.InternalKeyKindInvalid
3112 0 : }
3113 1 : i.pos = iterPosNext
3114 1 : return internalNextExhausted, base.InternalKeyKindInvalid
3115 1 : } else if i.comparer.Equal(i.iterKV.K.UserKey, i.key) {
3116 1 : return internalNextValid, i.iterKV.Kind()
3117 1 : }
3118 1 : i.pos = iterPosNext
3119 1 : return internalNextExhausted, base.InternalKeyKindInvalid
3120 1 : case iterPosCurReverse, iterPosCurReversePaused, iterPosPrev:
3121 1 : i.err = errors.New("switching from reverse to forward via internalNext is prohibited")
3122 1 : i.iterValidityState = IterExhausted
3123 1 : return internalNextError, base.InternalKeyKindInvalid
3124 1 : case iterPosNext, iterPosCurForwardPaused:
3125 1 : // The previous method already moved onto the next user key. This is
3126 1 : // only possible if
3127 1 : // - the last positioning method was a call to internalNext, and we
3128 1 : // advanced to a new user key.
3129 1 : // - the previous non-internalNext iterator operation encountered a
3130 1 : // range key or merge, forcing an internal Next that found a new
3131 1 : // user key that's not equal to i.Iterator.Key().
3132 1 : return internalNextExhausted, base.InternalKeyKindInvalid
3133 0 : default:
3134 0 : panic("unreachable")
3135 : }
3136 : }
3137 :
3138 : var _ base.IteratorDebug = (*Iterator)(nil)
3139 :
3140 : // DebugTree implements the base.IteratorDebug interface.
3141 0 : func (i *Iterator) DebugTree(tp treeprinter.Node) {
3142 0 : n := tp.Childf("%T(%p)", i, i)
3143 0 : if i.iter != nil {
3144 0 : i.iter.DebugTree(n)
3145 0 : }
3146 0 : if i.pointIter != nil {
3147 0 : i.pointIter.DebugTree(n)
3148 0 : }
3149 : }
|