Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "fmt"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/pebble/internal/base"
14 : "github.com/cockroachdb/pebble/internal/invariants"
15 : "github.com/cockroachdb/pebble/objstorage"
16 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider"
17 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
18 : )
19 :
20 : // singleLevelIterator iterates over an entire table of data. To seek for a given
21 : // key, it first looks in the index for the block that contains that key, and then
22 : // looks inside that block.
23 : type singleLevelIterator struct {
24 : ctx context.Context
25 : cmp Compare
26 : // Global lower/upper bound for the iterator.
27 : lower []byte
28 : upper []byte
29 : bpfs *BlockPropertiesFilterer
30 : // Per-block lower/upper bound. Nil if the bound does not apply to the block
31 : // because we determined the block lies completely within the bound.
32 : blockLower []byte
33 : blockUpper []byte
34 : reader *Reader
35 : // vState will be set iff the iterator is constructed for virtual sstable
36 : // iteration.
37 : vState *virtualState
38 : // endKeyInclusive is set to force the iterator to treat the upper field as
39 : // inclusive while iterating instead of exclusive.
40 : endKeyInclusive bool
41 : index blockIter
42 : indexFilterRH objstorage.ReadHandle
43 : indexFilterRHPrealloc objstorageprovider.PreallocatedReadHandle
44 : data blockIter
45 : dataRH objstorage.ReadHandle
46 : dataRHPrealloc objstorageprovider.PreallocatedReadHandle
47 : // dataBH refers to the last data block that the iterator considered
48 : // loading. It may not actually have loaded the block, due to an error or
49 : // because it was considered irrelevant.
50 : dataBH BlockHandle
51 : vbReader *valueBlockReader
52 : // vbRH is the read handle for value blocks, which are in a different
53 : // part of the sstable than data blocks.
54 : vbRH objstorage.ReadHandle
55 : vbRHPrealloc objstorageprovider.PreallocatedReadHandle
56 : err error
57 : closeHook func(i Iterator) error
58 : // stats and iterStats are slightly different. stats is a shared struct
59 : // supplied from the outside, and represents stats for the whole iterator
60 : // tree and can be reset from the outside (e.g. when the pebble.Iterator is
61 : // being reused). It is currently only provided when the iterator tree is
62 : // rooted at pebble.Iterator. iterStats is this sstable iterator's private
63 : // stats that are reported to a CategoryStatsCollector when this iterator is
64 : // closed. More paths are instrumented with iterStats, since the
65 : // CategoryStatsCollector it needs is provided by the
66 : // tableCacheContainer (which is more universally used).
67 : stats *base.InternalIteratorStats
68 : iterStats iterStatsAccumulator
69 : bufferPool *BufferPool
70 :
71 : // boundsCmp and positionedUsingLatestBounds are for optimizing iteration
72 : // that uses multiple adjacent bounds. The seek after setting a new bound
73 : // can use the fact that the iterator is either within the previous bounds
74 : // or exactly one key before or after the bounds. If the new bounds are
75 : // after/before the previous bounds, and we are already positioned at a
76 : // block that is relevant for the new bounds, we can try to first position
77 : // using Next/Prev (repeatedly) instead of doing a more expensive seek.
78 : //
79 : // When there are wide files at higher levels that match the bounds
80 : // but don't have any data for the bounds, we will already be
81 : // positioned at the key beyond the bounds and won't need to do much
82 : // work -- given that most data is in L6, such files are likely to
83 : // dominate the performance of the mergingIter, and may be the main
84 : // benefit of this performance optimization (of course it also helps
85 : // when the file that has the data has successive seeks that stay in
86 : // the same block).
87 : //
88 : // Specifically, boundsCmp captures the relationship between the previous
89 : // and current bounds, if the iterator had been positioned after setting
90 : // the previous bounds. If it was not positioned, i.e., Seek/First/Last
91 : // were not called, we don't know where it is positioned and cannot
92 : // optimize.
93 : //
94 : // Example: Bounds moving forward, and iterator exhausted in forward direction.
95 : // bounds = [f, h), ^ shows block iterator position
96 : // file contents [ a b c d e f g h i j k ]
97 : // ^
98 : // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is
99 : // set to +1. SeekGE(j) can use next (the optimization also requires that j
100 : // is within the block; that is not needed for correctness, but to limit
101 : // the optimization to cases where it will actually be an optimization).
102 : //
103 : // Example: Bounds moving forward.
104 : // bounds = [f, h), ^ shows block iterator position
105 : // file contents [ a b c d e f g h i j k ]
106 : // ^
107 : // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is
108 : // set to +1. SeekGE(j) can use next.
109 : //
110 : // Example: Bounds moving forward, but iterator not positioned using previous
111 : // bounds.
112 : // bounds = [f, h), ^ shows block iterator position
113 : // file contents [ a b c d e f g h i j k ]
114 : // ^
115 : // new bounds = [i, j). Iterator is at j since it was never positioned using
116 : // [f, h). So positionedUsingLatestBounds=false, and boundsCmp is set to 0.
117 : // SeekGE(i) will not use next.
118 : //
119 : // Example: Bounds moving forward and sparse file
120 : // bounds = [f, h), ^ shows block iterator position
121 : // file contents [ a z ]
122 : // ^
123 : // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is
124 : // set to +1. SeekGE(j) notices that the iterator is already past j and does
125 : // not need to do anything.
126 : //
127 : // Similar examples can be constructed for backward iteration.
128 : //
129 : // This notion of exactly one key before or after the bounds is not quite
130 : // true when block properties are used to ignore blocks. In that case we
131 : // can't stop precisely at the first block that is past the bounds since
132 : // we are using the index entries to enforce the bounds.
133 : //
134 : // e.g. 3 blocks with keys [b, c] [f, g], [i, j, k] with index entries d,
135 : // h, l. And let the lower bound be k, and we are reverse iterating. If
136 : // the block [i, j, k] is ignored due to the block interval annotations we
137 : // do need to move the index to block [f, g] since the index entry for the
138 : // [i, j, k] block is l which is not less than the lower bound of k. So we
139 : // have passed the entries i, j.
140 : //
141 : // This behavior is harmless since the block property filters are fixed
142 : // for the lifetime of the iterator so i, j are irrelevant. In addition,
143 : // the current code will not load the [f, g] block, so the seek
144 : // optimization that attempts to use Next/Prev does not apply anyway.
145 : boundsCmp int
146 : positionedUsingLatestBounds bool
147 :
148 : // exhaustedBounds represents whether the iterator is exhausted for
149 : // iteration by reaching the upper or lower bound. +1 when exhausted
150 : // the upper bound, -1 when exhausted the lower bound, and 0 when
151 : // neither. exhaustedBounds is also used for the TrySeekUsingNext
152 : // optimization in twoLevelIterator and singleLevelIterator. Care should be
153 : // taken in setting this in twoLevelIterator before calling into
154 : // singleLevelIterator, given that these two iterators share this field.
155 : exhaustedBounds int8
156 :
157 : // useFilter specifies whether the filter block in this sstable, if present,
158 : // should be used for prefix seeks or not. In some cases it is beneficial
159 : // to skip a filter block even if it exists (e.g. if the probability of a
160 : // match is high).
161 : useFilter bool
162 : lastBloomFilterMatched bool
163 :
164 : transforms IterTransforms
165 :
166 : // inPool is set to true before putting the iterator in the reusable pool;
167 : // used to detect double-close.
168 : inPool bool
169 : }
170 :
171 : // singleLevelIterator implements the base.InternalIterator interface.
172 : var _ base.InternalIterator = (*singleLevelIterator)(nil)
173 :
174 : // init initializes a singleLevelIterator for reading from the table. It is
175 : // synonymous with Reader.NewIter, but allows reusing the iterator
176 : // between different Readers.
177 : //
178 : // Note that lower, upper passed into init have nothing to do with virtual sstable
179 : // bounds. If the virtualState passed in is not nil, then virtual sstable bounds
180 : // will be enforced.
181 : func (i *singleLevelIterator) init(
182 : ctx context.Context,
183 : r *Reader,
184 : v *virtualState,
185 : transforms IterTransforms,
186 : lower, upper []byte,
187 : filterer *BlockPropertiesFilterer,
188 : useFilter bool,
189 : stats *base.InternalIteratorStats,
190 : categoryAndQoS CategoryAndQoS,
191 : statsCollector *CategoryStatsCollector,
192 : rp ReaderProvider,
193 : bufferPool *BufferPool,
194 1 : ) error {
195 1 : if r.err != nil {
196 0 : return r.err
197 0 : }
198 1 : i.iterStats.init(categoryAndQoS, statsCollector)
199 1 : i.indexFilterRH = objstorageprovider.UsePreallocatedReadHandle(
200 1 : ctx, r.readable, objstorage.ReadBeforeForIndexAndFilter, &i.indexFilterRHPrealloc)
201 1 : indexH, err := r.readIndex(ctx, i.indexFilterRH, stats, &i.iterStats)
202 1 : if err != nil {
203 0 : return err
204 0 : }
205 1 : if v != nil {
206 1 : i.vState = v
207 1 : i.endKeyInclusive, lower, upper = v.constrainBounds(lower, upper, false /* endInclusive */)
208 1 : }
209 :
210 1 : i.inPool = false
211 1 : i.ctx = ctx
212 1 : i.lower = lower
213 1 : i.upper = upper
214 1 : i.bpfs = filterer
215 1 : i.useFilter = useFilter
216 1 : i.reader = r
217 1 : i.cmp = r.Compare
218 1 : i.stats = stats
219 1 : i.transforms = transforms
220 1 : i.bufferPool = bufferPool
221 1 : err = i.index.initHandle(i.cmp, r.Split, indexH, transforms)
222 1 : if err != nil {
223 0 : // blockIter.Close releases indexH and always returns a nil error
224 0 : _ = i.index.Close()
225 0 : return err
226 0 : }
227 1 : i.dataRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, objstorage.NoReadBefore, &i.dataRHPrealloc)
228 1 : if r.tableFormat >= TableFormatPebblev3 {
229 1 : if r.Properties.NumValueBlocks > 0 {
230 1 : // NB: we cannot avoid this ~248 byte allocation, since valueBlockReader
231 1 : // can outlive the singleLevelIterator due to being embedded in a
232 1 : // LazyValue. This consumes ~2% in microbenchmark CPU profiles, but we
233 1 : // should only optimize this if it shows up as significant in end-to-end
234 1 : // CockroachDB benchmarks, since it is tricky to do so. One possibility
235 1 : // is that if many sstable iterators only get positioned at latest
236 1 : // versions of keys, and therefore never expose a LazyValue that is
237 1 : // separated to their callers, they can put this valueBlockReader into a
238 1 : // sync.Pool.
239 1 : i.vbReader = &valueBlockReader{
240 1 : bpOpen: i,
241 1 : rp: rp,
242 1 : vbih: r.valueBIH,
243 1 : stats: stats,
244 1 : }
245 1 : i.data.lazyValueHandling.vbr = i.vbReader
246 1 : i.vbRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, objstorage.NoReadBefore, &i.vbRHPrealloc)
247 1 : }
248 1 : i.data.lazyValueHandling.hasValuePrefix = true
249 : }
250 1 : return nil
251 : }
252 :
253 : // Helper function to check if keys returned from the iterator are within virtual bounds.
254 1 : func (i *singleLevelIterator) maybeVerifyKey(kv *base.InternalKV) *base.InternalKV {
255 1 : if invariants.Enabled && kv != nil && i.vState != nil {
256 1 : key := kv.K.UserKey
257 1 : v := i.vState
258 1 : lc := i.cmp(key, v.lower.UserKey)
259 1 : uc := i.cmp(key, v.upper.UserKey)
260 1 : if lc < 0 || uc > 0 || (uc == 0 && v.upper.IsExclusiveSentinel()) {
261 0 : panic(fmt.Sprintf("key %q out of singleLevelIterator virtual bounds %s %s", key, v.lower.UserKey, v.upper.UserKey))
262 : }
263 : }
264 1 : return kv
265 : }
266 :
267 : // setupForCompaction sets up the singleLevelIterator for use with compactionIter.
268 : // Currently, it skips readahead ramp-up. It should be called after init is called.
269 1 : func (i *singleLevelIterator) setupForCompaction() {
270 1 : i.dataRH.SetupForCompaction()
271 1 : if i.vbRH != nil {
272 1 : i.vbRH.SetupForCompaction()
273 1 : }
274 : }
275 :
276 1 : func (i *singleLevelIterator) resetForReuse() singleLevelIterator {
277 1 : return singleLevelIterator{
278 1 : index: i.index.resetForReuse(),
279 1 : data: i.data.resetForReuse(),
280 1 : inPool: true,
281 1 : }
282 1 : }
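The reuse cycle pairs resetForReuse with Close. A minimal usage sketch, assuming singleLevelIterPool is this package's sync.Pool of iterators (Close below resets the struct and returns it to that pool):

    it := singleLevelIterPool.Get().(*singleLevelIterator)
    // ... it.init(...) and iteration ...
    _ = it.Close() // resets fields via resetForReuse, sets inPool=true,
                   // and puts the iterator back into singleLevelIterPool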
283 :
284 1 : func (i *singleLevelIterator) initBounds() {
285 1 : // Trim the iteration bounds for the current block. We don't have to check
286 1 : // the bounds on each iteration if the block is entirely contained within the
287 1 : // iteration bounds.
288 1 : i.blockLower = i.lower
289 1 : if i.blockLower != nil {
290 1 : kv := i.data.First()
291 1 : // TODO(radu): this should be <= 0
292 1 : if kv != nil && i.cmp(i.blockLower, kv.K.UserKey) < 0 {
293 1 : // The lower-bound is less than the first key in the block. No need
294 1 : // to check the lower-bound again for this block.
295 1 : i.blockLower = nil
296 1 : }
297 : }
298 1 : i.blockUpper = i.upper
299 1 : // TODO(radu): this should be >= 0 if blockUpper is inclusive.
300 1 : if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 {
301 1 : // The upper-bound is greater than the index key which itself is greater
302 1 : // than or equal to every key in the block. No need to check the
303 1 : // upper-bound again for this block. Even if blockUpper is inclusive
304 1 : // because of upper being inclusive, we can still safely set blockUpper
305 1 : // to nil here.
306 1 : i.blockUpper = nil
307 1 : }
308 : }
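A worked sketch of the trimming above, with assumed keys: iterator bounds lower="c", upper="m"; the current block starts at key "e" and its index separator is "j".

    cmp := bytes.Compare
    lower, upper := []byte("c"), []byte("m")
    firstKey, indexKey := []byte("e"), []byte("j") // block's first key, separator
    blockLower, blockUpper := lower, upper
    if cmp(blockLower, firstKey) < 0 {
        blockLower = nil // no lower-bound checks needed inside this block
    }
    if cmp(blockUpper, indexKey) > 0 {
        blockUpper = nil // no upper-bound checks needed inside this block
    }
    // Both end up nil here; with lower = "f" instead, blockLower stays "f"
    // and every step within this block must compare against it.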
309 :
310 1 : func (i *singleLevelIterator) initBoundsForAlreadyLoadedBlock() {
311 1 : // TODO(radu): determine automatically if we need to call First or not and
312 1 : // unify this function with initBounds().
313 1 : if i.data.getFirstUserKey() == nil {
314 0 : panic("initBoundsForAlreadyLoadedBlock must not be called on empty or corrupted block")
315 : }
316 1 : i.blockLower = i.lower
317 1 : if i.blockLower != nil {
318 1 : firstUserKey := i.data.getFirstUserKey()
319 1 : // TODO(radu): this should be <= 0
320 1 : if firstUserKey != nil && i.cmp(i.blockLower, firstUserKey) < 0 {
321 1 : // The lower-bound is less than the first key in the block. No need
322 1 : // to check the lower-bound again for this block.
323 1 : i.blockLower = nil
324 1 : }
325 : }
326 1 : i.blockUpper = i.upper
327 1 : // TODO(radu): this should be >= 0 if blockUpper is inclusive.
328 1 : if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 {
329 1 : // The upper-bound is greater than the index key which itself is greater
330 1 : // than or equal to every key in the block. No need to check the
331 1 : // upper-bound again for this block.
332 1 : i.blockUpper = nil
333 1 : }
334 : }
335 :
336 : // Deterministic disabling of the bounds-based optimization that avoids seeking.
337 : // Uses the iterator pointer, since we want diversity in iterator behavior for
338 : // the same SetBounds call. Used for tests.
339 1 : func disableBoundsOpt(bound []byte, ptr uintptr) bool {
340 1 : // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
341 1 : simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
342 1 : return bound[len(bound)-1]&byte(1) == 0 && simpleHash == 0
343 1 : }
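A standalone illustration of the coin flip above: multiplying by 11400714819323198485 (roughly 2^64 divided by the golden ratio) scrambles the pointer bits, and >>63 keeps only the top bit, so about half of distinct iterator addresses hash to 0. Combined with the parity of the bound's last byte, roughly a quarter of (iterator, bound) pairs disable the optimization.

    func fibTopBit(ptr uintptr) uint64 {
        // Same mixing as disableBoundsOpt: Fibonacci multiply, keep the top bit.
        return (11400714819323198485 * uint64(ptr)) >> 63
    }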
344 :
345 : // ensureBoundsOptDeterminism provides a facility for suppressing the random
346 : // disabling performed by disableBoundsOpt, for tests that require
347 : // deterministic iterator behavior. Some unit tests examine internal iterator
348 : // state and require this behavior to be deterministic.
349 : var ensureBoundsOptDeterminism bool
350 :
351 : // SetBoundsWithSyntheticPrefix indicates whether this iterator requires keys
352 : // passed to its SetBounds() method by a prefix rewriting wrapper to be *not*
353 : // rewritten to be in terms of this iterator's content, but instead be passed
354 : // as-is, i.e. with the synthetic prefix still on them.
355 : //
356 : // This allows an optimization when this iterator is passing these bounds on to
357 : // a vState to additionally constrain them. In said vState, passed bounds are
358 : // combined with the vState bounds which are in terms of the rewritten prefix.
359 : // If the caller rewrote bounds to be in terms of content prefix and SetBounds
360 : // passed those to vState, the vState would need to *un*rewrite them back to the
361 : // synthetic prefix in order to combine them with the vState bounds. Thus, if
362 : // this iterator knows bounds will be passed to vState, it can signal that
363 : // they should be passed without being rewritten, to skip converting to and fro.
364 0 : func (i *singleLevelIterator) SetBoundsWithSyntheticPrefix() bool {
365 0 : return i.vState != nil
366 0 : }
367 :
368 : // SetBounds implements internalIterator.SetBounds, as documented in the pebble
369 : // package. Note that the upper field is exclusive.
370 1 : func (i *singleLevelIterator) SetBounds(lower, upper []byte) {
371 1 : i.boundsCmp = 0
372 1 : if i.vState != nil {
373 1 : // If the reader is constructed for a virtual sstable, then we must
374 1 : // constrain the bounds of the reader. For physical sstables, the bounds
375 1 : // can be wider than the actual sstable's bounds because we won't
376 1 : // accidentally expose additional keys as there are no additional keys.
377 1 : i.endKeyInclusive, lower, upper = i.vState.constrainBounds(
378 1 : lower, upper, false,
379 1 : )
380 1 : } else {
381 1 : // TODO(bananabrick): Figure out the logic here to enable the boundsCmp
382 1 : // optimization for virtual sstables.
383 1 : if i.positionedUsingLatestBounds {
384 1 : if i.upper != nil && lower != nil && i.cmp(i.upper, lower) <= 0 {
385 1 : i.boundsCmp = +1
386 1 : if invariants.Enabled && !ensureBoundsOptDeterminism &&
387 1 : disableBoundsOpt(lower, uintptr(unsafe.Pointer(i))) {
388 1 : i.boundsCmp = 0
389 1 : }
390 1 : } else if i.lower != nil && upper != nil && i.cmp(upper, i.lower) <= 0 {
391 1 : i.boundsCmp = -1
392 1 : if invariants.Enabled && !ensureBoundsOptDeterminism &&
393 1 : disableBoundsOpt(upper, uintptr(unsafe.Pointer(i))) {
394 1 : i.boundsCmp = 0
395 1 : }
396 : }
397 : }
398 : }
399 :
400 1 : i.positionedUsingLatestBounds = false
401 1 : i.lower = lower
402 1 : i.upper = upper
403 1 : i.blockLower = nil
404 1 : i.blockUpper = nil
405 : }
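The boundsCmp computation above reduces to a standalone predicate. A minimal sketch (hypothetical helper; it omits the invariants-build randomization via disableBoundsOpt):

    func boundsCmpFor(
        cmp Compare, oldLower, oldUpper, newLower, newUpper []byte, positioned bool,
    ) int {
        if !positioned {
            return 0 // position relative to the previous bounds is unknown
        }
        if oldUpper != nil && newLower != nil && cmp(oldUpper, newLower) <= 0 {
            return +1 // bounds moved forward: the next seek may use Next
        }
        if oldLower != nil && newUpper != nil && cmp(newUpper, oldLower) <= 0 {
            return -1 // bounds moved backward: the next seek may use Prev
        }
        return 0
    }

For example, old bounds [f, h) and new bounds [j, k) give cmp("h", "j") <= 0, so boundsCmp = +1.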
406 :
407 0 : func (i *singleLevelIterator) SetContext(ctx context.Context) {
408 0 : i.ctx = ctx
409 0 : }
410 :
411 : // loadBlock loads the block at the current index position and leaves i.data
412 : // unpositioned. If unsuccessful, it sets i.err to any error encountered, which
413 : // may be nil if we have simply exhausted the entire table.
414 1 : func (i *singleLevelIterator) loadBlock(dir int8) loadBlockResult {
415 1 : if !i.index.valid() {
416 0 : // Ensure the data block iterator is invalidated even if loading of the block
417 0 : // fails.
418 0 : i.data.invalidate()
419 0 : return loadBlockFailed
420 0 : }
421 : // Load the next block.
422 1 : v := i.index.value()
423 1 : bhp, err := decodeBlockHandleWithProperties(v.InPlaceValue())
424 1 : if i.dataBH == bhp.BlockHandle && i.data.valid() {
425 1 : // We're already at the data block we want to load. Reset bounds in case
426 1 : // they changed since the last seek, but don't reload the block from cache
427 1 : // or disk.
428 1 : //
429 1 : // It's safe to leave i.data in its original state here, as all callers to
430 1 : // loadBlock make an absolute positioning call (i.e. a seek, first, or last)
431 1 : // to `i.data` right after loadBlock returns loadBlockOK.
432 1 : i.initBounds()
433 1 : return loadBlockOK
434 1 : }
435 : // Ensure the data block iterator is invalidated even if loading of the block
436 : // fails.
437 1 : i.data.invalidate()
438 1 : i.dataBH = bhp.BlockHandle
439 1 : if err != nil {
440 0 : i.err = errCorruptIndexEntry(err)
441 0 : return loadBlockFailed
442 0 : }
443 1 : if i.bpfs != nil {
444 1 : intersects, err := i.bpfs.intersects(bhp.Props)
445 1 : if err != nil {
446 0 : i.err = errCorruptIndexEntry(err)
447 0 : return loadBlockFailed
448 0 : }
449 1 : if intersects == blockMaybeExcluded {
450 1 : intersects = i.resolveMaybeExcluded(dir)
451 1 : }
452 1 : if intersects == blockExcluded {
453 1 : return loadBlockIrrelevant
454 1 : }
455 : // blockIntersects
456 : }
457 1 : ctx := objiotracing.WithBlockType(i.ctx, objiotracing.DataBlock)
458 1 : block, err := i.reader.readBlock(
459 1 : ctx, i.dataBH, nil /* transform */, i.dataRH, i.stats, &i.iterStats, i.bufferPool)
460 1 : if err != nil {
461 0 : i.err = err
462 0 : return loadBlockFailed
463 0 : }
464 1 : i.err = i.data.initHandle(i.cmp, i.reader.Split, block, i.transforms)
465 1 : if i.err != nil {
466 0 : // The block is partially loaded, and we don't want it to appear valid.
467 0 : i.data.invalidate()
468 0 : return loadBlockFailed
469 0 : }
470 1 : i.initBounds()
471 1 : return loadBlockOK
472 : }
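Every caller below follows the same three-way dispatch on the result. A condensed sketch of that calling convention (hypothetical method; bound enforcement at the index key is elided):

    func (i *singleLevelIterator) seekAfterLoadSketch(key []byte) *base.InternalKV {
        switch i.loadBlock(+1) {
        case loadBlockFailed:
            return nil // i.err records the failure; a re-seek is required
        case loadBlockIrrelevant:
            // Block-property filters excluded the block; skip ahead.
            return i.skipForward()
        }
        // loadBlockOK: i.data is loaded but unpositioned, so an absolute
        // positioning call (First/Last/SeekGE/SeekLT) must follow.
        return i.data.SeekGE(key, base.SeekGEFlagsNone)
    }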
473 :
474 : // readBlockForVBR implements the blockProviderWhenOpen interface for use by
475 : // the valueBlockReader.
476 : func (i *singleLevelIterator) readBlockForVBR(
477 : h BlockHandle, stats *base.InternalIteratorStats,
478 1 : ) (bufferHandle, error) {
479 1 : ctx := objiotracing.WithBlockType(i.ctx, objiotracing.ValueBlock)
480 1 : return i.reader.readBlock(ctx, h, nil, i.vbRH, stats, &i.iterStats, i.bufferPool)
481 1 : }
482 :
483 : // resolveMaybeExcluded is invoked when the block-property filterer has found
484 : // that a block is excluded according to its properties but only if its bounds
485 : // fall within the filter's current bounds. This function consults the
486 : // appropriate bound, depending on the iteration direction, and returns either
487 : // `blockIntersects` or `blockExcluded`.
488 1 : func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult {
489 1 : // TODO(jackson): We could first try comparing to top-level index block's
490 1 : // key, and if within bounds avoid per-data block key comparisons.
491 1 :
492 1 : // This iterator is configured with a bound-limited block property
493 1 : // filter. The bpf determined this block could be excluded from
494 1 : // iteration based on the property encoded in the block handle.
495 1 : // However, we still need to determine if the block is wholly
496 1 : // contained within the filter's key bounds.
497 1 : //
498 1 : // External guarantees ensure all the block's keys are ≥ the
499 1 : // filter's lower bound during forward iteration, and that all the
500 1 : // block's keys are < the filter's upper bound during backward
501 1 : // iteration. We only need to determine if the opposite bound is
502 1 : // also met.
503 1 : //
504 1 : // The index separator in index.Key() provides an inclusive
505 1 : // upper-bound for the data block's keys, guaranteeing that all its
506 1 : // keys are ≤ index.Key(). For forward iteration, this is all we
507 1 : // need.
508 1 : if dir > 0 {
509 1 : // Forward iteration.
510 1 : if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.index.Key().UserKey) {
511 1 : return blockExcluded
512 1 : }
513 1 : return blockIntersects
514 : }
515 :
516 : // Reverse iteration.
517 : //
518 : // Because we're iterating in the reverse direction, we don't yet have
519 : // enough context available to determine if the block is wholly contained
520 : // within its bounds. This case arises only during backward iteration,
521 : // because of the way the index is structured.
522 : //
523 : // Consider a bound-limited bpf limited to the bounds [b,d), loading the
524 : // block with separator `c`. During reverse iteration, the guarantee that
525 : // all the block's keys are < `d` is externally provided, but no guarantee
526 : // is made on the bpf's lower bound. The separator `c` only provides an
527 : // inclusive upper bound on the block's keys, indicating that the
528 : // corresponding block handle points to a block containing only keys ≤ `c`.
529 : //
530 : // To establish a lower bound, we step the index backwards to read the
531 : // previous block's separator, which provides an inclusive lower bound on
532 : // the original block's keys. Afterwards, we step forward to restore our
533 : // index position.
534 1 : if peekKV := i.index.Prev(); peekKV == nil {
535 1 : // The original block points to the first block of this index block. If
536 1 : // there's a two-level index, it could potentially provide a lower
537 1 : // bound, but the code refactoring necessary to read it doesn't seem
538 1 : // worth the payoff. We fall through to loading the block.
539 1 : } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKV.K.UserKey) {
540 1 : // The lower-bound on the original block falls within the filter's
541 1 : // bounds, and we can skip the block (after restoring our current index
542 1 : // position).
543 1 : _ = i.index.Next()
544 1 : return blockExcluded
545 1 : }
546 1 : _ = i.index.Next()
547 1 : return blockIntersects
548 : }
549 :
550 : // The number of times to call Next/Prev in a block before giving up and seeking.
551 : // The value of 4 is arbitrary.
552 : // TODO(sumeer): experiment with dynamic adjustment based on the history of
553 : // seeks for a particular iterator.
554 : const numStepsBeforeSeek = 4
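The heuristic generalizes beyond block iterators. A minimal sketch on a sorted string slice: take up to numStepsBeforeSeek cheap forward steps toward the target, and only fall back to a binary-search "seek" (sort.SearchStrings) if stepping was not enough.

    func stepOrSeek(sorted []string, cur int, key string) int {
        for j := 0; j < numStepsBeforeSeek && cur < len(sorted); j++ {
            if sorted[cur] >= key {
                return cur // reached the target by cheap, cache-friendly steps
            }
            cur++
        }
        // Stepping was not enough; pay for the O(log n) seek.
        return cur + sort.SearchStrings(sorted[cur:], key)
    }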
555 :
556 : func (i *singleLevelIterator) trySeekGEUsingNextWithinBlock(
557 : key []byte,
558 1 : ) (kv *base.InternalKV, done bool) {
559 1 : kv = i.data.KV()
560 1 : for j := 0; j < numStepsBeforeSeek; j++ {
561 1 : curKeyCmp := i.cmp(kv.K.UserKey, key)
562 1 : if curKeyCmp >= 0 {
563 1 : if i.blockUpper != nil {
564 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
565 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
566 1 : i.exhaustedBounds = +1
567 1 : return nil, true
568 1 : }
569 : }
570 1 : return kv, true
571 : }
572 1 : kv = i.data.Next()
573 1 : if kv == nil {
574 1 : break
575 : }
576 : }
577 1 : return kv, false
578 : }
579 :
580 : func (i *singleLevelIterator) trySeekLTUsingPrevWithinBlock(
581 : key []byte,
582 1 : ) (kv *base.InternalKV, done bool) {
583 1 : kv = i.data.KV()
584 1 : for j := 0; j < numStepsBeforeSeek; j++ {
585 1 : curKeyCmp := i.cmp(kv.K.UserKey, key)
586 1 : if curKeyCmp < 0 {
587 1 : if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 {
588 1 : i.exhaustedBounds = -1
589 1 : return nil, true
590 1 : }
591 1 : return kv, true
592 : }
593 1 : kv = i.data.Prev()
594 1 : if kv == nil {
595 1 : break
596 : }
597 : }
598 1 : return kv, false
599 : }
600 :
601 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
602 : // package. Note that SeekGE only checks the upper bound. It is up to the
603 : // caller to ensure that key is greater than or equal to the lower bound.
604 1 : func (i *singleLevelIterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
605 1 : if i.vState != nil {
606 1 : // Callers of SeekGE don't know about virtual sstable bounds, so we may
607 1 : // have to internally restrict the bounds.
608 1 : //
609 1 : // TODO(bananabrick): We can optimize this check away for the level iter
610 1 : // if necessary.
611 1 : if i.cmp(key, i.lower) < 0 {
612 1 : key = i.lower
613 1 : }
614 : }
615 :
616 1 : if flags.TrySeekUsingNext() {
617 1 : // The i.exhaustedBounds comparison indicates that the upper bound was
618 1 : // reached. The i.data.isDataInvalidated() indicates that the sstable was
619 1 : // exhausted.
620 1 : if (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) && i.err == nil {
621 1 : // Already exhausted, so return nil.
622 1 : return nil
623 1 : }
624 1 : if i.err != nil {
625 0 : // The current iterator position cannot be used.
626 0 : flags = flags.DisableTrySeekUsingNext()
627 0 : }
628 : // INVARIANT: flags.TrySeekUsingNext() => i.err == nil &&
629 : // i.exhaustedBounds != +1 && !i.data.isDataInvalidated(). That is,
630 : // data-exhausted and bounds-exhausted, as defined earlier, are both
631 : // false. This makes it safe to clear out i.exhaustedBounds and i.err
632 : // before calling into seekGEHelper.
633 : }
634 :
635 1 : i.exhaustedBounds = 0
636 1 : i.err = nil // clear cached iteration error
637 1 : boundsCmp := i.boundsCmp
638 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
639 1 : i.boundsCmp = 0
640 1 : i.positionedUsingLatestBounds = true
641 1 : return i.seekGEHelper(key, boundsCmp, flags)
642 : }
643 :
644 : // seekGEHelper contains the common functionality for SeekGE and SeekPrefixGE.
645 : func (i *singleLevelIterator) seekGEHelper(
646 : key []byte, boundsCmp int, flags base.SeekGEFlags,
647 1 : ) *base.InternalKV {
648 1 : // Invariant: trySeekUsingNext => !i.data.isDataInvalidated() && i.exhaustedBounds != +1
649 1 :
650 1 : // SeekGE performs various step-instead-of-seeking optimizations: e.g. enabled
651 1 : // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp).
652 1 :
653 1 : var dontSeekWithinBlock bool
654 1 : if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() &&
655 1 : boundsCmp > 0 && i.cmp(key, i.index.Key().UserKey) <= 0 {
656 1 : // Fast-path: The bounds have moved forward and this SeekGE is
657 1 : // respecting the lower bound (guaranteed by Iterator). We know that
658 1 : // the iterator must already be positioned within or just outside the
659 1 : // previous bounds. Therefore it cannot be positioned at a block (or
660 1 : // the position within that block) that is ahead of the seek position.
661 1 : // However it can be positioned at an earlier block. This fast-path to
662 1 : // use Next() on the block is only applied when we are already at the
663 1 : // block that the slow-path (the else-clause) would load -- this is
664 1 : // the motivation for the i.cmp(key, i.index.Key().UserKey) <= 0
665 1 : // predicate.
666 1 : i.initBoundsForAlreadyLoadedBlock()
667 1 : kv, done := i.trySeekGEUsingNextWithinBlock(key)
668 1 : if done {
669 1 : return kv
670 1 : }
671 1 : if kv == nil {
672 1 : // Done with this block.
673 1 : dontSeekWithinBlock = true
674 1 : }
675 1 : } else {
676 1 : // Cannot use bounds monotonicity. But may be able to optimize if
677 1 : // caller claimed externally known invariant represented by
678 1 : // flags.TrySeekUsingNext().
679 1 : if flags.TrySeekUsingNext() {
680 1 : // seekPrefixGE or SeekGE has already ensured
681 1 : // !i.data.isDataInvalidated() && i.exhaustedBounds != +1
682 1 : curr := i.data.KV()
683 1 : less := i.cmp(curr.K.UserKey, key) < 0
684 1 : // We could be more sophisticated and confirm that the seek
685 1 : // position is within the current block before applying this
686 1 : // optimization. But there may be some benefit even if it is in
687 1 : // the next block, since we can avoid seeking i.index.
688 1 : for j := 0; less && j < numStepsBeforeSeek; j++ {
689 1 : curr = i.Next()
690 1 : if curr == nil {
691 1 : return nil
692 1 : }
693 1 : less = i.cmp(curr.K.UserKey, key) < 0
694 : }
695 1 : if !less {
696 1 : if i.blockUpper != nil {
697 1 : cmp := i.cmp(curr.K.UserKey, i.blockUpper)
698 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
699 0 : i.exhaustedBounds = +1
700 0 : return nil
701 0 : }
702 : }
703 1 : return curr
704 : }
705 : }
706 :
707 : // Slow-path.
708 :
709 1 : var ikv *base.InternalKV
710 1 : if ikv = i.index.SeekGE(key, flags.DisableTrySeekUsingNext()); ikv == nil {
711 1 : // The target key is greater than any key in the index block.
712 1 : // Invalidate the block iterator so that a subsequent call to Prev()
713 1 : // will return the last key in the table.
714 1 : i.data.invalidate()
715 1 : return nil
716 1 : }
717 1 : result := i.loadBlock(+1)
718 1 : if result == loadBlockFailed {
719 0 : return nil
720 0 : }
721 1 : if result == loadBlockIrrelevant {
722 1 : // Enforce the upper bound here since we don't want to bother moving
723 1 : // to the next block if upper bound is already exceeded. Note that
724 1 : // the next block starts with keys >= ikey.UserKey since even
725 1 : // though this is the block separator, the same user key can span
726 1 : // multiple blocks. If upper is exclusive we use >= below, else
727 1 : // we use >.
728 1 : if i.upper != nil {
729 1 : cmp := i.cmp(ikv.K.UserKey, i.upper)
730 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
731 1 : i.exhaustedBounds = +1
732 1 : return nil
733 1 : }
734 : }
735 : // Want to skip to the next block.
736 1 : dontSeekWithinBlock = true
737 : }
738 : }
739 1 : if !dontSeekWithinBlock {
740 1 : if ikv := i.data.SeekGE(key, flags.DisableTrySeekUsingNext()); ikv != nil {
741 1 : if i.blockUpper != nil {
742 1 : cmp := i.cmp(ikv.K.UserKey, i.blockUpper)
743 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
744 1 : i.exhaustedBounds = +1
745 1 : return nil
746 1 : }
747 : }
748 1 : return ikv
749 : }
750 : }
751 1 : return i.skipForward()
752 : }
753 :
754 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
755 : // pebble package. Note that SeekPrefixGE only checks the upper bound. It is up
756 : // to the caller to ensure that key is greater than or equal to the lower bound.
757 : func (i *singleLevelIterator) SeekPrefixGE(
758 : prefix, key []byte, flags base.SeekGEFlags,
759 1 : ) *base.InternalKV {
760 1 : if i.vState != nil {
761 1 : // Callers of SeekPrefixGE aren't aware of virtual sstable bounds, so
762 1 : // we may have to internally restrict the bounds.
763 1 : //
764 1 : // TODO(bananabrick): We can optimize away this check for the level iter
765 1 : // if necessary.
766 1 : if i.cmp(key, i.lower) < 0 {
767 1 : key = i.lower
768 1 : }
769 : }
770 1 : return i.seekPrefixGE(prefix, key, flags, i.useFilter)
771 : }
772 :
773 : func (i *singleLevelIterator) seekPrefixGE(
774 : prefix, key []byte, flags base.SeekGEFlags, checkFilter bool,
775 1 : ) (kv *base.InternalKV) {
776 1 : // NOTE: prefix is only used for bloom filter checking and not later work in
777 1 : // this method. Hence, we can use the existing iterator position if the last
778 1 : // SeekPrefixGE did not fail bloom filter matching.
779 1 :
780 1 : err := i.err
781 1 : i.err = nil // clear cached iteration error
782 1 : if checkFilter && i.reader.tableFilter != nil {
783 1 : if !i.lastBloomFilterMatched {
784 1 : // Iterator is not positioned based on last seek.
785 1 : flags = flags.DisableTrySeekUsingNext()
786 1 : }
787 1 : i.lastBloomFilterMatched = false
788 1 : // Check prefix bloom filter.
789 1 : var mayContain bool
790 1 : mayContain, i.err = i.bloomFilterMayContain(prefix)
791 1 : if i.err != nil || !mayContain {
792 1 : // In the i.err == nil case, this invalidation may not be necessary for
793 1 : // correctness, and may be a place to optimize later by reusing the
794 1 : // already loaded block. It was necessary in earlier versions of the code
795 1 : // since the caller was allowed to call Next when SeekPrefixGE returned
796 1 : // nil. This is no longer allowed.
797 1 : i.data.invalidate()
798 1 : return nil
799 1 : }
800 1 : i.lastBloomFilterMatched = true
801 : }
802 1 : if flags.TrySeekUsingNext() {
803 1 : // The i.exhaustedBounds comparison indicates that the upper bound was
804 1 : // reached. The i.data.isDataInvalidated() indicates that the sstable was
805 1 : // exhausted.
806 1 : if (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) && err == nil {
807 1 : // Already exhausted, so return nil.
808 1 : return nil
809 1 : }
810 1 : if err != nil {
811 0 : // The current iterator position cannot be used.
812 0 : flags = flags.DisableTrySeekUsingNext()
813 0 : }
814 : // INVARIANT: flags.TrySeekUsingNext() => err == nil &&
815 : // i.exhaustedBounds != +1 && !i.data.isDataInvalidated(). That is,
816 : // data-exhausted and bounds-exhausted, as defined earlier, are both
817 : // false. This makes it safe to clear out i.exhaustedBounds and i.err
818 : // before calling into seekGEHelper.
819 : }
820 : // Bloom filter matches, or skipped, so this method will position the
821 : // iterator.
822 1 : i.exhaustedBounds = 0
823 1 : boundsCmp := i.boundsCmp
824 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
825 1 : i.boundsCmp = 0
826 1 : i.positionedUsingLatestBounds = true
827 1 : return i.maybeVerifyKey(i.seekGEHelper(key, boundsCmp, flags))
828 : }
829 :
830 1 : func (i *singleLevelIterator) bloomFilterMayContain(prefix []byte) (bool, error) {
831 1 : // Check prefix bloom filter.
832 1 : prefixToCheck := prefix
833 1 : if i.transforms.SyntheticPrefix.IsSet() {
834 1 : // We have to remove the synthetic prefix.
835 1 : var ok bool
836 1 : prefixToCheck, ok = bytes.CutPrefix(prefix, i.transforms.SyntheticPrefix)
837 1 : if !ok {
838 1 : // This prefix will not be found inside this table.
839 1 : return false, nil
840 1 : }
841 : }
842 :
843 1 : dataH, err := i.reader.readFilter(i.ctx, i.indexFilterRH, i.stats, &i.iterStats)
844 1 : if err != nil {
845 0 : return false, err
846 0 : }
847 1 : defer dataH.Release()
848 1 : return i.reader.tableFilter.mayContain(dataH.Get(), prefixToCheck), nil
849 : }
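The synthetic-prefix strip above uses bytes.CutPrefix, which returns the remainder and whether the prefix was present. A tiny sketch with assumed values:

    rest, ok := bytes.CutPrefix([]byte("syn/key"), []byte("syn/"))
    // rest == []byte("key"), ok == true: probe the filter with "key".
    _, ok = bytes.CutPrefix([]byte("other/key"), []byte("syn/"))
    // ok == false: no key with this prefix can exist in the table, so the
    // filter probe is skipped entirely.
    _ = rest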
850 :
851 : // virtualLast should only be called if i.vState != nil.
852 1 : func (i *singleLevelIterator) virtualLast() *base.InternalKV {
853 1 : if i.vState == nil {
854 0 : panic("pebble: invalid call to virtualLast")
855 : }
856 :
857 1 : if !i.endKeyInclusive {
858 1 : // Trivial case.
859 1 : return i.SeekLT(i.upper, base.SeekLTFlagsNone)
860 1 : }
861 1 : return i.virtualLastSeekLE()
862 : }
863 :
864 : // virtualLastSeekLE is called by virtualLast to do a SeekLE on the upper
865 : // bound of the file/iterator, as part of a virtualLast. Consider
866 : // generalizing this into a SeekLE() if there are other uses of this method
867 : // in the future.
868 1 : func (i *singleLevelIterator) virtualLastSeekLE() *base.InternalKV {
869 1 : // Callers of SeekLE don't know about virtual sstable bounds, so we may
870 1 : // have to internally restrict the bounds.
871 1 : //
872 1 : // TODO(bananabrick): We can optimize this check away for the level iter
873 1 : // if necessary.
874 1 : if !i.endKeyInclusive {
875 0 : panic("unexpected virtualLastSeekLE with exclusive upper bounds")
876 : }
877 1 : key := i.upper
878 1 :
879 1 : i.exhaustedBounds = 0
880 1 : i.err = nil // clear cached iteration error
881 1 : // Seek optimization only applies until iterator is first positioned with a
882 1 : // SeekGE or SeekLT after SetBounds.
883 1 : i.boundsCmp = 0
884 1 : i.positionedUsingLatestBounds = true
885 1 :
886 1 : ikv := i.index.SeekGE(key, base.SeekGEFlagsNone)
887 1 : // We can have multiple internal keys with the same user key as the seek
888 1 : // key. In that case, we want the last (greatest) internal key.
889 1 : //
890 1 : // INVARIANT: One of two cases:
891 1 : // A. ikv == nil. There is no data block with index key >= key. So all keys
892 1 : // in the last data block are < key.
893 1 : // B. ikv.K.UserKey >= key. This data block may have some keys > key.
894 1 : //
895 1 : // Subcases of B:
896 1 : // B1. ikv.K.UserKey == key. This is when loop iteration happens.
897 1 : // Since ikv.K.UserKey >= largest data key in the block, the largest data
898 1 : // key in this block is <= key.
899 1 : // B2. ikv.K.UserKey > key. Loop iteration will not happen.
900 1 : //
901 1 : // NB: We can avoid this Next()ing if we just implement a blockIter.SeekLE().
902 1 : // This might be challenging to do correctly, so impose regular operations
903 1 : // for now.
904 1 : for ikv != nil && bytes.Equal(ikv.K.UserKey, key) {
905 1 : ikv = i.index.Next()
906 1 : }
907 1 : if ikv == nil {
908 1 : // Cases A or B1 where B1 exhausted all blocks. In both cases the last block
909 1 : // has all keys <= key. skipBackward enforces the lower bound.
910 1 : return i.skipBackward()
911 1 : }
912 : // Case B. We are here because we were originally in case B2, or we were in B1
913 : // and we arrived at a block where ikv.K.UserKey > key. Either way, ikv.K.UserKey
914 : // > key. So there could be keys in the block > key. But the block preceding
915 : // this block cannot have any keys > key, otherwise it would have been the
916 : // result of the original index.SeekGE.
917 1 : result := i.loadBlock(-1)
918 1 : if result == loadBlockFailed {
919 0 : return nil
920 0 : }
921 1 : if result == loadBlockIrrelevant {
922 1 : // Want to skip to the previous block.
923 1 : return i.skipBackward()
924 1 : }
925 1 : ikv = i.data.SeekGE(key, base.SeekGEFlagsNone)
926 1 : // Go to the last user key that matches key, and then Prev() on the data
927 1 : // block.
928 1 : for ikv != nil && bytes.Equal(ikv.K.UserKey, key) {
929 1 : ikv = i.data.Next()
930 1 : }
931 1 : ikv = i.data.Prev()
932 1 : if ikv != nil {
933 1 : // Enforce the lower bound here, as we could have gone past it. This happens
934 1 : // if keys between `i.blockLower` and `key` are obsolete, for instance. Even
935 1 : // though i.blockLower (which is either nil or equal to i.lower) is <= key,
936 1 : // all internal keys in the user key interval [i.blockLower, key] could be
937 1 : // obsolete (due to a RANGEDEL which will not be observed here). And
938 1 : // i.data.Prev will skip all these obsolete keys, and could land on a key
939 1 : // below the lower bound, requiring the lower bound check.
940 1 : if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 {
941 1 : i.exhaustedBounds = -1
942 1 : return nil
943 1 : }
944 1 : return ikv
945 : }
946 1 : return i.skipBackward()
947 : }
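The SeekLE recipe above (SeekGE, walk forward past keys equal to the target, then step back once) works on any sorted sequence with duplicates. A minimal sketch on a sorted string slice using sort.SearchStrings:

    func seekLE(sorted []string, key string) int {
        j := sort.SearchStrings(sorted, key) // first index with sorted[j] >= key
        for j < len(sorted) && sorted[j] == key {
            j++ // move past every entry equal to key; we want the last one
        }
        return j - 1 // index of the last entry <= key, or -1 if none
    }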
948 :
949 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
950 : // package. Note that SeekLT only checks the lower bound. It is up to the
951 : // caller to ensure that key is less than or equal to the upper bound.
952 1 : func (i *singleLevelIterator) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
953 1 : if i.vState != nil {
954 1 : // Might have to fix upper bound since virtual sstable bounds are not
955 1 : // known to callers of SeekLT.
956 1 : //
957 1 : // TODO(bananabrick): We can optimize away this check for the level iter
958 1 : // if necessary.
959 1 : cmp := i.cmp(key, i.upper)
960 1 : // key == i.upper is fine. We'll do the right thing and return the
961 1 : // first internal key with user key < key.
962 1 : if cmp > 0 {
963 1 : // Return the last key in the virtual sstable.
964 1 : return i.maybeVerifyKey(i.virtualLast())
965 1 : }
966 : }
967 :
968 1 : i.exhaustedBounds = 0
969 1 : i.err = nil // clear cached iteration error
970 1 : boundsCmp := i.boundsCmp
971 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
972 1 : i.boundsCmp = 0
973 1 :
974 1 : // Seeking operations perform various step-instead-of-seeking optimizations:
975 1 : // eg by considering monotonically increasing bounds (i.boundsCmp).
976 1 :
977 1 : i.positionedUsingLatestBounds = true
978 1 :
979 1 : var dontSeekWithinBlock bool
980 1 : if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() &&
981 1 : boundsCmp < 0 && i.cmp(i.data.getFirstUserKey(), key) < 0 {
982 1 : // Fast-path: The bounds have moved backward, and this SeekLT is
983 1 : // respecting the upper bound (guaranteed by Iterator). We know that
984 1 : // the iterator must already be positioned within or just outside the
985 1 : // previous bounds. Therefore it cannot be positioned at a block (or
986 1 : // the position within that block) that is behind the seek position.
987 1 : // However it can be positioned at a later block. This fast-path to
988 1 : // use Prev() on the block is only applied when we are already at the
989 1 : // block that can satisfy this seek -- this is the motivation for the
990 1 : // i.cmp(i.data.getFirstUserKey(), key) < 0 predicate.
991 1 : i.initBoundsForAlreadyLoadedBlock()
992 1 : ikv, done := i.trySeekLTUsingPrevWithinBlock(key)
993 1 : if done {
994 1 : return ikv
995 1 : }
996 1 : if ikv == nil {
997 1 : // Done with this block.
998 1 : dontSeekWithinBlock = true
999 1 : }
1000 1 : } else {
1001 1 : // Slow-path.
1002 1 : var ikv *base.InternalKV
1003 1 :
1004 1 : // NB: If a bound-limited block property filter is configured, it's
1005 1 : // externally ensured that the filter is disabled (through returning
1006 1 : // Intersects=false irrespective of the block props provided) during
1007 1 : // seeks.
1008 1 : if ikv = i.index.SeekGE(key, base.SeekGEFlagsNone); ikv == nil {
1009 1 : ikv = i.index.Last()
1010 1 : if ikv == nil {
1011 0 : return nil
1012 0 : }
1013 : }
1014 : // INVARIANT: ikv != nil.
1015 1 : result := i.loadBlock(-1)
1016 1 : if result == loadBlockFailed {
1017 0 : return nil
1018 0 : }
1019 1 : if result == loadBlockIrrelevant {
1020 1 : // Enforce the lower bound here since we don't want to bother moving
1021 1 : // to the previous block if lower bound is already exceeded. Note
1022 1 : // that the previous block starts with keys <= ikey.UserKey since
1023 1 : // even though this is the current block's separator, the same
1024 1 : // user key can span multiple blocks.
1025 1 : if i.lower != nil && i.cmp(ikv.K.UserKey, i.lower) < 0 {
1026 1 : i.exhaustedBounds = -1
1027 1 : return nil
1028 1 : }
1029 : // Want to skip to the previous block.
1030 1 : dontSeekWithinBlock = true
1031 : }
1032 : }
1033 1 : if !dontSeekWithinBlock {
1034 1 : if ikv := i.data.SeekLT(key, flags); ikv != nil {
1035 1 : if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 {
1036 1 : i.exhaustedBounds = -1
1037 1 : return nil
1038 1 : }
1039 1 : return ikv
1040 : }
1041 : }
1042 : // The index contains separator keys which may lie between
1043 : // user-keys. Consider the user-keys:
1044 : //
1045 : // complete
1046 : // ---- new block ---
1047 : // complexion
1048 : //
1049 : // If these two keys end one block and start the next, the index key may
1050 : // be chosen as "compleu". The SeekGE in the index block will then point
1051 : // us to the block containing "complexion". If this happens, we want the
1052 : // last key from the previous data block.
1053 1 : return i.maybeVerifyKey(i.skipBackward())
1054 : }
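The "compleu" separator in the comment above falls out of the bytewise shortest-separator rule. A simplified sketch, assuming a < b and ignoring 0xff overflow:

    func shortestSeparator(a, b []byte) []byte {
        n := 0
        for n < len(a) && n < len(b) && a[n] == b[n] {
            n++ // shared prefix, e.g. "comple"
        }
        if n < len(a) && n < len(b) && a[n]+1 < b[n] {
            sep := append([]byte(nil), a[:n+1]...)
            sep[n]++ // 't'+1 = 'u': ("complete", "complexion") -> "compleu"
            return sep
        }
        return a // cannot shorten; fall back to the block's last key
    }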
1055 :
1056 : // First implements internalIterator.First, as documented in the pebble
1057 : // package. Note that First only checks the upper bound. It is up to the caller
1058 : // to ensure that key is greater than or equal to the lower bound (e.g. via a
1059 : // call to SeekGE(lower)).
1060 1 : func (i *singleLevelIterator) First() *base.InternalKV {
1061 1 : // If we have a lower bound, use SeekGE. Note that in general this is not
1062 1 : // supported usage, except when the lower bound is there because the table is
1063 1 : // virtual.
1064 1 : if i.lower != nil {
1065 1 : return i.SeekGE(i.lower, base.SeekGEFlagsNone)
1066 1 : }
1067 :
1068 1 : i.positionedUsingLatestBounds = true
1069 1 :
1070 1 : return i.firstInternal()
1071 : }
1072 :
1073 : // firstInternal is a helper used for absolute positioning in a single-level
1074 : // index file, or for positioning in the second-level index in a two-level
1075 : // index file. For the latter, one cannot make any claims about absolute
1076 : // positioning.
1077 1 : func (i *singleLevelIterator) firstInternal() *base.InternalKV {
1078 1 : i.exhaustedBounds = 0
1079 1 : i.err = nil // clear cached iteration error
1080 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1081 1 : i.boundsCmp = 0
1082 1 :
1083 1 : var kv *base.InternalKV
1084 1 : if kv = i.index.First(); kv == nil {
1085 0 : i.data.invalidate()
1086 0 : return nil
1087 0 : }
1088 1 : result := i.loadBlock(+1)
1089 1 : if result == loadBlockFailed {
1090 0 : return nil
1091 0 : }
1092 1 : if result == loadBlockOK {
1093 1 : if kv := i.data.First(); kv != nil {
1094 1 : if i.blockUpper != nil {
1095 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1096 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1097 1 : i.exhaustedBounds = +1
1098 1 : return nil
1099 1 : }
1100 : }
1101 1 : return kv
1102 : }
1103 : // Else fall through to skipForward.
1104 1 : } else {
1105 1 : // result == loadBlockIrrelevant. Enforce the upper bound here since
1106 1 : // we don't want to bother moving to the next block if upper bound is
1107 1 : // already exceeded. Note that the next block starts with keys >=
1108 1 : // ikey.UserKey since even though this is the block separator, the
1109 1 : // same user key can span multiple blocks. If upper is exclusive we
1110 1 : // use >= below, else we use >.
1111 1 : if i.upper != nil {
1112 1 : cmp := i.cmp(kv.K.UserKey, i.upper)
1113 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1114 1 : i.exhaustedBounds = +1
1115 1 : return nil
1116 1 : }
1117 : }
1118 : // Else fall through to skipForward.
1119 : }
1120 :
1121 1 : return i.skipForward()
1122 : }
1123 :
1124 : // Last implements internalIterator.Last, as documented in the pebble
1125 : // package. Note that Last only checks the lower bound. It is up to the caller
1126 : // to ensure that key is less than the upper bound (e.g. via a call to
1127 : // SeekLT(upper))
1128 1 : func (i *singleLevelIterator) Last() *base.InternalKV {
1129 1 : if i.vState != nil {
1130 1 : return i.maybeVerifyKey(i.virtualLast())
1131 1 : }
1132 :
1133 1 : if i.upper != nil {
1134 0 : panic("singleLevelIterator.Last() used despite upper bound")
1135 : }
1136 1 : i.positionedUsingLatestBounds = true
1137 1 : return i.lastInternal()
1138 : }
1139 :
1140 : // lastInternal is a helper used for absolute positioning in a single-level
1141 : // index file, or for positioning in the second-level index in a two-level
1142 : // index file. For the latter, one cannot make any claims about absolute
1143 : // positioning.
1144 1 : func (i *singleLevelIterator) lastInternal() *base.InternalKV {
1145 1 : i.exhaustedBounds = 0
1146 1 : i.err = nil // clear cached iteration error
1147 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1148 1 : i.boundsCmp = 0
1149 1 :
1150 1 : var ikv *base.InternalKV
1151 1 : if ikv = i.index.Last(); ikv == nil {
1152 0 : i.data.invalidate()
1153 0 : return nil
1154 0 : }
1155 1 : result := i.loadBlock(-1)
1156 1 : if result == loadBlockFailed {
1157 0 : return nil
1158 0 : }
1159 1 : if result == loadBlockOK {
1160 1 : if ikv := i.data.Last(); ikv != nil {
1161 1 : if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 {
1162 1 : i.exhaustedBounds = -1
1163 1 : return nil
1164 1 : }
1165 1 : return ikv
1166 : }
1167 : // Else fall through to skipBackward.
1168 1 : } else {
1169 1 : // result == loadBlockIrrelevant. Enforce the lower bound here since
1170 1 : // we don't want to bother moving to the previous block if lower bound is
1171 1 : // already exceeded. Note that the previous block starts with keys <=
1172 1 : // key.UserKey since even though this is the current block's
1173 1 : // separator, the same user key can span multiple blocks.
1174 1 : if i.lower != nil && i.cmp(ikv.K.UserKey, i.lower) < 0 {
1175 1 : i.exhaustedBounds = -1
1176 1 : return nil
1177 1 : }
1178 : }
1179 :
1180 1 : return i.skipBackward()
1181 : }
1182 :
1183 : // Next implements internalIterator.Next, as documented in the pebble
1184 : // package.
1185 : // Note: compactionIterator.Next mirrors the implementation of Iterator.Next
1186 : // for performance. Keep the two in sync.
1187 1 : func (i *singleLevelIterator) Next() *base.InternalKV {
1188 1 : if i.exhaustedBounds == +1 {
1189 0 : panic("Next called even though exhausted upper bound")
1190 : }
1191 1 : i.exhaustedBounds = 0
1192 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1193 1 : i.boundsCmp = 0
1194 1 :
1195 1 : if i.err != nil {
1196 0 : // TODO(jackson): Can this case be turned into a panic? Once an error is
1197 0 : // encountered, the iterator must be re-seeked.
1198 0 : return nil
1199 0 : }
1200 1 : if kv := i.data.Next(); kv != nil {
1201 1 : if i.blockUpper != nil {
1202 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1203 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1204 1 : i.exhaustedBounds = +1
1205 1 : return nil
1206 1 : }
1207 : }
1208 1 : return kv
1209 : }
1210 1 : return i.skipForward()
1211 : }
1212 :
1213 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1214 1 : func (i *singleLevelIterator) NextPrefix(succKey []byte) *base.InternalKV {
1215 1 : if i.exhaustedBounds == +1 {
1216 0 : panic("NextPrefix called even though exhausted upper bound")
1217 : }
1218 1 : i.exhaustedBounds = 0
1219 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1220 1 : i.boundsCmp = 0
1221 1 : if i.err != nil {
1222 0 : // TODO(jackson): Can this case be turned into a panic? Once an error is
1223 0 : // encountered, the iterator must be re-seeked.
1224 0 : return nil
1225 0 : }
1226 1 : if kv := i.data.NextPrefix(succKey); kv != nil {
1227 1 : if i.blockUpper != nil {
1228 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1229 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1230 1 : i.exhaustedBounds = +1
1231 1 : return nil
1232 1 : }
1233 : }
1234 1 : return kv
1235 : }
1236 : // Did not find prefix in the existing data block. This is the slow-path
1237 : // where we effectively seek the iterator.
1238 1 : var ikv *base.InternalKV
1239 1 : // The key is likely to be in the next data block, so try one step.
1240 1 : if ikv = i.index.Next(); ikv == nil {
1241 1 : // The target key is greater than any key in the index block.
1242 1 : // Invalidate the block iterator so that a subsequent call to Prev()
1243 1 : // will return the last key in the table.
1244 1 : i.data.invalidate()
1245 1 : return nil
1246 1 : }
1247 1 : if i.cmp(succKey, ikv.K.UserKey) > 0 {
1248 1 : // Not in the next data block, so seek the index.
1249 1 : if ikv = i.index.SeekGE(succKey, base.SeekGEFlagsNone); ikv == nil {
1250 1 : // The target key is greater than any key in the index block.
1251 1 : // Invalidate the block iterator so that a subsequent call to Prev()
1252 1 : // will return the last key in the table.
1253 1 : i.data.invalidate()
1254 1 : return nil
1255 1 : }
1256 : }
1257 1 : result := i.loadBlock(+1)
1258 1 : if result == loadBlockFailed {
1259 0 : return nil
1260 0 : }
1261 1 : if result == loadBlockIrrelevant {
1262 1 : // Enforce the upper bound here since we don't want to bother moving
1263 1 : // to the next block if upper bound is already exceeded. Note that
1264 1 : // the next block starts with keys >= ikey.UserKey since even
1265 1 : // though this is the block separator, the same user key can span
1266 1 : // multiple blocks. If upper is exclusive we use >= below, else we use
1267 1 : // >.
1268 1 : if i.upper != nil {
1269 1 : cmp := i.cmp(ikv.K.UserKey, i.upper)
1270 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1271 1 : i.exhaustedBounds = +1
1272 1 : return nil
1273 1 : }
1274 : }
1275 1 : } else if kv := i.data.SeekGE(succKey, base.SeekGEFlagsNone); kv != nil {
1276 1 : if i.blockUpper != nil {
1277 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1278 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1279 1 : i.exhaustedBounds = +1
1280 1 : return nil
1281 1 : }
1282 : }
1283 1 : return i.maybeVerifyKey(kv)
1284 : }
1285 :
1286 1 : return i.skipForward()
1287 : }
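succKey is the caller-provided successor of the current prefix: the smallest key greater than every key carrying that prefix. Pebble derives it from the Comparer; under plain bytewise ordering it can be sketched as:

    func prefixSuccessor(prefix []byte) []byte {
        s := append([]byte(nil), prefix...)
        for j := len(s) - 1; j >= 0; j-- {
            if s[j] != 0xff {
                s[j]++
                return s[:j+1] // e.g. "abc" -> "abd"
            }
        }
        return nil // prefix is all 0xff bytes: no successor exists
    }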
1288 :
1289 : // Prev implements internalIterator.Prev, as documented in the pebble
1290 : // package.
1291 1 : func (i *singleLevelIterator) Prev() *base.InternalKV {
1292 1 : if i.exhaustedBounds == -1 {
1293 0 : panic("Prev called even though exhausted lower bound")
1294 : }
1295 1 : i.exhaustedBounds = 0
1296 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1297 1 : i.boundsCmp = 0
1298 1 :
1299 1 : if i.err != nil {
1300 0 : return nil
1301 0 : }
1302 1 : if kv := i.data.Prev(); kv != nil {
1303 1 : if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 {
1304 1 : i.exhaustedBounds = -1
1305 1 : return nil
1306 1 : }
1307 1 : return kv
1308 : }
1309 1 : return i.skipBackward()
1310 : }
1311 :
1312 1 : func (i *singleLevelIterator) skipForward() *base.InternalKV {
1313 1 : for {
1314 1 : indexKey := i.index.Next()
1315 1 : if indexKey == nil {
1316 1 : i.data.invalidate()
1317 1 : break
1318 : }
1319 1 : result := i.loadBlock(+1)
1320 1 : if result != loadBlockOK {
1321 1 : if i.err != nil {
1322 0 : break
1323 : }
1324 1 : if result == loadBlockFailed {
1325 0 : // We checked that i.index was at a valid entry, so
1326 0 : // loadBlockFailed could not have happened due to i.index
1327 0 : // being exhausted, and must be due to an error.
1328 0 : panic("loadBlock should not have failed with no error")
1329 : }
1330 : // result == loadBlockIrrelevant. Enforce the upper bound here
1331 : // since we don't want to bother moving to the next block if upper
1332 : // bound is already exceeded. Note that the next block starts with
1333 : // keys >= key.UserKey since even though this is the block
1334 : // separator, the same user key can span multiple blocks. If upper
1335 : // is exclusive we use >= below, else we use >.
1336 1 : if i.upper != nil {
1337 1 : cmp := i.cmp(indexKey.K.UserKey, i.upper)
1338 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1339 1 : i.exhaustedBounds = +1
1340 1 : return nil
1341 1 : }
1342 : }
1343 1 : continue
1344 : }
1345 1 : var kv *base.InternalKV
1346 1 : // It is possible that skipBackward went too far and the virtual table lower
1347 1 : // bound is after the first key in the block we are about to load, in which
1348 1 : // case we must use SeekGE.
1349 1 : //
1350 1 : // An example of how this can happen:
1351 1 : //
1352 1 : // Data block 1 - contains keys a@1, c@1
1353 1 : // Data block 2 - contains keys e@1, g@1
1354 1 : // Data block 3 - contains keys i@2, k@2
1355 1 : //
1356 1 : // The virtual table lower bound is f. We have a range key masking filter
1357 1 : // that filters keys with @1 suffix. We are positioned inside block 3 then
1358 1 : // we Prev(). Block 2 is entirely filtered out, which makes us move to
1359 1 : // block 1. Now the range key masking filter gets an update (via
1360 1 : // SpanChanged) and it no longer filters out any keys. At this point if a
1361 1 : // Next happens, we will load block 2 but it would not be legal to return
1362 1 : // "e@1" which is outside the virtual bounds.
1363 1 : //
1364 1 : // The core of the problem is that skipBackward doesn't know it can stop
1365 1 : // at block 2, because it doesn't know what keys are at the start of that
1366 1 : // block. This is why we don't have this problem in the opposite
1367 1 : // direction: skipForward will never go beyond the last relevant block
1368 1 : // because it looks at the separator key which is an upper bound for the
1369 1 : // block.
1370 1 : //
1371 1 : // Note that this is only a problem with virtual tables; we make no
1372 1 : // guarantees wrt an iterator lower bound when we iterate forward. But we
1373 1 : // must never return keys that are not inside the virtual table.
1374 1 : if i.vState != nil && i.blockLower != nil {
1375 1 : kv = i.data.SeekGE(i.lower, base.SeekGEFlagsNone)
1376 1 : } else {
1377 1 : kv = i.data.First()
1378 1 : }
1379 1 : if kv != nil {
1380 1 : if i.blockUpper != nil {
1381 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1382 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1383 1 : i.exhaustedBounds = +1
1384 1 : return nil
1385 1 : }
1386 : }
1387 1 : return i.maybeVerifyKey(kv)
1388 : }
1389 : }
1390 1 : return nil
1391 : }
1392 :
1393 1 : func (i *singleLevelIterator) skipBackward() *base.InternalKV {
1394 1 : for {
1395 1 : indexKey := i.index.Prev()
1396 1 : if indexKey == nil {
1397 1 : i.data.invalidate()
1398 1 : break
1399 : }
1400 1 : result := i.loadBlock(-1)
1401 1 : if result != loadBlockOK {
1402 1 : if i.err != nil {
1403 0 : break
1404 : }
1405 1 : if result == loadBlockFailed {
1406 0 : // We checked that i.index was at a valid entry, so
1407 0 : // loadBlockFailed could not have happened due to i.index
1408 0 : // being exhausted, and must be due to an error.
1409 0 : panic("loadBlock should not have failed with no error")
1410 : }
1411 : // result == loadBlockIrrelevant. Enforce the lower bound here
1412 : // since we don't want to bother moving to the previous block if lower
1413 : // bound is already exceeded. Note that the previous block starts with
1414 : // keys <= key.UserKey since even though this is the current block's
1415 : // separator, the same user key can span multiple blocks.
1416 1 : if i.lower != nil && i.cmp(indexKey.K.UserKey, i.lower) < 0 {
1417 1 : i.exhaustedBounds = -1
1418 1 : return nil
1419 1 : }
1420 1 : continue
1421 : }
1422 1 : kv := i.data.Last()
1423 1 : if kv == nil {
1424 1 : return nil
1425 1 : }
1426 1 : if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 {
1427 1 : i.exhaustedBounds = -1
1428 1 : return nil
1429 1 : }
1430 1 : return i.maybeVerifyKey(kv)
1431 : }
1432 1 : return nil
1433 : }
1434 :
1435 : // Error implements internalIterator.Error, as documented in the pebble
1436 : // package.
1437 1 : func (i *singleLevelIterator) Error() error {
1438 1 : if err := i.data.Error(); err != nil {
1439 0 : return err
1440 0 : }
1441 1 : return i.err
1442 : }
1443 :
1444 : // SetCloseHook sets a function that will be called when the iterator is
1445 : // closed.
1446 1 : func (i *singleLevelIterator) SetCloseHook(fn func(i Iterator) error) {
1447 1 : i.closeHook = fn
1448 1 : }
1449 :
1450 1 : func firstError(err0, err1 error) error {
1451 1 : if err0 != nil {
1452 0 : return err0
1453 0 : }
1454 1 : return err1
1455 : }
1456 :
1457 : // Close implements internalIterator.Close, as documented in the pebble
1458 : // package.
1459 1 : func (i *singleLevelIterator) Close() error {
1460 1 : if invariants.Enabled && i.inPool {
1461 0 : panic("Close called on interator in pool")
1462 : }
1463 1 : i.iterStats.close()
1464 1 : var err error
1465 1 : if i.closeHook != nil {
1466 1 : err = firstError(err, i.closeHook(i))
1467 1 : }
1468 1 : err = firstError(err, i.data.Close())
1469 1 : err = firstError(err, i.index.Close())
1470 1 : if i.indexFilterRH != nil {
1471 1 : err = firstError(err, i.indexFilterRH.Close())
1472 1 : i.indexFilterRH = nil
1473 1 : }
1474 1 : if i.dataRH != nil {
1475 1 : err = firstError(err, i.dataRH.Close())
1476 1 : i.dataRH = nil
1477 1 : }
1478 1 : err = firstError(err, i.err)
1479 1 : if i.bpfs != nil {
1480 1 : releaseBlockPropertiesFilterer(i.bpfs)
1481 1 : }
1482 1 : if i.vbReader != nil {
1483 1 : i.vbReader.close()
1484 1 : }
1485 1 : if i.vbRH != nil {
1486 1 : err = firstError(err, i.vbRH.Close())
1487 1 : i.vbRH = nil
1488 1 : }
1489 1 : *i = i.resetForReuse()
1490 1 : singleLevelIterPool.Put(i)
1491 1 : return err
1492 : }
1493 :
1494 0 : func (i *singleLevelIterator) String() string {
1495 0 : if i.vState != nil {
1496 0 : return i.vState.fileNum.String()
1497 0 : }
1498 0 : return i.reader.fileNum.String()
1499 : }
|