Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "fmt"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/pebble/internal/base"
14 : "github.com/cockroachdb/pebble/internal/invariants"
15 : "github.com/cockroachdb/pebble/internal/treeprinter"
16 : "github.com/cockroachdb/pebble/objstorage"
17 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider"
18 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
19 : "github.com/cockroachdb/pebble/sstable/block"
20 : "github.com/cockroachdb/pebble/sstable/rowblk"
21 : )
22 :
23 : // singleLevelIterator iterates over an entire table of data. To seek for a given
24 : // key, it first looks in the index for the block that contains that key, and then
25 : // looks inside that block.
26 : type singleLevelIterator struct {
27 : ctx context.Context
28 : cmp Compare
29 : // Global lower/upper bound for the iterator.
30 : lower []byte
31 : upper []byte
32 : bpfs *BlockPropertiesFilterer
33 : // Per-block lower/upper bound. Nil if the bound does not apply to the block
34 : // because we determined the block lies completely within the bound.
35 : blockLower []byte
36 : blockUpper []byte
37 : reader *Reader
38 : // vState will be set iff the iterator is constructed for virtual sstable
39 : // iteration.
40 : vState *virtualState
41 : // endKeyInclusive is set to force the iterator to treat the upper field as
42 : // inclusive rather than exclusive while iterating.
43 : endKeyInclusive bool
44 : index rowblk.Iter
45 : indexFilterRH objstorage.ReadHandle
46 : indexFilterRHPrealloc objstorageprovider.PreallocatedReadHandle
47 : data rowblk.Iter
48 : dataRH objstorage.ReadHandle
49 : dataRHPrealloc objstorageprovider.PreallocatedReadHandle
50 : // dataBH refers to the last data block that the iterator considered
51 : // loading. It may not actually have loaded the block, due to an error or
52 : // because it was considered irrelevant.
53 : dataBH block.Handle
54 : vbReader *valueBlockReader
55 : // vbRH is the read handle for value blocks, which are in a different
56 : // part of the sstable than data blocks.
57 : vbRH objstorage.ReadHandle
58 : vbRHPrealloc objstorageprovider.PreallocatedReadHandle
59 : err error
60 : closeHook func(i Iterator) error
61 : // stats and iterStats are slightly different. stats is a shared struct
62 : // supplied from the outside, and represents stats for the whole iterator
63 : // tree and can be reset from the outside (e.g. when the pebble.Iterator is
64 : // being reused). It is currently only provided when the iterator tree is
65 : // rooted at pebble.Iterator. iterStats is this sstable iterator's private
66 : // stats that are reported to a CategoryStatsCollector when this iterator is
67 : // closed. More paths are instrumented with iterStats, since the
68 : // CategoryStatsCollector it needs is provided by the
69 : // tableCacheContainer (which is more universally used).
70 : stats *base.InternalIteratorStats
71 : iterStats iterStatsAccumulator
72 : bufferPool *block.BufferPool
73 :
74 : // boundsCmp and positionedUsingLatestBounds are for optimizing iteration
75 : // that uses multiple adjacent bounds. The seek after setting a new bound
76 : // can use the fact that the iterator is either within the previous bounds
77 : // or exactly one key before or after the bounds. If the new bounds are
78 : // after/before the previous bounds, and we are already positioned at a
79 : // block that is relevant for the new bounds, we can try to first position
80 : // using Next/Prev (repeatedly) instead of doing a more expensive seek.
81 : //
82 : // When there are wide files at higher levels that match the bounds
83 : // but don't have any data for the bounds, we will already be
84 : // positioned at the key beyond the bounds and won't need to do much
85 : // work -- given that most data is in L6, such files are likely to
86 : // dominate the performance of the mergingIter, and may be the main
87 : // benefit of this performance optimization (of course it also helps
88 : // when the file that has the data has successive seeks that stay in
89 : // the same block).
90 : //
91 : // Specifically, boundsCmp captures the relationship between the previous
92 : // and current bounds, if the iterator had been positioned after setting
93 : // the previous bounds. If it was not positioned, i.e., Seek/First/Last
94 : // were not called, we don't know where it is positioned and cannot
95 : // optimize.
96 : //
97 : // Example: Bounds moving forward, and iterator exhausted in forward direction.
98 : // bounds = [f, h), ^ shows block iterator position
99 : // file contents [ a b c d e f g h i j k ]
100 : //                               ^
101 : // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is
102 : // set to +1. SeekGE(j) can use next (the optimization also requires that j
103 : // is within the block, but that is not for correctness; it only limits the
104 : // optimization to when it will actually be an optimization).
105 : //
106 : // Example: Bounds moving forward.
107 : // bounds = [f, h), ^ shows block iterator position
108 : // file contents [ a b c d e f g h i j k ]
109 : //                           ^
110 : // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is
111 : // set to +1. SeekGE(j) can use next.
112 : //
113 : // Example: Bounds moving forward, but iterator not positioned using previous
114 : // bounds.
115 : // bounds = [f, h), ^ shows block iterator position
116 : // file contents [ a b c d e f g h i j k ]
117 : //                                     ^
118 : // new bounds = [i, j). Iterator is at j since it was never positioned using
119 : // [f, h). So positionedUsingLatestBounds=false, and boundsCmp is set to 0.
120 : // SeekGE(i) will not use next.
121 : //
122 : // Example: Bounds moving forward and sparse file.
123 : // bounds = [f, h), ^ shows block iterator position
124 : // file contents [ a z ]
125 : //                   ^
126 : // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is
127 : // set to +1. SeekGE(j) notices that the iterator is already past j and does
128 : // not need to do anything.
129 : //
130 : // Similar examples can be constructed for backward iteration.
131 : //
132 : // This notion of exactly one key before or after the bounds is not quite
133 : // true when block properties are used to ignore blocks. In that case we
134 : // can't stop precisely at the first block that is past the bounds since
135 : // we are using the index entries to enforce the bounds.
136 : //
137 : // e.g., 3 blocks with keys [b, c], [f, g], [i, j, k] and index entries d,
138 : // h, l. Let the lower bound be k, and suppose we are reverse iterating. If
139 : // the block [i, j, k] is ignored due to the block interval annotations, we
140 : // do need to move the index to block [f, g], since the index entry for the
141 : // [i, j, k] block is l, which is not less than the lower bound of k. So we
142 : // have passed the entries i, j.
143 : //
144 : // This behavior is harmless since the block property filters are fixed
145 : // for the lifetime of the iterator so i, j are irrelevant. In addition,
146 : // the current code will not load the [f, g] block, so the seek
147 : // optimization that attempts to use Next/Prev does not apply anyway.
148 : boundsCmp int
149 : positionedUsingLatestBounds bool
150 :
151 : // exhaustedBounds represents whether the iterator is exhausted for
152 : // iteration by reaching the upper or lower bound: +1 when the upper
153 : // bound has been exhausted, -1 when the lower bound has been exhausted,
154 : // and 0 when neither. exhaustedBounds is also used for the TrySeekUsingNext
155 : // optimization in twoLevelIterator and singleLevelIterator. Care should be
156 : // taken in setting this in twoLevelIterator before calling into
157 : // singleLevelIterator, given that these two iterators share this field.
158 : exhaustedBounds int8
159 :
160 : // filterBlockSizeLimit controls whether the bloom filter block in this
161 : // sstable, if present, should be used for prefix seeks or not (depending on
162 : // its size). In some cases it is beneficial to skip a filter block even if it
163 : // exists (e.g., if the probability of a match is high).
164 : filterBlockSizeLimit FilterBlockSizeLimit
165 : lastBloomFilterMatched bool
166 :
167 : transforms IterTransforms
168 :
169 : // inPool is set to true before putting the iterator in the reusable pool;
170 : // used to detect double-close.
171 : inPool bool
172 : }
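// The following is an illustrative sketch (not part of this file) of how a
// caller's monotonically moving bounds enable the boundsCmp fast path
// described above. The iterator value and keys are hypothetical; in pebble
// these calls are issued by wrappers such as the level iterator:
//
//	var it base.InternalIterator // e.g. a *singleLevelIterator
//	it.SetBounds([]byte("f"), []byte("h"))
//	_ = it.SeekGE([]byte("f"), base.SeekGEFlagsNone) // positions the iterator
//	// The new lower bound is at or after the old upper bound, so boundsCmp
//	// is set to +1 and the next SeekGE may step forward with Next()
//	// internally instead of re-seeking the index.
//	it.SetBounds([]byte("j"), []byte("k"))
//	_ = it.SeekGE([]byte("j"), base.SeekGEFlagsNone)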
173 :
174 : // singleLevelIterator implements the base.InternalIterator interface.
175 : var _ base.InternalIterator = (*singleLevelIterator)(nil)
176 :
177 : // newSingleLevelIterator reads the index block and creates and initializes a
178 : // singleLevelIterator.
179 : //
180 : // Note that lower, upper are iterator bounds and are separate from virtual
181 : // sstable bounds. If the virtualState passed in is not nil, then virtual
182 : // sstable bounds will be enforced.
183 : func newSingleLevelIterator(
184 : ctx context.Context,
185 : r *Reader,
186 : v *virtualState,
187 : transforms IterTransforms,
188 : lower, upper []byte,
189 : filterer *BlockPropertiesFilterer,
190 : filterBlockSizeLimit FilterBlockSizeLimit,
191 : stats *base.InternalIteratorStats,
192 : categoryAndQoS CategoryAndQoS,
193 : statsCollector *CategoryStatsCollector,
194 : rp ReaderProvider,
195 : bufferPool *block.BufferPool,
196 1 : ) (*singleLevelIterator, error) {
197 1 : if r.err != nil {
198 0 : return nil, r.err
199 0 : }
200 1 : i := singleLevelIterPool.Get().(*singleLevelIterator)
201 1 : i.init(
202 1 : ctx, r, v, transforms, lower, upper, filterer, filterBlockSizeLimit,
203 1 : stats, categoryAndQoS, statsCollector, rp, bufferPool,
204 1 : )
205 1 : indexH, err := r.readIndex(ctx, i.indexFilterRH, stats, &i.iterStats)
206 1 : if err == nil {
207 1 : err = i.index.InitHandle(i.cmp, r.Split, indexH, transforms)
208 1 : }
209 1 : if err != nil {
210 0 : _ = i.Close()
211 0 : return nil, err
212 0 : }
213 1 : return i, nil
214 : }
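// A loose usage sketch for the constructor above. The argument values are
// placeholders and NeverUseFilterBlock is assumed to be the zero filter size
// limit; in practice this constructor is reached through the table cache
// rather than called directly:
//
//	it, err := newSingleLevelIterator(
//		context.Background(), r, nil /* vState */, IterTransforms{},
//		nil /* lower */, nil /* upper */, nil /* filterer */,
//		NeverUseFilterBlock, nil /* stats */, CategoryAndQoS{},
//		nil /* statsCollector */, rp, nil /* bufferPool */,
//	)
//	if err != nil {
//		return err
//	}
//	defer func() { _ = it.Close() }()
//	for kv := it.First(); kv != nil; kv = it.Next() {
//		_ = kv // use kv.K / kv.V
//	}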
215 :
216 : // init initializes the singleLevelIterator struct. It does not read the index.
217 : func (i *singleLevelIterator) init(
218 : ctx context.Context,
219 : r *Reader,
220 : v *virtualState,
221 : transforms IterTransforms,
222 : lower, upper []byte,
223 : filterer *BlockPropertiesFilterer,
224 : filterBlockSizeLimit FilterBlockSizeLimit,
225 : stats *base.InternalIteratorStats,
226 : categoryAndQoS CategoryAndQoS,
227 : statsCollector *CategoryStatsCollector,
228 : rp ReaderProvider,
229 : bufferPool *block.BufferPool,
230 1 : ) {
231 1 : i.inPool = false
232 1 : i.ctx = ctx
233 1 : i.lower = lower
234 1 : i.upper = upper
235 1 : i.bpfs = filterer
236 1 : i.filterBlockSizeLimit = filterBlockSizeLimit
237 1 : i.reader = r
238 1 : i.cmp = r.Compare
239 1 : i.stats = stats
240 1 : i.transforms = transforms
241 1 : i.bufferPool = bufferPool
242 1 : if v != nil {
243 1 : i.vState = v
244 1 : i.endKeyInclusive, i.lower, i.upper = v.constrainBounds(lower, upper, false /* endInclusive */)
245 1 : }
246 :
247 1 : i.iterStats.init(categoryAndQoS, statsCollector)
248 1 :
249 1 : i.indexFilterRH = objstorageprovider.UsePreallocatedReadHandle(
250 1 : r.readable, objstorage.ReadBeforeForIndexAndFilter, &i.indexFilterRHPrealloc)
251 1 : i.dataRH = objstorageprovider.UsePreallocatedReadHandle(
252 1 : r.readable, objstorage.NoReadBefore, &i.dataRHPrealloc)
253 1 :
254 1 : if r.tableFormat >= TableFormatPebblev3 {
255 1 : if r.Properties.NumValueBlocks > 0 {
256 1 : // NB: we cannot avoid this ~248 byte allocation, since valueBlockReader
257 1 : // can outlive the singleLevelIterator due to being embedded in a
258 1 : // LazyValue. This consumes ~2% in microbenchmark CPU profiles, but we
259 1 : // should only optimize this if it shows up as significant in end-to-end
260 1 : // CockroachDB benchmarks, since it is tricky to do so. One possibility
261 1 : // is that if many sstable iterators only get positioned at latest
262 1 : // versions of keys, and therefore never expose a LazyValue that is
263 1 : // separated to their callers, they can put this valueBlockReader into a
264 1 : // sync.Pool.
265 1 : i.vbReader = &valueBlockReader{
266 1 : bpOpen: i,
267 1 : rp: rp,
268 1 : vbih: r.valueBIH,
269 1 : stats: stats,
270 1 : }
271 1 : i.data.SetGetLazyValue(i.vbReader.getLazyValueForPrefixAndValueHandle)
272 1 : i.vbRH = objstorageprovider.UsePreallocatedReadHandle(r.readable, objstorage.NoReadBefore, &i.vbRHPrealloc)
273 1 : }
274 1 : i.data.SetHasValuePrefix(true)
275 : }
276 : }
277 :
278 : // maybeVerifyKey checks, under invariants, that keys returned from the iterator are within the virtual sstable bounds.
279 1 : func (i *singleLevelIterator) maybeVerifyKey(kv *base.InternalKV) *base.InternalKV {
280 1 : if invariants.Enabled && kv != nil && i.vState != nil {
281 1 : key := kv.K.UserKey
282 1 : v := i.vState
283 1 : lc := i.cmp(key, v.lower.UserKey)
284 1 : uc := i.cmp(key, v.upper.UserKey)
285 1 : if lc < 0 || uc > 0 || (uc == 0 && v.upper.IsExclusiveSentinel()) {
286 0 : panic(fmt.Sprintf("key %q out of singleLevelIterator virtual bounds %s %s", key, v.lower.UserKey, v.upper.UserKey))
287 : }
288 : }
289 1 : return kv
290 : }
291 :
292 : // setupForCompaction sets up the singleLevelIterator for use with compactionIter.
293 : // Currently, it skips readahead ramp-up. It should be called after init is called.
294 1 : func (i *singleLevelIterator) setupForCompaction() {
295 1 : i.dataRH.SetupForCompaction()
296 1 : if i.vbRH != nil {
297 1 : i.vbRH.SetupForCompaction()
298 1 : }
299 : }
300 :
301 1 : func (i *singleLevelIterator) resetForReuse() singleLevelIterator {
302 1 : return singleLevelIterator{
303 1 : index: i.index.ResetForReuse(),
304 1 : data: i.data.ResetForReuse(),
305 1 : inPool: true,
306 1 : }
307 1 : }
308 :
309 1 : func (i *singleLevelIterator) initBounds() {
310 1 : // Trim the iteration bounds for the current block. We don't have to check
311 1 : // the bounds on each iteration if the block is entirely contained within the
312 1 : // iteration bounds.
313 1 : i.blockLower = i.lower
314 1 : if i.blockLower != nil {
315 1 : kv := i.data.First()
316 1 : // TODO(radu): this should be <= 0
317 1 : if kv != nil && i.cmp(i.blockLower, kv.K.UserKey) < 0 {
318 1 : // The lower-bound is less than the first key in the block. No need
319 1 : // to check the lower-bound again for this block.
320 1 : i.blockLower = nil
321 1 : }
322 : }
323 1 : i.blockUpper = i.upper
324 1 : // TODO(radu): this should be >= 0 if blockUpper is inclusive.
325 1 : if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 {
326 1 : // The upper-bound is greater than the index key which itself is greater
327 1 : // than or equal to every key in the block. No need to check the
328 1 : // upper-bound again for this block. Even if blockUpper is inclusive
329 1 : // because of upper being inclusive, we can still safely set blockUpper
330 1 : // to nil here.
331 1 : i.blockUpper = nil
332 1 : }
333 : }
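// The helper below is a minimal standalone sketch of the bound-trimming logic
// above, written against plain byte-slice keys. It is illustrative only:
// trimBlockBounds, blockFirst, and indexSep are hypothetical stand-ins for
// the iterator state (the user key of i.data.First() and i.index.Key().UserKey).
func trimBlockBounds(
	cmp Compare, lower, upper, blockFirst, indexSep []byte,
) (blockLower, blockUpper []byte) {
	blockLower = lower
	if blockLower != nil && blockFirst != nil && cmp(blockLower, blockFirst) < 0 {
		// The lower bound precedes the block's first key, so no per-key
		// lower-bound checks are needed within this block.
		blockLower = nil
	}
	blockUpper = upper
	if blockUpper != nil && cmp(blockUpper, indexSep) > 0 {
		// The index separator is an inclusive upper bound on the block's
		// keys, so no key in the block can reach the upper bound.
		blockUpper = nil
	}
	return blockLower, blockUpper
}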
334 :
335 1 : func (i *singleLevelIterator) initBoundsForAlreadyLoadedBlock() {
336 1 : // TODO(radu): determine automatically if we need to call First or not and
337 1 : // unify this function with initBounds().
338 1 : if i.data.FirstUserKey() == nil {
339 0 : panic("initBoundsForAlreadyLoadedBlock must not be called on empty or corrupted block")
340 : }
341 1 : i.blockLower = i.lower
342 1 : if i.blockLower != nil {
343 1 : firstUserKey := i.data.FirstUserKey()
344 1 : // TODO(radu): this should be <= 0
345 1 : if firstUserKey != nil && i.cmp(i.blockLower, firstUserKey) < 0 {
346 1 : // The lower-bound is less than the first key in the block. No need
347 1 : // to check the lower-bound again for this block.
348 1 : i.blockLower = nil
349 1 : }
350 : }
351 1 : i.blockUpper = i.upper
352 1 : // TODO(radu): this should be >= 0 if blockUpper is inclusive.
353 1 : if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 {
354 1 : // The upper-bound is greater than the index key which itself is greater
355 1 : // than or equal to every key in the block. No need to check the
356 1 : // upper-bound again for this block.
357 1 : i.blockUpper = nil
358 1 : }
359 : }
360 :
361 : // Deterministic disabling (in testing mode) of the bounds-based optimization
362 : // that avoids seeking. Uses the iterator pointer, since we want diversity in
363 : // iterator behavior for the same SetBounds call. Used for tests.
364 1 : func testingDisableBoundsOpt(bound []byte, ptr uintptr) bool {
365 1 : if !invariants.Enabled || ensureBoundsOptDeterminism {
366 0 : return false
367 0 : }
368 : // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
369 1 : simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
370 1 : return bound[len(bound)-1]&byte(1) == 0 && simpleHash == 0
371 : }
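// To make the sampling above concrete: 11400714819323198485 is
// floor(2^64 / phi), the 64-bit Fibonacci hashing multiplier. The
// multiplication scrambles the pointer bits and ">> 63" keeps only the top
// bit, so simpleHash is 0 or 1 with roughly equal frequency across iterator
// allocations. Combined with the parity of the bound's last byte, the
// optimization ends up disabled for roughly a quarter of (iterator, bound)
// pairs when invariants are enabled.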
372 :
373 : // ensureBoundsOptDeterminism provides a facility for disabling the random
374 : // bounds-optimization suppression performed by testingDisableBoundsOpt, for
375 : // tests that require deterministic iterator behavior. Some unit tests examine
376 : // internal iterator state and require this behavior to be deterministic.
377 : var ensureBoundsOptDeterminism bool
378 :
379 : // SetBoundsWithSyntheticPrefix indicates whether this iterator requires keys
380 : // passed to its SetBounds() method by a prefix rewriting wrapper to be *not*
381 : // rewritten to be in terms of this iterator's content, but instead be passed
382 : // as-is, i.e. with the synthetic prefix still on them.
383 : //
384 : // This allows an optimization when this iterator is passing these bounds on to
385 : // a vState to additionally constrain them. In said vState, passed bounds are
386 : // combined with the vState bounds which are in terms of the rewritten prefix.
387 : // If the caller rewrote bounds to be in terms of content prefix and SetBounds
388 : // passed those to vState, the vState would need to *un*rewrite them back to the
389 : // synthetic prefix in order to combine them with the vState bounds. Thus, if
390 : // this iterator knows bounds will be passed to vState, it can signal that
391 : // they should be passed without being rewritten, to skip converting to and fro.
392 0 : func (i singleLevelIterator) SetBoundsWithSyntheticPrefix() bool {
393 0 : return i.vState != nil
394 0 : }
395 :
396 : // SetBounds implements internalIterator.SetBounds, as documented in the pebble
397 : // package. Note that the upper field is exclusive.
398 1 : func (i *singleLevelIterator) SetBounds(lower, upper []byte) {
399 1 : i.boundsCmp = 0
400 1 : if i.vState != nil {
401 1 : // If the reader is constructed for a virtual sstable, then we must
402 1 : // constrain the bounds of the reader. For physical sstables, the bounds
403 1 : // can be wider than the actual sstable's bounds because we won't
404 1 : // accidentally expose additional keys as there are no additional keys.
405 1 : i.endKeyInclusive, lower, upper = i.vState.constrainBounds(
406 1 : lower, upper, false,
407 1 : )
408 1 : } else {
409 1 : // TODO(bananabrick): Figure out the logic here to enable the boundsCmp
410 1 : // optimization for virtual sstables.
411 1 : if i.positionedUsingLatestBounds {
412 1 : if i.upper != nil && lower != nil && i.cmp(i.upper, lower) <= 0 {
413 1 : i.boundsCmp = +1
414 1 : if testingDisableBoundsOpt(lower, uintptr(unsafe.Pointer(i))) {
415 1 : i.boundsCmp = 0
416 1 : }
417 1 : } else if i.lower != nil && upper != nil && i.cmp(upper, i.lower) <= 0 {
418 1 : i.boundsCmp = -1
419 1 : if testingDisableBoundsOpt(upper, uintptr(unsafe.Pointer(i))) {
420 1 : i.boundsCmp = 0
421 1 : }
422 : }
423 : }
424 : }
425 :
426 1 : i.positionedUsingLatestBounds = false
427 1 : i.lower = lower
428 1 : i.upper = upper
429 1 : i.blockLower = nil
430 1 : i.blockUpper = nil
431 : }
432 :
433 0 : func (i *singleLevelIterator) SetContext(ctx context.Context) {
434 0 : i.ctx = ctx
435 0 : }
436 :
437 : // loadBlock loads the block at the current index position and leaves i.data
438 : // unpositioned. If unsuccessful, it sets i.err to any error encountered, which
439 : // may be nil if we have simply exhausted the entire table.
440 1 : func (i *singleLevelIterator) loadBlock(dir int8) loadBlockResult {
441 1 : if !i.index.Valid() {
442 0 : // Ensure the data block iterator is invalidated even if loading of the block
443 0 : // fails.
444 0 : i.data.Invalidate()
445 0 : return loadBlockFailed
446 0 : }
447 : // Load the next block.
448 1 : v := i.index.Value()
449 1 : bhp, err := decodeBlockHandleWithProperties(v.InPlaceValue())
450 1 : if i.dataBH == bhp.Handle && i.data.Valid() {
451 1 : // We're already at the data block we want to load. Reset bounds in case
452 1 : // they changed since the last seek, but don't reload the block from cache
453 1 : // or disk.
454 1 : //
455 1 : // It's safe to leave i.data in its original state here, as all callers to
456 1 : // loadBlock make an absolute positioning call (i.e. a seek, first, or last)
457 1 : // to `i.data` right after loadBlock returns loadBlockOK.
458 1 : i.initBounds()
459 1 : return loadBlockOK
460 1 : }
461 : // Ensure the data block iterator is invalidated even if loading of the block
462 : // fails.
463 1 : i.data.Invalidate()
464 1 : i.dataBH = bhp.Handle
465 1 : if err != nil {
466 0 : i.err = errCorruptIndexEntry(err)
467 0 : return loadBlockFailed
468 0 : }
469 1 : if i.bpfs != nil {
470 1 : intersects, err := i.bpfs.intersects(bhp.Props)
471 1 : if err != nil {
472 0 : i.err = errCorruptIndexEntry(err)
473 0 : return loadBlockFailed
474 0 : }
475 1 : if intersects == blockMaybeExcluded {
476 1 : intersects = i.resolveMaybeExcluded(dir)
477 1 : }
478 1 : if intersects == blockExcluded {
479 1 : return loadBlockIrrelevant
480 1 : }
481 : // blockIntersects
482 : }
483 1 : ctx := objiotracing.WithBlockType(i.ctx, objiotracing.DataBlock)
484 1 : block, err := i.reader.readBlock(
485 1 : ctx, i.dataBH, nil /* transform */, i.dataRH, i.stats, &i.iterStats, i.bufferPool)
486 1 : if err != nil {
487 0 : i.err = err
488 0 : return loadBlockFailed
489 0 : }
490 1 : i.err = i.data.InitHandle(i.cmp, i.reader.Split, block, i.transforms)
491 1 : if i.err != nil {
492 0 : // The block is partially loaded, and we don't want it to appear valid.
493 0 : i.data.Invalidate()
494 0 : return loadBlockFailed
495 0 : }
496 1 : i.initBounds()
497 1 : return loadBlockOK
498 : }
499 :
500 : // readBlockForVBR implements the blockProviderWhenOpen interface for use by
501 : // the valueBlockReader.
502 : func (i *singleLevelIterator) readBlockForVBR(
503 : h block.Handle, stats *base.InternalIteratorStats,
504 1 : ) (block.BufferHandle, error) {
505 1 : ctx := objiotracing.WithBlockType(i.ctx, objiotracing.ValueBlock)
506 1 : return i.reader.readBlock(ctx, h, nil, i.vbRH, stats, &i.iterStats, i.bufferPool)
507 1 : }
508 :
509 : // resolveMaybeExcluded is invoked when the block-property filterer has found
510 : // that a block is excluded according to its properties but only if its bounds
511 : // fall within the filter's current bounds. This function consults the
512 : // appropriate bound, depending on the iteration direction, and returns either
513 : // `blockIntersects` or `blockExcluded`.
514 1 : func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult {
515 1 : // TODO(jackson): We could first try comparing to top-level index block's
516 1 : // key, and if within bounds avoid per-data block key comparisons.
517 1 :
518 1 : // This iterator is configured with a bound-limited block property
519 1 : // filter. The bpf determined this block could be excluded from
520 1 : // iteration based on the property encoded in the block handle.
521 1 : // However, we still need to determine if the block is wholly
522 1 : // contained within the filter's key bounds.
523 1 : //
524 1 : // External guarantees ensure all the block's keys are ≥ the
525 1 : // filter's lower bound during forward iteration, and that all the
526 1 : // block's keys are < the filter's upper bound during backward
527 1 : // iteration. We only need to determine if the opposite bound is
528 1 : // also met.
529 1 : //
530 1 : // The index separator in index.Key() provides an inclusive
531 1 : // upper-bound for the data block's keys, guaranteeing that all its
532 1 : // keys are ≤ index.Key(). For forward iteration, this is all we
533 1 : // need.
534 1 : if dir > 0 {
535 1 : // Forward iteration.
536 1 : if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.index.Key().UserKey) {
537 1 : return blockExcluded
538 1 : }
539 1 : return blockIntersects
540 : }
541 :
542 : // Reverse iteration.
543 : //
544 : // Because we're iterating in the reverse direction, we don't yet have
545 : // enough context available to determine if the block is wholly contained
546 : // within its bounds. This case arises only during backward iteration,
547 : // because of the way the index is structured.
548 : //
549 : // Consider a bound-limited bpf limited to the bounds [b,d), loading the
550 : // block with separator `c`. During reverse iteration, the guarantee that
551 : // all the block's keys are < `d` is externally provided, but no guarantee
552 : // is made on the bpf's lower bound. The separator `c` only provides an
553 : // inclusive upper bound on the block's keys, indicating that the
554 : // corresponding block handle points to a block containing only keys ≤ `c`.
555 : //
556 : // To establish a lower bound, we step the index backwards to read the
557 : // previous block's separator, which provides an inclusive lower bound on
558 : // the original block's keys. Afterwards, we step forward to restore our
559 : // index position.
560 1 : if peekKV := i.index.Prev(); peekKV == nil {
561 1 : // The original block points to the first block of this index block. If
562 1 : // there's a two-level index, it could potentially provide a lower
563 1 : // bound, but the code refactoring necessary to read it doesn't seem
564 1 : // worth the payoff. We fall through to loading the block.
565 1 : } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKV.K.UserKey) {
566 1 : // The lower-bound on the original block falls within the filter's
567 1 : // bounds, and we can skip the block (after restoring our current index
568 1 : // position).
569 1 : _ = i.index.Next()
570 1 : return blockExcluded
571 1 : }
572 1 : _ = i.index.Next()
573 1 : return blockIntersects
574 : }
575 :
576 : // The number of times to call Next/Prev in a block before giving up and seeking.
577 : // The value of 4 is arbitrary.
578 : // TODO(sumeer): experiment with dynamic adjustment based on the history of
579 : // seeks for a particular iterator.
580 : const numStepsBeforeSeek = 4
581 :
582 : func (i *singleLevelIterator) trySeekGEUsingNextWithinBlock(
583 : key []byte,
584 1 : ) (kv *base.InternalKV, done bool) {
585 1 : kv = i.data.KV()
586 1 : for j := 0; j < numStepsBeforeSeek; j++ {
587 1 : curKeyCmp := i.cmp(kv.K.UserKey, key)
588 1 : if curKeyCmp >= 0 {
589 1 : if i.blockUpper != nil {
590 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
591 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
592 1 : i.exhaustedBounds = +1
593 1 : return nil, true
594 1 : }
595 : }
596 1 : return kv, true
597 : }
598 1 : kv = i.data.Next()
599 1 : if kv == nil {
600 1 : break
601 : }
602 : }
603 1 : return kv, false
604 : }
605 :
606 : func (i *singleLevelIterator) trySeekLTUsingPrevWithinBlock(
607 : key []byte,
608 1 : ) (kv *base.InternalKV, done bool) {
609 1 : kv = i.data.KV()
610 1 : for j := 0; j < numStepsBeforeSeek; j++ {
611 1 : curKeyCmp := i.cmp(kv.K.UserKey, key)
612 1 : if curKeyCmp < 0 {
613 1 : if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 {
614 1 : i.exhaustedBounds = -1
615 1 : return nil, true
616 1 : }
617 1 : return kv, true
618 : }
619 1 : kv = i.data.Prev()
620 1 : if kv == nil {
621 1 : break
622 : }
623 : }
624 1 : return kv, false
625 : }
626 :
627 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
628 : // package. Note that SeekGE only checks the upper bound. It is up to the
629 : // caller to ensure that key is greater than or equal to the lower bound.
630 1 : func (i *singleLevelIterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
631 1 : if i.vState != nil {
632 1 : // Callers of SeekGE don't know about virtual sstable bounds, so we may
633 1 : // have to internally restrict the bounds.
634 1 : //
635 1 : // TODO(bananabrick): We can optimize this check away for the level iter
636 1 : // if necessary.
637 1 : if i.cmp(key, i.lower) < 0 {
638 1 : key = i.lower
639 1 : }
640 : }
641 :
642 1 : if flags.TrySeekUsingNext() {
643 1 : // The i.exhaustedBounds comparison indicates that the upper bound was
644 1 : // reached. The i.data.IsDataInvalidated() check indicates that the sstable was
645 1 : // exhausted.
646 1 : if (i.exhaustedBounds == +1 || i.data.IsDataInvalidated()) && i.err == nil {
647 1 : // Already exhausted, so return nil.
648 1 : return nil
649 1 : }
650 1 : if i.err != nil {
651 0 : // The current iterator position cannot be used.
652 0 : flags = flags.DisableTrySeekUsingNext()
653 0 : }
654 : // INVARIANT: flags.TrySeekUsingNext() => i.err == nil &&
655 : // i.exhaustedBounds != +1 && !i.data.IsDataInvalidated(). That is,
656 : // data-exhausted and bounds-exhausted, as defined earlier, are both
657 : // false. This makes it safe to clear out i.exhaustedBounds and i.err
658 : // before calling into seekGEHelper.
659 : }
660 :
661 1 : i.exhaustedBounds = 0
662 1 : i.err = nil // clear cached iteration error
663 1 : boundsCmp := i.boundsCmp
664 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
665 1 : i.boundsCmp = 0
666 1 : i.positionedUsingLatestBounds = true
667 1 : return i.seekGEHelper(key, boundsCmp, flags)
668 : }
669 :
670 : // seekGEHelper contains the common functionality for SeekGE and SeekPrefixGE.
671 : func (i *singleLevelIterator) seekGEHelper(
672 : key []byte, boundsCmp int, flags base.SeekGEFlags,
673 1 : ) *base.InternalKV {
674 1 : // Invariant: trySeekUsingNext => !i.data.isDataInvalidated() && i.exhaustedBounds != +1
675 1 :
676 1 : // SeekGE performs various step-instead-of-seeking optimizations: e.g. enabled
677 1 : // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp).
678 1 :
679 1 : var dontSeekWithinBlock bool
680 1 : if !i.data.IsDataInvalidated() && i.data.Valid() && i.index.Valid() &&
681 1 : boundsCmp > 0 && i.cmp(key, i.index.Key().UserKey) <= 0 {
682 1 : // Fast-path: The bounds have moved forward and this SeekGE is
683 1 : // respecting the lower bound (guaranteed by Iterator). We know that
684 1 : // the iterator must already be positioned within or just outside the
685 1 : // previous bounds. Therefore it cannot be positioned at a block (or
686 1 : // the position within that block) that is ahead of the seek position.
687 1 : // However it can be positioned at an earlier block. This fast-path to
688 1 : // use Next() on the block is only applied when we are already at the
689 1 : // block that the slow-path (the else-clause) would load -- this is
690 1 : // the motivation for the i.cmp(key, i.index.Key().UserKey) <= 0
691 1 : // predicate.
692 1 : i.initBoundsForAlreadyLoadedBlock()
693 1 : kv, done := i.trySeekGEUsingNextWithinBlock(key)
694 1 : if done {
695 1 : return kv
696 1 : }
697 1 : if kv == nil {
698 1 : // Done with this block.
699 1 : dontSeekWithinBlock = true
700 1 : }
701 1 : } else {
702 1 : // Cannot use bounds monotonicity. But may be able to optimize if
703 1 : // caller claimed externally known invariant represented by
704 1 : // flags.TrySeekUsingNext().
705 1 : if flags.TrySeekUsingNext() {
706 1 : // seekPrefixGE or SeekGE has already ensured
707 1 : // !i.data.isDataInvalidated() && i.exhaustedBounds != +1
708 1 : curr := i.data.KV()
709 1 : less := i.cmp(curr.K.UserKey, key) < 0
710 1 : // We could be more sophisticated and confirm that the seek
711 1 : // position is within the current block before applying this
712 1 : // optimization. But there may be some benefit even if it is in
713 1 : // the next block, since we can avoid seeking i.index.
714 1 : for j := 0; less && j < numStepsBeforeSeek; j++ {
715 1 : curr = i.Next()
716 1 : if curr == nil {
717 1 : return nil
718 1 : }
719 1 : less = i.cmp(curr.K.UserKey, key) < 0
720 : }
721 1 : if !less {
722 1 : if i.blockUpper != nil {
723 1 : cmp := i.cmp(curr.K.UserKey, i.blockUpper)
724 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
725 0 : i.exhaustedBounds = +1
726 0 : return nil
727 0 : }
728 : }
729 1 : return curr
730 : }
731 : }
732 :
733 : // Slow-path.
734 :
735 1 : var ikv *base.InternalKV
736 1 : if ikv = i.index.SeekGE(key, flags.DisableTrySeekUsingNext()); ikv == nil {
737 1 : // The target key is greater than any key in the index block.
738 1 : // Invalidate the block iterator so that a subsequent call to Prev()
739 1 : // will return the last key in the table.
740 1 : i.data.Invalidate()
741 1 : return nil
742 1 : }
743 1 : result := i.loadBlock(+1)
744 1 : if result == loadBlockFailed {
745 0 : return nil
746 0 : }
747 1 : if result == loadBlockIrrelevant {
748 1 : // Enforce the upper bound here since we don't want to bother moving
749 1 : // to the next block if the upper bound is already exceeded. Note that
750 1 : // the next block starts with keys >= ikv.K.UserKey since even
751 1 : // though this is the block separator, the same user key can span
752 1 : // multiple blocks. If upper is exclusive we use >= below, else
753 1 : // we use >.
754 1 : if i.upper != nil {
755 1 : cmp := i.cmp(ikv.K.UserKey, i.upper)
756 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
757 1 : i.exhaustedBounds = +1
758 1 : return nil
759 1 : }
760 : }
761 : // Want to skip to the next block.
762 1 : dontSeekWithinBlock = true
763 : }
764 : }
765 1 : if !dontSeekWithinBlock {
766 1 : if ikv := i.data.SeekGE(key, flags.DisableTrySeekUsingNext()); ikv != nil {
767 1 : if i.blockUpper != nil {
768 1 : cmp := i.cmp(ikv.K.UserKey, i.blockUpper)
769 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
770 1 : i.exhaustedBounds = +1
771 1 : return nil
772 1 : }
773 : }
774 1 : return ikv
775 : }
776 : }
777 1 : return i.skipForward()
778 : }
779 :
780 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
781 : // pebble package. Note that SeekPrefixGE only checks the upper bound. It is up
782 : // to the caller to ensure that key is greater than or equal to the lower bound.
783 : func (i *singleLevelIterator) SeekPrefixGE(
784 : prefix, key []byte, flags base.SeekGEFlags,
785 1 : ) *base.InternalKV {
786 1 : if i.vState != nil {
787 1 : // Callers of SeekPrefixGE aren't aware of virtual sstable bounds, so
788 1 : // we may have to internally restrict the bounds.
789 1 : //
790 1 : // TODO(bananabrick): We can optimize away this check for the level iter
791 1 : // if necessary.
792 1 : if i.cmp(key, i.lower) < 0 {
793 1 : key = i.lower
794 1 : }
795 : }
796 1 : return i.seekPrefixGE(prefix, key, flags, i.filterBlockSizeLimit)
797 : }
798 :
799 : func (i *singleLevelIterator) seekPrefixGE(
800 : prefix, key []byte, flags base.SeekGEFlags, filterBlockSizeLimit FilterBlockSizeLimit,
801 1 : ) (kv *base.InternalKV) {
802 1 : // NOTE: prefix is only used for bloom filter checking and not for later
803 1 : // work in this method. Hence, we can use the existing iterator position
804 1 : // if the last SeekPrefixGE did not fail bloom filter matching.
805 1 :
806 1 : err := i.err
807 1 : i.err = nil // clear cached iteration error
808 1 : if shouldUseFilterBlock(i.reader, filterBlockSizeLimit) {
809 1 : if !i.lastBloomFilterMatched {
810 1 : // Iterator is not positioned based on last seek.
811 1 : flags = flags.DisableTrySeekUsingNext()
812 1 : }
813 1 : i.lastBloomFilterMatched = false
814 1 : // Check prefix bloom filter.
815 1 : var mayContain bool
816 1 : mayContain, i.err = i.bloomFilterMayContain(prefix)
817 1 : if i.err != nil || !mayContain {
818 1 : // In the i.err == nil case, this invalidation may not be necessary for
819 1 : // correctness, and may be a place to optimize later by reusing the
820 1 : // already loaded block. It was necessary in earlier versions of the code
821 1 : // since the caller was allowed to call Next when SeekPrefixGE returned
822 1 : // nil. This is no longer allowed.
823 1 : i.data.Invalidate()
824 1 : return nil
825 1 : }
826 1 : i.lastBloomFilterMatched = true
827 : }
828 1 : if flags.TrySeekUsingNext() {
829 1 : // The i.exhaustedBounds comparison indicates that the upper bound was
830 1 : // reached. The i.data.IsDataInvalidated() check indicates that the sstable was
831 1 : // exhausted.
832 1 : if (i.exhaustedBounds == +1 || i.data.IsDataInvalidated()) && err == nil {
833 1 : // Already exhausted, so return nil.
834 1 : return nil
835 1 : }
836 1 : if err != nil {
837 0 : // The current iterator position cannot be used.
838 0 : flags = flags.DisableTrySeekUsingNext()
839 0 : }
840 : // INVARIANT: flags.TrySeekUsingNext() => err == nil &&
841 : // i.exhaustedBounds != +1 && !i.data.IsDataInvalidated(). That is,
842 : // data-exhausted and bounds-exhausted, as defined earlier, are both
843 : // false. This makes it safe to clear out i.exhaustedBounds and i.err
844 : // before calling into seekGEHelper.
845 : }
846 : // The bloom filter matched, or was skipped, so this method will position the
847 : // iterator.
848 1 : i.exhaustedBounds = 0
849 1 : boundsCmp := i.boundsCmp
850 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
851 1 : i.boundsCmp = 0
852 1 : i.positionedUsingLatestBounds = true
853 1 : return i.maybeVerifyKey(i.seekGEHelper(key, boundsCmp, flags))
854 : }
855 :
856 : // shouldUseFilterBlock returns whether we should use the filter block, based on
857 : // its length and the size limit.
858 1 : func shouldUseFilterBlock(reader *Reader, filterBlockSizeLimit FilterBlockSizeLimit) bool {
859 1 : return reader.tableFilter != nil && reader.filterBH.Length <= uint64(filterBlockSizeLimit)
860 1 : }
861 :
862 1 : func (i *singleLevelIterator) bloomFilterMayContain(prefix []byte) (bool, error) {
863 1 : // Check prefix bloom filter.
864 1 : prefixToCheck := prefix
865 1 : if i.transforms.SyntheticPrefix.IsSet() {
866 1 : // We have to remove the synthetic prefix.
867 1 : var ok bool
868 1 : prefixToCheck, ok = bytes.CutPrefix(prefix, i.transforms.SyntheticPrefix)
869 1 : if !ok {
870 1 : // This prefix will not be found inside this table.
871 1 : return false, nil
872 1 : }
873 : }
874 :
875 1 : dataH, err := i.reader.readFilter(i.ctx, i.indexFilterRH, i.stats, &i.iterStats)
876 1 : if err != nil {
877 0 : return false, err
878 0 : }
879 1 : defer dataH.Release()
880 1 : return i.reader.tableFilter.mayContain(dataH.Get(), prefixToCheck), nil
881 : }
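// For example, with a SyntheticPrefix of "p-", a seek for prefix "p-apple"
// checks "apple" against the filter, while a prefix lacking "p-" can be
// rejected without consulting the filter at all (values are illustrative):
//
//	rest, ok := bytes.CutPrefix([]byte("p-apple"), []byte("p-"))
//	// rest == []byte("apple"), ok == true: check "apple" against the filter.
//	_, ok = bytes.CutPrefix([]byte("q-apple"), []byte("p-"))
//	// ok == false: this prefix cannot appear anywhere in the table.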
882 :
883 : // virtualLast should only be called if i.vState != nil.
884 1 : func (i *singleLevelIterator) virtualLast() *base.InternalKV {
885 1 : if i.vState == nil {
886 0 : panic("pebble: invalid call to virtualLast")
887 : }
888 :
889 1 : if !i.endKeyInclusive {
890 1 : // Trivial case.
891 1 : return i.SeekLT(i.upper, base.SeekLTFlagsNone)
892 1 : }
893 1 : return i.virtualLastSeekLE()
894 : }
895 :
896 : // virtualLastSeekLE is called by virtualLast to do a SeekLE on the upper
897 : // bound of the file/iterator as part of a virtualLast. Consider
898 : // generalizing this into a SeekLE() if there are other uses of this
899 : // method in the future.
900 1 : func (i *singleLevelIterator) virtualLastSeekLE() *base.InternalKV {
901 1 : // Callers of SeekLE don't know about virtual sstable bounds, so we may
902 1 : // have to internally restrict the bounds.
903 1 : //
904 1 : // TODO(bananabrick): We can optimize this check away for the level iter
905 1 : // if necessary.
906 1 : if !i.endKeyInclusive {
907 0 : panic("unexpected virtualLastSeekLE with exclusive upper bounds")
908 : }
909 1 : key := i.upper
910 1 :
911 1 : i.exhaustedBounds = 0
912 1 : i.err = nil // clear cached iteration error
913 1 : // Seek optimization only applies until iterator is first positioned with a
914 1 : // SeekGE or SeekLT after SetBounds.
915 1 : i.boundsCmp = 0
916 1 : i.positionedUsingLatestBounds = true
917 1 :
918 1 : ikv := i.index.SeekGE(key, base.SeekGEFlagsNone)
919 1 : // We can have multiple internal keys with the same user key as the seek
920 1 : // key. In that case, we want the last (greatest) internal key.
921 1 : //
922 1 : // INVARIANT: One of two cases:
923 1 : // A. ikv == nil. There is no data block with index key >= key. So all keys
924 1 : // in the last data block are < key.
925 1 : // B. ikv.K.UserKey >= key. This data block may have some keys > key.
926 1 : //
927 1 : // Subcases of B:
928 1 : // B1. ikv.K.UserKey == key. This is when loop iteration happens.
929 1 : // Since ikv.K.UserKey >= the largest data key in the block, the
930 1 : // largest data key in this block is <= key.
931 1 : // B2. ikv.K.UserKey > key. Loop iteration will not happen.
932 1 : //
933 1 : // NB: We can avoid this Next()ing if we just implement a blockIter.SeekLE().
934 1 : // This might be challenging to do correctly, so we use regular
935 1 : // operations for now.
936 1 : for ikv != nil && bytes.Equal(ikv.K.UserKey, key) {
937 1 : ikv = i.index.Next()
938 1 : }
939 1 : if ikv == nil {
940 1 : // Cases A or B1 where B1 exhausted all blocks. In both cases the last block
941 1 : // has all keys <= key. skipBackward enforces the lower bound.
942 1 : return i.skipBackward()
943 1 : }
944 : // Case B. We are here because we were originally in case B2, or we were in B1
945 : // and we arrived at a block where ikv.K.UserKey > key. Either way,
946 : // ikv.K.UserKey > key. So there could be keys in the block > key. But the block preceding
947 : // this block cannot have any keys > key, otherwise it would have been the
948 : // result of the original index.SeekGE.
949 1 : result := i.loadBlock(-1)
950 1 : if result == loadBlockFailed {
951 0 : return nil
952 0 : }
953 1 : if result == loadBlockIrrelevant {
954 1 : // Want to skip to the previous block.
955 1 : return i.skipBackward()
956 1 : }
957 1 : ikv = i.data.SeekGE(key, base.SeekGEFlagsNone)
958 1 : // Go to the last user key that matches key, and then Prev() on the data
959 1 : // block.
960 1 : for ikv != nil && bytes.Equal(ikv.K.UserKey, key) {
961 1 : ikv = i.data.Next()
962 1 : }
963 1 : ikv = i.data.Prev()
964 1 : if ikv != nil {
965 1 : // Enforce the lower bound here, as we could have gone past it. This happens
966 1 : // if keys between `i.blockLower` and `key` are obsolete, for instance. Even
967 1 : // though i.blockLower (which is either nil or equal to i.lower) is <= key,
968 1 : // all internal keys in the user key interval [i.blockLower, key] could be
969 1 : // obsolete (due to a RANGEDEL which will not be observed here). And
970 1 : // i.data.Prev will skip all these obsolete keys, and could land on a key
971 1 : // below the lower bound, requiring the lower bound check.
972 1 : if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 {
973 0 : i.exhaustedBounds = -1
974 0 : return nil
975 0 : }
976 1 : return ikv
977 : }
978 1 : return i.skipBackward()
979 : }
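// A worked example of the SeekLE-via-SeekGE dance above, for a hypothetical
// virtual table with inclusive upper bound b: suppose one data block holds
// the internal keys a#3,SET b#2,SET b#1,SET and its index separator is c.
// index.SeekGE(b) lands on that block (case B2), data.SeekGE(b) positions at
// b#2,SET, the Next() loop walks past b#1,SET to the end of the block, and
// the final Prev() returns b#1,SET -- the last internal key with user key
// <= b.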
980 :
981 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
982 : // package. Note that SeekLT only checks the lower bound. It is up to the
983 : // caller to ensure that key is less than or equal to the upper bound.
984 1 : func (i *singleLevelIterator) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
985 1 : if i.vState != nil {
986 1 : // Might have to fix upper bound since virtual sstable bounds are not
987 1 : // known to callers of SeekLT.
988 1 : //
989 1 : // TODO(bananabrick): We can optimize away this check for the level iter
990 1 : // if necessary.
991 1 : cmp := i.cmp(key, i.upper)
992 1 : // key == i.upper is fine. We'll do the right thing and return the
993 1 : // first internal key with user key < key.
994 1 : if cmp > 0 {
995 1 : // Return the last key in the virtual sstable.
996 1 : return i.maybeVerifyKey(i.virtualLast())
997 1 : }
998 : }
999 :
1000 1 : i.exhaustedBounds = 0
1001 1 : i.err = nil // clear cached iteration error
1002 1 : boundsCmp := i.boundsCmp
1003 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1004 1 : i.boundsCmp = 0
1005 1 :
1006 1 : // Seeking operations perform various step-instead-of-seeking optimizations:
1007 1 : // eg by considering monotonically increasing bounds (i.boundsCmp).
1008 1 :
1009 1 : i.positionedUsingLatestBounds = true
1010 1 :
1011 1 : var dontSeekWithinBlock bool
1012 1 : if !i.data.IsDataInvalidated() && i.data.Valid() && i.index.Valid() &&
1013 1 : boundsCmp < 0 && i.cmp(i.data.FirstUserKey(), key) < 0 {
1014 1 : // Fast-path: The bounds have moved backward, and this SeekLT is
1015 1 : // respecting the upper bound (guaranteed by Iterator). We know that
1016 1 : // the iterator must already be positioned within or just outside the
1017 1 : // previous bounds. Therefore it cannot be positioned at a block (or
1018 1 : // the position within that block) that is behind the seek position.
1019 1 : // However it can be positioned at a later block. This fast-path to
1020 1 : // use Prev() on the block is only applied when we are already at the
1021 1 : // block that can satisfy this seek -- this is the motivation for
1022 1 : // the i.cmp(i.data.FirstUserKey(), key) < 0 predicate.
1023 1 : i.initBoundsForAlreadyLoadedBlock()
1024 1 : ikv, done := i.trySeekLTUsingPrevWithinBlock(key)
1025 1 : if done {
1026 1 : return ikv
1027 1 : }
1028 1 : if ikv == nil {
1029 1 : // Done with this block.
1030 1 : dontSeekWithinBlock = true
1031 1 : }
1032 1 : } else {
1033 1 : // Slow-path.
1034 1 : var ikv *base.InternalKV
1035 1 :
1036 1 : // NB: If a bound-limited block property filter is configured, it's
1037 1 : // externally ensured that the filter is disabled (through returning
1038 1 : // Intersects=false irrespective of the block props provided) during
1039 1 : // seeks.
1040 1 : if ikv = i.index.SeekGE(key, base.SeekGEFlagsNone); ikv == nil {
1041 1 : ikv = i.index.Last()
1042 1 : if ikv == nil {
1043 0 : return nil
1044 0 : }
1045 : }
1046 : // INVARIANT: ikv != nil.
1047 1 : result := i.loadBlock(-1)
1048 1 : if result == loadBlockFailed {
1049 0 : return nil
1050 0 : }
1051 1 : if result == loadBlockIrrelevant {
1052 1 : // Enforce the lower bound here since we don't want to bother moving
1053 1 : // to the previous block if the lower bound is already exceeded. Note
1054 1 : // that the previous block starts with keys <= ikv.K.UserKey since
1055 1 : // even though this is the current block's separator, the same
1056 1 : // user key can span multiple blocks.
1057 1 : if i.lower != nil && i.cmp(ikv.K.UserKey, i.lower) < 0 {
1058 1 : i.exhaustedBounds = -1
1059 1 : return nil
1060 1 : }
1061 : // Want to skip to the previous block.
1062 1 : dontSeekWithinBlock = true
1063 : }
1064 : }
1065 1 : if !dontSeekWithinBlock {
1066 1 : if ikv := i.data.SeekLT(key, flags); ikv != nil {
1067 1 : if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 {
1068 1 : i.exhaustedBounds = -1
1069 1 : return nil
1070 1 : }
1071 1 : return ikv
1072 : }
1073 : }
1074 : // The index contains separator keys which may lie between
1075 : // user-keys. Consider the user-keys:
1076 : //
1077 : // complete
1078 : // ---- new block ---
1079 : // complexion
1080 : //
1081 : // If these two keys end one block and start the next, the index key may
1082 : // be chosen as "compleu". The SeekGE in the index block will then point
1083 : // us to the block containing "complexion". If this happens, we want the
1084 : // last key from the previous data block.
1085 1 : return i.maybeVerifyKey(i.skipBackward())
1086 : }
1087 :
1088 : // First implements internalIterator.First, as documented in the pebble
1089 : // package. Note that First only checks the upper bound. It is up to the caller
1090 : // to ensure that key is greater than or equal to the lower bound (e.g. via a
1091 : // call to SeekGE(lower)).
1092 1 : func (i *singleLevelIterator) First() *base.InternalKV {
1093 1 : // If we have a lower bound, use SeekGE. Note that in general this is not
1094 1 : // supported usage, except when the lower bound is there because the table is
1095 1 : // virtual.
1096 1 : if i.lower != nil {
1097 1 : return i.SeekGE(i.lower, base.SeekGEFlagsNone)
1098 1 : }
1099 :
1100 1 : i.positionedUsingLatestBounds = true
1101 1 :
1102 1 : return i.firstInternal()
1103 : }
1104 :
1105 : // firstInternal is a helper used for absolute positioning in a single-level
1106 : // index file, or for positioning in the second-level index in a two-level
1107 : // index file. For the latter, one cannot make any claims about absolute
1108 : // positioning.
1109 1 : func (i *singleLevelIterator) firstInternal() *base.InternalKV {
1110 1 : i.exhaustedBounds = 0
1111 1 : i.err = nil // clear cached iteration error
1112 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1113 1 : i.boundsCmp = 0
1114 1 :
1115 1 : var kv *base.InternalKV
1116 1 : if kv = i.index.First(); kv == nil {
1117 0 : i.data.Invalidate()
1118 0 : return nil
1119 0 : }
1120 1 : result := i.loadBlock(+1)
1121 1 : if result == loadBlockFailed {
1122 0 : return nil
1123 0 : }
1124 1 : if result == loadBlockOK {
1125 1 : if kv := i.data.First(); kv != nil {
1126 1 : if i.blockUpper != nil {
1127 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1128 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1129 1 : i.exhaustedBounds = +1
1130 1 : return nil
1131 1 : }
1132 : }
1133 1 : return kv
1134 : }
1135 : // Else fall through to skipForward.
1136 1 : } else {
1137 1 : // result == loadBlockIrrelevant. Enforce the upper bound here since
1138 1 : // we don't want to bother moving to the next block if the upper bound
1139 1 : // is already exceeded. Note that the next block starts with keys >=
1140 1 : // kv.K.UserKey since even though this is the block separator, the
1141 1 : // same user key can span multiple blocks. If upper is exclusive we
1142 1 : // use >= below, else we use >.
1143 1 : if i.upper != nil {
1144 1 : cmp := i.cmp(kv.K.UserKey, i.upper)
1145 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1146 1 : i.exhaustedBounds = +1
1147 1 : return nil
1148 1 : }
1149 : }
1150 : // Else fall through to skipForward.
1151 : }
1152 :
1153 1 : return i.skipForward()
1154 : }
1155 :
1156 : // Last implements internalIterator.Last, as documented in the pebble
1157 : // package. Note that Last only checks the lower bound. It is up to the caller
1158 : // to ensure that key is less than the upper bound (e.g. via a call to
1159 : // SeekLT(upper))
1160 1 : func (i *singleLevelIterator) Last() *base.InternalKV {
1161 1 : if i.vState != nil {
1162 1 : return i.maybeVerifyKey(i.virtualLast())
1163 1 : }
1164 :
1165 1 : if i.upper != nil {
1166 0 : panic("singleLevelIterator.Last() used despite upper bound")
1167 : }
1168 1 : i.positionedUsingLatestBounds = true
1169 1 : return i.lastInternal()
1170 : }
1171 :
1172 : // lastInternal is a helper used for absolute positioning in a single-level
1173 : // index file, or for positioning in the second-level index in a two-level
1174 : // index file. For the latter, one cannot make any claims about absolute
1175 : // positioning.
1176 1 : func (i *singleLevelIterator) lastInternal() *base.InternalKV {
1177 1 : i.exhaustedBounds = 0
1178 1 : i.err = nil // clear cached iteration error
1179 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1180 1 : i.boundsCmp = 0
1181 1 :
1182 1 : var ikv *base.InternalKV
1183 1 : if ikv = i.index.Last(); ikv == nil {
1184 0 : i.data.Invalidate()
1185 0 : return nil
1186 0 : }
1187 1 : result := i.loadBlock(-1)
1188 1 : if result == loadBlockFailed {
1189 0 : return nil
1190 0 : }
1191 1 : if result == loadBlockOK {
1192 1 : if ikv := i.data.Last(); ikv != nil {
1193 1 : if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 {
1194 1 : i.exhaustedBounds = -1
1195 1 : return nil
1196 1 : }
1197 1 : return ikv
1198 : }
1199 : // Else fall through to skipBackward.
1200 1 : } else {
1201 1 : // result == loadBlockIrrelevant. Enforce the lower bound here since
1202 1 : // we don't want to bother moving to the previous block if the lower
1203 1 : // bound is already exceeded. Note that the previous block starts with
1204 1 : // keys <= ikv.K.UserKey since even though this is the current block's
1205 1 : // separator, the same user key can span multiple blocks.
1206 1 : if i.lower != nil && i.cmp(ikv.K.UserKey, i.lower) < 0 {
1207 1 : i.exhaustedBounds = -1
1208 1 : return nil
1209 1 : }
1210 : }
1211 :
1212 1 : return i.skipBackward()
1213 : }
1214 :
1215 : // Next implements internalIterator.Next, as documented in the pebble
1216 : // package.
1217 : // Note: compactionIterator.Next mirrors the implementation of Iterator.Next
1218 : // due to performance. Keep the two in sync.
1219 1 : func (i *singleLevelIterator) Next() *base.InternalKV {
1220 1 : if i.exhaustedBounds == +1 {
1221 0 : panic("Next called even though exhausted upper bound")
1222 : }
1223 1 : i.exhaustedBounds = 0
1224 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1225 1 : i.boundsCmp = 0
1226 1 :
1227 1 : if i.err != nil {
1228 0 : // TODO(jackson): Can this case be turned into a panic? Once an error is
1229 0 : // encountered, the iterator must be re-seeked.
1230 0 : return nil
1231 0 : }
1232 1 : if kv := i.data.Next(); kv != nil {
1233 1 : if i.blockUpper != nil {
1234 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1235 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1236 1 : i.exhaustedBounds = +1
1237 1 : return nil
1238 1 : }
1239 : }
1240 1 : return kv
1241 : }
1242 1 : return i.skipForward()
1243 : }
1244 :
1245 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1246 1 : func (i *singleLevelIterator) NextPrefix(succKey []byte) *base.InternalKV {
1247 1 : if i.exhaustedBounds == +1 {
1248 0 : panic("NextPrefix called even though exhausted upper bound")
1249 : }
1250 1 : i.exhaustedBounds = 0
1251 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1252 1 : i.boundsCmp = 0
1253 1 : if i.err != nil {
1254 0 : // TODO(jackson): Can this case be turned into a panic? Once an error is
1255 0 : // encountered, the iterator must be re-seeked.
1256 0 : return nil
1257 0 : }
1258 1 : if kv := i.data.NextPrefix(succKey); kv != nil {
1259 1 : if i.blockUpper != nil {
1260 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1261 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1262 1 : i.exhaustedBounds = +1
1263 1 : return nil
1264 1 : }
1265 : }
1266 1 : return kv
1267 : }
1268 : // Did not find prefix in the existing data block. This is the slow-path
1269 : // where we effectively seek the iterator.
1270 1 : var ikv *base.InternalKV
1271 1 : // The key is likely to be in the next data block, so try one step.
1272 1 : if ikv = i.index.Next(); ikv == nil {
1273 1 : // The target key is greater than any key in the index block.
1274 1 : // Invalidate the block iterator so that a subsequent call to Prev()
1275 1 : // will return the last key in the table.
1276 1 : i.data.Invalidate()
1277 1 : return nil
1278 1 : }
1279 1 : if i.cmp(succKey, ikv.K.UserKey) > 0 {
1280 1 : // Not in the next data block, so seek the index.
1281 1 : if ikv = i.index.SeekGE(succKey, base.SeekGEFlagsNone); ikv == nil {
1282 1 : // The target key is greater than any key in the index block.
1283 1 : // Invalidate the block iterator so that a subsequent call to Prev()
1284 1 : // will return the last key in the table.
1285 1 : i.data.Invalidate()
1286 1 : return nil
1287 1 : }
1288 : }
1289 1 : result := i.loadBlock(+1)
1290 1 : if result == loadBlockFailed {
1291 0 : return nil
1292 0 : }
1293 1 : if result == loadBlockIrrelevant {
1294 1 : // Enforce the upper bound here since we don't want to bother moving
1295 1 : // to the next block if the upper bound is already exceeded. Note that
1296 1 : // the next block starts with keys >= ikv.K.UserKey since even
1297 1 : // though this is the block separator, the same user key can span
1298 1 : // multiple blocks. If upper is exclusive we use >= below, else we use
1299 1 : // >.
1300 1 : if i.upper != nil {
1301 0 : cmp := i.cmp(ikv.K.UserKey, i.upper)
1302 0 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1303 0 : i.exhaustedBounds = +1
1304 0 : return nil
1305 0 : }
1306 : }
1307 1 : } else if kv := i.data.SeekGE(succKey, base.SeekGEFlagsNone); kv != nil {
1308 1 : if i.blockUpper != nil {
1309 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1310 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1311 1 : i.exhaustedBounds = +1
1312 1 : return nil
1313 1 : }
1314 : }
1315 1 : return i.maybeVerifyKey(kv)
1316 : }
1317 :
1318 1 : return i.skipForward()
1319 : }
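:
: // A hedged usage sketch of NextPrefix: succKey is expected to be the
: // immediate successor of the current key's prefix, so the call skips any
: // remaining keys with that prefix. The keys and iterator below are
: // illustrative, not from this package:
: //
: //	// Table contains a@3, a@1, b@7 (prefix, then MVCC version).
: //	kv := it.SeekGE([]byte("a"), base.SeekGEFlagsNone) // positions at a@3
: //	kv = it.NextPrefix([]byte("b"))                    // skips a@1, returns b@7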
1320 :
1321 : // Prev implements internalIterator.Prev, as documented in the pebble
1322 : // package.
1323 1 : func (i *singleLevelIterator) Prev() *base.InternalKV {
1324 1 : if i.exhaustedBounds == -1 {
1325 0 : panic("Prev called even though exhausted lower bound")
1326 : }
1327 1 : i.exhaustedBounds = 0
1328 1 : // The seek optimization only applies until the iterator is first positioned after SetBounds.
1329 1 : i.boundsCmp = 0
1330 1 :
1331 1 : if i.err != nil {
1332 0 : return nil
1333 0 : }
1334 1 : if kv := i.data.Prev(); kv != nil {
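: // Note that, unlike the upper bound, the lower bound is always inclusive
: // (there is no lower-bound analog of endKeyInclusive), so a plain
: // cmp < 0 check suffices here and in skipBackward.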
1335 1 : if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 {
1336 1 : i.exhaustedBounds = -1
1337 1 : return nil
1338 1 : }
1339 1 : return kv
1340 : }
1341 1 : return i.skipBackward()
1342 : }
1343 :
1344 1 : func (i *singleLevelIterator) skipForward() *base.InternalKV {
1345 1 : for {
1346 1 : indexKey := i.index.Next()
1347 1 : if indexKey == nil {
1348 1 : i.data.Invalidate()
1349 1 : break
1350 : }
1351 1 : result := i.loadBlock(+1)
1352 1 : if result != loadBlockOK {
1353 1 : if i.err != nil {
1354 0 : break
1355 : }
1356 1 : if result == loadBlockFailed {
1357 0 : // We checked that i.index was at a valid entry, so
1358 0 : // loadBlockFailed could not have happened due to i.index
1359 0 : // being exhausted, and must be due to an error.
1360 0 : panic("loadBlock should not have failed with no error")
1361 : }
1362 : // result == loadBlockIrrelevant. Enforce the upper bound here
1363 : // since we don't want to bother moving to the next block if the
1364 : // upper bound is already exceeded. Note that the next block starts
1365 : // with keys >= indexKey.K.UserKey since, even though this is the
1366 : // block separator, the same user key can span multiple blocks. If
1367 : // upper is exclusive we use >= below, else we use >.
1368 1 : if i.upper != nil {
1369 1 : cmp := i.cmp(indexKey.K.UserKey, i.upper)
1370 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1371 1 : i.exhaustedBounds = +1
1372 1 : return nil
1373 1 : }
1374 : }
1375 1 : continue
1376 : }
1377 1 : var kv *base.InternalKV
1378 1 : // It is possible that skipBackward went too far and the virtual table lower
1379 1 : // bound is after the first key in the block we are about to load, in which
1380 1 : // case we must use SeekGE.
1381 1 : //
1382 1 : // An example of how this can happen:
1383 1 : //
1384 1 : // Data block 1 - contains keys a@1, c@1
1385 1 : // Data block 2 - contains keys e@1, g@1
1386 1 : // Data block 3 - contains keys i@2, k@2
1387 1 : //
1388 1 : // The virtual table lower bound is f. We have a range key masking filter
1389 1 : // that filters keys with @1 suffix. We are positioned inside block 3 then
1390 1 : // we Prev(). Block 2 is entirely filtered out, which makes us move to
1391 1 : // block 1. Now the range key masking filter gets an update (via
1392 1 : // SpanChanged) and it no longer filters out any keys. At this point if a
1393 1 : // Next happens, we will load block 2 but it would not be legal to return
1394 1 : // "e@1" which is outside the virtual bounds.
1395 1 : //
1396 1 : // The core of the problem is that skipBackward doesn't know it can stop
1397 1 : // at block 2, because it doesn't know what keys are at the start of that
1398 1 : // block. This is why we don't have this problem in the opposite
1399 1 : // direction: skipForward will never go beyond the last relevant block
1400 1 : // because it looks at the separator key which is an upper bound for the
1401 1 : // block.
1402 1 : //
1403 1 : // Note that this is only a problem with virtual tables; we make no
1404 1 : // guarantees with respect to an iterator lower bound when we iterate
1405 1 : // forward. But we must never return keys that are not inside the virtual table.
1406 1 : if i.vState != nil && i.blockLower != nil {
1407 1 : kv = i.data.SeekGE(i.lower, base.SeekGEFlagsNone)
1408 1 : } else {
1409 1 : kv = i.data.First()
1410 1 : }
1411 1 : if kv != nil {
1412 1 : if i.blockUpper != nil {
1413 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1414 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1415 1 : i.exhaustedBounds = +1
1416 1 : return nil
1417 1 : }
1418 : }
1419 1 : return i.maybeVerifyKey(kv)
1420 : }
1421 : }
1422 1 : return nil
1423 : }
1424 :
1425 1 : func (i *singleLevelIterator) skipBackward() *base.InternalKV {
1426 1 : for {
1427 1 : indexKey := i.index.Prev()
1428 1 : if indexKey == nil {
1429 1 : i.data.Invalidate()
1430 1 : break
1431 : }
1432 1 : result := i.loadBlock(-1)
1433 1 : if result != loadBlockOK {
1434 1 : if i.err != nil {
1435 0 : break
1436 : }
1437 1 : if result == loadBlockFailed {
1438 0 : // We checked that i.index was at a valid entry, so
1439 0 : // loadBlockFailed could not have happened due to i.index
1440 0 : // being exhausted, and must be due to an error.
1441 0 : panic("loadBlock should not have failed with no error")
1442 : }
1443 : // result == loadBlockIrrelevant. Enforce the lower bound here
1444 : // since we don't want to bother moving to the previous block if the
1445 : // lower bound is already exceeded. Note that the previous block
1446 : // contains keys <= indexKey.K.UserKey since, even though this is the
1447 : // current block's separator, the same user key can span multiple blocks.
1448 1 : if i.lower != nil && i.cmp(indexKey.K.UserKey, i.lower) < 0 {
1449 1 : i.exhaustedBounds = -1
1450 1 : return nil
1451 1 : }
1452 1 : continue
1453 : }
1454 1 : kv := i.data.Last()
1455 1 : if kv == nil {
1456 1 : // The block iter could have hidden some obsolete points, so it
1457 1 : // isn't safe to assume that there are no keys if we keep skipping
1458 1 : // backwards. Check the previous block, but check the lower bound
1459 1 : // before doing that.
1460 1 : if i.lower != nil && i.cmp(indexKey.K.UserKey, i.lower) < 0 {
1461 1 : i.exhaustedBounds = -1
1462 1 : return nil
1463 1 : }
1464 1 : continue
1465 : }
1466 1 : if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 {
1467 1 : i.exhaustedBounds = -1
1468 1 : return nil
1469 1 : }
1470 1 : return i.maybeVerifyKey(kv)
1471 : }
1472 1 : return nil
1473 : }
1474 :
1475 : // Error implements internalIterator.Error, as documented in the pebble
1476 : // package.
1477 1 : func (i *singleLevelIterator) Error() error {
1478 1 : if err := i.data.Error(); err != nil {
1479 0 : return err
1480 0 : }
1481 1 : return i.err
1482 : }
1483 :
1484 : // SetCloseHook sets a function that will be called when the iterator is
1485 : // closed.
1486 1 : func (i *singleLevelIterator) SetCloseHook(fn func(i Iterator) error) {
1487 1 : i.closeHook = fn
1488 1 : }
1489 :
1490 1 : func firstError(err0, err1 error) error {
1491 1 : if err0 != nil {
1492 0 : return err0
1493 0 : }
1494 1 : return err1
1495 : }
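:
: // firstError lets Close below attempt every cleanup step while reporting
: // only the first failure. A minimal sketch of the pattern (closeA and
: // closeB are hypothetical):
: //
: //	var err error
: //	err = firstError(err, closeA())
: //	err = firstError(err, closeB()) // still runs even if closeA failed
: //	return err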
1496 :
1497 : // Close implements internalIterator.Close, as documented in the pebble
1498 : // package.
1499 1 : func (i *singleLevelIterator) Close() error {
1500 1 : if invariants.Enabled && i.inPool {
1501 0 : panic("Close called on interator in pool")
1502 : }
1503 1 : i.iterStats.close()
1504 1 : var err error
1505 1 : if i.closeHook != nil {
1506 1 : err = firstError(err, i.closeHook(i))
1507 1 : }
1508 1 : err = firstError(err, i.data.Close())
1509 1 : err = firstError(err, i.index.Close())
1510 1 : if i.indexFilterRH != nil {
1511 1 : err = firstError(err, i.indexFilterRH.Close())
1512 1 : i.indexFilterRH = nil
1513 1 : }
1514 1 : if i.dataRH != nil {
1515 1 : err = firstError(err, i.dataRH.Close())
1516 1 : i.dataRH = nil
1517 1 : }
1518 1 : err = firstError(err, i.err)
1519 1 : if i.bpfs != nil {
1520 1 : releaseBlockPropertiesFilterer(i.bpfs)
1521 1 : }
1522 1 : if i.vbReader != nil {
1523 1 : i.vbReader.close()
1524 1 : }
1525 1 : if i.vbRH != nil {
1526 1 : err = firstError(err, i.vbRH.Close())
1527 1 : i.vbRH = nil
1528 1 : }
1529 1 : *i = i.resetForReuse()
1530 1 : singleLevelIterPool.Put(i)
1531 1 : return err
1532 : }
1533 :
1534 0 : func (i *singleLevelIterator) String() string {
1535 0 : if i.vState != nil {
1536 0 : return i.vState.fileNum.String()
1537 0 : }
1538 0 : return i.reader.cacheOpts.FileNum.String()
1539 : }
1540 :
1541 : // DebugTree is part of the InternalIterator interface.
1542 0 : func (i *singleLevelIterator) DebugTree(tp treeprinter.Node) {
1543 0 : tp.Childf("%T(%p) fileNum=%s", i, i, i.String())
1544 0 : }