Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "fmt"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/pebble/internal/base"
14 : "github.com/cockroachdb/pebble/internal/invariants"
15 : "github.com/cockroachdb/pebble/objstorage"
16 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider"
17 : "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
18 : )
19 :
20 : // singleLevelIterator iterates over an entire table of data. To seek for a given
21 : // key, it first looks in the index for the block that contains that key, and then
22 : // looks inside that block.
23 : type singleLevelIterator struct {
24 : ctx context.Context
25 : cmp Compare
26 : // Global lower/upper bound for the iterator.
27 : lower []byte
28 : upper []byte
29 : bpfs *BlockPropertiesFilterer
30 : // Per-block lower/upper bound. Nil if the bound does not apply to the block
31 : // because we determined the block lies completely within the bound.
32 : blockLower []byte
33 : blockUpper []byte
34 : reader *Reader
35 : // vState will be set iff the iterator is constructed for virtual sstable
36 : // iteration.
37 : vState *virtualState
38 : // endKeyInclusive is set to force the iterator to treat the upper field as
39 : // inclusive while iterating instead of exclusive.
40 : endKeyInclusive bool
41 : index blockIter
42 : data blockIter
43 : dataRH objstorage.ReadHandle
44 : dataRHPrealloc objstorageprovider.PreallocatedReadHandle
45 : // dataBH refers to the last data block that the iterator considered
46 : // loading. It may not actually have loaded the block, due to an error or
47 : // because it was considered irrelevant.
48 : dataBH BlockHandle
49 : vbReader *valueBlockReader
50 : // vbRH is the read handle for value blocks, which are in a different
51 : // part of the sstable than data blocks.
52 : vbRH objstorage.ReadHandle
53 : vbRHPrealloc objstorageprovider.PreallocatedReadHandle
54 : err error
55 : closeHook func(i Iterator) error
56 : // stats and iterStats are slightly different. stats is a shared struct
57 : // supplied from the outside, and represents stats for the whole iterator
58 : // tree and can be reset from the outside (e.g. when the pebble.Iterator is
59 : // being reused). It is currently only provided when the iterator tree is
60 : // rooted at pebble.Iterator. iterStats is this sstable iterator's private
61 : // stats that are reported to a CategoryStatsCollector when this iterator is
62 : // closed. More paths are instrumented with iterStats because the
63 : // CategoryStatsCollector it needs is provided by the
64 : // tableCacheContainer (which is more universally used).
65 : stats *base.InternalIteratorStats
66 : iterStats iterStatsAccumulator
67 : bufferPool *BufferPool
68 :
69 : // boundsCmp and positionedUsingLatestBounds are for optimizing iteration
70 : // that uses multiple adjacent bounds. The seek after setting a new bound
71 : // can use the fact that the iterator is either within the previous bounds
72 : // or exactly one key before or after the bounds. If the new bounds are
73 : // after/before the previous bounds, and we are already positioned at a
74 : // block that is relevant for the new bounds, we can try to first position
75 : // using Next/Prev (repeatedly) instead of doing a more expensive seek.
76 : //
77 : // When there are wide files at higher levels that match the bounds
78 : // but don't have any data for the bound, we will already be
79 : // positioned at the key beyond the bounds and won't need to do much
80 : // work -- given that most data is in L6, such files are likely to
81 : // dominate the performance of the mergingIter, and may be the main
82 : // benefit of this performance optimization (of course it also helps
83 : // when the file that has the data has successive seeks that stay in
84 : // the same block).
85 : //
86 : // Specifically, boundsCmp captures the relationship between the previous
87 : // and current bounds, if the iterator had been positioned after setting
88 : // the previous bounds. If it was not positioned, i.e., Seek/First/Last
89 : // were not called, we don't know where it is positioned and cannot
90 : // optimize.
91 : //
92 : // Example: Bounds moving forward, and iterator exhausted in forward direction.
93 : // bounds = [f, h), ^ shows block iterator position
94 : // file contents [ a b c d e f g h i j k ]
95 : // ^
96 : // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is
97 : // set to +1. SeekGE(j) can use next (the optimization also requires that j
98 : // is within the block; that requirement is not needed for correctness, but
99 : // limits the optimization to cases where it will actually pay off).
100 : //
101 : // Example: Bounds moving forward.
102 : // bounds = [f, h), ^ shows block iterator position
103 : // file contents [ a b c d e f g h i j k ]
104 : // ^
105 : // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is
106 : // set to +1. SeekGE(j) can use next.
107 : //
108 : // Example: Bounds moving forward, but iterator not positioned using previous
109 : // bounds.
110 : // bounds = [f, h), ^ shows block iterator position
111 : // file contents [ a b c d e f g h i j k ]
112 : // ^
113 : // new bounds = [i, j). Iterator is at j since it was never positioned using
114 : // [f, h). So positionedUsingLatestBounds=false, and boundsCmp is set to 0.
115 : // SeekGE(i) will not use next.
116 : //
117 : // Example: Bounds moving forward and sparse file
118 : // bounds = [f, h), ^ shows block iterator position
119 : // file contents [ a z ]
120 : // ^
121 : // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is
122 : // set to +1. SeekGE(j) notices that the iterator is already past j and does
123 : // not need to do anything.
124 : //
125 : // Similar examples can be constructed for backward iteration.
126 : //
127 : // This notion of exactly one key before or after the bounds is not quite
128 : // true when block properties are used to ignore blocks. In that case we
129 : // can't stop precisely at the first block that is past the bounds since
130 : // we are using the index entries to enforce the bounds.
131 : //
132 : // e.g. 3 blocks with keys [b, c], [f, g], [i, j, k] with index entries d,
133 : // h, l. Let the lower bound be k, and suppose we are iterating in reverse. If
134 : // the block [i, j, k] is ignored due to the block interval annotations we
135 : // do need to move the index to block [f, g] since the index entry for the
136 : // [i, j, k] block is l which is not less than the lower bound of k. So we
137 : // have passed the entries i, j.
138 : //
139 : // This behavior is harmless since the block property filters are fixed
140 : // for the lifetime of the iterator so i, j are irrelevant. In addition,
141 : // the current code will not load the [f, g] block, so the seek
142 : // optimization that attempts to use Next/Prev does not apply anyway.
143 : boundsCmp int
144 : positionedUsingLatestBounds bool
145 :
146 : // exhaustedBounds represents whether the iterator is exhausted for
147 : // iteration by reaching the upper or lower bound: +1 when the upper bound
148 : // has been reached, -1 when the lower bound has been reached, and 0 when
149 : // neither. exhaustedBounds is also used for the TrySeekUsingNext
150 : // optimization in twoLevelIterator and singleLevelIterator. Care should be
151 : // taken in setting this in twoLevelIterator before calling into
152 : // singleLevelIterator, given that these two iterators share this field.
153 : exhaustedBounds int8
154 :
155 : // useFilter specifies whether the filter block in this sstable, if present,
156 : // should be used for prefix seeks or not. In some cases it is beneficial
157 : // to skip a filter block even if it exists (e.g., if the probability of a
158 : // match is high).
159 : useFilter bool
160 : lastBloomFilterMatched bool
161 :
162 : transforms IterTransforms
163 :
164 : // inPool is set to true before putting the iterator in the reusable pool;
165 : // used to detect double-close.
166 : inPool bool
167 : }
168 :
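// Illustrative sketch (not part of the original source): how SetBounds below
// computes boundsCmp from the previous and new bounds, assuming cmp is a
// bytes.Compare-style comparator. +1 means the new bounds lie entirely at or
// beyond the previous upper bound, -1 means entirely at or before the
// previous lower bound, and 0 means the monotonic-bounds optimization does
// not apply.
//
//	func exampleBoundsCmp(cmp func(a, b []byte) int, prevLower, prevUpper, newLower, newUpper []byte) int {
//		if prevUpper != nil && newLower != nil && cmp(prevUpper, newLower) <= 0 {
//			return +1 // bounds moved forward: try stepping with Next before seeking
//		}
//		if prevLower != nil && newUpper != nil && cmp(newUpper, prevLower) <= 0 {
//			return -1 // bounds moved backward: try stepping with Prev before seeking
//		}
//		return 0
//	}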
169 : // singleLevelIterator implements the base.InternalIterator interface.
170 : var _ base.InternalIterator = (*singleLevelIterator)(nil)
171 :
172 : // init initializes a singleLevelIterator for reading from the table. It is
173 : // synonymous with Reader.NewIter, but allows for reuse of the iterator
174 : // between different Readers.
175 : //
176 : // Note that lower, upper passed into init have nothing to do with virtual sstable
177 : // bounds. If the virtualState passed in is not nil, then virtual sstable bounds
178 : // will be enforced.
179 : func (i *singleLevelIterator) init(
180 : ctx context.Context,
181 : r *Reader,
182 : v *virtualState,
183 : transforms IterTransforms,
184 : lower, upper []byte,
185 : filterer *BlockPropertiesFilterer,
186 : useFilter bool,
187 : stats *base.InternalIteratorStats,
188 : categoryAndQoS CategoryAndQoS,
189 : statsCollector *CategoryStatsCollector,
190 : rp ReaderProvider,
191 : bufferPool *BufferPool,
192 1 : ) error {
193 1 : if r.err != nil {
194 0 : return r.err
195 0 : }
196 1 : i.iterStats.init(categoryAndQoS, statsCollector)
197 1 : indexH, err := r.readIndex(ctx, stats, &i.iterStats)
198 1 : if err != nil {
199 0 : return err
200 0 : }
201 1 : if v != nil {
202 1 : i.vState = v
203 1 : i.endKeyInclusive, lower, upper = v.constrainBounds(lower, upper, false /* endInclusive */)
204 1 : }
205 :
206 1 : i.inPool = false
207 1 : i.ctx = ctx
208 1 : i.lower = lower
209 1 : i.upper = upper
210 1 : i.bpfs = filterer
211 1 : i.useFilter = useFilter
212 1 : i.reader = r
213 1 : i.cmp = r.Compare
214 1 : i.stats = stats
215 1 : i.transforms = transforms
216 1 : i.bufferPool = bufferPool
217 1 : err = i.index.initHandle(i.cmp, r.Split, indexH, transforms)
218 1 : if err != nil {
219 0 : // blockIter.Close releases indexH and always returns a nil error
220 0 : _ = i.index.Close()
221 0 : return err
222 0 : }
223 1 : i.dataRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, &i.dataRHPrealloc)
224 1 : if r.tableFormat >= TableFormatPebblev3 {
225 1 : if r.Properties.NumValueBlocks > 0 {
226 1 : // NB: we cannot avoid this ~248 byte allocation, since valueBlockReader
227 1 : // can outlive the singleLevelIterator due to being embedded in a
228 1 : // LazyValue. This consumes ~2% in microbenchmark CPU profiles, but we
229 1 : // should only optimize this if it shows up as significant in end-to-end
230 1 : // CockroachDB benchmarks, since it is tricky to do so. One possibility
231 1 : // is that if many sstable iterators only get positioned at latest
232 1 : // versions of keys, and therefore never expose a LazyValue that is
233 1 : // separated to their callers, they can put this valueBlockReader into a
234 1 : // sync.Pool.
235 1 : i.vbReader = &valueBlockReader{
236 1 : bpOpen: i,
237 1 : rp: rp,
238 1 : vbih: r.valueBIH,
239 1 : stats: stats,
240 1 : }
241 1 : i.data.lazyValueHandling.vbr = i.vbReader
242 1 : i.vbRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, &i.vbRHPrealloc)
243 1 : }
244 1 : i.data.lazyValueHandling.hasValuePrefix = true
245 : }
246 1 : return nil
247 : }
248 :
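// Illustrative usage sketch (hypothetical call site, not part of the original
// source): init pairs with the pooled-reuse pattern used by Close, which
// resets the iterator and returns it to singleLevelIterPool.
//
//	it := singleLevelIterPool.Get().(*singleLevelIterator)
//	if err := it.init(ctx, r, nil /* virtual state */, transforms, lower, upper,
//		filterer, true /* useFilter */, stats, categoryAndQoS, statsCollector,
//		rp, bufferPool); err != nil {
//		return err
//	}
//	// ... iterate ...
//	err := it.Close() // resets the iterator for reuse and returns it to the pool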
249 : // Helper function to check if keys returned from the iterator are within virtual bounds.
250 1 : func (i *singleLevelIterator) maybeVerifyKey(kv *base.InternalKV) *base.InternalKV {
251 1 : if invariants.Enabled && kv != nil && i.vState != nil {
252 1 : key := kv.K.UserKey
253 1 : v := i.vState
254 1 : lc := i.cmp(key, v.lower.UserKey)
255 1 : uc := i.cmp(key, v.upper.UserKey)
256 1 : if lc < 0 || uc > 0 || (uc == 0 && v.upper.IsExclusiveSentinel()) {
257 0 : panic(fmt.Sprintf("key %q out of singleLevelIterator virtual bounds %s %s", key, v.lower.UserKey, v.upper.UserKey))
258 : }
259 : }
260 1 : return kv
261 : }
262 :
263 : // setupForCompaction sets up the singleLevelIterator for use with compactionIter.
264 : // Currently, it skips readahead ramp-up. It should be called after init is called.
265 1 : func (i *singleLevelIterator) setupForCompaction() {
266 1 : i.dataRH.SetupForCompaction()
267 1 : if i.vbRH != nil {
268 1 : i.vbRH.SetupForCompaction()
269 1 : }
270 : }
271 :
272 1 : func (i *singleLevelIterator) resetForReuse() singleLevelIterator {
273 1 : return singleLevelIterator{
274 1 : index: i.index.resetForReuse(),
275 1 : data: i.data.resetForReuse(),
276 1 : inPool: true,
277 1 : }
278 1 : }
279 :
280 1 : func (i *singleLevelIterator) initBounds() {
281 1 : // Trim the iteration bounds for the current block. We don't have to check
282 1 : // the bounds on each iteration if the block is entirely contained within the
283 1 : // iteration bounds.
284 1 : i.blockLower = i.lower
285 1 : if i.blockLower != nil {
286 1 : kv := i.data.First()
287 1 : // TODO(radu): this should be <= 0
288 1 : if kv != nil && i.cmp(i.blockLower, kv.K.UserKey) < 0 {
289 1 : // The lower-bound is less than the first key in the block. No need
290 1 : // to check the lower-bound again for this block.
291 1 : i.blockLower = nil
292 1 : }
293 : }
294 1 : i.blockUpper = i.upper
295 1 : // TODO(radu): this should be >= 0 if blockUpper is inclusive.
296 1 : if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 {
297 1 : // The upper-bound is greater than the index key which itself is greater
298 1 : // than or equal to every key in the block. No need to check the
299 1 : // upper-bound again for this block. Even if blockUpper is inclusive
300 1 : // because of upper being inclusive, we can still safely set blockUpper
301 1 : // to nil here.
302 1 : i.blockUpper = nil
303 1 : }
304 : }
305 :
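// Worked example (illustrative, not part of the original source): with
// iterator bounds [c, m) and a data block whose first key is d and whose
// index separator is h (so all of the block's keys lie in [d, h]), initBounds
// sets blockLower = nil because c sorts before d, and blockUpper = nil
// because m sorts after h. Iteration within this block can then skip per-KV
// bound comparisons entirely.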
306 1 : func (i *singleLevelIterator) initBoundsForAlreadyLoadedBlock() {
307 1 : // TODO(radu): determine automatically if we need to call First or not and
308 1 : // unify this function with initBounds().
309 1 : if i.data.getFirstUserKey() == nil {
310 0 : panic("initBoundsForAlreadyLoadedBlock must not be called on empty or corrupted block")
311 : }
312 1 : i.blockLower = i.lower
313 1 : if i.blockLower != nil {
314 1 : firstUserKey := i.data.getFirstUserKey()
315 1 : // TODO(radu): this should be <= 0
316 1 : if firstUserKey != nil && i.cmp(i.blockLower, firstUserKey) < 0 {
317 1 : // The lower-bound is less than the first key in the block. No need
318 1 : // to check the lower-bound again for this block.
319 1 : i.blockLower = nil
320 1 : }
321 : }
322 1 : i.blockUpper = i.upper
323 1 : // TODO(radu): this should be >= 0 if blockUpper is inclusive.
324 1 : if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 {
325 1 : // The upper-bound is greater than the index key which itself is greater
326 1 : // than or equal to every key in the block. No need to check the
327 1 : // upper-bound again for this block.
328 1 : i.blockUpper = nil
329 1 : }
330 : }
331 :
332 : // Deterministic disabling of the bounds-based optimization that avoids seeking.
333 : // Uses the iterator pointer, since we want diversity in iterator behavior for
334 : // the same SetBounds call. Used for tests.
335 1 : func disableBoundsOpt(bound []byte, ptr uintptr) bool {
336 1 : // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
337 1 : simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
338 1 : return bound[len(bound)-1]&byte(1) == 0 && simpleHash == 0
339 1 : }
340 :
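// For illustration (a restatement of the logic above, not additional source):
// the optimization is suppressed only when the bound's last byte is even AND
// the top bit of the Fibonacci hash of the iterator pointer is zero, i.e. for
// roughly half the bounds on roughly half the iterator instances:
//
//	ptr := uintptr(unsafe.Pointer(i))
//	simpleHash := (11400714819323198485 * uint64(ptr)) >> 63 // keeps only the top bit: 0 or 1
//	disabled := bound[len(bound)-1]&1 == 0 && simpleHash == 0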
341 : // ensureBoundsOptDeterminism provides a facility for turning off the randomized
342 : // disabling of bounds optimizations performed by disableBoundsOpt, for tests
343 : // that require deterministic iterator behavior. Some unit tests examine
344 : // internal iterator state and require this behavior to be deterministic.
345 : var ensureBoundsOptDeterminism bool
346 :
347 : // SetBoundsWithSyntheticPrefix indicates whether this iterator requires keys
348 : // passed to its SetBounds() method by a prefix rewriting wrapper to be *not*
349 : // rewritten to be in terms of this iterator's content, but instead be passed
350 : // as-is, i.e. with the synthetic prefix still on them.
351 : //
352 : // This allows an optimization when this iterator is passing these bounds on to
353 : // a vState to additionally constrain them. In said vState, passed bounds are
354 : // combined with the vState bounds which are in terms of the rewritten prefix.
355 : // If the caller rewrote bounds to be in terms of content prefix and SetBounds
356 : // passed those to vState, the vState would need to *un*rewrite them back to the
357 : // synthetic prefix in order to combine them with the vState bounds. Thus, if
358 : // this iterator knows bounds will be passed to vState, it can signal that it
359 : // they should be passed without being rewritten to skip converting to and fro.
360 0 : func (i singleLevelIterator) SetBoundsWithSyntheticPrefix() bool {
361 0 : return i.vState != nil
362 0 : }
363 :
364 : // SetBounds implements internalIterator.SetBounds, as documented in the pebble
365 : // package. Note that the upper field is exclusive.
366 1 : func (i *singleLevelIterator) SetBounds(lower, upper []byte) {
367 1 : i.boundsCmp = 0
368 1 : if i.vState != nil {
369 1 : // If the reader is constructed for a virtual sstable, then we must
370 1 : // constrain the bounds of the reader. For physical sstables, the bounds
371 1 : // can be wider than the actual sstable's bounds because we won't
372 1 : // accidentally expose additional keys as there are no additional keys.
373 1 : i.endKeyInclusive, lower, upper = i.vState.constrainBounds(
374 1 : lower, upper, false,
375 1 : )
376 1 : } else {
377 1 : // TODO(bananabrick): Figure out the logic here to enable the boundsCmp
378 1 : // optimization for virtual sstables.
379 1 : if i.positionedUsingLatestBounds {
380 1 : if i.upper != nil && lower != nil && i.cmp(i.upper, lower) <= 0 {
381 1 : i.boundsCmp = +1
382 1 : if invariants.Enabled && !ensureBoundsOptDeterminism &&
383 1 : disableBoundsOpt(lower, uintptr(unsafe.Pointer(i))) {
384 1 : i.boundsCmp = 0
385 1 : }
386 1 : } else if i.lower != nil && upper != nil && i.cmp(upper, i.lower) <= 0 {
387 1 : i.boundsCmp = -1
388 1 : if invariants.Enabled && !ensureBoundsOptDeterminism &&
389 1 : disableBoundsOpt(upper, uintptr(unsafe.Pointer(i))) {
390 1 : i.boundsCmp = 0
391 1 : }
392 : }
393 : }
394 : }
395 :
396 1 : i.positionedUsingLatestBounds = false
397 1 : i.lower = lower
398 1 : i.upper = upper
399 1 : i.blockLower = nil
400 1 : i.blockUpper = nil
401 : }
402 :
403 0 : func (i *singleLevelIterator) SetContext(ctx context.Context) {
404 0 : i.ctx = ctx
405 0 : }
406 :
407 : // loadBlock loads the block at the current index position and leaves i.data
408 : // unpositioned. If unsuccessful, it sets i.err to any error encountered, which
409 : // may be nil if we have simply exhausted the entire table.
410 1 : func (i *singleLevelIterator) loadBlock(dir int8) loadBlockResult {
411 1 : if !i.index.valid() {
412 0 : // Ensure the data block iterator is invalidated even if loading of the block
413 0 : // fails.
414 0 : i.data.invalidate()
415 0 : return loadBlockFailed
416 0 : }
417 : // Load the next block.
418 1 : v := i.index.value()
419 1 : bhp, err := decodeBlockHandleWithProperties(v.InPlaceValue())
420 1 : if i.dataBH == bhp.BlockHandle && i.data.valid() {
421 1 : // We're already at the data block we want to load. Reset bounds in case
422 1 : // they changed since the last seek, but don't reload the block from cache
423 1 : // or disk.
424 1 : //
425 1 : // It's safe to leave i.data in its original state here, as all callers to
426 1 : // loadBlock make an absolute positioning call (i.e. a seek, first, or last)
427 1 : // to `i.data` right after loadBlock returns loadBlockOK.
428 1 : i.initBounds()
429 1 : return loadBlockOK
430 1 : }
431 : // Ensure the data block iterator is invalidated even if loading of the block
432 : // fails.
433 1 : i.data.invalidate()
434 1 : i.dataBH = bhp.BlockHandle
435 1 : if err != nil {
436 0 : i.err = errCorruptIndexEntry(err)
437 0 : return loadBlockFailed
438 0 : }
439 1 : if i.bpfs != nil {
440 1 : intersects, err := i.bpfs.intersects(bhp.Props)
441 1 : if err != nil {
442 0 : i.err = errCorruptIndexEntry(err)
443 0 : return loadBlockFailed
444 0 : }
445 1 : if intersects == blockMaybeExcluded {
446 1 : intersects = i.resolveMaybeExcluded(dir)
447 1 : }
448 1 : if intersects == blockExcluded {
449 1 : return loadBlockIrrelevant
450 1 : }
451 : // blockIntersects
452 : }
453 1 : ctx := objiotracing.WithBlockType(i.ctx, objiotracing.DataBlock)
454 1 : block, err := i.reader.readBlock(
455 1 : ctx, i.dataBH, nil /* transform */, i.dataRH, i.stats, &i.iterStats, i.bufferPool)
456 1 : if err != nil {
457 0 : i.err = err
458 0 : return loadBlockFailed
459 0 : }
460 1 : i.err = i.data.initHandle(i.cmp, i.reader.Split, block, i.transforms)
461 1 : if i.err != nil {
462 0 : // The block is partially loaded, and we don't want it to appear valid.
463 0 : i.data.invalidate()
464 0 : return loadBlockFailed
465 0 : }
466 1 : i.initBounds()
467 1 : return loadBlockOK
468 : }
469 :
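// Illustrative sketch (hypothetical caller, not part of the original source):
// every caller of loadBlock dispatches on the three-valued result, and on
// loadBlockOK must absolutely position i.data (a seek, First, or Last) before
// using it:
//
//	switch i.loadBlock(+1) {
//	case loadBlockFailed:
//		return nil // i.err holds any error encountered
//	case loadBlockIrrelevant:
//		// Block excluded by block-property filters; enforce the iteration
//		// bound against the index separator, then skip to the next block.
//	case loadBlockOK:
//		// i.data is loaded but unpositioned; follow with i.data.First(),
//		// i.data.Last(), or a seek.
//	}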
470 : // readBlockForVBR implements the blockProviderWhenOpen interface for use by
471 : // the valueBlockReader.
472 : func (i *singleLevelIterator) readBlockForVBR(
473 : h BlockHandle, stats *base.InternalIteratorStats,
474 1 : ) (bufferHandle, error) {
475 1 : ctx := objiotracing.WithBlockType(i.ctx, objiotracing.ValueBlock)
476 1 : return i.reader.readBlock(ctx, h, nil, i.vbRH, stats, &i.iterStats, i.bufferPool)
477 1 : }
478 :
479 : // resolveMaybeExcluded is invoked when the block-property filterer has found
480 : // that a block is excluded according to its properties but only if its bounds
481 : // fall within the filter's current bounds. This function consults the
482 : // appropriate bound, depending on the iteration direction, and returns either
483 : // `blockIntersects` or `blockExcluded`.
484 1 : func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult {
485 1 : // TODO(jackson): We could first try comparing to top-level index block's
486 1 : // key, and if within bounds avoid per-data block key comparisons.
487 1 :
488 1 : // This iterator is configured with a bound-limited block property
489 1 : // filter. The bpf determined this block could be excluded from
490 1 : // iteration based on the property encoded in the block handle.
491 1 : // However, we still need to determine if the block is wholly
492 1 : // contained within the filter's key bounds.
493 1 : //
494 1 : // External guarantees ensure all the block's keys are ≥ the
495 1 : // filter's lower bound during forward iteration, and that all the
496 1 : // block's keys are < the filter's upper bound during backward
497 1 : // iteration. We only need to determine if the opposite bound is
498 1 : // also met.
499 1 : //
500 1 : // The index separator in index.Key() provides an inclusive
501 1 : // upper-bound for the data block's keys, guaranteeing that all its
502 1 : // keys are ≤ index.Key(). For forward iteration, this is all we
503 1 : // need.
504 1 : if dir > 0 {
505 1 : // Forward iteration.
506 1 : if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.index.Key().UserKey) {
507 1 : return blockExcluded
508 1 : }
509 1 : return blockIntersects
510 : }
511 :
512 : // Reverse iteration.
513 : //
514 : // Because we're iterating in the reverse direction, we don't yet have
515 : // enough context available to determine if the block is wholly contained
516 : // within its bounds. This case arises only during backward iteration,
517 : // because of the way the index is structured.
518 : //
519 : // Consider a bound-limited bpf limited to the bounds [b,d), loading the
520 : // block with separator `c`. During reverse iteration, the guarantee that
521 : // all the block's keys are < `d` is externally provided, but no guarantee
522 : // is made on the bpf's lower bound. The separator `c` only provides an
523 : // inclusive upper bound on the block's keys, indicating that the
524 : // corresponding block handle points to a block containing only keys ≤ `c`.
525 : //
526 : // To establish a lower bound, we step the index backwards to read the
527 : // previous block's separator, which provides an inclusive lower bound on
528 : // the original block's keys. Afterwards, we step forward to restore our
529 : // index position.
530 1 : if peekKV := i.index.Prev(); peekKV == nil {
531 1 : // The original block points to the first block of this index block. If
532 1 : // there's a two-level index, it could potentially provide a lower
533 1 : // bound, but the code refactoring necessary to read it doesn't seem
534 1 : // worth the payoff. We fall through to loading the block.
535 1 : } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKV.K.UserKey) {
536 1 : // The lower-bound on the original block falls within the filter's
537 1 : // bounds, and we can skip the block (after restoring our current index
538 1 : // position).
539 1 : _ = i.index.Next()
540 1 : return blockExcluded
541 1 : }
542 1 : _ = i.index.Next()
543 1 : return blockIntersects
544 : }
545 :
546 : // The number of times to call Next/Prev in a block before giving up and seeking.
547 : // The value of 4 is arbitrary.
548 : // TODO(sumeer): experiment with dynamic adjustment based on the history of
549 : // seeks for a particular iterator.
550 : const numStepsBeforeSeek = 4
551 :
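// Illustrative sketch (not part of the original source) of the
// step-instead-of-seek pattern the two helpers below implement: take at most
// numStepsBeforeSeek steps toward the target key, and fall back to a real
// seek if it is not reached (per-block bound checks omitted here):
//
//	for j := 0; j < numStepsBeforeSeek; j++ {
//		if i.cmp(kv.K.UserKey, key) >= 0 {
//			return kv, true // reached the target by stepping; done
//		}
//		if kv = i.data.Next(); kv == nil {
//			break // block exhausted; caller advances to the next block
//		}
//	}
//	return kv, false // stepping didn't get there; caller should seek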
552 : func (i *singleLevelIterator) trySeekGEUsingNextWithinBlock(
553 : key []byte,
554 1 : ) (kv *base.InternalKV, done bool) {
555 1 : kv = i.data.KV()
556 1 : for j := 0; j < numStepsBeforeSeek; j++ {
557 1 : curKeyCmp := i.cmp(kv.K.UserKey, key)
558 1 : if curKeyCmp >= 0 {
559 1 : if i.blockUpper != nil {
560 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
561 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
562 1 : i.exhaustedBounds = +1
563 1 : return nil, true
564 1 : }
565 : }
566 1 : return kv, true
567 : }
568 1 : kv = i.data.Next()
569 1 : if kv == nil {
570 1 : break
571 : }
572 : }
573 1 : return kv, false
574 : }
575 :
576 : func (i *singleLevelIterator) trySeekLTUsingPrevWithinBlock(
577 : key []byte,
578 1 : ) (kv *base.InternalKV, done bool) {
579 1 : kv = i.data.KV()
580 1 : for j := 0; j < numStepsBeforeSeek; j++ {
581 1 : curKeyCmp := i.cmp(kv.K.UserKey, key)
582 1 : if curKeyCmp < 0 {
583 1 : if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 {
584 1 : i.exhaustedBounds = -1
585 1 : return nil, true
586 1 : }
587 1 : return kv, true
588 : }
589 1 : kv = i.data.Prev()
590 1 : if kv == nil {
591 1 : break
592 : }
593 : }
594 1 : return kv, false
595 : }
596 :
597 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
598 : // package. Note that SeekGE only checks the upper bound. It is up to the
599 : // caller to ensure that key is greater than or equal to the lower bound.
600 1 : func (i *singleLevelIterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
601 1 : if i.vState != nil {
602 1 : // Callers of SeekGE don't know about virtual sstable bounds, so we may
603 1 : // have to internally restrict the bounds.
604 1 : //
605 1 : // TODO(bananabrick): We can optimize this check away for the level iter
606 1 : // if necessary.
607 1 : if i.cmp(key, i.lower) < 0 {
608 1 : key = i.lower
609 1 : }
610 : }
611 :
612 1 : if flags.TrySeekUsingNext() {
613 1 : // The i.exhaustedBounds comparison indicates that the upper bound was
614 1 : // reached. The i.data.isDataInvalidated() indicates that the sstable was
615 1 : // exhausted.
616 1 : if (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) && i.err == nil {
617 1 : // Already exhausted, so return nil.
618 1 : return nil
619 1 : }
620 1 : if i.err != nil {
621 0 : // The current iterator position cannot be used.
622 0 : flags = flags.DisableTrySeekUsingNext()
623 0 : }
624 : // INVARIANT: flags.TrySeekUsingNext() => i.err == nil &&
625 : // i.exhaustedBounds != +1 && !i.data.isDataInvalidated(). That is,
626 : // data-exhausted and bounds-exhausted, as defined earlier, are both
627 : // false. This makes it safe to clear out i.exhaustedBounds and i.err
628 : // before calling into seekGEHelper.
629 : }
630 :
631 1 : i.exhaustedBounds = 0
632 1 : i.err = nil // clear cached iteration error
633 1 : boundsCmp := i.boundsCmp
634 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
635 1 : i.boundsCmp = 0
636 1 : i.positionedUsingLatestBounds = true
637 1 : return i.seekGEHelper(key, boundsCmp, flags)
638 : }
639 :
640 : // seekGEHelper contains the common functionality for SeekGE and SeekPrefixGE.
641 : func (i *singleLevelIterator) seekGEHelper(
642 : key []byte, boundsCmp int, flags base.SeekGEFlags,
643 1 : ) *base.InternalKV {
644 1 : // Invariant: trySeekUsingNext => !i.data.isDataInvalidated() && i.exhaustedBounds != +1
645 1 :
646 1 : // SeekGE performs various step-instead-of-seeking optimizations: e.g.,
647 1 : // enabled by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp).
648 1 :
649 1 : var dontSeekWithinBlock bool
650 1 : if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() &&
651 1 : boundsCmp > 0 && i.cmp(key, i.index.Key().UserKey) <= 0 {
652 1 : // Fast-path: The bounds have moved forward and this SeekGE is
653 1 : // respecting the lower bound (guaranteed by Iterator). We know that
654 1 : // the iterator must already be positioned within or just outside the
655 1 : // previous bounds. Therefore it cannot be positioned at a block (or
656 1 : // the position within that block) that is ahead of the seek position.
657 1 : // However it can be positioned at an earlier block. This fast-path to
658 1 : // use Next() on the block is only applied when we are already at the
659 1 : // block that the slow-path (the else-clause) would load -- this is
660 1 : // the motivation for the i.cmp(key, i.index.Key().UserKey) <= 0
661 1 : // predicate.
662 1 : i.initBoundsForAlreadyLoadedBlock()
663 1 : kv, done := i.trySeekGEUsingNextWithinBlock(key)
664 1 : if done {
665 1 : return kv
666 1 : }
667 1 : if kv == nil {
668 1 : // Done with this block.
669 1 : dontSeekWithinBlock = true
670 1 : }
671 1 : } else {
672 1 : // Cannot use bounds monotonicity. But may be able to optimize if
673 1 : // caller claimed externally known invariant represented by
674 1 : // flags.TrySeekUsingNext().
675 1 : if flags.TrySeekUsingNext() {
676 1 : // seekPrefixGE or SeekGE has already ensured
677 1 : // !i.data.isDataInvalidated() && i.exhaustedBounds != +1
678 1 : curr := i.data.KV()
679 1 : less := i.cmp(curr.K.UserKey, key) < 0
680 1 : // We could be more sophisticated and confirm that the seek
681 1 : // position is within the current block before applying this
682 1 : // optimization. But there may be some benefit even if it is in
683 1 : // the next block, since we can avoid seeking i.index.
684 1 : for j := 0; less && j < numStepsBeforeSeek; j++ {
685 1 : curr = i.Next()
686 1 : if curr == nil {
687 1 : return nil
688 1 : }
689 1 : less = i.cmp(curr.K.UserKey, key) < 0
690 : }
691 1 : if !less {
692 1 : if i.blockUpper != nil {
693 1 : cmp := i.cmp(curr.K.UserKey, i.blockUpper)
694 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
695 0 : i.exhaustedBounds = +1
696 0 : return nil
697 0 : }
698 : }
699 1 : return curr
700 : }
701 : }
702 :
703 : // Slow-path.
704 :
705 1 : var ikv *base.InternalKV
706 1 : if ikv = i.index.SeekGE(key, flags.DisableTrySeekUsingNext()); ikv == nil {
707 1 : // The target key is greater than any key in the index block.
708 1 : // Invalidate the block iterator so that a subsequent call to Prev()
709 1 : // will return the last key in the table.
710 1 : i.data.invalidate()
711 1 : return nil
712 1 : }
713 1 : result := i.loadBlock(+1)
714 1 : if result == loadBlockFailed {
715 0 : return nil
716 0 : }
717 1 : if result == loadBlockIrrelevant {
718 1 : // Enforce the upper bound here since we don't want to bother moving
719 1 : // to the next block if the upper bound is already exceeded. Note that
720 1 : // the next block starts with keys >= ikv.K.UserKey since even
721 1 : // though this is the block separator, the same user key can span
722 1 : // multiple blocks. If upper is exclusive we use >= below, else
723 1 : // we use >.
724 1 : if i.upper != nil {
725 1 : cmp := i.cmp(ikv.K.UserKey, i.upper)
726 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
727 1 : i.exhaustedBounds = +1
728 1 : return nil
729 1 : }
730 : }
731 : // Want to skip to the next block.
732 1 : dontSeekWithinBlock = true
733 : }
734 : }
735 1 : if !dontSeekWithinBlock {
736 1 : if ikv := i.data.SeekGE(key, flags.DisableTrySeekUsingNext()); ikv != nil {
737 1 : if i.blockUpper != nil {
738 1 : cmp := i.cmp(ikv.K.UserKey, i.blockUpper)
739 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
740 1 : i.exhaustedBounds = +1
741 1 : return nil
742 1 : }
743 : }
744 1 : return ikv
745 : }
746 : }
747 1 : return i.skipForward()
748 : }
749 :
750 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
751 : // pebble package. Note that SeekPrefixGE only checks the upper bound. It is up
752 : // to the caller to ensure that key is greater than or equal to the lower bound.
753 : func (i *singleLevelIterator) SeekPrefixGE(
754 : prefix, key []byte, flags base.SeekGEFlags,
755 1 : ) *base.InternalKV {
756 1 : if i.vState != nil {
757 1 : // Callers of SeekPrefixGE aren't aware of virtual sstable bounds, so
758 1 : // we may have to internally restrict the bounds.
759 1 : //
760 1 : // TODO(bananabrick): We can optimize away this check for the level iter
761 1 : // if necessary.
762 1 : if i.cmp(key, i.lower) < 0 {
763 1 : key = i.lower
764 1 : }
765 : }
766 1 : return i.seekPrefixGE(prefix, key, flags, i.useFilter)
767 : }
768 :
769 : func (i *singleLevelIterator) seekPrefixGE(
770 : prefix, key []byte, flags base.SeekGEFlags, checkFilter bool,
771 1 : ) (kv *base.InternalKV) {
772 1 : // NOTE: prefix is only used for bloom filter checking, not for later work in
773 1 : // this method. Hence, we can use the existing iterator position if the last
774 1 : // SeekPrefixGE did not fail bloom filter matching.
775 1 :
776 1 : err := i.err
777 1 : i.err = nil // clear cached iteration error
778 1 : if checkFilter && i.reader.tableFilter != nil {
779 1 : if !i.lastBloomFilterMatched {
780 1 : // Iterator is not positioned based on last seek.
781 1 : flags = flags.DisableTrySeekUsingNext()
782 1 : }
783 1 : i.lastBloomFilterMatched = false
784 1 : // Check prefix bloom filter.
785 1 : var dataH bufferHandle
786 1 : dataH, i.err = i.reader.readFilter(i.ctx, i.stats, &i.iterStats)
787 1 : if i.err != nil {
788 0 : i.data.invalidate()
789 0 : return nil
790 0 : }
791 1 : mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix)
792 1 : dataH.Release()
793 1 : if !mayContain {
794 1 : // This invalidation may not be necessary for correctness, and may
795 1 : // be a place to optimize later by reusing the already loaded
796 1 : // block. It was necessary in earlier versions of the code since
797 1 : // the caller was allowed to call Next when SeekPrefixGE returned
798 1 : // nil. This is no longer allowed.
799 1 : i.data.invalidate()
800 1 : return nil
801 1 : }
802 1 : i.lastBloomFilterMatched = true
803 : }
804 1 : if flags.TrySeekUsingNext() {
805 1 : // The i.exhaustedBounds comparison indicates that the upper bound was
806 1 : // reached. The i.data.isDataInvalidated() indicates that the sstable was
807 1 : // exhausted.
808 1 : if (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) && err == nil {
809 1 : // Already exhausted, so return nil.
810 1 : return nil
811 1 : }
812 1 : if err != nil {
813 0 : // The current iterator position cannot be used.
814 0 : flags = flags.DisableTrySeekUsingNext()
815 0 : }
816 : // INVARIANT: flags.TrySeekUsingNext() => err == nil &&
817 : // i.exhaustedBounds != +1 && !i.data.isDataInvalidated(). That is,
818 : // data-exhausted and bounds-exhausted, as defined earlier, are both
819 : // false. This makes it safe to clear out i.exhaustedBounds and i.err
820 : // before calling into seekGEHelper.
821 : }
822 : // Bloom filter matches, or skipped, so this method will position the
823 : // iterator.
824 1 : i.exhaustedBounds = 0
825 1 : boundsCmp := i.boundsCmp
826 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
827 1 : i.boundsCmp = 0
828 1 : i.positionedUsingLatestBounds = true
829 1 : return i.maybeVerifyKey(i.seekGEHelper(key, boundsCmp, flags))
830 : }
831 :
832 : // virtualLast should only be called if i.vState != nil.
833 1 : func (i *singleLevelIterator) virtualLast() *base.InternalKV {
834 1 : if i.vState == nil {
835 0 : panic("pebble: invalid call to virtualLast")
836 : }
837 :
838 1 : if !i.endKeyInclusive {
839 1 : // Trivial case.
840 1 : return i.SeekLT(i.upper, base.SeekLTFlagsNone)
841 1 : }
842 1 : return i.virtualLastSeekLE()
843 : }
844 :
845 : // virtualLastSeekLE is called by virtualLast to do a SeekLE as part of a
846 : // virtualLast. Consider generalizing this into a SeekLE() if there are other
847 : // uses of this method in the future. Does a SeekLE on the upper bound of the
848 : // file/iterator.
849 1 : func (i *singleLevelIterator) virtualLastSeekLE() *base.InternalKV {
850 1 : // Callers of SeekLE don't know about virtual sstable bounds, so we may
851 1 : // have to internally restrict the bounds.
852 1 : //
853 1 : // TODO(bananabrick): We can optimize this check away for the level iter
854 1 : // if necessary.
855 1 : if !i.endKeyInclusive {
856 0 : panic("unexpected virtualLastSeekLE with exclusive upper bounds")
857 : }
858 1 : key := i.upper
859 1 :
860 1 : i.exhaustedBounds = 0
861 1 : i.err = nil // clear cached iteration error
862 1 : // Seek optimization only applies until iterator is first positioned with a
863 1 : // SeekGE or SeekLT after SetBounds.
864 1 : i.boundsCmp = 0
865 1 : i.positionedUsingLatestBounds = true
866 1 :
867 1 : ikv := i.index.SeekGE(key, base.SeekGEFlagsNone)
868 1 : // We can have multiple internal keys with the same user key as the seek
869 1 : // key. In that case, we want the last (greatest) internal key.
870 1 : //
871 1 : // INVARIANT: One of two cases:
872 1 : // A. ikv == nil. There is no data block with index key >= key. So all keys
873 1 : // in the last data block are < key.
874 1 : // B. ikv.K.UserKey >= key. This data block may have some keys > key.
875 1 : //
876 1 : // Subcases of B:
877 1 : // B1. ikv.K.UserKey == key. This is when loop iteration happens.
878 1 : // Since ikv.K.UserKey >= the largest data key in the block, the largest
879 1 : // data key in this block is <= key.
880 1 : // B2. ikv.K.UserKey > key. Loop iteration will not happen.
881 1 : //
882 1 : // NB: We can avoid this Next()ing if we just implement a blockIter.SeekLE().
883 1 : // This might be challenging to do correctly, so we use regular operations
884 1 : // for now.
885 1 : for ikv != nil && bytes.Equal(ikv.K.UserKey, key) {
886 1 : ikv = i.index.Next()
887 1 : }
888 1 : if ikv == nil {
889 1 : // Cases A or B1 where B1 exhausted all blocks. In both cases the last block
890 1 : // has all keys <= key. skipBackward enforces the lower bound.
891 1 : return i.skipBackward()
892 1 : }
893 : // Case B. We are here because we were originally in case B2, or we were in B1
894 : // and we arrived at a block where ikv.K.UserKey > key. Either way, ikv.K.UserKey
895 : // > key. So there could be keys in the block > key. But the block preceding
896 : // this block cannot have any keys > key, otherwise it would have been the
897 : // result of the original index.SeekGE.
898 1 : result := i.loadBlock(-1)
899 1 : if result == loadBlockFailed {
900 0 : return nil
901 0 : }
902 1 : if result == loadBlockIrrelevant {
903 1 : // Want to skip to the previous block.
904 1 : return i.skipBackward()
905 1 : }
906 1 : ikv = i.data.SeekGE(key, base.SeekGEFlagsNone)
907 1 : // Go to the last user key that matches key, and then Prev() on the data
908 1 : // block.
909 1 : for ikv != nil && bytes.Equal(ikv.K.UserKey, key) {
910 1 : ikv = i.data.Next()
911 1 : }
912 1 : ikv = i.data.Prev()
913 1 : if ikv != nil {
914 1 : // Enforce the lower bound here, as we could have gone past it. This happens
915 1 : // if keys between `i.blockLower` and `key` are obsolete, for instance. Even
916 1 : // though i.blockLower (which is either nil or equal to i.lower) is <= key,
917 1 : // all internal keys in the user key interval [i.blockLower, key] could be
918 1 : // obsolete (due to a RANGEDEL which will not be observed here). And
919 1 : // i.data.Prev will skip all these obsolete keys, and could land on a key
920 1 : // below the lower bound, requiring the lower bound check.
921 1 : if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 {
922 0 : i.exhaustedBounds = -1
923 0 : return nil
924 0 : }
925 1 : return ikv
926 : }
927 1 : return i.skipBackward()
928 : }
929 :
930 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
931 : // package. Note that SeekLT only checks the lower bound. It is up to the
932 : // caller to ensure that key is less than or equal to the upper bound.
933 1 : func (i *singleLevelIterator) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
934 1 : if i.vState != nil {
935 1 : // Might have to fix upper bound since virtual sstable bounds are not
936 1 : // known to callers of SeekLT.
937 1 : //
938 1 : // TODO(bananabrick): We can optimize away this check for the level iter
939 1 : // if necessary.
940 1 : cmp := i.cmp(key, i.upper)
941 1 : // key == i.upper is fine. We'll do the right thing and return the
942 1 : // first internal key with user key < key.
943 1 : if cmp > 0 {
944 1 : // Return the last key in the virtual sstable.
945 1 : return i.maybeVerifyKey(i.virtualLast())
946 1 : }
947 : }
948 :
949 1 : i.exhaustedBounds = 0
950 1 : i.err = nil // clear cached iteration error
951 1 : boundsCmp := i.boundsCmp
952 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
953 1 : i.boundsCmp = 0
954 1 :
955 1 : // Seeking operations perform various step-instead-of-seeking optimizations:
956 1 : // eg by considering monotonically increasing bounds (i.boundsCmp).
957 1 :
958 1 : i.positionedUsingLatestBounds = true
959 1 :
960 1 : var dontSeekWithinBlock bool
961 1 : if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() &&
962 1 : boundsCmp < 0 && i.cmp(i.data.getFirstUserKey(), key) < 0 {
963 1 : // Fast-path: The bounds have moved backward, and this SeekLT is
964 1 : // respecting the upper bound (guaranteed by Iterator). We know that
965 1 : // the iterator must already be positioned within or just outside the
966 1 : // previous bounds. Therefore it cannot be positioned at a block (or
967 1 : // the position within that block) that is behind the seek position.
968 1 : // However it can be positioned at a later block. This fast-path to
969 1 : // use Prev() on the block is only applied when we are already at the
970 1 : // block that can satisfy this seek -- this is the motivation for the
971 1 : // i.cmp(i.data.firstKey.UserKey, key) < 0 predicate.
972 1 : i.initBoundsForAlreadyLoadedBlock()
973 1 : ikv, done := i.trySeekLTUsingPrevWithinBlock(key)
974 1 : if done {
975 1 : return ikv
976 1 : }
977 1 : if ikv == nil {
978 1 : // Done with this block.
979 1 : dontSeekWithinBlock = true
980 1 : }
981 1 : } else {
982 1 : // Slow-path.
983 1 : var ikv *base.InternalKV
984 1 :
985 1 : // NB: If a bound-limited block property filter is configured, it's
986 1 : // externally ensured that the filter is disabled (through returning
987 1 : // Intersects=false irrespective of the block props provided) during
988 1 : // seeks.
989 1 : if ikv = i.index.SeekGE(key, base.SeekGEFlagsNone); ikv == nil {
990 1 : ikv = i.index.Last()
991 1 : if ikv == nil {
992 0 : return nil
993 0 : }
994 : }
995 : // INVARIANT: ikv != nil.
996 1 : result := i.loadBlock(-1)
997 1 : if result == loadBlockFailed {
998 0 : return nil
999 0 : }
1000 1 : if result == loadBlockIrrelevant {
1001 1 : // Enforce the lower bound here since we don't want to bother moving
1002 1 : // to the previous block if the lower bound is already exceeded. Note
1003 1 : // that the previous block starts with keys <= ikv.K.UserKey since
1004 1 : // even though this is the current block's separator, the same
1005 1 : // user key can span multiple blocks.
1006 1 : if i.lower != nil && i.cmp(ikv.K.UserKey, i.lower) < 0 {
1007 1 : i.exhaustedBounds = -1
1008 1 : return nil
1009 1 : }
1010 : // Want to skip to the previous block.
1011 1 : dontSeekWithinBlock = true
1012 : }
1013 : }
1014 1 : if !dontSeekWithinBlock {
1015 1 : if ikv := i.data.SeekLT(key, flags); ikv != nil {
1016 1 : if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 {
1017 1 : i.exhaustedBounds = -1
1018 1 : return nil
1019 1 : }
1020 1 : return ikv
1021 : }
1022 : }
1023 : // The index contains separator keys which may lie between
1024 : // user-keys. Consider the user-keys:
1025 : //
1026 : // complete
1027 : // ---- new block ---
1028 : // complexion
1029 : //
1030 : // If these two keys end one block and start the next, the index key may
1031 : // be chosen as "compleu". The SeekGE in the index block will then point
1032 : // us to the block containing "complexion". If this happens, we want the
1033 : // last key from the previous data block.
1034 1 : return i.maybeVerifyKey(i.skipBackward())
1035 : }
1036 :
1037 : // First implements internalIterator.First, as documented in the pebble
1038 : // package. Note that First only checks the upper bound. It is up to the caller
1039 : // to ensure that key is greater than or equal to the lower bound (e.g. via a
1040 : // call to SeekGE(lower)).
1041 1 : func (i *singleLevelIterator) First() *base.InternalKV {
1042 1 : // If we have a lower bound, use SeekGE. Note that in general this is not
1043 1 : // supported usage, except when the lower bound is there because the table is
1044 1 : // virtual.
1045 1 : if i.lower != nil {
1046 1 : return i.SeekGE(i.lower, base.SeekGEFlagsNone)
1047 1 : }
1048 :
1049 1 : i.positionedUsingLatestBounds = true
1050 1 :
1051 1 : return i.firstInternal()
1052 : }
1053 :
1054 : // firstInternal is a helper used for absolute positioning in a single-level
1055 : // index file, or for positioning in the second-level index in a two-level
1056 : // index file. For the latter, one cannot make any claims about absolute
1057 : // positioning.
1058 1 : func (i *singleLevelIterator) firstInternal() *base.InternalKV {
1059 1 : i.exhaustedBounds = 0
1060 1 : i.err = nil // clear cached iteration error
1061 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1062 1 : i.boundsCmp = 0
1063 1 :
1064 1 : var kv *base.InternalKV
1065 1 : if kv = i.index.First(); kv == nil {
1066 0 : i.data.invalidate()
1067 0 : return nil
1068 0 : }
1069 1 : result := i.loadBlock(+1)
1070 1 : if result == loadBlockFailed {
1071 0 : return nil
1072 0 : }
1073 1 : if result == loadBlockOK {
1074 1 : if kv := i.data.First(); kv != nil {
1075 1 : if i.blockUpper != nil {
1076 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1077 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1078 1 : i.exhaustedBounds = +1
1079 1 : return nil
1080 1 : }
1081 : }
1082 1 : return kv
1083 : }
1084 : // Else fall through to skipForward.
1085 1 : } else {
1086 1 : // result == loadBlockIrrelevant. Enforce the upper bound here since we
1087 1 : // don't want to bother moving to the next block if the upper bound is
1088 1 : // already exceeded. Note that the next block starts with keys >=
1089 1 : // kv.K.UserKey since even though this is the block separator, the
1090 1 : // same user key can span multiple blocks. If upper is exclusive we
1091 1 : // use >= below, else we use >.
1092 1 : if i.upper != nil {
1093 1 : cmp := i.cmp(kv.K.UserKey, i.upper)
1094 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1095 1 : i.exhaustedBounds = +1
1096 1 : return nil
1097 1 : }
1098 : }
1099 : // Else fall through to skipForward.
1100 : }
1101 :
1102 1 : return i.skipForward()
1103 : }
1104 :
1105 : // Last implements internalIterator.Last, as documented in the pebble
1106 : // package. Note that Last only checks the lower bound. It is up to the caller
1107 : // to ensure that key is less than the upper bound (e.g. via a call to
1108 : // SeekLT(upper)).
1109 1 : func (i *singleLevelIterator) Last() *base.InternalKV {
1110 1 : if i.vState != nil {
1111 1 : return i.maybeVerifyKey(i.virtualLast())
1112 1 : }
1113 :
1114 1 : if i.upper != nil {
1115 0 : panic("singleLevelIterator.Last() used despite upper bound")
1116 : }
1117 1 : i.positionedUsingLatestBounds = true
1118 1 : return i.lastInternal()
1119 : }
1120 :
1121 : // lastInternal is a helper used for absolute positioning in a single-level
1122 : // index file, or for positioning in the second-level index in a two-level
1123 : // index file. For the latter, one cannot make any claims about absolute
1124 : // positioning.
1125 1 : func (i *singleLevelIterator) lastInternal() *base.InternalKV {
1126 1 : i.exhaustedBounds = 0
1127 1 : i.err = nil // clear cached iteration error
1128 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1129 1 : i.boundsCmp = 0
1130 1 :
1131 1 : var ikv *base.InternalKV
1132 1 : if ikv = i.index.Last(); ikv == nil {
1133 0 : i.data.invalidate()
1134 0 : return nil
1135 0 : }
1136 1 : result := i.loadBlock(-1)
1137 1 : if result == loadBlockFailed {
1138 0 : return nil
1139 0 : }
1140 1 : if result == loadBlockOK {
1141 1 : if ikv := i.data.Last(); ikv != nil {
1142 1 : if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 {
1143 1 : i.exhaustedBounds = -1
1144 1 : return nil
1145 1 : }
1146 1 : return ikv
1147 : }
1148 : // Else fall through to skipBackward.
1149 1 : } else {
1150 1 : // result == loadBlockIrrelevant. Enforce the lower bound here since we
1151 1 : // don't want to bother moving to the previous block if the lower bound is
1152 1 : // already exceeded. Note that the previous block starts with keys <=
1153 1 : // ikv.K.UserKey since even though this is the current block's
1154 1 : // separator, the same user key can span multiple blocks.
1155 1 : if i.lower != nil && i.cmp(ikv.K.UserKey, i.lower) < 0 {
1156 1 : i.exhaustedBounds = -1
1157 1 : return nil
1158 1 : }
1159 : }
1160 :
1161 1 : return i.skipBackward()
1162 : }
1163 :
1164 : // Next implements internalIterator.Next, as documented in the pebble
1165 : // package.
1166 : // Note: compactionIterator.Next mirrors the implementation of Iterator.Next
1167 : // due to performance. Keep the two in sync.
1168 1 : func (i *singleLevelIterator) Next() *base.InternalKV {
1169 1 : if i.exhaustedBounds == +1 {
1170 0 : panic("Next called even though exhausted upper bound")
1171 : }
1172 1 : i.exhaustedBounds = 0
1173 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1174 1 : i.boundsCmp = 0
1175 1 :
1176 1 : if i.err != nil {
1177 0 : // TODO(jackson): Can this case be turned into a panic? Once an error is
1178 0 : // encountered, the iterator must be re-seeked.
1179 0 : return nil
1180 0 : }
1181 1 : if kv := i.data.Next(); kv != nil {
1182 1 : if i.blockUpper != nil {
1183 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1184 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1185 1 : i.exhaustedBounds = +1
1186 1 : return nil
1187 1 : }
1188 : }
1189 1 : return kv
1190 : }
1191 1 : return i.skipForward()
1192 : }
1193 :
1194 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1195 1 : func (i *singleLevelIterator) NextPrefix(succKey []byte) *base.InternalKV {
1196 1 : if i.exhaustedBounds == +1 {
1197 0 : panic("NextPrefix called even though exhausted upper bound")
1198 : }
1199 1 : i.exhaustedBounds = 0
1200 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1201 1 : i.boundsCmp = 0
1202 1 : if i.err != nil {
1203 0 : // TODO(jackson): Can this case be turned into a panic? Once an error is
1204 0 : // encountered, the iterator must be re-seeked.
1205 0 : return nil
1206 0 : }
1207 1 : if kv := i.data.NextPrefix(succKey); kv != nil {
1208 1 : if i.blockUpper != nil {
1209 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1210 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1211 1 : i.exhaustedBounds = +1
1212 1 : return nil
1213 1 : }
1214 : }
1215 1 : return kv
1216 : }
1217 : // Did not find prefix in the existing data block. This is the slow-path
1218 : // where we effectively seek the iterator.
1219 1 : var ikv *base.InternalKV
1220 1 : // The key is likely to be in the next data block, so try one step.
1221 1 : if ikv = i.index.Next(); ikv == nil {
1222 1 : // The target key is greater than any key in the index block.
1223 1 : // Invalidate the block iterator so that a subsequent call to Prev()
1224 1 : // will return the last key in the table.
1225 1 : i.data.invalidate()
1226 1 : return nil
1227 1 : }
1228 1 : if i.cmp(succKey, ikv.K.UserKey) > 0 {
1229 1 : // Not in the next data block, so seek the index.
1230 1 : if ikv = i.index.SeekGE(succKey, base.SeekGEFlagsNone); ikv == nil {
1231 1 : // The target key is greater than any key in the index block.
1232 1 : // Invalidate the block iterator so that a subsequent call to Prev()
1233 1 : // will return the last key in the table.
1234 1 : i.data.invalidate()
1235 1 : return nil
1236 1 : }
1237 : }
1238 1 : result := i.loadBlock(+1)
1239 1 : if result == loadBlockFailed {
1240 0 : return nil
1241 0 : }
1242 1 : if result == loadBlockIrrelevant {
1243 1 : // Enforce the upper bound here since we don't want to bother moving
1244 1 : // to the next block if the upper bound is already exceeded. Note that
1245 1 : // the next block starts with keys >= ikv.K.UserKey since even
1246 1 : // though this is the block separator, the same user key can span
1247 1 : // multiple blocks. If upper is exclusive we use >= below, else we use
1248 1 : // >.
1249 1 : if i.upper != nil {
1250 1 : cmp := i.cmp(ikv.K.UserKey, i.upper)
1251 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1252 1 : i.exhaustedBounds = +1
1253 1 : return nil
1254 1 : }
1255 : }
1256 1 : } else if kv := i.data.SeekGE(succKey, base.SeekGEFlagsNone); kv != nil {
1257 1 : if i.blockUpper != nil {
1258 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1259 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1260 1 : i.exhaustedBounds = +1
1261 1 : return nil
1262 1 : }
1263 : }
1264 1 : return i.maybeVerifyKey(kv)
1265 : }
1266 :
1267 1 : return i.skipForward()
1268 : }
1269 :
1270 : // Prev implements internalIterator.Prev, as documented in the pebble
1271 : // package.
1272 1 : func (i *singleLevelIterator) Prev() *base.InternalKV {
1273 1 : if i.exhaustedBounds == -1 {
1274 0 : panic("Prev called even though exhausted lower bound")
1275 : }
1276 1 : i.exhaustedBounds = 0
1277 1 : // Seek optimization only applies until iterator is first positioned after SetBounds.
1278 1 : i.boundsCmp = 0
1279 1 :
1280 1 : if i.err != nil {
1281 0 : return nil
1282 0 : }
1283 1 : if kv := i.data.Prev(); kv != nil {
1284 1 : if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 {
1285 1 : i.exhaustedBounds = -1
1286 1 : return nil
1287 1 : }
1288 1 : return kv
1289 : }
1290 1 : return i.skipBackward()
1291 : }
1292 :
1293 1 : func (i *singleLevelIterator) skipForward() *base.InternalKV {
1294 1 : for {
1295 1 : indexKey := i.index.Next()
1296 1 : if indexKey == nil {
1297 1 : i.data.invalidate()
1298 1 : break
1299 : }
1300 1 : result := i.loadBlock(+1)
1301 1 : if result != loadBlockOK {
1302 1 : if i.err != nil {
1303 0 : break
1304 : }
1305 1 : if result == loadBlockFailed {
1306 0 : // We checked that i.index was at a valid entry, so
1307 0 : // loadBlockFailed could not have happened due to i.index
1308 0 : // being exhausted, and must be due to an error.
1309 0 : panic("loadBlock should not have failed with no error")
1310 : }
1311 : // result == loadBlockIrrelevant. Enforce the upper bound here
1312 : // since we don't want to bother moving to the next block if the upper
1313 : // bound is already exceeded. Note that the next block starts with
1314 : // keys >= indexKey.K.UserKey since even though this is the block
1315 : // separator, the same user key can span multiple blocks. If upper
1316 : // is exclusive we use >= below, else we use >.
1317 1 : if i.upper != nil {
1318 1 : cmp := i.cmp(indexKey.K.UserKey, i.upper)
1319 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1320 1 : i.exhaustedBounds = +1
1321 1 : return nil
1322 1 : }
1323 : }
1324 1 : continue
1325 : }
1326 1 : var kv *base.InternalKV
1327 1 : // It is possible that skipBackward went too far and the virtual table lower
1328 1 : // bound is after the first key in the block we are about to load, in which
1329 1 : // case we must use SeekGE.
1330 1 : //
1331 1 : // An example of how this can happen:
1332 1 : //
1333 1 : // Data block 1 - contains keys a@1, c@1
1334 1 : // Data block 2 - contains keys e@1, g@1
1335 1 : // Data block 3 - contains keys i@2, k@2
1336 1 : //
1337 1 : // The virtual table lower bound is f. We have a range key masking filter
1338 1 : // that filters keys with @1 suffix. We are positioned inside block 3 then
1339 1 : // we Prev(). Block 2 is entirely filtered out, which makes us move to
1340 1 : // block 1. Now the range key masking filter gets an update (via
1341 1 : // SpanChanged) and it no longer filters out any keys. At this point if a
1342 1 : // Next happens, we will load block 2 but it would not be legal to return
1343 1 : // "e@1" which is outside the virtual bounds.
1344 1 : //
1345 1 : // The core of the problem is that skipBackward doesn't know it can stop
1346 1 : // at block 2, because it doesn't know what keys are at the start of that
1347 1 : // block. This is why we don't have this problem in the opposite
1348 1 : // direction: skipForward will never go beyond the last relevant block
1349 1 : // because it looks at the separator key which is an upper bound for the
1350 1 : // block.
1351 1 : //
1352 1 : // Note that this is only a problem with virtual tables; we make no
1353 1 : // guarantees wrt an iterator lower bound when we iterate forward. But we
1354 1 : // must never return keys that are not inside the virtual table.
1355 1 : if i.vState != nil && i.blockLower != nil {
1356 1 : kv = i.data.SeekGE(i.lower, base.SeekGEFlagsNone)
1357 1 : } else {
1358 1 : kv = i.data.First()
1359 1 : }
1360 1 : if kv != nil {
1361 1 : if i.blockUpper != nil {
1362 1 : cmp := i.cmp(kv.K.UserKey, i.blockUpper)
1363 1 : if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
1364 1 : i.exhaustedBounds = +1
1365 1 : return nil
1366 1 : }
1367 : }
1368 1 : return i.maybeVerifyKey(kv)
1369 : }
1370 : }
1371 1 : return nil
1372 : }
1373 :
1374 1 : func (i *singleLevelIterator) skipBackward() *base.InternalKV {
1375 1 : for {
1376 1 : indexKey := i.index.Prev()
1377 1 : if indexKey == nil {
1378 1 : i.data.invalidate()
1379 1 : break
1380 : }
1381 1 : result := i.loadBlock(-1)
1382 1 : if result != loadBlockOK {
1383 1 : if i.err != nil {
1384 0 : break
1385 : }
1386 1 : if result == loadBlockFailed {
1387 0 : // We checked that i.index was at a valid entry, so
1388 0 : // loadBlockFailed could not have happened due to i.index
1389 0 : // being exhausted, and must be due to an error.
1390 0 : panic("loadBlock should not have failed with no error")
1391 : }
1392 : // result == loadBlockIrrelevant. Enforce the lower bound here
1393 : // since we don't want to bother moving to the previous block if the lower
1394 : // bound is already exceeded. Note that the previous block starts with
1395 : // keys <= indexKey.K.UserKey since even though this is the current block's
1396 : // separator, the same user key can span multiple blocks.
1397 1 : if i.lower != nil && i.cmp(indexKey.K.UserKey, i.lower) < 0 {
1398 1 : i.exhaustedBounds = -1
1399 1 : return nil
1400 1 : }
1401 1 : continue
1402 : }
1403 1 : kv := i.data.Last()
1404 1 : if kv == nil {
1405 1 : return nil
1406 1 : }
1407 1 : if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 {
1408 1 : i.exhaustedBounds = -1
1409 1 : return nil
1410 1 : }
1411 1 : return i.maybeVerifyKey(kv)
1412 : }
1413 1 : return nil
1414 : }
1415 :
1416 : // Error implements internalIterator.Error, as documented in the pebble
1417 : // package.
1418 1 : func (i *singleLevelIterator) Error() error {
1419 1 : if err := i.data.Error(); err != nil {
1420 0 : return err
1421 0 : }
1422 1 : return i.err
1423 : }
1424 :
1425 : // SetCloseHook sets a function that will be called when the iterator is
1426 : // closed.
1427 1 : func (i *singleLevelIterator) SetCloseHook(fn func(i Iterator) error) {
1428 1 : i.closeHook = fn
1429 1 : }
1430 :
1431 1 : func firstError(err0, err1 error) error {
1432 1 : if err0 != nil {
1433 0 : return err0
1434 0 : }
1435 1 : return err1
1436 : }
1437 :
1438 : // Close implements internalIterator.Close, as documented in the pebble
1439 : // package.
1440 1 : func (i *singleLevelIterator) Close() error {
1441 1 : if invariants.Enabled && i.inPool {
1442 0 : panic("Close called on interator in pool")
1443 : }
1444 1 : i.iterStats.close()
1445 1 : var err error
1446 1 : if i.closeHook != nil {
1447 1 : err = firstError(err, i.closeHook(i))
1448 1 : }
1449 1 : err = firstError(err, i.data.Close())
1450 1 : err = firstError(err, i.index.Close())
1451 1 : if i.dataRH != nil {
1452 1 : err = firstError(err, i.dataRH.Close())
1453 1 : i.dataRH = nil
1454 1 : }
1455 1 : err = firstError(err, i.err)
1456 1 : if i.bpfs != nil {
1457 1 : releaseBlockPropertiesFilterer(i.bpfs)
1458 1 : }
1459 1 : if i.vbReader != nil {
1460 1 : i.vbReader.close()
1461 1 : }
1462 1 : if i.vbRH != nil {
1463 1 : err = firstError(err, i.vbRH.Close())
1464 1 : i.vbRH = nil
1465 1 : }
1466 1 : *i = i.resetForReuse()
1467 1 : singleLevelIterPool.Put(i)
1468 1 : return err
1469 : }
1470 :
1471 0 : func (i *singleLevelIterator) String() string {
1472 0 : if i.vState != nil {
1473 0 : return i.vState.fileNum.String()
1474 0 : }
1475 0 : return i.reader.fileNum.String()
1476 : }