Line data Source code
1 : // Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package base
6 :
7 : import (
8 : "context"
9 : "fmt"
10 : "time"
11 :
12 : "github.com/cockroachdb/pebble/internal/humanize"
13 : "github.com/cockroachdb/redact"
14 : )
15 :
16 : // InternalIterator iterates over a DB's key/value pairs in key order. Unlike
17 : // the Iterator interface, the returned keys are InternalKeys composed of the
18 : // user-key, a sequence number and a key kind. In forward iteration, key/value
19 : // pairs for identical user-keys are returned in descending sequence order. In
20 : // reverse iteration, key/value pairs for identical user-keys are returned in
21 : // ascending sequence order.
22 : //
23 : // InternalIterators provide 5 absolute positioning methods and 2 relative
24 : // positioning methods. The absolute positioning methods are:
25 : //
26 : // - SeekGE
27 : // - SeekPrefixGE
28 : // - SeekLT
29 : // - First
30 : // - Last
31 : //
32 : // The relative positioning methods are:
33 : //
34 : // - Next
35 : // - Prev
36 : //
37 : // The relative positioning methods can be used in conjunction with any of the
38 : // absolute positioning methods with one exception: SeekPrefixGE does not
39 : // support reverse iteration via Prev. It is undefined to call relative
40 : // positioning methods without ever calling an absolute positioning method.
41 : //
42 : // InternalIterators can optionally implement a prefix iteration mode. This
43 : // mode is entered by calling SeekPrefixGE and exited by any other absolute
44 : // positioning method (SeekGE, SeekLT, First, Last). When in prefix iteration
45 : // mode, a call to Next will advance to the next key which has the same
46 : // "prefix" as the one supplied to SeekPrefixGE. Note that "prefix" in this
47 : // context is not a strict byte prefix, but defined by byte equality for the
48 : // result of the Comparer.Split method. An InternalIterator is not required to
49 : // support prefix iteration mode, and can implement SeekPrefixGE by forwarding
50 : // to SeekGE. When the iteration prefix is exhausted, it is not valid to call
51 : // Next on an internal iterator that's already returned (nil,nilv) or a key
52 : // beyond the prefix.
53 : //
54 : // Bounds, [lower, upper), can be set on iterators, either using the SetBounds()
55 : // function in the interface, or in implementation specific ways during iterator
56 : // creation. The forward positioning routines (SeekGE, First, and Next) only
57 : // check the upper bound. The reverse positioning routines (SeekLT, Last, and
58 : // Prev) only check the lower bound. It is up to the caller to ensure that the
59 : // forward positioning routines respect the lower bound and the reverse
60 : // positioning routines respect the upper bound (i.e. calling SeekGE instead of
61 : // First if there is a lower bound, and SeekLT instead of Last if there is an
62 : // upper bound). This imposition is done in order to elevate that enforcement to
63 : // the caller (generally pebble.Iterator or pebble.mergingIter) rather than
64 : // having it duplicated in every InternalIterator implementation.
65 : //
66 : // Additionally, the caller needs to ensure that SeekGE/SeekPrefixGE are not
67 : // called with a key > the upper bound, and SeekLT is not called with a key <
68 : // the lower bound. InternalIterator implementations are required to respect
69 : // the iterator bounds, never returning records outside of the bounds with one
70 : // exception: an iterator may generate synthetic RANGEDEL marker records. See
71 : // levelIter.syntheticBoundary for the sole existing example of this behavior.
72 : // Specifically, levelIter can return synthetic keys whose user key is equal to
73 : // the lower/upper bound.
74 : //
75 : // The bounds provided to an internal iterator must remain valid until a
76 : // subsequent call to SetBounds has returned. This requirement exists so that
77 : // iterator implementations may compare old and new bounds to apply low-level
78 : // optimizations. The pebble.Iterator satisfies this requirement by maintaining
79 : // two bound buffers and switching between them.
80 : //
81 : // An iterator must be closed after use, but it is not necessary to read an
82 : // iterator until exhaustion.
83 : //
84 : // An iterator is not goroutine-safe, but it is safe to use multiple iterators
85 : // concurrently, either in separate goroutines or switching between the
86 : // iterators in a single goroutine.
87 : //
88 : // It is also safe to use an iterator concurrently with modifying its
89 : // underlying DB, if that DB permits modification. However, the resultant
90 : // key/value pairs are not guaranteed to be a consistent snapshot of that DB
91 : // at a particular point in time.
92 : //
93 : // InternalIterators accumulate errors encountered during operation, exposing
94 : // them through the Error method. All of the absolute positioning methods
95 : // reset any accumulated error before positioning. Relative positioning
96 : // methods return without advancing if the iterator has accumulated an error.
97 : //
98 : // nilv == shorthand for LazyValue{}, which represents a nil value.
99 : type InternalIterator interface {
100 : // SeekGE moves the iterator to the first key/value pair whose key is greater
101 : // than or equal to the given key. Returns the key and value if the iterator
102 : // is pointing at a valid entry, and (nil, nilv) otherwise. Note that SeekGE
103 : // only checks the upper bound. It is up to the caller to ensure that key
104 : // is greater than or equal to the lower bound.
105 : SeekGE(key []byte, flags SeekGEFlags) *InternalKV
106 :
107 : // SeekPrefixGE moves the iterator to the first key/value pair whose key is
108 : // greater than or equal to the given key. Returns the key and value if the
109 : // iterator is pointing at a valid entry, and (nil, nilv) otherwise. Note that
110 : // SeekPrefixGE only checks the upper bound. It is up to the caller to ensure
111 : // that key is greater than or equal to the lower bound.
112 : //
113 : // The prefix argument is used by some InternalIterator implementations
114 : // (e.g. sstable.Reader) to avoid expensive operations. This operation is
115 : // only useful when a user-defined Split function is supplied to the
116 : // Comparer for the DB. The supplied prefix will be the prefix of the given
117 : // key returned by that Split function. If the iterator is able to determine
118 : // that no key with the prefix exists, it can return (nil,nilv). Unlike
119 : // SeekGE, this is not an indication that iteration is exhausted. The prefix
120 : // byte slice is guaranteed to be stable until the next absolute positioning
121 : // operation.
122 : //
123 : // Note that the iterator may return keys not matching the prefix. It is up
124 : // to the caller to check if the prefix matches.
125 : //
126 : // Calling SeekPrefixGE places the receiver into prefix iteration mode. Once
127 : // in this mode, reverse iteration may not be supported and will return an
128 : // error. Note that pebble/Iterator.SeekPrefixGE has this same restriction on
129 : // not supporting reverse iteration in prefix iteration mode until a
130 : // different positioning routine (SeekGE, SeekLT, First or Last) switches the
131 : // iterator out of prefix iteration.
132 : SeekPrefixGE(prefix, key []byte, flags SeekGEFlags) *InternalKV
133 :
134 : // SeekLT moves the iterator to the last key/value pair whose key is less
135 : // than the given key. Returns the key and value if the iterator is pointing
136 : // at a valid entry, and (nil, nilv) otherwise. Note that SeekLT only checks
137 : // the lower bound. It is up to the caller to ensure that key is less than
138 : // the upper bound.
139 : SeekLT(key []byte, flags SeekLTFlags) *InternalKV
140 :
141 : // First moves the iterator to the first key/value pair. Returns the key and
142 : // value if the iterator is pointing at a valid entry, and (nil, nilv)
143 : // otherwise. Note that First only checks the upper bound. It is up to the
144 : // caller to ensure that First() is not called when there is a lower bound,
145 : // and instead call SeekGE(lower).
146 : First() *InternalKV
147 :
148 : // Last moves the iterator to the last key/value pair. Returns the key and
149 : // value if the iterator is pointing at a valid entry, and (nil, nilv)
150 : // otherwise. Note that Last only checks the lower bound. It is up to the
151 : // caller to ensure that Last() is not called when there is an upper bound,
152 : // and instead call SeekLT(upper).
153 : Last() *InternalKV
154 :
155 : // Next moves the iterator to the next key/value pair. Returns the key and
156 : // value if the iterator is pointing at a valid entry, and (nil, nilv)
157 : // otherwise. Note that Next only checks the upper bound. It is up to the
158 : // caller to ensure that key is greater than or equal to the lower bound.
159 : //
160 : // It is valid to call Next when the iterator is positioned before the first
161 : // key/value pair due to either a prior call to SeekLT or Prev which returned
162 : // (nil, nilv). It is not allowed to call Next when the previous call to SeekGE,
163 : // SeekPrefixGE or Next returned (nil, nilv).
164 : Next() *InternalKV
165 :
166 : // NextPrefix moves the iterator to the next key/value pair with a different
167 : // prefix than the key at the current iterator position. Returns the key and
168 : // value if the iterator is pointing at a valid entry, and (nil, nilv)
169 : // otherwise. Note that NextPrefix only checks the upper bound. It is up to
170 : // the caller to ensure that key is greater than or equal to the lower
171 : // bound.
172 : //
173 : // NextPrefix is passed the immediate successor to the current prefix key. A
174 : // valid implementation of NextPrefix is to call SeekGE with succKey.
175 : //
176 : // It is not allowed to call NextPrefix when the previous call was a reverse
177 : // positioning operation or a call to a forward positioning method that
178 : // returned (nil, nilv). It is also not allowed to call NextPrefix when the
179 : // iterator is in prefix iteration mode.
180 : NextPrefix(succKey []byte) *InternalKV
181 :
182 : // Prev moves the iterator to the previous key/value pair. Returns the key
183 : // and value if the iterator is pointing at a valid entry, and (nil, nilv)
184 : // otherwise. Note that Prev only checks the lower bound. It is up to the
185 : // caller to ensure that key is less than the upper bound.
186 : //
187 : // It is valid to call Prev when the iterator is positioned after the last
188 : // key/value pair due to either a prior call to SeekGE or Next which returned
189 : // (nil, nilv). It is not allowed to call Prev when the previous call to SeekLT
190 : // or Prev returned (nil, nilv).
191 : Prev() *InternalKV
192 :
193 : // Error returns any accumulated error. It may not include errors returned
194 : // to the client when calling LazyValue.Value().
195 : Error() error
196 :
197 : // Close closes the iterator and returns any accumulated error. Exhausting
198 : // all the key/value pairs in a table is not considered to be an error.
199 : //
200 : // Once Close is called, the iterator should not be used again. Specific
201 : // implementations may support multiple calls to Close (but no other calls
202 : // after the first Close).
203 : Close() error
204 :
205 : // SetBounds sets the lower and upper bounds for the iterator. Note that the
206 : // result of Next and Prev will be undefined until the iterator has been
207 : // repositioned with SeekGE, SeekPrefixGE, SeekLT, First, or Last.
208 : //
209 : // The bounds provided must remain valid until a subsequent call to
210 : // SetBounds has returned. This requirement exists so that iterator
211 : // implementations may compare old and new bounds to apply low-level
212 : // optimizations.
213 : SetBounds(lower, upper []byte)
214 :
215 : // SetContext replaces the context provided at iterator creation, or the
216 : // last one provided by SetContext.
217 : SetContext(ctx context.Context)
218 :
219 : fmt.Stringer
220 : }
221 :
222 : // TopLevelIterator embeds InternalIterator and adds one more absolute
223 : // positioning method, SeekPrefixGEStrict.
224 : type TopLevelIterator interface {
225 : InternalIterator
226 :
227 : // SeekPrefixGEStrict takes the same arguments as InternalIterator.SeekPrefixGE
228 : // but additionally guarantees that the iterator only returns keys matching the prefix.
229 : SeekPrefixGEStrict(prefix, key []byte, flags SeekGEFlags) *InternalKV
230 : }
231 :
// SeekGEFlags is a bit set of options that tune the behavior of a forward
// seek (SeekGE / SeekPrefixGE). An iterator is free to disregard flags that
// are not relevant to it.
type SeekGEFlags uint8

// Bit positions of the individual flags inside a SeekGEFlags value.
const (
	seekGEFlagTrySeekUsingNext uint8 = iota
	seekGEFlagRelativeSeek
	seekGEFlagBatchJustRefreshed
)

// SeekGEFlagsNone is the zero SeekGEFlags value, in which every flag is
// disabled.
const SeekGEFlagsNone = SeekGEFlags(0)

// TrySeekUsingNext reports whether the caller enabled the
// try-seek-using-next performance optimization. By setting the flag the caller
// asserts that it has not moved this iterator past the first key that an
// honest seek would land on. Two examples where the caller may set it:
//
//   - SeekGE(k1...) immediately followed by SeekGE(k2...) with k1 <= k2 and no
//     positioning calls in between;
//   - SeekGE(k1...), a single Next, then SeekGE(k2...) with k1 < k2.
//
// Leaving the flag unset is always safe, and a callee whose implementation
// cannot exploit the optimization may ignore a set flag.
//
// The determination is left to the caller because comparing k1 and k2 is not
// necessarily cheap and the iterator stack may contain many iterators; doing
// the comparison once at the root of the stack is cheaper.
//
// The same idea could be applied to SeekLT (as a "trySeekUsingPrev"). It is
// currently limited to SeekPrefixGE and SeekGE because that is where it helps
// the performance of CockroachDB. The SeekLT cases in CockroachDB usually come
// with bounds that change between seeks and are optimized inside certain
// iterator implementations, like singleLevelIterator, without extra parameter
// passing (although the same amortization of string comparisons at the root
// of the iterator stack could improve that optimization too).
func (s SeekGEFlags) TrySeekUsingNext() bool {
	const mask = SeekGEFlags(1) << seekGEFlagTrySeekUsingNext
	return s&mask != 0
}

// RelativeSeek reports whether the relative-seek flag is set. The flag is set
// when, as part of a forward positioning operation, a higher-level iterator
// seeks a lower-level iterator to a key larger than the one at the current
// iterator position.
//
// Concretely, the merging iterator does this when it sees a range deletion
// covering the key at a level's current position and seeks that level to the
// range deletion's end key. During lazy-combined iteration the flag tells the
// level iterator that the seek is NOT an absolute-positioning operation from
// the perspective of the pebble.Iterator, so the level iterator must look for
// range keys in the tables between the current position and the new seeked
// position.
func (s SeekGEFlags) RelativeSeek() bool {
	const mask = SeekGEFlags(1) << seekGEFlagRelativeSeek
	return s&mask != 0
}

// BatchJustRefreshed reports whether the batch-just-refreshed flag is set.
// Seek[Prefix]GE sets it when an iterator's view of an indexed batch was just
// refreshed. It tells the batch iterator to ignore the TrySeekUsingNext
// optimization, since the external knowledge conveyed by that flag does not
// apply to the batch iterator's position. See
// (pebble.Iterator).batchJustRefreshed.
func (s SeekGEFlags) BatchJustRefreshed() bool {
	const mask = SeekGEFlags(1) << seekGEFlagBatchJustRefreshed
	return s&mask != 0
}

// EnableTrySeekUsingNext returns a copy of s with the try-seek-using-next
// optimization turned on. See TrySeekUsingNext for what the optimization
// means.
func (s SeekGEFlags) EnableTrySeekUsingNext() SeekGEFlags {
	const mask = SeekGEFlags(1) << seekGEFlagTrySeekUsingNext
	return s | mask
}

// DisableTrySeekUsingNext returns a copy of s with the try-seek-using-next
// optimization turned off.
func (s SeekGEFlags) DisableTrySeekUsingNext() SeekGEFlags {
	const mask = SeekGEFlags(1) << seekGEFlagTrySeekUsingNext
	return s &^ mask
}

// EnableRelativeSeek returns a copy of s with the relative-seek flag turned
// on. See RelativeSeek for how the flag is used.
func (s SeekGEFlags) EnableRelativeSeek() SeekGEFlags {
	const mask = SeekGEFlags(1) << seekGEFlagRelativeSeek
	return s | mask
}

// DisableRelativeSeek returns a copy of s with the relative-seek flag turned
// off.
func (s SeekGEFlags) DisableRelativeSeek() SeekGEFlags {
	const mask = SeekGEFlags(1) << seekGEFlagRelativeSeek
	return s &^ mask
}

// EnableBatchJustRefreshed returns a copy of s with the batch-just-refreshed
// bit set. See BatchJustRefreshed for the meaning of the flag.
func (s SeekGEFlags) EnableBatchJustRefreshed() SeekGEFlags {
	const mask = SeekGEFlags(1) << seekGEFlagBatchJustRefreshed
	return s | mask
}

// DisableBatchJustRefreshed returns a copy of s with the batch-just-refreshed
// bit cleared.
func (s SeekGEFlags) DisableBatchJustRefreshed() SeekGEFlags {
	const mask = SeekGEFlags(1) << seekGEFlagBatchJustRefreshed
	return s &^ mask
}
328 :
// SeekLTFlags is a bit set of options that tune the behavior of a reverse
// seek (SeekLT). An iterator is free to disregard flags that are not relevant
// to it.
type SeekLTFlags uint8

// Bit positions of the individual flags inside a SeekLTFlags value.
const (
	seekLTFlagRelativeSeek uint8 = iota
)

// SeekLTFlagsNone is the zero SeekLTFlags value, in which every flag is
// disabled.
const SeekLTFlagsNone = SeekLTFlags(0)

// RelativeSeek reports whether the relative-seek flag is set. The flag is set
// when, as part of a reverse positioning operation, a higher-level iterator
// seeks a lower-level iterator to a key smaller than the one at the current
// iterator position.
//
// Concretely, the merging iterator does this when it sees a range deletion
// covering the key at a level's current position and seeks that level to the
// range deletion's start key. During lazy-combined iteration the flag tells
// the level iterator that the seek is NOT an absolute-positioning operation
// from the perspective of the pebble.Iterator, so the level iterator must look
// for range keys in the tables between the current position and the new seeked
// position.
func (s SeekLTFlags) RelativeSeek() bool {
	const mask = SeekLTFlags(1) << seekLTFlagRelativeSeek
	return s&mask != 0
}

// EnableRelativeSeek returns a copy of s with the relative-seek flag turned
// on. See RelativeSeek for how the flag is used.
func (s SeekLTFlags) EnableRelativeSeek() SeekLTFlags {
	const mask = SeekLTFlags(1) << seekLTFlagRelativeSeek
	return s | mask
}

// DisableRelativeSeek returns a copy of s with the relative-seek flag turned
// off.
func (s SeekLTFlags) DisableRelativeSeek() SeekLTFlags {
	const mask = SeekLTFlags(1) << seekLTFlagRelativeSeek
	return s &^ mask
}
364 :
// InternalIteratorStats collects assorted statistics produced by the
// InternalIterators that make up an InternalIterator tree. A particular
// InternalIterator implementation may only populate some of the fields. Field
// values are aggregated on the way up the InternalIterator tree.
type InternalIteratorStats struct {
	// BlockBytes is the number of bytes in the loaded blocks. For a compressed
	// block this is the compressed size. At present only index blocks, data
	// blocks containing points, and filter blocks are counted.
	BlockBytes uint64
	// BlockBytesInCache is the part of BlockBytes that was in the block cache.
	BlockBytesInCache uint64
	// BlockReadDuration is the accumulated time spent fetching blocks because
	// of block cache misses.
	// TODO(sumeer): this currently excludes the time spent in Reader creation,
	// and in reading the rangedel and rangekey blocks. Fix that.
	BlockReadDuration time.Duration

	// The fields below may count the same point repeatedly when it is
	// iterated over more than once, and may count a point twice when the
	// iteration direction switches. The latter could be improved if needed.

	// KeyBytes is the number of key bytes iterated over. At present only point
	// keys are counted.
	KeyBytes uint64
	// ValueBytes is the number of value bytes iterated over. At present only
	// point values are counted. For a separated value this is the size of the
	// handle.
	ValueBytes uint64
	// PointCount is the number of points iterated over.
	PointCount uint64
	// PointsCoveredByRangeTombstones is the number of iterated points that
	// were covered by range tombstones. It can help discover instances of
	// https://github.com/cockroachdb/pebble/issues/1070.
	PointsCoveredByRangeTombstones uint64

	// SeparatedPointValue holds stats about points in value blocks that were
	// encountered during iteration. They help in understanding outliers:
	// typical user facing iteration should tend to look only at the latest
	// point, which keeps these stats close to 0.
	SeparatedPointValue struct {
		// Count is the number of points that were in value blocks. It is not
		// a subset of PointCount: PointCount is produced by mergingIter, and
		// a single positioning that successfully returns a point yields a
		// PointCount of 1 no matter how many sstables (and memtables etc.) in
		// the heap got positioned. Count, in contrast, includes every sstable
		// iterator that got positioned in the heap.
		Count uint64
		// ValueBytes is the total byte length of the values (in value blocks)
		// of the points counted by Count.
		ValueBytes uint64
		// ValueBytesFetched is the total byte length of the values (in value
		// blocks) that were actually retrieved.
		ValueBytesFetched uint64
	}
}

// Merge adds every counter in from to the corresponding counter in s.
func (s *InternalIteratorStats) Merge(from InternalIteratorStats) {
	// Block loading.
	s.BlockBytes += from.BlockBytes
	s.BlockBytesInCache += from.BlockBytesInCache
	s.BlockReadDuration += from.BlockReadDuration

	// Points iterated over.
	s.KeyBytes += from.KeyBytes
	s.ValueBytes += from.ValueBytes
	s.PointCount += from.PointCount
	s.PointsCoveredByRangeTombstones += from.PointsCoveredByRangeTombstones

	// Points stored in value blocks.
	dst, src := &s.SeparatedPointValue, &from.SeparatedPointValue
	dst.Count += src.Count
	dst.ValueBytes += src.ValueBytes
	dst.ValueBytesFetched += src.ValueBytesFetched
}
432 :
433 0 : func (s *InternalIteratorStats) String() string {
434 0 : return redact.StringWithoutMarkers(s)
435 0 : }
436 :
437 : // SafeFormat implements the redact.SafeFormatter interface.
438 1 : func (s *InternalIteratorStats) SafeFormat(p redact.SafePrinter, verb rune) {
439 1 : var tombstoned humanize.FormattedString
440 1 : if s.PointsCoveredByRangeTombstones != 0 {
441 0 : tombstoned = "(" + humanize.Count.Uint64(s.PointsCoveredByRangeTombstones) + " tombstoned)"
442 0 : }
443 1 : p.Printf("blocks: %s (%s cached), read time %s; "+
444 1 : "points: %s%s (%s keys, %s values)",
445 1 : humanize.Bytes.Uint64(s.BlockBytes),
446 1 : humanize.Bytes.Uint64(s.BlockBytesInCache),
447 1 : humanize.FormattedString(s.BlockReadDuration.String()),
448 1 : humanize.Count.Uint64(s.PointCount),
449 1 : tombstoned,
450 1 : humanize.Bytes.Uint64(s.KeyBytes),
451 1 : humanize.Bytes.Uint64(s.ValueBytes),
452 1 : )
453 1 : if s.SeparatedPointValue.Count != 0 {
454 1 : p.Printf("; separated: %s (%s, %s fetched)",
455 1 : humanize.Count.Uint64(s.SeparatedPointValue.Count),
456 1 : humanize.Bytes.Uint64(s.SeparatedPointValue.ValueBytes),
457 1 : humanize.Bytes.Uint64(s.SeparatedPointValue.ValueBytesFetched))
458 1 : }
459 : }
|