Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package compact
6 :
7 : import (
8 : "encoding/binary"
9 : "io"
10 : "strconv"
11 :
12 : "github.com/cockroachdb/errors"
13 : "github.com/cockroachdb/pebble/internal/base"
14 : "github.com/cockroachdb/pebble/internal/invalidating"
15 : "github.com/cockroachdb/pebble/internal/invariants"
16 : "github.com/cockroachdb/pebble/internal/keyspan"
17 : "github.com/cockroachdb/pebble/internal/rangekey"
18 : "github.com/cockroachdb/redact"
19 : )
20 :
21 : // Iter provides a forward-only iterator that encapsulates the logic for
22 : // collapsing entries during compaction. It wraps an internal iterator and
23 : // collapses entries that are no longer necessary because they are shadowed by
24 : // newer entries. The simplest example of this is when the internal iterator
25 : // contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries,
26 : // compact.Iter collapses the second entry because it is no longer necessary.
27 : // The high-level structure for compact.Iter is to iterate over its internal
28 : // iterator and output 1 entry for every user-key. There are four complications
29 : // to this story.
30 : //
31 : // 1. Eliding Deletion Tombstones
32 : //
33 : // Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to
34 : // a.DEL.2. Do we have to output the entry a.DEL.2? Only if a.DEL.2 possibly
35 : // shadows an entry at a lower level. If we're compacting to the base-level in
36 : // the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower
37 : // level and can be elided.
38 : //
39 : // We can do slightly better than only eliding deletion tombstones at the base
40 : // level by observing that we can elide a deletion tombstone if there are no
41 : // sstables that contain the entry's key. This check is performed by
42 : // elideTombstone.
43 : //
44 : // 2. Merges
45 : //
46 : // The MERGE operation merges the value for an entry with the existing value
47 : // for an entry. The logical value of an entry can be composed of a series of
48 : // merge operations. When compact.Iter sees a MERGE, it scans forward in its
49 : // internal iterator collapsing MERGE operations for the same key until it
50 : // encounters a SET or DELETE operation. For example, the keys a.MERGE.4,
51 : // a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be
52 : // merged using the specified Merger.
53 : //
54 : // An interesting case here occurs when MERGE is combined with SET. Consider
55 : // the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The
56 : // reason that the kind is changed to SET is because the SET operation acts as
57 : // a barrier preventing further merging. This can be seen better in the
58 : // scenario a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at lower
59 : // (older) level and not involved in the compaction. If the compaction of
60 : // a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with
61 : // a.MERGE.1 would merge the values together incorrectly.
62 : //
63 : // 3. Snapshots
64 : //
65 : // Snapshots are lightweight point-in-time views of the DB state. At its core,
66 : // a snapshot is a sequence number along with a guarantee from Pebble that it
67 : // will maintain the view of the database at that sequence number. Part of this
68 : // guarantee is relatively straightforward to achieve. When reading from the
69 : // database Pebble will ignore sequence numbers that are larger than the
70 : // snapshot sequence number. The primary complexity with snapshots occurs
71 : // during compaction: the collapsing of entries that are shadowed by newer
72 : // entries is at odds with the guarantee that Pebble will maintain the view of
73 : // the database at the snapshot sequence number. Rather than collapsing entries
74 : // up to the next user key, compact.Iter can only collapse entries up to the
75 : // next snapshot boundary. That is, every snapshot boundary potentially causes
76 : // another entry for the same user-key to be emitted. Another way to view this
77 : // is that snapshots define stripes and entries are collapsed within stripes,
78 : // but not across stripes. Consider the following scenario:
79 : //
80 : // a.PUT.9
81 : // a.DEL.8
82 : // a.PUT.7
83 : // a.DEL.6
84 : // a.PUT.5
85 : //
86 : // In the absence of snapshots these entries would be collapsed to
87 : // a.PUT.9. What if there is a snapshot at sequence number 7? The entries can
88 : // be divided into two stripes and collapsed within the stripes:
89 : //
90 : // a.PUT.9 a.PUT.9
91 : // a.DEL.8 --->
92 : // a.PUT.7
93 : // -- --
94 : // a.DEL.6 ---> a.DEL.6
95 : // a.PUT.5
96 : //
97 : // All of the rules described earlier still apply, but they are confined to
98 : // operate within a snapshot stripe. Snapshots only affect compaction when the
99 : // snapshot sequence number lies within the range of sequence numbers being
100 : // compacted. In the above example, a snapshot at sequence number 10 or at
101 : // sequence number 5 would not have any effect.
102 : //
103 : // 4. Range Deletions
104 : //
105 : // Range deletions provide the ability to delete all of the keys (and values)
106 : // in a contiguous range. Range deletions are stored indexed by their start
107 : // key. The end key of the range is stored in the value. In order to support
108 : // lookup of the range deletions which overlap with a particular key, the range
109 : // deletion tombstones need to be fragmented whenever they overlap. This
110 : // fragmentation is performed by keyspan.Fragmenter. The fragments are then
111 : // subject to the rules for snapshots. For example, consider the two range
112 : // tombstones [a,e)#1 and [c,g)#2:
113 : //
114 : // 2: c-------g
115 : // 1: a-------e
116 : //
117 : // These tombstones will be fragmented into:
118 : //
119 : // 2: c---e---g
120 : // 1: a---c---e
121 : //
122 : // Do we output the fragment [c,e)#1? Since it is covered by [c-e]#2 the answer
123 : // depends on whether it is in a new snapshot stripe.
124 : //
125 : // In addition to the fragmentation of range tombstones, compaction also needs
126 : // to take the range tombstones into consideration when outputting normal
127 : // keys. Just as with point deletions, a range deletion covering an entry can
128 : // cause the entry to be elided.
129 : //
130 : // A note on the stability of keys and values.
131 : //
132 : // The stability guarantees of keys and values returned by the iterator tree
133 : // that backs a compact.Iter is nuanced and care must be taken when
134 : // referencing any returned items.
135 : //
136 : // Keys and values returned by exported functions (i.e. First, Next, etc.) have
137 : // lifetimes that fall into two categories:
138 : //
139 : // Lifetime valid for duration of compaction. Range deletion keys and values are
140 : // stable for the duration of the compaction, due to way in which a
141 : // compact.Iter is typically constructed (i.e. via (*compaction).newInputIter,
142 : // which wraps the iterator over the range deletion block in a noCloseIter,
143 : // preventing the release of the backing memory until the compaction is
144 : // finished).
145 : //
146 : // Lifetime limited to duration of sstable block liveness. Point keys (SET, DEL,
147 : // etc.) and values must be cloned / copied following the return from the
148 : // exported function, and before a subsequent call to Next advances the iterator
149 : // and mutates the contents of the returned key and value.
150 : type Iter struct {
151 : cmp base.Compare
152 :
153 : cfg IterConfig
154 :
155 : // rangeDelInterlaving is an interleaving iterator for range deletions, that
156 : // interleaves range tombstones among the point keys.
157 : rangeDelInterleaving keyspan.InterleavingIter
158 : // rangeKeyInterleaving is the interleaving iter for range keys.
159 : rangeKeyInterleaving keyspan.InterleavingIter
160 :
161 : // iter is the iterator which interleaves points with RANGEDELs and range
162 : // keys.
163 : iter base.InternalIterator
164 :
165 : delElider pointTombstoneElider
166 : rangeDelCompactor RangeDelSpanCompactor
167 : rangeKeyCompactor RangeKeySpanCompactor
168 : err error
169 : // `key.UserKey` is set to `keyBuf` caused by saving `i.iterKV.UserKey`
170 : // and `key.Trailer` is set to `i.iterKV.Trailer`. This is the
171 : // case on return from all public methods -- these methods return `key`.
172 : // Additionally, it is the internal state when the code is moving to the
173 : // next key so it can determine whether the user key has changed from
174 : // the previous key.
175 : key base.InternalKey
176 : // keyTrailer is updated when `i.key` is updated and holds the key's
177 : // original trailer (eg, before any sequence-number zeroing or changes to
178 : // key kind).
179 : keyTrailer uint64
180 : value []byte
181 : valueCloser io.Closer
182 : // Temporary buffer used for storing the previous user key in order to
183 : // determine when iteration has advanced to a new user key and thus a new
184 : // snapshot stripe.
185 : keyBuf []byte
186 : // Temporary buffer used for storing the previous value, which may be an
187 : // unsafe, i.iter-owned slice that could be altered when the iterator is
188 : // advanced.
189 : valueBuf []byte
190 : iterKV *base.InternalKV
191 : iterValue []byte
192 : iterStripeChange stripeChangeType
193 : // skip indicates whether the remaining entries in the current snapshot
194 : // stripe should be skipped or processed. `skip` has no effect when `pos ==
195 : // iterPosNext`.
196 : skip bool
197 : // pos indicates the iterator position at the top of Next():
198 : // - iterPosCurForward: the iterator is at the last key returned.
199 : // - iterPosNext: the iterator has already been advanced to the next
200 : // candidate key. For example, this happens when processing merge operands,
201 : // where we advance the iterator all the way into the next stripe or next
202 : // user key to ensure we've seen all mergeable operands.
203 : pos iterPos
204 : // snapshotPinned indicates whether the last point key returned by the
205 : // compaction iterator was only returned because an open snapshot prevents
206 : // its elision. This field only applies to point keys, and not to range
207 : // deletions or range keys.
208 : snapshotPinned bool
209 : // forceObsoleteDueToRangeDel is set to true in a subset of the cases that
210 : // snapshotPinned is true. This value is true when the point is obsolete due
211 : // to a RANGEDEL but could not be deleted due to a snapshot.
212 : //
213 : // NB: it may seem that the additional cases that snapshotPinned captures
214 : // are harmless in that they can also be used to mark a point as obsolete
215 : // (it is merely a duplication of some logic that happens in
216 : // Writer.AddWithForceObsolete), but that is not quite accurate as of this
217 : // writing -- snapshotPinned originated in stats collection and for a
218 : // sequence MERGE, SET, where the MERGE cannot merge with the (older) SET
219 : // due to a snapshot, the snapshotPinned value for the SET is true.
220 : //
221 : // TODO(sumeer,jackson): improve the logic of snapshotPinned and reconsider
222 : // whether we need forceObsoleteDueToRangeDel.
223 : forceObsoleteDueToRangeDel bool
224 : // The index of the snapshot for the current key within the snapshots slice.
225 : curSnapshotIdx int
226 : curSnapshotSeqNum uint64
227 : // frontiers holds a heap of user keys that affect compaction behavior when
228 : // they're exceeded. Before a new key is returned, the compaction iterator
229 : // advances the frontier, notifying any code that subscribed to be notified
230 : // when a key was reached. The primary use today is within the
231 : // implementation of compactionOutputSplitters in compaction.go. Many of
232 : // these splitters wait for the compaction iterator to call Advance(k) when
233 : // it's returning a new key. If the key that they're waiting for is
234 : // surpassed, these splitters update internal state recording that they
235 : // should request a compaction split next time they're asked in
236 : // [shouldSplitBefore].
237 : frontiers Frontiers
238 :
239 : // lastRangeDelSpan stores the last, not compacted tombstone span. It is used
240 : // to elide points or mark them as snapshot-pinned.
241 : lastRangeDelSpan keyspan.Span
242 :
243 : // span stores the last, compacted tombstone or range key span. It is provided
244 : // to the caller via Span().
245 : span keyspan.Span
246 :
247 : stats IterStats
248 : }
249 :
250 : // IterConfig contains the parameters necessary to create a compaction iterator.
251 : type IterConfig struct {
252 : Comparer *base.Comparer
253 : Merge base.Merge
254 :
255 : // The snapshot sequence numbers that need to be maintained. These sequence
256 : // numbers define the snapshot stripes.
257 : Snapshots Snapshots
258 :
259 : TombstoneElision TombstoneElision
260 : RangeKeyElision TombstoneElision
261 :
262 : // AllowZeroSeqNum allows the sequence number of KVs in the bottom snapshot
263 : // stripe to be simplified to 0 (which improves compression and enables an
264 : // optimization during forward iteration). This can be enabled if there are no
265 : // tables overlapping the output at lower levels (than the output) in the LSM.
266 : AllowZeroSeqNum bool
267 :
268 : // IneffectualPointDeleteCallback is called if a SINGLEDEL is being elided
269 : // without deleting a point set/merge.
270 : IneffectualSingleDeleteCallback func(userKey []byte)
271 :
272 : // SingleDeleteInvariantViolationCallback is called in compactions/flushes if any
273 : // single delete has consumed a Set/Merge, and there is another immediately older
274 : // Set/SetWithDelete/Merge. The user of Pebble has violated the invariant under
275 : // which SingleDelete can be used correctly.
276 : SingleDeleteInvariantViolationCallback func(userKey []byte)
277 : }
278 :
279 1 : func (c *IterConfig) ensureDefaults() {
280 1 : if c.IneffectualSingleDeleteCallback == nil {
281 1 : c.IneffectualSingleDeleteCallback = func(userKey []byte) {}
282 : }
283 : }
284 :
285 : // IterStats are statistics produced by the compaction iterator.
286 : type IterStats struct {
287 : // Count of DELSIZED keys that were missized.
288 : CountMissizedDels uint64
289 : }
290 :
291 : type iterPos int8
292 :
293 : const (
294 : iterPosCurForward iterPos = 0
295 : iterPosNext iterPos = 1
296 : )
297 :
298 : // NewIter creates a new compaction iterator. See the comment for Iter for a
299 : // detailed description.
300 : // rangeDelIter and rangeKeyIter can be nil.
301 : func NewIter(
302 : cfg IterConfig,
303 : pointIter base.InternalIterator,
304 : rangeDelIter, rangeKeyIter keyspan.FragmentIterator,
305 1 : ) *Iter {
306 1 : cfg.ensureDefaults()
307 1 : i := &Iter{
308 1 : cmp: cfg.Comparer.Compare,
309 1 : cfg: cfg,
310 1 : // We don't want a nil keyBuf because if the first key we encounter is
311 1 : // empty, it would become nil.
312 1 : keyBuf: make([]byte, 8),
313 1 : }
314 1 :
315 1 : iter := pointIter
316 1 : if rangeDelIter != nil {
317 1 : i.rangeDelInterleaving.Init(cfg.Comparer, iter, rangeDelIter, keyspan.InterleavingIterOpts{})
318 1 : iter = &i.rangeDelInterleaving
319 1 : }
320 1 : if rangeKeyIter != nil {
321 1 : i.rangeKeyInterleaving.Init(cfg.Comparer, iter, rangeKeyIter, keyspan.InterleavingIterOpts{})
322 1 : iter = &i.rangeKeyInterleaving
323 1 : }
324 1 : i.iter = invalidating.MaybeWrapIfInvariants(iter)
325 1 :
326 1 : i.frontiers.Init(i.cmp)
327 1 : i.delElider.Init(i.cmp, cfg.TombstoneElision)
328 1 : i.rangeDelCompactor = MakeRangeDelSpanCompactor(i.cmp, i.cfg.Comparer.Equal, cfg.Snapshots, cfg.TombstoneElision)
329 1 : i.rangeKeyCompactor = MakeRangeKeySpanCompactor(i.cmp, i.cfg.Comparer.Equal, cfg.Snapshots, cfg.RangeKeyElision)
330 1 : return i
331 : }
332 :
333 : // Frontiers returns the frontiers for the compaction iterator.
334 1 : func (i *Iter) Frontiers() *Frontiers {
335 1 : return &i.frontiers
336 1 : }
337 :
338 : // SnapshotPinned returns whether the last point key returned by the compaction
339 : // iterator was only returned because an open snapshot prevents its elision.
340 : // This field only applies to point keys, and not to range deletions or range
341 : // keys.
342 1 : func (i *Iter) SnapshotPinned() bool {
343 1 : return i.snapshotPinned
344 1 : }
345 :
346 : // ForceObsoleteDueToRangeDel returns true in a subset of the cases when
347 : // SnapshotPinned returns true. This value is true when the point is obsolete
348 : // due to a RANGEDEL but could not be deleted due to a snapshot.
349 1 : func (i *Iter) ForceObsoleteDueToRangeDel() bool {
350 1 : return i.forceObsoleteDueToRangeDel
351 1 : }
352 :
353 : // Stats returns the compaction iterator stats.
354 1 : func (i *Iter) Stats() IterStats {
355 1 : return i.stats
356 1 : }
357 :
358 : // First has the same semantics as InternalIterator.First.
359 1 : func (i *Iter) First() (*base.InternalKey, []byte) {
360 1 : if i.err != nil {
361 0 : return nil, nil
362 0 : }
363 1 : i.iterKV = i.iter.First()
364 1 : if i.iterKV != nil {
365 1 : i.iterValue, _, i.err = i.iterKV.Value(nil)
366 1 : if i.err != nil {
367 0 : return nil, nil
368 0 : }
369 1 : i.curSnapshotIdx, i.curSnapshotSeqNum = i.cfg.Snapshots.IndexAndSeqNum(i.iterKV.SeqNum())
370 : }
371 1 : i.pos = iterPosNext
372 1 : i.iterStripeChange = newStripeNewKey
373 1 : return i.Next()
374 : }
375 :
376 : // Next has the same semantics as InternalIterator.Next. Note that when Next
377 : // returns a RANGEDEL or a range key, the caller can use Span() to get the
378 : // corresponding span.
379 1 : func (i *Iter) Next() (*base.InternalKey, []byte) {
380 1 : if i.err != nil {
381 1 : return nil, nil
382 1 : }
383 :
384 : // Close the closer for the current value if one was open.
385 1 : if i.closeValueCloser() != nil {
386 0 : return nil, nil
387 0 : }
388 :
389 : // Prior to this call to `Next()` we are in one of three situations with
390 : // respect to `iterKey` and related state:
391 : //
392 : // - `!skip && pos == iterPosNext`: `iterKey` is already at the next key.
393 : // - `!skip && pos == iterPosCurForward`: We are at the key that has been returned.
394 : // To move forward we advance by one key, even if that lands us in the same
395 : // snapshot stripe.
396 : // - `skip && pos == iterPosCurForward`: We are at the key that has been returned.
397 : // To move forward we skip skippable entries in the stripe.
398 1 : if i.pos == iterPosCurForward {
399 1 : if i.skip {
400 1 : i.skipInStripe()
401 1 : } else {
402 1 : i.nextInStripe()
403 1 : }
404 1 : } else if i.skip {
405 0 : panic(errors.AssertionFailedf("compaction iterator has skip=true, but iterator is at iterPosNext"))
406 : }
407 :
408 1 : i.pos = iterPosCurForward
409 1 :
410 1 : for i.iterKV != nil {
411 1 : // If we entered a new snapshot stripe with the same key, any key we
412 1 : // return on this iteration is only returned because the open snapshot
413 1 : // prevented it from being elided or merged with the key returned for
414 1 : // the previous stripe. Mark it as pinned so that the compaction loop
415 1 : // can correctly populate output tables' pinned statistics. We might
416 1 : // also set snapshotPinned=true down below if we observe that the key is
417 1 : // deleted by a range deletion in a higher stripe or that this key is a
418 1 : // tombstone that could be elided if only it were in the last snapshot
419 1 : // stripe.
420 1 : i.snapshotPinned = i.iterStripeChange == newStripeSameKey
421 1 :
422 1 : if i.iterKV.Kind() == base.InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKV.Kind()) {
423 1 : // Return the span so the compaction can use it for file truncation and add
424 1 : // it to the relevant fragmenter. In the case of range deletions, we do not
425 1 : // set `skip` to true before returning as there may be any number of point
426 1 : // keys with the same user key and sequence numbers ≥ the range deletion's
427 1 : // sequence number. Such point keys must be visible (i.e., not skipped
428 1 : // over) since we promise point keys are not deleted by range tombstones at
429 1 : // the same sequence number (or higher).
430 1 : //
431 1 : // Note that `skip` must already be false here, because range keys and range
432 1 : // deletions are interleaved at the maximal sequence numbers and neither will
433 1 : // set `skip`=true.
434 1 : if i.skip {
435 0 : panic(errors.AssertionFailedf("pebble: compaction iterator: skip unexpectedly true"))
436 : }
437 :
438 1 : if i.iterKV.Kind() == base.InternalKeyKindRangeDelete {
439 1 : span := i.rangeDelInterleaving.Span()
440 1 : i.lastRangeDelSpan.CopyFrom(span)
441 1 : i.rangeDelCompactor.Compact(span, &i.span)
442 1 : if i.span.Empty() {
443 1 : // The range del span was elided entirely; don't return this key to the caller.
444 1 : i.saveKey()
445 1 : i.nextInStripe()
446 1 : continue
447 : }
448 1 : } else {
449 1 : i.rangeKeyCompactor.Compact(i.rangeKeyInterleaving.Span(), &i.span)
450 1 : if i.span.Empty() {
451 1 : // The range key span was elided entirely; don't return this key to the caller.
452 1 : i.saveKey()
453 1 : i.nextInStripe()
454 1 : continue
455 : }
456 : }
457 :
458 : // NOTE: there is a subtle invariant violation here in that calling
459 : // saveKey and returning a reference to the temporary slice violates
460 : // the stability guarantee for range deletion keys. A potential
461 : // mediation could return the original iterKey and iterValue
462 : // directly, as the backing memory is guaranteed to be stable until
463 : // the compaction completes. The violation here is only minor in
464 : // that the caller immediately clones the range deletion InternalKey
465 : // when passing the key to the deletion fragmenter (see the
466 : // call-site in compaction.go).
467 : // TODO(travers): address this violation by removing the call to
468 : // saveKey and instead return the original iterKey and iterValue.
469 : // This goes against the comment on i.key in the struct, and
470 : // therefore warrants some investigation.
471 1 : i.saveKey()
472 1 : // TODO(jackson): Handle tracking pinned statistics for range keys
473 1 : // and range deletions. This would require updating
474 1 : // emitRangeDelChunk and rangeKeyCompactionTransform to update
475 1 : // statistics when they apply their own snapshot striping logic.
476 1 : i.snapshotPinned = false
477 1 : i.value = i.iterValue
478 1 : return &i.key, i.value
479 : }
480 :
481 : // Check if the last tombstone covers the key.
482 : // TODO(sumeer): we could avoid calling tombstoneCovers if
483 : // i.iterStripeChange == sameStripeSameKey since that check has already been
484 : // done in nextInStripeHelper. However, we also need to handle the case of
485 : // CoversInvisibly below.
486 1 : switch i.tombstoneCovers(i.iterKV.K, i.curSnapshotSeqNum) {
487 1 : case coversVisibly:
488 1 : // A pending range deletion deletes this key. Skip it.
489 1 : i.saveKey()
490 1 : i.skipInStripe()
491 1 : continue
492 :
493 1 : case coversInvisibly:
494 1 : // i.iterKV would be deleted by a range deletion if there weren't any open
495 1 : // snapshots. Mark it as pinned.
496 1 : //
497 1 : // NB: there are multiple places in this file where we check for a
498 1 : // covering tombstone and this is the only one where we are writing to
499 1 : // i.snapshotPinned. Those other cases occur in mergeNext where the caller
500 1 : // is deciding whether the value should be merged or not, and the key is
501 1 : // in the same snapshot stripe. Hence, snapshotPinned is by definition
502 1 : // false in those cases.
503 1 : i.snapshotPinned = true
504 1 : i.forceObsoleteDueToRangeDel = true
505 :
506 1 : default:
507 1 : i.forceObsoleteDueToRangeDel = false
508 : }
509 :
510 1 : switch i.iterKV.Kind() {
511 1 : case base.InternalKeyKindDelete, base.InternalKeyKindSingleDelete, base.InternalKeyKindDeleteSized:
512 1 : if i.delElider.ShouldElide(i.iterKV.K.UserKey) {
513 1 : if i.curSnapshotIdx == 0 {
514 1 : // If we're at the last snapshot stripe and the tombstone
515 1 : // can be elided skip skippable keys in the same stripe.
516 1 : i.saveKey()
517 1 : if i.key.Kind() == base.InternalKeyKindSingleDelete {
518 1 : i.skipDueToSingleDeleteElision()
519 1 : } else {
520 1 : i.skipInStripe()
521 1 : if !i.skip && i.iterStripeChange != newStripeNewKey {
522 0 : panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe disabled skip without advancing to new key"))
523 : }
524 : }
525 1 : if i.iterStripeChange == newStripeSameKey {
526 0 : panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe found a new stripe within the same key"))
527 : }
528 1 : continue
529 1 : } else {
530 1 : // We're not at the last snapshot stripe, so the tombstone
531 1 : // can NOT yet be elided. Mark it as pinned, so that it's
532 1 : // included in table statistics appropriately.
533 1 : i.snapshotPinned = true
534 1 : }
535 : }
536 :
537 1 : switch i.iterKV.Kind() {
538 1 : case base.InternalKeyKindDelete:
539 1 : i.saveKey()
540 1 : i.value = i.iterValue
541 1 : i.skip = true
542 1 : return &i.key, i.value
543 :
544 1 : case base.InternalKeyKindDeleteSized:
545 1 : // We may skip subsequent keys because of this tombstone. Scan
546 1 : // ahead to see just how much data this tombstone drops and if
547 1 : // the tombstone's value should be updated accordingly.
548 1 : return i.deleteSizedNext()
549 :
550 1 : case base.InternalKeyKindSingleDelete:
551 1 : if i.singleDeleteNext() {
552 1 : return &i.key, i.value
553 1 : } else if i.err != nil {
554 0 : return nil, nil
555 0 : }
556 1 : continue
557 :
558 0 : default:
559 0 : panic(errors.AssertionFailedf(
560 0 : "unexpected kind %s", redact.SafeString(i.iterKV.Kind().String())))
561 : }
562 :
563 1 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
564 1 : // The key we emit for this entry is a function of the current key
565 1 : // kind, and whether this entry is followed by a DEL/SINGLEDEL
566 1 : // entry. setNext() does the work to move the iterator forward,
567 1 : // preserving the original value, and potentially mutating the key
568 1 : // kind.
569 1 : i.setNext()
570 1 : if i.err != nil {
571 1 : return nil, nil
572 1 : }
573 1 : return &i.key, i.value
574 :
575 1 : case base.InternalKeyKindMerge:
576 1 : // Record the snapshot index before mergeNext as merging
577 1 : // advances the iterator, adjusting curSnapshotIdx.
578 1 : origSnapshotIdx := i.curSnapshotIdx
579 1 : var valueMerger base.ValueMerger
580 1 : valueMerger, i.err = i.cfg.Merge(i.iterKV.K.UserKey, i.iterValue)
581 1 : if i.err == nil {
582 1 : i.mergeNext(valueMerger)
583 1 : }
584 1 : var needDelete bool
585 1 : if i.err == nil {
586 1 : // includesBase is true whenever we've transformed the MERGE record
587 1 : // into a SET.
588 1 : var includesBase bool
589 1 : switch i.key.Kind() {
590 1 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
591 1 : includesBase = true
592 1 : case base.InternalKeyKindMerge:
593 0 : default:
594 0 : panic(errors.AssertionFailedf(
595 0 : "unexpected kind %s", redact.SafeString(i.key.Kind().String())))
596 : }
597 1 : i.value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, includesBase)
598 : }
599 1 : if i.err == nil {
600 1 : if needDelete {
601 1 : if i.closeValueCloser() != nil {
602 0 : return nil, nil
603 0 : }
604 1 : continue
605 : }
606 :
607 1 : i.maybeZeroSeqnum(origSnapshotIdx)
608 1 : return &i.key, i.value
609 : }
610 1 : if i.err != nil {
611 1 : // TODO(sumeer): why is MarkCorruptionError only being called for
612 1 : // MERGE?
613 1 : i.err = base.MarkCorruptionError(i.err)
614 1 : }
615 1 : return nil, nil
616 :
617 1 : default:
618 1 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
619 1 : return nil, nil
620 : }
621 : }
622 :
623 1 : return nil, nil
624 : }
625 :
626 : // Span returns the range deletion or range key span corresponding to the
627 : // current key. Can only be called right after a Next() call that returned a
628 : // RANGEDEL or a range key. The keys in the span should not be retained or
629 : // modified.
630 1 : func (i *Iter) Span() *keyspan.Span {
631 1 : return &i.span
632 1 : }
633 :
634 1 : func (i *Iter) closeValueCloser() error {
635 1 : if i.valueCloser == nil {
636 1 : return nil
637 1 : }
638 :
639 0 : i.err = i.valueCloser.Close()
640 0 : i.valueCloser = nil
641 0 : return i.err
642 : }
643 :
644 : // skipInStripe skips over skippable keys in the same stripe and user key. It
645 : // may set i.err, in which case i.iterKV will be nil.
646 1 : func (i *Iter) skipInStripe() {
647 1 : i.skip = true
648 1 : // TODO(sumeer): we can avoid the overhead of calling i.rangeDelFrag.Covers,
649 1 : // in this case of nextInStripe, since we are skipping all of them anyway.
650 1 : for i.nextInStripe() == sameStripe {
651 1 : if i.err != nil {
652 0 : panic(i.err)
653 : }
654 : }
655 : // We landed outside the original stripe, so reset skip.
656 1 : i.skip = false
657 : }
658 :
659 1 : func (i *Iter) iterNext() bool {
660 1 : i.iterKV = i.iter.Next()
661 1 : if i.iterKV != nil {
662 1 : i.iterValue, _, i.err = i.iterKV.Value(nil)
663 1 : if i.err != nil {
664 0 : i.iterKV = nil
665 0 : }
666 : }
667 1 : return i.iterKV != nil
668 : }
669 :
670 : // stripeChangeType indicates how the snapshot stripe changed relative to the
671 : // previous key. If the snapshot stripe changed, it also indicates whether the
672 : // new stripe was entered because the iterator progressed onto an entirely new
673 : // key or entered a new stripe within the same key.
674 : type stripeChangeType int
675 :
676 : const (
677 : newStripeNewKey stripeChangeType = iota
678 : newStripeSameKey
679 : sameStripe
680 : )
681 :
682 : // nextInStripe advances the iterator and returns one of the above const ints
683 : // indicating how its state changed.
684 : //
685 : // All sameStripe keys that are covered by a RANGEDEL will be skipped and not
686 : // returned.
687 : //
688 : // Calls to nextInStripe must be preceded by a call to saveKey to retain a
689 : // temporary reference to the original key, so that forward iteration can
690 : // proceed with a reference to the original key. Care should be taken to avoid
691 : // overwriting or mutating the saved key or value before they have been returned
692 : // to the caller of the exported function (i.e. the caller of Next, First, etc.)
693 : //
694 : // nextInStripe may set i.err, in which case the return value will be
695 : // newStripeNewKey, and i.iterKV will be nil.
696 1 : func (i *Iter) nextInStripe() stripeChangeType {
697 1 : i.iterStripeChange = i.nextInStripeHelper()
698 1 : return i.iterStripeChange
699 1 : }
700 :
701 : // nextInStripeHelper is an internal helper for nextInStripe; callers should use
702 : // nextInStripe and not call nextInStripeHelper.
703 1 : func (i *Iter) nextInStripeHelper() stripeChangeType {
704 1 : origSnapshotIdx := i.curSnapshotIdx
705 1 : for {
706 1 : if !i.iterNext() {
707 1 : return newStripeNewKey
708 1 : }
709 1 : kv := i.iterKV
710 1 :
711 1 : // Is this a new key? There are two cases:
712 1 : //
713 1 : // 1. The new key has a different user key.
714 1 : // 2. The previous key was an interleaved range deletion or range key
715 1 : // boundary. These keys are interleaved in the same input iterator
716 1 : // stream as point keys, but they do not obey the ordinary sequence
717 1 : // number ordering within a user key. If the previous key was one
718 1 : // of these keys, we consider the new key a `newStripeNewKey` to
719 1 : // reflect that it's the beginning of a new stream of point keys.
720 1 : if i.key.IsExclusiveSentinel() || !i.cfg.Comparer.Equal(i.key.UserKey, kv.K.UserKey) {
721 1 : i.curSnapshotIdx, i.curSnapshotSeqNum = i.cfg.Snapshots.IndexAndSeqNum(kv.SeqNum())
722 1 : return newStripeNewKey
723 1 : }
724 :
725 : // If i.key and key have the same user key, then
726 : // 1. i.key must not have had a zero sequence number (or it would've be the last
727 : // key with its user key).
728 : // 2. i.key must have a strictly larger sequence number
729 : // There's an exception in that either key may be a range delete. Range
730 : // deletes may share a sequence number with a point key if the keys were
731 : // ingested together. Range keys may also share the sequence number if they
732 : // were ingested, but range keys are interleaved into the compaction
733 : // iterator's input iterator at the maximal sequence number so their
734 : // original sequence number will not be observed here.
735 1 : if prevSeqNum := base.SeqNumFromTrailer(i.keyTrailer); (prevSeqNum == 0 || prevSeqNum <= kv.SeqNum()) &&
736 1 : i.key.Kind() != base.InternalKeyKindRangeDelete && kv.Kind() != base.InternalKeyKindRangeDelete {
737 0 : prevKey := i.key
738 0 : prevKey.Trailer = i.keyTrailer
739 0 : panic(errors.AssertionFailedf("pebble: invariant violation: %s and %s out of order", prevKey, kv.K))
740 : }
741 :
742 1 : i.curSnapshotIdx, i.curSnapshotSeqNum = i.cfg.Snapshots.IndexAndSeqNum(kv.SeqNum())
743 1 : switch kv.Kind() {
744 : case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete,
745 0 : base.InternalKeyKindRangeDelete:
746 0 : // Range tombstones and range keys are interleaved at the max
747 0 : // sequence number for a given user key, and the first key after one
748 0 : // is always considered a newStripeNewKey, so we should never reach
749 0 : // this.
750 0 : panic("unreachable")
751 : case base.InternalKeyKindDelete, base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindSingleDelete,
752 1 : base.InternalKeyKindSetWithDelete, base.InternalKeyKindDeleteSized:
753 : // Fall through
754 1 : default:
755 1 : kind := i.iterKV.Kind()
756 1 : i.iterKV = nil
757 1 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(kind))
758 1 : return newStripeNewKey
759 : }
760 1 : if i.curSnapshotIdx == origSnapshotIdx {
761 1 : // Same snapshot.
762 1 : if i.tombstoneCovers(i.iterKV.K, i.curSnapshotSeqNum) == coversVisibly {
763 1 : continue
764 : }
765 1 : return sameStripe
766 : }
767 1 : return newStripeSameKey
768 : }
769 : }
770 :
771 1 : func (i *Iter) setNext() {
772 1 : // Save the current key.
773 1 : i.saveKey()
774 1 : i.value = i.iterValue
775 1 : i.maybeZeroSeqnum(i.curSnapshotIdx)
776 1 :
777 1 : // If this key is already a SETWITHDEL we can early return and skip the remaining
778 1 : // records in the stripe:
779 1 : if i.iterKV.Kind() == base.InternalKeyKindSetWithDelete {
780 1 : i.skip = true
781 1 : return
782 1 : }
783 :
784 : // We are iterating forward. Save the current value.
785 1 : i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
786 1 : i.value = i.valueBuf
787 1 :
788 1 : // Else, we continue to loop through entries in the stripe looking for a
789 1 : // DEL. Note that we may stop *before* encountering a DEL, if one exists.
790 1 : //
791 1 : // NB: nextInStripe will skip sameStripe keys that are visibly covered by a
792 1 : // RANGEDEL. This can include DELs -- this is fine since such DELs don't
793 1 : // need to be combined with SET to make SETWITHDEL.
794 1 : for {
795 1 : switch i.nextInStripe() {
796 1 : case newStripeNewKey, newStripeSameKey:
797 1 : i.pos = iterPosNext
798 1 : return
799 1 : case sameStripe:
800 1 : // We're still in the same stripe. If this is a
801 1 : // DEL/SINGLEDEL/DELSIZED, we stop looking and emit a SETWITHDEL.
802 1 : // Subsequent keys are eligible for skipping.
803 1 : switch i.iterKV.Kind() {
804 1 : case base.InternalKeyKindDelete, base.InternalKeyKindSingleDelete, base.InternalKeyKindDeleteSized:
805 1 : i.key.SetKind(base.InternalKeyKindSetWithDelete)
806 1 : i.skip = true
807 1 : return
808 1 : case base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindSetWithDelete:
809 : // Do nothing
810 0 : default:
811 0 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
812 : }
813 0 : default:
814 0 : panic("pebble: unexpected stripeChangeType: " + strconv.Itoa(int(i.iterStripeChange)))
815 : }
816 : }
817 : }
818 :
819 1 : func (i *Iter) mergeNext(valueMerger base.ValueMerger) {
820 1 : // Save the current key.
821 1 : i.saveKey()
822 1 :
823 1 : // Loop looking for older values in the current snapshot stripe and merge
824 1 : // them.
825 1 : for {
826 1 : if i.nextInStripe() != sameStripe {
827 1 : i.pos = iterPosNext
828 1 : return
829 1 : }
830 1 : if i.err != nil {
831 0 : panic(i.err)
832 : }
833 : // NB: MERGE#10+RANGEDEL#9 stays a MERGE, since nextInStripe skips
834 : // sameStripe keys that are visibly covered by a RANGEDEL. There may be
835 : // MERGE#7 that is invisibly covered and will be preserved, but there is
836 : // no risk that MERGE#10 and MERGE#7 will get merged in the future as
837 : // the RANGEDEL still exists and will be used in user-facing reads that
838 : // see MERGE#10, and will also eventually cause MERGE#7 to be deleted in
839 : // a compaction.
840 1 : key := i.iterKV
841 1 : switch key.Kind() {
842 1 : case base.InternalKeyKindDelete, base.InternalKeyKindSingleDelete, base.InternalKeyKindDeleteSized:
843 1 : // We've hit a deletion tombstone. Return everything up to this point and
844 1 : // then skip entries until the next snapshot stripe. We change the kind
845 1 : // of the result key to a Set so that it shadows keys in lower
846 1 : // levels. That is, MERGE+DEL -> SETWITHDEL.
847 1 : //
848 1 : // We do the same for SingleDelete since SingleDelete is only
849 1 : // permitted (with deterministic behavior) for keys that have been
850 1 : // set once since the last SingleDelete/Delete, so everything
851 1 : // older is acceptable to shadow. Note that this is slightly
852 1 : // different from singleDeleteNext() which implements stricter
853 1 : // semantics in terms of applying the SingleDelete to the single
854 1 : // next Set. But those stricter semantics are not observable to
855 1 : // the end-user since Iterator interprets SingleDelete as Delete.
856 1 : // We could do something more complicated here and consume only a
857 1 : // single Set, and then merge in any following Sets, but that is
858 1 : // complicated wrt code and unnecessary given the narrow permitted
859 1 : // use of SingleDelete.
860 1 : i.key.SetKind(base.InternalKeyKindSetWithDelete)
861 1 : i.skip = true
862 1 : return
863 :
864 1 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
865 1 : // We've hit a Set or SetWithDel value. Merge with the existing
866 1 : // value and return. We change the kind of the resulting key to a
867 1 : // Set so that it shadows keys in lower levels. That is:
868 1 : // MERGE + (SET*) -> SET.
869 1 : i.err = valueMerger.MergeOlder(i.iterValue)
870 1 : if i.err != nil {
871 0 : return
872 0 : }
873 1 : i.key.SetKind(base.InternalKeyKindSet)
874 1 : i.skip = true
875 1 : return
876 :
877 1 : case base.InternalKeyKindMerge:
878 1 : // We've hit another Merge value. Merge with the existing value and
879 1 : // continue looping.
880 1 : i.err = valueMerger.MergeOlder(i.iterValue)
881 1 : if i.err != nil {
882 0 : return
883 0 : }
884 :
885 0 : default:
886 0 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
887 0 : return
888 : }
889 : }
890 : }
891 :
892 : // singleDeleteNext processes a SingleDelete point tombstone. A SingleDelete, or
893 : // SINGLEDEL, is unique in that it deletes exactly 1 internal key. It's a
894 : // performance optimization when the client knows a user key has not been
895 : // overwritten, allowing the elision of the tombstone earlier, avoiding write
896 : // amplification.
897 : //
898 : // singleDeleteNext returns a boolean indicating whether or not the caller
899 : // should yield the SingleDelete key to the consumer of the Iter. If
900 : // singleDeleteNext returns false, the caller may consume/elide the
901 : // SingleDelete.
902 1 : func (i *Iter) singleDeleteNext() bool {
903 1 : // Save the current key.
904 1 : i.saveKey()
905 1 : i.value = i.iterValue
906 1 :
907 1 : // Loop until finds a key to be passed to the next level.
908 1 : for {
909 1 : // If we find a key that can't be skipped, return true so that the
910 1 : // caller yields the SingleDelete to the caller.
911 1 : if i.nextInStripe() != sameStripe {
912 1 : // This defers additional error checking regarding single delete
913 1 : // invariants to the compaction where the keys with the same user key as
914 1 : // the single delete are in the same stripe.
915 1 : i.pos = iterPosNext
916 1 : return i.err == nil
917 1 : }
918 1 : if i.err != nil {
919 0 : panic(i.err)
920 : }
921 : // INVARIANT: sameStripe.
922 1 : key := i.iterKV
923 1 : kind := key.Kind()
924 1 : switch kind {
925 1 : case base.InternalKeyKindDelete, base.InternalKeyKindSetWithDelete, base.InternalKeyKindDeleteSized:
926 1 : if kind == base.InternalKeyKindDelete || kind == base.InternalKeyKindDeleteSized {
927 1 : i.cfg.IneffectualSingleDeleteCallback(i.key.UserKey)
928 1 : }
929 : // We've hit a Delete, DeleteSized, SetWithDelete, transform
930 : // the SingleDelete into a full Delete.
931 1 : i.key.SetKind(base.InternalKeyKindDelete)
932 1 : i.skip = true
933 1 : return true
934 :
935 1 : case base.InternalKeyKindSet, base.InternalKeyKindMerge:
936 1 : // This SingleDelete deletes the Set/Merge, and we can now elide the
937 1 : // SingleDel as well. We advance past the Set and return false to
938 1 : // indicate to the main compaction loop that we should NOT yield the
939 1 : // current SingleDel key to the compaction loop.
940 1 : //
941 1 : // NB: singleDeleteNext was called with i.pos == iterPosCurForward, and
942 1 : // after the call to nextInStripe, we are still at iterPosCurForward,
943 1 : // since we are at the key after the Set/Merge that was single deleted.
944 1 : change := i.nextInStripe()
945 1 : switch change {
946 1 : case sameStripe, newStripeSameKey:
947 1 : // On the same user key.
948 1 : nextKind := i.iterKV.Kind()
949 1 : switch nextKind {
950 1 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete, base.InternalKeyKindMerge:
951 1 : if i.cfg.SingleDeleteInvariantViolationCallback != nil {
952 1 : // sameStripe keys returned by nextInStripe() are already
953 1 : // known to not be covered by a RANGEDEL, so it is an invariant
954 1 : // violation. The rare case is newStripeSameKey, where it is a
955 1 : // violation if not covered by a RANGEDEL.
956 1 : if change == sameStripe ||
957 1 : i.tombstoneCovers(i.iterKV.K, i.curSnapshotSeqNum) == noCover {
958 1 : i.cfg.SingleDeleteInvariantViolationCallback(i.key.UserKey)
959 1 : }
960 : }
961 1 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
962 0 : default:
963 0 : panic(errors.AssertionFailedf(
964 0 : "unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
965 : }
966 1 : case newStripeNewKey:
967 0 : default:
968 0 : panic("unreachable")
969 : }
970 1 : return false
971 :
972 1 : case base.InternalKeyKindSingleDelete:
973 1 : // Two single deletes met in a compaction. The first single delete is
974 1 : // ineffectual.
975 1 : i.cfg.IneffectualSingleDeleteCallback(i.key.UserKey)
976 1 : // Continue to apply the second single delete.
977 1 : continue
978 :
979 0 : default:
980 0 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
981 0 : return false
982 : }
983 : }
984 : }
985 :
986 : // skipDueToSingleDeleteElision is called when the SingleDelete is being
987 : // elided because it is in the final snapshot stripe and there are no keys
988 : // with the same user key in lower levels in the LSM (below the files in this
989 : // compaction).
990 : //
991 : // TODO(sumeer): the only difference between singleDeleteNext and
992 : // skipDueToSingleDeleteElision is the fact that the caller knows it will be
993 : // eliding the single delete in the latter case. There are some similar things
994 : // happening in both implementations. My first attempt at combining them into
995 : // a single method was hard to comprehend. Try again.
996 1 : func (i *Iter) skipDueToSingleDeleteElision() {
997 1 : for {
998 1 : stripeChange := i.nextInStripe()
999 1 : if i.err != nil {
1000 0 : panic(i.err)
1001 : }
1002 1 : switch stripeChange {
1003 1 : case newStripeNewKey:
1004 1 : // The single delete is only now being elided, meaning it did not elide
1005 1 : // any keys earlier in its descent down the LSM. We stepped onto a new
1006 1 : // user key, meaning that even now at its moment of elision, it still
1007 1 : // hasn't elided any other keys. The single delete was ineffectual (a
1008 1 : // no-op).
1009 1 : i.cfg.IneffectualSingleDeleteCallback(i.key.UserKey)
1010 1 : i.skip = false
1011 1 : return
1012 0 : case newStripeSameKey:
1013 0 : // This should be impossible. If we're eliding a single delete, we
1014 0 : // determined that the tombstone is in the final snapshot stripe, but we
1015 0 : // stepped into a new stripe of the same key.
1016 0 : panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe"))
1017 1 : case sameStripe:
1018 1 : kind := i.iterKV.Kind()
1019 1 : switch kind {
1020 1 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
1021 1 : i.cfg.IneffectualSingleDeleteCallback(i.key.UserKey)
1022 1 : switch kind {
1023 1 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized:
1024 1 : i.skipInStripe()
1025 1 : return
1026 1 : case base.InternalKeyKindSingleDelete:
1027 1 : // Repeat the same with this SingleDelete. We don't want to simply
1028 1 : // call skipInStripe(), since it increases the strength of the
1029 1 : // SingleDel, which hides bugs in the use of single delete.
1030 1 : continue
1031 0 : default:
1032 0 : panic(errors.AssertionFailedf(
1033 0 : "unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
1034 : }
1035 1 : case base.InternalKeyKindSetWithDelete:
1036 1 : // The SingleDelete should behave like a Delete.
1037 1 : i.skipInStripe()
1038 1 : return
1039 1 : case base.InternalKeyKindSet, base.InternalKeyKindMerge:
1040 1 : // This SingleDelete deletes the Set/Merge, and we are eliding the
1041 1 : // SingleDel as well. Step to the next key (this is not deleted by the
1042 1 : // SingleDelete).
1043 1 : //
1044 1 : // NB: skipDueToSingleDeleteElision was called with i.pos ==
1045 1 : // iterPosCurForward, and after the call to nextInStripe, we are still
1046 1 : // at iterPosCurForward, since we are at the key after the Set/Merge
1047 1 : // that was single deleted.
1048 1 : change := i.nextInStripe()
1049 1 : if i.err != nil {
1050 0 : panic(i.err)
1051 : }
1052 1 : switch change {
1053 0 : case newStripeSameKey:
1054 0 : panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe"))
1055 1 : case newStripeNewKey:
1056 0 : case sameStripe:
1057 0 : // On the same key.
1058 0 : nextKind := i.iterKV.Kind()
1059 0 : switch nextKind {
1060 0 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete, base.InternalKeyKindMerge:
1061 0 : if i.cfg.SingleDeleteInvariantViolationCallback != nil {
1062 0 : i.cfg.SingleDeleteInvariantViolationCallback(i.key.UserKey)
1063 0 : }
1064 0 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
1065 0 : default:
1066 0 : panic(errors.AssertionFailedf(
1067 0 : "unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
1068 : }
1069 0 : default:
1070 0 : panic("unreachable")
1071 : }
1072 : // Whether in same stripe or new stripe, this key is not consumed by
1073 : // the SingleDelete.
1074 1 : i.skip = false
1075 1 : return
1076 0 : default:
1077 0 : panic(errors.AssertionFailedf(
1078 0 : "unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
1079 : }
1080 0 : default:
1081 0 : panic("unreachable")
1082 : }
1083 : }
1084 : }
1085 :
1086 : // deleteSizedNext processes a DELSIZED point tombstone. Unlike ordinary DELs,
1087 : // these tombstones carry a value that's a varint indicating the size of the
1088 : // entry (len(key)+len(value)) that the tombstone is expected to delete.
1089 : //
1090 : // When a deleteSizedNext is encountered, we skip ahead to see which keys, if
1091 : // any, are elided as a result of the tombstone.
1092 1 : func (i *Iter) deleteSizedNext() (*base.InternalKey, []byte) {
1093 1 : i.saveKey()
1094 1 : i.skip = true
1095 1 :
1096 1 : // The DELSIZED tombstone may have no value at all. This happens when the
1097 1 : // tombstone has already deleted the key that the user originally predicted.
1098 1 : // In this case, we still peek forward in case there's another DELSIZED key
1099 1 : // with a lower sequence number, in which case we'll adopt its value.
1100 1 : if len(i.iterValue) == 0 {
1101 1 : i.value = i.valueBuf[:0]
1102 1 : } else {
1103 1 : i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
1104 1 : i.value = i.valueBuf
1105 1 : }
1106 :
1107 : // Loop through all the keys within this stripe that are skippable.
1108 1 : i.pos = iterPosNext
1109 1 : for i.nextInStripe() == sameStripe {
1110 1 : if i.err != nil {
1111 0 : panic(i.err)
1112 : }
1113 1 : switch i.iterKV.Kind() {
1114 1 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
1115 1 : // We encountered a tombstone (DEL, or DELSIZED) that's deleted by
1116 1 : // the original DELSIZED tombstone. This can happen in two cases:
1117 1 : //
1118 1 : // (1) These tombstones were intended to delete two distinct values,
1119 1 : // and this DELSIZED has already dropped the relevant key. For
1120 1 : // example:
1121 1 : //
1122 1 : // a.DELSIZED.9 a.SET.7 a.DELSIZED.5 a.SET.4
1123 1 : //
1124 1 : // If a.DELSIZED.9 has already deleted a.SET.7, its size has
1125 1 : // already been zeroed out. In this case, we want to adopt the
1126 1 : // value of the DELSIZED with the lower sequence number, in
1127 1 : // case the a.SET.4 key has not yet been elided.
1128 1 : //
1129 1 : // (2) This DELSIZED was missized. The user thought they were
1130 1 : // deleting a key with this user key, but this user key had
1131 1 : // already been deleted.
1132 1 : //
1133 1 : // We can differentiate these two cases by examining the length of
1134 1 : // the DELSIZED's value. A DELSIZED's value holds the size of both
1135 1 : // the user key and value that it intends to delete. For any user
1136 1 : // key with a length > 0, a DELSIZED that has not deleted a key must
1137 1 : // have a value with a length > 0.
1138 1 : //
1139 1 : // We treat both cases the same functionally, adopting the identity
1140 1 : // of the lower-sequence numbered tombstone. However in the second
1141 1 : // case, we also increment the stat counting missized tombstones.
1142 1 : if len(i.value) > 0 {
1143 1 : // The original DELSIZED key was missized. The key that the user
1144 1 : // thought they were deleting does not exist.
1145 1 : i.stats.CountMissizedDels++
1146 1 : }
1147 1 : i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
1148 1 : i.value = i.valueBuf
1149 1 : if i.iterKV.Kind() != base.InternalKeyKindDeleteSized {
1150 1 : // Convert the DELSIZED to a DEL—The DEL/SINGLEDEL we're eliding
1151 1 : // may not have deleted the key(s) it was intended to yet. The
1152 1 : // ordinary DEL compaction heuristics are better suited at that,
1153 1 : // plus we don't want to count it as a missized DEL. We early
1154 1 : // exit in this case, after skipping the remainder of the
1155 1 : // snapshot stripe.
1156 1 : i.key.SetKind(base.InternalKeyKindDelete)
1157 1 : // NB: We skipInStripe now, rather than returning leaving
1158 1 : // i.skip=true and returning early, because Next() requires
1159 1 : // that i.skip=true only if i.iterPos = iterPosCurForward.
1160 1 : //
1161 1 : // Ignore any error caused by skipInStripe since it does not affect
1162 1 : // the key/value being returned here, and the next call to Next() will
1163 1 : // expose it.
1164 1 : i.skipInStripe()
1165 1 : return &i.key, i.value
1166 1 : }
1167 : // Continue, in case we uncover another DELSIZED or a key this
1168 : // DELSIZED deletes.
1169 :
1170 1 : case base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindSetWithDelete:
1171 1 : // If the DELSIZED is value-less, it already deleted the key that it
1172 1 : // was intended to delete. This is possible with a sequence like:
1173 1 : //
1174 1 : // DELSIZED.8 SET.7 SET.3
1175 1 : //
1176 1 : // The DELSIZED only describes the size of the SET.7, which in this
1177 1 : // case has already been elided. We don't count it as a missizing,
1178 1 : // instead converting the DELSIZED to a DEL. Skip the remainder of
1179 1 : // the snapshot stripe and return.
1180 1 : if len(i.value) == 0 {
1181 1 : i.key.SetKind(base.InternalKeyKindDelete)
1182 1 : // NB: We skipInStripe now, rather than returning leaving
1183 1 : // i.skip=true and returning early, because Next() requires
1184 1 : // that i.skip=true only if i.iterPos = iterPosCurForward.
1185 1 : //
1186 1 : // Ignore any error caused by skipInStripe since it does not affect
1187 1 : // the key/value being returned here, and the next call to Next() will
1188 1 : // expose it.
1189 1 : i.skipInStripe()
1190 1 : return &i.key, i.value
1191 1 : }
1192 : // The deleted key is not a DEL, DELSIZED, and the DELSIZED in i.key
1193 : // has a positive size.
1194 1 : expectedSize, n := binary.Uvarint(i.value)
1195 1 : if n != len(i.value) {
1196 1 : i.err = base.CorruptionErrorf("DELSIZED holds invalid value: %x", errors.Safe(i.value))
1197 1 : return nil, nil
1198 1 : }
1199 1 : elidedSize := uint64(len(i.iterKV.K.UserKey)) + uint64(len(i.iterValue))
1200 1 : if elidedSize != expectedSize {
1201 1 : // The original DELSIZED key was missized. It's unclear what to
1202 1 : // do. The user-provided size was wrong, so it's unlikely to be
1203 1 : // accurate or meaningful. We could:
1204 1 : //
1205 1 : // 1. return the DELSIZED with the original user-provided size unmodified
1206 1 : // 2. return the DELZIZED with a zeroed size to reflect that a key was
1207 1 : // elided, even if it wasn't the anticipated size.
1208 1 : // 3. subtract the elided size from the estimate and re-encode.
1209 1 : // 4. convert the DELSIZED into a value-less DEL, so that
1210 1 : // ordinary DEL heuristics apply.
1211 1 : //
1212 1 : // We opt for (4) under the rationale that we can't rely on the
1213 1 : // user-provided size for accuracy, so ordinary DEL heuristics
1214 1 : // are safer.
1215 1 : i.stats.CountMissizedDels++
1216 1 : i.key.SetKind(base.InternalKeyKindDelete)
1217 1 : i.value = i.valueBuf[:0]
1218 1 : // NB: We skipInStripe now, rather than returning leaving
1219 1 : // i.skip=true and returning early, because Next() requires
1220 1 : // that i.skip=true only if i.iterPos = iterPosCurForward.
1221 1 : //
1222 1 : // Ignore any error caused by skipInStripe since it does not affect
1223 1 : // the key/value being returned here, and the next call to Next() will
1224 1 : // expose it.
1225 1 : i.skipInStripe()
1226 1 : return &i.key, i.value
1227 1 : }
1228 : // NB: We remove the value regardless of whether the key was sized
1229 : // appropriately. The size encoded is 'consumed' the first time it
1230 : // meets a key that it deletes.
1231 1 : i.value = i.valueBuf[:0]
1232 :
1233 0 : default:
1234 0 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
1235 0 : return nil, nil
1236 : }
1237 : }
1238 :
1239 1 : if i.iterStripeChange == sameStripe {
1240 0 : panic(errors.AssertionFailedf("unexpectedly found iter stripe change = %d", i.iterStripeChange))
1241 : }
1242 : // We landed outside the original stripe. Reset skip.
1243 1 : i.skip = false
1244 1 : if i.err != nil {
1245 0 : return nil, nil
1246 0 : }
1247 1 : return &i.key, i.value
1248 : }
1249 :
1250 1 : func (i *Iter) saveKey() {
1251 1 : i.keyBuf = append(i.keyBuf[:0], i.iterKV.K.UserKey...)
1252 1 : i.key = base.InternalKey{
1253 1 : UserKey: i.keyBuf,
1254 1 : Trailer: i.iterKV.K.Trailer,
1255 1 : }
1256 1 : i.keyTrailer = i.key.Trailer
1257 1 : i.frontiers.Advance(i.key.UserKey)
1258 1 : }
1259 :
1260 : // Error returns any error encountered.
1261 : //
1262 : // Note that Close will return the error as well.
1263 1 : func (i *Iter) Error() error {
1264 1 : return i.err
1265 1 : }
1266 :
1267 : // Close the iterator.
1268 1 : func (i *Iter) Close() error {
1269 1 : err := i.iter.Close()
1270 1 : if i.err == nil {
1271 1 : i.err = err
1272 1 : }
1273 :
1274 : // Close the closer for the current value if one was open.
1275 1 : if i.valueCloser != nil {
1276 0 : i.err = errors.CombineErrors(i.err, i.valueCloser.Close())
1277 0 : i.valueCloser = nil
1278 0 : }
1279 :
1280 1 : return i.err
1281 : }
1282 :
1283 : // cover is returned by tombstoneCovers and describes a span's relationship to
1284 : // a key at a particular snapshot.
1285 : type cover int8
1286 :
1287 : const (
1288 : // noCover indicates the tested key does not fall within the span's bounds,
1289 : // or the span contains no keys with sequence numbers higher than the key's.
1290 : noCover cover = iota
1291 :
1292 : // coversInvisibly indicates the tested key does fall within the span's
1293 : // bounds and the span contains at least one key with a higher sequence
1294 : // number, but none visible at the provided snapshot.
1295 : coversInvisibly
1296 :
1297 : // coversVisibly indicates the tested key does fall within the span's
1298 : // bounds, and the span constains at least one key with a sequence number
1299 : // higher than the key's sequence number that is visible at the provided
1300 : // snapshot.
1301 : coversVisibly
1302 : )
1303 :
1304 : // tombstoneCovers returns whether the key is covered by a tombstone and whether
1305 : // it is covered by a tombstone visible in the given snapshot.
1306 : //
1307 : // The key's UserKey must be greater or equal to the last span Start key passed
1308 : // to AddTombstoneSpan. The keys passed to tombstoneCovers calls must be
1309 : // ordered.
1310 1 : func (i *Iter) tombstoneCovers(key base.InternalKey, snapshot uint64) cover {
1311 1 : if i.lastRangeDelSpan.Empty() {
1312 1 : return noCover
1313 1 : }
1314 1 : if invariants.Enabled && i.cmp(key.UserKey, i.lastRangeDelSpan.Start) < 0 {
1315 0 : panic(errors.AssertionFailedf("invalid key %q, last span %s", key, i.lastRangeDelSpan))
1316 : }
1317 1 : if i.cmp(key.UserKey, i.lastRangeDelSpan.End) >= 0 {
1318 1 : i.lastRangeDelSpan.Reset()
1319 1 : return noCover
1320 1 : }
1321 : // The Covers() check is very cheap, so we want to do that first.
1322 1 : switch {
1323 1 : case !i.lastRangeDelSpan.Covers(key.SeqNum()):
1324 1 : return noCover
1325 1 : case i.lastRangeDelSpan.CoversAt(snapshot, key.SeqNum()):
1326 1 : return coversVisibly
1327 1 : default:
1328 1 : return coversInvisibly
1329 : }
1330 : }
1331 :
1332 : // maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing
1333 : // so improves compression and enables an optimization during forward iteration
1334 : // to skip some key comparisons. The seqnum for an entry can be zeroed if the
1335 : // entry is on the bottom snapshot stripe and on the bottom level of the LSM.
1336 1 : func (i *Iter) maybeZeroSeqnum(snapshotIdx int) {
1337 1 : if !i.cfg.AllowZeroSeqNum {
1338 1 : // TODO(peter): allowZeroSeqNum applies to the entire compaction. We could
1339 1 : // make the determination on a key by key basis, similar to what is done
1340 1 : // for elideTombstone. Need to add a benchmark for Iter to verify
1341 1 : // that isn't too expensive.
1342 1 : return
1343 1 : }
1344 1 : if snapshotIdx > 0 {
1345 1 : // This is not the last snapshot
1346 1 : return
1347 1 : }
1348 1 : i.key.SetSeqNum(base.SeqNumZero)
1349 : }
1350 :
1351 : func finishValueMerger(
1352 : valueMerger base.ValueMerger, includesBase bool,
1353 1 : ) (value []byte, needDelete bool, closer io.Closer, err error) {
1354 1 : if valueMerger2, ok := valueMerger.(base.DeletableValueMerger); ok {
1355 1 : value, needDelete, closer, err = valueMerger2.DeletableFinish(includesBase)
1356 1 : } else {
1357 1 : value, closer, err = valueMerger.Finish(includesBase)
1358 1 : }
1359 1 : return
1360 : }
|