Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package compact
6 :
7 : import (
8 : "encoding/binary"
9 : "io"
10 : "strconv"
11 :
12 : "github.com/cockroachdb/errors"
13 : "github.com/cockroachdb/pebble/internal/base"
14 : "github.com/cockroachdb/pebble/internal/invalidating"
15 : "github.com/cockroachdb/pebble/internal/invariants"
16 : "github.com/cockroachdb/pebble/internal/keyspan"
17 : "github.com/cockroachdb/pebble/internal/rangekey"
18 : "github.com/cockroachdb/redact"
19 : )
20 :
21 : // Iter provides a forward-only iterator that encapsulates the logic for
22 : // collapsing entries during compaction. It wraps an internal iterator and
23 : // collapses entries that are no longer necessary because they are shadowed by
24 : // newer entries. The simplest example of this is when the internal iterator
25 : // contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries,
26 : // compact.Iter collapses the second entry because it is no longer necessary.
27 : // The high-level structure for compact.Iter is to iterate over its internal
28 : // iterator and output 1 entry for every user-key. There are four complications
29 : // to this story.
30 : //
31 : // 1. Eliding Deletion Tombstones
32 : //
33 : // Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to
34 : // a.DEL.2. Do we have to output the entry a.DEL.2? Only if a.DEL.2 possibly
35 : // shadows an entry at a lower level. If we're compacting to the base-level in
36 : // the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower
37 : // level and can be elided.
38 : //
39 : // We can do slightly better than only eliding deletion tombstones at the base
40 : // level by observing that we can elide a deletion tombstone if there are no
41 : // sstables that contain the entry's key. This check is performed by
42 : // elideTombstone.
43 : //
44 : // 2. Merges
45 : //
46 : // The MERGE operation merges the value for an entry with the existing value
47 : // for an entry. The logical value of an entry can be composed of a series of
48 : // merge operations. When compact.Iter sees a MERGE, it scans forward in its
49 : // internal iterator collapsing MERGE operations for the same key until it
50 : // encounters a SET or DELETE operation. For example, the keys a.MERGE.4,
51 : // a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be
52 : // merged using the specified Merger.
53 : //
54 : // An interesting case here occurs when MERGE is combined with SET. Consider
55 : // the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The
56 : // reason that the kind is changed to SET is because the SET operation acts as
57 : // a barrier preventing further merging. This can be seen better in the
58 : // scenario a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at lower
59 : // (older) level and not involved in the compaction. If the compaction of
60 : // a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with
61 : // a.MERGE.1 would merge the values together incorrectly.
62 : //
63 : // 3. Snapshots
64 : //
65 : // Snapshots are lightweight point-in-time views of the DB state. At its core,
66 : // a snapshot is a sequence number along with a guarantee from Pebble that it
67 : // will maintain the view of the database at that sequence number. Part of this
68 : // guarantee is relatively straightforward to achieve. When reading from the
69 : // database Pebble will ignore sequence numbers that are larger than the
70 : // snapshot sequence number. The primary complexity with snapshots occurs
71 : // during compaction: the collapsing of entries that are shadowed by newer
72 : // entries is at odds with the guarantee that Pebble will maintain the view of
73 : // the database at the snapshot sequence number. Rather than collapsing entries
74 : // up to the next user key, compact.Iter can only collapse entries up to the
75 : // next snapshot boundary. That is, every snapshot boundary potentially causes
76 : // another entry for the same user-key to be emitted. Another way to view this
77 : // is that snapshots define stripes and entries are collapsed within stripes,
78 : // but not across stripes. Consider the following scenario:
79 : //
80 : // a.PUT.9
81 : // a.DEL.8
82 : // a.PUT.7
83 : // a.DEL.6
84 : // a.PUT.5
85 : //
86 : // In the absence of snapshots these entries would be collapsed to
87 : // a.PUT.9. What if there is a snapshot at sequence number 7? The entries can
88 : // be divided into two stripes and collapsed within the stripes:
89 : //
90 : // a.PUT.9 a.PUT.9
91 : // a.DEL.8 --->
92 : // a.PUT.7
93 : // -- --
94 : // a.DEL.6 ---> a.DEL.6
95 : // a.PUT.5
96 : //
97 : // All of the rules described earlier still apply, but they are confined to
98 : // operate within a snapshot stripe. Snapshots only affect compaction when the
99 : // snapshot sequence number lies within the range of sequence numbers being
100 : // compacted. In the above example, a snapshot at sequence number 10 or at
101 : // sequence number 5 would not have any effect.
102 : //
103 : // 4. Range Deletions
104 : //
105 : // Range deletions provide the ability to delete all of the keys (and values)
106 : // in a contiguous range. Range deletions are stored indexed by their start
107 : // key. The end key of the range is stored in the value. In order to support
108 : // lookup of the range deletions which overlap with a particular key, the range
109 : // deletion tombstones need to be fragmented whenever they overlap. This
110 : // fragmentation is performed by keyspan.Fragmenter. The fragments are then
111 : // subject to the rules for snapshots. For example, consider the two range
112 : // tombstones [a,e)#1 and [c,g)#2:
113 : //
114 : // 2: c-------g
115 : // 1: a-------e
116 : //
117 : // These tombstones will be fragmented into:
118 : //
119 : // 2: c---e---g
120 : // 1: a---c---e
121 : //
122 : // Do we output the fragment [c,e)#1? Since it is covered by [c-e]#2 the answer
123 : // depends on whether it is in a new snapshot stripe.
124 : //
125 : // In addition to the fragmentation of range tombstones, compaction also needs
126 : // to take the range tombstones into consideration when outputting normal
127 : // keys. Just as with point deletions, a range deletion covering an entry can
128 : // cause the entry to be elided.
129 : //
130 : // A note on the stability of keys and values.
131 : //
132 : // The stability guarantees of keys and values returned by the iterator tree
133 : // that backs a compact.Iter is nuanced and care must be taken when
134 : // referencing any returned items.
135 : //
136 : // Keys and values returned by exported functions (i.e. First, Next, etc.) have
137 : // lifetimes that fall into two categories:
138 : //
139 : // Lifetime valid for duration of compaction. Range deletion keys and values are
140 : // stable for the duration of the compaction, due to way in which a
141 : // compact.Iter is typically constructed (i.e. via (*compaction).newInputIter,
142 : // which wraps the iterator over the range deletion block in a noCloseIter,
143 : // preventing the release of the backing memory until the compaction is
144 : // finished).
145 : //
146 : // Lifetime limited to duration of sstable block liveness. Point keys (SET, DEL,
147 : // etc.) and values must be cloned / copied following the return from the
148 : // exported function, and before a subsequent call to Next advances the iterator
149 : // and mutates the contents of the returned key and value.
150 : type Iter struct {
151 : cmp base.Compare
152 : suffixCmp base.CompareSuffixes
153 :
154 : cfg IterConfig
155 :
156 : // rangeDelInterlaving is an interleaving iterator for range deletions, that
157 : // interleaves range tombstones among the point keys.
158 : rangeDelInterleaving keyspan.InterleavingIter
159 : // rangeKeyInterleaving is the interleaving iter for range keys.
160 : rangeKeyInterleaving keyspan.InterleavingIter
161 :
162 : // iter is the iterator which interleaves points with RANGEDELs and range
163 : // keys.
164 : iter base.InternalIterator
165 :
166 : delElider pointTombstoneElider
167 : rangeDelCompactor RangeDelSpanCompactor
168 : rangeKeyCompactor RangeKeySpanCompactor
169 : err error
170 : // `key.UserKey` is set to `keyBuf` caused by saving `i.iterKV.UserKey`
171 : // and `key.InternalKeyTrailer` is set to `i.iterKV.InternalKeyTrailer`. This is the
172 : // case on return from all public methods -- these methods return `key`.
173 : // Additionally, it is the internal state when the code is moving to the
174 : // next key so it can determine whether the user key has changed from
175 : // the previous key.
176 : key base.InternalKey
177 : // keyTrailer is updated when `i.key` is updated and holds the key's
178 : // original trailer (eg, before any sequence-number zeroing or changes to
179 : // key kind).
180 : keyTrailer base.InternalKeyTrailer
181 : value []byte
182 : valueCloser io.Closer
183 : // Temporary buffer used for storing the previous user key in order to
184 : // determine when iteration has advanced to a new user key and thus a new
185 : // snapshot stripe.
186 : keyBuf []byte
187 : // Temporary buffer used for storing the previous value, which may be an
188 : // unsafe, i.iter-owned slice that could be altered when the iterator is
189 : // advanced.
190 : valueBuf []byte
191 : iterKV *base.InternalKV
192 : iterValue []byte
193 : iterStripeChange stripeChangeType
194 : // skip indicates whether the remaining entries in the current snapshot
195 : // stripe should be skipped or processed. `skip` has no effect when `pos ==
196 : // iterPosNext`.
197 : skip bool
198 : // pos indicates the iterator position at the top of Next():
199 : // - iterPosCurForward: the iterator is at the last key returned.
200 : // - iterPosNext: the iterator has already been advanced to the next
201 : // candidate key. For example, this happens when processing merge operands,
202 : // where we advance the iterator all the way into the next stripe or next
203 : // user key to ensure we've seen all mergeable operands.
204 : pos iterPos
205 : // snapshotPinned indicates whether the last point key returned by the
206 : // compaction iterator was only returned because an open snapshot prevents
207 : // its elision. This field only applies to point keys, and not to range
208 : // deletions or range keys.
209 : snapshotPinned bool
210 : // forceObsoleteDueToRangeDel is set to true in a subset of the cases that
211 : // snapshotPinned is true. This value is true when the point is obsolete due
212 : // to a RANGEDEL but could not be deleted due to a snapshot.
213 : //
214 : // NB: it may seem that the additional cases that snapshotPinned captures
215 : // are harmless in that they can also be used to mark a point as obsolete
216 : // (it is merely a duplication of some logic that happens in
217 : // Writer.AddWithForceObsolete), but that is not quite accurate as of this
218 : // writing -- snapshotPinned originated in stats collection and for a
219 : // sequence MERGE, SET, where the MERGE cannot merge with the (older) SET
220 : // due to a snapshot, the snapshotPinned value for the SET is true.
221 : //
222 : // TODO(sumeer,jackson): improve the logic of snapshotPinned and reconsider
223 : // whether we need forceObsoleteDueToRangeDel.
224 : forceObsoleteDueToRangeDel bool
225 : // The index of the snapshot for the current key within the snapshots slice.
226 : curSnapshotIdx int
227 : curSnapshotSeqNum base.SeqNum
228 : // frontiers holds a heap of user keys that affect compaction behavior when
229 : // they're exceeded. Before a new key is returned, the compaction iterator
230 : // advances the frontier, notifying any code that subscribed to be notified
231 : // when a key was reached. The primary use today is within the
232 : // implementation of compactionOutputSplitters in compaction.go. Many of
233 : // these splitters wait for the compaction iterator to call Advance(k) when
234 : // it's returning a new key. If the key that they're waiting for is
235 : // surpassed, these splitters update internal state recording that they
236 : // should request a compaction split next time they're asked in
237 : // [shouldSplitBefore].
238 : frontiers Frontiers
239 :
240 : // lastRangeDelSpan stores the last, not compacted tombstone span. It is used
241 : // to elide points or mark them as snapshot-pinned.
242 : lastRangeDelSpan keyspan.Span
243 : // lastRangeDelSpanFrontier is the frontier used to clear out lastRangeDelSpan
244 : // when we move beyond its end key.
245 : lastRangeDelSpanFrontier frontier
246 :
247 : // span stores the last, compacted tombstone or range key span. It is provided
248 : // to the caller via Span().
249 : span keyspan.Span
250 :
251 : stats IterStats
252 : }
253 :
254 : // IterConfig contains the parameters necessary to create a compaction iterator.
255 : type IterConfig struct {
256 : Comparer *base.Comparer
257 : Merge base.Merge
258 :
259 : // The snapshot sequence numbers that need to be maintained. These sequence
260 : // numbers define the snapshot stripes.
261 : Snapshots Snapshots
262 :
263 : TombstoneElision TombstoneElision
264 : RangeKeyElision TombstoneElision
265 :
266 : // AllowZeroSeqNum allows the sequence number of KVs in the bottom snapshot
267 : // stripe to be simplified to 0 (which improves compression and enables an
268 : // optimization during forward iteration). This can be enabled if there are no
269 : // tables overlapping the output at lower levels (than the output) in the LSM.
270 : AllowZeroSeqNum bool
271 :
272 : // IneffectualPointDeleteCallback is called if a SINGLEDEL is being elided
273 : // without deleting a point set/merge.
274 : IneffectualSingleDeleteCallback func(userKey []byte)
275 :
276 : // SingleDeleteInvariantViolationCallback is called in compactions/flushes if any
277 : // single delete has consumed a Set/Merge, and there is another immediately older
278 : // Set/SetWithDelete/Merge. The user of Pebble has violated the invariant under
279 : // which SingleDelete can be used correctly.
280 : SingleDeleteInvariantViolationCallback func(userKey []byte)
281 : }
282 :
283 1 : func (c *IterConfig) ensureDefaults() {
284 1 : if c.IneffectualSingleDeleteCallback == nil {
285 1 : c.IneffectualSingleDeleteCallback = func(userKey []byte) {}
286 : }
287 : }
288 :
289 : // IterStats are statistics produced by the compaction iterator.
290 : type IterStats struct {
291 : // Count of DELSIZED keys that were missized.
292 : CountMissizedDels uint64
293 : }
294 :
295 : type iterPos int8
296 :
297 : const (
298 : iterPosCurForward iterPos = 0
299 : iterPosNext iterPos = 1
300 : )
301 :
302 : // NewIter creates a new compaction iterator. See the comment for Iter for a
303 : // detailed description.
304 : // rangeDelIter and rangeKeyIter can be nil.
305 : func NewIter(
306 : cfg IterConfig,
307 : pointIter base.InternalIterator,
308 : rangeDelIter, rangeKeyIter keyspan.FragmentIterator,
309 1 : ) *Iter {
310 1 : cfg.ensureDefaults()
311 1 : i := &Iter{
312 1 : cmp: cfg.Comparer.Compare,
313 1 : suffixCmp: cfg.Comparer.CompareSuffixes,
314 1 : cfg: cfg,
315 1 : // We don't want a nil keyBuf because if the first key we encounter is
316 1 : // empty, it would become nil.
317 1 : keyBuf: make([]byte, 8),
318 1 : }
319 1 :
320 1 : iter := pointIter
321 1 : if rangeDelIter != nil {
322 1 : i.rangeDelInterleaving.Init(cfg.Comparer, iter, rangeDelIter, keyspan.InterleavingIterOpts{})
323 1 : iter = &i.rangeDelInterleaving
324 1 : }
325 1 : if rangeKeyIter != nil {
326 1 : i.rangeKeyInterleaving.Init(cfg.Comparer, iter, rangeKeyIter, keyspan.InterleavingIterOpts{})
327 1 : iter = &i.rangeKeyInterleaving
328 1 : }
329 1 : i.iter = invalidating.MaybeWrapIfInvariants(iter)
330 1 :
331 1 : i.frontiers.Init(i.cmp)
332 1 : i.delElider.Init(i.cmp, cfg.TombstoneElision)
333 1 : i.rangeDelCompactor = MakeRangeDelSpanCompactor(i.cmp, i.cfg.Comparer.Equal, cfg.Snapshots, cfg.TombstoneElision)
334 1 : i.rangeKeyCompactor = MakeRangeKeySpanCompactor(i.cmp, i.suffixCmp, cfg.Snapshots, cfg.RangeKeyElision)
335 1 : i.lastRangeDelSpanFrontier.Init(&i.frontiers, nil, i.lastRangeDelSpanFrontierReached)
336 1 : return i
337 : }
338 :
339 : // Frontiers returns the frontiers for the compaction iterator.
340 1 : func (i *Iter) Frontiers() *Frontiers {
341 1 : return &i.frontiers
342 1 : }
343 :
344 : // SnapshotPinned returns whether the last point key returned by the compaction
345 : // iterator was only returned because an open snapshot prevents its elision.
346 : // This field only applies to point keys, and not to range deletions or range
347 : // keys.
348 1 : func (i *Iter) SnapshotPinned() bool {
349 1 : return i.snapshotPinned
350 1 : }
351 :
352 : // ForceObsoleteDueToRangeDel returns true in a subset of the cases when
353 : // SnapshotPinned returns true. This value is true when the point is obsolete
354 : // due to a RANGEDEL but could not be deleted due to a snapshot.
355 1 : func (i *Iter) ForceObsoleteDueToRangeDel() bool {
356 1 : return i.forceObsoleteDueToRangeDel
357 1 : }
358 :
359 : // Stats returns the compaction iterator stats.
360 1 : func (i *Iter) Stats() IterStats {
361 1 : return i.stats
362 1 : }
363 :
364 : // First has the same semantics as InternalIterator.First.
365 1 : func (i *Iter) First() (*base.InternalKey, []byte) {
366 1 : if i.err != nil {
367 0 : return nil, nil
368 0 : }
369 1 : i.iterKV = i.iter.First()
370 1 : if i.iterKV != nil {
371 1 : i.iterValue, _, i.err = i.iterKV.Value(nil)
372 1 : if i.err != nil {
373 0 : return nil, nil
374 0 : }
375 1 : i.curSnapshotIdx, i.curSnapshotSeqNum = i.cfg.Snapshots.IndexAndSeqNum(i.iterKV.SeqNum())
376 : }
377 1 : i.pos = iterPosNext
378 1 : i.iterStripeChange = newStripeNewKey
379 1 : return i.Next()
380 : }
381 :
382 : // Next has the same semantics as InternalIterator.Next. Note that when Next
383 : // returns a RANGEDEL or a range key, the caller can use Span() to get the
384 : // corresponding span.
385 1 : func (i *Iter) Next() (*base.InternalKey, []byte) {
386 1 : if i.err != nil {
387 1 : return nil, nil
388 1 : }
389 :
390 : // Close the closer for the current value if one was open.
391 1 : if i.closeValueCloser() != nil {
392 0 : return nil, nil
393 0 : }
394 :
395 : // Prior to this call to `Next()` we are in one of three situations with
396 : // respect to `iterKey` and related state:
397 : //
398 : // - `!skip && pos == iterPosNext`: `iterKey` is already at the next key.
399 : // - `!skip && pos == iterPosCurForward`: We are at the key that has been returned.
400 : // To move forward we advance by one key, even if that lands us in the same
401 : // snapshot stripe.
402 : // - `skip && pos == iterPosCurForward`: We are at the key that has been returned.
403 : // To move forward we skip skippable entries in the stripe.
404 1 : if i.pos == iterPosCurForward {
405 1 : if i.skip {
406 1 : i.skipInStripe()
407 1 : } else {
408 1 : i.nextInStripe()
409 1 : }
410 1 : } else if i.skip {
411 0 : panic(errors.AssertionFailedf("compaction iterator has skip=true, but iterator is at iterPosNext"))
412 : }
413 :
414 1 : i.pos = iterPosCurForward
415 1 :
416 1 : for i.iterKV != nil {
417 1 : i.frontiers.Advance(i.iterKV.K.UserKey)
418 1 :
419 1 : // If we entered a new snapshot stripe with the same key, any key we
420 1 : // return on this iteration is only returned because the open snapshot
421 1 : // prevented it from being elided or merged with the key returned for
422 1 : // the previous stripe. Mark it as pinned so that the compaction loop
423 1 : // can correctly populate output tables' pinned statistics. We might
424 1 : // also set snapshotPinned=true down below if we observe that the key is
425 1 : // deleted by a range deletion in a higher stripe or that this key is a
426 1 : // tombstone that could be elided if only it were in the last snapshot
427 1 : // stripe.
428 1 : i.snapshotPinned = i.iterStripeChange == newStripeSameKey
429 1 :
430 1 : if i.iterKV.Kind() == base.InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKV.Kind()) {
431 1 : // Return the span so the compaction can use it for file truncation and add
432 1 : // it to the relevant fragmenter. In the case of range deletions, we do not
433 1 : // set `skip` to true before returning as there may be any number of point
434 1 : // keys with the same user key and sequence numbers ≥ the range deletion's
435 1 : // sequence number. Such point keys must be visible (i.e., not skipped
436 1 : // over) since we promise point keys are not deleted by range tombstones at
437 1 : // the same sequence number (or higher).
438 1 : //
439 1 : // Note that `skip` must already be false here, because range keys and range
440 1 : // deletions are interleaved at the maximal sequence numbers and neither will
441 1 : // set `skip`=true.
442 1 : if i.skip {
443 0 : panic(errors.AssertionFailedf("pebble: compaction iterator: skip unexpectedly true"))
444 : }
445 :
446 1 : if i.iterKV.Kind() == base.InternalKeyKindRangeDelete {
447 1 : span := i.rangeDelInterleaving.Span()
448 1 : i.setLastRangeDelSpan(span)
449 1 : i.rangeDelCompactor.Compact(span, &i.span)
450 1 : if i.span.Empty() {
451 1 : // The range del span was elided entirely; don't return this key to the caller.
452 1 : i.saveKey()
453 1 : i.nextInStripe()
454 1 : continue
455 : }
456 1 : } else {
457 1 : i.rangeKeyCompactor.Compact(i.rangeKeyInterleaving.Span(), &i.span)
458 1 : if i.span.Empty() {
459 1 : // The range key span was elided entirely; don't return this key to the caller.
460 1 : i.saveKey()
461 1 : i.nextInStripe()
462 1 : continue
463 : }
464 : }
465 :
466 : // NOTE: there is a subtle invariant violation here in that calling
467 : // saveKey and returning a reference to the temporary slice violates
468 : // the stability guarantee for range deletion keys. A potential
469 : // mediation could return the original iterKey and iterValue
470 : // directly, as the backing memory is guaranteed to be stable until
471 : // the compaction completes. The violation here is only minor in
472 : // that the caller immediately clones the range deletion InternalKey
473 : // when passing the key to the deletion fragmenter (see the
474 : // call-site in compaction.go).
475 : // TODO(travers): address this violation by removing the call to
476 : // saveKey and instead return the original iterKey and iterValue.
477 : // This goes against the comment on i.key in the struct, and
478 : // therefore warrants some investigation.
479 1 : i.saveKey()
480 1 : // TODO(jackson): Handle tracking pinned statistics for range keys
481 1 : // and range deletions. This would require updating
482 1 : // emitRangeDelChunk and rangeKeyCompactionTransform to update
483 1 : // statistics when they apply their own snapshot striping logic.
484 1 : i.snapshotPinned = false
485 1 : i.value = i.iterValue
486 1 : return &i.key, i.value
487 : }
488 :
489 : // Check if the last tombstone covers the key.
490 : // TODO(sumeer): we could avoid calling tombstoneCovers if
491 : // i.iterStripeChange == sameStripeSameKey since that check has already been
492 : // done in nextInStripeHelper. However, we also need to handle the case of
493 : // CoversInvisibly below.
494 1 : switch i.tombstoneCovers(i.iterKV.K, i.curSnapshotSeqNum) {
495 1 : case coversVisibly:
496 1 : // A pending range deletion deletes this key. Skip it.
497 1 : i.saveKey()
498 1 : i.skipInStripe()
499 1 : continue
500 :
501 1 : case coversInvisibly:
502 1 : // i.iterKV would be deleted by a range deletion if there weren't any open
503 1 : // snapshots. Mark it as pinned.
504 1 : //
505 1 : // NB: there are multiple places in this file where we check for a
506 1 : // covering tombstone and this is the only one where we are writing to
507 1 : // i.snapshotPinned. Those other cases occur in mergeNext where the caller
508 1 : // is deciding whether the value should be merged or not, and the key is
509 1 : // in the same snapshot stripe. Hence, snapshotPinned is by definition
510 1 : // false in those cases.
511 1 : i.snapshotPinned = true
512 1 : i.forceObsoleteDueToRangeDel = true
513 :
514 1 : default:
515 1 : i.forceObsoleteDueToRangeDel = false
516 : }
517 :
518 1 : switch i.iterKV.Kind() {
519 1 : case base.InternalKeyKindDelete, base.InternalKeyKindSingleDelete, base.InternalKeyKindDeleteSized:
520 1 : if i.delElider.ShouldElide(i.iterKV.K.UserKey) {
521 1 : if i.curSnapshotIdx == 0 {
522 1 : // If we're at the last snapshot stripe and the tombstone
523 1 : // can be elided skip skippable keys in the same stripe.
524 1 : i.saveKey()
525 1 : if i.key.Kind() == base.InternalKeyKindSingleDelete {
526 1 : i.skipDueToSingleDeleteElision()
527 1 : } else {
528 1 : i.skipInStripe()
529 1 : if !i.skip && i.iterStripeChange != newStripeNewKey {
530 0 : panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe disabled skip without advancing to new key"))
531 : }
532 : }
533 1 : if i.iterStripeChange == newStripeSameKey {
534 0 : panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe found a new stripe within the same key"))
535 : }
536 1 : continue
537 1 : } else {
538 1 : // We're not at the last snapshot stripe, so the tombstone
539 1 : // can NOT yet be elided. Mark it as pinned, so that it's
540 1 : // included in table statistics appropriately.
541 1 : i.snapshotPinned = true
542 1 : }
543 : }
544 :
545 1 : switch i.iterKV.Kind() {
546 1 : case base.InternalKeyKindDelete:
547 1 : i.saveKey()
548 1 : i.value = i.iterValue
549 1 : i.skip = true
550 1 : return &i.key, i.value
551 :
552 1 : case base.InternalKeyKindDeleteSized:
553 1 : // We may skip subsequent keys because of this tombstone. Scan
554 1 : // ahead to see just how much data this tombstone drops and if
555 1 : // the tombstone's value should be updated accordingly.
556 1 : return i.deleteSizedNext()
557 :
558 1 : case base.InternalKeyKindSingleDelete:
559 1 : if i.singleDeleteNext() {
560 1 : return &i.key, i.value
561 1 : } else if i.err != nil {
562 0 : return nil, nil
563 0 : }
564 1 : continue
565 :
566 0 : default:
567 0 : panic(errors.AssertionFailedf(
568 0 : "unexpected kind %s", redact.SafeString(i.iterKV.Kind().String())))
569 : }
570 :
571 1 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
572 1 : // The key we emit for this entry is a function of the current key
573 1 : // kind, and whether this entry is followed by a DEL/SINGLEDEL
574 1 : // entry. setNext() does the work to move the iterator forward,
575 1 : // preserving the original value, and potentially mutating the key
576 1 : // kind.
577 1 : i.setNext()
578 1 : if i.err != nil {
579 1 : return nil, nil
580 1 : }
581 1 : return &i.key, i.value
582 :
583 1 : case base.InternalKeyKindMerge:
584 1 : // Record the snapshot index before mergeNext as merging
585 1 : // advances the iterator, adjusting curSnapshotIdx.
586 1 : origSnapshotIdx := i.curSnapshotIdx
587 1 : var valueMerger base.ValueMerger
588 1 : valueMerger, i.err = i.cfg.Merge(i.iterKV.K.UserKey, i.iterValue)
589 1 : if i.err == nil {
590 1 : i.mergeNext(valueMerger)
591 1 : }
592 1 : var needDelete bool
593 1 : if i.err == nil {
594 1 : // includesBase is true whenever we've transformed the MERGE record
595 1 : // into a SET.
596 1 : var includesBase bool
597 1 : switch i.key.Kind() {
598 1 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
599 1 : includesBase = true
600 1 : case base.InternalKeyKindMerge:
601 0 : default:
602 0 : panic(errors.AssertionFailedf(
603 0 : "unexpected kind %s", redact.SafeString(i.key.Kind().String())))
604 : }
605 1 : i.value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, includesBase)
606 : }
607 1 : if i.err == nil {
608 1 : if needDelete {
609 1 : if i.closeValueCloser() != nil {
610 0 : return nil, nil
611 0 : }
612 1 : continue
613 : }
614 :
615 1 : i.maybeZeroSeqnum(origSnapshotIdx)
616 1 : return &i.key, i.value
617 : }
618 1 : if i.err != nil {
619 1 : // TODO(sumeer): why is MarkCorruptionError only being called for
620 1 : // MERGE?
621 1 : i.err = base.MarkCorruptionError(i.err)
622 1 : }
623 1 : return nil, nil
624 :
625 1 : default:
626 1 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
627 1 : return nil, nil
628 : }
629 : }
630 :
631 1 : return nil, nil
632 : }
633 :
634 : // Span returns the range deletion or range key span corresponding to the
635 : // current key. Can only be called right after a Next() call that returned a
636 : // RANGEDEL or a range key. The keys in the span should not be retained or
637 : // modified.
638 1 : func (i *Iter) Span() *keyspan.Span {
639 1 : return &i.span
640 1 : }
641 :
642 1 : func (i *Iter) closeValueCloser() error {
643 1 : if i.valueCloser == nil {
644 1 : return nil
645 1 : }
646 :
647 0 : i.err = i.valueCloser.Close()
648 0 : i.valueCloser = nil
649 0 : return i.err
650 : }
651 :
652 : // skipInStripe skips over skippable keys in the same stripe and user key. It
653 : // may set i.err, in which case i.iterKV will be nil.
654 1 : func (i *Iter) skipInStripe() {
655 1 : i.skip = true
656 1 : // TODO(sumeer): we can avoid the overhead of calling i.rangeDelFrag.Covers,
657 1 : // in this case of nextInStripe, since we are skipping all of them anyway.
658 1 : for i.nextInStripe() == sameStripe {
659 1 : if i.err != nil {
660 0 : panic(i.err)
661 : }
662 : }
663 : // We landed outside the original stripe, so reset skip.
664 1 : i.skip = false
665 : }
666 :
667 1 : func (i *Iter) iterNext() bool {
668 1 : i.iterKV = i.iter.Next()
669 1 : if i.iterKV != nil {
670 1 : i.iterValue, _, i.err = i.iterKV.Value(nil)
671 1 : if i.err != nil {
672 0 : i.iterKV = nil
673 0 : }
674 : }
675 1 : return i.iterKV != nil
676 : }
677 :
678 : // stripeChangeType indicates how the snapshot stripe changed relative to the
679 : // previous key. If the snapshot stripe changed, it also indicates whether the
680 : // new stripe was entered because the iterator progressed onto an entirely new
681 : // key or entered a new stripe within the same key.
682 : type stripeChangeType int
683 :
684 : const (
685 : newStripeNewKey stripeChangeType = iota
686 : newStripeSameKey
687 : sameStripe
688 : )
689 :
690 : // nextInStripe advances the iterator and returns one of the above const ints
691 : // indicating how its state changed.
692 : //
693 : // All sameStripe keys that are covered by a RANGEDEL will be skipped and not
694 : // returned.
695 : //
696 : // Calls to nextInStripe must be preceded by a call to saveKey to retain a
697 : // temporary reference to the original key, so that forward iteration can
698 : // proceed with a reference to the original key. Care should be taken to avoid
699 : // overwriting or mutating the saved key or value before they have been returned
700 : // to the caller of the exported function (i.e. the caller of Next, First, etc.)
701 : //
702 : // nextInStripe may set i.err, in which case the return value will be
703 : // newStripeNewKey, and i.iterKV will be nil.
704 1 : func (i *Iter) nextInStripe() stripeChangeType {
705 1 : i.iterStripeChange = i.nextInStripeHelper()
706 1 : return i.iterStripeChange
707 1 : }
708 :
709 : // nextInStripeHelper is an internal helper for nextInStripe; callers should use
710 : // nextInStripe and not call nextInStripeHelper.
711 1 : func (i *Iter) nextInStripeHelper() stripeChangeType {
712 1 : origSnapshotIdx := i.curSnapshotIdx
713 1 : for {
714 1 : if !i.iterNext() {
715 1 : return newStripeNewKey
716 1 : }
717 1 : kv := i.iterKV
718 1 :
719 1 : // Is this a new key? There are two cases:
720 1 : //
721 1 : // 1. The new key has a different user key.
722 1 : // 2. The previous key was an interleaved range deletion or range key
723 1 : // boundary. These keys are interleaved in the same input iterator
724 1 : // stream as point keys, but they do not obey the ordinary sequence
725 1 : // number ordering within a user key. If the previous key was one
726 1 : // of these keys, we consider the new key a `newStripeNewKey` to
727 1 : // reflect that it's the beginning of a new stream of point keys.
728 1 : if i.key.IsExclusiveSentinel() || !i.cfg.Comparer.Equal(i.key.UserKey, kv.K.UserKey) {
729 1 : i.curSnapshotIdx, i.curSnapshotSeqNum = i.cfg.Snapshots.IndexAndSeqNum(kv.SeqNum())
730 1 : return newStripeNewKey
731 1 : }
732 :
733 : // If i.key and key have the same user key, then
734 : // 1. i.key must not have had a zero sequence number (or it would've be the last
735 : // key with its user key).
736 : // 2. i.key must have a strictly larger sequence number
737 : // There's an exception in that either key may be a range delete. Range
738 : // deletes may share a sequence number with a point key if the keys were
739 : // ingested together. Range keys may also share the sequence number if they
740 : // were ingested, but range keys are interleaved into the compaction
741 : // iterator's input iterator at the maximal sequence number so their
742 : // original sequence number will not be observed here.
743 1 : if prevSeqNum := i.keyTrailer.SeqNum(); (prevSeqNum == 0 || prevSeqNum <= kv.SeqNum()) &&
744 1 : i.key.Kind() != base.InternalKeyKindRangeDelete && kv.Kind() != base.InternalKeyKindRangeDelete {
745 0 : prevKey := i.key
746 0 : prevKey.Trailer = i.keyTrailer
747 0 : panic(errors.AssertionFailedf("pebble: invariant violation: %s and %s out of order", prevKey, kv.K))
748 : }
749 :
750 1 : i.curSnapshotIdx, i.curSnapshotSeqNum = i.cfg.Snapshots.IndexAndSeqNum(kv.SeqNum())
751 1 : switch kv.Kind() {
752 : case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete,
753 0 : base.InternalKeyKindRangeDelete:
754 0 : // Range tombstones and range keys are interleaved at the max
755 0 : // sequence number for a given user key, and the first key after one
756 0 : // is always considered a newStripeNewKey, so we should never reach
757 0 : // this.
758 0 : panic("unreachable")
759 : case base.InternalKeyKindDelete, base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindSingleDelete,
760 1 : base.InternalKeyKindSetWithDelete, base.InternalKeyKindDeleteSized:
761 : // Fall through
762 1 : default:
763 1 : kind := i.iterKV.Kind()
764 1 : i.iterKV = nil
765 1 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(kind))
766 1 : return newStripeNewKey
767 : }
768 1 : if i.curSnapshotIdx == origSnapshotIdx {
769 1 : // Same snapshot.
770 1 : if i.tombstoneCovers(i.iterKV.K, i.curSnapshotSeqNum) == coversVisibly {
771 1 : continue
772 : }
773 1 : return sameStripe
774 : }
775 1 : return newStripeSameKey
776 : }
777 : }
778 :
779 1 : func (i *Iter) setNext() {
780 1 : // Save the current key.
781 1 : i.saveKey()
782 1 : i.value = i.iterValue
783 1 : i.maybeZeroSeqnum(i.curSnapshotIdx)
784 1 :
785 1 : // If this key is already a SETWITHDEL we can early return and skip the remaining
786 1 : // records in the stripe:
787 1 : if i.iterKV.Kind() == base.InternalKeyKindSetWithDelete {
788 1 : i.skip = true
789 1 : return
790 1 : }
791 :
792 : // We are iterating forward. Save the current value.
793 1 : i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
794 1 : i.value = i.valueBuf
795 1 :
796 1 : // Else, we continue to loop through entries in the stripe looking for a
797 1 : // DEL. Note that we may stop *before* encountering a DEL, if one exists.
798 1 : //
799 1 : // NB: nextInStripe will skip sameStripe keys that are visibly covered by a
800 1 : // RANGEDEL. This can include DELs -- this is fine since such DELs don't
801 1 : // need to be combined with SET to make SETWITHDEL.
802 1 : for {
803 1 : switch i.nextInStripe() {
804 1 : case newStripeNewKey, newStripeSameKey:
805 1 : i.pos = iterPosNext
806 1 : return
807 1 : case sameStripe:
808 1 : // We're still in the same stripe. If this is a
809 1 : // DEL/SINGLEDEL/DELSIZED, we stop looking and emit a SETWITHDEL.
810 1 : // Subsequent keys are eligible for skipping.
811 1 : switch i.iterKV.Kind() {
812 1 : case base.InternalKeyKindDelete, base.InternalKeyKindSingleDelete, base.InternalKeyKindDeleteSized:
813 1 : i.key.SetKind(base.InternalKeyKindSetWithDelete)
814 1 : i.skip = true
815 1 : return
816 1 : case base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindSetWithDelete:
817 : // Do nothing
818 0 : default:
819 0 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
820 : }
821 0 : default:
822 0 : panic("pebble: unexpected stripeChangeType: " + strconv.Itoa(int(i.iterStripeChange)))
823 : }
824 : }
825 : }
826 :
827 1 : func (i *Iter) mergeNext(valueMerger base.ValueMerger) {
828 1 : // Save the current key.
829 1 : i.saveKey()
830 1 :
831 1 : // Loop looking for older values in the current snapshot stripe and merge
832 1 : // them.
833 1 : for {
834 1 : if i.nextInStripe() != sameStripe {
835 1 : i.pos = iterPosNext
836 1 : return
837 1 : }
838 1 : if i.err != nil {
839 0 : panic(i.err)
840 : }
841 : // NB: MERGE#10+RANGEDEL#9 stays a MERGE, since nextInStripe skips
842 : // sameStripe keys that are visibly covered by a RANGEDEL. There may be
843 : // MERGE#7 that is invisibly covered and will be preserved, but there is
844 : // no risk that MERGE#10 and MERGE#7 will get merged in the future as
845 : // the RANGEDEL still exists and will be used in user-facing reads that
846 : // see MERGE#10, and will also eventually cause MERGE#7 to be deleted in
847 : // a compaction.
848 1 : key := i.iterKV
849 1 : switch key.Kind() {
850 1 : case base.InternalKeyKindDelete, base.InternalKeyKindSingleDelete, base.InternalKeyKindDeleteSized:
851 1 : // We've hit a deletion tombstone. Return everything up to this point and
852 1 : // then skip entries until the next snapshot stripe. We change the kind
853 1 : // of the result key to a Set so that it shadows keys in lower
854 1 : // levels. That is, MERGE+DEL -> SETWITHDEL.
855 1 : //
856 1 : // We do the same for SingleDelete since SingleDelete is only
857 1 : // permitted (with deterministic behavior) for keys that have been
858 1 : // set once since the last SingleDelete/Delete, so everything
859 1 : // older is acceptable to shadow. Note that this is slightly
860 1 : // different from singleDeleteNext() which implements stricter
861 1 : // semantics in terms of applying the SingleDelete to the single
862 1 : // next Set. But those stricter semantics are not observable to
863 1 : // the end-user since Iterator interprets SingleDelete as Delete.
864 1 : // We could do something more complicated here and consume only a
865 1 : // single Set, and then merge in any following Sets, but that is
866 1 : // complicated wrt code and unnecessary given the narrow permitted
867 1 : // use of SingleDelete.
868 1 : i.key.SetKind(base.InternalKeyKindSetWithDelete)
869 1 : i.skip = true
870 1 : return
871 :
872 1 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
873 1 : // We've hit a Set or SetWithDel value. Merge with the existing
874 1 : // value and return. We change the kind of the resulting key to a
875 1 : // Set so that it shadows keys in lower levels. That is:
876 1 : // MERGE + (SET*) -> SET.
877 1 : i.err = valueMerger.MergeOlder(i.iterValue)
878 1 : if i.err != nil {
879 0 : return
880 0 : }
881 1 : i.key.SetKind(base.InternalKeyKindSet)
882 1 : i.skip = true
883 1 : return
884 :
885 1 : case base.InternalKeyKindMerge:
886 1 : // We've hit another Merge value. Merge with the existing value and
887 1 : // continue looping.
888 1 : i.err = valueMerger.MergeOlder(i.iterValue)
889 1 : if i.err != nil {
890 0 : return
891 0 : }
892 :
893 0 : default:
894 0 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
895 0 : return
896 : }
897 : }
898 : }
899 :
900 : // singleDeleteNext processes a SingleDelete point tombstone. A SingleDelete, or
901 : // SINGLEDEL, is unique in that it deletes exactly 1 internal key. It's a
902 : // performance optimization when the client knows a user key has not been
903 : // overwritten, allowing the elision of the tombstone earlier, avoiding write
904 : // amplification.
905 : //
906 : // singleDeleteNext returns a boolean indicating whether or not the caller
907 : // should yield the SingleDelete key to the consumer of the Iter. If
908 : // singleDeleteNext returns false, the caller may consume/elide the
909 : // SingleDelete.
910 1 : func (i *Iter) singleDeleteNext() bool {
911 1 : // Save the current key.
912 1 : i.saveKey()
913 1 : i.value = i.iterValue
914 1 :
915 1 : // Loop until finds a key to be passed to the next level.
916 1 : for {
917 1 : // If we find a key that can't be skipped, return true so that the
918 1 : // caller yields the SingleDelete to the caller.
919 1 : if i.nextInStripe() != sameStripe {
920 1 : // This defers additional error checking regarding single delete
921 1 : // invariants to the compaction where the keys with the same user key as
922 1 : // the single delete are in the same stripe.
923 1 : i.pos = iterPosNext
924 1 : return i.err == nil
925 1 : }
926 1 : if i.err != nil {
927 0 : panic(i.err)
928 : }
929 : // INVARIANT: sameStripe.
930 1 : key := i.iterKV
931 1 : kind := key.Kind()
932 1 : switch kind {
933 1 : case base.InternalKeyKindDelete, base.InternalKeyKindSetWithDelete, base.InternalKeyKindDeleteSized:
934 1 : if kind == base.InternalKeyKindDelete || kind == base.InternalKeyKindDeleteSized {
935 1 : i.cfg.IneffectualSingleDeleteCallback(i.key.UserKey)
936 1 : }
937 : // We've hit a Delete, DeleteSized, SetWithDelete, transform
938 : // the SingleDelete into a full Delete.
939 1 : i.key.SetKind(base.InternalKeyKindDelete)
940 1 : i.skip = true
941 1 : return true
942 :
943 1 : case base.InternalKeyKindSet, base.InternalKeyKindMerge:
944 1 : // This SingleDelete deletes the Set/Merge, and we can now elide the
945 1 : // SingleDel as well. We advance past the Set and return false to
946 1 : // indicate to the main compaction loop that we should NOT yield the
947 1 : // current SingleDel key to the compaction loop.
948 1 : //
949 1 : // NB: singleDeleteNext was called with i.pos == iterPosCurForward, and
950 1 : // after the call to nextInStripe, we are still at iterPosCurForward,
951 1 : // since we are at the key after the Set/Merge that was single deleted.
952 1 : change := i.nextInStripe()
953 1 : switch change {
954 1 : case sameStripe, newStripeSameKey:
955 1 : // On the same user key.
956 1 : nextKind := i.iterKV.Kind()
957 1 : switch nextKind {
958 1 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete, base.InternalKeyKindMerge:
959 1 : if i.cfg.SingleDeleteInvariantViolationCallback != nil {
960 1 : // sameStripe keys returned by nextInStripe() are already
961 1 : // known to not be covered by a RANGEDEL, so it is an invariant
962 1 : // violation. The rare case is newStripeSameKey, where it is a
963 1 : // violation if not covered by a RANGEDEL.
964 1 : if change == sameStripe ||
965 1 : i.tombstoneCovers(i.iterKV.K, i.curSnapshotSeqNum) == noCover {
966 1 : i.cfg.SingleDeleteInvariantViolationCallback(i.key.UserKey)
967 1 : }
968 : }
969 1 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
970 0 : default:
971 0 : panic(errors.AssertionFailedf(
972 0 : "unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
973 : }
974 1 : case newStripeNewKey:
975 0 : default:
976 0 : panic("unreachable")
977 : }
978 1 : return false
979 :
980 1 : case base.InternalKeyKindSingleDelete:
981 1 : // Two single deletes met in a compaction. The first single delete is
982 1 : // ineffectual.
983 1 : i.cfg.IneffectualSingleDeleteCallback(i.key.UserKey)
984 1 : // Continue to apply the second single delete.
985 1 : continue
986 :
987 0 : default:
988 0 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
989 0 : return false
990 : }
991 : }
992 : }
993 :
994 : // skipDueToSingleDeleteElision is called when the SingleDelete is being
995 : // elided because it is in the final snapshot stripe and there are no keys
996 : // with the same user key in lower levels in the LSM (below the files in this
997 : // compaction).
998 : //
999 : // TODO(sumeer): the only difference between singleDeleteNext and
1000 : // skipDueToSingleDeleteElision is the fact that the caller knows it will be
1001 : // eliding the single delete in the latter case. There are some similar things
1002 : // happening in both implementations. My first attempt at combining them into
1003 : // a single method was hard to comprehend. Try again.
1004 1 : func (i *Iter) skipDueToSingleDeleteElision() {
1005 1 : for {
1006 1 : stripeChange := i.nextInStripe()
1007 1 : if i.err != nil {
1008 0 : panic(i.err)
1009 : }
1010 1 : switch stripeChange {
1011 1 : case newStripeNewKey:
1012 1 : // The single delete is only now being elided, meaning it did not elide
1013 1 : // any keys earlier in its descent down the LSM. We stepped onto a new
1014 1 : // user key, meaning that even now at its moment of elision, it still
1015 1 : // hasn't elided any other keys. The single delete was ineffectual (a
1016 1 : // no-op).
1017 1 : i.cfg.IneffectualSingleDeleteCallback(i.key.UserKey)
1018 1 : i.skip = false
1019 1 : return
1020 0 : case newStripeSameKey:
1021 0 : // This should be impossible. If we're eliding a single delete, we
1022 0 : // determined that the tombstone is in the final snapshot stripe, but we
1023 0 : // stepped into a new stripe of the same key.
1024 0 : panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe"))
1025 1 : case sameStripe:
1026 1 : kind := i.iterKV.Kind()
1027 1 : switch kind {
1028 1 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
1029 1 : i.cfg.IneffectualSingleDeleteCallback(i.key.UserKey)
1030 1 : switch kind {
1031 1 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized:
1032 1 : i.skipInStripe()
1033 1 : return
1034 1 : case base.InternalKeyKindSingleDelete:
1035 1 : // Repeat the same with this SingleDelete. We don't want to simply
1036 1 : // call skipInStripe(), since it increases the strength of the
1037 1 : // SingleDel, which hides bugs in the use of single delete.
1038 1 : continue
1039 0 : default:
1040 0 : panic(errors.AssertionFailedf(
1041 0 : "unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
1042 : }
1043 1 : case base.InternalKeyKindSetWithDelete:
1044 1 : // The SingleDelete should behave like a Delete.
1045 1 : i.skipInStripe()
1046 1 : return
1047 1 : case base.InternalKeyKindSet, base.InternalKeyKindMerge:
1048 1 : // This SingleDelete deletes the Set/Merge, and we are eliding the
1049 1 : // SingleDel as well. Step to the next key (this is not deleted by the
1050 1 : // SingleDelete).
1051 1 : //
1052 1 : // NB: skipDueToSingleDeleteElision was called with i.pos ==
1053 1 : // iterPosCurForward, and after the call to nextInStripe, we are still
1054 1 : // at iterPosCurForward, since we are at the key after the Set/Merge
1055 1 : // that was single deleted.
1056 1 : change := i.nextInStripe()
1057 1 : if i.err != nil {
1058 0 : panic(i.err)
1059 : }
1060 1 : switch change {
1061 0 : case newStripeSameKey:
1062 0 : panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe"))
1063 1 : case newStripeNewKey:
1064 0 : case sameStripe:
1065 0 : // On the same key.
1066 0 : nextKind := i.iterKV.Kind()
1067 0 : switch nextKind {
1068 0 : case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete, base.InternalKeyKindMerge:
1069 0 : if i.cfg.SingleDeleteInvariantViolationCallback != nil {
1070 0 : i.cfg.SingleDeleteInvariantViolationCallback(i.key.UserKey)
1071 0 : }
1072 0 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
1073 0 : default:
1074 0 : panic(errors.AssertionFailedf(
1075 0 : "unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
1076 : }
1077 0 : default:
1078 0 : panic("unreachable")
1079 : }
1080 : // Whether in same stripe or new stripe, this key is not consumed by
1081 : // the SingleDelete.
1082 1 : i.skip = false
1083 1 : return
1084 0 : default:
1085 0 : panic(errors.AssertionFailedf(
1086 0 : "unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
1087 : }
1088 0 : default:
1089 0 : panic("unreachable")
1090 : }
1091 : }
1092 : }
1093 :
1094 : // deleteSizedNext processes a DELSIZED point tombstone. Unlike ordinary DELs,
1095 : // these tombstones carry a value that's a varint indicating the size of the
1096 : // entry (len(key)+len(value)) that the tombstone is expected to delete.
1097 : //
1098 : // When a deleteSizedNext is encountered, we skip ahead to see which keys, if
1099 : // any, are elided as a result of the tombstone.
1100 1 : func (i *Iter) deleteSizedNext() (*base.InternalKey, []byte) {
1101 1 : i.saveKey()
1102 1 : i.skip = true
1103 1 :
1104 1 : // The DELSIZED tombstone may have no value at all. This happens when the
1105 1 : // tombstone has already deleted the key that the user originally predicted.
1106 1 : // In this case, we still peek forward in case there's another DELSIZED key
1107 1 : // with a lower sequence number, in which case we'll adopt its value.
1108 1 : if len(i.iterValue) == 0 {
1109 1 : i.value = i.valueBuf[:0]
1110 1 : } else {
1111 1 : i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
1112 1 : i.value = i.valueBuf
1113 1 : }
1114 :
1115 : // Loop through all the keys within this stripe that are skippable.
1116 1 : i.pos = iterPosNext
1117 1 : for i.nextInStripe() == sameStripe {
1118 1 : if i.err != nil {
1119 0 : panic(i.err)
1120 : }
1121 1 : switch i.iterKV.Kind() {
1122 1 : case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
1123 1 : // We encountered a tombstone (DEL, or DELSIZED) that's deleted by
1124 1 : // the original DELSIZED tombstone. This can happen in two cases:
1125 1 : //
1126 1 : // (1) These tombstones were intended to delete two distinct values,
1127 1 : // and this DELSIZED has already dropped the relevant key. For
1128 1 : // example:
1129 1 : //
1130 1 : // a.DELSIZED.9 a.SET.7 a.DELSIZED.5 a.SET.4
1131 1 : //
1132 1 : // If a.DELSIZED.9 has already deleted a.SET.7, its size has
1133 1 : // already been zeroed out. In this case, we want to adopt the
1134 1 : // value of the DELSIZED with the lower sequence number, in
1135 1 : // case the a.SET.4 key has not yet been elided.
1136 1 : //
1137 1 : // (2) This DELSIZED was missized. The user thought they were
1138 1 : // deleting a key with this user key, but this user key had
1139 1 : // already been deleted.
1140 1 : //
1141 1 : // We can differentiate these two cases by examining the length of
1142 1 : // the DELSIZED's value. A DELSIZED's value holds the size of both
1143 1 : // the user key and value that it intends to delete. For any user
1144 1 : // key with a length > 0, a DELSIZED that has not deleted a key must
1145 1 : // have a value with a length > 0.
1146 1 : //
1147 1 : // We treat both cases the same functionally, adopting the identity
1148 1 : // of the lower-sequence numbered tombstone. However in the second
1149 1 : // case, we also increment the stat counting missized tombstones.
1150 1 : if len(i.value) > 0 {
1151 1 : // The original DELSIZED key was missized. The key that the user
1152 1 : // thought they were deleting does not exist.
1153 1 : i.stats.CountMissizedDels++
1154 1 : }
1155 1 : i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
1156 1 : i.value = i.valueBuf
1157 1 : if i.iterKV.Kind() != base.InternalKeyKindDeleteSized {
1158 1 : // Convert the DELSIZED to a DEL—The DEL/SINGLEDEL we're eliding
1159 1 : // may not have deleted the key(s) it was intended to yet. The
1160 1 : // ordinary DEL compaction heuristics are better suited at that,
1161 1 : // plus we don't want to count it as a missized DEL. We early
1162 1 : // exit in this case, after skipping the remainder of the
1163 1 : // snapshot stripe.
1164 1 : i.key.SetKind(base.InternalKeyKindDelete)
1165 1 : // NB: We skipInStripe now, rather than returning leaving
1166 1 : // i.skip=true and returning early, because Next() requires
1167 1 : // that i.skip=true only if i.iterPos = iterPosCurForward.
1168 1 : //
1169 1 : // Ignore any error caused by skipInStripe since it does not affect
1170 1 : // the key/value being returned here, and the next call to Next() will
1171 1 : // expose it.
1172 1 : i.skipInStripe()
1173 1 : return &i.key, i.value
1174 1 : }
1175 : // Continue, in case we uncover another DELSIZED or a key this
1176 : // DELSIZED deletes.
1177 :
1178 1 : case base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindSetWithDelete:
1179 1 : // If the DELSIZED is value-less, it already deleted the key that it
1180 1 : // was intended to delete. This is possible with a sequence like:
1181 1 : //
1182 1 : // DELSIZED.8 SET.7 SET.3
1183 1 : //
1184 1 : // The DELSIZED only describes the size of the SET.7, which in this
1185 1 : // case has already been elided. We don't count it as a missizing,
1186 1 : // instead converting the DELSIZED to a DEL. Skip the remainder of
1187 1 : // the snapshot stripe and return.
1188 1 : if len(i.value) == 0 {
1189 1 : i.key.SetKind(base.InternalKeyKindDelete)
1190 1 : // NB: We skipInStripe now, rather than returning leaving
1191 1 : // i.skip=true and returning early, because Next() requires
1192 1 : // that i.skip=true only if i.iterPos = iterPosCurForward.
1193 1 : //
1194 1 : // Ignore any error caused by skipInStripe since it does not affect
1195 1 : // the key/value being returned here, and the next call to Next() will
1196 1 : // expose it.
1197 1 : i.skipInStripe()
1198 1 : return &i.key, i.value
1199 1 : }
1200 : // The deleted key is not a DEL, DELSIZED, and the DELSIZED in i.key
1201 : // has a positive size.
1202 1 : expectedSize, n := binary.Uvarint(i.value)
1203 1 : if n != len(i.value) {
1204 1 : i.err = base.CorruptionErrorf("DELSIZED holds invalid value: %x", errors.Safe(i.value))
1205 1 : return nil, nil
1206 1 : }
1207 1 : elidedSize := uint64(len(i.iterKV.K.UserKey)) + uint64(len(i.iterValue))
1208 1 : if elidedSize != expectedSize {
1209 1 : // The original DELSIZED key was missized. It's unclear what to
1210 1 : // do. The user-provided size was wrong, so it's unlikely to be
1211 1 : // accurate or meaningful. We could:
1212 1 : //
1213 1 : // 1. return the DELSIZED with the original user-provided size unmodified
1214 1 : // 2. return the DELZIZED with a zeroed size to reflect that a key was
1215 1 : // elided, even if it wasn't the anticipated size.
1216 1 : // 3. subtract the elided size from the estimate and re-encode.
1217 1 : // 4. convert the DELSIZED into a value-less DEL, so that
1218 1 : // ordinary DEL heuristics apply.
1219 1 : //
1220 1 : // We opt for (4) under the rationale that we can't rely on the
1221 1 : // user-provided size for accuracy, so ordinary DEL heuristics
1222 1 : // are safer.
1223 1 : i.stats.CountMissizedDels++
1224 1 : i.key.SetKind(base.InternalKeyKindDelete)
1225 1 : i.value = i.valueBuf[:0]
1226 1 : // NB: We skipInStripe now, rather than returning leaving
1227 1 : // i.skip=true and returning early, because Next() requires
1228 1 : // that i.skip=true only if i.iterPos = iterPosCurForward.
1229 1 : //
1230 1 : // Ignore any error caused by skipInStripe since it does not affect
1231 1 : // the key/value being returned here, and the next call to Next() will
1232 1 : // expose it.
1233 1 : i.skipInStripe()
1234 1 : return &i.key, i.value
1235 1 : }
1236 : // NB: We remove the value regardless of whether the key was sized
1237 : // appropriately. The size encoded is 'consumed' the first time it
1238 : // meets a key that it deletes.
1239 1 : i.value = i.valueBuf[:0]
1240 :
1241 0 : default:
1242 0 : i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
1243 0 : return nil, nil
1244 : }
1245 : }
1246 :
1247 1 : if i.iterStripeChange == sameStripe {
1248 0 : panic(errors.AssertionFailedf("unexpectedly found iter stripe change = %d", i.iterStripeChange))
1249 : }
1250 : // We landed outside the original stripe. Reset skip.
1251 1 : i.skip = false
1252 1 : if i.err != nil {
1253 0 : return nil, nil
1254 0 : }
1255 1 : return &i.key, i.value
1256 : }
1257 :
1258 1 : func (i *Iter) saveKey() {
1259 1 : i.keyBuf = append(i.keyBuf[:0], i.iterKV.K.UserKey...)
1260 1 : i.key = base.InternalKey{
1261 1 : UserKey: i.keyBuf,
1262 1 : Trailer: i.iterKV.K.Trailer,
1263 1 : }
1264 1 : i.keyTrailer = i.key.Trailer
1265 1 : }
1266 :
1267 : // Error returns any error encountered.
1268 : //
1269 : // Note that Close will return the error as well.
1270 1 : func (i *Iter) Error() error {
1271 1 : return i.err
1272 1 : }
1273 :
1274 : // Close the iterator.
1275 1 : func (i *Iter) Close() error {
1276 1 : err := i.iter.Close()
1277 1 : if i.err == nil {
1278 1 : i.err = err
1279 1 : }
1280 :
1281 : // Close the closer for the current value if one was open.
1282 1 : if i.valueCloser != nil {
1283 0 : i.err = errors.CombineErrors(i.err, i.valueCloser.Close())
1284 0 : i.valueCloser = nil
1285 0 : }
1286 :
1287 1 : return i.err
1288 : }
1289 :
1290 : // cover is returned by tombstoneCovers and describes a span's relationship to
1291 : // a key at a particular snapshot.
1292 : type cover int8
1293 :
1294 : const (
1295 : // noCover indicates the tested key does not fall within the span's bounds,
1296 : // or the span contains no keys with sequence numbers higher than the key's.
1297 : noCover cover = iota
1298 :
1299 : // coversInvisibly indicates the tested key does fall within the span's
1300 : // bounds and the span contains at least one key with a higher sequence
1301 : // number, but none visible at the provided snapshot.
1302 : coversInvisibly
1303 :
1304 : // coversVisibly indicates the tested key does fall within the span's
1305 : // bounds, and the span constains at least one key with a sequence number
1306 : // higher than the key's sequence number that is visible at the provided
1307 : // snapshot.
1308 : coversVisibly
1309 : )
1310 :
1311 : // tombstoneCovers returns whether the key is covered by a tombstone and whether
1312 : // it is covered by a tombstone visible in the given snapshot.
1313 : //
1314 : // The key's UserKey must be greater or equal to the last span Start key passed
1315 : // to AddTombstoneSpan. The keys passed to tombstoneCovers calls must be
1316 : // ordered.
1317 1 : func (i *Iter) tombstoneCovers(key base.InternalKey, snapshot base.SeqNum) cover {
1318 1 : if i.lastRangeDelSpan.Empty() {
1319 1 : return noCover
1320 1 : }
1321 1 : if invariants.Enabled && (i.cmp(key.UserKey, i.lastRangeDelSpan.Start) < 0 || i.cmp(key.UserKey, i.lastRangeDelSpan.End) >= 0) {
1322 0 : panic(errors.AssertionFailedf("invalid key %q, last span %s", key, i.lastRangeDelSpan))
1323 : }
1324 : // The Covers() check is very cheap, so we want to do that first.
1325 1 : switch {
1326 1 : case !i.lastRangeDelSpan.Covers(key.SeqNum()):
1327 1 : return noCover
1328 1 : case i.lastRangeDelSpan.CoversAt(snapshot, key.SeqNum()):
1329 1 : return coversVisibly
1330 1 : default:
1331 1 : return coversInvisibly
1332 : }
1333 : }
1334 :
1335 1 : func (i *Iter) setLastRangeDelSpan(span *keyspan.Span) {
1336 1 : if invariants.Enabled && !i.lastRangeDelSpan.Empty() {
1337 0 : panic("last range del span overwritten")
1338 : }
1339 1 : i.lastRangeDelSpan.CopyFrom(span)
1340 1 : i.lastRangeDelSpanFrontier.Update(i.lastRangeDelSpan.End)
1341 : }
1342 :
1343 1 : func (i *Iter) lastRangeDelSpanFrontierReached(key []byte) []byte {
1344 1 : i.lastRangeDelSpan.Reset()
1345 1 : return nil
1346 1 : }
1347 :
1348 : // maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing
1349 : // so improves compression and enables an optimization during forward iteration
1350 : // to skip some key comparisons. The seqnum for an entry can be zeroed if the
1351 : // entry is on the bottom snapshot stripe and on the bottom level of the LSM.
1352 1 : func (i *Iter) maybeZeroSeqnum(snapshotIdx int) {
1353 1 : if !i.cfg.AllowZeroSeqNum {
1354 1 : // TODO(peter): allowZeroSeqNum applies to the entire compaction. We could
1355 1 : // make the determination on a key by key basis, similar to what is done
1356 1 : // for elideTombstone. Need to add a benchmark for Iter to verify
1357 1 : // that isn't too expensive.
1358 1 : return
1359 1 : }
1360 1 : if snapshotIdx > 0 {
1361 1 : // This is not the last snapshot
1362 1 : return
1363 1 : }
1364 1 : i.key.SetSeqNum(base.SeqNumZero)
1365 : }
1366 :
1367 : func finishValueMerger(
1368 : valueMerger base.ValueMerger, includesBase bool,
1369 1 : ) (value []byte, needDelete bool, closer io.Closer, err error) {
1370 1 : if valueMerger2, ok := valueMerger.(base.DeletableValueMerger); ok {
1371 1 : value, needDelete, closer, err = valueMerger2.DeletableFinish(includesBase)
1372 1 : } else {
1373 1 : value, closer, err = valueMerger.Finish(includesBase)
1374 1 : }
1375 1 : return
1376 : }
|