LCOV - code coverage report
Current view: top level - pebble/internal/rangekey - coalesce.go (source / functions) Hit Total Coverage
Test: 2023-10-29 08:16Z ed45a776 - tests + meta.lcov Lines: 151 163 92.6 %
Date: 2023-10-29 08:18:00 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package rangekey
       6             : 
       7             : import (
       8             :         "bytes"
       9             :         "math"
      10             :         "sort"
      11             : 
      12             :         "github.com/cockroachdb/pebble/internal/base"
      13             :         "github.com/cockroachdb/pebble/internal/invariants"
      14             :         "github.com/cockroachdb/pebble/internal/keyspan"
      15             :         "github.com/cockroachdb/pebble/internal/manifest"
      16             : )
      17             : 
      18             : // UserIteratorConfig holds state for constructing the range key iterator stack
      19             : // for user iteration. The range key iterator must merge range key spans across
      20             : // the levels of the LSM. This merging is performed by a keyspan.MergingIter
      21             : // on-the-fly. The UserIteratorConfig implements keyspan.Transformer, evaluating
      22             : // range-key semantics and shadowing, so the spans returned by a MergingIter are
      23             : // fully resolved.
      24             : //
      25             : // The MergingIter is wrapped by a BoundedIter, which elides spans that are
      26             : // outside the iterator bounds (or the current prefix's bounds, during prefix
      27             : // iteration mode).
      28             : //
      29             : // To provide determinisim during iteration, the BoundedIter is wrapped by a
      30             : // DefragmentingIter that defragments abutting spans with identical
      31             : // user-observable state.
      32             : //
      33             : // At the top-level an InterleavingIter interleaves range keys with point keys
      34             : // and performs truncation to iterator bounds.
      35             : //
      36             : // Below is an abbreviated diagram illustrating the mechanics of a SeekGE.
      37             : //
      38             : //                     InterleavingIter.SeekGE
      39             : //                             │
      40             : //                  DefragmentingIter.SeekGE
      41             : //                             │
      42             : //                     BoundedIter.SeekGE
      43             : //                             │
      44             : //            ╭────────────────┴───────────────╮
      45             : //            │                                ├── defragmentBwd*
      46             : //      MergingIter.SeekGE                     │
      47             : //            │                                ╰── defragmentFwd
      48             : //            ╰─╶╶ per level╶╶ ─╮
      49             : //                              │
      50             : //                              │
      51             : //                              ├── <?>.SeekLT
      52             : //                              │
      53             : //                              ╰── <?>.Next
      54             : type UserIteratorConfig struct {
      55             :         snapshot   uint64
      56             :         comparer   *base.Comparer
      57             :         miter      keyspan.MergingIter
      58             :         biter      keyspan.BoundedIter
      59             :         diter      keyspan.DefragmentingIter
      60             :         liters     [manifest.NumLevels]keyspan.LevelIter
      61             :         litersUsed int
      62             :         onlySets   bool
      63             :         bufs       *Buffers
      64             : }
      65             : 
      66             : // Buffers holds various buffers used for range key iteration. They're exposed
      67             : // so that they may be pooled and reused between iterators.
      68             : type Buffers struct {
      69             :         merging       keyspan.MergingBuffers
      70             :         defragmenting keyspan.DefragmentingBuffers
      71             :         sortBuf       keyspan.KeysBySuffix
      72             : }
      73             : 
      74             : // PrepareForReuse discards any excessively large buffers.
      75           2 : func (bufs *Buffers) PrepareForReuse() {
      76           2 :         bufs.merging.PrepareForReuse()
      77           2 :         bufs.defragmenting.PrepareForReuse()
      78           2 : }
      79             : 
      80             : // Init initializes the range key iterator stack for user iteration. The
      81             : // resulting fragment iterator applies range key semantics, defragments spans
      82             : // according to their user-observable state and, if onlySets = true, removes all
      83             : // Keys other than RangeKeySets describing the current state of range keys. The
      84             : // resulting spans contain Keys sorted by Suffix.
      85             : //
      86             : // The snapshot sequence number parameter determines which keys are visible. Any
      87             : // keys not visible at the provided snapshot are ignored.
      88             : func (ui *UserIteratorConfig) Init(
      89             :         comparer *base.Comparer,
      90             :         snapshot uint64,
      91             :         lower, upper []byte,
      92             :         hasPrefix *bool,
      93             :         prefix *[]byte,
      94             :         onlySets bool,
      95             :         bufs *Buffers,
      96             :         iters ...keyspan.FragmentIterator,
      97           2 : ) keyspan.FragmentIterator {
      98           2 :         ui.snapshot = snapshot
      99           2 :         ui.comparer = comparer
     100           2 :         ui.onlySets = onlySets
     101           2 :         ui.miter.Init(comparer.Compare, ui, &bufs.merging, iters...)
     102           2 :         ui.biter.Init(comparer.Compare, comparer.Split, &ui.miter, lower, upper, hasPrefix, prefix)
     103           2 :         ui.diter.Init(comparer, &ui.biter, ui, keyspan.StaticDefragmentReducer, &bufs.defragmenting)
     104           2 :         ui.litersUsed = 0
     105           2 :         ui.bufs = bufs
     106           2 :         return &ui.diter
     107           2 : }
     108             : 
     109             : // AddLevel adds a new level to the bottom of the iterator stack. AddLevel
     110             : // must be called after Init and before any other method on the iterator.
     111           2 : func (ui *UserIteratorConfig) AddLevel(iter keyspan.FragmentIterator) {
     112           2 :         ui.miter.AddLevel(iter)
     113           2 : }
     114             : 
     115             : // NewLevelIter returns a pointer to a newly allocated or reused
     116             : // keyspan.LevelIter. The caller is responsible for calling Init() on this
     117             : // instance.
     118           2 : func (ui *UserIteratorConfig) NewLevelIter() *keyspan.LevelIter {
     119           2 :         if ui.litersUsed >= len(ui.liters) {
     120           1 :                 return &keyspan.LevelIter{}
     121           1 :         }
     122           2 :         ui.litersUsed++
     123           2 :         return &ui.liters[ui.litersUsed-1]
     124             : }
     125             : 
     126             : // SetBounds propagates bounds to the iterator stack. The fragment iterator
     127             : // interface ordinarily doesn't enforce bounds, so this is exposed as an
     128             : // explicit method on the user iterator config.
     129           2 : func (ui *UserIteratorConfig) SetBounds(lower, upper []byte) {
     130           2 :         ui.biter.SetBounds(lower, upper)
     131           2 : }
     132             : 
     133             : // Transform implements the keyspan.Transformer interface for use with a
     134             : // keyspan.MergingIter. It transforms spans by resolving range keys at the
     135             : // provided snapshot sequence number. Shadowing of keys is resolved (eg, removal
     136             : // of unset keys, removal of keys overwritten by a set at the same suffix, etc)
     137             : // and then non-RangeKeySet keys are removed. The resulting transformed spans
     138             : // only contain RangeKeySets describing the state visible at the provided
     139             : // sequence number, and hold their Keys sorted by Suffix.
     140           2 : func (ui *UserIteratorConfig) Transform(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error {
     141           2 :         // Apply shadowing of keys.
     142           2 :         dst.Start = s.Start
     143           2 :         dst.End = s.End
     144           2 :         ui.bufs.sortBuf = keyspan.KeysBySuffix{
     145           2 :                 Cmp:  cmp,
     146           2 :                 Keys: ui.bufs.sortBuf.Keys[:0],
     147           2 :         }
     148           2 :         if err := coalesce(ui.comparer.Equal, &ui.bufs.sortBuf, ui.snapshot, s.Keys); err != nil {
     149           0 :                 return err
     150           0 :         }
     151             :         // During user iteration over range keys, unsets and deletes don't matter.
     152             :         // Remove them if onlySets = true. This step helps logical defragmentation
     153             :         // during iteration.
     154           2 :         keys := ui.bufs.sortBuf.Keys
     155           2 :         dst.Keys = dst.Keys[:0]
     156           2 :         for i := range keys {
     157           2 :                 switch keys[i].Kind() {
     158           2 :                 case base.InternalKeyKindRangeKeySet:
     159           2 :                         if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 {
     160           0 :                                 panic("pebble: keys unexpectedly not in ascending suffix order")
     161             :                         }
     162           2 :                         dst.Keys = append(dst.Keys, keys[i])
     163           2 :                 case base.InternalKeyKindRangeKeyUnset:
     164           2 :                         if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 {
     165           0 :                                 panic("pebble: keys unexpectedly not in ascending suffix order")
     166             :                         }
     167           2 :                         if ui.onlySets {
     168           2 :                                 // Skip.
     169           2 :                                 continue
     170             :                         }
     171           1 :                         dst.Keys = append(dst.Keys, keys[i])
     172           2 :                 case base.InternalKeyKindRangeKeyDelete:
     173           2 :                         if ui.onlySets {
     174           2 :                                 // Skip.
     175           2 :                                 continue
     176             :                         }
     177           1 :                         dst.Keys = append(dst.Keys, keys[i])
     178           0 :                 default:
     179           0 :                         return base.CorruptionErrorf("pebble: unrecognized range key kind %s", keys[i].Kind())
     180             :                 }
     181             :         }
     182             :         // coalesce results in dst.Keys being sorted by Suffix.
     183           2 :         dst.KeysOrder = keyspan.BySuffixAsc
     184           2 :         return nil
     185             : }
     186             : 
     187             : // ShouldDefragment implements the DefragmentMethod interface and configures a
     188             : // DefragmentingIter to defragment spans of range keys if their user-visible
     189             : // state is identical. This defragmenting method assumes the provided spans have
     190             : // already been transformed through (UserIterationConfig).Transform, so all
     191             : // RangeKeySets are user-visible sets and are already in Suffix order. This
     192             : // defragmenter checks for equality between set suffixes and values (ignoring
     193             : // sequence numbers). It's intended for use during user iteration, when the
     194             : // wrapped keyspan iterator is merging spans across all levels of the LSM.
     195           2 : func (ui *UserIteratorConfig) ShouldDefragment(equal base.Equal, a, b *keyspan.Span) bool {
     196           2 :         // This implementation must only be used on spans that have transformed by
     197           2 :         // ui.Transform. The transform applies shadowing, removes all keys besides
     198           2 :         // the resulting Sets and sorts the keys by suffix. Since shadowing has been
     199           2 :         // applied, each Set must set a unique suffix. If the two spans are
     200           2 :         // equivalent, they must have the same number of range key sets.
     201           2 :         if len(a.Keys) != len(b.Keys) || len(a.Keys) == 0 {
     202           2 :                 return false
     203           2 :         }
     204           2 :         if a.KeysOrder != keyspan.BySuffixAsc || b.KeysOrder != keyspan.BySuffixAsc {
     205           0 :                 panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
     206             :         }
     207             : 
     208           2 :         ret := true
     209           2 :         for i := range a.Keys {
     210           2 :                 if invariants.Enabled {
     211           2 :                         if ui.onlySets && (a.Keys[i].Kind() != base.InternalKeyKindRangeKeySet ||
     212           2 :                                 b.Keys[i].Kind() != base.InternalKeyKindRangeKeySet) {
     213           0 :                                 panic("pebble: unexpected non-RangeKeySet during defragmentation")
     214             :                         }
     215           2 :                         if i > 0 && (ui.comparer.Compare(a.Keys[i].Suffix, a.Keys[i-1].Suffix) < 0 ||
     216           2 :                                 ui.comparer.Compare(b.Keys[i].Suffix, b.Keys[i-1].Suffix) < 0) {
     217           0 :                                 panic("pebble: range keys not ordered by suffix during defragmentation")
     218             :                         }
     219             :                 }
     220           2 :                 if !equal(a.Keys[i].Suffix, b.Keys[i].Suffix) {
     221           2 :                         ret = false
     222           2 :                         break
     223             :                 }
     224           2 :                 if !bytes.Equal(a.Keys[i].Value, b.Keys[i].Value) {
     225           2 :                         ret = false
     226           2 :                         break
     227             :                 }
     228             :         }
     229           2 :         return ret
     230             : }
     231             : 
     232             : // Coalesce imposes range key semantics and coalesces range keys with the same
     233             : // bounds. Coalesce drops any keys shadowed by more recent sets, unsets or
     234             : // deletes. Coalesce modifies the provided span's Keys slice, reslicing the
     235             : // slice to remove dropped keys.
     236             : //
     237             : // Coalescence has subtle behavior with respect to sequence numbers. Coalesce
     238             : // depends on a keyspan.Span's Keys being sorted in sequence number descending
     239             : // order. The first key has the largest sequence number. The returned coalesced
     240             : // span includes only the largest sequence number. All other sequence numbers
     241             : // are forgotten. When a compaction constructs output range keys from a
     242             : // coalesced span, it produces at most one RANGEKEYSET, one RANGEKEYUNSET and
     243             : // one RANGEKEYDEL. Each one of these keys adopt the largest sequence number.
     244             : //
     245             : // This has the potentially surprising effect of 'promoting' a key to a higher
     246             : // sequence number. This is okay, because:
     247             : //   - There are no other overlapping keys within the coalesced span of
     248             : //     sequence numbers (otherwise they would be in the compaction, due to
     249             : //     the LSM invariant).
     250             : //   - Range key sequence numbers are never compared to point key sequence
     251             : //     numbers. Range keys and point keys have parallel existences.
     252             : //   - Compactions only coalesce within snapshot stripes.
     253             : //
     254             : // Additionally, internal range keys at the same sequence number have subtle
     255             : // mechanics:
     256             : //   - RANGEKEYSETs shadow RANGEKEYUNSETs of the same suffix.
     257             : //   - RANGEKEYDELs only apply to keys at lower sequence numbers.
     258             : //
     259             : // This is required for ingestion. Ingested sstables are assigned a single
     260             : // sequence number for the file, at which all of the file's keys are visible.
     261             : // The RANGEKEYSET, RANGEKEYUNSET and RANGEKEYDEL key kinds are ordered such
     262             : // that among keys with equal sequence numbers (thus ordered by their kinds) the
     263             : // keys do not affect one another. Ingested sstables are expected to be
     264             : // consistent with respect to the set/unset suffixes: A given suffix should be
     265             : // set or unset but not both.
     266             : //
     267             : // The resulting dst Keys slice is sorted by Trailer.
     268           2 : func Coalesce(cmp base.Compare, eq base.Equal, keys []keyspan.Key, dst *[]keyspan.Key) error {
     269           2 :         // TODO(jackson): Currently, Coalesce doesn't actually perform the sequence
     270           2 :         // number promotion described in the comment above.
     271           2 :         keysBySuffix := keyspan.KeysBySuffix{
     272           2 :                 Cmp:  cmp,
     273           2 :                 Keys: (*dst)[:0],
     274           2 :         }
     275           2 :         if err := coalesce(eq, &keysBySuffix, math.MaxUint64, keys); err != nil {
     276           0 :                 return err
     277           0 :         }
     278             :         // Update the span with the (potentially reduced) keys slice. coalesce left
     279             :         // the keys in *dst sorted by suffix. Re-sort them by trailer.
     280           2 :         *dst = keysBySuffix.Keys
     281           2 :         keyspan.SortKeysByTrailer(dst)
     282           2 :         return nil
     283             : }
     284             : 
     285             : func coalesce(
     286             :         equal base.Equal, keysBySuffix *keyspan.KeysBySuffix, snapshot uint64, keys []keyspan.Key,
     287           2 : ) error {
     288           2 :         // First, enforce visibility and RangeKeyDelete mechanics. We only need to
     289           2 :         // consider the prefix of keys before and including the first
     290           2 :         // RangeKeyDelete. We also must skip any keys that aren't visible at the
     291           2 :         // provided snapshot sequence number.
     292           2 :         //
     293           2 :         // NB: Within a given sequence number, keys are ordered as:
     294           2 :         //   RangeKeySet > RangeKeyUnset > RangeKeyDelete
     295           2 :         // This is significant, because this ensures that a Set or Unset sharing a
     296           2 :         // sequence number with a Delete do not shadow each other.
     297           2 :         deleteIdx := -1
     298           2 :         for i := range keys {
     299           2 :                 if invariants.Enabled && i > 0 && keys[i].Trailer > keys[i-1].Trailer {
     300           0 :                         panic("pebble: invariant violation: span keys unordered")
     301             :                 }
     302           2 :                 if !keys[i].VisibleAt(snapshot) {
     303           2 :                         continue
     304             :                 }
     305             :                 // Once a RangeKeyDelete is observed, we know it shadows all subsequent
     306             :                 // keys and we can break early. We don't add the RangeKeyDelete key to
     307             :                 // keysBySuffix.keys yet, because we don't want a suffix-less key
     308             :                 // that appeared earlier in the slice to elide it. It'll be added back
     309             :                 // in at the end.
     310           2 :                 if keys[i].Kind() == base.InternalKeyKindRangeKeyDelete {
     311           2 :                         deleteIdx = i
     312           2 :                         break
     313             :                 }
     314           2 :                 keysBySuffix.Keys = append(keysBySuffix.Keys, keys[i])
     315             :         }
     316             : 
     317             :         // Sort the accumulated keys by suffix. There may be duplicates within a
     318             :         // suffix, in which case the one with a larger trailer survives.
     319             :         //
     320             :         // We use a stable sort so that the first key with a given suffix is the one
     321             :         // that with the highest Trailer (because the input `keys` was sorted by
     322             :         // trailer descending).
     323           2 :         sort.Stable(keysBySuffix)
     324           2 : 
     325           2 :         // Grab a handle of the full sorted slice, before reslicing
     326           2 :         // keysBySuffix.keys to accumulate the final coalesced keys.
     327           2 :         sorted := keysBySuffix.Keys
     328           2 :         keysBySuffix.Keys = keysBySuffix.Keys[:0]
     329           2 : 
     330           2 :         var (
     331           2 :                 // prevSuffix is updated on each iteration of the below loop, and
     332           2 :                 // compared by the subsequent iteration to determine whether adjacent
     333           2 :                 // keys are defined at the same suffix.
     334           2 :                 prevSuffix []byte
     335           2 :                 // shadowing is set to true once any Key is shadowed by another key.
     336           2 :                 // When it's set to true—or after the loop if no keys are shadowed—the
     337           2 :                 // keysBySuffix.keys slice is resliced to contain the prefix of
     338           2 :                 // unshadowed keys. This avoids copying them incrementally in the common
     339           2 :                 // case of no shadowing.
     340           2 :                 shadowing bool
     341           2 :         )
     342           2 :         for i := range sorted {
     343           2 :                 if i > 0 && equal(prevSuffix, sorted[i].Suffix) {
     344           2 :                         // Skip; this key is shadowed by the predecessor that had a larger
     345           2 :                         // Trailer. If this is the first shadowed key, set shadowing=true
     346           2 :                         // and reslice keysBySuffix.keys to hold the entire unshadowed
     347           2 :                         // prefix.
     348           2 :                         if !shadowing {
     349           2 :                                 keysBySuffix.Keys = keysBySuffix.Keys[:i]
     350           2 :                                 shadowing = true
     351           2 :                         }
     352           2 :                         continue
     353             :                 }
     354           2 :                 prevSuffix = sorted[i].Suffix
     355           2 :                 if shadowing {
     356           2 :                         keysBySuffix.Keys = append(keysBySuffix.Keys, sorted[i])
     357           2 :                 }
     358             :         }
     359             :         // If there was no shadowing, keysBySuffix.keys is untouched. We can simply
     360             :         // set it to the existing `sorted` slice (also backed by keysBySuffix.keys).
     361           2 :         if !shadowing {
     362           2 :                 keysBySuffix.Keys = sorted
     363           2 :         }
     364             :         // If the original input `keys` slice contained a RangeKeyDelete, add it.
     365           2 :         if deleteIdx >= 0 {
     366           2 :                 keysBySuffix.Keys = append(keysBySuffix.Keys, keys[deleteIdx])
     367           2 :         }
     368           2 :         return nil
     369             : }

Generated by: LCOV version 1.14