Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 : // of this source code is governed by a BSD-style license that can be found in 3 : // the LICENSE file. 4 : 5 : package rangekey 6 : 7 : import ( 8 : "math" 9 : "slices" 10 : 11 : "github.com/cockroachdb/pebble/internal/base" 12 : "github.com/cockroachdb/pebble/internal/invariants" 13 : "github.com/cockroachdb/pebble/internal/keyspan" 14 : ) 15 : 16 : // Coalesce imposes range key semantics and coalesces range keys with the same 17 : // bounds. Coalesce drops any keys shadowed by more recent sets, unsets or 18 : // deletes. Coalesce modifies the provided span's Keys slice, reslicing the 19 : // slice to remove dropped keys. 20 : // 21 : // Coalescence has subtle behavior with respect to sequence numbers. Coalesce 22 : // depends on a keyspan.Span's Keys being sorted in sequence number descending 23 : // order. The first key has the largest sequence number. The returned coalesced 24 : // span includes only the largest sequence number. All other sequence numbers 25 : // are forgotten. When a compaction constructs output range keys from a 26 : // coalesced span, it produces at most one RANGEKEYSET, one RANGEKEYUNSET and 27 : // one RANGEKEYDEL. Each one of these keys adopt the largest sequence number. 28 : // 29 : // This has the potentially surprising effect of 'promoting' a key to a higher 30 : // sequence number. This is okay, because: 31 : // - There are no other overlapping keys within the coalesced span of 32 : // sequence numbers (otherwise they would be in the compaction, due to 33 : // the LSM invariant). 34 : // - Range key sequence numbers are never compared to point key sequence 35 : // numbers. Range keys and point keys have parallel existences. 36 : // - Compactions only coalesce within snapshot stripes. 37 : // 38 : // Additionally, internal range keys at the same sequence number have subtle 39 : // mechanics: 40 : // - RANGEKEYSETs shadow RANGEKEYUNSETs of the same suffix. 41 : // - RANGEKEYDELs only apply to keys at lower sequence numbers. 42 : // 43 : // This is required for ingestion. Ingested sstables are assigned a single 44 : // sequence number for the file, at which all of the file's keys are visible. 45 : // The RANGEKEYSET, RANGEKEYUNSET and RANGEKEYDEL key kinds are ordered such 46 : // that among keys with equal sequence numbers (thus ordered by their kinds) the 47 : // keys do not affect one another. Ingested sstables are expected to be 48 : // consistent with respect to the set/unset suffixes: A given suffix should be 49 : // set or unset but not both. 50 : // 51 : // The resulting dst Keys slice is sorted by InternalKeyTrailer. 52 1 : func Coalesce(suffixCmp base.CompareSuffixes, keys []keyspan.Key, dst *[]keyspan.Key) { 53 1 : // TODO(jackson): Currently, Coalesce doesn't actually perform the sequence 54 1 : // number promotion described in the comment above. 55 1 : *dst = CoalesceInto(suffixCmp, (*dst)[:0], math.MaxUint64, keys) 56 1 : // Update the span with the (potentially reduced) keys slice. coalesce left 57 1 : // the keys in *dst sorted by suffix. Re-sort them by trailer. 58 1 : keyspan.SortKeysByTrailer(*dst) 59 1 : } 60 : 61 : // CoalesceInto is a variant of Coalesce which outputs the results into dst 62 : // without sorting them. 63 : func CoalesceInto( 64 : suffixCmp base.CompareSuffixes, dst []keyspan.Key, snapshot base.SeqNum, keys []keyspan.Key, 65 1 : ) []keyspan.Key { 66 1 : dst = dst[:0] 67 1 : // First, enforce visibility and RangeKeyDelete mechanics. We only need to 68 1 : // consider the prefix of keys before and including the first 69 1 : // RangeKeyDelete. We also must skip any keys that aren't visible at the 70 1 : // provided snapshot sequence number. 71 1 : // 72 1 : // NB: Within a given sequence number, keys are ordered as: 73 1 : // RangeKeySet > RangeKeyUnset > RangeKeyDelete 74 1 : // This is significant, because this ensures that a Set or Unset sharing a 75 1 : // sequence number with a Delete do not shadow each other. 76 1 : deleteIdx := -1 77 1 : for i := range keys { 78 1 : if invariants.Enabled && i > 0 && keys[i].Trailer > keys[i-1].Trailer { 79 0 : panic("pebble: invariant violation: span keys unordered") 80 : } 81 1 : if !keys[i].VisibleAt(snapshot) { 82 1 : continue 83 : } 84 : // Once a RangeKeyDelete is observed, we know it shadows all subsequent 85 : // keys and we can break early. We don't add the RangeKeyDelete key to 86 : // keysBySuffix.keys yet, because we don't want a suffix-less key 87 : // that appeared earlier in the slice to elide it. It'll be added back 88 : // in at the end. 89 1 : if keys[i].Kind() == base.InternalKeyKindRangeKeyDelete { 90 1 : deleteIdx = i 91 1 : break 92 : } 93 1 : dst = append(dst, keys[i]) 94 : } 95 : 96 : // Sort the accumulated keys by suffix. There may be duplicates within a 97 : // suffix, in which case the one with a larger trailer survives. 98 : // 99 : // We use a stable sort so that the first key with a given suffix is the one 100 : // that with the highest InternalKeyTrailer (because the input `keys` was sorted by 101 : // trailer descending). 102 1 : slices.SortStableFunc(dst, func(a, b keyspan.Key) int { 103 1 : return suffixCmp(a.Suffix, b.Suffix) 104 1 : }) 105 : 106 : // Grab a handle of the full sorted slice, before reslicing 107 : // dst to accumulate the final coalesced keys. 108 1 : sorted := dst 109 1 : dst = dst[:0] 110 1 : 111 1 : var ( 112 1 : // prevSuffix is updated on each iteration of the below loop, and 113 1 : // compared by the subsequent iteration to determine whether adjacent 114 1 : // keys are defined at the same suffix. 115 1 : prevSuffix []byte 116 1 : // shadowing is set to true once any Key is shadowed by another key. 117 1 : // When it's set to true—or after the loop if no keys are shadowed—the 118 1 : // keysBySuffix.keys slice is resliced to contain the prefix of 119 1 : // unshadowed keys. This avoids copying them incrementally in the common 120 1 : // case of no shadowing. 121 1 : shadowing bool 122 1 : ) 123 1 : for i := range sorted { 124 1 : if i > 0 && suffixCmp(prevSuffix, sorted[i].Suffix) == 0 { 125 1 : // Skip; this key is shadowed by the predecessor that had a larger 126 1 : // InternalKeyTrailer. If this is the first shadowed key, set shadowing=true 127 1 : // and reslice keysBySuffix.keys to hold the entire unshadowed 128 1 : // prefix. 129 1 : if !shadowing { 130 1 : dst = dst[:i] 131 1 : shadowing = true 132 1 : } 133 1 : continue 134 : } 135 1 : prevSuffix = sorted[i].Suffix 136 1 : if shadowing { 137 1 : dst = append(dst, sorted[i]) 138 1 : } 139 : } 140 : // If there was no shadowing, dst.keys is untouched. We can simply set it to 141 : // the existing `sorted` slice (also backed by dst). 142 1 : if !shadowing { 143 1 : dst = sorted 144 1 : } 145 : // If the original input `keys` slice contained a RangeKeyDelete, add it. 146 1 : if deleteIdx >= 0 { 147 1 : dst = append(dst, keys[deleteIdx]) 148 1 : } 149 1 : return dst 150 : } 151 : 152 : // ForeignSSTTransformer implements a keyspan.Transformer for range keys in 153 : // shared ingested sstables. It is largely similar to the Transform function 154 : // implemented in UserIteratorConfig in that it calls coalesce to remove range 155 : // keys shadowed by other range keys, but also retains the range key that does 156 : // the shadowing. In addition, it elides RangeKey unsets/dels in L6 as they are 157 : // inapplicable when reading from a different Pebble instance. Finally, it 158 : // returns keys sorted in trailer order, not suffix order, as that's what the 159 : // rest of the iterator stack expects. 160 : type ForeignSSTTransformer struct { 161 : Equal base.Equal 162 : SeqNum base.SeqNum 163 : sortBuf []keyspan.Key 164 : } 165 : 166 : // Transform implements the Transformer interface. 167 : func (f *ForeignSSTTransformer) Transform( 168 : suffixCmp base.CompareSuffixes, s keyspan.Span, dst *keyspan.Span, 169 1 : ) error { 170 1 : // Apply shadowing of keys. 171 1 : dst.Start = s.Start 172 1 : dst.End = s.End 173 1 : f.sortBuf = f.sortBuf[:0] 174 1 : f.sortBuf = CoalesceInto(suffixCmp, f.sortBuf, math.MaxUint64, s.Keys) 175 1 : keys := f.sortBuf 176 1 : dst.Keys = dst.Keys[:0] 177 1 : for i := range keys { 178 1 : switch keys[i].Kind() { 179 1 : case base.InternalKeyKindRangeKeySet: 180 1 : if invariants.Enabled && len(dst.Keys) > 0 && suffixCmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { 181 0 : panic("pebble: keys unexpectedly not in ascending suffix order") 182 : } 183 0 : case base.InternalKeyKindRangeKeyUnset: 184 0 : if invariants.Enabled && len(dst.Keys) > 0 && suffixCmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { 185 0 : panic("pebble: keys unexpectedly not in ascending suffix order") 186 : } 187 1 : case base.InternalKeyKindRangeKeyDelete: 188 : // Nothing to do. 189 0 : default: 190 0 : return base.CorruptionErrorf("pebble: unrecognized range key kind %s", keys[i].Kind()) 191 : } 192 1 : dst.Keys = append(dst.Keys, keyspan.Key{ 193 1 : Trailer: base.MakeTrailer(f.SeqNum, keys[i].Kind()), 194 1 : Suffix: keys[i].Suffix, 195 1 : Value: keys[i].Value, 196 1 : }) 197 : } 198 : // coalesce results in dst.Keys being sorted by Suffix. Change it back to 199 : // ByTrailerDesc, as that's what the iterator stack will expect. 200 1 : keyspan.SortKeysByTrailer(dst.Keys) 201 1 : dst.KeysOrder = keyspan.ByTrailerDesc 202 1 : return nil 203 : }