Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 : // of this source code is governed by a BSD-style license that can be found in 3 : // the LICENSE file. 4 : 5 : package rangekey 6 : 7 : import ( 8 : "math" 9 : "sort" 10 : 11 : "github.com/cockroachdb/pebble/internal/base" 12 : "github.com/cockroachdb/pebble/internal/invariants" 13 : "github.com/cockroachdb/pebble/internal/keyspan" 14 : ) 15 : 16 : // Coalesce imposes range key semantics and coalesces range keys with the same 17 : // bounds. Coalesce drops any keys shadowed by more recent sets, unsets or 18 : // deletes. Coalesce modifies the provided span's Keys slice, reslicing the 19 : // slice to remove dropped keys. 20 : // 21 : // Coalescence has subtle behavior with respect to sequence numbers. Coalesce 22 : // depends on a keyspan.Span's Keys being sorted in sequence number descending 23 : // order. The first key has the largest sequence number. The returned coalesced 24 : // span includes only the largest sequence number. All other sequence numbers 25 : // are forgotten. When a compaction constructs output range keys from a 26 : // coalesced span, it produces at most one RANGEKEYSET, one RANGEKEYUNSET and 27 : // one RANGEKEYDEL. Each one of these keys adopt the largest sequence number. 28 : // 29 : // This has the potentially surprising effect of 'promoting' a key to a higher 30 : // sequence number. This is okay, because: 31 : // - There are no other overlapping keys within the coalesced span of 32 : // sequence numbers (otherwise they would be in the compaction, due to 33 : // the LSM invariant). 34 : // - Range key sequence numbers are never compared to point key sequence 35 : // numbers. Range keys and point keys have parallel existences. 36 : // - Compactions only coalesce within snapshot stripes. 37 : // 38 : // Additionally, internal range keys at the same sequence number have subtle 39 : // mechanics: 40 : // - RANGEKEYSETs shadow RANGEKEYUNSETs of the same suffix. 41 : // - RANGEKEYDELs only apply to keys at lower sequence numbers. 42 : // 43 : // This is required for ingestion. Ingested sstables are assigned a single 44 : // sequence number for the file, at which all of the file's keys are visible. 45 : // The RANGEKEYSET, RANGEKEYUNSET and RANGEKEYDEL key kinds are ordered such 46 : // that among keys with equal sequence numbers (thus ordered by their kinds) the 47 : // keys do not affect one another. Ingested sstables are expected to be 48 : // consistent with respect to the set/unset suffixes: A given suffix should be 49 : // set or unset but not both. 50 : // 51 : // The resulting dst Keys slice is sorted by Trailer. 52 2 : func Coalesce(cmp base.Compare, eq base.Equal, keys []keyspan.Key, dst *[]keyspan.Key) { 53 2 : // TODO(jackson): Currently, Coalesce doesn't actually perform the sequence 54 2 : // number promotion described in the comment above. 55 2 : keysBySuffix := keyspan.KeysBySuffix{ 56 2 : Cmp: cmp, 57 2 : Keys: (*dst)[:0], 58 2 : } 59 2 : CoalesceIntoKeysBySuffix(eq, &keysBySuffix, math.MaxUint64, keys) 60 2 : // Update the span with the (potentially reduced) keys slice. coalesce left 61 2 : // the keys in *dst sorted by suffix. Re-sort them by trailer. 62 2 : *dst = keysBySuffix.Keys 63 2 : keyspan.SortKeysByTrailer(dst) 64 2 : } 65 : 66 : // CoalesceIntoKeysBySuffix is a variant of Coalesce which outputs the results into 67 : // keyspan.KeysBySuffix without sorting them. 68 : func CoalesceIntoKeysBySuffix( 69 : equal base.Equal, keysBySuffix *keyspan.KeysBySuffix, snapshot uint64, keys []keyspan.Key, 70 2 : ) { 71 2 : // First, enforce visibility and RangeKeyDelete mechanics. We only need to 72 2 : // consider the prefix of keys before and including the first 73 2 : // RangeKeyDelete. We also must skip any keys that aren't visible at the 74 2 : // provided snapshot sequence number. 75 2 : // 76 2 : // NB: Within a given sequence number, keys are ordered as: 77 2 : // RangeKeySet > RangeKeyUnset > RangeKeyDelete 78 2 : // This is significant, because this ensures that a Set or Unset sharing a 79 2 : // sequence number with a Delete do not shadow each other. 80 2 : deleteIdx := -1 81 2 : for i := range keys { 82 2 : if invariants.Enabled && i > 0 && keys[i].Trailer > keys[i-1].Trailer { 83 0 : panic("pebble: invariant violation: span keys unordered") 84 : } 85 2 : if !keys[i].VisibleAt(snapshot) { 86 2 : continue 87 : } 88 : // Once a RangeKeyDelete is observed, we know it shadows all subsequent 89 : // keys and we can break early. We don't add the RangeKeyDelete key to 90 : // keysBySuffix.keys yet, because we don't want a suffix-less key 91 : // that appeared earlier in the slice to elide it. It'll be added back 92 : // in at the end. 93 2 : if keys[i].Kind() == base.InternalKeyKindRangeKeyDelete { 94 2 : deleteIdx = i 95 2 : break 96 : } 97 2 : keysBySuffix.Keys = append(keysBySuffix.Keys, keys[i]) 98 : } 99 : 100 : // Sort the accumulated keys by suffix. There may be duplicates within a 101 : // suffix, in which case the one with a larger trailer survives. 102 : // 103 : // We use a stable sort so that the first key with a given suffix is the one 104 : // that with the highest Trailer (because the input `keys` was sorted by 105 : // trailer descending). 106 2 : sort.Stable(keysBySuffix) 107 2 : 108 2 : // Grab a handle of the full sorted slice, before reslicing 109 2 : // keysBySuffix.keys to accumulate the final coalesced keys. 110 2 : sorted := keysBySuffix.Keys 111 2 : keysBySuffix.Keys = keysBySuffix.Keys[:0] 112 2 : 113 2 : var ( 114 2 : // prevSuffix is updated on each iteration of the below loop, and 115 2 : // compared by the subsequent iteration to determine whether adjacent 116 2 : // keys are defined at the same suffix. 117 2 : prevSuffix []byte 118 2 : // shadowing is set to true once any Key is shadowed by another key. 119 2 : // When it's set to true—or after the loop if no keys are shadowed—the 120 2 : // keysBySuffix.keys slice is resliced to contain the prefix of 121 2 : // unshadowed keys. This avoids copying them incrementally in the common 122 2 : // case of no shadowing. 123 2 : shadowing bool 124 2 : ) 125 2 : for i := range sorted { 126 2 : if i > 0 && equal(prevSuffix, sorted[i].Suffix) { 127 2 : // Skip; this key is shadowed by the predecessor that had a larger 128 2 : // Trailer. If this is the first shadowed key, set shadowing=true 129 2 : // and reslice keysBySuffix.keys to hold the entire unshadowed 130 2 : // prefix. 131 2 : if !shadowing { 132 2 : keysBySuffix.Keys = keysBySuffix.Keys[:i] 133 2 : shadowing = true 134 2 : } 135 2 : continue 136 : } 137 2 : prevSuffix = sorted[i].Suffix 138 2 : if shadowing { 139 2 : keysBySuffix.Keys = append(keysBySuffix.Keys, sorted[i]) 140 2 : } 141 : } 142 : // If there was no shadowing, keysBySuffix.keys is untouched. We can simply 143 : // set it to the existing `sorted` slice (also backed by keysBySuffix.keys). 144 2 : if !shadowing { 145 2 : keysBySuffix.Keys = sorted 146 2 : } 147 : // If the original input `keys` slice contained a RangeKeyDelete, add it. 148 2 : if deleteIdx >= 0 { 149 2 : keysBySuffix.Keys = append(keysBySuffix.Keys, keys[deleteIdx]) 150 2 : } 151 : } 152 : 153 : // ForeignSSTTransformer implements a keyspan.Transformer for range keys in 154 : // shared ingested sstables. It is largely similar to the Transform function 155 : // implemented in UserIteratorConfig in that it calls coalesce to remove range 156 : // keys shadowed by other range keys, but also retains the range key that does 157 : // the shadowing. In addition, it elides RangeKey unsets/dels in L6 as they are 158 : // inapplicable when reading from a different Pebble instance. Finally, it 159 : // returns keys sorted in trailer order, not suffix order, as that's what the 160 : // rest of the iterator stack expects. 161 : type ForeignSSTTransformer struct { 162 : Equal base.Equal 163 : SeqNum uint64 164 : sortBuf keyspan.KeysBySuffix 165 : } 166 : 167 : // Transform implements the Transformer interface. 168 : func (f *ForeignSSTTransformer) Transform( 169 : cmp base.Compare, s keyspan.Span, dst *keyspan.Span, 170 2 : ) error { 171 2 : // Apply shadowing of keys. 172 2 : dst.Start = s.Start 173 2 : dst.End = s.End 174 2 : f.sortBuf = keyspan.KeysBySuffix{ 175 2 : Cmp: cmp, 176 2 : Keys: f.sortBuf.Keys[:0], 177 2 : } 178 2 : CoalesceIntoKeysBySuffix(f.Equal, &f.sortBuf, math.MaxUint64, s.Keys) 179 2 : keys := f.sortBuf.Keys 180 2 : dst.Keys = dst.Keys[:0] 181 2 : for i := range keys { 182 2 : switch keys[i].Kind() { 183 2 : case base.InternalKeyKindRangeKeySet: 184 2 : if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { 185 0 : panic("pebble: keys unexpectedly not in ascending suffix order") 186 : } 187 2 : case base.InternalKeyKindRangeKeyUnset: 188 2 : if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { 189 0 : panic("pebble: keys unexpectedly not in ascending suffix order") 190 : } 191 2 : case base.InternalKeyKindRangeKeyDelete: 192 : // Nothing to do. 193 0 : default: 194 0 : return base.CorruptionErrorf("pebble: unrecognized range key kind %s", keys[i].Kind()) 195 : } 196 2 : dst.Keys = append(dst.Keys, keyspan.Key{ 197 2 : Trailer: base.MakeTrailer(f.SeqNum, keys[i].Kind()), 198 2 : Suffix: keys[i].Suffix, 199 2 : Value: keys[i].Value, 200 2 : }) 201 : } 202 : // coalesce results in dst.Keys being sorted by Suffix. Change it back to 203 : // ByTrailerDesc, as that's what the iterator stack will expect. 204 2 : keyspan.SortKeysByTrailer(&dst.Keys) 205 2 : dst.KeysOrder = keyspan.ByTrailerDesc 206 2 : return nil 207 : }