Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package rangekey
6 :
7 : import (
8 : "bytes"
9 : "math"
10 : "sort"
11 :
12 : "github.com/cockroachdb/pebble/internal/base"
13 : "github.com/cockroachdb/pebble/internal/invariants"
14 : "github.com/cockroachdb/pebble/internal/keyspan"
15 : "github.com/cockroachdb/pebble/internal/manifest"
16 : )
17 :
18 : // UserIteratorConfig holds state for constructing the range key iterator stack
19 : // for user iteration. The range key iterator must merge range key spans across
20 : // the levels of the LSM. This merging is performed by a keyspan.MergingIter
21 : // on-the-fly. The UserIteratorConfig implements keyspan.Transformer, evaluating
22 : // range-key semantics and shadowing, so the spans returned by a MergingIter are
23 : // fully resolved.
24 : //
25 : // The MergingIter is wrapped by a BoundedIter, which elides spans that are
26 : // outside the iterator bounds (or the current prefix's bounds, during prefix
27 : // iteration mode).
28 : //
29 : // To provide determinisim during iteration, the BoundedIter is wrapped by a
30 : // DefragmentingIter that defragments abutting spans with identical
31 : // user-observable state.
32 : //
33 : // At the top-level an InterleavingIter interleaves range keys with point keys
34 : // and performs truncation to iterator bounds.
35 : //
36 : // Below is an abbreviated diagram illustrating the mechanics of a SeekGE.
37 : //
38 : // InterleavingIter.SeekGE
39 : // │
40 : // DefragmentingIter.SeekGE
41 : // │
42 : // BoundedIter.SeekGE
43 : // │
44 : // ╭────────────────┴───────────────╮
45 : // │ ├── defragmentBwd*
46 : // MergingIter.SeekGE │
47 : // │ ╰── defragmentFwd
48 : // ╰─╶╶ per level╶╶ ─╮
49 : // │
50 : // │
51 : // ├── <?>.SeekLT
52 : // │
53 : // ╰── <?>.Next
54 : type UserIteratorConfig struct {
55 : snapshot uint64
56 : comparer *base.Comparer
57 : miter keyspan.MergingIter
58 : biter keyspan.BoundedIter
59 : diter keyspan.DefragmentingIter
60 : liters [manifest.NumLevels]keyspan.LevelIter
61 : litersUsed int
62 : onlySets bool
63 : bufs *Buffers
64 : }
65 :
66 : // Buffers holds various buffers used for range key iteration. They're exposed
67 : // so that they may be pooled and reused between iterators.
68 : type Buffers struct {
69 : merging keyspan.MergingBuffers
70 : defragmenting keyspan.DefragmentingBuffers
71 : sortBuf keyspan.KeysBySuffix
72 : }
73 :
74 : // PrepareForReuse discards any excessively large buffers.
75 2 : func (bufs *Buffers) PrepareForReuse() {
76 2 : bufs.merging.PrepareForReuse()
77 2 : bufs.defragmenting.PrepareForReuse()
78 2 : }
79 :
80 : // Init initializes the range key iterator stack for user iteration. The
81 : // resulting fragment iterator applies range key semantics, defragments spans
82 : // according to their user-observable state and, if onlySets = true, removes all
83 : // Keys other than RangeKeySets describing the current state of range keys. The
84 : // resulting spans contain Keys sorted by Suffix.
85 : //
86 : // The snapshot sequence number parameter determines which keys are visible. Any
87 : // keys not visible at the provided snapshot are ignored.
88 : func (ui *UserIteratorConfig) Init(
89 : comparer *base.Comparer,
90 : snapshot uint64,
91 : lower, upper []byte,
92 : hasPrefix *bool,
93 : prefix *[]byte,
94 : onlySets bool,
95 : bufs *Buffers,
96 : iters ...keyspan.FragmentIterator,
97 2 : ) keyspan.FragmentIterator {
98 2 : ui.snapshot = snapshot
99 2 : ui.comparer = comparer
100 2 : ui.onlySets = onlySets
101 2 : ui.miter.Init(comparer.Compare, ui, &bufs.merging, iters...)
102 2 : ui.biter.Init(comparer.Compare, comparer.Split, &ui.miter, lower, upper, hasPrefix, prefix)
103 2 : ui.diter.Init(comparer, &ui.biter, ui, keyspan.StaticDefragmentReducer, &bufs.defragmenting)
104 2 : ui.litersUsed = 0
105 2 : ui.bufs = bufs
106 2 : return &ui.diter
107 2 : }
108 :
109 : // AddLevel adds a new level to the bottom of the iterator stack. AddLevel
110 : // must be called after Init and before any other method on the iterator.
111 2 : func (ui *UserIteratorConfig) AddLevel(iter keyspan.FragmentIterator) {
112 2 : ui.miter.AddLevel(iter)
113 2 : }
114 :
115 : // NewLevelIter returns a pointer to a newly allocated or reused
116 : // keyspan.LevelIter. The caller is responsible for calling Init() on this
117 : // instance.
118 2 : func (ui *UserIteratorConfig) NewLevelIter() *keyspan.LevelIter {
119 2 : if ui.litersUsed >= len(ui.liters) {
120 1 : return &keyspan.LevelIter{}
121 1 : }
122 2 : ui.litersUsed++
123 2 : return &ui.liters[ui.litersUsed-1]
124 : }
125 :
126 : // SetBounds propagates bounds to the iterator stack. The fragment iterator
127 : // interface ordinarily doesn't enforce bounds, so this is exposed as an
128 : // explicit method on the user iterator config.
129 2 : func (ui *UserIteratorConfig) SetBounds(lower, upper []byte) {
130 2 : ui.biter.SetBounds(lower, upper)
131 2 : }
132 :
133 : // Transform implements the keyspan.Transformer interface for use with a
134 : // keyspan.MergingIter. It transforms spans by resolving range keys at the
135 : // provided snapshot sequence number. Shadowing of keys is resolved (eg, removal
136 : // of unset keys, removal of keys overwritten by a set at the same suffix, etc)
137 : // and then non-RangeKeySet keys are removed. The resulting transformed spans
138 : // only contain RangeKeySets describing the state visible at the provided
139 : // sequence number, and hold their Keys sorted by Suffix.
140 2 : func (ui *UserIteratorConfig) Transform(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error {
141 2 : // Apply shadowing of keys.
142 2 : dst.Start = s.Start
143 2 : dst.End = s.End
144 2 : ui.bufs.sortBuf = keyspan.KeysBySuffix{
145 2 : Cmp: cmp,
146 2 : Keys: ui.bufs.sortBuf.Keys[:0],
147 2 : }
148 2 : if err := coalesce(ui.comparer.Equal, &ui.bufs.sortBuf, ui.snapshot, s.Keys); err != nil {
149 0 : return err
150 0 : }
151 : // During user iteration over range keys, unsets and deletes don't matter.
152 : // Remove them if onlySets = true. This step helps logical defragmentation
153 : // during iteration.
154 2 : keys := ui.bufs.sortBuf.Keys
155 2 : dst.Keys = dst.Keys[:0]
156 2 : for i := range keys {
157 2 : switch keys[i].Kind() {
158 2 : case base.InternalKeyKindRangeKeySet:
159 2 : if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 {
160 0 : panic("pebble: keys unexpectedly not in ascending suffix order")
161 : }
162 2 : dst.Keys = append(dst.Keys, keys[i])
163 2 : case base.InternalKeyKindRangeKeyUnset:
164 2 : if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 {
165 0 : panic("pebble: keys unexpectedly not in ascending suffix order")
166 : }
167 2 : if ui.onlySets {
168 2 : // Skip.
169 2 : continue
170 : }
171 1 : dst.Keys = append(dst.Keys, keys[i])
172 2 : case base.InternalKeyKindRangeKeyDelete:
173 2 : if ui.onlySets {
174 2 : // Skip.
175 2 : continue
176 : }
177 1 : dst.Keys = append(dst.Keys, keys[i])
178 0 : default:
179 0 : return base.CorruptionErrorf("pebble: unrecognized range key kind %s", keys[i].Kind())
180 : }
181 : }
182 : // coalesce results in dst.Keys being sorted by Suffix.
183 2 : dst.KeysOrder = keyspan.BySuffixAsc
184 2 : return nil
185 : }
186 :
187 : // ShouldDefragment implements the DefragmentMethod interface and configures a
188 : // DefragmentingIter to defragment spans of range keys if their user-visible
189 : // state is identical. This defragmenting method assumes the provided spans have
190 : // already been transformed through (UserIterationConfig).Transform, so all
191 : // RangeKeySets are user-visible sets and are already in Suffix order. This
192 : // defragmenter checks for equality between set suffixes and values (ignoring
193 : // sequence numbers). It's intended for use during user iteration, when the
194 : // wrapped keyspan iterator is merging spans across all levels of the LSM.
195 2 : func (ui *UserIteratorConfig) ShouldDefragment(equal base.Equal, a, b *keyspan.Span) bool {
196 2 : // This implementation must only be used on spans that have transformed by
197 2 : // ui.Transform. The transform applies shadowing, removes all keys besides
198 2 : // the resulting Sets and sorts the keys by suffix. Since shadowing has been
199 2 : // applied, each Set must set a unique suffix. If the two spans are
200 2 : // equivalent, they must have the same number of range key sets.
201 2 : if len(a.Keys) != len(b.Keys) || len(a.Keys) == 0 {
202 2 : return false
203 2 : }
204 2 : if a.KeysOrder != keyspan.BySuffixAsc || b.KeysOrder != keyspan.BySuffixAsc {
205 0 : panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
206 : }
207 :
208 2 : ret := true
209 2 : for i := range a.Keys {
210 2 : if invariants.Enabled {
211 2 : if ui.onlySets && (a.Keys[i].Kind() != base.InternalKeyKindRangeKeySet ||
212 2 : b.Keys[i].Kind() != base.InternalKeyKindRangeKeySet) {
213 0 : panic("pebble: unexpected non-RangeKeySet during defragmentation")
214 : }
215 2 : if i > 0 && (ui.comparer.Compare(a.Keys[i].Suffix, a.Keys[i-1].Suffix) < 0 ||
216 2 : ui.comparer.Compare(b.Keys[i].Suffix, b.Keys[i-1].Suffix) < 0) {
217 0 : panic("pebble: range keys not ordered by suffix during defragmentation")
218 : }
219 : }
220 2 : if !equal(a.Keys[i].Suffix, b.Keys[i].Suffix) {
221 2 : ret = false
222 2 : break
223 : }
224 2 : if !bytes.Equal(a.Keys[i].Value, b.Keys[i].Value) {
225 2 : ret = false
226 2 : break
227 : }
228 : }
229 2 : return ret
230 : }
231 :
232 : // Coalesce imposes range key semantics and coalesces range keys with the same
233 : // bounds. Coalesce drops any keys shadowed by more recent sets, unsets or
234 : // deletes. Coalesce modifies the provided span's Keys slice, reslicing the
235 : // slice to remove dropped keys.
236 : //
237 : // Coalescence has subtle behavior with respect to sequence numbers. Coalesce
238 : // depends on a keyspan.Span's Keys being sorted in sequence number descending
239 : // order. The first key has the largest sequence number. The returned coalesced
240 : // span includes only the largest sequence number. All other sequence numbers
241 : // are forgotten. When a compaction constructs output range keys from a
242 : // coalesced span, it produces at most one RANGEKEYSET, one RANGEKEYUNSET and
243 : // one RANGEKEYDEL. Each one of these keys adopt the largest sequence number.
244 : //
245 : // This has the potentially surprising effect of 'promoting' a key to a higher
246 : // sequence number. This is okay, because:
247 : // - There are no other overlapping keys within the coalesced span of
248 : // sequence numbers (otherwise they would be in the compaction, due to
249 : // the LSM invariant).
250 : // - Range key sequence numbers are never compared to point key sequence
251 : // numbers. Range keys and point keys have parallel existences.
252 : // - Compactions only coalesce within snapshot stripes.
253 : //
254 : // Additionally, internal range keys at the same sequence number have subtle
255 : // mechanics:
256 : // - RANGEKEYSETs shadow RANGEKEYUNSETs of the same suffix.
257 : // - RANGEKEYDELs only apply to keys at lower sequence numbers.
258 : //
259 : // This is required for ingestion. Ingested sstables are assigned a single
260 : // sequence number for the file, at which all of the file's keys are visible.
261 : // The RANGEKEYSET, RANGEKEYUNSET and RANGEKEYDEL key kinds are ordered such
262 : // that among keys with equal sequence numbers (thus ordered by their kinds) the
263 : // keys do not affect one another. Ingested sstables are expected to be
264 : // consistent with respect to the set/unset suffixes: A given suffix should be
265 : // set or unset but not both.
266 : //
267 : // The resulting dst Keys slice is sorted by Trailer.
268 2 : func Coalesce(cmp base.Compare, eq base.Equal, keys []keyspan.Key, dst *[]keyspan.Key) error {
269 2 : // TODO(jackson): Currently, Coalesce doesn't actually perform the sequence
270 2 : // number promotion described in the comment above.
271 2 : keysBySuffix := keyspan.KeysBySuffix{
272 2 : Cmp: cmp,
273 2 : Keys: (*dst)[:0],
274 2 : }
275 2 : if err := coalesce(eq, &keysBySuffix, math.MaxUint64, keys); err != nil {
276 0 : return err
277 0 : }
278 : // Update the span with the (potentially reduced) keys slice. coalesce left
279 : // the keys in *dst sorted by suffix. Re-sort them by trailer.
280 2 : *dst = keysBySuffix.Keys
281 2 : keyspan.SortKeysByTrailer(dst)
282 2 : return nil
283 : }
284 :
285 : func coalesce(
286 : equal base.Equal, keysBySuffix *keyspan.KeysBySuffix, snapshot uint64, keys []keyspan.Key,
287 2 : ) error {
288 2 : // First, enforce visibility and RangeKeyDelete mechanics. We only need to
289 2 : // consider the prefix of keys before and including the first
290 2 : // RangeKeyDelete. We also must skip any keys that aren't visible at the
291 2 : // provided snapshot sequence number.
292 2 : //
293 2 : // NB: Within a given sequence number, keys are ordered as:
294 2 : // RangeKeySet > RangeKeyUnset > RangeKeyDelete
295 2 : // This is significant, because this ensures that a Set or Unset sharing a
296 2 : // sequence number with a Delete do not shadow each other.
297 2 : deleteIdx := -1
298 2 : for i := range keys {
299 2 : if invariants.Enabled && i > 0 && keys[i].Trailer > keys[i-1].Trailer {
300 0 : panic("pebble: invariant violation: span keys unordered")
301 : }
302 2 : if !keys[i].VisibleAt(snapshot) {
303 2 : continue
304 : }
305 : // Once a RangeKeyDelete is observed, we know it shadows all subsequent
306 : // keys and we can break early. We don't add the RangeKeyDelete key to
307 : // keysBySuffix.keys yet, because we don't want a suffix-less key
308 : // that appeared earlier in the slice to elide it. It'll be added back
309 : // in at the end.
310 2 : if keys[i].Kind() == base.InternalKeyKindRangeKeyDelete {
311 2 : deleteIdx = i
312 2 : break
313 : }
314 2 : keysBySuffix.Keys = append(keysBySuffix.Keys, keys[i])
315 : }
316 :
317 : // Sort the accumulated keys by suffix. There may be duplicates within a
318 : // suffix, in which case the one with a larger trailer survives.
319 : //
320 : // We use a stable sort so that the first key with a given suffix is the one
321 : // that with the highest Trailer (because the input `keys` was sorted by
322 : // trailer descending).
323 2 : sort.Stable(keysBySuffix)
324 2 :
325 2 : // Grab a handle of the full sorted slice, before reslicing
326 2 : // keysBySuffix.keys to accumulate the final coalesced keys.
327 2 : sorted := keysBySuffix.Keys
328 2 : keysBySuffix.Keys = keysBySuffix.Keys[:0]
329 2 :
330 2 : var (
331 2 : // prevSuffix is updated on each iteration of the below loop, and
332 2 : // compared by the subsequent iteration to determine whether adjacent
333 2 : // keys are defined at the same suffix.
334 2 : prevSuffix []byte
335 2 : // shadowing is set to true once any Key is shadowed by another key.
336 2 : // When it's set to true—or after the loop if no keys are shadowed—the
337 2 : // keysBySuffix.keys slice is resliced to contain the prefix of
338 2 : // unshadowed keys. This avoids copying them incrementally in the common
339 2 : // case of no shadowing.
340 2 : shadowing bool
341 2 : )
342 2 : for i := range sorted {
343 2 : if i > 0 && equal(prevSuffix, sorted[i].Suffix) {
344 2 : // Skip; this key is shadowed by the predecessor that had a larger
345 2 : // Trailer. If this is the first shadowed key, set shadowing=true
346 2 : // and reslice keysBySuffix.keys to hold the entire unshadowed
347 2 : // prefix.
348 2 : if !shadowing {
349 2 : keysBySuffix.Keys = keysBySuffix.Keys[:i]
350 2 : shadowing = true
351 2 : }
352 2 : continue
353 : }
354 2 : prevSuffix = sorted[i].Suffix
355 2 : if shadowing {
356 2 : keysBySuffix.Keys = append(keysBySuffix.Keys, sorted[i])
357 2 : }
358 : }
359 : // If there was no shadowing, keysBySuffix.keys is untouched. We can simply
360 : // set it to the existing `sorted` slice (also backed by keysBySuffix.keys).
361 2 : if !shadowing {
362 2 : keysBySuffix.Keys = sorted
363 2 : }
364 : // If the original input `keys` slice contained a RangeKeyDelete, add it.
365 2 : if deleteIdx >= 0 {
366 2 : keysBySuffix.Keys = append(keysBySuffix.Keys, keys[deleteIdx])
367 2 : }
368 2 : return nil
369 : }
|