Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package keyspan // import "github.com/cockroachdb/pebble/internal/keyspan"
6 :
7 : import (
8 : "bytes"
9 : "fmt"
10 : "slices"
11 : "sort"
12 : "strings"
13 : "unicode"
14 :
15 : "github.com/cockroachdb/pebble/internal/base"
16 : )
17 :
18 : // Span represents a set of keys over a span of user key space. All of the keys
19 : // within a Span are applied across the span's key span indicated by Start and
20 : // End. Each internal key applied over the user key span appears as a separate
21 : // Key, with its own kind and sequence number. Optionally, each Key may also
22 : // have a Suffix and/or Value.
23 : //
24 : // Note that the start user key is inclusive and the end user key is exclusive.
25 : //
26 : // Currently the only supported key kinds are:
27 : //
28 : // RANGEDEL, RANGEKEYSET, RANGEKEYUNSET, RANGEKEYDEL.
29 : type Span struct {
30 : // Start and End encode the user key range of all the contained items, with
31 : // an inclusive start key and exclusive end key. Both Start and End must be
32 : // non-nil, or both nil if representing an invalid Span.
33 : Start, End []byte
34 : // Keys holds the set of keys applied over the [Start, End) user key range.
35 : // Keys is sorted by (SeqNum, Kind) descending, unless otherwise specified
36 : // by the context. If SeqNum and Kind are equal, the order of Keys is
37 : // undefined. Keys may be empty, even if Start and End are non-nil.
38 : //
39 : // Keys are a decoded representation of the internal keys stored in batches
40 : // or sstable blocks. A single internal key in a range key block may produce
41 : // several decoded Keys.
42 : Keys []Key
43 : KeysOrder KeysOrder
44 : }
45 :
// KeysOrder identifies the sort order of the Keys held by a Span.
type KeysOrder int8

const (
	// ByTrailerDesc means the keys are sorted by InternalKeyTrailer in
	// descending order. It is the default ordering (the zero value) and the
	// one used during physical storage.
	ByTrailerDesc KeysOrder = iota
	// BySuffixAsc means the keys are sorted by Suffix in ascending order. It
	// is the ordering used during user iteration of range keys.
	BySuffixAsc
)
58 :
59 : // Key represents a single key applied over a span of user keys. A Key is
60 : // contained by a Span which specifies the span of user keys over which the Key
61 : // is applied.
62 : type Key struct {
63 : // Trailer contains the key kind and sequence number.
64 : Trailer base.InternalKeyTrailer
65 : // Suffix holds an optional suffix associated with the key. This is only
66 : // non-nil for RANGEKEYSET and RANGEKEYUNSET keys.
67 : Suffix []byte
68 : // Value holds a logical value associated with the Key. It is NOT the
69 : // internal value stored in a range key or range deletion block. This is
70 : // only non-nil for RANGEKEYSET keys.
71 : Value []byte
72 : }
73 :
74 : // SeqNum returns the sequence number component of the key.
75 1 : func (k Key) SeqNum() base.SeqNum {
76 1 : return k.Trailer.SeqNum()
77 1 : }
78 :
79 : // VisibleAt returns true if the provided key is visible at the provided
80 : // snapshot sequence number. It interprets batch sequence numbers as always
81 : // visible, because non-visible batch span keys are filtered when they're
82 : // fragmented.
83 1 : func (k Key) VisibleAt(snapshot base.SeqNum) bool {
84 1 : seq := k.SeqNum()
85 1 : return seq < snapshot || seq&base.SeqNumBatchBit != 0
86 1 : }
87 :
88 : // Kind returns the kind component of the key.
89 1 : func (k Key) Kind() base.InternalKeyKind {
90 1 : return base.InternalKeyKind(k.Trailer & 0xff)
91 1 : }
92 :
93 : // Equal returns true if this Key is equal to the given key. Two keys are said
94 : // to be equal if the two Keys have equal trailers, suffix and value. Suffix
95 : // comparison uses the provided base.Compare func. Value comparison is bytewise.
96 1 : func (k Key) Equal(equal base.Equal, b Key) bool {
97 1 : return k.Trailer == b.Trailer &&
98 1 : equal(k.Suffix, b.Suffix) &&
99 1 : bytes.Equal(k.Value, b.Value)
100 1 : }
101 :
102 : // CopyFrom copies the contents of another key, retaining the Suffix and Value slices.
103 1 : func (k *Key) CopyFrom(other Key) {
104 1 : k.Trailer = other.Trailer
105 1 : k.Suffix = append(k.Suffix[:0], other.Suffix...)
106 1 : k.Value = append(k.Value[:0], other.Value...)
107 1 : }
108 :
109 : // Clone creates a deep clone of the key, copying the Suffix and Value
110 : // slices.
111 1 : func (k Key) Clone() Key {
112 1 : res := Key{
113 1 : Trailer: k.Trailer,
114 1 : }
115 1 : if len(k.Suffix) > 0 {
116 1 : res.Suffix = slices.Clone(k.Suffix)
117 1 : }
118 1 : if len(k.Value) > 0 {
119 1 : res.Value = slices.Clone(k.Value)
120 1 : }
121 1 : return res
122 : }
123 :
124 1 : func (k Key) String() string {
125 1 : var b strings.Builder
126 1 : fmt.Fprintf(&b, "(#%d,%s", k.SeqNum(), k.Kind())
127 1 : if len(k.Suffix) > 0 || len(k.Value) > 0 {
128 1 : fmt.Fprintf(&b, ",%s", k.Suffix)
129 1 : }
130 1 : if len(k.Value) > 0 {
131 1 : fmt.Fprintf(&b, ",%s", k.Value)
132 1 : }
133 1 : b.WriteString(")")
134 1 : return b.String()
135 : }
136 :
137 : // Valid returns true if the span is defined.
138 1 : func (s *Span) Valid() bool {
139 1 : return s.Start != nil && s.End != nil
140 1 : }
141 :
142 : // Empty returns true if the span does not contain any keys. An empty span may
143 : // still be Valid. A non-empty span must be Valid.
144 : //
145 : // An Empty span may be produced by Visible, or be produced by iterators in
146 : // order to surface the gaps between keys.
147 1 : func (s *Span) Empty() bool {
148 1 : return s == nil || len(s.Keys) == 0
149 1 : }
150 :
151 : // Bounds returns Start and End as UserKeyBounds.
152 0 : func (s *Span) Bounds() base.UserKeyBounds {
153 0 : return base.UserKeyBoundsEndExclusive(s.Start, s.End)
154 0 : }
155 :
156 : // SmallestKey returns the smallest internal key defined by the span's keys.
157 : // It requires the Span's keys be in ByTrailerDesc order. It panics if the span
158 : // contains no keys or its keys are sorted in a different order.
159 1 : func (s *Span) SmallestKey() base.InternalKey {
160 1 : if len(s.Keys) == 0 {
161 0 : panic("pebble: Span contains no keys")
162 1 : } else if s.KeysOrder != ByTrailerDesc {
163 0 : panic("pebble: span's keys unexpectedly not in trailer order")
164 : }
165 : // The first key has the highest (sequence number,kind) tuple.
166 1 : return base.InternalKey{
167 1 : UserKey: s.Start,
168 1 : Trailer: s.Keys[0].Trailer,
169 1 : }
170 : }
171 :
172 : // LargestKey returns the largest internal key defined by the span's keys. The
173 : // returned key will always be a "sentinel key" at the end boundary. The
174 : // "sentinel key" models the exclusive end boundary by returning an InternalKey
175 : // with the maximal sequence number, ensuring all InternalKeys with the same
176 : // user key sort after the sentinel key.
177 : //
178 : // It requires the Span's keys be in ByTrailerDesc order. It panics if the span
179 : // contains no keys or its keys are sorted in a different order.
180 1 : func (s *Span) LargestKey() base.InternalKey {
181 1 : if len(s.Keys) == 0 {
182 0 : panic("pebble: Span contains no keys")
183 1 : } else if s.KeysOrder != ByTrailerDesc {
184 0 : panic("pebble: span's keys unexpectedly not in trailer order")
185 : }
186 : // The last key has the lowest (sequence number,kind) tuple.
187 1 : kind := s.Keys[len(s.Keys)-1].Kind()
188 1 : return base.MakeExclusiveSentinelKey(kind, s.End)
189 : }
190 :
191 : // SmallestSeqNum returns the smallest sequence number of a key contained within
192 : // the span. It requires the Span's keys be in ByTrailerDesc order. It panics if
193 : // the span contains no keys or its keys are sorted in a different order.
194 1 : func (s *Span) SmallestSeqNum() base.SeqNum {
195 1 : if len(s.Keys) == 0 {
196 0 : panic("pebble: Span contains no keys")
197 1 : } else if s.KeysOrder != ByTrailerDesc {
198 0 : panic("pebble: span's keys unexpectedly not in trailer order")
199 : }
200 :
201 1 : return s.Keys[len(s.Keys)-1].SeqNum()
202 : }
203 :
204 : // LargestSeqNum returns the largest sequence number of a key contained within
205 : // the span. It requires the Span's keys be in ByTrailerDesc order. It panics if
206 : // the span contains no keys or its keys are sorted in a different order.
207 1 : func (s *Span) LargestSeqNum() base.SeqNum {
208 1 : if len(s.Keys) == 0 {
209 0 : panic("pebble: Span contains no keys")
210 1 : } else if s.KeysOrder != ByTrailerDesc {
211 0 : panic("pebble: span's keys unexpectedly not in trailer order")
212 : }
213 1 : return s.Keys[0].SeqNum()
214 : }
215 :
216 : // LargestVisibleSeqNum returns the largest sequence number of a key contained
217 : // within the span that's also visible at the provided snapshot sequence number.
218 : // It requires the Span's keys be in ByTrailerDesc order. It panics if the span
219 : // contains no keys or its keys are sorted in a different order.
220 1 : func (s *Span) LargestVisibleSeqNum(snapshot base.SeqNum) (largest base.SeqNum, ok bool) {
221 1 : if s == nil {
222 1 : return 0, false
223 1 : } else if len(s.Keys) == 0 {
224 0 : panic("pebble: Span contains no keys")
225 1 : } else if s.KeysOrder != ByTrailerDesc {
226 0 : panic("pebble: span's keys unexpectedly not in trailer order")
227 : }
228 1 : for i := range s.Keys {
229 1 : if s.Keys[i].VisibleAt(snapshot) {
230 1 : return s.Keys[i].SeqNum(), true
231 1 : }
232 : }
233 1 : return 0, false
234 : }
235 :
236 : // TODO(jackson): Replace most of the calls to Visible with more targeted calls
237 : // that avoid the need to construct a new Span.
238 :
239 : // Visible returns a span with the subset of keys visible at the provided
240 : // sequence number. It requires the Span's keys be in ByTrailerDesc order. It
241 : // panics if the span's keys are sorted in a different order.
242 : //
243 : // Visible may incur an allocation, so callers should prefer targeted,
244 : // non-allocating methods when possible.
245 1 : func (s Span) Visible(snapshot base.SeqNum) Span {
246 1 : if s.KeysOrder != ByTrailerDesc {
247 0 : panic("pebble: span's keys unexpectedly not in trailer order")
248 : }
249 :
250 1 : ret := Span{Start: s.Start, End: s.End}
251 1 : if len(s.Keys) == 0 {
252 0 : return ret
253 0 : }
254 :
255 : // Keys from indexed batches may force an allocation. The Keys slice is
256 : // ordered by sequence number, so ordinarily we can return the trailing
257 : // subslice containing keys with sequence numbers less than `seqNum`.
258 : //
259 : // However, batch keys are special. Only visible batch keys are included
260 : // when an Iterator's batch spans are fragmented. They must always be
261 : // visible.
262 : //
263 : // Batch keys can create a sandwich of visible batch keys at the beginning
264 : // of the slice and visible committed keys at the end of the slice, forcing
265 : // us to allocate a new slice and copy the contents.
266 : //
267 : // Care is taking to only incur an allocation only when batch keys and
268 : // visible keys actually sandwich non-visible keys.
269 :
270 : // lastBatchIdx and lastNonVisibleIdx are set to the last index of a batch
271 : // key and a non-visible key respectively.
272 1 : lastBatchIdx := -1
273 1 : lastNonVisibleIdx := -1
274 1 : for i := range s.Keys {
275 1 : if seqNum := s.Keys[i].SeqNum(); seqNum&base.SeqNumBatchBit != 0 {
276 1 : // Batch key. Always visible.
277 1 : lastBatchIdx = i
278 1 : } else if seqNum >= snapshot {
279 1 : // This key is not visible.
280 1 : lastNonVisibleIdx = i
281 1 : }
282 : }
283 :
284 : // In the following comments: b = batch, h = hidden, v = visible (committed).
285 1 : switch {
286 1 : case lastNonVisibleIdx == -1:
287 1 : // All keys are visible.
288 1 : //
289 1 : // [b b b], [v v v] and [b b b v v v]
290 1 : ret.Keys = s.Keys
291 1 : case lastBatchIdx == -1:
292 1 : // There are no batch keys, so we can return the continuous subslice
293 1 : // starting after the last non-visible Key.
294 1 : //
295 1 : // h h h [v v v]
296 1 : ret.Keys = s.Keys[lastNonVisibleIdx+1:]
297 1 : case lastNonVisibleIdx == len(s.Keys)-1:
298 1 : // While we have a batch key and non-visible keys, there are no
299 1 : // committed visible keys. The 'sandwich' is missing the bottom layer,
300 1 : // so we can return the continuous sublice at the beginning.
301 1 : //
302 1 : // [b b b] h h h
303 1 : ret.Keys = s.Keys[0 : lastBatchIdx+1]
304 1 : default:
305 1 : // This is the problematic sandwich case. Allocate a new slice, copying
306 1 : // the batch keys and the visible keys into it.
307 1 : //
308 1 : // [b b b] h h h [v v v]
309 1 : ret.Keys = make([]Key, (lastBatchIdx+1)+(len(s.Keys)-lastNonVisibleIdx-1))
310 1 : copy(ret.Keys, s.Keys[:lastBatchIdx+1])
311 1 : copy(ret.Keys[lastBatchIdx+1:], s.Keys[lastNonVisibleIdx+1:])
312 : }
313 1 : return ret
314 : }
315 :
316 : // VisibleAt returns true if the span contains a key visible at the provided
317 : // snapshot. Keys with sequence numbers with the batch bit set are treated as
318 : // always visible.
319 : //
320 : // VisibleAt requires the Span's keys be in ByTrailerDesc order. It panics if
321 : // the span's keys are sorted in a different order.
322 1 : func (s *Span) VisibleAt(snapshot base.SeqNum) bool {
323 1 : if s.KeysOrder != ByTrailerDesc {
324 0 : panic("pebble: span's keys unexpectedly not in trailer order")
325 : }
326 1 : if len(s.Keys) == 0 {
327 0 : return false
328 1 : } else if first := s.Keys[0].SeqNum(); first&base.SeqNumBatchBit != 0 {
329 1 : // Only visible batch keys are included when an Iterator's batch spans
330 1 : // are fragmented. They must always be visible.
331 1 : return true
332 1 : } else {
333 1 : // Otherwise we check the last key. Since keys are ordered decreasing in
334 1 : // sequence number, the last key has the lowest sequence number of any
335 1 : // of the span's keys. If any of the keys are visible, the last key must
336 1 : // be visible. Or put differently: if the last key is not visible, then
337 1 : // no key is visible.
338 1 : return s.Keys[len(s.Keys)-1].SeqNum() < snapshot
339 1 : }
340 : }
341 :
342 : // Clone clones the span, creating copies of all contained slices. Clone is
343 : // allocation heavy and should not be used in hot paths.
344 1 : func (s *Span) Clone() Span {
345 1 : c := Span{
346 1 : Start: slices.Clone(s.Start),
347 1 : End: slices.Clone(s.End),
348 1 : KeysOrder: s.KeysOrder,
349 1 : }
350 1 : c.Keys = make([]Key, len(s.Keys))
351 1 : for i := range c.Keys {
352 1 : c.Keys[i] = s.Keys[i].Clone()
353 1 : }
354 1 : return c
355 : }
356 :
357 : // Contains returns true if the specified key resides within the span's bounds.
358 1 : func (s *Span) Contains(cmp base.Compare, key []byte) bool {
359 1 : return cmp(s.Start, key) <= 0 && cmp(key, s.End) < 0
360 1 : }
361 :
362 : // Covers returns true if the span covers keys at seqNum.
363 : //
364 : // Covers requires the Span's keys be in ByTrailerDesc order. It panics if the
365 : // span's keys are sorted in a different order.
366 1 : func (s Span) Covers(seqNum base.SeqNum) bool {
367 1 : if s.KeysOrder != ByTrailerDesc {
368 0 : panic("pebble: span's keys unexpectedly not in trailer order")
369 : }
370 1 : return !s.Empty() && s.Keys[0].SeqNum() > seqNum
371 : }
372 :
373 : // CoversAt returns true if the span contains a key that is visible at the
374 : // provided snapshot sequence number, and that key's sequence number is higher
375 : // than seqNum.
376 : //
377 : // Keys with sequence numbers with the batch bit set are treated as always
378 : // visible.
379 : //
380 : // CoversAt requires the Span's keys be in ByTrailerDesc order. It panics if the
381 : // span's keys are sorted in a different order.
382 1 : func (s *Span) CoversAt(snapshot, seqNum base.SeqNum) bool {
383 1 : if s.KeysOrder != ByTrailerDesc {
384 0 : panic("pebble: span's keys unexpectedly not in trailer order")
385 : }
386 : // NB: A key is visible at `snapshot` if its sequence number is strictly
387 : // less than `snapshot`. See base.Visible.
388 1 : for i := range s.Keys {
389 1 : if kseq := s.Keys[i].SeqNum(); kseq&base.SeqNumBatchBit != 0 {
390 1 : // Only visible batch keys are included when an Iterator's batch spans
391 1 : // are fragmented. They must always be visible.
392 1 : return kseq > seqNum
393 1 : } else if kseq < snapshot {
394 1 : return kseq > seqNum
395 1 : }
396 : }
397 1 : return false
398 : }
399 :
400 : // Reset clears the span's Start, End, and Keys fields, retaining the slices for
401 : // reuse.
402 1 : func (s *Span) Reset() {
403 1 : s.Start = s.Start[:0]
404 1 : s.End = s.End[:0]
405 1 : s.Keys = s.Keys[:0]
406 1 : }
407 :
408 : // CopyFrom deep-copies the contents of the other span, retaining the slices
409 : // allocated in this span.
410 1 : func (s *Span) CopyFrom(other *Span) {
411 1 : s.Start = append(s.Start[:0], other.Start...)
412 1 : s.End = append(s.End[:0], other.End...)
413 1 :
414 1 : // We want to preserve any existing Suffix/Value buffers.
415 1 : if cap(s.Keys) >= len(other.Keys) {
416 1 : s.Keys = s.Keys[:len(other.Keys)]
417 1 : } else {
418 1 : s.Keys = append(s.Keys[:cap(s.Keys)], make([]Key, len(other.Keys)-cap(s.Keys))...)
419 1 : }
420 1 : for i := range other.Keys {
421 1 : s.Keys[i].CopyFrom(other.Keys[i])
422 1 : }
423 :
424 1 : s.KeysOrder = other.KeysOrder
425 : }
426 :
427 : // String returns a string representation of the span.
428 1 : func (s Span) String() string {
429 1 : return fmt.Sprint(prettySpan{Span: s, formatKey: base.DefaultFormatter})
430 1 : }
431 :
432 : // Pretty returns a formatter for the span.
433 1 : func (s Span) Pretty(f base.FormatKey) fmt.Formatter {
434 1 : // TODO(jackson): Take a base.FormatValue to format Key.Value too.
435 1 : return prettySpan{s, f}
436 1 : }
437 :
438 : type prettySpan struct {
439 : Span
440 : formatKey base.FormatKey
441 : }
442 :
443 1 : func (s prettySpan) Format(fs fmt.State, c rune) {
444 1 : if !s.Valid() {
445 1 : fmt.Fprintf(fs, "<invalid>")
446 1 : return
447 1 : }
448 1 : fmt.Fprintf(fs, "%s-%s:{", s.formatKey(s.Start), s.formatKey(s.End))
449 1 : for i, k := range s.Keys {
450 1 : if i > 0 {
451 1 : fmt.Fprint(fs, " ")
452 1 : }
453 1 : fmt.Fprint(fs, k.String())
454 : }
455 1 : fmt.Fprintf(fs, "}")
456 : }
457 :
458 : // SortKeysByTrailer sorts a keys slice by trailer.
459 1 : func SortKeysByTrailer(keys *[]Key) {
460 1 : // NB: keys is a pointer to a slice instead of a slice to avoid `sorted`
461 1 : // escaping to the heap.
462 1 : sorted := (*keysBySeqNumKind)(keys)
463 1 : sort.Sort(sorted)
464 1 : }
465 :
466 : // KeysBySuffix implements sort.Interface, sorting its member Keys slice to by
467 : // Suffix in the order dictated by Cmp.
468 : type KeysBySuffix struct {
469 : Cmp base.Compare
470 : Keys []Key
471 : }
472 :
473 1 : func (s *KeysBySuffix) Len() int { return len(s.Keys) }
474 1 : func (s *KeysBySuffix) Less(i, j int) bool { return s.Cmp(s.Keys[i].Suffix, s.Keys[j].Suffix) < 0 }
475 1 : func (s *KeysBySuffix) Swap(i, j int) { s.Keys[i], s.Keys[j] = s.Keys[j], s.Keys[i] }
476 :
477 : // ParseSpan parses the string representation of a Span. It's intended for
478 : // tests. ParseSpan panics if passed a malformed span representation.
479 1 : func ParseSpan(input string) Span {
480 1 : var s Span
481 1 : parts := strings.FieldsFunc(input, func(r rune) bool {
482 1 : switch r {
483 1 : case '-', ':', '{', '}':
484 1 : return true
485 1 : default:
486 1 : return unicode.IsSpace(r)
487 : }
488 : })
489 1 : s.Start, s.End = []byte(parts[0]), []byte(parts[1])
490 1 :
491 1 : // Each of the remaining parts represents a single Key.
492 1 : s.Keys = make([]Key, 0, len(parts)-2)
493 1 : for _, p := range parts[2:] {
494 1 : if len(p) >= 2 && p[0] == '(' && p[len(p)-1] == ')' {
495 1 : p = p[1 : len(p)-1]
496 1 : }
497 1 : keyFields := strings.FieldsFunc(p, func(r rune) bool {
498 1 : switch r {
499 1 : case '#', ',':
500 1 : return true
501 1 : default:
502 1 : return unicode.IsSpace(r)
503 : }
504 : })
505 :
506 1 : var k Key
507 1 : seqNum := base.ParseSeqNum(keyFields[0])
508 1 : kind := base.ParseKind(keyFields[1])
509 1 : k.Trailer = base.MakeTrailer(seqNum, kind)
510 1 : // Parse the optional suffix.
511 1 : if len(keyFields) >= 3 {
512 1 : k.Suffix = []byte(keyFields[2])
513 1 : }
514 : // Parse the optional value.
515 1 : if len(keyFields) >= 4 {
516 1 : k.Value = []byte(keyFields[3])
517 1 : }
518 1 : s.Keys = append(s.Keys, k)
519 : }
520 1 : for i := 1; i < len(s.Keys); i++ {
521 1 : if s.Keys[i-1].Trailer < s.Keys[i].Trailer {
522 0 : panic(fmt.Sprintf("span keys not sorted: %s %s", s.Keys[i-1], s.Keys[i]))
523 : }
524 : }
525 1 : s.KeysOrder = ByTrailerDesc
526 1 : return s
527 : }
|