Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : // Package rangekey provides facilities for encoding, decoding and merging range
6 : // keys.
7 : //
8 : // Range keys map a span of keyspan `[start, end)`, at an optional suffix, to a
9 : // value.
10 : //
11 : // # Encoding
12 : //
13 : // Unlike other Pebble keys, range keys encode several fields of information:
14 : // start key, end key, suffix and value. Internally within Pebble and its
15 : // sstables, all keys including range keys are represented as a key-value tuple.
16 : // Range keys map to internal key-value tuples by mapping the start key to the
17 : // key and encoding the remainder of the fields in the value.
18 : //
19 : // ## `RANGEKEYSET`
20 : //
21 : // A `RANGEKEYSET` represents one more range keys set over a single region of
22 : // user key space. Each represented range key must have a unique suffix. A
23 : // `RANGEKEYSET` encapsulates a start key, an end key and a set of SuffixValue
24 : // pairs.
25 : //
26 : // A `RANGEKEYSET` key's user key holds the start key. Its value is a varstring
27 : // end key, followed by a set of SuffixValue pairs. A `RANGEKEYSET` may have
28 : // multiple SuffixValue pairs if the keyspan was set at multiple unique suffix
29 : // values.
30 : //
31 : // ## `RANGEKEYUNSET`
32 : //
33 : // A `RANGEKEYUNSET` represents the removal of range keys at specific suffixes
34 : // over a single region of user key space. A `RANGEKEYUNSET` encapsulates a
35 : // start key, an end key and a set of suffixes.
36 : //
37 : // A `RANGEKEYUNSET` key's user key holds the start key. Its value is a
38 : // varstring end key, followed by a set of suffixes. A `RANGEKEYUNSET` may have
39 : // multiple suffixes if the keyspan was unset at multiple unique suffixes.
40 : //
41 : // ## `RANGEKEYDEL`
42 : //
43 : // A `RANGEKEYDEL` represents the removal of all range keys over a single region
44 : // of user key space, regardless of suffix. A `RANGEKEYDEL` encapsulates a
45 : // start key and an end key. The end key is stored in the value, without any
46 : // varstring length prefixing.
47 : package rangekey
48 :
49 : // TODO(jackson): Document the encoding of RANGEKEYSET and RANGEKEYUNSET values
50 : // once we're confident they're stable.
51 :
52 : import (
53 : "encoding/binary"
54 :
55 : "github.com/cockroachdb/pebble/internal/base"
56 : "github.com/cockroachdb/pebble/internal/invariants"
57 : "github.com/cockroachdb/pebble/internal/keyspan"
58 : )
59 :
60 : // Encode takes a Span containing only range keys. It invokes the provided
61 : // closure with the encoded internal keys that represent the Span's state. The
62 : // keys and values passed to emit are only valid until the closure returns.
63 : // If emit returns an error, Encode stops and returns the error.
64 0 : func Encode(s *keyspan.Span, emit func(k base.InternalKey, v []byte) error) error {
65 0 : enc := Encoder{Emit: emit}
66 0 : return enc.Encode(s)
67 0 : }
68 :
69 : // An Encoder encodes range keys into their on-disk InternalKey format. An
70 : // Encoder holds internal buffers, reused between Emit calls.
71 : type Encoder struct {
72 : Emit func(base.InternalKey, []byte) error
73 : buf []byte
74 : unsets [][]byte
75 : sets []SuffixValue
76 : }
77 :
78 : // Encode takes a Span containing only range keys. It invokes the Encoder's Emit
79 : // closure with the encoded internal keys that represent the Span's state. The
80 : // keys and values passed to emit are only valid until the closure returns. If
81 : // Emit returns an error, Encode stops and returns the error.
82 : //
83 : // The encoded key-value pair passed to Emit is only valid until the closure
84 : // completes.
85 1 : func (e *Encoder) Encode(s *keyspan.Span) error {
86 1 : if s.Empty() {
87 0 : return nil
88 0 : }
89 :
90 : // This for loop iterates through the span's keys, which are sorted by
91 : // sequence number descending, grouping them into sequence numbers. All keys
92 : // with identical sequence numbers are flushed together.
93 1 : var del bool
94 1 : var seqNum base.SeqNum
95 1 : for i := range s.Keys {
96 1 : if i == 0 || s.Keys[i].SeqNum() != seqNum {
97 1 : if i > 0 {
98 1 : // Flush all the existing internal keys that exist at seqNum.
99 1 : if err := e.flush(s, seqNum, del); err != nil {
100 0 : return err
101 0 : }
102 : }
103 :
104 : // Reset sets, unsets, del.
105 1 : seqNum = s.Keys[i].SeqNum()
106 1 : del = false
107 1 : e.sets = e.sets[:0]
108 1 : e.unsets = e.unsets[:0]
109 : }
110 :
111 1 : switch s.Keys[i].Kind() {
112 1 : case base.InternalKeyKindRangeKeySet:
113 1 : e.sets = append(e.sets, SuffixValue{
114 1 : Suffix: s.Keys[i].Suffix,
115 1 : Value: s.Keys[i].Value,
116 1 : })
117 1 : case base.InternalKeyKindRangeKeyUnset:
118 1 : e.unsets = append(e.unsets, s.Keys[i].Suffix)
119 1 : case base.InternalKeyKindRangeKeyDelete:
120 1 : del = true
121 0 : default:
122 0 : return base.CorruptionErrorf("pebble: %s key kind is not a range key", s.Keys[i].Kind())
123 : }
124 : }
125 1 : return e.flush(s, seqNum, del)
126 : }
127 :
128 : // flush constructs internal keys for accumulated key state, and emits the
129 : // internal keys.
130 1 : func (e *Encoder) flush(s *keyspan.Span, seqNum base.SeqNum, del bool) error {
131 1 : if len(e.sets) > 0 {
132 1 : ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeySet)
133 1 : l := EncodedSetValueLen(s.End, e.sets)
134 1 : if l > cap(e.buf) {
135 1 : e.buf = make([]byte, l)
136 1 : }
137 1 : EncodeSetValue(e.buf[:l], s.End, e.sets)
138 1 : if err := e.Emit(ik, e.buf[:l]); err != nil {
139 0 : return err
140 0 : }
141 : }
142 1 : if len(e.unsets) > 0 {
143 1 : ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyUnset)
144 1 : l := EncodedUnsetValueLen(s.End, e.unsets)
145 1 : if l > cap(e.buf) {
146 1 : e.buf = make([]byte, l)
147 1 : }
148 1 : EncodeUnsetValue(e.buf[:l], s.End, e.unsets)
149 1 : if err := e.Emit(ik, e.buf[:l]); err != nil {
150 0 : return err
151 0 : }
152 : }
153 1 : if del {
154 1 : ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyDelete)
155 1 : // s.End is stored directly in the value for RangeKeyDeletes.
156 1 : if err := e.Emit(ik, s.End); err != nil {
157 0 : return err
158 0 : }
159 : }
160 1 : return nil
161 : }
162 :
163 : // Decode takes an internal key pair encoding range key(s) and returns a decoded
164 : // keyspan containing the keys. If keysBuf is provided, keys will be appended to
165 : // it.
166 1 : func Decode(ik base.InternalKey, v []byte, keysBuf []keyspan.Key) (keyspan.Span, error) {
167 1 : var s keyspan.Span
168 1 : s.Start = ik.UserKey
169 1 : var err error
170 1 : s.End, v, err = DecodeEndKey(ik.Kind(), v)
171 1 : if err != nil {
172 0 : return keyspan.Span{}, err
173 0 : }
174 1 : s.Keys, err = appendKeys(keysBuf, ik, v)
175 1 : if err != nil {
176 0 : return keyspan.Span{}, err
177 0 : }
178 1 : return s, nil
179 : }
180 :
181 : // DecodeIntoSpan decodes an internal key pair encoding range key(s) and appends
182 : // them to the given span. The start and end keys must match those in the span.
183 1 : func DecodeIntoSpan(cmp base.Compare, ik base.InternalKey, v []byte, s *keyspan.Span) error {
184 1 : // Hydrate the user key bounds.
185 1 : startKey := ik.UserKey
186 1 : endKey, v, err := DecodeEndKey(ik.Kind(), v)
187 1 : if err != nil {
188 0 : return err
189 0 : }
190 : // This function should only be called when ik.UserKey matches the Start of
191 : // the span we already have. If this is not the case, it is a bug in the
192 : // calling code.
193 1 : if invariants.Enabled && cmp(s.Start, startKey) != 0 {
194 0 : return base.AssertionFailedf("DecodeIntoSpan called with different start key")
195 0 : }
196 : // The value can come from disk or from the user, so we want to check the end
197 : // key in all builds.
198 1 : if cmp(s.End, endKey) != 0 {
199 0 : return base.CorruptionErrorf("pebble: corrupt range key fragmentation")
200 0 : }
201 1 : s.Keys, err = appendKeys(s.Keys, ik, v)
202 1 : return err
203 : }
204 :
205 1 : func appendKeys(buf []keyspan.Key, ik base.InternalKey, v []byte) ([]keyspan.Key, error) {
206 1 : // Hydrate the contents of the range key(s).
207 1 : switch ik.Kind() {
208 1 : case base.InternalKeyKindRangeKeySet:
209 1 : for len(v) > 0 {
210 1 : var sv SuffixValue
211 1 : var ok bool
212 1 : sv, v, ok = decodeSuffixValue(v)
213 1 : if !ok {
214 0 : return nil, base.CorruptionErrorf("pebble: unable to decode range key suffix-value tuple")
215 0 : }
216 1 : buf = append(buf, keyspan.Key{
217 1 : Trailer: ik.Trailer,
218 1 : Suffix: sv.Suffix,
219 1 : Value: sv.Value,
220 1 : })
221 : }
222 1 : case base.InternalKeyKindRangeKeyUnset:
223 1 : for len(v) > 0 {
224 1 : var suffix []byte
225 1 : var ok bool
226 1 : suffix, v, ok = decodeSuffix(v)
227 1 : if !ok {
228 0 : return nil, base.CorruptionErrorf("pebble: unable to decode range key unset suffix")
229 0 : }
230 1 : buf = append(buf, keyspan.Key{
231 1 : Trailer: ik.Trailer,
232 1 : Suffix: suffix,
233 1 : })
234 : }
235 1 : case base.InternalKeyKindRangeKeyDelete:
236 1 : if len(v) > 0 {
237 0 : return nil, base.CorruptionErrorf("pebble: RANGEKEYDELs must not contain additional data")
238 0 : }
239 1 : buf = append(buf, keyspan.Key{Trailer: ik.Trailer})
240 0 : default:
241 0 : return nil, base.CorruptionErrorf("pebble: %s is not a range key", ik.Kind())
242 : }
243 1 : return buf, nil
244 : }
245 :
246 : // SuffixValue represents a tuple of a suffix and a corresponding value. A
247 : // physical RANGEKEYSET key may contain many logical RangeKeySets, each
248 : // represented with a separate SuffixValue tuple.
249 : type SuffixValue struct {
250 : Suffix []byte
251 : Value []byte
252 : }
253 :
254 : // encodedSetSuffixValuesLen precomputes the length of the given slice of
255 : // SuffixValues, when encoded for a RangeKeySet. It may be used to construct a
256 : // buffer of the appropriate size before encoding.
257 1 : func encodedSetSuffixValuesLen(suffixValues []SuffixValue) int {
258 1 : var n int
259 1 : for i := 0; i < len(suffixValues); i++ {
260 1 : n += lenVarint(len(suffixValues[i].Suffix))
261 1 : n += len(suffixValues[i].Suffix)
262 1 : n += lenVarint(len(suffixValues[i].Value))
263 1 : n += len(suffixValues[i].Value)
264 1 : }
265 1 : return n
266 : }
267 :
268 : // encodeSetSuffixValues encodes a slice of SuffixValues for a RangeKeySet into
269 : // dst. The length of dst must be greater than or equal to
270 : // encodedSetSuffixValuesLen. encodeSetSuffixValues returns the number of bytes
271 : // written, which should always equal the EncodedSetValueLen with the same
272 : // arguments.
273 1 : func encodeSetSuffixValues(dst []byte, suffixValues []SuffixValue) int {
274 1 : // Encode the list of (suffix, value-len) tuples.
275 1 : var n int
276 1 : for i := 0; i < len(suffixValues); i++ {
277 1 : // Encode the length of the suffix.
278 1 : n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Suffix)))
279 1 :
280 1 : // Encode the suffix itself.
281 1 : n += copy(dst[n:], suffixValues[i].Suffix)
282 1 :
283 1 : // Encode the value length.
284 1 : n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Value)))
285 1 :
286 1 : // Encode the value itself.
287 1 : n += copy(dst[n:], suffixValues[i].Value)
288 1 : }
289 1 : return n
290 : }
291 :
292 : // EncodedSetValueLen precomputes the length of a RangeKeySet's value when
293 : // encoded. It may be used to construct a buffer of the appropriate size before
294 : // encoding.
295 1 : func EncodedSetValueLen(endKey []byte, suffixValues []SuffixValue) int {
296 1 : n := lenVarint(len(endKey))
297 1 : n += len(endKey)
298 1 : n += encodedSetSuffixValuesLen(suffixValues)
299 1 : return n
300 1 : }
301 :
302 : // EncodeSetValue encodes a RangeKeySet's value into dst. The length of dst must
303 : // be greater than or equal to EncodedSetValueLen. EncodeSetValue returns the
304 : // number of bytes written, which should always equal the EncodedSetValueLen
305 : // with the same arguments.
306 1 : func EncodeSetValue(dst []byte, endKey []byte, suffixValues []SuffixValue) int {
307 1 : // First encode the end key as a varstring.
308 1 : n := binary.PutUvarint(dst, uint64(len(endKey)))
309 1 : n += copy(dst[n:], endKey)
310 1 : n += encodeSetSuffixValues(dst[n:], suffixValues)
311 1 : return n
312 1 : }
313 :
314 : // DecodeEndKey reads the end key from the beginning of a range key (RANGEKEYSET,
315 : // RANGEKEYUNSET or RANGEKEYDEL)'s physical encoded value. Both sets and unsets
316 : // encode the range key, plus additional data in the value.
317 1 : func DecodeEndKey(kind base.InternalKeyKind, data []byte) (endKey, value []byte, _ error) {
318 1 : switch kind {
319 1 : case base.InternalKeyKindRangeKeyDelete:
320 1 : // No splitting is necessary for range key deletes. The value is the end
321 1 : // key, and there is no additional associated value.
322 1 : return data, nil, nil
323 :
324 1 : case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset:
325 1 : v, n := binary.Uvarint(data)
326 1 : if n <= 0 || uint64(n)+v >= uint64(len(data)) {
327 0 : return nil, nil, base.CorruptionErrorf("pebble: unable to decode range key end from %s", kind)
328 0 : }
329 1 : endKey, value = data[n:n+int(v)], data[n+int(v):]
330 1 : return endKey, value, nil
331 :
332 0 : default:
333 0 : return nil, nil, base.AssertionFailedf("key kind %s is not a range key kind", kind)
334 : }
335 : }
336 :
337 : // decodeSuffixValue decodes a single encoded SuffixValue from a RangeKeySet's
338 : // split value. The end key must have already been stripped from the
339 : // RangeKeySet's value (see DecodeEndKey).
340 1 : func decodeSuffixValue(data []byte) (sv SuffixValue, rest []byte, ok bool) {
341 1 : // Decode the suffix.
342 1 : sv.Suffix, data, ok = decodeVarstring(data)
343 1 : if !ok {
344 0 : return SuffixValue{}, nil, false
345 0 : }
346 : // Decode the value.
347 1 : sv.Value, data, ok = decodeVarstring(data)
348 1 : if !ok {
349 0 : return SuffixValue{}, nil, false
350 0 : }
351 1 : return sv, data, true
352 : }
353 :
354 : // encodedUnsetSuffixesLen precomputes the length of the given slice of
355 : // suffixes, when encoded for a RangeKeyUnset. It may be used to construct a
356 : // buffer of the appropriate size before encoding.
357 1 : func encodedUnsetSuffixesLen(suffixes [][]byte) int {
358 1 : var n int
359 1 : for i := 0; i < len(suffixes); i++ {
360 1 : n += lenVarint(len(suffixes[i]))
361 1 : n += len(suffixes[i])
362 1 : }
363 1 : return n
364 : }
365 :
366 : // encodeUnsetSuffixes encodes a slice of suffixes for a RangeKeyUnset into dst.
367 : // The length of dst must be greater than or equal to EncodedUnsetSuffixesLen.
368 : // EncodeUnsetSuffixes returns the number of bytes written, which should always
369 : // equal the EncodedUnsetSuffixesLen with the same arguments.
370 1 : func encodeUnsetSuffixes(dst []byte, suffixes [][]byte) int {
371 1 : // Encode the list of (suffix, value-len) tuples.
372 1 : var n int
373 1 : for i := 0; i < len(suffixes); i++ {
374 1 : // Encode the length of the suffix.
375 1 : n += binary.PutUvarint(dst[n:], uint64(len(suffixes[i])))
376 1 :
377 1 : // Encode the suffix itself.
378 1 : n += copy(dst[n:], suffixes[i])
379 1 : }
380 1 : return n
381 : }
382 :
383 : // EncodedUnsetValueLen precomputes the length of a RangeKeyUnset's value when
384 : // encoded. It may be used to construct a buffer of the appropriate size before
385 : // encoding.
386 1 : func EncodedUnsetValueLen(endKey []byte, suffixes [][]byte) int {
387 1 : n := lenVarint(len(endKey))
388 1 : n += len(endKey)
389 1 : n += encodedUnsetSuffixesLen(suffixes)
390 1 : return n
391 1 : }
392 :
393 : // EncodeUnsetValue encodes a RangeKeyUnset's value into dst. The length of dst
394 : // must be greater than or equal to EncodedUnsetValueLen. EncodeUnsetValue
395 : // returns the number of bytes written, which should always equal the
396 : // EncodedUnsetValueLen with the same arguments.
397 1 : func EncodeUnsetValue(dst []byte, endKey []byte, suffixes [][]byte) int {
398 1 : // First encode the end key as a varstring.
399 1 : n := binary.PutUvarint(dst, uint64(len(endKey)))
400 1 : n += copy(dst[n:], endKey)
401 1 : n += encodeUnsetSuffixes(dst[n:], suffixes)
402 1 : return n
403 1 : }
404 :
405 : // decodeSuffix decodes a single suffix from the beginning of data. If decoding
406 : // suffixes from a RangeKeyUnset's value, the end key must have already been
407 : // stripped from the RangeKeyUnset's value (see DecodeEndKey).
408 1 : func decodeSuffix(data []byte) (suffix, rest []byte, ok bool) {
409 1 : return decodeVarstring(data)
410 1 : }
411 :
412 1 : func decodeVarstring(data []byte) (v, rest []byte, ok bool) {
413 1 : // Decode the length of the string.
414 1 : l, n := binary.Uvarint(data)
415 1 : if n <= 0 {
416 0 : return nil, nil, ok
417 0 : }
418 :
419 : // Extract the string itself.
420 1 : return data[n : n+int(l)], data[n+int(l):], true
421 : }
422 :
423 : // IsRangeKey returns true if the given key kind is one of the range key kinds.
424 1 : func IsRangeKey(kind base.InternalKeyKind) bool {
425 1 : switch kind {
426 : case base.InternalKeyKindRangeKeyDelete,
427 : base.InternalKeyKindRangeKeyUnset,
428 1 : base.InternalKeyKindRangeKeySet:
429 1 : return true
430 1 : default:
431 1 : return false
432 : }
433 : }
434 :
435 1 : func lenVarint(v int) (n int) {
436 1 : x := uint32(v)
437 1 : n++
438 1 : for x >= 0x80 {
439 0 : x >>= 7
440 0 : n++
441 0 : }
442 1 : return n
443 : }
|