Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : // Package rangekey provides facilities for encoding, decoding and merging range
6 : // keys.
7 : //
8 : // Range keys map a span of keyspan `[start, end)`, at an optional suffix, to a
9 : // value.
10 : //
11 : // # Encoding
12 : //
13 : // Unlike other Pebble keys, range keys encode several fields of information:
14 : // start key, end key, suffix and value. Internally within Pebble and its
15 : // sstables, all keys including range keys are represented as a key-value tuple.
16 : // Range keys map to internal key-value tuples by mapping the start key to the
17 : // key and encoding the remainder of the fields in the value.
18 : //
19 : // ## `RANGEKEYSET`
20 : //
21 : // A `RANGEKEYSET` represents one more range keys set over a single region of
22 : // user key space. Each represented range key must have a unique suffix. A
23 : // `RANGEKEYSET` encapsulates a start key, an end key and a set of SuffixValue
24 : // pairs.
25 : //
26 : // A `RANGEKEYSET` key's user key holds the start key. Its value is a varstring
27 : // end key, followed by a set of SuffixValue pairs. A `RANGEKEYSET` may have
28 : // multiple SuffixValue pairs if the keyspan was set at multiple unique suffix
29 : // values.
30 : //
31 : // ## `RANGEKEYUNSET`
32 : //
33 : // A `RANGEKEYUNSET` represents the removal of range keys at specific suffixes
34 : // over a single region of user key space. A `RANGEKEYUNSET` encapsulates a
35 : // start key, an end key and a set of suffixes.
36 : //
37 : // A `RANGEKEYUNSET` key's user key holds the start key. Its value is a
38 : // varstring end key, followed by a set of suffixes. A `RANGEKEYUNSET` may have
39 : // multiple suffixes if the keyspan was unset at multiple unique suffixes.
40 : //
41 : // ## `RANGEKEYDEL`
42 : //
43 : // A `RANGEKEYDEL` represents the removal of all range keys over a single region
44 : // of user key space, regardless of suffix. A `RANGEKEYDEL` encapsulates a
45 : // start key and an end key. The end key is stored in the value, without any
46 : // varstring length prefixing.
47 : package rangekey
48 :
49 : // TODO(jackson): Document the encoding of RANGEKEYSET and RANGEKEYUNSET values
50 : // once we're confident they're stable.
51 :
52 : import (
53 : "encoding/binary"
54 :
55 : "github.com/cockroachdb/errors"
56 : "github.com/cockroachdb/pebble/internal/base"
57 : "github.com/cockroachdb/pebble/internal/keyspan"
58 : )
59 :
60 : // Encode takes a Span containing only range keys. It invokes the provided
61 : // closure with the encoded internal keys that represent the Span's state. The
62 : // keys and values passed to emit are only valid until the closure returns.
63 : // If emit returns an error, Encode stops and returns the error.
64 2 : func Encode(s *keyspan.Span, emit func(k base.InternalKey, v []byte) error) error {
65 2 : enc := Encoder{Emit: emit}
66 2 : return enc.Encode(s)
67 2 : }
68 :
69 : // An Encoder encodes range keys into their on-disk InternalKey format. An
70 : // Encoder holds internal buffers, reused between Emit calls.
71 : type Encoder struct {
72 : Emit func(base.InternalKey, []byte) error
73 : buf []byte
74 : unsets [][]byte
75 : sets []SuffixValue
76 : }
77 :
78 : // Encode takes a Span containing only range keys. It invokes the Encoder's Emit
79 : // closure with the encoded internal keys that represent the Span's state. The
80 : // keys and values passed to emit are only valid until the closure returns. If
81 : // Emit returns an error, Encode stops and returns the error.
82 : //
83 : // The encoded key-value pair passed to Emit is only valid until the closure
84 : // completes.
85 2 : func (e *Encoder) Encode(s *keyspan.Span) error {
86 2 : if s.Empty() {
87 1 : return nil
88 1 : }
89 :
90 : // This for loop iterates through the span's keys, which are sorted by
91 : // sequence number descending, grouping them into sequence numbers. All keys
92 : // with identical sequence numbers are flushed together.
93 2 : var del bool
94 2 : var seqNum uint64
95 2 : for i := range s.Keys {
96 2 : if i == 0 || s.Keys[i].SeqNum() != seqNum {
97 2 : if i > 0 {
98 2 : // Flush all the existing internal keys that exist at seqNum.
99 2 : if err := e.flush(s, seqNum, del); err != nil {
100 0 : return err
101 0 : }
102 : }
103 :
104 : // Reset sets, unsets, del.
105 2 : seqNum = s.Keys[i].SeqNum()
106 2 : del = false
107 2 : e.sets = e.sets[:0]
108 2 : e.unsets = e.unsets[:0]
109 : }
110 :
111 2 : switch s.Keys[i].Kind() {
112 2 : case base.InternalKeyKindRangeKeySet:
113 2 : e.sets = append(e.sets, SuffixValue{
114 2 : Suffix: s.Keys[i].Suffix,
115 2 : Value: s.Keys[i].Value,
116 2 : })
117 2 : case base.InternalKeyKindRangeKeyUnset:
118 2 : e.unsets = append(e.unsets, s.Keys[i].Suffix)
119 2 : case base.InternalKeyKindRangeKeyDelete:
120 2 : del = true
121 0 : default:
122 0 : return base.CorruptionErrorf("pebble: %s key kind is not a range key", s.Keys[i].Kind())
123 : }
124 : }
125 2 : return e.flush(s, seqNum, del)
126 : }
127 :
128 : // flush constructs internal keys for accumulated key state, and emits the
129 : // internal keys.
130 2 : func (e *Encoder) flush(s *keyspan.Span, seqNum uint64, del bool) error {
131 2 : if len(e.sets) > 0 {
132 2 : ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeySet)
133 2 : l := EncodedSetValueLen(s.End, e.sets)
134 2 : if l > cap(e.buf) {
135 2 : e.buf = make([]byte, l)
136 2 : }
137 2 : EncodeSetValue(e.buf[:l], s.End, e.sets)
138 2 : if err := e.Emit(ik, e.buf[:l]); err != nil {
139 0 : return err
140 0 : }
141 : }
142 2 : if len(e.unsets) > 0 {
143 2 : ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyUnset)
144 2 : l := EncodedUnsetValueLen(s.End, e.unsets)
145 2 : if l > cap(e.buf) {
146 2 : e.buf = make([]byte, l)
147 2 : }
148 2 : EncodeUnsetValue(e.buf[:l], s.End, e.unsets)
149 2 : if err := e.Emit(ik, e.buf[:l]); err != nil {
150 0 : return err
151 0 : }
152 : }
153 2 : if del {
154 2 : ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyDelete)
155 2 : // s.End is stored directly in the value for RangeKeyDeletes.
156 2 : if err := e.Emit(ik, s.End); err != nil {
157 0 : return err
158 0 : }
159 : }
160 2 : return nil
161 : }
162 :
163 : // Decode takes an internal key pair encoding range key(s) and returns a decoded
164 : // keyspan containing the keys. If keysDst is provided, keys will be appended to
165 : // keysDst.
166 2 : func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) (keyspan.Span, error) {
167 2 : var s keyspan.Span
168 2 :
169 2 : // Hydrate the user key bounds.
170 2 : s.Start = ik.UserKey
171 2 : var ok bool
172 2 : s.End, v, ok = DecodeEndKey(ik.Kind(), v)
173 2 : if !ok {
174 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key end from %s", ik.Kind())
175 0 : }
176 2 : s.Keys = keysDst
177 2 :
178 2 : // Hydrate the contents of the range key(s).
179 2 : switch ik.Kind() {
180 2 : case base.InternalKeyKindRangeKeySet:
181 2 : for len(v) > 0 {
182 2 : var sv SuffixValue
183 2 : sv, v, ok = decodeSuffixValue(v)
184 2 : if !ok {
185 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key suffix-value tuple")
186 0 : }
187 2 : s.Keys = append(s.Keys, keyspan.Key{
188 2 : Trailer: ik.Trailer,
189 2 : Suffix: sv.Suffix,
190 2 : Value: sv.Value,
191 2 : })
192 : }
193 2 : case base.InternalKeyKindRangeKeyUnset:
194 2 : for len(v) > 0 {
195 2 : var suffix []byte
196 2 : suffix, v, ok = decodeSuffix(v)
197 2 : if !ok {
198 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key unset suffix")
199 0 : }
200 2 : s.Keys = append(s.Keys, keyspan.Key{
201 2 : Trailer: ik.Trailer,
202 2 : Suffix: suffix,
203 2 : })
204 : }
205 2 : case base.InternalKeyKindRangeKeyDelete:
206 2 : if len(v) > 0 {
207 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: RANGEKEYDELs must not contain additional data")
208 0 : }
209 2 : s.Keys = append(s.Keys, keyspan.Key{Trailer: ik.Trailer})
210 0 : default:
211 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: %s is not a range key", ik.Kind())
212 : }
213 2 : return s, nil
214 : }
215 :
216 : // SuffixValue represents a tuple of a suffix and a corresponding value. A
217 : // physical RANGEKEYSET key may contain many logical RangeKeySets, each
218 : // represented with a separate SuffixValue tuple.
219 : type SuffixValue struct {
220 : Suffix []byte
221 : Value []byte
222 : }
223 :
224 : // encodedSetSuffixValuesLen precomputes the length of the given slice of
225 : // SuffixValues, when encoded for a RangeKeySet. It may be used to construct a
226 : // buffer of the appropriate size before encoding.
227 2 : func encodedSetSuffixValuesLen(suffixValues []SuffixValue) int {
228 2 : var n int
229 2 : for i := 0; i < len(suffixValues); i++ {
230 2 : n += lenVarint(len(suffixValues[i].Suffix))
231 2 : n += len(suffixValues[i].Suffix)
232 2 : n += lenVarint(len(suffixValues[i].Value))
233 2 : n += len(suffixValues[i].Value)
234 2 : }
235 2 : return n
236 : }
237 :
238 : // encodeSetSuffixValues encodes a slice of SuffixValues for a RangeKeySet into
239 : // dst. The length of dst must be greater than or equal to
240 : // encodedSetSuffixValuesLen. encodeSetSuffixValues returns the number of bytes
241 : // written, which should always equal the EncodedSetValueLen with the same
242 : // arguments.
243 2 : func encodeSetSuffixValues(dst []byte, suffixValues []SuffixValue) int {
244 2 : // Encode the list of (suffix, value-len) tuples.
245 2 : var n int
246 2 : for i := 0; i < len(suffixValues); i++ {
247 2 : // Encode the length of the suffix.
248 2 : n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Suffix)))
249 2 :
250 2 : // Encode the suffix itself.
251 2 : n += copy(dst[n:], suffixValues[i].Suffix)
252 2 :
253 2 : // Encode the value length.
254 2 : n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Value)))
255 2 :
256 2 : // Encode the value itself.
257 2 : n += copy(dst[n:], suffixValues[i].Value)
258 2 : }
259 2 : return n
260 : }
261 :
262 : // EncodedSetValueLen precomputes the length of a RangeKeySet's value when
263 : // encoded. It may be used to construct a buffer of the appropriate size before
264 : // encoding.
265 2 : func EncodedSetValueLen(endKey []byte, suffixValues []SuffixValue) int {
266 2 : n := lenVarint(len(endKey))
267 2 : n += len(endKey)
268 2 : n += encodedSetSuffixValuesLen(suffixValues)
269 2 : return n
270 2 : }
271 :
272 : // EncodeSetValue encodes a RangeKeySet's value into dst. The length of dst must
273 : // be greater than or equal to EncodedSetValueLen. EncodeSetValue returns the
274 : // number of bytes written, which should always equal the EncodedSetValueLen
275 : // with the same arguments.
276 2 : func EncodeSetValue(dst []byte, endKey []byte, suffixValues []SuffixValue) int {
277 2 : // First encode the end key as a varstring.
278 2 : n := binary.PutUvarint(dst, uint64(len(endKey)))
279 2 : n += copy(dst[n:], endKey)
280 2 : n += encodeSetSuffixValues(dst[n:], suffixValues)
281 2 : return n
282 2 : }
283 :
284 : // DecodeEndKey reads the end key from the beginning of a range key (RANGEKEYSET,
285 : // RANGEKEYUNSET or RANGEKEYDEL)'s physical encoded value. Both sets and unsets
286 : // encode the range key, plus additional data in the value.
287 2 : func DecodeEndKey(kind base.InternalKeyKind, data []byte) (endKey, value []byte, ok bool) {
288 2 : switch kind {
289 2 : case base.InternalKeyKindRangeKeyDelete:
290 2 : // No splitting is necessary for range key deletes. The value is the end
291 2 : // key, and there is no additional associated value.
292 2 : return data, nil, true
293 2 : case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset:
294 2 : v, n := binary.Uvarint(data)
295 2 : if n <= 0 || uint64(n)+v >= uint64(len(data)) {
296 0 : return nil, nil, false
297 0 : }
298 2 : endKey, value = data[n:n+int(v)], data[n+int(v):]
299 2 : return endKey, value, true
300 0 : default:
301 0 : panic(errors.Newf("key kind %s is not a range key kind", kind))
302 : }
303 : }
304 :
305 : // decodeSuffixValue decodes a single encoded SuffixValue from a RangeKeySet's
306 : // split value. The end key must have already been stripped from the
307 : // RangeKeySet's value (see DecodeEndKey).
308 2 : func decodeSuffixValue(data []byte) (sv SuffixValue, rest []byte, ok bool) {
309 2 : // Decode the suffix.
310 2 : sv.Suffix, data, ok = decodeVarstring(data)
311 2 : if !ok {
312 0 : return SuffixValue{}, nil, false
313 0 : }
314 : // Decode the value.
315 2 : sv.Value, data, ok = decodeVarstring(data)
316 2 : if !ok {
317 0 : return SuffixValue{}, nil, false
318 0 : }
319 2 : return sv, data, true
320 : }
321 :
322 : // encodedUnsetSuffixesLen precomputes the length of the given slice of
323 : // suffixes, when encoded for a RangeKeyUnset. It may be used to construct a
324 : // buffer of the appropriate size before encoding.
325 2 : func encodedUnsetSuffixesLen(suffixes [][]byte) int {
326 2 : var n int
327 2 : for i := 0; i < len(suffixes); i++ {
328 2 : n += lenVarint(len(suffixes[i]))
329 2 : n += len(suffixes[i])
330 2 : }
331 2 : return n
332 : }
333 :
334 : // encodeUnsetSuffixes encodes a slice of suffixes for a RangeKeyUnset into dst.
335 : // The length of dst must be greater than or equal to EncodedUnsetSuffixesLen.
336 : // EncodeUnsetSuffixes returns the number of bytes written, which should always
337 : // equal the EncodedUnsetSuffixesLen with the same arguments.
338 2 : func encodeUnsetSuffixes(dst []byte, suffixes [][]byte) int {
339 2 : // Encode the list of (suffix, value-len) tuples.
340 2 : var n int
341 2 : for i := 0; i < len(suffixes); i++ {
342 2 : // Encode the length of the suffix.
343 2 : n += binary.PutUvarint(dst[n:], uint64(len(suffixes[i])))
344 2 :
345 2 : // Encode the suffix itself.
346 2 : n += copy(dst[n:], suffixes[i])
347 2 : }
348 2 : return n
349 : }
350 :
351 : // EncodedUnsetValueLen precomputes the length of a RangeKeyUnset's value when
352 : // encoded. It may be used to construct a buffer of the appropriate size before
353 : // encoding.
354 2 : func EncodedUnsetValueLen(endKey []byte, suffixes [][]byte) int {
355 2 : n := lenVarint(len(endKey))
356 2 : n += len(endKey)
357 2 : n += encodedUnsetSuffixesLen(suffixes)
358 2 : return n
359 2 : }
360 :
361 : // EncodeUnsetValue encodes a RangeKeyUnset's value into dst. The length of dst
362 : // must be greater than or equal to EncodedUnsetValueLen. EncodeUnsetValue
363 : // returns the number of bytes written, which should always equal the
364 : // EncodedUnsetValueLen with the same arguments.
365 2 : func EncodeUnsetValue(dst []byte, endKey []byte, suffixes [][]byte) int {
366 2 : // First encode the end key as a varstring.
367 2 : n := binary.PutUvarint(dst, uint64(len(endKey)))
368 2 : n += copy(dst[n:], endKey)
369 2 : n += encodeUnsetSuffixes(dst[n:], suffixes)
370 2 : return n
371 2 : }
372 :
373 : // decodeSuffix decodes a single suffix from the beginning of data. If decoding
374 : // suffixes from a RangeKeyUnset's value, the end key must have already been
375 : // stripped from the RangeKeyUnset's value (see DecodeEndKey).
376 2 : func decodeSuffix(data []byte) (suffix, rest []byte, ok bool) {
377 2 : return decodeVarstring(data)
378 2 : }
379 :
380 2 : func decodeVarstring(data []byte) (v, rest []byte, ok bool) {
381 2 : // Decode the length of the string.
382 2 : l, n := binary.Uvarint(data)
383 2 : if n <= 0 {
384 0 : return nil, nil, ok
385 0 : }
386 :
387 : // Extract the string itself.
388 2 : return data[n : n+int(l)], data[n+int(l):], true
389 : }
390 :
391 : // IsRangeKey returns true if the given key kind is one of the range key kinds.
392 2 : func IsRangeKey(kind base.InternalKeyKind) bool {
393 2 : switch kind {
394 : case base.InternalKeyKindRangeKeyDelete,
395 : base.InternalKeyKindRangeKeyUnset,
396 2 : base.InternalKeyKindRangeKeySet:
397 2 : return true
398 2 : default:
399 2 : return false
400 : }
401 : }
402 :
403 2 : func lenVarint(v int) (n int) {
404 2 : x := uint32(v)
405 2 : n++
406 2 : for x >= 0x80 {
407 1 : x >>= 7
408 1 : n++
409 1 : }
410 2 : return n
411 : }
|