Line data Source code
1 : // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : // Package rangekey provides facilities for encoding, decoding and merging range
6 : // keys.
7 : //
8 : // Range keys map a span of keyspan `[start, end)`, at an optional suffix, to a
9 : // value.
10 : //
11 : // # Encoding
12 : //
13 : // Unlike other Pebble keys, range keys encode several fields of information:
14 : // start key, end key, suffix and value. Internally within Pebble and its
15 : // sstables, all keys including range keys are represented as a key-value tuple.
16 : // Range keys map to internal key-value tuples by mapping the start key to the
17 : // key and encoding the remainder of the fields in the value.
18 : //
19 : // ## `RANGEKEYSET`
20 : //
21 : // A `RANGEKEYSET` represents one more range keys set over a single region of
22 : // user key space. Each represented range key must have a unique suffix. A
23 : // `RANGEKEYSET` encapsulates a start key, an end key and a set of SuffixValue
24 : // pairs.
25 : //
26 : // A `RANGEKEYSET` key's user key holds the start key. Its value is a varstring
27 : // end key, followed by a set of SuffixValue pairs. A `RANGEKEYSET` may have
28 : // multiple SuffixValue pairs if the keyspan was set at multiple unique suffix
29 : // values.
30 : //
31 : // ## `RANGEKEYUNSET`
32 : //
33 : // A `RANGEKEYUNSET` represents the removal of range keys at specific suffixes
34 : // over a single region of user key space. A `RANGEKEYUNSET` encapsulates a
35 : // start key, an end key and a set of suffixes.
36 : //
37 : // A `RANGEKEYUNSET` key's user key holds the start key. Its value is a
38 : // varstring end key, followed by a set of suffixes. A `RANGEKEYUNSET` may have
39 : // multiple suffixes if the keyspan was unset at multiple unique suffixes.
40 : //
41 : // ## `RANGEKEYDEL`
42 : //
43 : // A `RANGEKEYDEL` represents the removal of all range keys over a single region
44 : // of user key space, regardless of suffix. A `RANGEKEYDEL` encapsulates a
45 : // start key and an end key. The end key is stored in the value, without any
46 : // varstring length prefixing.
47 : package rangekey
48 :
49 : // TODO(jackson): Document the encoding of RANGEKEYSET and RANGEKEYUNSET values
50 : // once we're confident they're stable.
51 :
52 : import (
53 : "encoding/binary"
54 :
55 : "github.com/cockroachdb/errors"
56 : "github.com/cockroachdb/pebble/internal/base"
57 : "github.com/cockroachdb/pebble/internal/keyspan"
58 : )
59 :
60 : // Encode takes a Span containing only range keys. It invokes the provided
61 : // closure with the encoded internal keys that represent the Span's state. The
62 : // keys and values passed to emit are only valid until the closure returns.
63 : // If emit returns an error, Encode stops and returns the error.
64 1 : func Encode(s *keyspan.Span, emit func(k base.InternalKey, v []byte) error) error {
65 1 : enc := Encoder{Emit: emit}
66 1 : return enc.Encode(s)
67 1 : }
68 :
69 : // An Encoder encodes range keys into their on-disk InternalKey format. An
70 : // Encoder holds internal buffers, reused between Emit calls.
71 : type Encoder struct {
72 : Emit func(base.InternalKey, []byte) error
73 : buf []byte
74 : unsets [][]byte
75 : sets []SuffixValue
76 : }
77 :
78 : // Encode takes a Span containing only range keys. It invokes the Encoder's Emit
79 : // closure with the encoded internal keys that represent the Span's state. The
80 : // keys and values passed to emit are only valid until the closure returns. If
81 : // Emit returns an error, Encode stops and returns the error.
82 : //
83 : // The encoded key-value pair passed to Emit is only valid until the closure
84 : // completes.
85 1 : func (e *Encoder) Encode(s *keyspan.Span) error {
86 1 : if s.Empty() {
87 1 : return nil
88 1 : }
89 :
90 : // This for loop iterates through the span's keys, which are sorted by
91 : // sequence number descending, grouping them into sequence numbers. All keys
92 : // with identical sequence numbers are flushed together.
93 1 : var del bool
94 1 : var seqNum uint64
95 1 : for i := range s.Keys {
96 1 : if i == 0 || s.Keys[i].SeqNum() != seqNum {
97 1 : if i > 0 {
98 1 : // Flush all the existing internal keys that exist at seqNum.
99 1 : if err := e.flush(s, seqNum, del); err != nil {
100 0 : return err
101 0 : }
102 : }
103 :
104 : // Reset sets, unsets, del.
105 1 : seqNum = s.Keys[i].SeqNum()
106 1 : del = false
107 1 : e.sets = e.sets[:0]
108 1 : e.unsets = e.unsets[:0]
109 : }
110 :
111 1 : switch s.Keys[i].Kind() {
112 1 : case base.InternalKeyKindRangeKeySet:
113 1 : e.sets = append(e.sets, SuffixValue{
114 1 : Suffix: s.Keys[i].Suffix,
115 1 : Value: s.Keys[i].Value,
116 1 : })
117 1 : case base.InternalKeyKindRangeKeyUnset:
118 1 : e.unsets = append(e.unsets, s.Keys[i].Suffix)
119 1 : case base.InternalKeyKindRangeKeyDelete:
120 1 : del = true
121 0 : default:
122 0 : return base.CorruptionErrorf("pebble: %s key kind is not a range key", s.Keys[i].Kind())
123 : }
124 : }
125 1 : return e.flush(s, seqNum, del)
126 : }
127 :
128 : // flush constructs internal keys for accumulated key state, and emits the
129 : // internal keys.
130 1 : func (e *Encoder) flush(s *keyspan.Span, seqNum uint64, del bool) error {
131 1 : if len(e.sets) > 0 {
132 1 : ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeySet)
133 1 : l := EncodedSetValueLen(s.End, e.sets)
134 1 : if l > cap(e.buf) {
135 1 : e.buf = make([]byte, l)
136 1 : }
137 1 : EncodeSetValue(e.buf[:l], s.End, e.sets)
138 1 : if err := e.Emit(ik, e.buf[:l]); err != nil {
139 0 : return err
140 0 : }
141 : }
142 1 : if len(e.unsets) > 0 {
143 1 : ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyUnset)
144 1 : l := EncodedUnsetValueLen(s.End, e.unsets)
145 1 : if l > cap(e.buf) {
146 1 : e.buf = make([]byte, l)
147 1 : }
148 1 : EncodeUnsetValue(e.buf[:l], s.End, e.unsets)
149 1 : if err := e.Emit(ik, e.buf[:l]); err != nil {
150 0 : return err
151 0 : }
152 : }
153 1 : if del {
154 1 : ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyDelete)
155 1 : // s.End is stored directly in the value for RangeKeyDeletes.
156 1 : if err := e.Emit(ik, s.End); err != nil {
157 0 : return err
158 0 : }
159 : }
160 1 : return nil
161 : }
162 :
163 : // Decode takes an internal key pair encoding range key(s) and returns a decoded
164 : // keyspan containing the keys. If keysDst is provided, keys will be appended to
165 : // keysDst.
166 1 : func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) (keyspan.Span, error) {
167 1 : var s keyspan.Span
168 1 :
169 1 : // Hydrate the user key bounds.
170 1 : s.Start = ik.UserKey
171 1 : var ok bool
172 1 : s.End, v, ok = DecodeEndKey(ik.Kind(), v)
173 1 : if !ok {
174 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key end from %s", ik.Kind())
175 0 : }
176 1 : s.Keys = keysDst
177 1 :
178 1 : // Hydrate the contents of the range key(s).
179 1 : switch ik.Kind() {
180 1 : case base.InternalKeyKindRangeKeySet:
181 1 : for len(v) > 0 {
182 1 : var sv SuffixValue
183 1 : sv, v, ok = decodeSuffixValue(v)
184 1 : if !ok {
185 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key suffix-value tuple")
186 0 : }
187 1 : s.Keys = append(s.Keys, keyspan.Key{
188 1 : Trailer: ik.Trailer,
189 1 : Suffix: sv.Suffix,
190 1 : Value: sv.Value,
191 1 : })
192 : }
193 1 : case base.InternalKeyKindRangeKeyUnset:
194 1 : for len(v) > 0 {
195 1 : var suffix []byte
196 1 : suffix, v, ok = decodeSuffix(v)
197 1 : if !ok {
198 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key unset suffix")
199 0 : }
200 1 : s.Keys = append(s.Keys, keyspan.Key{
201 1 : Trailer: ik.Trailer,
202 1 : Suffix: suffix,
203 1 : })
204 : }
205 1 : case base.InternalKeyKindRangeKeyDelete:
206 1 : if len(v) > 0 {
207 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: RANGEKEYDELs must not contain additional data")
208 0 : }
209 1 : s.Keys = append(s.Keys, keyspan.Key{Trailer: ik.Trailer})
210 0 : default:
211 0 : return keyspan.Span{}, base.CorruptionErrorf("pebble: %s is not a range key", ik.Kind())
212 : }
213 1 : return s, nil
214 : }
215 :
216 : // SuffixValue represents a tuple of a suffix and a corresponding value. A
217 : // physical RANGEKEYSET key may contain many logical RangeKeySets, each
218 : // represented with a separate SuffixValue tuple.
219 : type SuffixValue struct {
220 : Suffix []byte
221 : Value []byte
222 : }
223 :
224 : // encodedSetSuffixValuesLen precomputes the length of the given slice of
225 : // SuffixValues, when encoded for a RangeKeySet. It may be used to construct a
226 : // buffer of the appropriate size before encoding.
227 1 : func encodedSetSuffixValuesLen(suffixValues []SuffixValue) int {
228 1 : var n int
229 1 : for i := 0; i < len(suffixValues); i++ {
230 1 : n += lenVarint(len(suffixValues[i].Suffix))
231 1 : n += len(suffixValues[i].Suffix)
232 1 : n += lenVarint(len(suffixValues[i].Value))
233 1 : n += len(suffixValues[i].Value)
234 1 : }
235 1 : return n
236 : }
237 :
238 : // encodeSetSuffixValues encodes a slice of SuffixValues for a RangeKeySet into
239 : // dst. The length of dst must be greater than or equal to
240 : // encodedSetSuffixValuesLen. encodeSetSuffixValues returns the number of bytes
241 : // written, which should always equal the EncodedSetValueLen with the same
242 : // arguments.
243 1 : func encodeSetSuffixValues(dst []byte, suffixValues []SuffixValue) int {
244 1 : // Encode the list of (suffix, value-len) tuples.
245 1 : var n int
246 1 : for i := 0; i < len(suffixValues); i++ {
247 1 : // Encode the length of the suffix.
248 1 : n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Suffix)))
249 1 :
250 1 : // Encode the suffix itself.
251 1 : n += copy(dst[n:], suffixValues[i].Suffix)
252 1 :
253 1 : // Encode the value length.
254 1 : n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Value)))
255 1 :
256 1 : // Encode the value itself.
257 1 : n += copy(dst[n:], suffixValues[i].Value)
258 1 : }
259 1 : return n
260 : }
261 :
262 : // EncodedSetValueLen precomputes the length of a RangeKeySet's value when
263 : // encoded. It may be used to construct a buffer of the appropriate size before
264 : // encoding.
265 1 : func EncodedSetValueLen(endKey []byte, suffixValues []SuffixValue) int {
266 1 : n := lenVarint(len(endKey))
267 1 : n += len(endKey)
268 1 : n += encodedSetSuffixValuesLen(suffixValues)
269 1 : return n
270 1 : }
271 :
272 : // EncodeSetValue encodes a RangeKeySet's value into dst. The length of dst must
273 : // be greater than or equal to EncodedSetValueLen. EncodeSetValue returns the
274 : // number of bytes written, which should always equal the EncodedSetValueLen
275 : // with the same arguments.
276 1 : func EncodeSetValue(dst []byte, endKey []byte, suffixValues []SuffixValue) int {
277 1 : // First encode the end key as a varstring.
278 1 : n := binary.PutUvarint(dst, uint64(len(endKey)))
279 1 : n += copy(dst[n:], endKey)
280 1 : n += encodeSetSuffixValues(dst[n:], suffixValues)
281 1 : return n
282 1 : }
283 :
284 : // DecodeEndKey reads the end key from the beginning of a range key (RANGEKEYSET,
285 : // RANGEKEYUNSET or RANGEKEYDEL)'s physical encoded value. Both sets and unsets
286 : // encode the range key, plus additional data in the value.
287 1 : func DecodeEndKey(kind base.InternalKeyKind, data []byte) (endKey, value []byte, ok bool) {
288 1 : switch kind {
289 1 : case base.InternalKeyKindRangeKeyDelete:
290 1 : // No splitting is necessary for range key deletes. The value is the end
291 1 : // key, and there is no additional associated value.
292 1 : return data, nil, true
293 1 : case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset:
294 1 : v, n := binary.Uvarint(data)
295 1 : if n <= 0 || uint64(n)+v >= uint64(len(data)) {
296 0 : return nil, nil, false
297 0 : }
298 1 : endKey, value = data[n:n+int(v)], data[n+int(v):]
299 1 : return endKey, value, true
300 0 : default:
301 0 : panic(errors.Newf("key kind %s is not a range key kind", kind))
302 : }
303 : }
304 :
305 : // decodeSuffixValue decodes a single encoded SuffixValue from a RangeKeySet's
306 : // split value. The end key must have already been stripped from the
307 : // RangeKeySet's value (see DecodeEndKey).
308 1 : func decodeSuffixValue(data []byte) (sv SuffixValue, rest []byte, ok bool) {
309 1 : // Decode the suffix.
310 1 : sv.Suffix, data, ok = decodeVarstring(data)
311 1 : if !ok {
312 0 : return SuffixValue{}, nil, false
313 0 : }
314 : // Decode the value.
315 1 : sv.Value, data, ok = decodeVarstring(data)
316 1 : if !ok {
317 0 : return SuffixValue{}, nil, false
318 0 : }
319 1 : return sv, data, true
320 : }
321 :
322 : // encodedUnsetSuffixesLen precomputes the length of the given slice of
323 : // suffixes, when encoded for a RangeKeyUnset. It may be used to construct a
324 : // buffer of the appropriate size before encoding.
325 1 : func encodedUnsetSuffixesLen(suffixes [][]byte) int {
326 1 : var n int
327 1 : for i := 0; i < len(suffixes); i++ {
328 1 : n += lenVarint(len(suffixes[i]))
329 1 : n += len(suffixes[i])
330 1 : }
331 1 : return n
332 : }
333 :
334 : // encodeUnsetSuffixes encodes a slice of suffixes for a RangeKeyUnset into dst.
335 : // The length of dst must be greater than or equal to EncodedUnsetSuffixesLen.
336 : // EncodeUnsetSuffixes returns the number of bytes written, which should always
337 : // equal the EncodedUnsetSuffixesLen with the same arguments.
338 1 : func encodeUnsetSuffixes(dst []byte, suffixes [][]byte) int {
339 1 : // Encode the list of (suffix, value-len) tuples.
340 1 : var n int
341 1 : for i := 0; i < len(suffixes); i++ {
342 1 : // Encode the length of the suffix.
343 1 : n += binary.PutUvarint(dst[n:], uint64(len(suffixes[i])))
344 1 :
345 1 : // Encode the suffix itself.
346 1 : n += copy(dst[n:], suffixes[i])
347 1 : }
348 1 : return n
349 : }
350 :
351 : // EncodedUnsetValueLen precomputes the length of a RangeKeyUnset's value when
352 : // encoded. It may be used to construct a buffer of the appropriate size before
353 : // encoding.
354 1 : func EncodedUnsetValueLen(endKey []byte, suffixes [][]byte) int {
355 1 : n := lenVarint(len(endKey))
356 1 : n += len(endKey)
357 1 : n += encodedUnsetSuffixesLen(suffixes)
358 1 : return n
359 1 : }
360 :
361 : // EncodeUnsetValue encodes a RangeKeyUnset's value into dst. The length of dst
362 : // must be greater than or equal to EncodedUnsetValueLen. EncodeUnsetValue
363 : // returns the number of bytes written, which should always equal the
364 : // EncodedUnsetValueLen with the same arguments.
365 1 : func EncodeUnsetValue(dst []byte, endKey []byte, suffixes [][]byte) int {
366 1 : // First encode the end key as a varstring.
367 1 : n := binary.PutUvarint(dst, uint64(len(endKey)))
368 1 : n += copy(dst[n:], endKey)
369 1 : n += encodeUnsetSuffixes(dst[n:], suffixes)
370 1 : return n
371 1 : }
372 :
373 : // decodeSuffix decodes a single suffix from the beginning of data. If decoding
374 : // suffixes from a RangeKeyUnset's value, the end key must have already been
375 : // stripped from the RangeKeyUnset's value (see DecodeEndKey).
376 1 : func decodeSuffix(data []byte) (suffix, rest []byte, ok bool) {
377 1 : return decodeVarstring(data)
378 1 : }
379 :
380 1 : func decodeVarstring(data []byte) (v, rest []byte, ok bool) {
381 1 : // Decode the length of the string.
382 1 : l, n := binary.Uvarint(data)
383 1 : if n <= 0 {
384 0 : return nil, nil, ok
385 0 : }
386 :
387 : // Extract the string itself.
388 1 : return data[n : n+int(l)], data[n+int(l):], true
389 : }
390 :
391 : // IsRangeKey returns true if the given key kind is one of the range key kinds.
392 1 : func IsRangeKey(kind base.InternalKeyKind) bool {
393 1 : switch kind {
394 : case base.InternalKeyKindRangeKeyDelete,
395 : base.InternalKeyKindRangeKeyUnset,
396 1 : base.InternalKeyKindRangeKeySet:
397 1 : return true
398 1 : default:
399 1 : return false
400 : }
401 : }
402 :
403 1 : func lenVarint(v int) (n int) {
404 1 : x := uint32(v)
405 1 : n++
406 1 : for x >= 0x80 {
407 1 : x >>= 7
408 1 : n++
409 1 : }
410 1 : return n
411 : }
|