Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package base // import "github.com/cockroachdb/pebble/internal/base"
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "strconv"
11 : "strings"
12 : )
13 :
14 : const (
15 : // SeqNumZero is the zero sequence number, set by compactions if they can
16 : // guarantee there are no keys underneath an internal key.
17 : SeqNumZero = uint64(0)
18 : // SeqNumStart is the first sequence number assigned to a key. Sequence
19 : // numbers 1-9 are reserved for potential future use.
20 : SeqNumStart = uint64(10)
21 : )
22 :
23 : // InternalKeyKind enumerates the kind of key: a deletion tombstone, a set
24 : // value, a merged value, etc.
25 : type InternalKeyKind uint8
26 :
27 : // These constants are part of the file format, and should not be changed.
28 : const (
29 : InternalKeyKindDelete InternalKeyKind = 0
30 : InternalKeyKindSet InternalKeyKind = 1
31 : InternalKeyKindMerge InternalKeyKind = 2
32 : InternalKeyKindLogData InternalKeyKind = 3
33 : //InternalKeyKindColumnFamilyDeletion InternalKeyKind = 4
34 : //InternalKeyKindColumnFamilyValue InternalKeyKind = 5
35 : //InternalKeyKindColumnFamilyMerge InternalKeyKind = 6
36 :
37 : // InternalKeyKindSingleDelete (SINGLEDEL) is a performance optimization
38 : // solely for compactions (to reduce write amp and space amp). Readers other
39 : // than compactions should treat SINGLEDEL as equivalent to a DEL.
40 : // Historically, it was simpler for readers other than compactions to treat
41 : // SINGLEDEL as equivalent to DEL, but as of the introduction of
42 : // InternalKeyKindSSTableInternalObsoleteBit, this is also necessary for
43 : // correctness.
44 : InternalKeyKindSingleDelete InternalKeyKind = 7
45 : //InternalKeyKindColumnFamilySingleDelete InternalKeyKind = 8
46 : //InternalKeyKindBeginPrepareXID InternalKeyKind = 9
47 : //InternalKeyKindEndPrepareXID InternalKeyKind = 10
48 : //InternalKeyKindCommitXID InternalKeyKind = 11
49 : //InternalKeyKindRollbackXID InternalKeyKind = 12
50 : //InternalKeyKindNoop InternalKeyKind = 13
51 : //InternalKeyKindColumnFamilyRangeDelete InternalKeyKind = 14
52 : InternalKeyKindRangeDelete InternalKeyKind = 15
53 : //InternalKeyKindColumnFamilyBlobIndex InternalKeyKind = 16
54 : //InternalKeyKindBlobIndex InternalKeyKind = 17
55 :
56 : // InternalKeyKindSeparator is a key used for separator / successor keys
57 : // written to sstable block indexes.
58 : //
59 : // NOTE: the RocksDB value has been repurposed. This was done to ensure that
60 : // keys written to block indexes with value "17" (when 17 happened to be the
61 : // max value, and InternalKeyKindMax was therefore set to 17), remain stable
62 : // when new key kinds are supported in Pebble.
63 : InternalKeyKindSeparator InternalKeyKind = 17
64 :
65 : // InternalKeyKindSetWithDelete keys are SET keys that have met with a
66 : // DELETE or SINGLEDEL key in a prior compaction. This key kind is
67 : // specific to Pebble. See
68 : // https://github.com/cockroachdb/pebble/issues/1255.
69 : InternalKeyKindSetWithDelete InternalKeyKind = 18
70 :
71 : // InternalKeyKindRangeKeyDelete removes all range keys within a key range.
72 : // See the internal/rangekey package for more details.
73 : InternalKeyKindRangeKeyDelete InternalKeyKind = 19
74 : // InternalKeyKindRangeKeySet and InternalKeyKindRangeUnset represent
75 : // keys that set and unset values associated with ranges of key
76 : // space. See the internal/rangekey package for more details.
77 : InternalKeyKindRangeKeyUnset InternalKeyKind = 20
78 : InternalKeyKindRangeKeySet InternalKeyKind = 21
79 :
80 : // InternalKeyKindIngestSST is used to distinguish a batch that corresponds to
81 : // the WAL entry for ingested sstables that are added to the flushable
82 : // queue. This InternalKeyKind cannot appear, amongst other key kinds in a
83 : // batch, or in an sstable.
84 : InternalKeyKindIngestSST InternalKeyKind = 22
85 :
86 : // InternalKeyKindDeleteSized keys behave identically to
87 : // InternalKeyKindDelete keys, except that they hold an associated uint64
88 : // value indicating the (len(key)+len(value)) of the shadowed entry the
89 : // tombstone is expected to delete. This value is used to inform compaction
90 : // heuristics, but is not required to be accurate for correctness.
91 : InternalKeyKindDeleteSized InternalKeyKind = 23
92 :
93 : // This maximum value isn't part of the file format. Future extensions may
94 : // increase this value.
95 : //
96 : // When constructing an internal key to pass to DB.Seek{GE,LE},
97 : // internalKeyComparer sorts decreasing by kind (after sorting increasing by
98 : // user key and decreasing by sequence number). Thus, use InternalKeyKindMax,
99 : // which sorts 'less than or equal to' any other valid internalKeyKind, when
100 : // searching for any kind of internal key formed by a certain user key and
101 : // seqNum.
102 : InternalKeyKindMax InternalKeyKind = 23
103 :
104 : // Internal to the sstable format. Not exposed by any sstable iterator.
105 : // Declared here to prevent definition of valid key kinds that set this bit.
106 : InternalKeyKindSSTableInternalObsoleteBit InternalKeyKind = 64
107 : InternalKeyKindSSTableInternalObsoleteMask InternalKeyKind = 191
108 :
109 : // InternalKeyZeroSeqnumMaxTrailer is the largest trailer with a
110 : // zero sequence number.
111 : InternalKeyZeroSeqnumMaxTrailer = uint64(255)
112 :
113 : // A marker for an invalid key.
114 : InternalKeyKindInvalid InternalKeyKind = InternalKeyKindSSTableInternalObsoleteMask
115 :
116 : // InternalKeySeqNumBatch is a bit that is set on batch sequence numbers
117 : // which prevents those entries from being excluded from iteration.
118 : InternalKeySeqNumBatch = uint64(1 << 55)
119 :
120 : // InternalKeySeqNumMax is the largest valid sequence number.
121 : InternalKeySeqNumMax = uint64(1<<56 - 1)
122 :
123 : // InternalKeyRangeDeleteSentinel is the marker for a range delete sentinel
124 : // key. This sequence number and kind are used for the upper stable boundary
125 : // when a range deletion tombstone is the largest key in an sstable. This is
126 : // necessary because sstable boundaries are inclusive, while the end key of a
127 : // range deletion tombstone is exclusive.
128 : InternalKeyRangeDeleteSentinel = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeDelete)
129 :
130 : // InternalKeyBoundaryRangeKey is the marker for a range key boundary. This
131 : // sequence number and kind are used during interleaved range key and point
132 : // iteration to allow an iterator to stop at range key start keys where
133 : // there exists no point key.
134 : InternalKeyBoundaryRangeKey = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeKeySet)
135 : )
136 :
137 : // Assert InternalKeyKindSSTableInternalObsoleteBit > InternalKeyKindMax
138 : const _ = uint(InternalKeyKindSSTableInternalObsoleteBit - InternalKeyKindMax - 1)
139 :
140 : var internalKeyKindNames = []string{
141 : InternalKeyKindDelete: "DEL",
142 : InternalKeyKindSet: "SET",
143 : InternalKeyKindMerge: "MERGE",
144 : InternalKeyKindLogData: "LOGDATA",
145 : InternalKeyKindSingleDelete: "SINGLEDEL",
146 : InternalKeyKindRangeDelete: "RANGEDEL",
147 : InternalKeyKindSeparator: "SEPARATOR",
148 : InternalKeyKindSetWithDelete: "SETWITHDEL",
149 : InternalKeyKindRangeKeySet: "RANGEKEYSET",
150 : InternalKeyKindRangeKeyUnset: "RANGEKEYUNSET",
151 : InternalKeyKindRangeKeyDelete: "RANGEKEYDEL",
152 : InternalKeyKindIngestSST: "INGESTSST",
153 : InternalKeyKindDeleteSized: "DELSIZED",
154 : InternalKeyKindInvalid: "INVALID",
155 : }
156 :
157 1 : func (k InternalKeyKind) String() string {
158 1 : if int(k) < len(internalKeyKindNames) {
159 1 : return internalKeyKindNames[k]
160 1 : }
161 0 : return fmt.Sprintf("UNKNOWN:%d", k)
162 : }
163 :
164 : // InternalKey is a key used for the in-memory and on-disk partial DBs that
165 : // make up a pebble DB.
166 : //
167 : // It consists of the user key (as given by the code that uses package pebble)
168 : // followed by 8-bytes of metadata:
169 : // - 1 byte for the type of internal key: delete or set,
170 : // - 7 bytes for a uint56 sequence number, in little-endian format.
171 : type InternalKey struct {
172 : UserKey []byte
173 : Trailer uint64
174 : }
175 :
176 : // InvalidInternalKey is an invalid internal key for which Valid() will return
177 : // false.
178 : var InvalidInternalKey = MakeInternalKey(nil, 0, InternalKeyKindInvalid)
179 :
180 : // MakeInternalKey constructs an internal key from a specified user key,
181 : // sequence number and kind.
182 1 : func MakeInternalKey(userKey []byte, seqNum uint64, kind InternalKeyKind) InternalKey {
183 1 : return InternalKey{
184 1 : UserKey: userKey,
185 1 : Trailer: (seqNum << 8) | uint64(kind),
186 1 : }
187 1 : }
188 :
189 : // MakeTrailer constructs an internal key trailer from the specified sequence
190 : // number and kind.
191 1 : func MakeTrailer(seqNum uint64, kind InternalKeyKind) uint64 {
192 1 : return (seqNum << 8) | uint64(kind)
193 1 : }
194 :
195 : // MakeSearchKey constructs an internal key that is appropriate for searching
196 : // for a the specified user key. The search key contain the maximal sequence
197 : // number and kind ensuring that it sorts before any other internal keys for
198 : // the same user key.
199 1 : func MakeSearchKey(userKey []byte) InternalKey {
200 1 : return InternalKey{
201 1 : UserKey: userKey,
202 1 : Trailer: (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindMax),
203 1 : }
204 1 : }
205 :
206 : // MakeRangeDeleteSentinelKey constructs an internal key that is a range
207 : // deletion sentinel key, used as the upper boundary for an sstable when a
208 : // range deletion is the largest key in an sstable.
209 1 : func MakeRangeDeleteSentinelKey(userKey []byte) InternalKey {
210 1 : return InternalKey{
211 1 : UserKey: userKey,
212 1 : Trailer: InternalKeyRangeDeleteSentinel,
213 1 : }
214 1 : }
215 :
216 : // MakeExclusiveSentinelKey constructs an internal key that is an
217 : // exclusive sentinel key, used as the upper boundary for an sstable
218 : // when a ranged key is the largest key in an sstable.
219 1 : func MakeExclusiveSentinelKey(kind InternalKeyKind, userKey []byte) InternalKey {
220 1 : return InternalKey{
221 1 : UserKey: userKey,
222 1 : Trailer: (InternalKeySeqNumMax << 8) | uint64(kind),
223 1 : }
224 1 : }
225 :
226 : var kindsMap = map[string]InternalKeyKind{
227 : "DEL": InternalKeyKindDelete,
228 : "SINGLEDEL": InternalKeyKindSingleDelete,
229 : "RANGEDEL": InternalKeyKindRangeDelete,
230 : "LOGDATA": InternalKeyKindLogData,
231 : "SET": InternalKeyKindSet,
232 : "MERGE": InternalKeyKindMerge,
233 : "INVALID": InternalKeyKindInvalid,
234 : "SEPARATOR": InternalKeyKindSeparator,
235 : "SETWITHDEL": InternalKeyKindSetWithDelete,
236 : "RANGEKEYSET": InternalKeyKindRangeKeySet,
237 : "RANGEKEYUNSET": InternalKeyKindRangeKeyUnset,
238 : "RANGEKEYDEL": InternalKeyKindRangeKeyDelete,
239 : "INGESTSST": InternalKeyKindIngestSST,
240 : "DELSIZED": InternalKeyKindDeleteSized,
241 : }
242 :
243 : // ParseInternalKey parses the string representation of an internal key. The
244 : // format is <user-key>.<kind>.<seq-num>. If the seq-num starts with a "b" it
245 : // is marked as a batch-seq-num (i.e. the InternalKeySeqNumBatch bit is set).
246 0 : func ParseInternalKey(s string) InternalKey {
247 0 : x := strings.Split(s, ".")
248 0 : ukey := x[0]
249 0 : kind, ok := kindsMap[x[1]]
250 0 : if !ok {
251 0 : panic(fmt.Sprintf("unknown kind: %q", x[1]))
252 : }
253 0 : j := 0
254 0 : if x[2][0] == 'b' {
255 0 : j = 1
256 0 : }
257 0 : seqNum, _ := strconv.ParseUint(x[2][j:], 10, 64)
258 0 : if x[2][0] == 'b' {
259 0 : seqNum |= InternalKeySeqNumBatch
260 0 : }
261 0 : return MakeInternalKey([]byte(ukey), seqNum, kind)
262 : }
263 :
264 : // ParseKind parses the string representation of an internal key kind.
265 0 : func ParseKind(s string) InternalKeyKind {
266 0 : kind, ok := kindsMap[s]
267 0 : if !ok {
268 0 : panic(fmt.Sprintf("unknown kind: %q", s))
269 : }
270 0 : return kind
271 : }
272 :
273 : // InternalTrailerLen is the number of bytes used to encode InternalKey.Trailer.
274 : const InternalTrailerLen = 8
275 :
276 : // DecodeInternalKey decodes an encoded internal key. See InternalKey.Encode().
277 1 : func DecodeInternalKey(encodedKey []byte) InternalKey {
278 1 : n := len(encodedKey) - InternalTrailerLen
279 1 : var trailer uint64
280 1 : if n >= 0 {
281 1 : trailer = binary.LittleEndian.Uint64(encodedKey[n:])
282 1 : encodedKey = encodedKey[:n:n]
283 1 : } else {
284 1 : trailer = uint64(InternalKeyKindInvalid)
285 1 : encodedKey = nil
286 1 : }
287 1 : return InternalKey{
288 1 : UserKey: encodedKey,
289 1 : Trailer: trailer,
290 1 : }
291 : }
292 :
293 : // InternalCompare compares two internal keys using the specified comparison
294 : // function. For equal user keys, internal keys compare in descending sequence
295 : // number order. For equal user keys and sequence numbers, internal keys
296 : // compare in descending kind order (this may happen in practice among range
297 : // keys).
298 1 : func InternalCompare(userCmp Compare, a, b InternalKey) int {
299 1 : if x := userCmp(a.UserKey, b.UserKey); x != 0 {
300 1 : return x
301 1 : }
302 1 : if a.Trailer > b.Trailer {
303 1 : return -1
304 1 : }
305 1 : if a.Trailer < b.Trailer {
306 1 : return 1
307 1 : }
308 1 : return 0
309 : }
310 :
311 : // Encode encodes the receiver into the buffer. The buffer must be large enough
312 : // to hold the encoded data. See InternalKey.Size().
313 1 : func (k InternalKey) Encode(buf []byte) {
314 1 : i := copy(buf, k.UserKey)
315 1 : binary.LittleEndian.PutUint64(buf[i:], k.Trailer)
316 1 : }
317 :
318 : // EncodeTrailer returns the trailer encoded to an 8-byte array.
319 1 : func (k InternalKey) EncodeTrailer() [8]byte {
320 1 : var buf [8]byte
321 1 : binary.LittleEndian.PutUint64(buf[:], k.Trailer)
322 1 : return buf
323 1 : }
324 :
325 : // Separator returns a separator key such that k <= x && x < other, where less
326 : // than is consistent with the Compare function. The buf parameter may be used
327 : // to store the returned InternalKey.UserKey, though it is valid to pass a
328 : // nil. See the Separator type for details on separator keys.
329 : func (k InternalKey) Separator(
330 : cmp Compare, sep Separator, buf []byte, other InternalKey,
331 1 : ) InternalKey {
332 1 : buf = sep(buf, k.UserKey, other.UserKey)
333 1 : if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 {
334 1 : // The separator user key is physically shorter than k.UserKey (if it is
335 1 : // longer, we'll continue to use "k"), but logically after. Tack on the max
336 1 : // sequence number to the shortened user key. Note that we could tack on
337 1 : // any sequence number and kind here to create a valid separator key. We
338 1 : // use the max sequence number to match the behavior of LevelDB and
339 1 : // RocksDB.
340 1 : return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator)
341 1 : }
342 1 : return k
343 : }
344 :
345 : // Successor returns a successor key such that k <= x. A simple implementation
346 : // may return k unchanged. The buf parameter may be used to store the returned
347 : // InternalKey.UserKey, though it is valid to pass a nil.
348 1 : func (k InternalKey) Successor(cmp Compare, succ Successor, buf []byte) InternalKey {
349 1 : buf = succ(buf, k.UserKey)
350 1 : if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 {
351 1 : // The successor user key is physically shorter that k.UserKey (if it is
352 1 : // longer, we'll continue to use "k"), but logically after. Tack on the max
353 1 : // sequence number to the shortened user key. Note that we could tack on
354 1 : // any sequence number and kind here to create a valid separator key. We
355 1 : // use the max sequence number to match the behavior of LevelDB and
356 1 : // RocksDB.
357 1 : return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator)
358 1 : }
359 1 : return k
360 : }
361 :
362 : // Size returns the encoded size of the key.
363 1 : func (k InternalKey) Size() int {
364 1 : return len(k.UserKey) + 8
365 1 : }
366 :
367 : // SetSeqNum sets the sequence number component of the key.
368 1 : func (k *InternalKey) SetSeqNum(seqNum uint64) {
369 1 : k.Trailer = (seqNum << 8) | (k.Trailer & 0xff)
370 1 : }
371 :
372 : // SeqNum returns the sequence number component of the key.
373 1 : func (k InternalKey) SeqNum() uint64 {
374 1 : return k.Trailer >> 8
375 1 : }
376 :
377 : // Visible returns true if the key is visible at the specified snapshot
378 : // sequence number.
379 1 : func (k InternalKey) Visible(snapshot, batchSnapshot uint64) bool {
380 1 : return Visible(k.SeqNum(), snapshot, batchSnapshot)
381 1 : }
382 :
383 : // Visible returns true if a key with the provided sequence number is visible at
384 : // the specified snapshot sequence numbers.
385 1 : func Visible(seqNum uint64, snapshot, batchSnapshot uint64) bool {
386 1 : // There are two snapshot sequence numbers, one for committed keys and one
387 1 : // for batch keys. If a seqNum is less than `snapshot`, then seqNum
388 1 : // corresponds to a committed key that is visible. If seqNum has its batch
389 1 : // bit set, then seqNum corresponds to an uncommitted batch key. Its
390 1 : // visible if its snapshot is less than batchSnapshot.
391 1 : //
392 1 : // There's one complication. The maximal sequence number
393 1 : // (`InternalKeySeqNumMax`) is used across Pebble for exclusive sentinel
394 1 : // keys and other purposes. The maximal sequence number has its batch bit
395 1 : // set, but it can never be < `batchSnapshot`, since there is no expressible
396 1 : // larger snapshot. We dictate that the maximal sequence number is always
397 1 : // visible.
398 1 : return seqNum < snapshot ||
399 1 : ((seqNum&InternalKeySeqNumBatch) != 0 && seqNum < batchSnapshot) ||
400 1 : seqNum == InternalKeySeqNumMax
401 1 : }
402 :
403 : // SetKind sets the kind component of the key.
404 1 : func (k *InternalKey) SetKind(kind InternalKeyKind) {
405 1 : k.Trailer = (k.Trailer &^ 0xff) | uint64(kind)
406 1 : }
407 :
408 : // Kind returns the kind component of the key.
409 1 : func (k InternalKey) Kind() InternalKeyKind {
410 1 : return TrailerKind(k.Trailer)
411 1 : }
412 :
413 : // TrailerKind returns the key kind of the key trailer.
414 1 : func TrailerKind(trailer uint64) InternalKeyKind {
415 1 : return InternalKeyKind(trailer & 0xff)
416 1 : }
417 :
418 : // Valid returns true if the key has a valid kind.
419 0 : func (k InternalKey) Valid() bool {
420 0 : return k.Kind() <= InternalKeyKindMax
421 0 : }
422 :
423 : // Clone clones the storage for the UserKey component of the key.
424 1 : func (k InternalKey) Clone() InternalKey {
425 1 : if len(k.UserKey) == 0 {
426 0 : return k
427 0 : }
428 1 : return InternalKey{
429 1 : UserKey: append([]byte(nil), k.UserKey...),
430 1 : Trailer: k.Trailer,
431 1 : }
432 : }
433 :
434 : // CopyFrom converts this InternalKey into a clone of the passed-in InternalKey,
435 : // reusing any space already used for the current UserKey.
436 0 : func (k *InternalKey) CopyFrom(k2 InternalKey) {
437 0 : k.UserKey = append(k.UserKey[:0], k2.UserKey...)
438 0 : k.Trailer = k2.Trailer
439 0 : }
440 :
441 : // String returns a string representation of the key.
442 1 : func (k InternalKey) String() string {
443 1 : return fmt.Sprintf("%s#%d,%d", FormatBytes(k.UserKey), k.SeqNum(), k.Kind())
444 1 : }
445 :
446 : // Pretty returns a formatter for the key.
447 1 : func (k InternalKey) Pretty(f FormatKey) fmt.Formatter {
448 1 : return prettyInternalKey{k, f}
449 1 : }
450 :
451 : // IsExclusiveSentinel returns whether this internal key excludes point keys
452 : // with the same user key if used as an end boundary. See the comment on
453 : // InternalKeyRangeDeletionSentinel.
454 1 : func (k InternalKey) IsExclusiveSentinel() bool {
455 1 : switch kind := k.Kind(); kind {
456 1 : case InternalKeyKindRangeDelete:
457 1 : return k.Trailer == InternalKeyRangeDeleteSentinel
458 1 : case InternalKeyKindRangeKeyDelete, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet:
459 1 : return (k.Trailer >> 8) == InternalKeySeqNumMax
460 1 : default:
461 1 : return false
462 : }
463 : }
464 :
465 : type prettyInternalKey struct {
466 : InternalKey
467 : formatKey FormatKey
468 : }
469 :
470 1 : func (k prettyInternalKey) Format(s fmt.State, c rune) {
471 1 : if seqNum := k.SeqNum(); seqNum == InternalKeySeqNumMax {
472 1 : fmt.Fprintf(s, "%s#inf,%s", k.formatKey(k.UserKey), k.Kind())
473 1 : } else {
474 1 : fmt.Fprintf(s, "%s#%d,%s", k.formatKey(k.UserKey), k.SeqNum(), k.Kind())
475 1 : }
476 : }
477 :
478 : // ParsePrettyInternalKey parses the pretty string representation of an
479 : // internal key. The format is <user-key>#<seq-num>,<kind>.
480 0 : func ParsePrettyInternalKey(s string) InternalKey {
481 0 : x := strings.FieldsFunc(s, func(c rune) bool { return c == '#' || c == ',' })
482 0 : ukey := x[0]
483 0 : kind, ok := kindsMap[x[2]]
484 0 : if !ok {
485 0 : panic(fmt.Sprintf("unknown kind: %q", x[2]))
486 : }
487 0 : var seqNum uint64
488 0 : if x[1] == "max" || x[1] == "inf" {
489 0 : seqNum = InternalKeySeqNumMax
490 0 : } else {
491 0 : seqNum, _ = strconv.ParseUint(x[1], 10, 64)
492 0 : }
493 0 : return MakeInternalKey([]byte(ukey), seqNum, kind)
494 : }
|