Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package base // import "github.com/cockroachdb/pebble/internal/base"
6 :
7 : import (
8 : "encoding/binary"
9 : "fmt"
10 : "strconv"
11 : "strings"
12 : )
13 :
14 : const (
15 : // SeqNumZero is the zero sequence number, set by compactions if they can
16 : // guarantee there are no keys underneath an internal key.
17 : SeqNumZero = uint64(0)
18 : // SeqNumStart is the first sequence number assigned to a key. Sequence
19 : // numbers 1-9 are reserved for potential future use.
20 : SeqNumStart = uint64(10)
21 : )
22 :
23 : // InternalKeyKind enumerates the kind of key: a deletion tombstone, a set
24 : // value, a merged value, etc.
25 : type InternalKeyKind uint8
26 :
27 : // These constants are part of the file format, and should not be changed.
28 : const (
29 : InternalKeyKindDelete InternalKeyKind = 0
30 : InternalKeyKindSet InternalKeyKind = 1
31 : InternalKeyKindMerge InternalKeyKind = 2
32 : InternalKeyKindLogData InternalKeyKind = 3
33 : //InternalKeyKindColumnFamilyDeletion InternalKeyKind = 4
34 : //InternalKeyKindColumnFamilyValue InternalKeyKind = 5
35 : //InternalKeyKindColumnFamilyMerge InternalKeyKind = 6
36 :
37 : // InternalKeyKindSingleDelete (SINGLEDEL) is a performance optimization
38 : // solely for compactions (to reduce write amp and space amp). Readers other
39 : // than compactions should treat SINGLEDEL as equivalent to a DEL.
40 : // Historically, it was simpler for readers other than compactions to treat
41 : // SINGLEDEL as equivalent to DEL, but as of the introduction of
42 : // InternalKeyKindSSTableInternalObsoleteBit, this is also necessary for
43 : // correctness.
44 : InternalKeyKindSingleDelete InternalKeyKind = 7
45 : //InternalKeyKindColumnFamilySingleDelete InternalKeyKind = 8
46 : //InternalKeyKindBeginPrepareXID InternalKeyKind = 9
47 : //InternalKeyKindEndPrepareXID InternalKeyKind = 10
48 : //InternalKeyKindCommitXID InternalKeyKind = 11
49 : //InternalKeyKindRollbackXID InternalKeyKind = 12
50 : //InternalKeyKindNoop InternalKeyKind = 13
51 : //InternalKeyKindColumnFamilyRangeDelete InternalKeyKind = 14
52 : InternalKeyKindRangeDelete InternalKeyKind = 15
53 : //InternalKeyKindColumnFamilyBlobIndex InternalKeyKind = 16
54 : //InternalKeyKindBlobIndex InternalKeyKind = 17
55 :
56 : // InternalKeyKindSeparator is a key used for separator / successor keys
57 : // written to sstable block indexes.
58 : //
59 : // NOTE: the RocksDB value has been repurposed. This was done to ensure that
60 : // keys written to block indexes with value "17" (when 17 happened to be the
61 : // max value, and InternalKeyKindMax was therefore set to 17), remain stable
62 : // when new key kinds are supported in Pebble.
63 : InternalKeyKindSeparator InternalKeyKind = 17
64 :
65 : // InternalKeyKindSetWithDelete keys are SET keys that have met with a
66 : // DELETE or SINGLEDEL key in a prior compaction. This key kind is
67 : // specific to Pebble. See
68 : // https://github.com/cockroachdb/pebble/issues/1255.
69 : InternalKeyKindSetWithDelete InternalKeyKind = 18
70 :
71 : // InternalKeyKindRangeKeyDelete removes all range keys within a key range.
72 : // See the internal/rangekey package for more details.
73 : InternalKeyKindRangeKeyDelete InternalKeyKind = 19
74 : // InternalKeyKindRangeKeySet and InternalKeyKindRangeUnset represent
75 : // keys that set and unset values associated with ranges of key
76 : // space. See the internal/rangekey package for more details.
77 : InternalKeyKindRangeKeyUnset InternalKeyKind = 20
78 : InternalKeyKindRangeKeySet InternalKeyKind = 21
79 :
80 : // InternalKeyKindIngestSST is used to distinguish a batch that corresponds to
81 : // the WAL entry for ingested sstables that are added to the flushable
82 : // queue. This InternalKeyKind cannot appear, amongst other key kinds in a
83 : // batch, or in an sstable.
84 : InternalKeyKindIngestSST InternalKeyKind = 22
85 :
86 : // InternalKeyKindDeleteSized keys behave identically to
87 : // InternalKeyKindDelete keys, except that they hold an associated uint64
88 : // value indicating the (len(key)+len(value)) of the shadowed entry the
89 : // tombstone is expected to delete. This value is used to inform compaction
90 : // heuristics, but is not required to be accurate for correctness.
91 : InternalKeyKindDeleteSized InternalKeyKind = 23
92 :
93 : // This maximum value isn't part of the file format. Future extensions may
94 : // increase this value.
95 : //
96 : // When constructing an internal key to pass to DB.Seek{GE,LE},
97 : // internalKeyComparer sorts decreasing by kind (after sorting increasing by
98 : // user key and decreasing by sequence number). Thus, use InternalKeyKindMax,
99 : // which sorts 'less than or equal to' any other valid internalKeyKind, when
100 : // searching for any kind of internal key formed by a certain user key and
101 : // seqNum.
102 : InternalKeyKindMax InternalKeyKind = 23
103 :
104 : // Internal to the sstable format. Not exposed by any sstable iterator.
105 : // Declared here to prevent definition of valid key kinds that set this bit.
106 : InternalKeyKindSSTableInternalObsoleteBit InternalKeyKind = 64
107 : InternalKeyKindSSTableInternalObsoleteMask InternalKeyKind = 191
108 :
109 : // InternalKeyZeroSeqnumMaxTrailer is the largest trailer with a
110 : // zero sequence number.
111 : InternalKeyZeroSeqnumMaxTrailer = uint64(255)
112 :
113 : // A marker for an invalid key.
114 : InternalKeyKindInvalid InternalKeyKind = InternalKeyKindSSTableInternalObsoleteMask
115 :
116 : // InternalKeySeqNumBatch is a bit that is set on batch sequence numbers
117 : // which prevents those entries from being excluded from iteration.
118 : InternalKeySeqNumBatch = uint64(1 << 55)
119 :
120 : // InternalKeySeqNumMax is the largest valid sequence number.
121 : InternalKeySeqNumMax = uint64(1<<56 - 1)
122 :
123 : // InternalKeyRangeDeleteSentinel is the marker for a range delete sentinel
124 : // key. This sequence number and kind are used for the upper stable boundary
125 : // when a range deletion tombstone is the largest key in an sstable. This is
126 : // necessary because sstable boundaries are inclusive, while the end key of a
127 : // range deletion tombstone is exclusive.
128 : InternalKeyRangeDeleteSentinel = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeDelete)
129 :
130 : // InternalKeyBoundaryRangeKey is the marker for a range key boundary. This
131 : // sequence number and kind are used during interleaved range key and point
132 : // iteration to allow an iterator to stop at range key start keys where
133 : // there exists no point key.
134 : InternalKeyBoundaryRangeKey = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeKeySet)
135 : )
136 :
137 : // Assert InternalKeyKindSSTableInternalObsoleteBit > InternalKeyKindMax
138 : const _ = uint(InternalKeyKindSSTableInternalObsoleteBit - InternalKeyKindMax - 1)
139 :
140 : var internalKeyKindNames = []string{
141 : InternalKeyKindDelete: "DEL",
142 : InternalKeyKindSet: "SET",
143 : InternalKeyKindMerge: "MERGE",
144 : InternalKeyKindLogData: "LOGDATA",
145 : InternalKeyKindSingleDelete: "SINGLEDEL",
146 : InternalKeyKindRangeDelete: "RANGEDEL",
147 : InternalKeyKindSeparator: "SEPARATOR",
148 : InternalKeyKindSetWithDelete: "SETWITHDEL",
149 : InternalKeyKindRangeKeySet: "RANGEKEYSET",
150 : InternalKeyKindRangeKeyUnset: "RANGEKEYUNSET",
151 : InternalKeyKindRangeKeyDelete: "RANGEKEYDEL",
152 : InternalKeyKindIngestSST: "INGESTSST",
153 : InternalKeyKindDeleteSized: "DELSIZED",
154 : InternalKeyKindInvalid: "INVALID",
155 : }
156 :
157 2 : func (k InternalKeyKind) String() string {
158 2 : if int(k) < len(internalKeyKindNames) {
159 2 : return internalKeyKindNames[k]
160 2 : }
161 0 : return fmt.Sprintf("UNKNOWN:%d", k)
162 : }
163 :
164 : // InternalKey is a key used for the in-memory and on-disk partial DBs that
165 : // make up a pebble DB.
166 : //
167 : // It consists of the user key (as given by the code that uses package pebble)
168 : // followed by 8-bytes of metadata:
169 : // - 1 byte for the type of internal key: delete or set,
170 : // - 7 bytes for a uint56 sequence number, in little-endian format.
171 : type InternalKey struct {
172 : UserKey []byte
173 : Trailer uint64
174 : }
175 :
176 : // InvalidInternalKey is an invalid internal key for which Valid() will return
177 : // false.
178 : var InvalidInternalKey = MakeInternalKey(nil, 0, InternalKeyKindInvalid)
179 :
180 : // MakeInternalKey constructs an internal key from a specified user key,
181 : // sequence number and kind.
182 2 : func MakeInternalKey(userKey []byte, seqNum uint64, kind InternalKeyKind) InternalKey {
183 2 : return InternalKey{
184 2 : UserKey: userKey,
185 2 : Trailer: (seqNum << 8) | uint64(kind),
186 2 : }
187 2 : }
188 :
189 : // MakeTrailer constructs an internal key trailer from the specified sequence
190 : // number and kind.
191 2 : func MakeTrailer(seqNum uint64, kind InternalKeyKind) uint64 {
192 2 : return (seqNum << 8) | uint64(kind)
193 2 : }
194 :
195 : // MakeSearchKey constructs an internal key that is appropriate for searching
196 : // for a the specified user key. The search key contain the maximal sequence
197 : // number and kind ensuring that it sorts before any other internal keys for
198 : // the same user key.
199 2 : func MakeSearchKey(userKey []byte) InternalKey {
200 2 : return InternalKey{
201 2 : UserKey: userKey,
202 2 : Trailer: (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindMax),
203 2 : }
204 2 : }
205 :
206 : // MakeRangeDeleteSentinelKey constructs an internal key that is a range
207 : // deletion sentinel key, used as the upper boundary for an sstable when a
208 : // range deletion is the largest key in an sstable.
209 2 : func MakeRangeDeleteSentinelKey(userKey []byte) InternalKey {
210 2 : return InternalKey{
211 2 : UserKey: userKey,
212 2 : Trailer: InternalKeyRangeDeleteSentinel,
213 2 : }
214 2 : }
215 :
216 : // MakeExclusiveSentinelKey constructs an internal key that is an
217 : // exclusive sentinel key, used as the upper boundary for an sstable
218 : // when a ranged key is the largest key in an sstable.
219 2 : func MakeExclusiveSentinelKey(kind InternalKeyKind, userKey []byte) InternalKey {
220 2 : return InternalKey{
221 2 : UserKey: userKey,
222 2 : Trailer: (InternalKeySeqNumMax << 8) | uint64(kind),
223 2 : }
224 2 : }
225 :
226 : var kindsMap = map[string]InternalKeyKind{
227 : "DEL": InternalKeyKindDelete,
228 : "SINGLEDEL": InternalKeyKindSingleDelete,
229 : "RANGEDEL": InternalKeyKindRangeDelete,
230 : "LOGDATA": InternalKeyKindLogData,
231 : "SET": InternalKeyKindSet,
232 : "MERGE": InternalKeyKindMerge,
233 : "INVALID": InternalKeyKindInvalid,
234 : "SEPARATOR": InternalKeyKindSeparator,
235 : "SETWITHDEL": InternalKeyKindSetWithDelete,
236 : "RANGEKEYSET": InternalKeyKindRangeKeySet,
237 : "RANGEKEYUNSET": InternalKeyKindRangeKeyUnset,
238 : "RANGEKEYDEL": InternalKeyKindRangeKeyDelete,
239 : "INGESTSST": InternalKeyKindIngestSST,
240 : "DELSIZED": InternalKeyKindDeleteSized,
241 : }
242 :
243 : // ParseInternalKey parses the string representation of an internal key. The
244 : // format is <user-key>.<kind>.<seq-num>. If the seq-num starts with a "b" it
245 : // is marked as a batch-seq-num (i.e. the InternalKeySeqNumBatch bit is set).
246 1 : func ParseInternalKey(s string) InternalKey {
247 1 : x := strings.Split(s, ".")
248 1 : ukey := x[0]
249 1 : kind, ok := kindsMap[x[1]]
250 1 : if !ok {
251 0 : panic(fmt.Sprintf("unknown kind: %q", x[1]))
252 : }
253 1 : j := 0
254 1 : if x[2][0] == 'b' {
255 1 : j = 1
256 1 : }
257 1 : seqNum, _ := strconv.ParseUint(x[2][j:], 10, 64)
258 1 : if x[2][0] == 'b' {
259 1 : seqNum |= InternalKeySeqNumBatch
260 1 : }
261 1 : return MakeInternalKey([]byte(ukey), seqNum, kind)
262 : }
263 :
264 : // ParseKind parses the string representation of an internal key kind.
265 1 : func ParseKind(s string) InternalKeyKind {
266 1 : kind, ok := kindsMap[s]
267 1 : if !ok {
268 0 : panic(fmt.Sprintf("unknown kind: %q", s))
269 : }
270 1 : return kind
271 : }
272 :
273 : // InternalTrailerLen is the number of bytes used to encode InternalKey.Trailer.
274 : const InternalTrailerLen = 8
275 :
276 : // DecodeInternalKey decodes an encoded internal key. See InternalKey.Encode().
277 2 : func DecodeInternalKey(encodedKey []byte) InternalKey {
278 2 : n := len(encodedKey) - InternalTrailerLen
279 2 : var trailer uint64
280 2 : if n >= 0 {
281 2 : trailer = binary.LittleEndian.Uint64(encodedKey[n:])
282 2 : encodedKey = encodedKey[:n:n]
283 2 : } else {
284 2 : trailer = uint64(InternalKeyKindInvalid)
285 2 : encodedKey = nil
286 2 : }
287 2 : return InternalKey{
288 2 : UserKey: encodedKey,
289 2 : Trailer: trailer,
290 2 : }
291 : }
292 :
293 : // InternalCompare compares two internal keys using the specified comparison
294 : // function. For equal user keys, internal keys compare in descending sequence
295 : // number order. For equal user keys and sequence numbers, internal keys
296 : // compare in descending kind order (this may happen in practice among range
297 : // keys).
298 2 : func InternalCompare(userCmp Compare, a, b InternalKey) int {
299 2 : if x := userCmp(a.UserKey, b.UserKey); x != 0 {
300 2 : return x
301 2 : }
302 2 : if a.Trailer > b.Trailer {
303 2 : return -1
304 2 : }
305 2 : if a.Trailer < b.Trailer {
306 2 : return 1
307 2 : }
308 2 : return 0
309 : }
310 :
311 : // Encode encodes the receiver into the buffer. The buffer must be large enough
312 : // to hold the encoded data. See InternalKey.Size().
313 2 : func (k InternalKey) Encode(buf []byte) {
314 2 : i := copy(buf, k.UserKey)
315 2 : binary.LittleEndian.PutUint64(buf[i:], k.Trailer)
316 2 : }
317 :
318 : // EncodeTrailer returns the trailer encoded to an 8-byte array.
319 2 : func (k InternalKey) EncodeTrailer() [8]byte {
320 2 : var buf [8]byte
321 2 : binary.LittleEndian.PutUint64(buf[:], k.Trailer)
322 2 : return buf
323 2 : }
324 :
325 : // Separator returns a separator key such that k <= x && x < other, where less
326 : // than is consistent with the Compare function. The buf parameter may be used
327 : // to store the returned InternalKey.UserKey, though it is valid to pass a
328 : // nil. See the Separator type for details on separator keys.
329 : func (k InternalKey) Separator(
330 : cmp Compare, sep Separator, buf []byte, other InternalKey,
331 2 : ) InternalKey {
332 2 : buf = sep(buf, k.UserKey, other.UserKey)
333 2 : if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 {
334 2 : // The separator user key is physically shorter than k.UserKey (if it is
335 2 : // longer, we'll continue to use "k"), but logically after. Tack on the max
336 2 : // sequence number to the shortened user key. Note that we could tack on
337 2 : // any sequence number and kind here to create a valid separator key. We
338 2 : // use the max sequence number to match the behavior of LevelDB and
339 2 : // RocksDB.
340 2 : return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator)
341 2 : }
342 2 : return k
343 : }
344 :
345 : // Successor returns a successor key such that k <= x. A simple implementation
346 : // may return k unchanged. The buf parameter may be used to store the returned
347 : // InternalKey.UserKey, though it is valid to pass a nil.
348 2 : func (k InternalKey) Successor(cmp Compare, succ Successor, buf []byte) InternalKey {
349 2 : buf = succ(buf, k.UserKey)
350 2 : if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 {
351 2 : // The successor user key is physically shorter that k.UserKey (if it is
352 2 : // longer, we'll continue to use "k"), but logically after. Tack on the max
353 2 : // sequence number to the shortened user key. Note that we could tack on
354 2 : // any sequence number and kind here to create a valid separator key. We
355 2 : // use the max sequence number to match the behavior of LevelDB and
356 2 : // RocksDB.
357 2 : return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator)
358 2 : }
359 2 : return k
360 : }
361 :
362 : // Size returns the encoded size of the key.
363 2 : func (k InternalKey) Size() int {
364 2 : return len(k.UserKey) + 8
365 2 : }
366 :
367 : // SetSeqNum sets the sequence number component of the key.
368 2 : func (k *InternalKey) SetSeqNum(seqNum uint64) {
369 2 : k.Trailer = (seqNum << 8) | (k.Trailer & 0xff)
370 2 : }
371 :
372 : // SeqNum returns the sequence number component of the key.
373 2 : func (k InternalKey) SeqNum() uint64 {
374 2 : return k.Trailer >> 8
375 2 : }
376 :
377 : // Visible returns true if the key is visible at the specified snapshot
378 : // sequence number.
379 2 : func (k InternalKey) Visible(snapshot, batchSnapshot uint64) bool {
380 2 : return Visible(k.SeqNum(), snapshot, batchSnapshot)
381 2 : }
382 :
383 : // Visible returns true if a key with the provided sequence number is visible at
384 : // the specified snapshot sequence numbers.
385 2 : func Visible(seqNum uint64, snapshot, batchSnapshot uint64) bool {
386 2 : // There are two snapshot sequence numbers, one for committed keys and one
387 2 : // for batch keys. If a seqNum is less than `snapshot`, then seqNum
388 2 : // corresponds to a committed key that is visible. If seqNum has its batch
389 2 : // bit set, then seqNum corresponds to an uncommitted batch key. Its
390 2 : // visible if its snapshot is less than batchSnapshot.
391 2 : //
392 2 : // There's one complication. The maximal sequence number
393 2 : // (`InternalKeySeqNumMax`) is used across Pebble for exclusive sentinel
394 2 : // keys and other purposes. The maximal sequence number has its batch bit
395 2 : // set, but it can never be < `batchSnapshot`, since there is no expressible
396 2 : // larger snapshot. We dictate that the maximal sequence number is always
397 2 : // visible.
398 2 : return seqNum < snapshot ||
399 2 : ((seqNum&InternalKeySeqNumBatch) != 0 && seqNum < batchSnapshot) ||
400 2 : seqNum == InternalKeySeqNumMax
401 2 : }
402 :
403 : // SetKind sets the kind component of the key.
404 2 : func (k *InternalKey) SetKind(kind InternalKeyKind) {
405 2 : k.Trailer = (k.Trailer &^ 0xff) | uint64(kind)
406 2 : }
407 :
408 : // Kind returns the kind component of the key.
409 2 : func (k InternalKey) Kind() InternalKeyKind {
410 2 : return TrailerKind(k.Trailer)
411 2 : }
412 :
413 : // TrailerKind returns the key kind of the key trailer.
414 2 : func TrailerKind(trailer uint64) InternalKeyKind {
415 2 : return InternalKeyKind(trailer & 0xff)
416 2 : }
417 :
418 : // Valid returns true if the key has a valid kind.
419 1 : func (k InternalKey) Valid() bool {
420 1 : return k.Kind() <= InternalKeyKindMax
421 1 : }
422 :
423 : // Clone clones the storage for the UserKey component of the key.
424 2 : func (k InternalKey) Clone() InternalKey {
425 2 : if len(k.UserKey) == 0 {
426 1 : return k
427 1 : }
428 2 : return InternalKey{
429 2 : UserKey: append([]byte(nil), k.UserKey...),
430 2 : Trailer: k.Trailer,
431 2 : }
432 : }
433 :
434 : // CopyFrom converts this InternalKey into a clone of the passed-in InternalKey,
435 : // reusing any space already used for the current UserKey.
436 1 : func (k *InternalKey) CopyFrom(k2 InternalKey) {
437 1 : k.UserKey = append(k.UserKey[:0], k2.UserKey...)
438 1 : k.Trailer = k2.Trailer
439 1 : }
440 :
441 : // String returns a string representation of the key.
442 2 : func (k InternalKey) String() string {
443 2 : return fmt.Sprintf("%s#%d,%d", FormatBytes(k.UserKey), k.SeqNum(), k.Kind())
444 2 : }
445 :
446 : // Pretty returns a formatter for the key.
447 2 : func (k InternalKey) Pretty(f FormatKey) fmt.Formatter {
448 2 : return prettyInternalKey{k, f}
449 2 : }
450 :
451 : // IsExclusiveSentinel returns whether this internal key excludes point keys
452 : // with the same user key if used as an end boundary. See the comment on
453 : // InternalKeyRangeDeletionSentinel.
454 2 : func (k InternalKey) IsExclusiveSentinel() bool {
455 2 : switch kind := k.Kind(); kind {
456 2 : case InternalKeyKindRangeDelete:
457 2 : return k.Trailer == InternalKeyRangeDeleteSentinel
458 2 : case InternalKeyKindRangeKeyDelete, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet:
459 2 : return (k.Trailer >> 8) == InternalKeySeqNumMax
460 2 : default:
461 2 : return false
462 : }
463 : }
464 :
465 : type prettyInternalKey struct {
466 : InternalKey
467 : formatKey FormatKey
468 : }
469 :
470 2 : func (k prettyInternalKey) Format(s fmt.State, c rune) {
471 2 : if seqNum := k.SeqNum(); seqNum == InternalKeySeqNumMax {
472 2 : fmt.Fprintf(s, "%s#inf,%s", k.formatKey(k.UserKey), k.Kind())
473 2 : } else {
474 2 : fmt.Fprintf(s, "%s#%d,%s", k.formatKey(k.UserKey), k.SeqNum(), k.Kind())
475 2 : }
476 : }
477 :
478 : // ParsePrettyInternalKey parses the pretty string representation of an
479 : // internal key. The format is <user-key>#<seq-num>,<kind>.
480 1 : func ParsePrettyInternalKey(s string) InternalKey {
481 1 : x := strings.FieldsFunc(s, func(c rune) bool { return c == '#' || c == ',' })
482 1 : ukey := x[0]
483 1 : kind, ok := kindsMap[x[2]]
484 1 : if !ok {
485 0 : panic(fmt.Sprintf("unknown kind: %q", x[2]))
486 : }
487 1 : var seqNum uint64
488 1 : if x[1] == "max" || x[1] == "inf" {
489 1 : seqNum = InternalKeySeqNumMax
490 1 : } else {
491 1 : seqNum, _ = strconv.ParseUint(x[1], 10, 64)
492 1 : }
493 1 : return MakeInternalKey([]byte(ukey), seqNum, kind)
494 : }
|