Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : // Package crdbtest provides facilities for representing keys, workloads, etc
6 : // representative of CockroachDB's use of Pebble.
7 : package crdbtest
8 :
9 : import (
10 : "bytes"
11 : "cmp"
12 : "encoding/binary"
13 : "fmt"
14 : "io"
15 : "strings"
16 : "unsafe"
17 :
18 : "github.com/cockroachdb/crlib/crbytes"
19 : "github.com/cockroachdb/errors"
20 : "github.com/cockroachdb/pebble/internal/base"
21 : "github.com/cockroachdb/pebble/internal/invariants"
22 : "github.com/cockroachdb/pebble/sstable/colblk"
23 : )
24 :
25 : const (
26 : engineKeyNoVersion = 0
27 : engineKeyVersionWallTimeLen = 8
28 : engineKeyVersionWallAndLogicalTimeLen = 12
29 : engineKeyVersionWallLogicalAndSyntheticTimeLen = 13
30 : engineKeyVersionLockTableLen = 17
31 :
32 : mvccEncodedTimeSentinelLen = 1
33 : mvccEncodedTimeWallLen = 8
34 : mvccEncodedTimeLogicalLen = 4
35 : mvccEncodedTimeSyntheticLen = 1
36 : mvccEncodedTimeLengthLen = 1
37 :
38 : suffixLenWithWall = engineKeyVersionWallTimeLen + 1
39 : suffixLenWithLogical = engineKeyVersionWallAndLogicalTimeLen + 1
40 : suffixLenWithSynthetic = engineKeyVersionWallLogicalAndSyntheticTimeLen + 1
41 : suffixLenWithLockTable = engineKeyVersionLockTableLen + 1
42 : )
43 :
44 : // MaxSuffixLen is the maximum length of the CockroachDB key suffix.
45 : const MaxSuffixLen = 1 + max(
46 : engineKeyVersionLockTableLen,
47 : engineKeyVersionWallLogicalAndSyntheticTimeLen,
48 : engineKeyVersionWallAndLogicalTimeLen,
49 : engineKeyVersionWallTimeLen,
50 : )
51 :
52 : // Comparer is a base.Comparer for CockroachDB keys.
53 : var Comparer = base.Comparer{
54 : Split: Split,
55 : ComparePointSuffixes: ComparePointSuffixes,
56 : CompareRangeSuffixes: CompareRangeSuffixes,
57 : Compare: Compare,
58 : Equal: Equal,
59 0 : AbbreviatedKey: func(k []byte) uint64 {
60 0 : key, ok := getKeyPartFromEngineKey(k)
61 0 : if !ok {
62 0 : return 0
63 0 : }
64 0 : return base.DefaultComparer.AbbreviatedKey(key)
65 : },
66 : FormatKey: base.DefaultFormatter,
67 0 : Separator: func(dst, a, b []byte) []byte {
68 0 : aKey, ok := getKeyPartFromEngineKey(a)
69 0 : if !ok {
70 0 : return append(dst, a...)
71 0 : }
72 0 : bKey, ok := getKeyPartFromEngineKey(b)
73 0 : if !ok {
74 0 : return append(dst, a...)
75 0 : }
76 : // If the keys are the same just return a.
77 0 : if bytes.Equal(aKey, bKey) {
78 0 : return append(dst, a...)
79 0 : }
80 0 : n := len(dst)
81 0 : dst = base.DefaultComparer.Separator(dst, aKey, bKey)
82 0 : // Did it pick a separator different than aKey -- if it did not we can't do better than a.
83 0 : buf := dst[n:]
84 0 : if bytes.Equal(aKey, buf) {
85 0 : return append(dst[:n], a...)
86 0 : }
87 : // The separator is > aKey, so we only need to add the sentinel.
88 0 : return append(dst, 0)
89 : },
90 0 : Successor: func(dst, a []byte) []byte {
91 0 : aKey, ok := getKeyPartFromEngineKey(a)
92 0 : if !ok {
93 0 : return append(dst, a...)
94 0 : }
95 0 : n := len(dst)
96 0 : // Engine key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as
97 0 : // pebble.DefaultComparer, so reuse the latter's SeekSetBitGE implementation.
98 0 : dst = base.DefaultComparer.Successor(dst, aKey)
99 0 : // Did it pick a successor different than aKey -- if it did not we can't do better than a.
100 0 : buf := dst[n:]
101 0 : if bytes.Equal(aKey, buf) {
102 0 : return append(dst[:n], a...)
103 0 : }
104 : // The successor is > aKey, so we only need to add the sentinel.
105 0 : return append(dst, 0)
106 : },
107 0 : ImmediateSuccessor: func(dst, a []byte) []byte {
108 0 : // The key `a` is guaranteed to be a bare prefix: It's a
109 0 : // `engineKeyNoVersion` key without a version—just a trailing 0-byte to
110 0 : // signify the length of the version. For example the user key "foo" is
111 0 : // encoded as: "foo\0". We need to encode the immediate successor to
112 0 : // "foo", which in the natural byte ordering is "foo\0". Append a
113 0 : // single additional zero, to encode the user key "foo\0" with a
114 0 : // zero-length version.
115 0 : return append(append(dst, a...), 0)
116 0 : },
117 : Name: "cockroach_comparator",
118 : }
119 :
120 : // EncodeMVCCKey encodes a MVCC key into dst, growing dst as necessary.
121 1 : func EncodeMVCCKey(dst []byte, key []byte, walltime uint64, logical uint32) []byte {
122 1 : if cap(dst) < len(key)+suffixLenWithSynthetic {
123 1 : dst = make([]byte, 0, len(key)+suffixLenWithSynthetic)
124 1 : }
125 1 : dst = append(dst[:0], key...)
126 1 : return EncodeTimestamp(dst, walltime, logical)
127 : }
128 :
129 : // AppendTimestamp appends an encoded MVCC timestamp onto key, returning the new
130 : // key. The provided key should already have the 0x00 sentinel byte (i.e., key
131 : // should be a proper prefix from the perspective of Pebble).
132 1 : func AppendTimestamp(key []byte, walltime uint64, logical uint32) []byte {
133 1 : if key[len(key)-1] != 0 {
134 0 : panic(errors.AssertionFailedf("key does not end with 0x00 sentinel byte: %x", key))
135 : }
136 1 : if logical == 0 {
137 1 : if walltime == 0 {
138 1 : return key
139 1 : }
140 1 : key = append(key, make([]byte, 9)...)
141 1 : binary.BigEndian.PutUint64(key[len(key)-9:], walltime)
142 1 : key[len(key)-1] = 9 // Version length byte
143 1 : return key
144 : }
145 1 : key = append(key, make([]byte, 13)...)
146 1 : binary.BigEndian.PutUint64(key[len(key)-13:], walltime)
147 1 : binary.BigEndian.PutUint32(key[len(key)-5:], logical)
148 1 : key[len(key)-1] = 13 // Version length byte
149 1 : return key
150 :
151 : }
152 :
153 : // EncodeTimestamp encodes a MVCC timestamp into a key, returning the new key.
154 : // The key's capacity must be sufficiently large to hold the encoded timestamp.
155 1 : func EncodeTimestamp(key []byte, walltime uint64, logical uint32) []byte {
156 1 : pos := len(key)
157 1 : if logical == 0 {
158 1 : if walltime == 0 {
159 1 : key = key[:pos+1]
160 1 : key[pos] = 0 // sentinel byte
161 1 : return key
162 1 : }
163 :
164 1 : key = key[:pos+1+8+1]
165 1 : key[pos] = 0 // sentinel byte
166 1 : key[pos+1+8] = 9
167 1 : binary.BigEndian.PutUint64(key[pos+1:], walltime)
168 1 : return key
169 : }
170 :
171 1 : key = key[:pos+1+12+1]
172 1 : key[pos] = 0 // sentinel byte
173 1 : key[pos+1+8+4] = 13
174 1 : binary.BigEndian.PutUint64(key[pos+1:], walltime)
175 1 : binary.BigEndian.PutUint32(key[pos+1+8:], logical)
176 1 : return key
177 : }
178 :
179 : // DecodeEngineKey decodes the given bytes as an EngineKey.
180 0 : func DecodeEngineKey(b []byte) (roachKey, version []byte, ok bool) {
181 0 : if len(b) == 0 {
182 0 : return nil, nil, false
183 0 : }
184 : // Last byte is the version length + 1 when there is a version,
185 : // else it is 0.
186 0 : versionLen := int(b[len(b)-1])
187 0 : if versionLen == 1 {
188 0 : // The key encodes an empty version, which is not valid.
189 0 : return nil, nil, false
190 0 : }
191 : // keyPartEnd points to the sentinel byte.
192 0 : keyPartEnd := len(b) - 1 - versionLen
193 0 : if keyPartEnd < 0 || b[keyPartEnd] != 0x00 {
194 0 : return nil, nil, false
195 0 : }
196 : // roachKey excludes the sentinel byte.
197 0 : roachKey = b[:keyPartEnd]
198 0 : if versionLen > 0 {
199 0 : // version consists of the bytes after the sentinel and before the
200 0 : // length.
201 0 : version = b[keyPartEnd+1 : len(b)-1]
202 0 : }
203 0 : return roachKey, version, true
204 : }
205 :
206 : // Split implements base.Split for CockroachDB keys.
207 : //
208 : //go:inline
209 1 : func Split(key []byte) int {
210 1 : if len(key) == 0 {
211 0 : return 0
212 0 : }
213 :
214 : // Last byte is the version length + 1 when there is a version, else it is
215 : // 0.
216 1 : versionLen := int(key[len(key)-1])
217 1 : if versionLen > len(key) {
218 0 : panic(errors.AssertionFailedf("invalid version length"))
219 : }
220 1 : return len(key) - versionLen
221 : }
222 :
223 : // Compare compares cockroach keys, including the version (which could be MVCC
224 : // timestamps).
225 1 : func Compare(a, b []byte) int {
226 1 : if len(a) == 0 || len(b) == 0 {
227 0 : return cmp.Compare(len(a), len(b))
228 0 : }
229 1 : aSuffixLen := int(a[len(a)-1])
230 1 : aSuffixStart := len(a) - aSuffixLen
231 1 : bSuffixLen := int(b[len(b)-1])
232 1 : bSuffixStart := len(b) - bSuffixLen
233 1 :
234 1 : // Compare the "user key" part of the key.
235 1 : if c := bytes.Compare(a[:aSuffixStart], b[:bSuffixStart]); c != 0 {
236 1 : return c
237 1 : }
238 1 : if aSuffixLen == 0 || bSuffixLen == 0 {
239 1 : // Empty suffixes come before non-empty suffixes.
240 1 : return cmp.Compare(aSuffixLen, bSuffixLen)
241 1 : }
242 :
243 1 : return bytes.Compare(
244 1 : normalizeEngineSuffixForCompare(b[bSuffixStart:]),
245 1 : normalizeEngineSuffixForCompare(a[aSuffixStart:]),
246 1 : )
247 : }
248 :
249 : // CompareRangeSuffixes compares range key suffixes (normally timestamps).
250 1 : func CompareRangeSuffixes(a, b []byte) int {
251 1 : if len(a) == 0 || len(b) == 0 {
252 1 : // Empty suffixes come before non-empty suffixes.
253 1 : return cmp.Compare(len(a), len(b))
254 1 : }
255 : // Here we are not using normalizeEngineKeyVersionForCompare for historical
256 : // reasons, summarized in
257 : // https://github.com/cockroachdb/cockroach/issues/130533.
258 1 : return bytes.Compare(b[:len(b)-1], a[:len(a)-1])
259 : }
260 :
261 : // ComparePointSuffixes compares point key suffixes (normally timestamps).
262 1 : func ComparePointSuffixes(a, b []byte) int {
263 1 : if len(a) == 0 || len(b) == 0 {
264 1 : // Empty suffixes sort before non-empty suffixes.
265 1 : return cmp.Compare(len(a), len(b))
266 1 : }
267 1 : return bytes.Compare(normalizeEngineSuffixForCompare(b), normalizeEngineSuffixForCompare(a))
268 : }
269 :
270 : // Equal implements base.Equal for Cockroach keys.
271 1 : func Equal(a, b []byte) bool {
272 1 : if len(a) == 0 || len(b) == 0 {
273 0 : return len(a) == len(b)
274 0 : }
275 1 : aSuffixLen := int(a[len(a)-1])
276 1 : aSuffixStart := len(a) - aSuffixLen
277 1 : bSuffixLen := int(b[len(b)-1])
278 1 : bSuffixStart := len(b) - bSuffixLen
279 1 :
280 1 : // Fast-path: normalizeEngineSuffixForCompare doesn't strip off bytes when the
281 1 : // length is withWall or withLockTableLen. In this case, as well as cases with
282 1 : // no prefix, we can check for byte equality immediately.
283 1 : const withWall = mvccEncodedTimeSentinelLen + mvccEncodedTimeWallLen
284 1 : const withLockTableLen = mvccEncodedTimeSentinelLen + engineKeyVersionLockTableLen
285 1 : if (aSuffixLen <= withWall && bSuffixLen <= withWall) ||
286 1 : (aSuffixLen == withLockTableLen && bSuffixLen == withLockTableLen) ||
287 1 : aSuffixLen == 0 || bSuffixLen == 0 {
288 1 : return bytes.Equal(a, b)
289 1 : }
290 :
291 : // Compare the "user key" part of the key.
292 1 : if !bytes.Equal(a[:aSuffixStart], b[:bSuffixStart]) {
293 1 : return false
294 1 : }
295 :
296 1 : return bytes.Equal(
297 1 : normalizeEngineSuffixForCompare(a[aSuffixStart:]),
298 1 : normalizeEngineSuffixForCompare(b[bSuffixStart:]),
299 1 : )
300 : }
301 :
302 : var zeroLogical [4]byte
303 :
304 : // normalizeEngineSuffixForCompare takes a non-empty key suffix (including the
305 : // trailing sentinel byte) and returns a prefix of the buffer that should be
306 : // used for byte-wise comparison. It trims the trailing suffix length byte and
307 : // any other trailing bytes that need to be ignored (like a synthetic bit or
308 : // zero logical component).
309 : //
310 : //gcassert:inline
311 1 : func normalizeEngineSuffixForCompare(a []byte) []byte {
312 1 : // Check sentinel byte.
313 1 : if invariants.Enabled && len(a) != int(a[len(a)-1]) {
314 0 : panic(errors.AssertionFailedf("malformed suffix: %x", a))
315 : }
316 : // Strip off sentinel byte.
317 1 : a = a[:len(a)-1]
318 1 : switch len(a) {
319 1 : case engineKeyVersionWallLogicalAndSyntheticTimeLen:
320 1 : // Strip the synthetic bit component from the timestamp version. The
321 1 : // presence of the synthetic bit does not affect key ordering or equality.
322 1 : a = a[:engineKeyVersionWallAndLogicalTimeLen]
323 1 : fallthrough
324 1 : case engineKeyVersionWallAndLogicalTimeLen:
325 1 : // If the timestamp version contains a logical timestamp component that is
326 1 : // zero, strip the component. encodeMVCCTimestampToBuf will typically omit
327 1 : // the entire logical component in these cases as an optimization, but it
328 1 : // does not guarantee to never include a zero logical component.
329 1 : // Additionally, we can fall into this case after stripping off other
330 1 : // components of the key version earlier on in this function.
331 1 : if bytes.Equal(a[engineKeyVersionWallTimeLen:], zeroLogical[:]) {
332 1 : a = a[:engineKeyVersionWallTimeLen]
333 1 : }
334 1 : fallthrough
335 1 : case engineKeyVersionWallTimeLen:
336 : // Nothing to do.
337 :
338 1 : case engineKeyVersionLockTableLen:
339 : // We rely on engineKeyVersionLockTableLen being different from the other
340 : // lengths above to ensure that we don't strip parts from a non-timestamp
341 : // version.
342 :
343 0 : default:
344 0 : if invariants.Enabled {
345 0 : panic(errors.AssertionFailedf("version with unexpected length: %x", a))
346 : }
347 : }
348 1 : return a
349 : }
350 :
351 0 : func getKeyPartFromEngineKey(engineKey []byte) (key []byte, ok bool) {
352 0 : if len(engineKey) == 0 {
353 0 : return nil, false
354 0 : }
355 : // Last byte is the version length + 1 when there is a version,
356 : // else it is 0.
357 0 : versionLen := int(engineKey[len(engineKey)-1])
358 0 : // keyPartEnd points to the sentinel byte.
359 0 : keyPartEnd := len(engineKey) - 1 - versionLen
360 0 : if keyPartEnd < 0 || engineKey[keyPartEnd] != 0x00 {
361 0 : return nil, false
362 0 : }
363 : // Key excludes the sentinel byte.
364 0 : return engineKey[:keyPartEnd], true
365 : }
366 :
367 : const (
368 : cockroachColRoachKey int = iota
369 : cockroachColMVCCWallTime
370 : cockroachColMVCCLogical
371 : cockroachColUntypedVersion
372 : cockroachColCount
373 : )
374 :
375 : var KeySchema = colblk.KeySchema{
376 : Name: "crdb1",
377 : HeaderSize: 1,
378 : ColumnTypes: []colblk.DataType{
379 : cockroachColRoachKey: colblk.DataTypePrefixBytes,
380 : cockroachColMVCCWallTime: colblk.DataTypeUint,
381 : cockroachColMVCCLogical: colblk.DataTypeUint,
382 : cockroachColUntypedVersion: colblk.DataTypeBytes,
383 : },
384 1 : NewKeyWriter: func() colblk.KeyWriter {
385 1 : return makeCockroachKeyWriter()
386 1 : },
387 1 : InitKeySeekerMetadata: func(meta *colblk.KeySeekerMetadata, d *colblk.DataBlockDecoder) {
388 1 : ks := (*cockroachKeySeeker)(unsafe.Pointer(meta))
389 1 : ks.init(d)
390 1 : },
391 1 : KeySeeker: func(meta *colblk.KeySeekerMetadata) colblk.KeySeeker {
392 1 : return (*cockroachKeySeeker)(unsafe.Pointer(meta))
393 1 : },
394 : }
395 :
396 : // suffixTypes is a bitfield indicating what kind of suffixes are present in a
397 : // block.
398 : type suffixTypes uint8
399 :
400 : const (
401 : // hasMVCCSuffixes is set if there is at least one key with an MVCC suffix in
402 : // the block.
403 : hasMVCCSuffixes suffixTypes = (1 << iota)
404 : // hasEmptySuffixes is set if there is at least one key with no suffix in the block.
405 : hasEmptySuffixes
406 : // hasNonMVCCSuffixes is set if there is at least one key with a non-empty,
407 : // non-MVCC suffix.
408 : hasNonMVCCSuffixes
409 : )
410 :
411 1 : func (s suffixTypes) String() string {
412 1 : var suffixes []string
413 1 : if s&hasMVCCSuffixes != 0 {
414 1 : suffixes = append(suffixes, "mvcc")
415 1 : }
416 1 : if s&hasEmptySuffixes != 0 {
417 1 : suffixes = append(suffixes, "empty")
418 1 : }
419 1 : if s&hasNonMVCCSuffixes != 0 {
420 1 : suffixes = append(suffixes, "non-mvcc")
421 1 : }
422 1 : if len(suffixes) == 0 {
423 0 : return "none"
424 0 : }
425 1 : return strings.Join(suffixes, ",")
426 : }
427 :
428 : type cockroachKeyWriter struct {
429 : roachKeys colblk.PrefixBytesBuilder
430 : wallTimes colblk.UintBuilder
431 : logicalTimes colblk.UintBuilder
432 : untypedVersions colblk.RawBytesBuilder
433 : suffixTypes suffixTypes
434 : prevRoachKeyLen int32
435 : prevSuffix []byte
436 : }
437 :
438 1 : func makeCockroachKeyWriter() *cockroachKeyWriter {
439 1 : kw := &cockroachKeyWriter{}
440 1 : kw.roachKeys.Init(16)
441 1 : kw.wallTimes.Init()
442 1 : kw.logicalTimes.InitWithDefault()
443 1 : kw.untypedVersions.Init()
444 1 : return kw
445 1 : }
446 :
447 1 : func (kw *cockroachKeyWriter) Reset() {
448 1 : kw.roachKeys.Reset()
449 1 : kw.wallTimes.Reset()
450 1 : kw.logicalTimes.Reset()
451 1 : kw.untypedVersions.Reset()
452 1 : kw.suffixTypes = 0
453 1 : kw.prevRoachKeyLen = 0
454 1 : }
455 :
456 1 : func (kw *cockroachKeyWriter) ComparePrev(key []byte) colblk.KeyComparison {
457 1 : prefixLen := Split(key)
458 1 : if kw.roachKeys.Rows() == 0 {
459 1 : return colblk.KeyComparison{
460 1 : PrefixLen: int32(prefixLen),
461 1 : CommonPrefixLen: 0,
462 1 : UserKeyComparison: +1,
463 1 : }
464 1 : }
465 1 : lastRoachKey := kw.roachKeys.UnsafeGet(kw.roachKeys.Rows() - 1)
466 1 : commonPrefixLen := crbytes.CommonPrefix(lastRoachKey, key[:prefixLen-1])
467 1 : if len(lastRoachKey) == commonPrefixLen {
468 1 : if invariants.Enabled && len(lastRoachKey) > prefixLen-1 {
469 0 : panic(errors.AssertionFailedf("out-of-order keys: previous roach key %q > roach key of key %q",
470 0 : lastRoachKey, key))
471 : }
472 : // All the bytes of the previous roach key form a byte-wise prefix of
473 : // [key]'s prefix. The last byte of the previous prefix is the 0x00
474 : // sentinel byte, which is not stored within roachKeys. It's possible
475 : // that [key] also has a 0x00 byte in the same position (either also
476 : // serving as a sentinel byte, in which case the prefixes are equal, or
477 : // not in which case [key] is greater). In both cases, we need to
478 : // increment CommonPrefixLen.
479 1 : if key[commonPrefixLen] == 0x00 {
480 1 : commonPrefixLen++
481 1 : if commonPrefixLen == prefixLen {
482 1 : // The prefixes are equal; compare the suffixes.
483 1 : return colblk.KeyComparison{
484 1 : PrefixLen: int32(prefixLen),
485 1 : CommonPrefixLen: int32(commonPrefixLen),
486 1 : UserKeyComparison: int32(ComparePointSuffixes(key[prefixLen:], kw.prevSuffix)),
487 1 : }
488 1 : }
489 : }
490 : // prefixLen > commonPrefixLen; key is greater.
491 1 : return colblk.KeyComparison{
492 1 : PrefixLen: int32(prefixLen),
493 1 : CommonPrefixLen: int32(commonPrefixLen),
494 1 : UserKeyComparison: +1,
495 1 : }
496 : }
497 : // Both keys have at least 1 additional byte at which they diverge.
498 : // Compare the diverging byte.
499 1 : return colblk.KeyComparison{
500 1 : PrefixLen: int32(prefixLen),
501 1 : CommonPrefixLen: int32(commonPrefixLen),
502 1 : UserKeyComparison: int32(cmp.Compare(key[commonPrefixLen], lastRoachKey[commonPrefixLen])),
503 1 : }
504 : }
505 :
506 : func (kw *cockroachKeyWriter) WriteKey(
507 : row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32,
508 1 : ) {
509 1 : if len(key) == 0 {
510 0 : panic(errors.AssertionFailedf("empty key"))
511 : }
512 : // Last byte is the version length + 1 when there is a version,
513 : // else it is 0.
514 1 : versionLen := int(key[len(key)-1])
515 1 : if (len(key)-versionLen) != int(keyPrefixLen) || key[keyPrefixLen-1] != 0x00 {
516 0 : panic(errors.AssertionFailedf("invalid %d-byte key with %d-byte prefix (%q)",
517 0 : len(key), keyPrefixLen, key))
518 : }
519 : // TODO(jackson): Avoid copying the previous suffix.
520 1 : kw.prevSuffix = append(kw.prevSuffix[:0], key[keyPrefixLen:]...)
521 1 :
522 1 : // When the roach key is the same or contain the previous key as a prefix,
523 1 : // keyPrefixLenSharedWithPrev includes the previous key's separator byte.
524 1 : kw.roachKeys.Put(key[:keyPrefixLen-1], min(int(keyPrefixLenSharedWithPrev), int(kw.prevRoachKeyLen)))
525 1 : kw.prevRoachKeyLen = keyPrefixLen - 1
526 1 :
527 1 : // NB: The w.logicalTimes builder was initialized with InitWithDefault, so
528 1 : // if we don't set a value, the column value is implicitly zero. We only
529 1 : // need to Set anything for non-zero values.
530 1 : var wallTime uint64
531 1 : var untypedVersion []byte
532 1 : switch versionLen {
533 1 : case 0:
534 1 : // No-op.
535 1 : kw.suffixTypes |= hasEmptySuffixes
536 1 : case 9:
537 1 : kw.suffixTypes |= hasMVCCSuffixes
538 1 : wallTime = binary.BigEndian.Uint64(key[keyPrefixLen : keyPrefixLen+8])
539 1 : case 13, 14:
540 1 : kw.suffixTypes |= hasMVCCSuffixes
541 1 : wallTime = binary.BigEndian.Uint64(key[keyPrefixLen : keyPrefixLen+8])
542 1 : kw.logicalTimes.Set(row, uint64(binary.BigEndian.Uint32(key[keyPrefixLen+8:keyPrefixLen+12])))
543 : // NOTE: byte 13 used to store the timestamp's synthetic bit, but this is no
544 : // longer consulted and can be ignored during decoding.
545 1 : default:
546 1 : // Not a MVCC timestamp.
547 1 : kw.suffixTypes |= hasNonMVCCSuffixes
548 1 : untypedVersion = key[keyPrefixLen : len(key)-1]
549 : }
550 1 : kw.wallTimes.Set(row, wallTime)
551 1 : kw.untypedVersions.Put(untypedVersion)
552 : }
553 :
554 1 : func (kw *cockroachKeyWriter) MaterializeKey(dst []byte, i int) []byte {
555 1 : dst = append(dst, kw.roachKeys.UnsafeGet(i)...)
556 1 : // Append separator byte.
557 1 : dst = append(dst, 0)
558 1 : if untypedVersion := kw.untypedVersions.UnsafeGet(i); len(untypedVersion) > 0 {
559 1 : dst = append(dst, untypedVersion...)
560 1 : dst = append(dst, byte(len(untypedVersion)+1))
561 1 : return dst
562 1 : }
563 1 : return AppendTimestamp(dst, kw.wallTimes.Get(i), uint32(kw.logicalTimes.Get(i)))
564 : }
565 :
566 0 : func (kw *cockroachKeyWriter) WriteDebug(dst io.Writer, rows int) {
567 0 : fmt.Fprint(dst, "prefixes: ")
568 0 : kw.roachKeys.WriteDebug(dst, rows)
569 0 : fmt.Fprintln(dst)
570 0 : fmt.Fprint(dst, "wall times: ")
571 0 : kw.wallTimes.WriteDebug(dst, rows)
572 0 : fmt.Fprintln(dst)
573 0 : fmt.Fprint(dst, "logical times: ")
574 0 : kw.logicalTimes.WriteDebug(dst, rows)
575 0 : fmt.Fprintln(dst)
576 0 : fmt.Fprint(dst, "untyped suffixes: ")
577 0 : kw.untypedVersions.WriteDebug(dst, rows)
578 0 : fmt.Fprintln(dst)
579 0 : fmt.Fprint(dst, "suffix types: ")
580 0 : fmt.Fprintln(dst, kw.suffixTypes.String())
581 0 : }
582 :
583 1 : func (kw *cockroachKeyWriter) NumColumns() int {
584 1 : return cockroachColCount
585 1 : }
586 :
587 1 : func (kw *cockroachKeyWriter) DataType(col int) colblk.DataType {
588 1 : return KeySchema.ColumnTypes[col]
589 1 : }
590 :
591 1 : func (kw *cockroachKeyWriter) Size(rows int, offset uint32) uint32 {
592 1 : offset = kw.roachKeys.Size(rows, offset)
593 1 : offset = kw.wallTimes.Size(rows, offset)
594 1 : offset = kw.logicalTimes.Size(rows, offset)
595 1 : offset = kw.untypedVersions.Size(rows, offset)
596 1 : return offset
597 1 : }
598 :
599 : func (kw *cockroachKeyWriter) Finish(
600 : col int, rows int, offset uint32, buf []byte,
601 1 : ) (endOffset uint32) {
602 1 : switch col {
603 1 : case cockroachColRoachKey:
604 1 : return kw.roachKeys.Finish(0, rows, offset, buf)
605 1 : case cockroachColMVCCWallTime:
606 1 : return kw.wallTimes.Finish(0, rows, offset, buf)
607 1 : case cockroachColMVCCLogical:
608 1 : return kw.logicalTimes.Finish(0, rows, offset, buf)
609 1 : case cockroachColUntypedVersion:
610 1 : return kw.untypedVersions.Finish(0, rows, offset, buf)
611 0 : default:
612 0 : panic(fmt.Sprintf("unknown default key column: %d", col))
613 : }
614 : }
615 :
616 1 : func (kw *cockroachKeyWriter) FinishHeader(buf []byte) {
617 1 : buf[0] = byte(kw.suffixTypes)
618 1 : }
619 :
620 : type cockroachKeySeeker struct {
621 : roachKeys colblk.PrefixBytes
622 : roachKeyChanged colblk.Bitmap
623 : mvccWallTimes colblk.UnsafeUints
624 : mvccLogical colblk.UnsafeUints
625 : untypedVersions colblk.RawBytes
626 : suffixTypes suffixTypes
627 : }
628 :
629 : // Assert that the cockroachKeySeeker fits inside KeySeekerMetadata.
630 : var _ uint = colblk.KeySeekerMetadataSize - uint(unsafe.Sizeof(cockroachKeySeeker{}))
631 :
632 : var _ colblk.KeySeeker = (*cockroachKeySeeker)(nil)
633 :
634 1 : func (ks *cockroachKeySeeker) init(d *colblk.DataBlockDecoder) {
635 1 : bd := d.BlockDecoder()
636 1 : ks.roachKeys = bd.PrefixBytes(cockroachColRoachKey)
637 1 : ks.roachKeyChanged = d.PrefixChanged()
638 1 : ks.mvccWallTimes = bd.Uints(cockroachColMVCCWallTime)
639 1 : ks.mvccLogical = bd.Uints(cockroachColMVCCLogical)
640 1 : ks.untypedVersions = bd.RawBytes(cockroachColUntypedVersion)
641 1 : header := d.KeySchemaHeader()
642 1 : if len(header) != 1 {
643 0 : panic(errors.AssertionFailedf("invalid key schema-specific header %x", header))
644 : }
645 1 : ks.suffixTypes = suffixTypes(header[0])
646 : }
647 :
648 : // IsLowerBound is part of the KeySeeker interface.
649 0 : func (ks *cockroachKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) bool {
650 0 : roachKey, version, ok := DecodeEngineKey(k)
651 0 : if !ok {
652 0 : panic(errors.AssertionFailedf("invalid key %q", k))
653 : }
654 0 : if v := bytes.Compare(ks.roachKeys.UnsafeFirstSlice(), roachKey); v != 0 {
655 0 : return v > 0
656 0 : }
657 : // If there's a synthetic suffix, we ignore the block's suffix columns and
658 : // compare the key's suffix to the synthetic suffix.
659 0 : if len(syntheticSuffix) > 0 {
660 0 : return ComparePointSuffixes(syntheticSuffix, k[len(roachKey)+1:]) >= 0
661 0 : }
662 0 : var wallTime uint64
663 0 : var logicalTime uint32
664 0 : switch len(version) {
665 0 : case engineKeyNoVersion:
666 0 : case engineKeyVersionWallTimeLen:
667 0 : wallTime = binary.BigEndian.Uint64(version[:8])
668 0 : case engineKeyVersionWallAndLogicalTimeLen, engineKeyVersionWallLogicalAndSyntheticTimeLen:
669 0 : wallTime = binary.BigEndian.Uint64(version[:8])
670 0 : logicalTime = binary.BigEndian.Uint32(version[8:12])
671 0 : default:
672 0 : // The provided key `k` is not a MVCC key. Assert that the first key in
673 0 : // the block is also not an MVCC key. If it were, that would mean there
674 0 : // exists both a MVCC key and a non-MVCC key with the same prefix.
675 0 : //
676 0 : // TODO(jackson): Double check that we'll never produce index separators
677 0 : // that are invalid version lengths.
678 0 : if invariants.Enabled && ks.mvccWallTimes.At(0) != 0 {
679 0 : panic("comparing timestamp with untyped suffix")
680 : }
681 0 : return ComparePointSuffixes(ks.untypedVersions.At(0), version) >= 0
682 : }
683 :
684 : // NB: The sign comparison is inverted because suffixes are sorted such that
685 : // the largest timestamps are "smaller" in the lexicographical ordering.
686 0 : if v := cmp.Compare(ks.mvccWallTimes.At(0), wallTime); v != 0 {
687 0 : return v < 0
688 0 : }
689 0 : return cmp.Compare(uint32(ks.mvccLogical.At(0)), logicalTime) <= 0
690 : }
691 :
692 : // SeekGE is part of the KeySeeker interface.
693 : func (ks *cockroachKeySeeker) SeekGE(
694 : key []byte, boundRow int, searchDir int8,
695 1 : ) (row int, equalPrefix bool) {
696 1 : si := Split(key)
697 1 : row, eq := ks.roachKeys.Search(key[:si-1])
698 1 : if eq {
699 1 : return ks.seekGEOnSuffix(row, key[si:]), true
700 1 : }
701 1 : return row, false
702 : }
703 :
704 : // seekGEOnSuffix is a helper function for SeekGE when a seek key's prefix
705 : // exactly matches a row. seekGEOnSuffix finds the first row at index or later
706 : // with the same prefix as index and a suffix greater than or equal to [suffix],
707 : // or if no such row exists, the next row with a different prefix.
708 1 : func (ks *cockroachKeySeeker) seekGEOnSuffix(index int, seekSuffix []byte) (row int) {
709 1 : // We have three common cases:
710 1 : // 1. The seek key has no suffix.
711 1 : // 2. We are seeking to an MVCC timestamp in a block where all keys have
712 1 : // MVCC timestamps (e.g. SQL table data).
713 1 : // 3. We are seeking to a non-MVCC timestamp in a block where no keys have
714 1 : // MVCC timestamps (e.g. lock keys).
715 1 :
716 1 : if len(seekSuffix) == 0 {
717 1 : // The search key has no suffix, so it's the smallest possible key with its
718 1 : // prefix. Return the row. This is a common case where the user is seeking
719 1 : // to the most-recent row and just wants the smallest key with the prefix.
720 1 : return index
721 1 : }
722 :
723 1 : const withWall = 9
724 1 : const withLogical = withWall + 4
725 1 : const withSynthetic = withLogical + 1
726 1 :
727 1 : // If suffixTypes == hasMVCCSuffixes, all keys in the block have MVCC
728 1 : // suffixes. Note that blocks that contain both MVCC and non-MVCC should be
729 1 : // very rare, so it's ok to use the more general path below in that case.
730 1 : if ks.suffixTypes == hasMVCCSuffixes && (len(seekSuffix) == withWall || len(seekSuffix) == withLogical || len(seekSuffix) == withSynthetic) {
731 1 : // Fast path: seeking among MVCC versions using a MVCC timestamp.
732 1 : seekWallTime := binary.BigEndian.Uint64(seekSuffix)
733 1 : var seekLogicalTime uint32
734 1 : if len(seekSuffix) >= withLogical {
735 1 : seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:])
736 1 : }
737 :
738 : // First check the suffix at index, because querying for the latest value is
739 : // the most common case.
740 1 : if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime ||
741 1 : (latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) {
742 1 : return index
743 1 : }
744 :
745 : // Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)].
746 : //
747 : // Define f(i) = true iff key at i is >= seek key.
748 : // Invariant: f(l-1) == false, f(u) == true.
749 1 : l := index + 1
750 1 : u := ks.roachKeyChanged.SeekSetBitGE(index + 1)
751 1 :
752 1 : for l < u {
753 1 : m := int(uint(l+u) >> 1) // avoid overflow when computing m
754 1 : // l ≤ m < u
755 1 : mWallTime := ks.mvccWallTimes.At(m)
756 1 : if mWallTime < seekWallTime ||
757 1 : (mWallTime == seekWallTime && uint32(ks.mvccLogical.At(m)) <= seekLogicalTime) {
758 1 : u = m // preserves f(u) = true
759 1 : } else {
760 1 : l = m + 1 // preserves f(l-1) = false
761 1 : }
762 : }
763 1 : return l
764 : }
765 :
766 : // Remove the terminator byte, which we know is equal to len(seekSuffix)
767 : // because we obtained the suffix by splitting the seek key.
768 1 : version := seekSuffix[:len(seekSuffix)-1]
769 1 : if invariants.Enabled && seekSuffix[len(version)] != byte(len(seekSuffix)) {
770 0 : panic(errors.AssertionFailedf("invalid seek suffix %x", seekSuffix))
771 : }
772 :
773 : // Binary search for version between [index, prefixChanged.SeekSetBitGE(index+1)].
774 : //
775 : // Define f(i) = true iff key at i is >= seek key (i.e. suffix at i is <= seek suffix).
776 : // Invariant: f(l-1) == false, f(u) == true.
777 1 : l := index
778 1 : u := ks.roachKeyChanged.SeekSetBitGE(index + 1)
779 1 : for l < u {
780 1 : m := int(uint(l+u) >> 1) // avoid overflow when computing m
781 1 : // l ≤ m < u
782 1 : mVer := ks.untypedVersions.At(m)
783 1 : if len(mVer) == 0 {
784 1 : wallTime := ks.mvccWallTimes.At(m)
785 1 : logicalTime := uint32(ks.mvccLogical.At(m))
786 1 : if wallTime == 0 && logicalTime == 0 {
787 1 : // This row has an empty suffix. Since the seek suffix is not empty, it comes after.
788 1 : l = m + 1
789 1 : continue
790 : }
791 :
792 : // Note: this path is not very performance sensitive: blocks that mix MVCC
793 : // suffixes with non-MVCC suffixes should be rare.
794 :
795 : //gcassert:noescape
796 1 : var buf [12]byte
797 1 : //gcassert:inline
798 1 : binary.BigEndian.PutUint64(buf[:], wallTime)
799 1 : if logicalTime == 0 {
800 1 : mVer = buf[:8]
801 1 : } else {
802 1 : //gcassert:inline
803 1 : binary.BigEndian.PutUint32(buf[8:], logicalTime)
804 1 : mVer = buf[:12]
805 1 : }
806 : }
807 1 : if bytes.Compare(mVer, version) <= 0 {
808 1 : u = m // preserves f(u) == true
809 1 : } else {
810 1 : l = m + 1 // preserves f(l-1) == false
811 1 : }
812 : }
813 1 : return l
814 : }
815 :
816 : // MaterializeUserKey is part of the KeySeeker interface.
817 : func (ks *cockroachKeySeeker) MaterializeUserKey(
818 : ki *colblk.PrefixBytesIter, prevRow, row int,
819 1 : ) []byte {
820 1 : if invariants.Enabled && (row < 0 || row >= ks.roachKeys.Rows()) {
821 0 : panic(errors.AssertionFailedf("invalid row number %d", row))
822 : }
823 1 : if prevRow+1 == row && prevRow >= 0 {
824 1 : ks.roachKeys.SetNext(ki)
825 1 : } else {
826 1 : ks.roachKeys.SetAt(ki, row)
827 1 : }
828 :
829 1 : roachKeyLen := len(ki.Buf)
830 1 : ptr := unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(ki.Buf))) + uintptr(roachKeyLen))
831 1 : mvccWall := ks.mvccWallTimes.At(row)
832 1 : mvccLogical := uint32(ks.mvccLogical.At(row))
833 1 : if mvccWall == 0 && mvccLogical == 0 {
834 1 : // This is not an MVCC key. Use the untyped suffix.
835 1 : untypedVersion := ks.untypedVersions.At(row)
836 1 : if len(untypedVersion) == 0 {
837 1 : res := ki.Buf[:roachKeyLen+1]
838 1 : res[roachKeyLen] = 0
839 1 : return res
840 1 : }
841 : // Slice first, to check that the capacity is sufficient.
842 1 : res := ki.Buf[:roachKeyLen+2+len(untypedVersion)]
843 1 : *(*byte)(ptr) = 0
844 1 : memmove(
845 1 : unsafe.Pointer(uintptr(ptr)+1),
846 1 : unsafe.Pointer(unsafe.SliceData(untypedVersion)),
847 1 : uintptr(len(untypedVersion)),
848 1 : )
849 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + uintptr(len(untypedVersion)+1))) = byte(len(untypedVersion) + 1)
850 1 : return res
851 : }
852 :
853 : // Inline binary.BigEndian.PutUint64. Note that this code is converted into
854 : // word-size instructions by the compiler.
855 1 : *(*byte)(ptr) = 0
856 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 1)) = byte(mvccWall >> 56)
857 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 2)) = byte(mvccWall >> 48)
858 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 3)) = byte(mvccWall >> 40)
859 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 4)) = byte(mvccWall >> 32)
860 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 5)) = byte(mvccWall >> 24)
861 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 6)) = byte(mvccWall >> 16)
862 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 7)) = byte(mvccWall >> 8)
863 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 8)) = byte(mvccWall)
864 1 :
865 1 : ptr = unsafe.Pointer(uintptr(ptr) + 9)
866 1 : // This is an MVCC key.
867 1 : if mvccLogical == 0 {
868 1 : *(*byte)(ptr) = 9
869 1 : return ki.Buf[:len(ki.Buf)+10]
870 1 : }
871 :
872 : // Inline binary.BigEndian.PutUint32.
873 1 : *(*byte)(ptr) = byte(mvccLogical >> 24)
874 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 1)) = byte(mvccLogical >> 16)
875 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 2)) = byte(mvccLogical >> 8)
876 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 3)) = byte(mvccLogical)
877 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 4)) = 13
878 1 : return ki.Buf[:len(ki.Buf)+14]
879 : }
880 :
881 : // MaterializeUserKeyWithSyntheticSuffix is part of the KeySeeker interface.
882 : func (ks *cockroachKeySeeker) MaterializeUserKeyWithSyntheticSuffix(
883 : ki *colblk.PrefixBytesIter, suffix []byte, prevRow, row int,
884 0 : ) []byte {
885 0 : if prevRow+1 == row && prevRow >= 0 {
886 0 : ks.roachKeys.SetNext(ki)
887 0 : } else {
888 0 : ks.roachKeys.SetAt(ki, row)
889 0 : }
890 :
891 : // Slice first, to check that the capacity is sufficient.
892 0 : res := ki.Buf[:len(ki.Buf)+1+len(suffix)]
893 0 : ptr := unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(ki.Buf))) + uintptr(len(ki.Buf)))
894 0 : *(*byte)(ptr) = 0
895 0 : memmove(unsafe.Pointer(uintptr(ptr)+1), unsafe.Pointer(unsafe.SliceData(suffix)), uintptr(len(suffix)))
896 0 : return res
897 : }
898 :
899 : //go:linkname memmove runtime.memmove
900 : func memmove(to, from unsafe.Pointer, n uintptr)
|