Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : // Package crdbtest provides facilities for representing keys, workloads, etc
6 : // representative of CockroachDB's use of Pebble.
7 : package crdbtest
8 :
9 : import (
10 : "bytes"
11 : "cmp"
12 : "encoding/binary"
13 : "fmt"
14 : "io"
15 : "math/rand/v2"
16 : "slices"
17 : "sync"
18 : "time"
19 : "unsafe"
20 :
21 : "github.com/cockroachdb/crlib/crbytes"
22 : "github.com/cockroachdb/crlib/crstrings"
23 : "github.com/cockroachdb/errors"
24 : "github.com/cockroachdb/pebble/internal/base"
25 : "github.com/cockroachdb/pebble/internal/invariants"
26 : "github.com/cockroachdb/pebble/sstable/colblk"
27 : )
28 :
29 : const withWall = 9
30 : const withLogical = withWall + 4
31 : const withSynthetic = withLogical + 1
32 : const withLockTableLen = 18
33 :
34 : // MaxSuffixLen is the maximum length of the CockroachDB key suffix.
35 : const MaxSuffixLen = max(withLockTableLen, withSynthetic, withLogical, withWall)
36 :
37 : // Comparer is a base.Comparer for CockroachDB keys.
38 : var Comparer = base.Comparer{
39 : Split: Split,
40 : CompareSuffixes: CompareSuffixes,
41 : Compare: Compare,
42 : Equal: Equal,
43 0 : AbbreviatedKey: func(k []byte) uint64 {
44 0 : key, ok := getKeyPartFromEngineKey(k)
45 0 : if !ok {
46 0 : return 0
47 0 : }
48 0 : return base.DefaultComparer.AbbreviatedKey(key)
49 : },
50 : FormatKey: base.DefaultFormatter,
51 0 : Separator: func(dst, a, b []byte) []byte {
52 0 : aKey, ok := getKeyPartFromEngineKey(a)
53 0 : if !ok {
54 0 : return append(dst, a...)
55 0 : }
56 0 : bKey, ok := getKeyPartFromEngineKey(b)
57 0 : if !ok {
58 0 : return append(dst, a...)
59 0 : }
60 : // If the keys are the same just return a.
61 0 : if bytes.Equal(aKey, bKey) {
62 0 : return append(dst, a...)
63 0 : }
64 0 : n := len(dst)
65 0 : dst = base.DefaultComparer.Separator(dst, aKey, bKey)
66 0 : // Did it pick a separator different than aKey -- if it did not we can't do better than a.
67 0 : buf := dst[n:]
68 0 : if bytes.Equal(aKey, buf) {
69 0 : return append(dst[:n], a...)
70 0 : }
71 : // The separator is > aKey, so we only need to add the sentinel.
72 0 : return append(dst, 0)
73 : },
74 0 : Successor: func(dst, a []byte) []byte {
75 0 : aKey, ok := getKeyPartFromEngineKey(a)
76 0 : if !ok {
77 0 : return append(dst, a...)
78 0 : }
79 0 : n := len(dst)
80 0 : // Engine key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as
81 0 : // pebble.DefaultComparer, so reuse the latter's SeekSetBitGE implementation.
82 0 : dst = base.DefaultComparer.Successor(dst, aKey)
83 0 : // Did it pick a successor different than aKey -- if it did not we can't do better than a.
84 0 : buf := dst[n:]
85 0 : if bytes.Equal(aKey, buf) {
86 0 : return append(dst[:n], a...)
87 0 : }
88 : // The successor is > aKey, so we only need to add the sentinel.
89 0 : return append(dst, 0)
90 : },
91 0 : ImmediateSuccessor: func(dst, a []byte) []byte {
92 0 : // The key `a` is guaranteed to be a bare prefix: It's a
93 0 : // `engineKeyNoVersion` key without a version—just a trailing 0-byte to
94 0 : // signify the length of the version. For example the user key "foo" is
95 0 : // encoded as: "foo\0". We need to encode the immediate successor to
96 0 : // "foo", which in the natural byte ordering is "foo\0". Append a
97 0 : // single additional zero, to encode the user key "foo\0" with a
98 0 : // zero-length version.
99 0 : return append(append(dst, a...), 0)
100 0 : },
101 : Name: "cockroach_comparator",
102 : }
103 :
104 : // EncodeMVCCKey encodes a MVCC key into dst, growing dst as necessary.
105 1 : func EncodeMVCCKey(dst []byte, key []byte, walltime uint64, logical uint32) []byte {
106 1 : if cap(dst) < len(key)+withSynthetic {
107 1 : dst = make([]byte, 0, len(key)+withSynthetic)
108 1 : }
109 1 : dst = append(dst[:0], key...)
110 1 : return EncodeTimestamp(dst, walltime, logical)
111 : }
112 :
113 : // AppendTimestamp appends an encoded MVCC timestamp onto key, returning the new
114 : // key. The provided key should already have the 0x00 sentinel byte (i.e., key
115 : // should be a proper prefix from the perspective of Pebble).
116 1 : func AppendTimestamp(key []byte, walltime uint64, logical uint32) []byte {
117 1 : if key[len(key)-1] != 0 {
118 0 : panic(errors.AssertionFailedf("key does not end with 0x00 sentinel byte: %x", key))
119 : }
120 1 : if logical == 0 {
121 1 : if walltime == 0 {
122 0 : return key
123 0 : }
124 1 : key = append(key, make([]byte, 9)...)
125 1 : binary.BigEndian.PutUint64(key[len(key)-9:], walltime)
126 1 : key[len(key)-1] = 9 // Version length byte
127 1 : return key
128 : }
129 0 : key = append(key, make([]byte, 13)...)
130 0 : binary.BigEndian.PutUint64(key[len(key)-13:], walltime)
131 0 : binary.BigEndian.PutUint32(key[len(key)-5:], logical)
132 0 : key[len(key)-1] = 13 // Version length byte
133 0 : return key
134 :
135 : }
136 :
137 : // EncodeTimestamp encodes a MVCC timestamp into a key, returning the new key.
138 : // The key's capacity must be sufficiently large to hold the encoded timestamp.
139 1 : func EncodeTimestamp(key []byte, walltime uint64, logical uint32) []byte {
140 1 : pos := len(key)
141 1 : if logical == 0 {
142 1 : if walltime == 0 {
143 1 : key = key[:pos+1]
144 1 : key[pos] = 0 // sentinel byte
145 1 : return key
146 1 : }
147 :
148 1 : key = key[:pos+1+8+1]
149 1 : key[pos] = 0 // sentinel byte
150 1 : key[pos+1+8] = 9
151 1 : binary.BigEndian.PutUint64(key[pos+1:], walltime)
152 1 : return key
153 : }
154 :
155 1 : key = key[:pos+1+12+1]
156 1 : key[pos] = 0 // sentinel byte
157 1 : key[pos+1+8+4] = 13
158 1 : binary.BigEndian.PutUint64(key[pos+1:], walltime)
159 1 : binary.BigEndian.PutUint32(key[pos+1+8:], logical)
160 1 : return key
161 : }
162 :
163 : // DecodeEngineKey decodes an Engine key (the key that gets stored in Pebble).
164 : // Returns:
165 : //
166 : // - roachKey: key prefix without the separator byte; corresponds to
167 : // MVCCKey.Key / EngineKey.Key (roachpb.Key type).
168 : //
169 : // - untypedVersion: when the suffix does not correspond to a timestamp,
170 : // untypedVersion is set to the suffix without the length
171 : // byte; corresponds to EngineKey.Version.
172 : //
173 : // - wallTime, logicalTime: timestamp for the key, or 0 if the key does not
174 : // correspond to a timestamp.
175 : func DecodeEngineKey(
176 : engineKey []byte,
177 1 : ) (roachKey []byte, untypedVersion []byte, wallTime uint64, logicalTime uint32) {
178 1 : tsLen := int(engineKey[len(engineKey)-1])
179 1 : tsStart := len(engineKey) - tsLen
180 1 : if tsStart < 0 {
181 0 : if invariants.Enabled {
182 0 : panic("invalid length byte")
183 : }
184 0 : return nil, nil, 0, 0
185 : }
186 1 : roachKeyEnd := tsStart - 1
187 1 : if roachKeyEnd < 0 {
188 0 : roachKeyEnd = 0
189 1 : } else if invariants.Enabled && engineKey[roachKeyEnd] != 0 {
190 0 : panic("invalid separator byte")
191 : }
192 :
193 1 : roachKey = engineKey[:roachKeyEnd]
194 1 : untypedVersion, wallTime, logicalTime = DecodeSuffix(engineKey[tsStart:])
195 1 : return roachKey, untypedVersion, wallTime, logicalTime
196 : }
197 :
198 : // DecodeSuffix decodes the suffix of a key. If the suffix is a timestamp,
199 : // wallTime and logicalTime are returned; otherwise untypedVersion contains the
200 : // version (which is the suffix without the terminator length byte).
201 : //
202 : //gcassert:inline
203 1 : func DecodeSuffix(suffix []byte) (untypedVersion []byte, wallTime uint64, logicalTime uint32) {
204 1 : if invariants.Enabled && len(suffix) > 0 && len(suffix) != int(suffix[len(suffix)-1]) {
205 0 : panic("invalid length byte")
206 : }
207 1 : switch len(suffix) {
208 0 : case 0:
209 0 : return nil, 0, 0
210 1 : case 9:
211 1 : return nil, binary.BigEndian.Uint64(suffix[:8]), 0
212 0 : case 13, 14:
213 0 : return nil, binary.BigEndian.Uint64(suffix[:8]), binary.BigEndian.Uint32(suffix[8:12])
214 0 : default:
215 0 : return suffix[:len(suffix)-1], 0, 0
216 : }
217 : }
218 :
219 : // Split implements base.Split for CockroachDB keys.
220 1 : func Split(key []byte) int {
221 1 : if len(key) == 0 {
222 0 : return 0
223 0 : }
224 :
225 : // Last byte is the version length + 1 when there is a version, else it is
226 : // 0.
227 1 : versionLen := int(key[len(key)-1])
228 1 : if versionLen > len(key) {
229 0 : panic(errors.AssertionFailedf("invalid version length"))
230 : }
231 1 : return len(key) - versionLen
232 : }
233 :
234 : // Compare compares cockroach keys, including the version (which could be MVCC
235 : // timestamps).
236 1 : func Compare(a, b []byte) int {
237 1 : if len(a) == 0 || len(b) == 0 {
238 0 : return cmp.Compare(len(a), len(b))
239 0 : }
240 :
241 : // NB: For performance, this routine manually splits the key into the
242 : // user-key and version components rather than using DecodeEngineKey. In
243 : // most situations, use DecodeEngineKey or GetKeyPartFromEngineKey or
244 : // SplitMVCCKey instead of doing this.
245 1 : aEnd := len(a) - 1
246 1 : bEnd := len(b) - 1
247 1 :
248 1 : // Compute the index of the separator between the key and the version. If the
249 1 : // separator is found to be at -1 for both keys, then we are comparing bare
250 1 : // suffixes without a user key part. Pebble requires bare suffixes to be
251 1 : // comparable with the same ordering as if they had a common user key.
252 1 : aSep := aEnd - int(a[aEnd])
253 1 : bSep := bEnd - int(b[bEnd])
254 1 : if aSep < 0 || bSep < 0 || a[aSep] != 0 || b[bSep] != 0 {
255 0 : panic(errors.AssertionFailedf("malformed key: %x, %x", a, b))
256 : }
257 : // Compare the "user key" part of the key.
258 1 : if c := bytes.Compare(a[:aSep], b[:bSep]); c != 0 {
259 1 : return c
260 1 : }
261 :
262 : // Compare the version part of the key. Note that when the version is a
263 : // timestamp, the timestamp encoding causes byte comparison to be equivalent
264 : // to timestamp comparison.
265 1 : a, b = a[aSep+1:], b[bSep+1:]
266 1 : if len(a) == 0 || len(b) == 0 {
267 1 : // Empty suffixes come before non-empty suffixes.
268 1 : return cmp.Compare(len(a), len(b))
269 1 : }
270 1 : return bytes.Compare(
271 1 : normalizeEngineKeyVersionForCompare(b),
272 1 : normalizeEngineKeyVersionForCompare(a),
273 1 : )
274 : }
275 :
276 : // CompareSuffixes compares suffixes (normally timestamps).
277 1 : func CompareSuffixes(a, b []byte) int {
278 1 : if len(a) == 0 || len(b) == 0 {
279 1 : // Empty suffixes come before non-empty suffixes.
280 1 : return cmp.Compare(len(a), len(b))
281 1 : }
282 : // Here we are not using normalizeEngineKeyVersionForCompare for historical
283 : // reasons, summarized in
284 : // https://github.com/cockroachdb/cockroach/issues/130533.
285 1 : return bytes.Compare(b[:len(b)-1], a[:len(a)-1])
286 : }
287 :
288 : // Equal implements base.Equal for Cockroach keys.
289 1 : func Equal(a, b []byte) bool {
290 1 : // TODO(radu): Pebble sometimes passes empty "keys" and we have to tolerate
291 1 : // them until we fix that.
292 1 : if len(a) == 0 || len(b) == 0 {
293 0 : return len(a) == len(b)
294 0 : }
295 1 : aEnd := len(a) - 1
296 1 : bEnd := len(b) - 1
297 1 :
298 1 : // Last byte is the version length + 1 when there is a version,
299 1 : // else it is 0.
300 1 : aVerLen := int(a[aEnd])
301 1 : bVerLen := int(b[bEnd])
302 1 :
303 1 : // Fast-path. If the key version is empty or contains only a walltime
304 1 : // component then normalizeEngineKeyVersionForCompare is a no-op, so we don't
305 1 : // need to split the "user key" from the version suffix before comparing to
306 1 : // compute equality. Instead, we can check for byte equality immediately.
307 1 : if (aVerLen <= withWall && bVerLen <= withWall) || (aVerLen == withLockTableLen && bVerLen == withLockTableLen) {
308 1 : return bytes.Equal(a, b)
309 1 : }
310 :
311 : // Compute the index of the separator between the key and the version. If the
312 : // separator is found to be at -1 for both keys, then we are comparing bare
313 : // suffixes without a user key part. Pebble requires bare suffixes to be
314 : // comparable with the same ordering as if they had a common user key.
315 1 : aSep := aEnd - aVerLen
316 1 : bSep := bEnd - bVerLen
317 1 : // Compare the "user key" part of the key.
318 1 : if !bytes.Equal(a[:aSep], b[:bSep]) {
319 1 : return false
320 1 : }
321 1 : if aVerLen == 0 || bVerLen == 0 {
322 1 : return aVerLen == bVerLen
323 1 : }
324 :
325 : // Compare the version part of the key.
326 1 : aVer := a[aSep+1:]
327 1 : bVer := b[bSep+1:]
328 1 : aVer = normalizeEngineKeyVersionForCompare(aVer)
329 1 : bVer = normalizeEngineKeyVersionForCompare(bVer)
330 1 : return bytes.Equal(aVer, bVer)
331 : }
332 :
333 : var zeroLogical [4]byte
334 :
335 1 : func normalizeEngineKeyVersionForCompare(a []byte) []byte {
336 1 : // Check sentinel byte.
337 1 : if len(a) != int(a[len(a)-1]) {
338 0 : panic(errors.AssertionFailedf("malformed suffix: %x", a))
339 : }
340 : // Strip off sentinel byte.
341 1 : a = a[:len(a)-1]
342 1 : // In general, the version could also be a non-timestamp version, but we know
343 1 : // that engineKeyVersionLockTableLen+mvccEncodedTimeSentinelLen is a different
344 1 : // constant than the above, so there is no danger here of stripping parts from
345 1 : // a non-timestamp version.
346 1 : if len(a) == withSynthetic-1 {
347 1 : // Strip the synthetic bit component from the timestamp version. The
348 1 : // presence of the synthetic bit does not affect key ordering or equality.
349 1 : a = a[:withLogical-1]
350 1 : }
351 1 : if len(a) == withLogical-1 {
352 1 : // If the timestamp version contains a logical timestamp component that is
353 1 : // zero, strip the component. encodeMVCCTimestampToBuf will typically omit
354 1 : // the entire logical component in these cases as an optimization, but it
355 1 : // does not guarantee to never include a zero logical component.
356 1 : // Additionally, we can fall into this case after stripping off other
357 1 : // components of the key version earlier on in this function.
358 1 : if bytes.Equal(a[withWall-1:], zeroLogical[:]) {
359 1 : a = a[:withWall-1]
360 1 : }
361 : }
362 1 : return a
363 : }
364 :
365 0 : func getKeyPartFromEngineKey(engineKey []byte) (key []byte, ok bool) {
366 0 : if len(engineKey) == 0 {
367 0 : return nil, false
368 0 : }
369 : // Last byte is the version length + 1 when there is a version,
370 : // else it is 0.
371 0 : versionLen := int(engineKey[len(engineKey)-1])
372 0 : // keyPartEnd points to the sentinel byte.
373 0 : keyPartEnd := len(engineKey) - 1 - versionLen
374 0 : if keyPartEnd < 0 || engineKey[keyPartEnd] != 0x00 {
375 0 : return nil, false
376 0 : }
377 : // Key excludes the sentinel byte.
378 0 : return engineKey[:keyPartEnd], true
379 : }
380 :
381 : // KeyConfig configures the shape of the random keys generated.
382 : type KeyConfig struct {
383 : PrefixAlphabetLen int // Number of bytes in the alphabet used for the prefix.
384 : PrefixLenShared int // Number of bytes shared by all key prefixes.
385 : PrefixLen int // Number of bytes in the prefix.
386 : AvgKeysPerPrefix int // Average number of keys (with varying suffixes) per prefix.
387 : BaseWallTime uint64 // Smallest MVCC WallTime.
388 : PercentLogical int // Percent of keys with non-zero MVCC logical time.
389 : }
390 :
391 0 : func (cfg KeyConfig) String() string {
392 0 : return fmt.Sprintf(
393 0 : "AlphaLen=%d,Prefix=%d,Shared=%d,KeysPerPrefix=%d%s",
394 0 : cfg.PrefixAlphabetLen, cfg.PrefixLen, cfg.PrefixLenShared,
395 0 : cfg.AvgKeysPerPrefix,
396 0 : crstrings.If(cfg.PercentLogical != 0, fmt.Sprintf(",Logical=%d", cfg.PercentLogical)),
397 0 : )
398 0 : }
399 :
400 : // RandomKVs constructs count random KVs with the provided parameters.
401 1 : func RandomKVs(rng *rand.Rand, count int, cfg KeyConfig, valueLen int) (keys, vals [][]byte) {
402 1 : g := makeCockroachKeyGen(rng, cfg)
403 1 : sharedPrefix := make([]byte, cfg.PrefixLenShared)
404 1 : for i := 0; i < len(sharedPrefix); i++ {
405 1 : sharedPrefix[i] = byte(rng.IntN(cfg.PrefixAlphabetLen) + 'a')
406 1 : }
407 :
408 1 : keys = make([][]byte, 0, count)
409 1 : vals = make([][]byte, 0, count)
410 1 : for len(keys) < count {
411 1 : prefix := g.randPrefix(sharedPrefix)
412 1 : // We use the exponential distribution so that we occasionally have many
413 1 : // suffixes
414 1 : n := int(rng.ExpFloat64() * float64(cfg.AvgKeysPerPrefix))
415 1 : n = max(n, 1)
416 1 : for i := 0; i < n && len(keys) < count; i++ {
417 1 : wallTime, logicalTime := g.randTimestamp()
418 1 : k := makeKey(prefix, wallTime, logicalTime)
419 1 : v := make([]byte, valueLen)
420 1 : for j := range v {
421 1 : v[j] = byte(rng.Uint32())
422 1 : }
423 1 : keys = append(keys, k)
424 1 : vals = append(vals, v)
425 : }
426 : }
427 1 : slices.SortFunc(keys, Compare)
428 1 : return keys, vals
429 : }
430 :
431 1 : func makeKey(prefix []byte, wallTime uint64, logicalTime uint32) []byte {
432 1 : k := make([]byte, 0, len(prefix)+MaxSuffixLen)
433 1 : k = append(k, prefix...)
434 1 : return EncodeTimestamp(k, wallTime, logicalTime)
435 1 : }
436 :
437 : // RandomQueryKeys returns a slice of count random query keys. Each key has a
438 : // random prefix uniformly chosen from the distinct prefixes in writtenKeys and
439 : // a random timestamp.
440 : //
441 : // Note that by setting baseWallTime to be large enough, we can simulate a query
442 : // pattern that always retrieves the latest version of any prefix.
443 : func RandomQueryKeys(
444 : rng *rand.Rand, count int, writtenKeys [][]byte, baseWallTime uint64,
445 1 : ) [][]byte {
446 1 : // Gather prefixes.
447 1 : prefixes := make([][]byte, len(writtenKeys))
448 1 : for i, k := range writtenKeys {
449 1 : prefixes[i] = k[:Split(k)-1]
450 1 : }
451 1 : slices.SortFunc(prefixes, bytes.Compare)
452 1 : prefixes = slices.CompactFunc(prefixes, bytes.Equal)
453 1 : result := make([][]byte, count)
454 1 : for i := range result {
455 1 : prefix := prefixes[rng.IntN(len(prefixes))]
456 1 : wallTime := baseWallTime + rng.Uint64N(uint64(time.Hour))
457 1 : var logicalTime uint32
458 1 : if rng.IntN(10) == 0 {
459 1 : logicalTime = rng.Uint32()
460 1 : }
461 1 : result[i] = makeKey(prefix, wallTime, logicalTime)
462 : }
463 1 : return result
464 : }
465 :
466 : type cockroachKeyGen struct {
467 : rng *rand.Rand
468 : cfg KeyConfig
469 : }
470 :
471 1 : func makeCockroachKeyGen(rng *rand.Rand, cfg KeyConfig) cockroachKeyGen {
472 1 : return cockroachKeyGen{
473 1 : rng: rng,
474 1 : cfg: cfg,
475 1 : }
476 1 : }
477 :
478 1 : func (g *cockroachKeyGen) randPrefix(blockPrefix []byte) []byte {
479 1 : prefix := make([]byte, 0, g.cfg.PrefixLen+MaxSuffixLen)
480 1 : prefix = append(prefix, blockPrefix...)
481 1 : for len(prefix) < g.cfg.PrefixLen {
482 1 : prefix = append(prefix, byte(g.rng.IntN(g.cfg.PrefixAlphabetLen)+'a'))
483 1 : }
484 1 : return prefix
485 : }
486 :
487 1 : func (g *cockroachKeyGen) randTimestamp() (wallTime uint64, logicalTime uint32) {
488 1 : wallTime = g.cfg.BaseWallTime + g.rng.Uint64N(uint64(time.Hour))
489 1 : if g.cfg.PercentLogical > 0 && g.rng.IntN(100) < g.cfg.PercentLogical {
490 1 : logicalTime = g.rng.Uint32()
491 1 : }
492 1 : return wallTime, logicalTime
493 : }
494 :
495 : const (
496 : cockroachColRoachKey int = iota
497 : cockroachColMVCCWallTime
498 : cockroachColMVCCLogical
499 : cockroachColUntypedVersion
500 : cockroachColCount
501 : )
502 :
503 : var KeySchema = colblk.KeySchema{
504 : ColumnTypes: []colblk.DataType{
505 : cockroachColRoachKey: colblk.DataTypePrefixBytes,
506 : cockroachColMVCCWallTime: colblk.DataTypeUint,
507 : cockroachColMVCCLogical: colblk.DataTypeUint,
508 : cockroachColUntypedVersion: colblk.DataTypeBytes,
509 : },
510 1 : NewKeyWriter: func() colblk.KeyWriter {
511 1 : kw := &cockroachKeyWriter{}
512 1 : kw.roachKeys.Init(16)
513 1 : kw.wallTimes.Init()
514 1 : kw.logicalTimes.InitWithDefault()
515 1 : kw.untypedVersions.Init()
516 1 : return kw
517 1 : },
518 1 : NewKeySeeker: func() colblk.KeySeeker {
519 1 : return cockroachKeySeekerPool.Get().(*cockroachKeySeeker)
520 1 : },
521 : }
522 :
523 : type cockroachKeyWriter struct {
524 : roachKeys colblk.PrefixBytesBuilder
525 : wallTimes colblk.UintBuilder
526 : logicalTimes colblk.UintBuilder
527 : untypedVersions colblk.RawBytesBuilder
528 : prevSuffix []byte
529 : }
530 :
531 1 : func (kw *cockroachKeyWriter) ComparePrev(key []byte) colblk.KeyComparison {
532 1 : var cmpv colblk.KeyComparison
533 1 : cmpv.PrefixLen = int32(Split(key)) // TODO(jackson): Inline
534 1 : if kw.roachKeys.Rows() == 0 {
535 1 : cmpv.UserKeyComparison = 1
536 1 : return cmpv
537 1 : }
538 1 : lp := kw.roachKeys.UnsafeGet(kw.roachKeys.Rows() - 1)
539 1 : cmpv.CommonPrefixLen = int32(crbytes.CommonPrefix(lp, key[:cmpv.PrefixLen-1]))
540 1 : if cmpv.CommonPrefixLen == cmpv.PrefixLen-1 {
541 1 : // Adjust CommonPrefixLen to include the sentinel byte,
542 1 : cmpv.CommonPrefixLen = cmpv.PrefixLen
543 1 : cmpv.UserKeyComparison = int32(CompareSuffixes(key[cmpv.PrefixLen:], kw.prevSuffix))
544 1 : return cmpv
545 1 : }
546 : // The keys have different MVCC prefixes. We haven't determined which is
547 : // greater, but we know the index at which they diverge. The base.Comparer
548 : // contract dictates that prefixes must be lexicographically ordered.
549 1 : if len(lp) == int(cmpv.CommonPrefixLen) {
550 0 : // cmpv.PrefixLen > cmpv.PrefixLenShared; key is greater.
551 0 : cmpv.UserKeyComparison = +1
552 1 : } else {
553 1 : // Both keys have at least 1 additional byte at which they diverge.
554 1 : // Compare the diverging byte.
555 1 : cmpv.UserKeyComparison = int32(cmp.Compare(key[cmpv.CommonPrefixLen], lp[cmpv.CommonPrefixLen]))
556 1 : }
557 1 : return cmpv
558 : }
559 :
560 : func (kw *cockroachKeyWriter) WriteKey(
561 : row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32,
562 1 : ) {
563 1 : // TODO(jackson): Avoid copying the previous suffix.
564 1 : // TODO(jackson): Use keyPrefixLen to speed up decoding.
565 1 : roachKey, untypedVersion, wallTime, logicalTime := DecodeEngineKey(key)
566 1 : kw.prevSuffix = append(kw.prevSuffix[:0], key[keyPrefixLen:]...)
567 1 : // When the roach key is the same, keyPrefixLenSharedWithPrev includes the
568 1 : // separator byte.
569 1 : kw.roachKeys.Put(roachKey, min(int(keyPrefixLenSharedWithPrev), len(roachKey)))
570 1 : kw.wallTimes.Set(row, wallTime)
571 1 : // The w.logicalTimes builder was initialized with InitWithDefault, so if we
572 1 : // don't set a value, the column value is implicitly zero. We only need to
573 1 : // Set anything for non-zero values.
574 1 : if logicalTime > 0 {
575 0 : kw.logicalTimes.Set(row, uint64(logicalTime))
576 0 : }
577 1 : kw.untypedVersions.Put(untypedVersion)
578 : }
579 :
580 1 : func (kw *cockroachKeyWriter) MaterializeKey(dst []byte, i int) []byte {
581 1 : dst = append(dst, kw.roachKeys.UnsafeGet(i)...)
582 1 : // Append separator byte.
583 1 : dst = append(dst, 0)
584 1 : if untypedVersion := kw.untypedVersions.UnsafeGet(i); len(untypedVersion) > 0 {
585 0 : dst = append(dst, untypedVersion...)
586 0 : dst = append(dst, byte(len(untypedVersion)))
587 0 : return dst
588 0 : }
589 1 : return AppendTimestamp(dst, kw.wallTimes.Get(i), uint32(kw.logicalTimes.Get(i)))
590 : }
591 :
592 0 : func (kw *cockroachKeyWriter) Reset() {
593 0 : kw.roachKeys.Reset()
594 0 : kw.wallTimes.Reset()
595 0 : kw.logicalTimes.Reset()
596 0 : kw.untypedVersions.Reset()
597 0 : }
598 :
599 0 : func (kw *cockroachKeyWriter) WriteDebug(dst io.Writer, rows int) {
600 0 : fmt.Fprint(dst, "prefixes: ")
601 0 : kw.roachKeys.WriteDebug(dst, rows)
602 0 : fmt.Fprintln(dst)
603 0 : fmt.Fprint(dst, "wall times: ")
604 0 : kw.wallTimes.WriteDebug(dst, rows)
605 0 : fmt.Fprintln(dst)
606 0 : fmt.Fprint(dst, "logical times: ")
607 0 : kw.logicalTimes.WriteDebug(dst, rows)
608 0 : fmt.Fprintln(dst)
609 0 : fmt.Fprint(dst, "untyped suffixes: ")
610 0 : kw.untypedVersions.WriteDebug(dst, rows)
611 0 : fmt.Fprintln(dst)
612 0 : }
613 :
614 1 : func (kw *cockroachKeyWriter) NumColumns() int {
615 1 : return cockroachColCount
616 1 : }
617 :
618 1 : func (kw *cockroachKeyWriter) DataType(col int) colblk.DataType {
619 1 : return KeySchema.ColumnTypes[col]
620 1 : }
621 :
622 1 : func (kw *cockroachKeyWriter) Size(rows int, offset uint32) uint32 {
623 1 : offset = kw.roachKeys.Size(rows, offset)
624 1 : offset = kw.wallTimes.Size(rows, offset)
625 1 : offset = kw.logicalTimes.Size(rows, offset)
626 1 : offset = kw.untypedVersions.Size(rows, offset)
627 1 : return offset
628 1 : }
629 :
630 : func (kw *cockroachKeyWriter) Finish(
631 : col int, rows int, offset uint32, buf []byte,
632 1 : ) (endOffset uint32) {
633 1 : switch col {
634 1 : case cockroachColRoachKey:
635 1 : return kw.roachKeys.Finish(0, rows, offset, buf)
636 1 : case cockroachColMVCCWallTime:
637 1 : return kw.wallTimes.Finish(0, rows, offset, buf)
638 1 : case cockroachColMVCCLogical:
639 1 : return kw.logicalTimes.Finish(0, rows, offset, buf)
640 1 : case cockroachColUntypedVersion:
641 1 : return kw.untypedVersions.Finish(0, rows, offset, buf)
642 0 : default:
643 0 : panic(fmt.Sprintf("unknown default key column: %d", col))
644 : }
645 : }
646 :
647 : var cockroachKeySeekerPool = sync.Pool{
648 1 : New: func() interface{} { return &cockroachKeySeeker{} },
649 : }
650 :
651 : type cockroachKeySeeker struct {
652 : roachKeys colblk.PrefixBytes
653 : roachKeyChanged colblk.Bitmap
654 : mvccWallTimes colblk.UnsafeUints
655 : mvccLogical colblk.UnsafeUints
656 : untypedVersions colblk.RawBytes
657 : }
658 :
659 : var _ colblk.KeySeeker = (*cockroachKeySeeker)(nil)
660 :
661 : // Init is part of the KeySeeker interface.
662 1 : func (ks *cockroachKeySeeker) Init(d *colblk.DataBlockDecoder) error {
663 1 : bd := d.BlockDecoder()
664 1 : ks.roachKeys = bd.PrefixBytes(cockroachColRoachKey)
665 1 : ks.roachKeyChanged = d.PrefixChanged()
666 1 : ks.mvccWallTimes = bd.Uints(cockroachColMVCCWallTime)
667 1 : ks.mvccLogical = bd.Uints(cockroachColMVCCLogical)
668 1 : ks.untypedVersions = bd.RawBytes(cockroachColUntypedVersion)
669 1 : return nil
670 1 : }
671 :
672 : // CompareFirstUserKey compares the provided key to the first user key
673 : // contained within the data block. It's equivalent to performing
674 : //
675 : // Compare(firstUserKey, k)
676 0 : func (ks *cockroachKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) bool {
677 0 : roachKey, untypedVersion, wallTime, logicalTime := DecodeEngineKey(k)
678 0 : if v := Compare(ks.roachKeys.UnsafeFirstSlice(), roachKey); v != 0 {
679 0 : return v > 0
680 0 : }
681 0 : if len(syntheticSuffix) > 0 {
682 0 : return Compare(syntheticSuffix, k[len(roachKey)+1:]) >= 0
683 0 : }
684 0 : if len(untypedVersion) > 0 {
685 0 : if invariants.Enabled && ks.mvccWallTimes.At(0) != 0 {
686 0 : panic("comparing timestamp with untyped suffix")
687 : }
688 0 : return Compare(ks.untypedVersions.At(0), untypedVersion) >= 0
689 : }
690 0 : if v := cmp.Compare(ks.mvccWallTimes.At(0), wallTime); v != 0 {
691 0 : return v > 0
692 0 : }
693 0 : return cmp.Compare(uint32(ks.mvccLogical.At(0)), logicalTime) >= 0
694 : }
695 :
696 : // SeekGE is part of the KeySeeker interface.
697 : func (ks *cockroachKeySeeker) SeekGE(
698 : key []byte, boundRow int, searchDir int8,
699 1 : ) (row int, equalPrefix bool) {
700 1 : // TODO(jackson): Inline Split.
701 1 : si := Split(key)
702 1 : row, eq := ks.roachKeys.Search(key[:si-1])
703 1 : if eq {
704 1 : return ks.seekGEOnSuffix(row, key[si:]), true
705 1 : }
706 0 : return row, false
707 : }
708 :
709 : // seekGEOnSuffix is a helper function for SeekGE when a seek key's prefix
710 : // exactly matches a row. seekGEOnSuffix finds the first row at index or later
711 : // with the same prefix as index and a suffix greater than or equal to [suffix],
712 : // or if no such row exists, the next row with a different prefix.
713 1 : func (ks *cockroachKeySeeker) seekGEOnSuffix(index int, seekSuffix []byte) (row int) {
714 1 : // The search key's prefix exactly matches the prefix of the row at index.
715 1 : const withWall = 9
716 1 : const withLogical = withWall + 4
717 1 : const withSynthetic = withLogical + 1
718 1 : var seekWallTime uint64
719 1 : var seekLogicalTime uint32
720 1 : switch len(seekSuffix) {
721 0 : case 0:
722 0 : // The search key has no suffix, so it's the smallest possible key with
723 0 : // its prefix. Return the row. This is a common case where the user is
724 0 : // seeking to the most-recent row and just wants the smallest key with
725 0 : // the prefix.
726 0 : return index
727 0 : case withLogical, withSynthetic:
728 0 : seekWallTime = binary.BigEndian.Uint64(seekSuffix)
729 0 : seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:])
730 1 : case withWall:
731 1 : seekWallTime = binary.BigEndian.Uint64(seekSuffix)
732 0 : default:
733 0 : // The suffix is untyped. Compare the untyped suffixes.
734 0 : // Binary search between [index, prefixChanged.SeekSetBitGE(index+1)].
735 0 : //
736 0 : // Define f(i) = true iff key at i is >= seek key.
737 0 : // Invariant: f(l-1) == false, f(u) == true.
738 0 : l := index
739 0 : u := ks.roachKeyChanged.SeekSetBitGE(index + 1)
740 0 : for l < u {
741 0 : h := int(uint(l+u) >> 1) // avoid overflow when computing h
742 0 : // l ≤ h < u
743 0 : if bytes.Compare(ks.untypedVersions.At(h), seekSuffix) >= 0 {
744 0 : u = h // preserves f(u) == true
745 0 : } else {
746 0 : l = h + 1 // preserves f(l-1) == false
747 0 : }
748 : }
749 0 : return l
750 : }
751 : // Seeking among MVCC versions using a MVCC timestamp.
752 :
753 : // TODO(jackson): What if the row has an untyped suffix?
754 :
755 : // First check the suffix at index, because querying for the latest value is
756 : // the most common case.
757 1 : if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime ||
758 1 : (latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) {
759 1 : return index
760 1 : }
761 :
762 : // Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)].
763 : //
764 : // Define f(i) = true iff key at i is >= seek key.
765 : // Invariant: f(l-1) == false, f(u) == true.
766 1 : l := index + 1
767 1 : u := ks.roachKeyChanged.SeekSetBitGE(index + 1)
768 1 : for l < u {
769 1 : h := int(uint(l+u) >> 1) // avoid overflow when computing h
770 1 : // l ≤ h < u
771 1 : hWallTime := ks.mvccWallTimes.At(h)
772 1 : if hWallTime < seekWallTime ||
773 1 : (hWallTime == seekWallTime && uint32(ks.mvccLogical.At(h)) <= seekLogicalTime) {
774 1 : u = h // preserves f(u) = true
775 1 : } else {
776 1 : l = h + 1 // preserves f(l-1) = false
777 1 : }
778 : }
779 1 : return l
780 : }
781 :
782 : // MaterializeUserKey is part of the KeySeeker interface.
783 : func (ks *cockroachKeySeeker) MaterializeUserKey(
784 : ki *colblk.PrefixBytesIter, prevRow, row int,
785 1 : ) []byte {
786 1 : if prevRow+1 == row && prevRow >= 0 {
787 1 : ks.roachKeys.SetNext(ki)
788 1 : } else {
789 1 : ks.roachKeys.SetAt(ki, row)
790 1 : }
791 :
792 1 : roachKeyLen := len(ki.Buf)
793 1 : ptr := unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(ki.Buf))) + uintptr(roachKeyLen))
794 1 : mvccWall := ks.mvccWallTimes.At(row)
795 1 : mvccLogical := uint32(ks.mvccLogical.At(row))
796 1 : if mvccWall == 0 && mvccLogical == 0 {
797 0 : // This is not an MVCC key. Use the untyped suffix.
798 0 : untypedVersion := ks.untypedVersions.At(row)
799 0 : if len(untypedVersion) == 0 {
800 0 : res := ki.Buf[:roachKeyLen+1]
801 0 : res[roachKeyLen] = 0
802 0 : return res
803 0 : }
804 : // Slice first, to check that the capacity is sufficient.
805 0 : res := ki.Buf[:roachKeyLen+1+len(untypedVersion)]
806 0 : *(*byte)(ptr) = 0
807 0 : memmove(
808 0 : unsafe.Pointer(uintptr(ptr)+1),
809 0 : unsafe.Pointer(unsafe.SliceData(untypedVersion)),
810 0 : uintptr(len(untypedVersion)),
811 0 : )
812 0 : return res
813 : }
814 :
815 : // Inline binary.BigEndian.PutUint64. Note that this code is converted into
816 : // word-size instructions by the compiler.
817 1 : *(*byte)(ptr) = 0
818 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 1)) = byte(mvccWall >> 56)
819 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 2)) = byte(mvccWall >> 48)
820 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 3)) = byte(mvccWall >> 40)
821 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 4)) = byte(mvccWall >> 32)
822 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 5)) = byte(mvccWall >> 24)
823 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 6)) = byte(mvccWall >> 16)
824 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 7)) = byte(mvccWall >> 8)
825 1 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 8)) = byte(mvccWall)
826 1 :
827 1 : ptr = unsafe.Pointer(uintptr(ptr) + 9)
828 1 : // This is an MVCC key.
829 1 : if mvccLogical == 0 {
830 1 : *(*byte)(ptr) = 9
831 1 : return ki.Buf[:len(ki.Buf)+10]
832 1 : }
833 :
834 : // Inline binary.BigEndian.PutUint32.
835 0 : *(*byte)(ptr) = byte(mvccWall >> 24)
836 0 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 1)) = byte(mvccWall >> 16)
837 0 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 2)) = byte(mvccWall >> 8)
838 0 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 3)) = byte(mvccWall)
839 0 : *(*byte)(unsafe.Pointer(uintptr(ptr) + 4)) = 13
840 0 : return ki.Buf[:len(ki.Buf)+14]
841 : }
842 :
843 : // MaterializeUserKeyWithSyntheticSuffix is part of the KeySeeker interface.
844 : func (ks *cockroachKeySeeker) MaterializeUserKeyWithSyntheticSuffix(
845 : ki *colblk.PrefixBytesIter, suffix []byte, prevRow, row int,
846 0 : ) []byte {
847 0 : if prevRow+1 == row && prevRow >= 0 {
848 0 : ks.roachKeys.SetNext(ki)
849 0 : } else {
850 0 : ks.roachKeys.SetAt(ki, row)
851 0 : }
852 :
853 : // Slice first, to check that the capacity is sufficient.
854 0 : res := ki.Buf[:len(ki.Buf)+1+len(suffix)]
855 0 : ptr := unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(ki.Buf))) + uintptr(len(ki.Buf)))
856 0 : *(*byte)(ptr) = 0
857 0 : memmove(unsafe.Pointer(uintptr(ptr)+1), unsafe.Pointer(unsafe.SliceData(suffix)), uintptr(len(suffix)))
858 0 : return res
859 : }
860 :
861 : // Release is part of the KeySeeker interface.
862 0 : func (ks *cockroachKeySeeker) Release() {
863 0 : *ks = cockroachKeySeeker{}
864 0 : cockroachKeySeekerPool.Put(ks)
865 0 : }
866 :
867 : //go:linkname memmove runtime.memmove
868 : func memmove(to, from unsafe.Pointer, n uintptr)
|