Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : // Package crdbtest provides facilities for representing keys, workloads, etc
6 : // representative of CockroachDB's use of Pebble.
7 : package crdbtest
8 :
9 : import (
10 : "bytes"
11 : "cmp"
12 : "encoding/binary"
13 : "fmt"
14 : "slices"
15 : "time"
16 :
17 : "github.com/cockroachdb/errors"
18 : "github.com/cockroachdb/pebble/internal/base"
19 : "golang.org/x/exp/rand"
20 : )
21 :
22 : const withWall = 9
23 : const withLogical = withWall + 4
24 : const withSynthetic = withLogical + 1
25 : const withLockTableLen = 18
26 :
27 : // MaxSuffixLen is the maximum length of the CockroachDB key suffix.
28 : const MaxSuffixLen = max(withLockTableLen, withSynthetic, withLogical, withWall)
29 :
30 : // Comparer is a base.Comparer for CockroachDB keys.
31 : var Comparer = base.Comparer{
32 : Split: Split,
33 : CompareSuffixes: CompareSuffixes,
34 : Compare: Compare,
35 : Equal: Equal,
36 0 : AbbreviatedKey: func(k []byte) uint64 {
37 0 : key, ok := getKeyPartFromEngineKey(k)
38 0 : if !ok {
39 0 : return 0
40 0 : }
41 0 : return base.DefaultComparer.AbbreviatedKey(key)
42 : },
43 : FormatKey: base.DefaultFormatter,
44 0 : Separator: func(dst, a, b []byte) []byte {
45 0 : aKey, ok := getKeyPartFromEngineKey(a)
46 0 : if !ok {
47 0 : return append(dst, a...)
48 0 : }
49 0 : bKey, ok := getKeyPartFromEngineKey(b)
50 0 : if !ok {
51 0 : return append(dst, a...)
52 0 : }
53 : // If the keys are the same just return a.
54 0 : if bytes.Equal(aKey, bKey) {
55 0 : return append(dst, a...)
56 0 : }
57 0 : n := len(dst)
58 0 : dst = base.DefaultComparer.Separator(dst, aKey, bKey)
59 0 : // Did it pick a separator different than aKey -- if it did not we can't do better than a.
60 0 : buf := dst[n:]
61 0 : if bytes.Equal(aKey, buf) {
62 0 : return append(dst[:n], a...)
63 0 : }
64 : // The separator is > aKey, so we only need to add the sentinel.
65 0 : return append(dst, 0)
66 : },
67 0 : Successor: func(dst, a []byte) []byte {
68 0 : aKey, ok := getKeyPartFromEngineKey(a)
69 0 : if !ok {
70 0 : return append(dst, a...)
71 0 : }
72 0 : n := len(dst)
73 0 : // Engine key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as
74 0 : // pebble.DefaultComparer, so reuse the latter's SeekSetBitGE implementation.
75 0 : dst = base.DefaultComparer.Successor(dst, aKey)
76 0 : // Did it pick a successor different than aKey -- if it did not we can't do better than a.
77 0 : buf := dst[n:]
78 0 : if bytes.Equal(aKey, buf) {
79 0 : return append(dst[:n], a...)
80 0 : }
81 : // The successor is > aKey, so we only need to add the sentinel.
82 0 : return append(dst, 0)
83 : },
84 0 : ImmediateSuccessor: func(dst, a []byte) []byte {
85 0 : // The key `a` is guaranteed to be a bare prefix: It's a
86 0 : // `engineKeyNoVersion` key without a version—just a trailing 0-byte to
87 0 : // signify the length of the version. For example the user key "foo" is
88 0 : // encoded as: "foo\0". We need to encode the immediate successor to
89 0 : // "foo", which in the natural byte ordering is "foo\0". Append a
90 0 : // single additional zero, to encode the user key "foo\0" with a
91 0 : // zero-length version.
92 0 : return append(append(dst, a...), 0)
93 0 : },
94 : Name: "cockroach_comparator",
95 : }
96 :
97 : // EncodeMVCCKey encodes a MVCC key into dst, growing dst as necessary.
98 1 : func EncodeMVCCKey(dst []byte, key []byte, walltime uint64, logical uint32) []byte {
99 1 : if cap(dst) < len(key)+withSynthetic {
100 1 : dst = make([]byte, 0, len(key)+withSynthetic)
101 1 : }
102 1 : dst = append(dst[:0], key...)
103 1 : return EncodeTimestamp(dst, walltime, logical)
104 : }
105 :
106 : // AppendTimestamp appends an encoded MVCC timestamp onto key, returning the new
107 : // key. The provided key should already have the 0x00 sentinel byte (i.e., key
108 : // should be a proper prefix from the perspective of Pebble).
109 1 : func AppendTimestamp(key []byte, walltime uint64, logical uint32) []byte {
110 1 : if key[len(key)-1] != 0 {
111 0 : panic(errors.AssertionFailedf("key does not end with 0x00 sentinel byte: %x", key))
112 : }
113 1 : if logical == 0 {
114 1 : if walltime == 0 {
115 0 : return key
116 0 : }
117 1 : key = append(key, make([]byte, 9)...)
118 1 : binary.BigEndian.PutUint64(key[len(key)-9:], walltime)
119 1 : key[len(key)-1] = 9 // Version length byte
120 1 : return key
121 : }
122 0 : key = append(key, make([]byte, 13)...)
123 0 : binary.BigEndian.PutUint64(key[len(key)-13:], walltime)
124 0 : binary.BigEndian.PutUint32(key[len(key)-5:], logical)
125 0 : key[len(key)-1] = 13 // Version length byte
126 0 : return key
127 :
128 : }
129 :
130 : // EncodeTimestamp encodes a MVCC timestamp into a key, returning the new key.
131 : // The key's capacity must be sufficiently large to hold the encoded timestamp.
132 1 : func EncodeTimestamp(key []byte, walltime uint64, logical uint32) []byte {
133 1 : pos := len(key)
134 1 : if logical == 0 {
135 1 : if walltime == 0 {
136 1 : key = key[:pos+1]
137 1 : key[pos] = 0 // sentinel byte
138 1 : return key
139 1 : }
140 :
141 1 : key = key[:pos+1+8+1]
142 1 : key[pos] = 0 // sentinel byte
143 1 : key[pos+1+8] = 9
144 1 : binary.BigEndian.PutUint64(key[pos+1:], walltime)
145 1 : return key
146 : }
147 :
148 1 : key = key[:pos+1+12+1]
149 1 : key[pos] = 0 // sentinel byte
150 1 : key[pos+1+8+4] = 13
151 1 : binary.BigEndian.PutUint64(key[pos+1:], walltime)
152 1 : binary.BigEndian.PutUint32(key[pos+1+8:], logical)
153 1 : return key
154 : }
155 :
156 : // DecodeTimestamp decodes a MVCC timestamp from a serialized MVCC key.
157 1 : func DecodeTimestamp(mvccKey []byte) ([]byte, []byte, uint64, uint32) {
158 1 : tsLen := int(mvccKey[len(mvccKey)-1])
159 1 : keyPartEnd := len(mvccKey) - tsLen
160 1 : if keyPartEnd < 0 {
161 0 : return nil, nil, 0, 0
162 0 : }
163 :
164 1 : key := mvccKey[:keyPartEnd]
165 1 : if tsLen > 0 {
166 1 : ts := mvccKey[keyPartEnd : len(mvccKey)-1]
167 1 : switch len(ts) {
168 1 : case 8:
169 1 : return key, nil, binary.BigEndian.Uint64(ts[:8]), 0
170 0 : case 12, 13:
171 0 : return key, nil, binary.BigEndian.Uint64(ts[:8]), binary.BigEndian.Uint32(ts[8:12])
172 0 : default:
173 0 : return key, ts, 0, 0
174 : }
175 : }
176 0 : return key, nil, 0, 0
177 : }
178 :
179 : // Split implements base.Split for CockroachDB keys.
180 1 : func Split(key []byte) int {
181 1 : if len(key) == 0 {
182 0 : return 0
183 0 : }
184 :
185 : // Last byte is the version length + 1 when there is a version, else it is
186 : // 0.
187 1 : versionLen := int(key[len(key)-1])
188 1 : if versionLen > len(key) {
189 0 : panic(errors.AssertionFailedf("invalid version length"))
190 : }
191 1 : return len(key) - versionLen
192 : }
193 :
194 : // Compare compares cockroach keys, including the version (which could be MVCC
195 : // timestamps).
196 1 : func Compare(a, b []byte) int {
197 1 : if len(a) == 0 || len(b) == 0 {
198 0 : return cmp.Compare(len(a), len(b))
199 0 : }
200 :
201 : // NB: For performance, this routine manually splits the key into the
202 : // user-key and version components rather than using DecodeEngineKey. In
203 : // most situations, use DecodeEngineKey or GetKeyPartFromEngineKey or
204 : // SplitMVCCKey instead of doing this.
205 1 : aEnd := len(a) - 1
206 1 : bEnd := len(b) - 1
207 1 :
208 1 : // Compute the index of the separator between the key and the version. If the
209 1 : // separator is found to be at -1 for both keys, then we are comparing bare
210 1 : // suffixes without a user key part. Pebble requires bare suffixes to be
211 1 : // comparable with the same ordering as if they had a common user key.
212 1 : aSep := aEnd - int(a[aEnd])
213 1 : bSep := bEnd - int(b[bEnd])
214 1 : if aSep < 0 || bSep < 0 || a[aSep] != 0 || b[bSep] != 0 {
215 0 : panic(errors.AssertionFailedf("malformed key: %x, %x", a, b))
216 : }
217 : // Compare the "user key" part of the key.
218 1 : if c := bytes.Compare(a[:aSep], b[:bSep]); c != 0 {
219 1 : return c
220 1 : }
221 :
222 : // Compare the version part of the key. Note that when the version is a
223 : // timestamp, the timestamp encoding causes byte comparison to be equivalent
224 : // to timestamp comparison.
225 1 : a, b = a[aSep+1:], b[bSep+1:]
226 1 : if len(a) == 0 || len(b) == 0 {
227 1 : // Empty suffixes come before non-empty suffixes.
228 1 : return cmp.Compare(len(a), len(b))
229 1 : }
230 1 : return bytes.Compare(
231 1 : normalizeEngineKeyVersionForCompare(b),
232 1 : normalizeEngineKeyVersionForCompare(a),
233 1 : )
234 : }
235 :
236 : // CompareSuffixes compares suffixes (normally timestamps).
237 1 : func CompareSuffixes(a, b []byte) int {
238 1 : if len(a) == 0 || len(b) == 0 {
239 1 : // Empty suffixes come before non-empty suffixes.
240 1 : return cmp.Compare(len(a), len(b))
241 1 : }
242 : // Here we are not using normalizeEngineKeyVersionForCompare for historical
243 : // reasons, summarized in
244 : // https://github.com/cockroachdb/cockroach/issues/130533.
245 1 : return bytes.Compare(b[:len(b)-1], a[:len(a)-1])
246 : }
247 :
248 : // Equal implements base.Equal for Cockroach keys.
249 1 : func Equal(a, b []byte) bool {
250 1 : // TODO(radu): Pebble sometimes passes empty "keys" and we have to tolerate
251 1 : // them until we fix that.
252 1 : if len(a) == 0 || len(b) == 0 {
253 0 : return len(a) == len(b)
254 0 : }
255 1 : aEnd := len(a) - 1
256 1 : bEnd := len(b) - 1
257 1 :
258 1 : // Last byte is the version length + 1 when there is a version,
259 1 : // else it is 0.
260 1 : aVerLen := int(a[aEnd])
261 1 : bVerLen := int(b[bEnd])
262 1 :
263 1 : // Fast-path. If the key version is empty or contains only a walltime
264 1 : // component then normalizeEngineKeyVersionForCompare is a no-op, so we don't
265 1 : // need to split the "user key" from the version suffix before comparing to
266 1 : // compute equality. Instead, we can check for byte equality immediately.
267 1 : if (aVerLen <= withWall && bVerLen <= withWall) || (aVerLen == withLockTableLen && bVerLen == withLockTableLen) {
268 1 : return bytes.Equal(a, b)
269 1 : }
270 :
271 : // Compute the index of the separator between the key and the version. If the
272 : // separator is found to be at -1 for both keys, then we are comparing bare
273 : // suffixes without a user key part. Pebble requires bare suffixes to be
274 : // comparable with the same ordering as if they had a common user key.
275 1 : aSep := aEnd - aVerLen
276 1 : bSep := bEnd - bVerLen
277 1 : // Compare the "user key" part of the key.
278 1 : if !bytes.Equal(a[:aSep], b[:bSep]) {
279 1 : return false
280 1 : }
281 1 : if aVerLen == 0 || bVerLen == 0 {
282 1 : return aVerLen == bVerLen
283 1 : }
284 :
285 : // Compare the version part of the key.
286 1 : aVer := a[aSep+1:]
287 1 : bVer := b[bSep+1:]
288 1 : aVer = normalizeEngineKeyVersionForCompare(aVer)
289 1 : bVer = normalizeEngineKeyVersionForCompare(bVer)
290 1 : return bytes.Equal(aVer, bVer)
291 : }
292 :
293 : var zeroLogical [4]byte
294 :
295 1 : func normalizeEngineKeyVersionForCompare(a []byte) []byte {
296 1 : // Check sentinel byte.
297 1 : if len(a) != int(a[len(a)-1]) {
298 0 : panic(errors.AssertionFailedf("malformed suffix: %x", a))
299 : }
300 : // Strip off sentinel byte.
301 1 : a = a[:len(a)-1]
302 1 : // In general, the version could also be a non-timestamp version, but we know
303 1 : // that engineKeyVersionLockTableLen+mvccEncodedTimeSentinelLen is a different
304 1 : // constant than the above, so there is no danger here of stripping parts from
305 1 : // a non-timestamp version.
306 1 : if len(a) == withSynthetic-1 {
307 1 : // Strip the synthetic bit component from the timestamp version. The
308 1 : // presence of the synthetic bit does not affect key ordering or equality.
309 1 : a = a[:withLogical-1]
310 1 : }
311 1 : if len(a) == withLogical-1 {
312 1 : // If the timestamp version contains a logical timestamp component that is
313 1 : // zero, strip the component. encodeMVCCTimestampToBuf will typically omit
314 1 : // the entire logical component in these cases as an optimization, but it
315 1 : // does not guarantee to never include a zero logical component.
316 1 : // Additionally, we can fall into this case after stripping off other
317 1 : // components of the key version earlier on in this function.
318 1 : if bytes.Equal(a[withWall-1:], zeroLogical[:]) {
319 1 : a = a[:withWall-1]
320 1 : }
321 : }
322 1 : return a
323 : }
324 :
325 0 : func getKeyPartFromEngineKey(engineKey []byte) (key []byte, ok bool) {
326 0 : if len(engineKey) == 0 {
327 0 : return nil, false
328 0 : }
329 : // Last byte is the version length + 1 when there is a version,
330 : // else it is 0.
331 0 : versionLen := int(engineKey[len(engineKey)-1])
332 0 : // keyPartEnd points to the sentinel byte.
333 0 : keyPartEnd := len(engineKey) - 1 - versionLen
334 0 : if keyPartEnd < 0 || engineKey[keyPartEnd] != 0x00 {
335 0 : return nil, false
336 0 : }
337 : // Key excludes the sentinel byte.
338 0 : return engineKey[:keyPartEnd], true
339 : }
340 :
341 : // KeyConfig configures the shape of the random keys generated.
342 : type KeyConfig struct {
343 : PrefixAlphabetLen int // Number of bytes in the alphabet used for the prefix.
344 : PrefixLenShared int // Number of bytes shared by all key prefixes.
345 : PrefixLen int // Number of bytes in the prefix.
346 : BaseWallTime uint64 // Smallest MVCC WallTime.
347 : Logical uint32 // MVCC logical time for all keys.
348 : }
349 :
350 0 : func (cfg KeyConfig) String() string {
351 0 : return fmt.Sprintf("AlphaLen=%d,Shared=%d,PrefixLen=%d,Logical=%d",
352 0 : cfg.PrefixAlphabetLen, cfg.PrefixLenShared, cfg.PrefixLen, cfg.Logical)
353 0 : }
354 :
355 : // RandomKVs constructs count random KVs with the provided parameters.
356 1 : func RandomKVs(rng *rand.Rand, count int, cfg KeyConfig, valueLen int) (keys, vals [][]byte) {
357 1 : sharedPrefix := make([]byte, cfg.PrefixLenShared)
358 1 : for i := 0; i < len(sharedPrefix); i++ {
359 0 : sharedPrefix[i] = byte(rng.Intn(cfg.PrefixAlphabetLen) + 'a')
360 0 : }
361 :
362 1 : keys = make([][]byte, count)
363 1 : vals = make([][]byte, count)
364 1 : for i := range keys {
365 1 : keys[i] = randCockroachKey(rng, cfg, sharedPrefix)
366 1 : vals[i] = make([]byte, valueLen)
367 1 : rng.Read(vals[i])
368 1 : }
369 1 : slices.SortFunc(keys, Compare)
370 1 : return keys, vals
371 : }
372 :
373 1 : func randCockroachKey(rng *rand.Rand, cfg KeyConfig, blockPrefix []byte) []byte {
374 1 : key := make([]byte, 0, cfg.PrefixLen+MaxSuffixLen)
375 1 : key = append(key, blockPrefix...)
376 1 : wallTime := cfg.BaseWallTime + rng.Uint64n(uint64(time.Hour))
377 1 : for len(key) < cfg.PrefixLen {
378 1 : key = append(key, byte(rng.Intn(cfg.PrefixAlphabetLen)+'a'))
379 1 : }
380 1 : return EncodeTimestamp(key, wallTime, cfg.Logical)
381 : }
|