Line data Source code
1 : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package colblk
6 :
7 : import (
8 : "bytes"
9 : "cmp"
10 : "context"
11 : "encoding/binary"
12 : "fmt"
13 : "io"
14 : "math"
15 : "unsafe"
16 :
17 : "github.com/cockroachdb/crlib/crbytes"
18 : "github.com/cockroachdb/errors"
19 : "github.com/cockroachdb/pebble/internal/base"
20 : "github.com/cockroachdb/pebble/internal/binfmt"
21 : "github.com/cockroachdb/pebble/internal/bytealloc"
22 : "github.com/cockroachdb/pebble/internal/invariants"
23 : "github.com/cockroachdb/pebble/internal/treeprinter"
24 : "github.com/cockroachdb/pebble/sstable/block"
25 : )
26 :
27 : // KeySchema defines the schema of a user key, as defined by the user's
28 : // application.
29 : //
30 : // TODO(jackson): Consider making this KVSchema. It feels like there's an
31 : // opportunity to generalize the ShortAttribute so that when a value is stored
32 : // out-of-band, the DataBlockEncoder calls user-provided code to store the short
33 : // attributes inlined within the data block. For inlined-values, the
34 : // user-defined value columns would be implicitly null.
35 : type KeySchema struct {
36 : Name string
37 : // KeySchema implementations can optionally make use a fixed-sized custom
38 : // header inside each block.
39 : HeaderSize uint32
40 : ColumnTypes []DataType
41 : NewKeyWriter func() KeyWriter
42 :
43 : // InitKeySeekerMetadata initializes the provided KeySeekerMetadata. This
44 : // happens once when a block enters the block cache and can be used to save
45 : // computation in NewKeySeeker.
46 : InitKeySeekerMetadata func(meta *KeySeekerMetadata, d *DataBlockDecoder)
47 :
48 : // KeySeeker returns a KeySeeker using metadata that was previously
49 : // initialized with InitKeySeekerMetadata. The returned key seeker can be an
50 : // unsafe cast of the metadata itself.
51 : KeySeeker func(meta *KeySeekerMetadata) KeySeeker
52 : }
53 :
54 : // KeySeekerMetadata is an in-memory buffer that stores metadata for a block. It
55 : // is allocated together with the buffer storing the block and is initialized
56 : // once when the block is read from disk. It is always 8-byte aligned.
57 : //
58 : // Portions of this buffer can be cast to the structures we need (through
59 : // unsafe.Pointer), but note that any pointers in these structures will be
60 : // invisible to the GC. Pointers to the block's data buffer are ok, since the
61 : // metadata and the data have the same lifetime (sharing the underlying
62 : // allocation).
63 : //
64 : // KeySeekerMetadata is stored inside block.Metadata.
65 : type KeySeekerMetadata [KeySeekerMetadataSize]byte
66 :
67 : // KeySeekerMetadataSize is chosen to fit the CockroachDB key seeker
68 : // implementation.
69 : const KeySeekerMetadataSize = 176
70 :
71 : // A KeyWriter maintains ColumnWriters for a data block for writing user keys
72 : // into the database-specific key schema. Users may define their own key schema
73 : // and implement KeyWriter to encode keys into custom columns that are aware of
74 : // the structure of user keys.
75 : type KeyWriter interface {
76 : ColumnWriter
77 : // ComparePrev compares the provided user to the previously-written user
78 : // key. The returned KeyComparison's UserKeyComparison field is equivalent
79 : // to Compare(key, prevKey) where prevKey is the last key passed to
80 : // WriteKey.
81 : //
82 : // If no key has been written yet, ComparePrev returns a KeyComparison with
83 : // PrefixLen set and UserKeyComparison=1.
84 : ComparePrev(key []byte) KeyComparison
85 : // WriteKey writes a user key into the KeyWriter's columns. The
86 : // keyPrefixLenSharedWithPrev parameter takes the number of bytes prefixing
87 : // the key's logical prefix (as defined by (base.Comparer).Split) that the
88 : // previously-written key's prefix shares.
89 : //
90 : // WriteKey is guaranteed to be called sequentially with increasing row
91 : // indexes, beginning at zero.
92 : WriteKey(row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32)
93 : // MaterializeKey appends the zero-indexed row'th key written to dst,
94 : // returning the result.
95 : MaterializeKey(dst []byte, row int) []byte
96 : // FinishHeader serializes an internal header of exactly KeySchema.HeaderSize bytes.
97 : FinishHeader(dst []byte)
98 : }
99 :
100 : // AssertKeyCompare compares two keys using the provided comparer, ensuring the
101 : // provided KeyComparison accurately describing the result. Panics if the
102 : // assertion does not hold.
103 2 : func AssertKeyCompare(comparer *base.Comparer, a, b []byte, kcmp KeyComparison) {
104 2 : bi := comparer.Split(b)
105 2 : var recomputed KeyComparison
106 2 : recomputed.PrefixLen = int32(comparer.Split(a))
107 2 : recomputed.CommonPrefixLen = int32(crbytes.CommonPrefix(a[:recomputed.PrefixLen], b[:bi]))
108 2 : recomputed.UserKeyComparison = int32(comparer.Compare(a, b))
109 2 : if recomputed.PrefixEqual() != bytes.Equal(a[:recomputed.PrefixLen], b[:bi]) {
110 0 : panic(errors.AssertionFailedf("PrefixEqual()=%t doesn't hold: %q, %q", kcmp.PrefixEqual(), a, b))
111 : }
112 2 : if recomputed != kcmp {
113 0 : panic(errors.AssertionFailedf("KeyComparison of (%q, %q) = %s, ComparePrev gave %s",
114 0 : a, b, recomputed, kcmp))
115 : }
116 : }
117 :
118 : // KeyComparison holds information about a key and its comparison to another a
119 : // key.
120 : type KeyComparison struct {
121 : // PrefixLen is the length of the prefix of the key. It's the outcome of
122 : // calling base.Split on the key.
123 : PrefixLen int32
124 : // CommonPrefixLen is the length of the physical (byte-wise) prefix of the
125 : // logical prefix that is shared with the other key. For example, for
126 : // "apple@1" and "applied@3" the value is 4 (the length of "appl"). For
127 : // "apple@1" and "apple@10" the value is 5 (the length of "apple"), because
128 : // the shared bytes within the suffix are not included.
129 : CommonPrefixLen int32
130 : // UserKeyComparison is the comparison of the user keys of the two keys.
131 : // Should be equivalent to
132 : //
133 : // Compare(key, otherKey)
134 : UserKeyComparison int32
135 : }
136 :
137 : // String returns a string representation of the KeyComparison.
138 0 : func (kcmp KeyComparison) String() string {
139 0 : return fmt.Sprintf("(prefix={%d,common=%d} cmp=%d)",
140 0 : kcmp.PrefixLen, kcmp.CommonPrefixLen, kcmp.UserKeyComparison)
141 0 : }
142 :
143 : // PrefixEqual returns true if the key comparison determined that the keys have
144 : // equal prefixes.
145 2 : func (kcmp KeyComparison) PrefixEqual() bool { return kcmp.PrefixLen == kcmp.CommonPrefixLen }
146 :
147 : // KeySeeker iterates over the keys in a columnar data block.
148 : //
149 : // Users of Pebble who define their own key schema must implement KeySeeker to
150 : // seek over their decomposed keys.
151 : //
152 : // KeySeeker implementations must be safe for concurrent use by multiple
153 : // goroutines. In practice, multiple DataBlockIterators may use the same
154 : // KeySeeker.
155 : type KeySeeker interface {
156 : // IsLowerBound returns true if all keys in the data block (after suffix
157 : // replacement if syntheticSuffix is not empty) are >= the given key. If the
158 : // data block contains no keys, returns true.
159 : IsLowerBound(k []byte, syntheticSuffix []byte) bool
160 : // SeekGE returns the index of the first row with a key greater than or equal
161 : // to [key], and whether that row has the same prefix as [key].
162 : //
163 : // If the caller externally knows a bound on where the key is located, it
164 : // may indicate it through [boundRow] and [searchDir]. A [searchDir] value
165 : // of -1 indicates that the sought row must be at an index ≤ [boundRow]. A
166 : // [searchDir] value of +1 indicates that the sought row must be at an index
167 : // ≥ [boundRow]. Implementations may use this information to constrain the
168 : // search. See (base.SeekGEFlags).TrySeekUsingNext for context on when this
169 : // may be set in practice.
170 : SeekGE(key []byte, boundRow int, searchDir int8) (row int, equalPrefix bool)
171 : // MaterializeUserKey materializes the user key of the specified row,
172 : // returning a slice of the materialized user key.
173 : //
174 : // The provided keyIter must have a buffer large enough to hold the key.
175 : //
176 : // The prevRow parameter is the row MaterializeUserKey was last invoked with
177 : // (or a negative number if not applicable). Implementations may take
178 : // advantage of that knowledge to reduce work.
179 : MaterializeUserKey(keyIter *PrefixBytesIter, prevRow, row int) []byte
180 : // MaterializeUserKeyWithSyntheticSuffix is a variant of MaterializeUserKey
181 : // where the suffix is replaced.
182 : //
183 : // The provided keyIter must have a buffer large enough to hold the key after
184 : // suffix replacement.
185 : //
186 : // The prevRow parameter is the row MaterializeUserKeyWithSyntheticSuffix was
187 : // last invoked with (or a negative number if not applicable). Implementations
188 : // may take advantage of that knowledge to reduce work.
189 : MaterializeUserKeyWithSyntheticSuffix(
190 : keyIter *PrefixBytesIter, syntheticSuffix []byte, prevRow, row int,
191 : ) []byte
192 : }
193 :
194 : const (
195 : defaultKeySchemaColumnPrefix int = iota
196 : defaultKeySchemaColumnSuffix
197 : )
198 :
199 : var defaultSchemaColumnTypes = []DataType{
200 : defaultKeySchemaColumnPrefix: DataTypePrefixBytes,
201 : defaultKeySchemaColumnSuffix: DataTypeBytes,
202 : }
203 :
204 : // DefaultKeySchema returns the default key schema that decomposes a user key
205 : // into its prefix and suffix. Prefixes are sorted in lexicographical order.
206 2 : func DefaultKeySchema(comparer *base.Comparer, prefixBundleSize int) KeySchema {
207 2 : return KeySchema{
208 2 : Name: fmt.Sprintf("DefaultKeySchema(%s,%d)", comparer.Name, prefixBundleSize),
209 2 : HeaderSize: 0,
210 2 : ColumnTypes: defaultSchemaColumnTypes,
211 2 : NewKeyWriter: func() KeyWriter {
212 2 : kw := &defaultKeyWriter{comparer: comparer}
213 2 : kw.prefixes.Init(prefixBundleSize)
214 2 : kw.suffixes.Init()
215 2 : return kw
216 2 : },
217 2 : InitKeySeekerMetadata: func(meta *KeySeekerMetadata, d *DataBlockDecoder) {
218 2 : ks := (*defaultKeySeeker)(unsafe.Pointer(&meta[0]))
219 2 : ks.comparer = comparer
220 2 : ks.init(d)
221 2 : },
222 2 : KeySeeker: func(meta *KeySeekerMetadata) KeySeeker {
223 2 : ks := (*defaultKeySeeker)(unsafe.Pointer(&meta[0]))
224 2 : return ks
225 2 : },
226 : }
227 : }
228 :
229 : // Assert that *defaultKeyWriter implements the KeyWriter interface.
230 : var _ KeyWriter = (*defaultKeyWriter)(nil)
231 :
232 : type defaultKeyWriter struct {
233 : comparer *base.Comparer
234 : prefixes PrefixBytesBuilder
235 : suffixes RawBytesBuilder
236 : }
237 :
238 2 : func (w *defaultKeyWriter) ComparePrev(key []byte) KeyComparison {
239 2 : var cmpv KeyComparison
240 2 : cmpv.PrefixLen = int32(w.comparer.Split(key))
241 2 : if w.prefixes.nKeys == 0 {
242 2 : // The first key has no previous key to compare to.
243 2 : cmpv.UserKeyComparison = 1
244 2 : return cmpv
245 2 : }
246 2 : lp := w.prefixes.UnsafeGet(w.prefixes.nKeys - 1)
247 2 : cmpv.CommonPrefixLen = int32(crbytes.CommonPrefix(lp, key[:cmpv.PrefixLen]))
248 2 :
249 2 : // Keys are written in order and prefixes must be sorted lexicograpgically,
250 2 : // so CommonPrefixLen == PrefixLen implies that the keys share the same
251 2 : // logical prefix. (If the previous key had a prefix longer than
252 2 : // CommonPrefixLen, it would sort after [key].)
253 2 : if cmpv.CommonPrefixLen == cmpv.PrefixLen {
254 2 : // The keys share the same MVCC prefix. Compare the suffixes.
255 2 : cmpv.UserKeyComparison = int32(w.comparer.ComparePointSuffixes(key[cmpv.PrefixLen:],
256 2 : w.suffixes.UnsafeGet(w.suffixes.rows-1)))
257 2 : if invariants.Enabled {
258 2 : if !w.comparer.Equal(lp, key[:cmpv.PrefixLen]) {
259 0 : panic(errors.AssertionFailedf("keys have different logical prefixes: %q != %q", lp, key[:cmpv.PrefixLen]))
260 : }
261 : }
262 2 : return cmpv
263 : }
264 :
265 : // The keys have different MVCC prefixes. We haven't determined which is
266 : // greater, but we know the index at which they diverge. The base.Comparer
267 : // contract dictates that prefixes must be lexicographically ordered.
268 2 : if len(lp) == int(cmpv.CommonPrefixLen) {
269 2 : // cmpv.PrefixLen > cmpv.PrefixLenShared; key is greater.
270 2 : cmpv.UserKeyComparison = +1
271 2 : } else {
272 2 : // Both keys have at least 1 additional byte at which they diverge.
273 2 : // Compare the diverging byte.
274 2 : cmpv.UserKeyComparison = int32(cmp.Compare(key[cmpv.CommonPrefixLen], lp[cmpv.CommonPrefixLen]))
275 2 : }
276 2 : if invariants.Enabled {
277 2 : // In this case we've determined that the keys have different prefixes,
278 2 : // so the UserKeyComparison should be equal to the result of comparing
279 2 : // the prefixes and nonzero.
280 2 : if cmpv.UserKeyComparison == 0 {
281 0 : panic(errors.AssertionFailedf("user keys should not be equal: %q+%q, %q",
282 0 : lp, w.suffixes.UnsafeGet(w.suffixes.rows-1), key))
283 : }
284 2 : if v := w.comparer.Compare(key, lp); v != int(cmpv.UserKeyComparison) {
285 0 : panic(errors.AssertionFailedf("user key comparison mismatch: Compare(%q, %q) = %d ≠ %d",
286 0 : key, lp, v, cmpv.UserKeyComparison))
287 : }
288 : }
289 2 : return cmpv
290 : }
291 :
292 : func (w *defaultKeyWriter) WriteKey(
293 : row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32,
294 2 : ) {
295 2 : w.prefixes.Put(key[:keyPrefixLen], int(keyPrefixLenSharedWithPrev))
296 2 : w.suffixes.Put(key[keyPrefixLen:])
297 2 : }
298 :
299 2 : func (w *defaultKeyWriter) MaterializeKey(dst []byte, row int) []byte {
300 2 : dst = append(dst, w.prefixes.UnsafeGet(row)...)
301 2 : dst = append(dst, w.suffixes.UnsafeGet(row)...)
302 2 : return dst
303 2 : }
304 :
305 2 : func (w *defaultKeyWriter) NumColumns() int {
306 2 : return 2
307 2 : }
308 :
309 2 : func (w *defaultKeyWriter) DataType(col int) DataType {
310 2 : return defaultSchemaColumnTypes[col]
311 2 : }
312 :
313 2 : func (w *defaultKeyWriter) Reset() {
314 2 : w.prefixes.Reset()
315 2 : w.suffixes.Reset()
316 2 : }
317 :
318 1 : func (w *defaultKeyWriter) WriteDebug(dst io.Writer, rows int) {
319 1 : fmt.Fprint(dst, "0: prefixes: ")
320 1 : w.prefixes.WriteDebug(dst, rows)
321 1 : fmt.Fprintln(dst)
322 1 : fmt.Fprint(dst, "1: suffixes: ")
323 1 : w.suffixes.WriteDebug(dst, rows)
324 1 : fmt.Fprintln(dst)
325 1 : }
326 :
327 2 : func (w *defaultKeyWriter) Size(rows int, offset uint32) uint32 {
328 2 : offset = w.prefixes.Size(rows, offset)
329 2 : offset = w.suffixes.Size(rows, offset)
330 2 : return offset
331 2 : }
332 :
333 2 : func (w *defaultKeyWriter) FinishHeader([]byte) {}
334 :
335 2 : func (w *defaultKeyWriter) Finish(col, rows int, offset uint32, buf []byte) (nextOffset uint32) {
336 2 : switch col {
337 2 : case defaultKeySchemaColumnPrefix:
338 2 : return w.prefixes.Finish(0, rows, offset, buf)
339 2 : case defaultKeySchemaColumnSuffix:
340 2 : return w.suffixes.Finish(0, rows, offset, buf)
341 0 : default:
342 0 : panic(fmt.Sprintf("unknown default key column: %d", col))
343 : }
344 : }
345 :
346 : // Assert that *defaultKeySeeker implements KeySeeker.
347 : var _ KeySeeker = (*defaultKeySeeker)(nil)
348 :
349 : // Assert that the metadata fits the defalut key seeker.
350 : var _ uint = KeySeekerMetadataSize - uint(unsafe.Sizeof(defaultKeySeeker{}))
351 :
352 : type defaultKeySeeker struct {
353 : comparer *base.Comparer
354 : decoder *DataBlockDecoder
355 : prefixes PrefixBytes
356 : suffixes RawBytes
357 : sharedPrefix []byte
358 : }
359 :
360 2 : func (ks *defaultKeySeeker) init(d *DataBlockDecoder) {
361 2 : ks.decoder = d
362 2 : ks.prefixes = d.d.PrefixBytes(defaultKeySchemaColumnPrefix)
363 2 : ks.suffixes = d.d.RawBytes(defaultKeySchemaColumnSuffix)
364 2 : ks.sharedPrefix = ks.prefixes.SharedPrefix()
365 2 : }
366 :
367 : // IsLowerBound is part of the KeySeeker interface.
368 2 : func (ks *defaultKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) bool {
369 2 : si := ks.comparer.Split(k)
370 2 : if v := ks.comparer.Compare(ks.prefixes.UnsafeFirstSlice(), k[:si]); v != 0 {
371 2 : return v > 0
372 2 : }
373 2 : suffix := syntheticSuffix
374 2 : if len(suffix) == 0 {
375 2 : suffix = ks.suffixes.At(0)
376 2 : }
377 2 : return ks.comparer.Compare(suffix, k[si:]) >= 0
378 : }
379 :
380 : // SeekGE is part of the KeySeeker interface.
381 : func (ks *defaultKeySeeker) SeekGE(
382 : key []byte, boundRow int, searchDir int8,
383 2 : ) (row int, equalPrefix bool) {
384 2 : si := ks.comparer.Split(key)
385 2 : row, eq := ks.prefixes.Search(key[:si])
386 2 : if eq {
387 2 : return ks.seekGEOnSuffix(row, key[si:]), true
388 2 : }
389 2 : return row, false
390 : }
391 :
392 : // seekGEOnSuffix is a helper function for SeekGE when a seek key's prefix
393 : // exactly matches a row. seekGEOnSuffix finds the first row at index or later
394 : // with the same prefix as index and a suffix greater than or equal to [suffix],
395 : // or if no such row exists, the next row with a different prefix.
396 2 : func (ks *defaultKeySeeker) seekGEOnSuffix(index int, suffix []byte) (row int) {
397 2 : // The search key's prefix exactly matches the prefix of the row at index.
398 2 : // If the row at index has a suffix >= [suffix], then return the row.
399 2 : if ks.comparer.ComparePointSuffixes(ks.suffixes.At(index), suffix) >= 0 {
400 2 : return index
401 2 : }
402 : // Otherwise, the row at [index] sorts before the search key and we need to
403 : // search forward. Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)].
404 : //
405 : // Define f(l-1) == false and f(u) == true.
406 : // Invariant: f(l-1) == false, f(u) == true.
407 2 : l := index + 1
408 2 : u := ks.decoder.prefixChanged.SeekSetBitGE(index + 1)
409 2 : for l < u {
410 2 : h := int(uint(l+u) >> 1) // avoid overflow when computing h
411 2 : // l ≤ h < u
412 2 : if ks.comparer.ComparePointSuffixes(ks.suffixes.At(h), suffix) >= 0 {
413 2 : u = h // preserves f(u) == true
414 2 : } else {
415 2 : l = h + 1 // preserves f(l-1) == false
416 2 : }
417 : }
418 2 : return l
419 : }
420 :
421 : // MaterializeUserKey is part of the colblk.KeySeeker interface.
422 2 : func (ks *defaultKeySeeker) MaterializeUserKey(keyIter *PrefixBytesIter, prevRow, row int) []byte {
423 2 : if row == prevRow+1 && prevRow >= 0 {
424 2 : ks.prefixes.SetNext(keyIter)
425 2 : } else {
426 2 : ks.prefixes.SetAt(keyIter, row)
427 2 : }
428 2 : suffix := ks.suffixes.At(row)
429 2 : res := keyIter.Buf[:len(keyIter.Buf)+len(suffix)]
430 2 : memmove(
431 2 : unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(keyIter.Buf)))+uintptr(len(keyIter.Buf))),
432 2 : unsafe.Pointer(unsafe.SliceData(suffix)),
433 2 : uintptr(len(suffix)),
434 2 : )
435 2 : return res
436 : }
437 :
438 : // MaterializeUserKeyWithSyntheticSuffix is part of the colblk.KeySeeker interface.
439 : func (ks *defaultKeySeeker) MaterializeUserKeyWithSyntheticSuffix(
440 : keyIter *PrefixBytesIter, suffix []byte, prevRow, row int,
441 2 : ) []byte {
442 2 : if row == prevRow+1 && prevRow >= 0 {
443 2 : ks.prefixes.SetNext(keyIter)
444 2 : } else {
445 2 : ks.prefixes.SetAt(keyIter, row)
446 2 : }
447 2 : res := keyIter.Buf[:len(keyIter.Buf)+len(suffix)]
448 2 : memmove(
449 2 : unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(keyIter.Buf)))+uintptr(len(keyIter.Buf))),
450 2 : unsafe.Pointer(unsafe.SliceData(suffix)),
451 2 : uintptr(len(suffix)),
452 2 : )
453 2 : return res
454 : }
455 :
456 : // DataBlockEncoder encodes columnar data blocks using a user-defined schema.
457 : type DataBlockEncoder struct {
458 : Schema *KeySchema
459 : KeyWriter KeyWriter
460 : // trailers is the column writer for InternalKey uint64 trailers.
461 : trailers UintBuilder
462 : // prefixSame is the column writer for the prefix-changed bitmap that
463 : // indicates when a new key prefix begins. During block building, the bitmap
464 : // represents when the prefix stays the same, which is expected to be a
465 : // rarer case. Before Finish-ing the column, we invert the bitmap.
466 : prefixSame BitmapBuilder
467 : // values is the column writer for values. Iff the isValueExternal bitmap
468 : // indicates a value is external, the value is prefixed with a ValuePrefix
469 : // byte.
470 : values RawBytesBuilder
471 : // isValueExternal is the column writer for the is-value-external bitmap
472 : // that indicates when a value is stored out-of-band in a value block.
473 : isValueExternal BitmapBuilder
474 : // isObsolete is the column writer for the is-obsolete bitmap that indicates
475 : // when a key is known to be obsolete/non-live (i.e., shadowed by another
476 : // identical point key or range deletion with a higher sequence number).
477 : isObsolete BitmapBuilder
478 :
479 : enc blockEncoder
480 : rows int
481 : maximumKeyLength int
482 : valuePrefixTmp [1]byte
483 : lastUserKeyTmp []byte
484 : }
485 :
486 : const (
487 : dataBlockColumnTrailer = iota
488 : dataBlockColumnPrefixChanged
489 : dataBlockColumnValue
490 : dataBlockColumnIsValueExternal
491 : dataBlockColumnIsObsolete
492 : dataBlockColumnMax
493 : )
494 :
495 : // The data block header is a 4-byte uint32 encoding the maximum length of a key
496 : // contained within the block. This is used by iterators to avoid the need to
497 : // grow key buffers while iterating over the block, ensuring that the key buffer
498 : // is always sufficiently large.
499 : // This is serialized immediately after the KeySchema specific header.
500 : const dataBlockCustomHeaderSize = 4
501 :
502 : // Init initializes the data block writer.
503 2 : func (w *DataBlockEncoder) Init(schema *KeySchema) {
504 2 : w.Schema = schema
505 2 : w.KeyWriter = schema.NewKeyWriter()
506 2 : w.trailers.Init()
507 2 : w.prefixSame.Reset()
508 2 : w.values.Init()
509 2 : w.isValueExternal.Reset()
510 2 : w.isObsolete.Reset()
511 2 : w.rows = 0
512 2 : w.maximumKeyLength = 0
513 2 : w.lastUserKeyTmp = w.lastUserKeyTmp[:0]
514 2 : w.enc.reset()
515 2 : }
516 :
517 : // Reset resets the data block writer to its initial state, retaining buffers.
518 2 : func (w *DataBlockEncoder) Reset() {
519 2 : w.KeyWriter.Reset()
520 2 : w.trailers.Reset()
521 2 : w.prefixSame.Reset()
522 2 : w.values.Reset()
523 2 : w.isValueExternal.Reset()
524 2 : w.isObsolete.Reset()
525 2 : w.rows = 0
526 2 : w.maximumKeyLength = 0
527 2 : w.lastUserKeyTmp = w.lastUserKeyTmp[:0]
528 2 : w.enc.reset()
529 2 : }
530 :
531 : // String outputs a human-readable summary of internal DataBlockEncoder state.
532 1 : func (w *DataBlockEncoder) String() string {
533 1 : var buf bytes.Buffer
534 1 : size := uint32(w.Size())
535 1 : fmt.Fprintf(&buf, "size=%d:\n", size)
536 1 : w.KeyWriter.WriteDebug(&buf, w.rows)
537 1 :
538 1 : fmt.Fprintf(&buf, "%d: trailers: ", len(w.Schema.ColumnTypes)+dataBlockColumnTrailer)
539 1 : w.trailers.WriteDebug(&buf, w.rows)
540 1 : fmt.Fprintln(&buf)
541 1 :
542 1 : fmt.Fprintf(&buf, "%d: prefix changed: ", len(w.Schema.ColumnTypes)+dataBlockColumnPrefixChanged)
543 1 : w.prefixSame.WriteDebug(&buf, w.rows)
544 1 : fmt.Fprintln(&buf)
545 1 :
546 1 : fmt.Fprintf(&buf, "%d: values: ", len(w.Schema.ColumnTypes)+dataBlockColumnValue)
547 1 : w.values.WriteDebug(&buf, w.rows)
548 1 : fmt.Fprintln(&buf)
549 1 :
550 1 : fmt.Fprintf(&buf, "%d: is-value-ext: ", len(w.Schema.ColumnTypes)+dataBlockColumnIsValueExternal)
551 1 : w.isValueExternal.WriteDebug(&buf, w.rows)
552 1 : fmt.Fprintln(&buf)
553 1 :
554 1 : fmt.Fprintf(&buf, "%d: is-obsolete: ", len(w.Schema.ColumnTypes)+dataBlockColumnIsObsolete)
555 1 : w.isObsolete.WriteDebug(&buf, w.rows)
556 1 : fmt.Fprintln(&buf)
557 1 :
558 1 : return buf.String()
559 1 : }
560 :
561 : // Add adds the provided key to the data block. Keys must be added in order. The
562 : // caller must supply a KeyComparison containing the comparison of the key to
563 : // the previously-added key, obtainable through
564 : //
565 : // KeyWriter.ComparePrev(ikey.UserKey)
566 : //
567 : // The caller is required to pass this in because in expected use cases, the
568 : // caller will also require the same information.
569 : func (w *DataBlockEncoder) Add(
570 : ikey base.InternalKey,
571 : value []byte,
572 : valuePrefix block.ValuePrefix,
573 : kcmp KeyComparison,
574 : isObsolete bool,
575 2 : ) {
576 2 : w.KeyWriter.WriteKey(w.rows, ikey.UserKey, kcmp.PrefixLen, kcmp.CommonPrefixLen)
577 2 : if kcmp.PrefixEqual() {
578 2 : w.prefixSame.Set(w.rows)
579 2 : }
580 2 : if isObsolete {
581 2 : w.isObsolete.Set(w.rows)
582 2 : }
583 2 : w.trailers.Set(w.rows, uint64(ikey.Trailer))
584 2 : if valuePrefix.IsValueHandle() {
585 2 : w.isValueExternal.Set(w.rows)
586 2 : // Write the value with the value prefix byte preceding the value.
587 2 : w.valuePrefixTmp[0] = byte(valuePrefix)
588 2 : w.values.PutConcat(w.valuePrefixTmp[:], value)
589 2 : } else {
590 2 : // Elide the value prefix. Readers will examine the isValueExternal
591 2 : // bitmap and know there is no value prefix byte if !isValueExternal.
592 2 : w.values.Put(value)
593 2 : }
594 2 : if len(ikey.UserKey) > int(w.maximumKeyLength) {
595 2 : w.maximumKeyLength = len(ikey.UserKey)
596 2 : }
597 2 : w.rows++
598 : }
599 :
600 : // Rows returns the number of rows in the current pending data block.
601 2 : func (w *DataBlockEncoder) Rows() int {
602 2 : return w.rows
603 2 : }
604 :
605 : // Size returns the size of the current pending data block.
606 2 : func (w *DataBlockEncoder) Size() int {
607 2 : off := blockHeaderSize(len(w.Schema.ColumnTypes)+dataBlockColumnMax, dataBlockCustomHeaderSize+w.Schema.HeaderSize)
608 2 : off = w.KeyWriter.Size(w.rows, off)
609 2 : off = w.trailers.Size(w.rows, off)
610 2 : off = w.prefixSame.InvertedSize(w.rows, off)
611 2 : off = w.values.Size(w.rows, off)
612 2 : off = w.isValueExternal.Size(w.rows, off)
613 2 : off = w.isObsolete.Size(w.rows, off)
614 2 : off++ // trailer padding byte
615 2 : return int(off)
616 2 : }
617 :
618 : // MaterializeLastUserKey materializes the last added user key.
619 1 : func (w *DataBlockEncoder) MaterializeLastUserKey(appendTo []byte) []byte {
620 1 : return w.KeyWriter.MaterializeKey(appendTo, w.rows-1)
621 1 : }
622 :
623 : // Finish serializes the pending data block, including the first [rows] rows.
624 : // The value of [rows] must be Rows() or Rows()-1. The provided size must be the
625 : // size of the data block with the provided row count (i.e., the return value of
626 : // [Size] when DataBlockEncoder.Rows() = [rows]).
627 : //
628 : // Finish the returns the serialized, uncompressed data block and the
629 : // InternalKey of the last key contained within the data block. The memory of
630 : // the lastKey's UserKey is owned by the DataBlockEncoder. The caller must
631 : // copy it if they require it to outlive a Reset of the writer.
632 2 : func (w *DataBlockEncoder) Finish(rows, size int) (finished []byte, lastKey base.InternalKey) {
633 2 : if invariants.Enabled && rows != w.rows && rows != w.rows-1 {
634 0 : panic(errors.AssertionFailedf("data block has %d rows; asked to finish %d", w.rows, rows))
635 : }
636 :
637 2 : cols := len(w.Schema.ColumnTypes) + dataBlockColumnMax
638 2 : h := Header{
639 2 : Version: Version1,
640 2 : Columns: uint16(cols),
641 2 : Rows: uint32(rows),
642 2 : }
643 2 :
644 2 : // Invert the prefix-same bitmap before writing it out, because we want it
645 2 : // to represent when the prefix changes.
646 2 : w.prefixSame.Invert(rows)
647 2 :
648 2 : w.enc.init(size, h, dataBlockCustomHeaderSize+w.Schema.HeaderSize)
649 2 :
650 2 : // Write the key schema custom header.
651 2 : w.KeyWriter.FinishHeader(w.enc.data()[:w.Schema.HeaderSize])
652 2 : // Write the max key length in the data block custom header.
653 2 : binary.LittleEndian.PutUint32(w.enc.data()[w.Schema.HeaderSize:w.Schema.HeaderSize+dataBlockCustomHeaderSize], uint32(w.maximumKeyLength))
654 2 : w.enc.encode(rows, w.KeyWriter)
655 2 : w.enc.encode(rows, &w.trailers)
656 2 : w.enc.encode(rows, &w.prefixSame)
657 2 : w.enc.encode(rows, &w.values)
658 2 : w.enc.encode(rows, &w.isValueExternal)
659 2 : w.enc.encode(rows, &w.isObsolete)
660 2 : finished = w.enc.finish()
661 2 :
662 2 : w.lastUserKeyTmp = w.lastUserKeyTmp[:0]
663 2 : w.lastUserKeyTmp = w.KeyWriter.MaterializeKey(w.lastUserKeyTmp[:0], rows-1)
664 2 : lastKey = base.InternalKey{
665 2 : UserKey: w.lastUserKeyTmp,
666 2 : Trailer: base.InternalKeyTrailer(w.trailers.Get(rows - 1)),
667 2 : }
668 2 : return finished, lastKey
669 : }
670 :
671 : // DataBlockRewriter rewrites data blocks. See RewriteSuffixes.
672 : type DataBlockRewriter struct {
673 : KeySchema *KeySchema
674 :
675 : encoder DataBlockEncoder
676 : decoder DataBlockDecoder
677 : iter DataBlockIter
678 : keySeeker KeySeeker
679 : comparer *base.Comparer
680 : keyBuf []byte
681 : // keyAlloc grown throughout the lifetime of the rewriter.
682 : keyAlloc bytealloc.A
683 : prefixBytesIter PrefixBytesIter
684 : initialized bool
685 : }
686 :
687 : // NewDataBlockRewriter creates a block rewriter.
688 1 : func NewDataBlockRewriter(keySchema *KeySchema, comparer *base.Comparer) *DataBlockRewriter {
689 1 : return &DataBlockRewriter{
690 1 : KeySchema: keySchema,
691 1 : comparer: comparer,
692 1 : }
693 1 : }
694 :
695 : type assertNoExternalValues struct{}
696 :
697 : var _ block.GetLazyValueForPrefixAndValueHandler = assertNoExternalValues{}
698 :
699 0 : func (assertNoExternalValues) GetLazyValueForPrefixAndValueHandle(value []byte) base.LazyValue {
700 0 : panic(errors.AssertionFailedf("pebble: sstable contains values in value blocks"))
701 : }
702 :
703 : // RewriteSuffixes rewrites the input block. It expects the input block to only
704 : // contain keys with the suffix `from`. It rewrites the block to contain the
705 : // same keys with the suffix `to`.
706 : //
707 : // RewriteSuffixes returns the start and end keys of the rewritten block, and
708 : // the finished rewritten block. The returned start and end keys have indefinite
709 : // lifetimes. The returned rewritten block is owned by the DataBlockRewriter. If
710 : // it must be retained beyond the next call to RewriteSuffixes, the caller
711 : // should make a copy.
712 : //
713 : // Note that the input slice must be 8-byte aligned.
714 : func (rw *DataBlockRewriter) RewriteSuffixes(
715 : input []byte, from []byte, to []byte,
716 1 : ) (start, end base.InternalKey, rewritten []byte, err error) {
717 1 : if !rw.initialized {
718 1 : rw.iter.InitOnce(rw.KeySchema, rw.comparer, assertNoExternalValues{})
719 1 : rw.encoder.Init(rw.KeySchema)
720 1 : rw.initialized = true
721 1 : }
722 :
723 : // TODO(jackson): RewriteSuffixes performs a naïve rewrite of the block,
724 : // iterating over the input block while building a new block, KV-by-KV.
725 : // Since key columns are stored separately from other data, we could copy
726 : // columns that are unchanged (at least all the non-key columns, and in
727 : // practice a PrefixBytes column) wholesale without retrieving rows
728 : // one-by-one. In practice, there a few obstacles to making this work:
729 : //
730 : // - Only the beginning of a data block is assumed to be aligned. Columns
731 : // then add padding as necessary to align data that needs to be aligned.
732 : // If we copy a column, we have no guarantee that the alignment of the
733 : // column start in the old block matches the alignment in the new block.
734 : // We'd have to add padding to between columns to match the original
735 : // alignment. It's a bit subtle.
736 : // - We still need to read all the key columns in order to synthesize
737 : // [start] and [end].
738 : //
739 : // The columnar format is designed to support fast IterTransforms at read
740 : // time, including IterTransforms.SyntheticSuffix. Our effort might be
741 : // better spent dropping support for the physical rewriting of data blocks
742 : // we're performing here and instead use a read-time IterTransform.
743 :
744 1 : rw.decoder.Init(rw.KeySchema, input)
745 1 : meta := &KeySeekerMetadata{}
746 1 : rw.KeySchema.InitKeySeekerMetadata(meta, &rw.decoder)
747 1 : rw.keySeeker = rw.KeySchema.KeySeeker(meta)
748 1 : rw.encoder.Reset()
749 1 : if err = rw.iter.Init(&rw.decoder, block.IterTransforms{}); err != nil {
750 0 : return base.InternalKey{}, base.InternalKey{}, nil, err
751 0 : }
752 :
753 : // Allocate a keyIter buffer that's large enough to hold the largest user
754 : // key in the block with 1 byte to spare (so that pointer arithmetic is
755 : // never pointing beyond the allocation, which would violate Go rules).
756 1 : if cap(rw.prefixBytesIter.Buf) < int(rw.decoder.maximumKeyLength)+1 {
757 1 : rw.prefixBytesIter.Buf = make([]byte, rw.decoder.maximumKeyLength+1)
758 1 : }
759 1 : if newMax := int(rw.decoder.maximumKeyLength) - len(from) + len(to) + 1; cap(rw.keyBuf) < newMax {
760 1 : rw.keyBuf = make([]byte, newMax)
761 1 : }
762 :
763 : // Rewrite each key-value pair one-by-one.
764 1 : for i, kv := 0, rw.iter.First(); kv != nil; i, kv = i+1, rw.iter.Next() {
765 1 : value := kv.V.ValueOrHandle
766 1 : valuePrefix := block.InPlaceValuePrefix(false /* setHasSamePrefix (unused) */)
767 1 : isValueExternal := rw.decoder.isValueExternal.At(i)
768 1 : if isValueExternal {
769 0 : valuePrefix = block.ValuePrefix(kv.V.ValueOrHandle[0])
770 0 : value = kv.V.ValueOrHandle[1:]
771 0 : }
772 1 : kcmp := rw.encoder.KeyWriter.ComparePrev(kv.K.UserKey)
773 1 : if !bytes.Equal(kv.K.UserKey[kcmp.PrefixLen:], from) {
774 1 : return base.InternalKey{}, base.InternalKey{}, nil,
775 1 : errors.Newf("key %s has suffix 0x%x; require 0x%x", kv.K, kv.K.UserKey[kcmp.PrefixLen:], from)
776 1 : }
777 1 : rw.keyBuf = append(rw.keyBuf[:0], kv.K.UserKey[:kcmp.PrefixLen]...)
778 1 : rw.keyBuf = append(rw.keyBuf, to...)
779 1 : if i == 0 {
780 1 : start.UserKey, rw.keyAlloc = rw.keyAlloc.Copy(rw.keyBuf)
781 1 : start.Trailer = kv.K.Trailer
782 1 : }
783 1 : k := base.InternalKey{UserKey: rw.keyBuf, Trailer: kv.K.Trailer}
784 1 : rw.encoder.Add(k, value, valuePrefix, kcmp, rw.decoder.isObsolete.At(i))
785 : }
786 1 : rewritten, end = rw.encoder.Finish(int(rw.decoder.d.header.Rows), rw.encoder.Size())
787 1 : end.UserKey, rw.keyAlloc = rw.keyAlloc.Copy(end.UserKey)
788 1 : return start, end, rewritten, nil
789 : }
790 :
791 : // dataBlockDecoderSize is the size of DataBlockDecoder, round up to 8 bytes.
792 : const dataBlockDecoderSize = (unsafe.Sizeof(DataBlockDecoder{}) + 7) &^ 7
793 :
794 : // Assert that dataBlockDecoderSize is a multiple of 8 bytes (so that
795 : // KeySeekerMetadata is also aligned).
796 : const _ uint = uint(-(dataBlockDecoderSize % 8))
797 :
798 : // Assert that a DataBlockDecoder and a KeySeekerMetadata can fit inside block.Metadata.
799 : const _ uint = block.MetadataSize - uint(dataBlockDecoderSize) - KeySeekerMetadataSize
800 :
801 : // InitDataBlockMetadata initializes the metadata for a data block.
802 2 : func InitDataBlockMetadata(schema *KeySchema, md *block.Metadata, data []byte) (err error) {
803 2 : if uintptr(unsafe.Pointer(md))%8 != 0 {
804 0 : return errors.AssertionFailedf("metadata is not 8-byte aligned")
805 0 : }
806 2 : d := (*DataBlockDecoder)(unsafe.Pointer(md))
807 2 : // Initialization can panic; convert panics to corruption errors (so higher
808 2 : // layers can add file number and offset information).
809 2 : defer func() {
810 2 : if r := recover(); r != nil {
811 0 : err = base.CorruptionErrorf("error initializing data block metadata: %v", r)
812 0 : }
813 : }()
814 2 : d.Init(schema, data)
815 2 : keySchemaMeta := (*KeySeekerMetadata)(unsafe.Pointer(&md[dataBlockDecoderSize]))
816 2 : schema.InitKeySeekerMetadata(keySchemaMeta, d)
817 2 : return nil
818 : }
819 :
820 : // Assert that an IndexBlockDecoder can fit inside block.Metadata.
821 : const _ uint = block.MetadataSize - uint(unsafe.Sizeof(IndexBlockDecoder{}))
822 :
823 : // InitIndexBlockMetadata initializes the metadata for an index block.
824 2 : func InitIndexBlockMetadata(md *block.Metadata, data []byte) (err error) {
825 2 : if uintptr(unsafe.Pointer(md))%8 != 0 {
826 0 : return errors.AssertionFailedf("metadata is not 8-byte aligned")
827 0 : }
828 2 : d := (*IndexBlockDecoder)(unsafe.Pointer(md))
829 2 : // Initialization can panic; convert panics to corruption errors (so higher
830 2 : // layers can add file number and offset information).
831 2 : defer func() {
832 2 : if r := recover(); r != nil {
833 0 : err = base.CorruptionErrorf("error initializing index block metadata: %v", r)
834 0 : }
835 : }()
836 2 : d.Init(data)
837 2 : return nil
838 : }
839 :
840 : // Assert that a IndexBlockDecoder can fit inside block.Metadata.
841 : const _ uint = block.MetadataSize - uint(unsafe.Sizeof(KeyspanDecoder{}))
842 :
843 : // InitKeyspanBlockMetadata initializes the metadata for a rangedel or range key block.
844 2 : func InitKeyspanBlockMetadata(md *block.Metadata, data []byte) (err error) {
845 2 : if uintptr(unsafe.Pointer(md))%8 != 0 {
846 0 : return errors.AssertionFailedf("metadata is not 8-byte aligned")
847 0 : }
848 2 : d := (*KeyspanDecoder)(unsafe.Pointer(md))
849 2 : // Initialization can panic; convert panics to corruption errors (so higher
850 2 : // layers can add file number and offset information).
851 2 : defer func() {
852 2 : if r := recover(); r != nil {
853 0 : err = base.CorruptionErrorf("error initializing keyspan block metadata: %v", r)
854 0 : }
855 : }()
856 2 : d.Init(data)
857 2 : return nil
858 : }
859 :
860 : // A DataBlockDecoder holds state for interpreting a columnar data block. It may
861 : // be shared among multiple DataBlockIters.
862 : type DataBlockDecoder struct {
863 : d BlockDecoder
864 : // trailers holds an array of the InternalKey trailers, encoding the key
865 : // kind and sequence number of each key.
866 : trailers UnsafeUints
867 : // prefixChanged is a bitmap indicating when the prefix (as defined by
868 : // Split) of a key changes, relative to the preceding key. This is used to
869 : // bound seeks within a prefix, and to optimize NextPrefix.
870 : prefixChanged Bitmap
871 : // values is the column reader for values. If the isValueExternal bitmap
872 : // indicates a value is external, the value is prefixed with a ValuePrefix
873 : // byte.
874 : values RawBytes
875 : // isValueExternal is the column reader for the is-value-external bitmap
876 : // that indicates whether a value is stored out-of-band in a value block. If
877 : // true, the value contains a ValuePrefix byte followed by an encoded value
878 : // handle indicating the value's location within the value block(s).
879 : isValueExternal Bitmap
880 : // isObsolete is the column reader for the is-obsolete bitmap
881 : // that indicates whether a key is obsolete/non-live.
882 : isObsolete Bitmap
883 : // maximumKeyLength is the maximum length of a user key in the block.
884 : // Iterators may use it to allocate a sufficiently large buffer up front,
885 : // and elide size checks during iteration. Note that iterators should add +1
886 : // to the key length to ensure pointer arithmetric that computes a pointer
887 : // to the tail of the key does not point to memory beyond the allocation
888 : // (prohibited by Go pointer rules).
889 : maximumKeyLength uint32
890 : }
891 :
892 : // BlockDecoder returns a pointer to the underlying BlockDecoder.
893 1 : func (d *DataBlockDecoder) BlockDecoder() *BlockDecoder {
894 1 : return &d.d
895 1 : }
896 :
897 : // PrefixChanged returns the prefix-changed bitmap.
898 1 : func (d *DataBlockDecoder) PrefixChanged() Bitmap {
899 1 : return d.prefixChanged
900 1 : }
901 :
902 : // KeySchemaHeader returns the KeySchema-specific header.
903 1 : func (d *DataBlockDecoder) KeySchemaHeader() []byte {
904 1 : return d.d.data[:d.d.customHeaderSize-dataBlockCustomHeaderSize]
905 1 : }
906 :
907 : // Init initializes the data block reader with the given serialized data block.
908 2 : func (d *DataBlockDecoder) Init(schema *KeySchema, data []byte) {
909 2 : if uintptr(unsafe.Pointer(unsafe.SliceData(data)))&7 != 0 {
910 0 : panic("data buffer not 8-byte aligned")
911 : }
912 2 : d.d.Init(data, dataBlockCustomHeaderSize+schema.HeaderSize)
913 2 : d.trailers = d.d.Uints(len(schema.ColumnTypes) + dataBlockColumnTrailer)
914 2 : d.prefixChanged = d.d.Bitmap(len(schema.ColumnTypes) + dataBlockColumnPrefixChanged)
915 2 : d.values = d.d.RawBytes(len(schema.ColumnTypes) + dataBlockColumnValue)
916 2 : d.isValueExternal = d.d.Bitmap(len(schema.ColumnTypes) + dataBlockColumnIsValueExternal)
917 2 : d.isObsolete = d.d.Bitmap(len(schema.ColumnTypes) + dataBlockColumnIsObsolete)
918 2 : d.maximumKeyLength = binary.LittleEndian.Uint32(data[schema.HeaderSize:])
919 : }
920 :
921 : // Describe descirbes the binary format of the data block, assuming f.Offset()
922 : // is positioned at the beginning of the same data block described by d.
923 1 : func (d *DataBlockDecoder) Describe(f *binfmt.Formatter, tp treeprinter.Node) {
924 1 : // Set the relative offset. When loaded into memory, the beginning of blocks
925 1 : // are aligned. Padding that ensures alignment is done relative to the
926 1 : // current offset. Setting the relative offset ensures that if we're
927 1 : // describing this block within a larger structure (eg, f.Offset()>0), we
928 1 : // compute padding appropriately assuming the current byte f.Offset() is
929 1 : // aligned.
930 1 : f.SetAnchorOffset()
931 1 :
932 1 : n := tp.Child("data block header")
933 1 : if keySchemaHeaderSize := int(d.d.customHeaderSize - 4); keySchemaHeaderSize > 0 {
934 1 : f.HexBytesln(keySchemaHeaderSize, "key schema header")
935 1 : }
936 1 : f.HexBytesln(4, "maximum key length: %d", d.maximumKeyLength)
937 1 : d.d.headerToBinFormatter(f, n)
938 1 : for i := 0; i < int(d.d.header.Columns); i++ {
939 1 : d.d.columnToBinFormatter(f, n, i, int(d.d.header.Rows))
940 1 : }
941 1 : f.HexBytesln(1, "block padding byte")
942 1 : f.ToTreePrinter(n)
943 : }
944 :
945 : // Validate validates invariants that should hold across all data blocks.
946 2 : func (d *DataBlockDecoder) Validate(comparer *base.Comparer, keySchema *KeySchema) error {
947 2 : // TODO(jackson): Consider avoiding these allocations, even if this is only
948 2 : // called in invariants builds.
949 2 : n := d.d.header.Rows
950 2 : meta := &KeySeekerMetadata{}
951 2 : keySchema.InitKeySeekerMetadata(meta, d)
952 2 : keySeeker := keySchema.KeySeeker(meta)
953 2 : prevKey := base.InternalKey{UserKey: make([]byte, 0, d.maximumKeyLength+1)}
954 2 : var curKey PrefixBytesIter
955 2 : curKey.Init(int(d.maximumKeyLength), nil)
956 2 :
957 2 : for i := 0; i < int(n); i++ {
958 2 : k := base.InternalKey{
959 2 : UserKey: keySeeker.MaterializeUserKey(&curKey, i-1, i),
960 2 : Trailer: base.InternalKeyTrailer(d.trailers.At(i)),
961 2 : }
962 2 : // Ensure the keys are ordered.
963 2 : ucmp := comparer.Compare(k.UserKey, prevKey.UserKey)
964 2 : if ucmp < 0 || (ucmp == 0 && k.Trailer >= prevKey.Trailer) {
965 0 : return errors.AssertionFailedf("key %s (row %d) and key %s (row %d) are out of order",
966 0 : prevKey, i-1, k, i)
967 0 : }
968 : // Ensure the obsolete bit is set if the key is definitively obsolete.
969 : // Not all sources of obsolescence are evident with only a data block
970 : // available (range deletions or point keys in previous blocks may cause
971 : // a key to be obsolete).
972 2 : if ucmp == 0 && prevKey.Kind() != base.InternalKeyKindMerge && !d.isObsolete.At(i) {
973 0 : return errors.AssertionFailedf("key %s (row %d) is shadowed by previous key %s but is not marked as obsolete",
974 0 : k, i, prevKey)
975 0 : }
976 : // Ensure that the prefix-changed bit is set correctly.
977 2 : if i > 0 {
978 2 : currPrefix := comparer.Split.Prefix(k.UserKey)
979 2 : prevPrefix := comparer.Split.Prefix(prevKey.UserKey)
980 2 : prefixChanged := !bytes.Equal(prevPrefix, currPrefix)
981 2 : if prefixChanged != d.prefixChanged.At(i) {
982 0 : return errors.AssertionFailedf("prefix changed bit for key %q (row %d) is %t, expected %t [prev key was %q]",
983 0 : k.UserKey, i, d.prefixChanged.At(i), prefixChanged, prevKey.UserKey)
984 0 : }
985 : }
986 :
987 2 : prevKey.CopyFrom(k)
988 : }
989 2 : return nil
990 : }
991 :
992 : // Assert that *DataBlockIter implements block.DataBlockIterator.
993 : var _ block.DataBlockIterator = (*DataBlockIter)(nil)
994 :
995 : // DataBlockIter iterates over a columnar data block.
996 : type DataBlockIter struct {
997 : // -- Fields that are initialized once --
998 : // For any changes to these fields, InitOnce should be updated.
999 :
1000 : // keySchema configures the DataBlockIterConfig to use the provided
1001 : // KeySchema when initializing the DataBlockIter for iteration over a new
1002 : // block.
1003 : keySchema *KeySchema
1004 : suffixCmp base.ComparePointSuffixes
1005 : split base.Split
1006 : // getLazyValuer configures the DataBlockIterConfig to initialize the
1007 : // DataBlockIter to use the provided handler for retrieving lazy values.
1008 : getLazyValuer block.GetLazyValueForPrefixAndValueHandler
1009 :
1010 : // -- Fields that are initialized for each block --
1011 : // For any changes to these fields, InitHandle should be updated.
1012 :
1013 : d *DataBlockDecoder
1014 : h block.BufferHandle
1015 : maxRow int
1016 : transforms block.IterTransforms
1017 : noTransforms bool
1018 : keySeeker KeySeeker
1019 :
1020 : // -- State --
1021 : // For any changes to these fields, InitHandle (which resets them) should be
1022 : // updated.
1023 :
1024 : keyIter PrefixBytesIter
1025 : row int
1026 : kv base.InternalKV
1027 : kvRow int // the row currently held in kv
1028 :
1029 : // nextObsoletePoint is the row index of the first obsolete point after i.row.
1030 : // It is used to optimize skipping of obsolete points during forward
1031 : // iteration.
1032 : nextObsoletePoint int
1033 : }
1034 :
1035 : // InitOnce configures the data block iterator's key schema and lazy value
1036 : // handler. The iterator must be initialized with a block before it can be used.
1037 : // It may be reinitialized with new blocks without calling InitOnce again.
1038 : func (i *DataBlockIter) InitOnce(
1039 : keySchema *KeySchema,
1040 : comparer *base.Comparer,
1041 : getLazyValuer block.GetLazyValueForPrefixAndValueHandler,
1042 2 : ) {
1043 2 : i.keySchema = keySchema
1044 2 : i.suffixCmp = comparer.ComparePointSuffixes
1045 2 : i.split = comparer.Split
1046 2 : i.getLazyValuer = getLazyValuer
1047 2 : }
1048 :
1049 : // Init initializes the data block iterator, configuring it to read from the
1050 : // provided decoder.
1051 1 : func (i *DataBlockIter) Init(d *DataBlockDecoder, transforms block.IterTransforms) error {
1052 1 : i.d = d
1053 1 : // Leave i.h unchanged.
1054 1 : numRows := int(d.d.header.Rows)
1055 1 : i.maxRow = numRows - 1
1056 1 : i.transforms = transforms
1057 1 : if i.transforms.HideObsoletePoints && d.isObsolete.SeekSetBitGE(0) == numRows {
1058 0 : // There are no obsolete points in the block; don't bother checking.
1059 0 : i.transforms.HideObsoletePoints = false
1060 0 : }
1061 1 : i.noTransforms = i.transforms.NoTransforms()
1062 1 :
1063 1 : // TODO(radu): see if this allocation can be a problem for the suffix rewriter.
1064 1 : meta := &KeySeekerMetadata{}
1065 1 : i.keySchema.InitKeySeekerMetadata(meta, d)
1066 1 : i.keySeeker = i.keySchema.KeySeeker(meta)
1067 1 :
1068 1 : // The worst case is when the largest key in the block has no suffix.
1069 1 : maxKeyLength := int(i.transforms.SyntheticPrefixAndSuffix.PrefixLen() + d.maximumKeyLength + i.transforms.SyntheticPrefixAndSuffix.SuffixLen())
1070 1 : i.keyIter.Init(maxKeyLength, i.transforms.SyntheticPrefix())
1071 1 : i.row = -1
1072 1 : i.kv = base.InternalKV{}
1073 1 : i.kvRow = math.MinInt
1074 1 : i.nextObsoletePoint = 0
1075 1 : return nil
1076 : }
1077 :
1078 : // InitHandle initializes the block from the provided buffer handle. InitHandle
1079 : // assumes that the block's metadata was initialized using
1080 : // InitDataBlockMetadata().
1081 : func (i *DataBlockIter) InitHandle(
1082 : comparer *base.Comparer, h block.BufferHandle, transforms block.IterTransforms,
1083 2 : ) error {
1084 2 : i.suffixCmp = comparer.ComparePointSuffixes
1085 2 : i.split = comparer.Split
1086 2 : blockMeta := h.BlockMetadata()
1087 2 : i.d = (*DataBlockDecoder)(unsafe.Pointer(blockMeta))
1088 2 : keySeekerMeta := (*KeySeekerMetadata)(blockMeta[unsafe.Sizeof(DataBlockDecoder{}):])
1089 2 : i.h.Release()
1090 2 : i.h = h
1091 2 :
1092 2 : numRows := int(i.d.d.header.Rows)
1093 2 : i.maxRow = numRows - 1
1094 2 :
1095 2 : i.transforms = transforms
1096 2 : if i.transforms.HideObsoletePoints && i.d.isObsolete.SeekSetBitGE(0) == numRows {
1097 2 : // There are no obsolete points in the block; don't bother checking.
1098 2 : i.transforms.HideObsoletePoints = false
1099 2 : }
1100 2 : i.noTransforms = i.transforms.NoTransforms()
1101 2 :
1102 2 : // The worst case is when the largest key in the block has no suffix.
1103 2 : maxKeyLength := int(i.transforms.SyntheticPrefixAndSuffix.PrefixLen() + i.d.maximumKeyLength + i.transforms.SyntheticPrefixAndSuffix.SuffixLen())
1104 2 : i.keyIter.Init(maxKeyLength, i.transforms.SyntheticPrefix())
1105 2 : i.row = -1
1106 2 : i.kv = base.InternalKV{}
1107 2 : i.kvRow = math.MinInt
1108 2 : i.nextObsoletePoint = 0
1109 2 : i.keySeeker = i.keySchema.KeySeeker(keySeekerMeta)
1110 2 : return nil
1111 : }
1112 :
1113 : // Handle returns the handle to the block.
1114 2 : func (i *DataBlockIter) Handle() block.BufferHandle {
1115 2 : return i.h
1116 2 : }
1117 :
1118 : // Valid returns true if the iterator is currently positioned at a valid KV.
1119 2 : func (i *DataBlockIter) Valid() bool {
1120 2 : return i.row >= 0 && i.row <= i.maxRow && !i.IsDataInvalidated()
1121 2 : }
1122 :
1123 : // KV returns the key-value pair at the current iterator position. The
1124 : // iterator must be positioned over a valid KV.
1125 2 : func (i *DataBlockIter) KV() *base.InternalKV {
1126 2 : return &i.kv
1127 2 : }
1128 :
1129 : // Invalidate invalidates the block iterator, removing references to the block
1130 : // it was initialized with. The iterator may continue to be used after
1131 : // a call to Invalidate, but all positioning methods should return false.
1132 : // Valid() must also return false.
1133 2 : func (i *DataBlockIter) Invalidate() {
1134 2 : i.d = nil
1135 2 : }
1136 :
1137 : // IsDataInvalidated returns true when the iterator has been invalidated
1138 : // using an Invalidate call.
1139 2 : func (i *DataBlockIter) IsDataInvalidated() bool {
1140 2 : return i.d == nil
1141 2 : }
1142 :
1143 : // IsLowerBound implements the block.DataBlockIterator interface.
1144 2 : func (i *DataBlockIter) IsLowerBound(k []byte) bool {
1145 2 : if i.transforms.HasSyntheticPrefix() {
1146 1 : var keyPrefix []byte
1147 1 : keyPrefix, k = splitKey(k, len(i.transforms.SyntheticPrefix()))
1148 1 : if cmp := bytes.Compare(keyPrefix, i.transforms.SyntheticPrefix()); cmp != 0 {
1149 1 : return cmp < 0
1150 1 : }
1151 : }
1152 : // If we are hiding obsolete points, it is possible that all points < k are
1153 : // hidden.
1154 : // Note: we ignore HideObsoletePoints, but false negatives are allowed.
1155 2 : return i.keySeeker.IsLowerBound(k, i.transforms.SyntheticSuffix())
1156 : }
1157 :
1158 : // splitKey splits a key into k[:at] and k[at:].
1159 2 : func splitKey(k []byte, at int) (before, after []byte) {
1160 2 : if len(k) <= at {
1161 2 : return k, nil
1162 2 : }
1163 2 : return k[:at], k[at:]
1164 : }
1165 :
1166 : // seekGEInternal is a wrapper around keySeeker.SeekGE which takes into account
1167 : // the synthetic prefix and suffix.
1168 2 : func (i *DataBlockIter) seekGEInternal(key []byte, boundRow int, searchDir int8) (row int) {
1169 2 : if i.transforms.HasSyntheticPrefix() {
1170 2 : var keyPrefix []byte
1171 2 : keyPrefix, key = splitKey(key, len(i.transforms.SyntheticPrefix()))
1172 2 : if cmp := bytes.Compare(keyPrefix, i.transforms.SyntheticPrefix()); cmp != 0 {
1173 1 : if cmp < 0 {
1174 1 : return 0
1175 1 : }
1176 1 : return i.maxRow + 1
1177 : }
1178 : }
1179 2 : if i.transforms.HasSyntheticSuffix() {
1180 2 : n := i.split(key)
1181 2 : row, eq := i.keySeeker.SeekGE(key[:n], boundRow, searchDir)
1182 2 : if eq && i.suffixCmp(key[n:], i.transforms.SyntheticSuffix()) > 0 {
1183 2 : row = i.d.prefixChanged.SeekSetBitGE(row + 1)
1184 2 : }
1185 2 : return row
1186 : }
1187 2 : row, _ = i.keySeeker.SeekGE(key, boundRow, searchDir)
1188 2 : return row
1189 : }
1190 :
1191 : // SeekGE implements the base.InternalIterator interface.
1192 2 : func (i *DataBlockIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
1193 2 : if i.d == nil {
1194 1 : return nil
1195 1 : }
1196 2 : searchDir := int8(0)
1197 2 : if flags.TrySeekUsingNext() {
1198 0 : searchDir = +1
1199 0 : }
1200 2 : if i.noTransforms {
1201 2 : // Fast path.
1202 2 : i.row, _ = i.keySeeker.SeekGE(key, i.row, searchDir)
1203 2 : return i.decodeRow()
1204 2 : }
1205 2 : i.row = i.seekGEInternal(key, i.row, searchDir)
1206 2 : if i.transforms.HideObsoletePoints {
1207 2 : i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(i.row)
1208 2 : if i.atObsoletePointForward() {
1209 2 : i.skipObsoletePointsForward()
1210 2 : if i.row > i.maxRow {
1211 2 : return nil
1212 2 : }
1213 : }
1214 : }
1215 2 : return i.decodeRow()
1216 : }
1217 :
1218 : // SeekPrefixGE implements the base.InternalIterator interface.
1219 0 : func (i *DataBlockIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
1220 0 : // This should never be called as prefix iteration is handled by
1221 0 : // sstable.Iterator.
1222 0 :
1223 0 : // TODO(jackson): We can implement this and avoid propagating keys without
1224 0 : // the prefix up to the merging iterator. It will avoid unnecessary key
1225 0 : // comparisons fixing up the merging iterator heap. We can also short
1226 0 : // circuit the search if the prefix isn't found within the prefix column.
1227 0 : // There's some subtlety around ensuring we continue to benefit from the
1228 0 : // TrySeekUsingNext optimization.
1229 0 : panic("pebble: SeekPrefixGE unimplemented")
1230 : }
1231 :
1232 : // SeekLT implements the base.InternalIterator interface.
1233 2 : func (i *DataBlockIter) SeekLT(key []byte, _ base.SeekLTFlags) *base.InternalKV {
1234 2 : if i.d == nil {
1235 1 : return nil
1236 1 : }
1237 2 : i.row = i.seekGEInternal(key, i.row, 0 /* searchDir */) - 1
1238 2 : if i.transforms.HideObsoletePoints {
1239 2 : i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(max(i.row, 0))
1240 2 : if i.atObsoletePointBackward() {
1241 2 : i.skipObsoletePointsBackward()
1242 2 : if i.row < 0 {
1243 2 : return nil
1244 2 : }
1245 : }
1246 : }
1247 2 : return i.decodeRow()
1248 : }
1249 :
1250 : // First implements the base.InternalIterator interface.
1251 2 : func (i *DataBlockIter) First() *base.InternalKV {
1252 2 : if i.d == nil {
1253 1 : return nil
1254 1 : }
1255 2 : i.row = 0
1256 2 : if i.transforms.HideObsoletePoints {
1257 2 : i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(0)
1258 2 : if i.atObsoletePointForward() {
1259 2 : i.skipObsoletePointsForward()
1260 2 : if i.row > i.maxRow {
1261 2 : return nil
1262 2 : }
1263 : }
1264 : }
1265 2 : return i.decodeRow()
1266 : }
1267 :
1268 : // Last implements the base.InternalIterator interface.
1269 2 : func (i *DataBlockIter) Last() *base.InternalKV {
1270 2 : if i.d == nil {
1271 1 : return nil
1272 1 : }
1273 2 : i.row = i.maxRow
1274 2 : if i.transforms.HideObsoletePoints {
1275 2 : i.nextObsoletePoint = i.maxRow + 1
1276 2 : if i.atObsoletePointBackward() {
1277 2 : i.skipObsoletePointsBackward()
1278 2 : if i.row < 0 {
1279 1 : return nil
1280 1 : }
1281 : }
1282 : }
1283 2 : return i.decodeRow()
1284 : }
1285 :
1286 : // Next advances to the next KV pair in the block.
1287 2 : func (i *DataBlockIter) Next() *base.InternalKV {
1288 2 : if i.d == nil {
1289 2 : return nil
1290 2 : }
1291 : // Inline decodeRow, but avoiding unnecessary checks against i.row.
1292 2 : if i.row >= i.maxRow {
1293 2 : i.row = i.maxRow + 1
1294 2 : return nil
1295 2 : }
1296 2 : i.row++
1297 2 : // Inline decodeKey(), adding obsolete points logic.
1298 2 : if i.noTransforms {
1299 2 : // Fast path.
1300 2 : i.kv.K = base.InternalKey{
1301 2 : UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row),
1302 2 : Trailer: base.InternalKeyTrailer(i.d.trailers.At(i.row)),
1303 2 : }
1304 2 : } else {
1305 2 : if i.transforms.HideObsoletePoints && i.atObsoletePointForward() {
1306 2 : i.skipObsoletePointsForward()
1307 2 : if i.row > i.maxRow {
1308 2 : return nil
1309 2 : }
1310 : }
1311 2 : if i.transforms.HasSyntheticSuffix() {
1312 2 : i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix(
1313 2 : &i.keyIter, i.transforms.SyntheticSuffix(), i.kvRow, i.row,
1314 2 : )
1315 2 : } else {
1316 2 : i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row)
1317 2 : }
1318 2 : i.kv.K.Trailer = base.InternalKeyTrailer(i.d.trailers.At(i.row))
1319 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1320 2 : i.kv.K.SetSeqNum(base.SeqNum(n))
1321 2 : }
1322 : }
1323 : // Inline i.d.values.At(row).
1324 2 : v := i.d.values.slice(i.d.values.offsets.At2(i.row))
1325 2 : if i.d.isValueExternal.At(i.row) {
1326 2 : i.kv.V = i.getLazyValuer.GetLazyValueForPrefixAndValueHandle(v)
1327 2 : } else {
1328 2 : i.kv.V = base.MakeInPlaceValue(v)
1329 2 : }
1330 2 : i.kvRow = i.row
1331 2 : return &i.kv
1332 : }
1333 :
1334 : // NextPrefix moves the iterator to the next row with a different prefix than
1335 : // the key at the current iterator position.
1336 : //
1337 : // The columnar block implementation uses a newPrefix bitmap to identify the
1338 : // next row with a differing prefix from the current row's key. If newPrefix[i]
1339 : // is set then row's i key prefix is different that row i-1. The bitmap is
1340 : // organized as a slice of 64-bit words. If a 64-bit word in the bitmap is zero
1341 : // then all of the rows corresponding to the bits in that word have the same
1342 : // prefix and we can skip ahead. If a row is non-zero a small bit of bit
1343 : // shifting and masking combined with bits.TrailingZeros64 can identify the
1344 : // next bit that is set after the current row. The bitmap uses 1 bit/row (0.125
1345 : // bytes/row). A 32KB block containing 1300 rows (25 bytes/row) would need a
1346 : // bitmap of 21 64-bit words. Even in the worst case where every word is 0 this
1347 : // bitmap can be scanned in ~20 ns (1 ns/word) leading to a total NextPrefix
1348 : // time of ~30 ns if a row is found and decodeRow are called. In more normal
1349 : // cases, NextPrefix takes ~15% longer that a single Next call.
1350 : //
1351 : // For comparison, the rowblk nextPrefixV3 optimizations work by setting a bit
1352 : // in the value prefix byte that indicates that the current key has the same
1353 : // prefix as the previous key. Additionally, a bit is stolen from the restart
1354 : // table entries indicating whether a restart table entry has the same key
1355 : // prefix as the previous entry. Checking the value prefix byte bit requires
1356 : // locating that byte which requires decoding 3 varints per key/value pair.
1357 2 : func (i *DataBlockIter) NextPrefix(_ []byte) *base.InternalKV {
1358 2 : if i.d == nil {
1359 0 : return nil
1360 0 : }
1361 2 : i.row = i.d.prefixChanged.SeekSetBitGE(i.row + 1)
1362 2 : if i.transforms.HideObsoletePoints {
1363 1 : i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(i.row)
1364 1 : if i.atObsoletePointForward() {
1365 1 : i.skipObsoletePointsForward()
1366 1 : }
1367 : }
1368 :
1369 2 : return i.decodeRow()
1370 : }
1371 :
1372 : // Prev moves the iterator to the previous KV pair in the block.
1373 2 : func (i *DataBlockIter) Prev() *base.InternalKV {
1374 2 : if i.d == nil {
1375 2 : return nil
1376 2 : }
1377 2 : i.row--
1378 2 : if i.transforms.HideObsoletePoints && i.atObsoletePointBackward() {
1379 2 : i.skipObsoletePointsBackward()
1380 2 : if i.row < 0 {
1381 2 : return nil
1382 2 : }
1383 : }
1384 2 : return i.decodeRow()
1385 : }
1386 :
1387 : // atObsoletePointForward returns true if i.row is an obsolete point. It is
1388 : // separate from skipObsoletePointsForward() because that method does not
1389 : // inline. It can only be used during forward iteration (i.e. i.row was
1390 : // incremented).
1391 : //
1392 : //gcassert:inline
1393 2 : func (i *DataBlockIter) atObsoletePointForward() bool {
1394 2 : if invariants.Enabled && i.row > i.nextObsoletePoint {
1395 0 : panic("invalid nextObsoletePoint")
1396 : }
1397 2 : return i.row == i.nextObsoletePoint && i.row <= i.maxRow
1398 : }
1399 :
1400 2 : func (i *DataBlockIter) skipObsoletePointsForward() {
1401 2 : if invariants.Enabled {
1402 2 : i.atObsoletePointCheck()
1403 2 : }
1404 2 : i.row = i.d.isObsolete.SeekUnsetBitGE(i.row)
1405 2 : i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(i.row)
1406 : }
1407 :
1408 : // atObsoletePointBackward returns true if i.row is an obsolete point. It is
1409 : // separate from skipObsoletePointsBackward() because that method does not
1410 : // inline. It can only be used during reverse iteration (i.e. i.row was
1411 : // decremented).
1412 : //
1413 : //gcassert:inline
1414 2 : func (i *DataBlockIter) atObsoletePointBackward() bool {
1415 2 : return i.row >= 0 && i.d.isObsolete.At(i.row)
1416 2 : }
1417 :
1418 2 : func (i *DataBlockIter) skipObsoletePointsBackward() {
1419 2 : if invariants.Enabled {
1420 2 : i.atObsoletePointCheck()
1421 2 : }
1422 2 : i.row = i.d.isObsolete.SeekUnsetBitLE(i.row)
1423 2 : i.nextObsoletePoint = i.row + 1
1424 : }
1425 :
1426 2 : func (i *DataBlockIter) atObsoletePointCheck() {
1427 2 : // We extract this code into a separate function to avoid getting a spurious
1428 2 : // error from GCAssert about At not being inlined because it is compiled out
1429 2 : // altogether in non-invariant builds.
1430 2 : if !i.transforms.HideObsoletePoints || !i.d.isObsolete.At(i.row) {
1431 0 : panic("expected obsolete point")
1432 : }
1433 : }
1434 :
1435 : // Error implements the base.InternalIterator interface. A DataBlockIter is
1436 : // infallible and always returns a nil error.
1437 2 : func (i *DataBlockIter) Error() error {
1438 2 : return nil // infallible
1439 2 : }
1440 :
1441 : // SetBounds implements the base.InternalIterator interface.
1442 0 : func (i *DataBlockIter) SetBounds(lower, upper []byte) {
1443 0 : // This should never be called as bounds are handled by sstable.Iterator.
1444 0 : panic("pebble: SetBounds unimplemented")
1445 : }
1446 :
1447 : // SetContext implements the base.InternalIterator interface.
1448 0 : func (i *DataBlockIter) SetContext(_ context.Context) {}
1449 :
1450 : var dataBlockTypeString string = fmt.Sprintf("%T", (*DataBlockIter)(nil))
1451 :
1452 : // String implements the base.InternalIterator interface.
1453 0 : func (i *DataBlockIter) String() string {
1454 0 : return dataBlockTypeString
1455 0 : }
1456 :
1457 : // DebugTree is part of the InternalIterator interface.
1458 0 : func (i *DataBlockIter) DebugTree(tp treeprinter.Node) {
1459 0 : tp.Childf("%T(%p)", i, i)
1460 0 : }
1461 :
1462 : // decodeRow decodes i.row into i.kv. If i.row is invalid, it returns nil.
1463 2 : func (i *DataBlockIter) decodeRow() *base.InternalKV {
1464 2 : switch {
1465 2 : case i.row < 0 || i.row > i.maxRow:
1466 2 : return nil
1467 2 : case i.kvRow == i.row:
1468 2 : // Already synthesized the kv at row.
1469 2 : return &i.kv
1470 2 : default:
1471 2 : // Inline decodeKey().
1472 2 : if i.noTransforms {
1473 2 : // Fast path.
1474 2 : i.kv.K = base.InternalKey{
1475 2 : UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row),
1476 2 : Trailer: base.InternalKeyTrailer(i.d.trailers.At(i.row)),
1477 2 : }
1478 2 : } else {
1479 2 : if i.transforms.HasSyntheticSuffix() {
1480 2 : i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix(
1481 2 : &i.keyIter, i.transforms.SyntheticSuffix(), i.kvRow, i.row,
1482 2 : )
1483 2 : } else {
1484 2 : i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row)
1485 2 : }
1486 2 : i.kv.K.Trailer = base.InternalKeyTrailer(i.d.trailers.At(i.row))
1487 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1488 2 : i.kv.K.SetSeqNum(base.SeqNum(n))
1489 2 : }
1490 : }
1491 : // Inline i.d.values.At(row).
1492 2 : startOffset := i.d.values.offsets.At(i.row)
1493 2 : v := unsafe.Slice((*byte)(i.d.values.ptr(startOffset)), i.d.values.offsets.At(i.row+1)-startOffset)
1494 2 : if i.d.isValueExternal.At(i.row) {
1495 2 : i.kv.V = i.getLazyValuer.GetLazyValueForPrefixAndValueHandle(v)
1496 2 : } else {
1497 2 : i.kv.V = base.MakeInPlaceValue(v)
1498 2 : }
1499 2 : i.kvRow = i.row
1500 2 : return &i.kv
1501 : }
1502 : }
1503 :
1504 : // decodeKey updates i.kv.K to the key for i.row (which must be valid).
1505 : // This function does not inline, so we copy its code verbatim. For any updates
1506 : // to this code, all code preceded by "Inline decodeKey" must be updated.
1507 0 : func (i *DataBlockIter) decodeKey() {
1508 0 : if i.noTransforms {
1509 0 : // Fast path.
1510 0 : i.kv.K = base.InternalKey{
1511 0 : UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row),
1512 0 : Trailer: base.InternalKeyTrailer(i.d.trailers.At(i.row)),
1513 0 : }
1514 0 : } else {
1515 0 : if i.transforms.HasSyntheticSuffix() {
1516 0 : i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix(
1517 0 : &i.keyIter, i.transforms.SyntheticSuffix(), i.kvRow, i.row,
1518 0 : )
1519 0 : } else {
1520 0 : i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row)
1521 0 : }
1522 0 : i.kv.K.Trailer = base.InternalKeyTrailer(i.d.trailers.At(i.row))
1523 0 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1524 0 : i.kv.K.SetSeqNum(base.SeqNum(n))
1525 0 : }
1526 : }
1527 : }
1528 :
1529 : var _ = (*DataBlockIter).decodeKey
1530 :
1531 : // Close implements the base.InternalIterator interface.
1532 2 : func (i *DataBlockIter) Close() error {
1533 2 : i.keySeeker = nil
1534 2 : i.d = nil
1535 2 : i.h.Release()
1536 2 : i.h = block.BufferHandle{}
1537 2 : i.transforms = block.IterTransforms{}
1538 2 : i.kv = base.InternalKV{}
1539 2 : return nil
1540 2 : }
|