LCOV - code coverage report
Current view: top level - pebble/sstable/colblk - data_block.go (source / functions) Hit Total Coverage
Test: 2024-12-28 08:18Z 87a5141c - tests + meta.lcov Lines: 731 824 88.7 %
Date: 2024-12-28 08:19:46 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package colblk
       6             : 
       7             : import (
       8             :         "bytes"
       9             :         "cmp"
      10             :         "context"
      11             :         "encoding/binary"
      12             :         "fmt"
      13             :         "io"
      14             :         "math"
      15             :         "unsafe"
      16             : 
      17             :         "github.com/cockroachdb/crlib/crbytes"
      18             :         "github.com/cockroachdb/errors"
      19             :         "github.com/cockroachdb/pebble/internal/base"
      20             :         "github.com/cockroachdb/pebble/internal/binfmt"
      21             :         "github.com/cockroachdb/pebble/internal/bytealloc"
      22             :         "github.com/cockroachdb/pebble/internal/invariants"
      23             :         "github.com/cockroachdb/pebble/internal/treeprinter"
      24             :         "github.com/cockroachdb/pebble/sstable/block"
      25             : )
      26             : 
      27             : // KeySchema defines the schema of a user key, as defined by the user's
      28             : // application.
      29             : //
      30             : // TODO(jackson): Consider making this KVSchema. It feels like there's an
      31             : // opportunity to generalize the ShortAttribute so that when a value is stored
      32             : // out-of-band, the DataBlockEncoder calls user-provided code to store the short
      33             : // attributes inlined within the data block. For inlined-values, the
      34             : // user-defined value columns would be implicitly null.
      35             : type KeySchema struct {
      36             :         Name string
      37             :         // KeySchema implementations can optionally make use a fixed-sized custom
      38             :         // header inside each block.
      39             :         HeaderSize   uint32
      40             :         ColumnTypes  []DataType
      41             :         NewKeyWriter func() KeyWriter
      42             : 
      43             :         // InitKeySeekerMetadata initializes the provided KeySeekerMetadata. This
      44             :         // happens once when a block enters the block cache and can be used to save
      45             :         // computation in NewKeySeeker.
      46             :         InitKeySeekerMetadata func(meta *KeySeekerMetadata, d *DataBlockDecoder)
      47             : 
      48             :         // KeySeeker returns a KeySeeker using metadata that was previously
      49             :         // initialized with InitKeySeekerMetadata. The returned key seeker can be an
      50             :         // unsafe cast of the metadata itself.
      51             :         KeySeeker func(meta *KeySeekerMetadata) KeySeeker
      52             : }
      53             : 
      54             : // KeySeekerMetadata is an in-memory buffer that stores metadata for a block. It
      55             : // is allocated together with the buffer storing the block and is initialized
      56             : // once when the block is read from disk. It is always 8-byte aligned.
      57             : //
      58             : // Portions of this buffer can be cast to the structures we need (through
      59             : // unsafe.Pointer), but note that any pointers in these structures will be
      60             : // invisible to the GC. Pointers to the block's data buffer are ok, since the
      61             : // metadata and the data have the same lifetime (sharing the underlying
      62             : // allocation).
      63             : //
      64             : // KeySeekerMetadata is stored inside block.Metadata.
      65             : type KeySeekerMetadata [KeySeekerMetadataSize]byte
      66             : 
      67             : // KeySeekerMetadataSize is chosen to fit the CockroachDB key seeker
      68             : // implementation.
      69             : const KeySeekerMetadataSize = 176
      70             : 
      71             : // A KeyWriter maintains ColumnWriters for a data block for writing user keys
      72             : // into the database-specific key schema. Users may define their own key schema
      73             : // and implement KeyWriter to encode keys into custom columns that are aware of
      74             : // the structure of user keys.
      75             : type KeyWriter interface {
      76             :         ColumnWriter
      77             :         // ComparePrev compares the provided user to the previously-written user
      78             :         // key. The returned KeyComparison's UserKeyComparison field is equivalent
      79             :         // to Compare(key, prevKey) where prevKey is the last key passed to
      80             :         // WriteKey.
      81             :         //
      82             :         // If no key has been written yet, ComparePrev returns a KeyComparison with
      83             :         // PrefixLen set and UserKeyComparison=1.
      84             :         ComparePrev(key []byte) KeyComparison
      85             :         // WriteKey writes a user key into the KeyWriter's columns. The
      86             :         // keyPrefixLenSharedWithPrev parameter takes the number of bytes prefixing
      87             :         // the key's logical prefix (as defined by (base.Comparer).Split) that the
      88             :         // previously-written key's prefix shares.
      89             :         //
      90             :         // WriteKey is guaranteed to be called sequentially with increasing row
      91             :         // indexes, beginning at zero.
      92             :         WriteKey(row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32)
      93             :         // MaterializeKey appends the zero-indexed row'th key written to dst,
      94             :         // returning the result.
      95             :         MaterializeKey(dst []byte, row int) []byte
      96             :         // FinishHeader serializes an internal header of exactly KeySchema.HeaderSize bytes.
      97             :         FinishHeader(dst []byte)
      98             : }
      99             : 
     100             : // AssertKeyCompare compares two keys using the provided comparer, ensuring the
     101             : // provided KeyComparison accurately describing the result. Panics if the
     102             : // assertion does not hold.
     103           2 : func AssertKeyCompare(comparer *base.Comparer, a, b []byte, kcmp KeyComparison) {
     104           2 :         bi := comparer.Split(b)
     105           2 :         var recomputed KeyComparison
     106           2 :         recomputed.PrefixLen = int32(comparer.Split(a))
     107           2 :         recomputed.CommonPrefixLen = int32(crbytes.CommonPrefix(a[:recomputed.PrefixLen], b[:bi]))
     108           2 :         recomputed.UserKeyComparison = int32(comparer.Compare(a, b))
     109           2 :         if recomputed.PrefixEqual() != bytes.Equal(a[:recomputed.PrefixLen], b[:bi]) {
     110           0 :                 panic(errors.AssertionFailedf("PrefixEqual()=%t doesn't hold: %q, %q", kcmp.PrefixEqual(), a, b))
     111             :         }
     112           2 :         if recomputed != kcmp {
     113           0 :                 panic(errors.AssertionFailedf("KeyComparison of (%q, %q) = %s, ComparePrev gave %s",
     114           0 :                         a, b, recomputed, kcmp))
     115             :         }
     116             : }
     117             : 
     118             : // KeyComparison holds information about a key and its comparison to another a
     119             : // key.
     120             : type KeyComparison struct {
     121             :         // PrefixLen is the length of the prefix of the key. It's the outcome of
     122             :         // calling base.Split on the key.
     123             :         PrefixLen int32
     124             :         // CommonPrefixLen is the length of the physical (byte-wise) prefix of the
     125             :         // logical prefix that is shared with the other key. For example, for
     126             :         // "apple@1" and "applied@3" the value is 4 (the length of "appl"). For
     127             :         // "apple@1" and "apple@10" the value is 5 (the length of "apple"), because
     128             :         // the shared bytes within the suffix are not included.
     129             :         CommonPrefixLen int32
     130             :         // UserKeyComparison is the comparison of the user keys of the two keys.
     131             :         // Should be equivalent to
     132             :         //
     133             :         //   Compare(key, otherKey)
     134             :         UserKeyComparison int32
     135             : }
     136             : 
     137             : // String returns a string representation of the KeyComparison.
     138           0 : func (kcmp KeyComparison) String() string {
     139           0 :         return fmt.Sprintf("(prefix={%d,common=%d} cmp=%d)",
     140           0 :                 kcmp.PrefixLen, kcmp.CommonPrefixLen, kcmp.UserKeyComparison)
     141           0 : }
     142             : 
     143             : // PrefixEqual returns true if the key comparison determined that the keys have
     144             : // equal prefixes.
     145           2 : func (kcmp KeyComparison) PrefixEqual() bool { return kcmp.PrefixLen == kcmp.CommonPrefixLen }
     146             : 
     147             : // KeySeeker iterates over the keys in a columnar data block.
     148             : //
     149             : // Users of Pebble who define their own key schema must implement KeySeeker to
     150             : // seek over their decomposed keys.
     151             : //
     152             : // KeySeeker implementations must be safe for concurrent use by multiple
     153             : // goroutines. In practice, multiple DataBlockIterators may use the same
     154             : // KeySeeker.
     155             : type KeySeeker interface {
     156             :         // IsLowerBound returns true if all keys in the data block (after suffix
     157             :         // replacement if syntheticSuffix is not empty) are >= the given key. If the
     158             :         // data block contains no keys, returns true.
     159             :         IsLowerBound(k []byte, syntheticSuffix []byte) bool
     160             :         // SeekGE returns the index of the first row with a key greater than or equal
     161             :         // to [key], and whether that row has the same prefix as [key].
     162             :         //
     163             :         // If the caller externally knows a bound on where the key is located, it
     164             :         // may indicate it through [boundRow] and [searchDir]. A [searchDir] value
     165             :         // of -1 indicates that the sought row must be at an index ≤ [boundRow]. A
     166             :         // [searchDir] value of +1 indicates that the sought row must be at an index
     167             :         // ≥ [boundRow]. Implementations may use this information to constrain the
     168             :         // search. See (base.SeekGEFlags).TrySeekUsingNext for context on when this
     169             :         // may be set in practice.
     170             :         SeekGE(key []byte, boundRow int, searchDir int8) (row int, equalPrefix bool)
     171             :         // MaterializeUserKey materializes the user key of the specified row,
     172             :         // returning a slice of the materialized user key.
     173             :         //
     174             :         // The provided keyIter must have a buffer large enough to hold the key.
     175             :         //
     176             :         // The prevRow parameter is the row MaterializeUserKey was last invoked with
     177             :         // (or a negative number if not applicable). Implementations may take
     178             :         // advantage of that knowledge to reduce work.
     179             :         MaterializeUserKey(keyIter *PrefixBytesIter, prevRow, row int) []byte
     180             :         // MaterializeUserKeyWithSyntheticSuffix is a variant of MaterializeUserKey
     181             :         // where the suffix is replaced.
     182             :         //
     183             :         // The provided keyIter must have a buffer large enough to hold the key after
     184             :         // suffix replacement.
     185             :         //
     186             :         // The prevRow parameter is the row MaterializeUserKeyWithSyntheticSuffix was
     187             :         // last invoked with (or a negative number if not applicable). Implementations
     188             :         // may take advantage of that knowledge to reduce work.
     189             :         MaterializeUserKeyWithSyntheticSuffix(
     190             :                 keyIter *PrefixBytesIter, syntheticSuffix []byte, prevRow, row int,
     191             :         ) []byte
     192             : }
     193             : 
     194             : const (
     195             :         defaultKeySchemaColumnPrefix int = iota
     196             :         defaultKeySchemaColumnSuffix
     197             : )
     198             : 
     199             : var defaultSchemaColumnTypes = []DataType{
     200             :         defaultKeySchemaColumnPrefix: DataTypePrefixBytes,
     201             :         defaultKeySchemaColumnSuffix: DataTypeBytes,
     202             : }
     203             : 
     204             : // DefaultKeySchema returns the default key schema that decomposes a user key
     205             : // into its prefix and suffix. Prefixes are sorted in lexicographical order.
     206           2 : func DefaultKeySchema(comparer *base.Comparer, prefixBundleSize int) KeySchema {
     207           2 :         return KeySchema{
     208           2 :                 Name:        fmt.Sprintf("DefaultKeySchema(%s,%d)", comparer.Name, prefixBundleSize),
     209           2 :                 HeaderSize:  0,
     210           2 :                 ColumnTypes: defaultSchemaColumnTypes,
     211           2 :                 NewKeyWriter: func() KeyWriter {
     212           2 :                         kw := &defaultKeyWriter{comparer: comparer}
     213           2 :                         kw.prefixes.Init(prefixBundleSize)
     214           2 :                         kw.suffixes.Init()
     215           2 :                         return kw
     216           2 :                 },
     217           2 :                 InitKeySeekerMetadata: func(meta *KeySeekerMetadata, d *DataBlockDecoder) {
     218           2 :                         ks := (*defaultKeySeeker)(unsafe.Pointer(&meta[0]))
     219           2 :                         ks.comparer = comparer
     220           2 :                         ks.init(d)
     221           2 :                 },
     222           2 :                 KeySeeker: func(meta *KeySeekerMetadata) KeySeeker {
     223           2 :                         ks := (*defaultKeySeeker)(unsafe.Pointer(&meta[0]))
     224           2 :                         return ks
     225           2 :                 },
     226             :         }
     227             : }
     228             : 
     229             : // Assert that *defaultKeyWriter implements the KeyWriter interface.
     230             : var _ KeyWriter = (*defaultKeyWriter)(nil)
     231             : 
     232             : type defaultKeyWriter struct {
     233             :         comparer *base.Comparer
     234             :         prefixes PrefixBytesBuilder
     235             :         suffixes RawBytesBuilder
     236             : }
     237             : 
     238           2 : func (w *defaultKeyWriter) ComparePrev(key []byte) KeyComparison {
     239           2 :         var cmpv KeyComparison
     240           2 :         cmpv.PrefixLen = int32(w.comparer.Split(key))
     241           2 :         if w.prefixes.nKeys == 0 {
     242           2 :                 // The first key has no previous key to compare to.
     243           2 :                 cmpv.UserKeyComparison = 1
     244           2 :                 return cmpv
     245           2 :         }
     246           2 :         lp := w.prefixes.UnsafeGet(w.prefixes.nKeys - 1)
     247           2 :         cmpv.CommonPrefixLen = int32(crbytes.CommonPrefix(lp, key[:cmpv.PrefixLen]))
     248           2 : 
     249           2 :         // Keys are written in order and prefixes must be sorted lexicograpgically,
     250           2 :         // so CommonPrefixLen == PrefixLen implies that the keys share the same
     251           2 :         // logical prefix. (If the previous key had a prefix longer than
     252           2 :         // CommonPrefixLen, it would sort after [key].)
     253           2 :         if cmpv.CommonPrefixLen == cmpv.PrefixLen {
     254           2 :                 // The keys share the same MVCC prefix. Compare the suffixes.
     255           2 :                 cmpv.UserKeyComparison = int32(w.comparer.ComparePointSuffixes(key[cmpv.PrefixLen:],
     256           2 :                         w.suffixes.UnsafeGet(w.suffixes.rows-1)))
     257           2 :                 if invariants.Enabled {
     258           2 :                         if !w.comparer.Equal(lp, key[:cmpv.PrefixLen]) {
     259           0 :                                 panic(errors.AssertionFailedf("keys have different logical prefixes: %q != %q", lp, key[:cmpv.PrefixLen]))
     260             :                         }
     261             :                 }
     262           2 :                 return cmpv
     263             :         }
     264             : 
     265             :         // The keys have different MVCC prefixes. We haven't determined which is
     266             :         // greater, but we know the index at which they diverge. The base.Comparer
     267             :         // contract dictates that prefixes must be lexicographically ordered.
     268           2 :         if len(lp) == int(cmpv.CommonPrefixLen) {
     269           2 :                 // cmpv.PrefixLen > cmpv.PrefixLenShared; key is greater.
     270           2 :                 cmpv.UserKeyComparison = +1
     271           2 :         } else {
     272           2 :                 // Both keys have at least 1 additional byte at which they diverge.
     273           2 :                 // Compare the diverging byte.
     274           2 :                 cmpv.UserKeyComparison = int32(cmp.Compare(key[cmpv.CommonPrefixLen], lp[cmpv.CommonPrefixLen]))
     275           2 :         }
     276           2 :         if invariants.Enabled {
     277           2 :                 // In this case we've determined that the keys have different prefixes,
     278           2 :                 // so the UserKeyComparison should be equal to the result of comparing
     279           2 :                 // the prefixes and nonzero.
     280           2 :                 if cmpv.UserKeyComparison == 0 {
     281           0 :                         panic(errors.AssertionFailedf("user keys should not be equal: %q+%q, %q",
     282           0 :                                 lp, w.suffixes.UnsafeGet(w.suffixes.rows-1), key))
     283             :                 }
     284           2 :                 if v := w.comparer.Compare(key, lp); v != int(cmpv.UserKeyComparison) {
     285           0 :                         panic(errors.AssertionFailedf("user key comparison mismatch: Compare(%q, %q) = %d ≠ %d",
     286           0 :                                 key, lp, v, cmpv.UserKeyComparison))
     287             :                 }
     288             :         }
     289           2 :         return cmpv
     290             : }
     291             : 
     292             : func (w *defaultKeyWriter) WriteKey(
     293             :         row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32,
     294           2 : ) {
     295           2 :         w.prefixes.Put(key[:keyPrefixLen], int(keyPrefixLenSharedWithPrev))
     296           2 :         w.suffixes.Put(key[keyPrefixLen:])
     297           2 : }
     298             : 
     299           2 : func (w *defaultKeyWriter) MaterializeKey(dst []byte, row int) []byte {
     300           2 :         dst = append(dst, w.prefixes.UnsafeGet(row)...)
     301           2 :         dst = append(dst, w.suffixes.UnsafeGet(row)...)
     302           2 :         return dst
     303           2 : }
     304             : 
     305           2 : func (w *defaultKeyWriter) NumColumns() int {
     306           2 :         return 2
     307           2 : }
     308             : 
     309           2 : func (w *defaultKeyWriter) DataType(col int) DataType {
     310           2 :         return defaultSchemaColumnTypes[col]
     311           2 : }
     312             : 
     313           2 : func (w *defaultKeyWriter) Reset() {
     314           2 :         w.prefixes.Reset()
     315           2 :         w.suffixes.Reset()
     316           2 : }
     317             : 
     318           1 : func (w *defaultKeyWriter) WriteDebug(dst io.Writer, rows int) {
     319           1 :         fmt.Fprint(dst, "0: prefixes:       ")
     320           1 :         w.prefixes.WriteDebug(dst, rows)
     321           1 :         fmt.Fprintln(dst)
     322           1 :         fmt.Fprint(dst, "1: suffixes:       ")
     323           1 :         w.suffixes.WriteDebug(dst, rows)
     324           1 :         fmt.Fprintln(dst)
     325           1 : }
     326             : 
     327           2 : func (w *defaultKeyWriter) Size(rows int, offset uint32) uint32 {
     328           2 :         offset = w.prefixes.Size(rows, offset)
     329           2 :         offset = w.suffixes.Size(rows, offset)
     330           2 :         return offset
     331           2 : }
     332             : 
     333           2 : func (w *defaultKeyWriter) FinishHeader([]byte) {}
     334             : 
     335           2 : func (w *defaultKeyWriter) Finish(col, rows int, offset uint32, buf []byte) (nextOffset uint32) {
     336           2 :         switch col {
     337           2 :         case defaultKeySchemaColumnPrefix:
     338           2 :                 return w.prefixes.Finish(0, rows, offset, buf)
     339           2 :         case defaultKeySchemaColumnSuffix:
     340           2 :                 return w.suffixes.Finish(0, rows, offset, buf)
     341           0 :         default:
     342           0 :                 panic(fmt.Sprintf("unknown default key column: %d", col))
     343             :         }
     344             : }
     345             : 
     346             : // Assert that *defaultKeySeeker implements KeySeeker.
     347             : var _ KeySeeker = (*defaultKeySeeker)(nil)
     348             : 
     349             : // Assert that the metadata fits the defalut key seeker.
     350             : var _ uint = KeySeekerMetadataSize - uint(unsafe.Sizeof(defaultKeySeeker{}))
     351             : 
     352             : type defaultKeySeeker struct {
     353             :         comparer     *base.Comparer
     354             :         decoder      *DataBlockDecoder
     355             :         prefixes     PrefixBytes
     356             :         suffixes     RawBytes
     357             :         sharedPrefix []byte
     358             : }
     359             : 
     360           2 : func (ks *defaultKeySeeker) init(d *DataBlockDecoder) {
     361           2 :         ks.decoder = d
     362           2 :         ks.prefixes = d.d.PrefixBytes(defaultKeySchemaColumnPrefix)
     363           2 :         ks.suffixes = d.d.RawBytes(defaultKeySchemaColumnSuffix)
     364           2 :         ks.sharedPrefix = ks.prefixes.SharedPrefix()
     365           2 : }
     366             : 
     367             : // IsLowerBound is part of the KeySeeker interface.
     368           2 : func (ks *defaultKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) bool {
     369           2 :         si := ks.comparer.Split(k)
     370           2 :         if v := ks.comparer.Compare(ks.prefixes.UnsafeFirstSlice(), k[:si]); v != 0 {
     371           2 :                 return v > 0
     372           2 :         }
     373           2 :         suffix := syntheticSuffix
     374           2 :         if len(suffix) == 0 {
     375           2 :                 suffix = ks.suffixes.At(0)
     376           2 :         }
     377           2 :         return ks.comparer.Compare(suffix, k[si:]) >= 0
     378             : }
     379             : 
     380             : // SeekGE is part of the KeySeeker interface.
     381             : func (ks *defaultKeySeeker) SeekGE(
     382             :         key []byte, boundRow int, searchDir int8,
     383           2 : ) (row int, equalPrefix bool) {
     384           2 :         si := ks.comparer.Split(key)
     385           2 :         row, eq := ks.prefixes.Search(key[:si])
     386           2 :         if eq {
     387           2 :                 return ks.seekGEOnSuffix(row, key[si:]), true
     388           2 :         }
     389           2 :         return row, false
     390             : }
     391             : 
     392             : // seekGEOnSuffix is a helper function for SeekGE when a seek key's prefix
     393             : // exactly matches a row. seekGEOnSuffix finds the first row at index or later
     394             : // with the same prefix as index and a suffix greater than or equal to [suffix],
     395             : // or if no such row exists, the next row with a different prefix.
     396           2 : func (ks *defaultKeySeeker) seekGEOnSuffix(index int, suffix []byte) (row int) {
     397           2 :         // The search key's prefix exactly matches the prefix of the row at index.
     398           2 :         // If the row at index has a suffix >= [suffix], then return the row.
     399           2 :         if ks.comparer.ComparePointSuffixes(ks.suffixes.At(index), suffix) >= 0 {
     400           2 :                 return index
     401           2 :         }
     402             :         // Otherwise, the row at [index] sorts before the search key and we need to
     403             :         // search forward. Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)].
     404             :         //
     405             :         // Define f(l-1) == false and f(u) == true.
     406             :         // Invariant: f(l-1) == false, f(u) == true.
     407           2 :         l := index + 1
     408           2 :         u := ks.decoder.prefixChanged.SeekSetBitGE(index + 1)
     409           2 :         for l < u {
     410           2 :                 h := int(uint(l+u) >> 1) // avoid overflow when computing h
     411           2 :                 // l ≤ h < u
     412           2 :                 if ks.comparer.ComparePointSuffixes(ks.suffixes.At(h), suffix) >= 0 {
     413           2 :                         u = h // preserves f(u) == true
     414           2 :                 } else {
     415           2 :                         l = h + 1 // preserves f(l-1) == false
     416           2 :                 }
     417             :         }
     418           2 :         return l
     419             : }
     420             : 
     421             : // MaterializeUserKey is part of the colblk.KeySeeker interface.
     422           2 : func (ks *defaultKeySeeker) MaterializeUserKey(keyIter *PrefixBytesIter, prevRow, row int) []byte {
     423           2 :         if row == prevRow+1 && prevRow >= 0 {
     424           2 :                 ks.prefixes.SetNext(keyIter)
     425           2 :         } else {
     426           2 :                 ks.prefixes.SetAt(keyIter, row)
     427           2 :         }
     428           2 :         suffix := ks.suffixes.At(row)
     429           2 :         res := keyIter.Buf[:len(keyIter.Buf)+len(suffix)]
     430           2 :         memmove(
     431           2 :                 unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(keyIter.Buf)))+uintptr(len(keyIter.Buf))),
     432           2 :                 unsafe.Pointer(unsafe.SliceData(suffix)),
     433           2 :                 uintptr(len(suffix)),
     434           2 :         )
     435           2 :         return res
     436             : }
     437             : 
     438             : // MaterializeUserKeyWithSyntheticSuffix is part of the colblk.KeySeeker interface.
     439             : func (ks *defaultKeySeeker) MaterializeUserKeyWithSyntheticSuffix(
     440             :         keyIter *PrefixBytesIter, suffix []byte, prevRow, row int,
     441           2 : ) []byte {
     442           2 :         if row == prevRow+1 && prevRow >= 0 {
     443           2 :                 ks.prefixes.SetNext(keyIter)
     444           2 :         } else {
     445           2 :                 ks.prefixes.SetAt(keyIter, row)
     446           2 :         }
     447           2 :         res := keyIter.Buf[:len(keyIter.Buf)+len(suffix)]
     448           2 :         memmove(
     449           2 :                 unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(keyIter.Buf)))+uintptr(len(keyIter.Buf))),
     450           2 :                 unsafe.Pointer(unsafe.SliceData(suffix)),
     451           2 :                 uintptr(len(suffix)),
     452           2 :         )
     453           2 :         return res
     454             : }
     455             : 
     456             : // DataBlockEncoder encodes columnar data blocks using a user-defined schema.
     457             : type DataBlockEncoder struct {
     458             :         Schema    *KeySchema
     459             :         KeyWriter KeyWriter
     460             :         // trailers is the column writer for InternalKey uint64 trailers.
     461             :         trailers UintBuilder
     462             :         // prefixSame is the column writer for the prefix-changed bitmap that
     463             :         // indicates when a new key prefix begins. During block building, the bitmap
     464             :         // represents when the prefix stays the same, which is expected to be a
     465             :         // rarer case. Before Finish-ing the column, we invert the bitmap.
     466             :         prefixSame BitmapBuilder
     467             :         // values is the column writer for values. Iff the isValueExternal bitmap
     468             :         // indicates a value is external, the value is prefixed with a ValuePrefix
     469             :         // byte.
     470             :         values RawBytesBuilder
     471             :         // isValueExternal is the column writer for the is-value-external bitmap
     472             :         // that indicates when a value is stored out-of-band in a value block.
     473             :         isValueExternal BitmapBuilder
     474             :         // isObsolete is the column writer for the is-obsolete bitmap that indicates
     475             :         // when a key is known to be obsolete/non-live (i.e., shadowed by another
     476             :         // identical point key or range deletion with a higher sequence number).
     477             :         isObsolete BitmapBuilder
     478             : 
     479             :         enc              blockEncoder
     480             :         rows             int
     481             :         maximumKeyLength int
     482             :         valuePrefixTmp   [1]byte
     483             :         lastUserKeyTmp   []byte
     484             : }
     485             : 
     486             : const (
     487             :         dataBlockColumnTrailer = iota
     488             :         dataBlockColumnPrefixChanged
     489             :         dataBlockColumnValue
     490             :         dataBlockColumnIsValueExternal
     491             :         dataBlockColumnIsObsolete
     492             :         dataBlockColumnMax
     493             : )
     494             : 
     495             : // The data block header is a 4-byte uint32 encoding the maximum length of a key
     496             : // contained within the block. This is used by iterators to avoid the need to
     497             : // grow key buffers while iterating over the block, ensuring that the key buffer
     498             : // is always sufficiently large.
     499             : // This is serialized immediately after the KeySchema specific header.
     500             : const dataBlockCustomHeaderSize = 4
     501             : 
     502             : // Init initializes the data block writer.
     503           2 : func (w *DataBlockEncoder) Init(schema *KeySchema) {
     504           2 :         w.Schema = schema
     505           2 :         w.KeyWriter = schema.NewKeyWriter()
     506           2 :         w.trailers.Init()
     507           2 :         w.prefixSame.Reset()
     508           2 :         w.values.Init()
     509           2 :         w.isValueExternal.Reset()
     510           2 :         w.isObsolete.Reset()
     511           2 :         w.rows = 0
     512           2 :         w.maximumKeyLength = 0
     513           2 :         w.lastUserKeyTmp = w.lastUserKeyTmp[:0]
     514           2 :         w.enc.reset()
     515           2 : }
     516             : 
     517             : // Reset resets the data block writer to its initial state, retaining buffers.
     518           2 : func (w *DataBlockEncoder) Reset() {
     519           2 :         w.KeyWriter.Reset()
     520           2 :         w.trailers.Reset()
     521           2 :         w.prefixSame.Reset()
     522           2 :         w.values.Reset()
     523           2 :         w.isValueExternal.Reset()
     524           2 :         w.isObsolete.Reset()
     525           2 :         w.rows = 0
     526           2 :         w.maximumKeyLength = 0
     527           2 :         w.lastUserKeyTmp = w.lastUserKeyTmp[:0]
     528           2 :         w.enc.reset()
     529           2 : }
     530             : 
     531             : // String outputs a human-readable summary of internal DataBlockEncoder state.
     532           1 : func (w *DataBlockEncoder) String() string {
     533           1 :         var buf bytes.Buffer
     534           1 :         size := uint32(w.Size())
     535           1 :         fmt.Fprintf(&buf, "size=%d:\n", size)
     536           1 :         w.KeyWriter.WriteDebug(&buf, w.rows)
     537           1 : 
     538           1 :         fmt.Fprintf(&buf, "%d: trailers:       ", len(w.Schema.ColumnTypes)+dataBlockColumnTrailer)
     539           1 :         w.trailers.WriteDebug(&buf, w.rows)
     540           1 :         fmt.Fprintln(&buf)
     541           1 : 
     542           1 :         fmt.Fprintf(&buf, "%d: prefix changed: ", len(w.Schema.ColumnTypes)+dataBlockColumnPrefixChanged)
     543           1 :         w.prefixSame.WriteDebug(&buf, w.rows)
     544           1 :         fmt.Fprintln(&buf)
     545           1 : 
     546           1 :         fmt.Fprintf(&buf, "%d: values:         ", len(w.Schema.ColumnTypes)+dataBlockColumnValue)
     547           1 :         w.values.WriteDebug(&buf, w.rows)
     548           1 :         fmt.Fprintln(&buf)
     549           1 : 
     550           1 :         fmt.Fprintf(&buf, "%d: is-value-ext:   ", len(w.Schema.ColumnTypes)+dataBlockColumnIsValueExternal)
     551           1 :         w.isValueExternal.WriteDebug(&buf, w.rows)
     552           1 :         fmt.Fprintln(&buf)
     553           1 : 
     554           1 :         fmt.Fprintf(&buf, "%d: is-obsolete:    ", len(w.Schema.ColumnTypes)+dataBlockColumnIsObsolete)
     555           1 :         w.isObsolete.WriteDebug(&buf, w.rows)
     556           1 :         fmt.Fprintln(&buf)
     557           1 : 
     558           1 :         return buf.String()
     559           1 : }
     560             : 
     561             : // Add adds the provided key to the data block. Keys must be added in order. The
     562             : // caller must supply a KeyComparison containing the comparison of the key to
     563             : // the previously-added key, obtainable through
     564             : //
     565             : //      KeyWriter.ComparePrev(ikey.UserKey)
     566             : //
     567             : // The caller is required to pass this in because in expected use cases, the
     568             : // caller will also require the same information.
     569             : func (w *DataBlockEncoder) Add(
     570             :         ikey base.InternalKey,
     571             :         value []byte,
     572             :         valuePrefix block.ValuePrefix,
     573             :         kcmp KeyComparison,
     574             :         isObsolete bool,
     575           2 : ) {
     576           2 :         w.KeyWriter.WriteKey(w.rows, ikey.UserKey, kcmp.PrefixLen, kcmp.CommonPrefixLen)
     577           2 :         if kcmp.PrefixEqual() {
     578           2 :                 w.prefixSame.Set(w.rows)
     579           2 :         }
     580           2 :         if isObsolete {
     581           2 :                 w.isObsolete.Set(w.rows)
     582           2 :         }
     583           2 :         w.trailers.Set(w.rows, uint64(ikey.Trailer))
     584           2 :         if valuePrefix.IsValueHandle() {
     585           2 :                 w.isValueExternal.Set(w.rows)
     586           2 :                 // Write the value with the value prefix byte preceding the value.
     587           2 :                 w.valuePrefixTmp[0] = byte(valuePrefix)
     588           2 :                 w.values.PutConcat(w.valuePrefixTmp[:], value)
     589           2 :         } else {
     590           2 :                 // Elide the value prefix. Readers will examine the isValueExternal
     591           2 :                 // bitmap and know there is no value prefix byte if !isValueExternal.
     592           2 :                 w.values.Put(value)
     593           2 :         }
     594           2 :         if len(ikey.UserKey) > int(w.maximumKeyLength) {
     595           2 :                 w.maximumKeyLength = len(ikey.UserKey)
     596           2 :         }
     597           2 :         w.rows++
     598             : }
     599             : 
     600             : // Rows returns the number of rows in the current pending data block.
     601           2 : func (w *DataBlockEncoder) Rows() int {
     602           2 :         return w.rows
     603           2 : }
     604             : 
     605             : // Size returns the size of the current pending data block.
     606           2 : func (w *DataBlockEncoder) Size() int {
     607           2 :         off := blockHeaderSize(len(w.Schema.ColumnTypes)+dataBlockColumnMax, dataBlockCustomHeaderSize+w.Schema.HeaderSize)
     608           2 :         off = w.KeyWriter.Size(w.rows, off)
     609           2 :         off = w.trailers.Size(w.rows, off)
     610           2 :         off = w.prefixSame.InvertedSize(w.rows, off)
     611           2 :         off = w.values.Size(w.rows, off)
     612           2 :         off = w.isValueExternal.Size(w.rows, off)
     613           2 :         off = w.isObsolete.Size(w.rows, off)
     614           2 :         off++ // trailer padding byte
     615           2 :         return int(off)
     616           2 : }
     617             : 
     618             : // MaterializeLastUserKey materializes the last added user key.
     619           1 : func (w *DataBlockEncoder) MaterializeLastUserKey(appendTo []byte) []byte {
     620           1 :         return w.KeyWriter.MaterializeKey(appendTo, w.rows-1)
     621           1 : }
     622             : 
     623             : // Finish serializes the pending data block, including the first [rows] rows.
     624             : // The value of [rows] must be Rows() or Rows()-1. The provided size must be the
     625             : // size of the data block with the provided row count (i.e., the return value of
     626             : // [Size] when DataBlockEncoder.Rows() = [rows]).
     627             : //
     628             : // Finish the returns the serialized, uncompressed data block and the
     629             : // InternalKey of the last key contained within the data block. The memory of
     630             : // the lastKey's UserKey is owned by the DataBlockEncoder. The caller must
     631             : // copy it if they require it to outlive a Reset of the writer.
     632           2 : func (w *DataBlockEncoder) Finish(rows, size int) (finished []byte, lastKey base.InternalKey) {
     633           2 :         if invariants.Enabled && rows != w.rows && rows != w.rows-1 {
     634           0 :                 panic(errors.AssertionFailedf("data block has %d rows; asked to finish %d", w.rows, rows))
     635             :         }
     636             : 
     637           2 :         cols := len(w.Schema.ColumnTypes) + dataBlockColumnMax
     638           2 :         h := Header{
     639           2 :                 Version: Version1,
     640           2 :                 Columns: uint16(cols),
     641           2 :                 Rows:    uint32(rows),
     642           2 :         }
     643           2 : 
     644           2 :         // Invert the prefix-same bitmap before writing it out, because we want it
     645           2 :         // to represent when the prefix changes.
     646           2 :         w.prefixSame.Invert(rows)
     647           2 : 
     648           2 :         w.enc.init(size, h, dataBlockCustomHeaderSize+w.Schema.HeaderSize)
     649           2 : 
     650           2 :         // Write the key schema custom header.
     651           2 :         w.KeyWriter.FinishHeader(w.enc.data()[:w.Schema.HeaderSize])
     652           2 :         // Write the max key length in the data block custom header.
     653           2 :         binary.LittleEndian.PutUint32(w.enc.data()[w.Schema.HeaderSize:w.Schema.HeaderSize+dataBlockCustomHeaderSize], uint32(w.maximumKeyLength))
     654           2 :         w.enc.encode(rows, w.KeyWriter)
     655           2 :         w.enc.encode(rows, &w.trailers)
     656           2 :         w.enc.encode(rows, &w.prefixSame)
     657           2 :         w.enc.encode(rows, &w.values)
     658           2 :         w.enc.encode(rows, &w.isValueExternal)
     659           2 :         w.enc.encode(rows, &w.isObsolete)
     660           2 :         finished = w.enc.finish()
     661           2 : 
     662           2 :         w.lastUserKeyTmp = w.lastUserKeyTmp[:0]
     663           2 :         w.lastUserKeyTmp = w.KeyWriter.MaterializeKey(w.lastUserKeyTmp[:0], rows-1)
     664           2 :         lastKey = base.InternalKey{
     665           2 :                 UserKey: w.lastUserKeyTmp,
     666           2 :                 Trailer: base.InternalKeyTrailer(w.trailers.Get(rows - 1)),
     667           2 :         }
     668           2 :         return finished, lastKey
     669             : }
     670             : 
     671             : // DataBlockRewriter rewrites data blocks. See RewriteSuffixes.
     672             : type DataBlockRewriter struct {
     673             :         KeySchema *KeySchema
     674             : 
     675             :         encoder   DataBlockEncoder
     676             :         decoder   DataBlockDecoder
     677             :         iter      DataBlockIter
     678             :         keySeeker KeySeeker
     679             :         comparer  *base.Comparer
     680             :         keyBuf    []byte
     681             :         // keyAlloc grown throughout the lifetime of the rewriter.
     682             :         keyAlloc        bytealloc.A
     683             :         prefixBytesIter PrefixBytesIter
     684             :         initialized     bool
     685             : }
     686             : 
     687             : // NewDataBlockRewriter creates a block rewriter.
     688           1 : func NewDataBlockRewriter(keySchema *KeySchema, comparer *base.Comparer) *DataBlockRewriter {
     689           1 :         return &DataBlockRewriter{
     690           1 :                 KeySchema: keySchema,
     691           1 :                 comparer:  comparer,
     692           1 :         }
     693           1 : }
     694             : 
     695             : type assertNoExternalValues struct{}
     696             : 
     697             : var _ block.GetLazyValueForPrefixAndValueHandler = assertNoExternalValues{}
     698             : 
     699           0 : func (assertNoExternalValues) GetLazyValueForPrefixAndValueHandle(value []byte) base.LazyValue {
     700           0 :         panic(errors.AssertionFailedf("pebble: sstable contains values in value blocks"))
     701             : }
     702             : 
     703             : // RewriteSuffixes rewrites the input block. It expects the input block to only
     704             : // contain keys with the suffix `from`. It rewrites the block to contain the
     705             : // same keys with the suffix `to`.
     706             : //
     707             : // RewriteSuffixes returns the start and end keys of the rewritten block, and
     708             : // the finished rewritten block. The returned start and end keys have indefinite
     709             : // lifetimes. The returned rewritten block is owned by the DataBlockRewriter. If
     710             : // it must be retained beyond the next call to RewriteSuffixes, the caller
     711             : // should make a copy.
     712             : //
     713             : // Note that the input slice must be 8-byte aligned.
     714             : func (rw *DataBlockRewriter) RewriteSuffixes(
     715             :         input []byte, from []byte, to []byte,
     716           1 : ) (start, end base.InternalKey, rewritten []byte, err error) {
     717           1 :         if !rw.initialized {
     718           1 :                 rw.iter.InitOnce(rw.KeySchema, rw.comparer, assertNoExternalValues{})
     719           1 :                 rw.encoder.Init(rw.KeySchema)
     720           1 :                 rw.initialized = true
     721           1 :         }
     722             : 
     723             :         // TODO(jackson): RewriteSuffixes performs a naïve rewrite of the block,
     724             :         // iterating over the input block while building a new block, KV-by-KV.
     725             :         // Since key columns are stored separately from other data, we could copy
     726             :         // columns that are unchanged (at least all the non-key columns, and in
     727             :         // practice a PrefixBytes column) wholesale without retrieving rows
     728             :         // one-by-one. In practice, there a few obstacles to making this work:
     729             :         //
     730             :         // - Only the beginning of a data block is assumed to be aligned. Columns
     731             :         //   then add padding as necessary to align data that needs to be aligned.
     732             :         //   If we copy a column, we have no guarantee that the alignment of the
     733             :         //   column start in the old block matches the alignment in the new block.
     734             :         //   We'd have to add padding to between columns to match the original
     735             :         //   alignment. It's a bit subtle.
     736             :         // - We still need to read all the key columns in order to synthesize
     737             :         //   [start] and [end].
     738             :         //
     739             :         // The columnar format is designed to support fast IterTransforms at read
     740             :         // time, including IterTransforms.SyntheticSuffix. Our effort might be
     741             :         // better spent dropping support for the physical rewriting of data blocks
     742             :         // we're performing here and instead use a read-time IterTransform.
     743             : 
     744           1 :         rw.decoder.Init(rw.KeySchema, input)
     745           1 :         meta := &KeySeekerMetadata{}
     746           1 :         rw.KeySchema.InitKeySeekerMetadata(meta, &rw.decoder)
     747           1 :         rw.keySeeker = rw.KeySchema.KeySeeker(meta)
     748           1 :         rw.encoder.Reset()
     749           1 :         if err = rw.iter.Init(&rw.decoder, block.IterTransforms{}); err != nil {
     750           0 :                 return base.InternalKey{}, base.InternalKey{}, nil, err
     751           0 :         }
     752             : 
     753             :         // Allocate a keyIter buffer that's large enough to hold the largest user
     754             :         // key in the block with 1 byte to spare (so that pointer arithmetic is
     755             :         // never pointing beyond the allocation, which would violate Go rules).
     756           1 :         if cap(rw.prefixBytesIter.Buf) < int(rw.decoder.maximumKeyLength)+1 {
     757           1 :                 rw.prefixBytesIter.Buf = make([]byte, rw.decoder.maximumKeyLength+1)
     758           1 :         }
     759           1 :         if newMax := int(rw.decoder.maximumKeyLength) - len(from) + len(to) + 1; cap(rw.keyBuf) < newMax {
     760           1 :                 rw.keyBuf = make([]byte, newMax)
     761           1 :         }
     762             : 
     763             :         // Rewrite each key-value pair one-by-one.
     764           1 :         for i, kv := 0, rw.iter.First(); kv != nil; i, kv = i+1, rw.iter.Next() {
     765           1 :                 value := kv.V.ValueOrHandle
     766           1 :                 valuePrefix := block.InPlaceValuePrefix(false /* setHasSamePrefix (unused) */)
     767           1 :                 isValueExternal := rw.decoder.isValueExternal.At(i)
     768           1 :                 if isValueExternal {
     769           0 :                         valuePrefix = block.ValuePrefix(kv.V.ValueOrHandle[0])
     770           0 :                         value = kv.V.ValueOrHandle[1:]
     771           0 :                 }
     772           1 :                 kcmp := rw.encoder.KeyWriter.ComparePrev(kv.K.UserKey)
     773           1 :                 if !bytes.Equal(kv.K.UserKey[kcmp.PrefixLen:], from) {
     774           1 :                         return base.InternalKey{}, base.InternalKey{}, nil,
     775           1 :                                 errors.Newf("key %s has suffix 0x%x; require 0x%x", kv.K, kv.K.UserKey[kcmp.PrefixLen:], from)
     776           1 :                 }
     777           1 :                 rw.keyBuf = append(rw.keyBuf[:0], kv.K.UserKey[:kcmp.PrefixLen]...)
     778           1 :                 rw.keyBuf = append(rw.keyBuf, to...)
     779           1 :                 if i == 0 {
     780           1 :                         start.UserKey, rw.keyAlloc = rw.keyAlloc.Copy(rw.keyBuf)
     781           1 :                         start.Trailer = kv.K.Trailer
     782           1 :                 }
     783           1 :                 k := base.InternalKey{UserKey: rw.keyBuf, Trailer: kv.K.Trailer}
     784           1 :                 rw.encoder.Add(k, value, valuePrefix, kcmp, rw.decoder.isObsolete.At(i))
     785             :         }
     786           1 :         rewritten, end = rw.encoder.Finish(int(rw.decoder.d.header.Rows), rw.encoder.Size())
     787           1 :         end.UserKey, rw.keyAlloc = rw.keyAlloc.Copy(end.UserKey)
     788           1 :         return start, end, rewritten, nil
     789             : }
     790             : 
     791             : // dataBlockDecoderSize is the size of DataBlockDecoder, round up to 8 bytes.
     792             : const dataBlockDecoderSize = (unsafe.Sizeof(DataBlockDecoder{}) + 7) &^ 7
     793             : 
     794             : // Assert that dataBlockDecoderSize is a multiple of 8 bytes (so that
     795             : // KeySeekerMetadata is also aligned).
     796             : const _ uint = uint(-(dataBlockDecoderSize % 8))
     797             : 
     798             : // Assert that a DataBlockDecoder and a KeySeekerMetadata can fit inside block.Metadata.
     799             : const _ uint = block.MetadataSize - uint(dataBlockDecoderSize) - KeySeekerMetadataSize
     800             : 
     801             : // InitDataBlockMetadata initializes the metadata for a data block.
     802           2 : func InitDataBlockMetadata(schema *KeySchema, md *block.Metadata, data []byte) (err error) {
     803           2 :         if uintptr(unsafe.Pointer(md))%8 != 0 {
     804           0 :                 return errors.AssertionFailedf("metadata is not 8-byte aligned")
     805           0 :         }
     806           2 :         d := (*DataBlockDecoder)(unsafe.Pointer(md))
     807           2 :         // Initialization can panic; convert panics to corruption errors (so higher
     808           2 :         // layers can add file number and offset information).
     809           2 :         defer func() {
     810           2 :                 if r := recover(); r != nil {
     811           0 :                         err = base.CorruptionErrorf("error initializing data block metadata: %v", r)
     812           0 :                 }
     813             :         }()
     814           2 :         d.Init(schema, data)
     815           2 :         keySchemaMeta := (*KeySeekerMetadata)(unsafe.Pointer(&md[dataBlockDecoderSize]))
     816           2 :         schema.InitKeySeekerMetadata(keySchemaMeta, d)
     817           2 :         return nil
     818             : }
     819             : 
     820             : // Assert that an IndexBlockDecoder can fit inside block.Metadata.
     821             : const _ uint = block.MetadataSize - uint(unsafe.Sizeof(IndexBlockDecoder{}))
     822             : 
     823             : // InitIndexBlockMetadata initializes the metadata for an index block.
     824           2 : func InitIndexBlockMetadata(md *block.Metadata, data []byte) (err error) {
     825           2 :         if uintptr(unsafe.Pointer(md))%8 != 0 {
     826           0 :                 return errors.AssertionFailedf("metadata is not 8-byte aligned")
     827           0 :         }
     828           2 :         d := (*IndexBlockDecoder)(unsafe.Pointer(md))
     829           2 :         // Initialization can panic; convert panics to corruption errors (so higher
     830           2 :         // layers can add file number and offset information).
     831           2 :         defer func() {
     832           2 :                 if r := recover(); r != nil {
     833           0 :                         err = base.CorruptionErrorf("error initializing index block metadata: %v", r)
     834           0 :                 }
     835             :         }()
     836           2 :         d.Init(data)
     837           2 :         return nil
     838             : }
     839             : 
     840             : // Assert that a IndexBlockDecoder can fit inside block.Metadata.
     841             : const _ uint = block.MetadataSize - uint(unsafe.Sizeof(KeyspanDecoder{}))
     842             : 
     843             : // InitKeyspanBlockMetadata initializes the metadata for a rangedel or range key block.
     844           2 : func InitKeyspanBlockMetadata(md *block.Metadata, data []byte) (err error) {
     845           2 :         if uintptr(unsafe.Pointer(md))%8 != 0 {
     846           0 :                 return errors.AssertionFailedf("metadata is not 8-byte aligned")
     847           0 :         }
     848           2 :         d := (*KeyspanDecoder)(unsafe.Pointer(md))
     849           2 :         // Initialization can panic; convert panics to corruption errors (so higher
     850           2 :         // layers can add file number and offset information).
     851           2 :         defer func() {
     852           2 :                 if r := recover(); r != nil {
     853           0 :                         err = base.CorruptionErrorf("error initializing keyspan block metadata: %v", r)
     854           0 :                 }
     855             :         }()
     856           2 :         d.Init(data)
     857           2 :         return nil
     858             : }
     859             : 
     860             : // A DataBlockDecoder holds state for interpreting a columnar data block. It may
     861             : // be shared among multiple DataBlockIters.
     862             : type DataBlockDecoder struct {
     863             :         d BlockDecoder
     864             :         // trailers holds an array of the InternalKey trailers, encoding the key
     865             :         // kind and sequence number of each key.
     866             :         trailers UnsafeUints
     867             :         // prefixChanged is a bitmap indicating when the prefix (as defined by
     868             :         // Split) of a key changes, relative to the preceding key. This is used to
     869             :         // bound seeks within a prefix, and to optimize NextPrefix.
     870             :         prefixChanged Bitmap
     871             :         // values is the column reader for values. If the isValueExternal bitmap
     872             :         // indicates a value is external, the value is prefixed with a ValuePrefix
     873             :         // byte.
     874             :         values RawBytes
     875             :         // isValueExternal is the column reader for the is-value-external bitmap
     876             :         // that indicates whether a value is stored out-of-band in a value block. If
     877             :         // true, the value contains a ValuePrefix byte followed by an encoded value
     878             :         // handle indicating the value's location within the value block(s).
     879             :         isValueExternal Bitmap
     880             :         // isObsolete is the column reader for the is-obsolete bitmap
     881             :         // that indicates whether a key is obsolete/non-live.
     882             :         isObsolete Bitmap
     883             :         // maximumKeyLength is the maximum length of a user key in the block.
     884             :         // Iterators may use it to allocate a sufficiently large buffer up front,
     885             :         // and elide size checks during iteration. Note that iterators should add +1
     886             :         // to the key length to ensure pointer arithmetric that computes a pointer
     887             :         // to the tail of the key does not point to memory beyond the allocation
     888             :         // (prohibited by Go pointer rules).
     889             :         maximumKeyLength uint32
     890             : }
     891             : 
     892             : // BlockDecoder returns a pointer to the underlying BlockDecoder.
     893           1 : func (d *DataBlockDecoder) BlockDecoder() *BlockDecoder {
     894           1 :         return &d.d
     895           1 : }
     896             : 
     897             : // PrefixChanged returns the prefix-changed bitmap.
     898           1 : func (d *DataBlockDecoder) PrefixChanged() Bitmap {
     899           1 :         return d.prefixChanged
     900           1 : }
     901             : 
     902             : // KeySchemaHeader returns the KeySchema-specific header.
     903           1 : func (d *DataBlockDecoder) KeySchemaHeader() []byte {
     904           1 :         return d.d.data[:d.d.customHeaderSize-dataBlockCustomHeaderSize]
     905           1 : }
     906             : 
     907             : // Init initializes the data block reader with the given serialized data block.
     908           2 : func (d *DataBlockDecoder) Init(schema *KeySchema, data []byte) {
     909           2 :         if uintptr(unsafe.Pointer(unsafe.SliceData(data)))&7 != 0 {
     910           0 :                 panic("data buffer not 8-byte aligned")
     911             :         }
     912           2 :         d.d.Init(data, dataBlockCustomHeaderSize+schema.HeaderSize)
     913           2 :         d.trailers = d.d.Uints(len(schema.ColumnTypes) + dataBlockColumnTrailer)
     914           2 :         d.prefixChanged = d.d.Bitmap(len(schema.ColumnTypes) + dataBlockColumnPrefixChanged)
     915           2 :         d.values = d.d.RawBytes(len(schema.ColumnTypes) + dataBlockColumnValue)
     916           2 :         d.isValueExternal = d.d.Bitmap(len(schema.ColumnTypes) + dataBlockColumnIsValueExternal)
     917           2 :         d.isObsolete = d.d.Bitmap(len(schema.ColumnTypes) + dataBlockColumnIsObsolete)
     918           2 :         d.maximumKeyLength = binary.LittleEndian.Uint32(data[schema.HeaderSize:])
     919             : }
     920             : 
     921             : // Describe descirbes the binary format of the data block, assuming f.Offset()
     922             : // is positioned at the beginning of the same data block described by d.
     923           1 : func (d *DataBlockDecoder) Describe(f *binfmt.Formatter, tp treeprinter.Node) {
     924           1 :         // Set the relative offset. When loaded into memory, the beginning of blocks
     925           1 :         // are aligned. Padding that ensures alignment is done relative to the
     926           1 :         // current offset. Setting the relative offset ensures that if we're
     927           1 :         // describing this block within a larger structure (eg, f.Offset()>0), we
     928           1 :         // compute padding appropriately assuming the current byte f.Offset() is
     929           1 :         // aligned.
     930           1 :         f.SetAnchorOffset()
     931           1 : 
     932           1 :         n := tp.Child("data block header")
     933           1 :         if keySchemaHeaderSize := int(d.d.customHeaderSize - 4); keySchemaHeaderSize > 0 {
     934           1 :                 f.HexBytesln(keySchemaHeaderSize, "key schema header")
     935           1 :         }
     936           1 :         f.HexBytesln(4, "maximum key length: %d", d.maximumKeyLength)
     937           1 :         d.d.headerToBinFormatter(f, n)
     938           1 :         for i := 0; i < int(d.d.header.Columns); i++ {
     939           1 :                 d.d.columnToBinFormatter(f, n, i, int(d.d.header.Rows))
     940           1 :         }
     941           1 :         f.HexBytesln(1, "block padding byte")
     942           1 :         f.ToTreePrinter(n)
     943             : }
     944             : 
     945             : // Validate validates invariants that should hold across all data blocks.
     946           2 : func (d *DataBlockDecoder) Validate(comparer *base.Comparer, keySchema *KeySchema) error {
     947           2 :         // TODO(jackson): Consider avoiding these allocations, even if this is only
     948           2 :         // called in invariants builds.
     949           2 :         n := d.d.header.Rows
     950           2 :         meta := &KeySeekerMetadata{}
     951           2 :         keySchema.InitKeySeekerMetadata(meta, d)
     952           2 :         keySeeker := keySchema.KeySeeker(meta)
     953           2 :         prevKey := base.InternalKey{UserKey: make([]byte, 0, d.maximumKeyLength+1)}
     954           2 :         var curKey PrefixBytesIter
     955           2 :         curKey.Init(int(d.maximumKeyLength), nil)
     956           2 : 
     957           2 :         for i := 0; i < int(n); i++ {
     958           2 :                 k := base.InternalKey{
     959           2 :                         UserKey: keySeeker.MaterializeUserKey(&curKey, i-1, i),
     960           2 :                         Trailer: base.InternalKeyTrailer(d.trailers.At(i)),
     961           2 :                 }
     962           2 :                 // Ensure the keys are ordered.
     963           2 :                 ucmp := comparer.Compare(k.UserKey, prevKey.UserKey)
     964           2 :                 if ucmp < 0 || (ucmp == 0 && k.Trailer >= prevKey.Trailer) {
     965           0 :                         return errors.AssertionFailedf("key %s (row %d) and key %s (row %d) are out of order",
     966           0 :                                 prevKey, i-1, k, i)
     967           0 :                 }
     968             :                 // Ensure the obsolete bit is set if the key is definitively obsolete.
     969             :                 // Not all sources of obsolescence are evident with only a data block
     970             :                 // available (range deletions or point keys in previous blocks may cause
     971             :                 // a key to be obsolete).
     972           2 :                 if ucmp == 0 && prevKey.Kind() != base.InternalKeyKindMerge && !d.isObsolete.At(i) {
     973           0 :                         return errors.AssertionFailedf("key %s (row %d) is shadowed by previous key %s but is not marked as obsolete",
     974           0 :                                 k, i, prevKey)
     975           0 :                 }
     976             :                 // Ensure that the prefix-changed bit is set correctly.
     977           2 :                 if i > 0 {
     978           2 :                         currPrefix := comparer.Split.Prefix(k.UserKey)
     979           2 :                         prevPrefix := comparer.Split.Prefix(prevKey.UserKey)
     980           2 :                         prefixChanged := !bytes.Equal(prevPrefix, currPrefix)
     981           2 :                         if prefixChanged != d.prefixChanged.At(i) {
     982           0 :                                 return errors.AssertionFailedf("prefix changed bit for key %q (row %d) is %t, expected %t [prev key was %q]",
     983           0 :                                         k.UserKey, i, d.prefixChanged.At(i), prefixChanged, prevKey.UserKey)
     984           0 :                         }
     985             :                 }
     986             : 
     987           2 :                 prevKey.CopyFrom(k)
     988             :         }
     989           2 :         return nil
     990             : }
     991             : 
     992             : // Assert that *DataBlockIter implements block.DataBlockIterator.
     993             : var _ block.DataBlockIterator = (*DataBlockIter)(nil)
     994             : 
     995             : // DataBlockIter iterates over a columnar data block.
     996             : type DataBlockIter struct {
     997             :         // -- Fields that are initialized once --
     998             :         // For any changes to these fields, InitOnce should be updated.
     999             : 
    1000             :         // keySchema configures the DataBlockIterConfig to use the provided
    1001             :         // KeySchema when initializing the DataBlockIter for iteration over a new
    1002             :         // block.
    1003             :         keySchema *KeySchema
    1004             :         suffixCmp base.ComparePointSuffixes
    1005             :         split     base.Split
    1006             :         // getLazyValuer configures the DataBlockIterConfig to initialize the
    1007             :         // DataBlockIter to use the provided handler for retrieving lazy values.
    1008             :         getLazyValuer block.GetLazyValueForPrefixAndValueHandler
    1009             : 
    1010             :         // -- Fields that are initialized for each block --
    1011             :         // For any changes to these fields, InitHandle should be updated.
    1012             : 
    1013             :         d            *DataBlockDecoder
    1014             :         h            block.BufferHandle
    1015             :         maxRow       int
    1016             :         transforms   block.IterTransforms
    1017             :         noTransforms bool
    1018             :         keySeeker    KeySeeker
    1019             : 
    1020             :         // -- State --
    1021             :         // For any changes to these fields, InitHandle (which resets them) should be
    1022             :         // updated.
    1023             : 
    1024             :         keyIter PrefixBytesIter
    1025             :         row     int
    1026             :         kv      base.InternalKV
    1027             :         kvRow   int // the row currently held in kv
    1028             : 
    1029             :         // nextObsoletePoint is the row index of the first obsolete point after i.row.
    1030             :         // It is used to optimize skipping of obsolete points during forward
    1031             :         // iteration.
    1032             :         nextObsoletePoint int
    1033             : }
    1034             : 
    1035             : // InitOnce configures the data block iterator's key schema and lazy value
    1036             : // handler. The iterator must be initialized with a block before it can be used.
    1037             : // It may be reinitialized with new blocks without calling InitOnce again.
    1038             : func (i *DataBlockIter) InitOnce(
    1039             :         keySchema *KeySchema,
    1040             :         comparer *base.Comparer,
    1041             :         getLazyValuer block.GetLazyValueForPrefixAndValueHandler,
    1042           2 : ) {
    1043           2 :         i.keySchema = keySchema
    1044           2 :         i.suffixCmp = comparer.ComparePointSuffixes
    1045           2 :         i.split = comparer.Split
    1046           2 :         i.getLazyValuer = getLazyValuer
    1047           2 : }
    1048             : 
    1049             : // Init initializes the data block iterator, configuring it to read from the
    1050             : // provided decoder.
    1051           1 : func (i *DataBlockIter) Init(d *DataBlockDecoder, transforms block.IterTransforms) error {
    1052           1 :         i.d = d
    1053           1 :         // Leave i.h unchanged.
    1054           1 :         numRows := int(d.d.header.Rows)
    1055           1 :         i.maxRow = numRows - 1
    1056           1 :         i.transforms = transforms
    1057           1 :         if i.transforms.HideObsoletePoints && d.isObsolete.SeekSetBitGE(0) == numRows {
    1058           0 :                 // There are no obsolete points in the block; don't bother checking.
    1059           0 :                 i.transforms.HideObsoletePoints = false
    1060           0 :         }
    1061           1 :         i.noTransforms = i.transforms.NoTransforms()
    1062           1 : 
    1063           1 :         // TODO(radu): see if this allocation can be a problem for the suffix rewriter.
    1064           1 :         meta := &KeySeekerMetadata{}
    1065           1 :         i.keySchema.InitKeySeekerMetadata(meta, d)
    1066           1 :         i.keySeeker = i.keySchema.KeySeeker(meta)
    1067           1 : 
    1068           1 :         // The worst case is when the largest key in the block has no suffix.
    1069           1 :         maxKeyLength := int(i.transforms.SyntheticPrefixAndSuffix.PrefixLen() + d.maximumKeyLength + i.transforms.SyntheticPrefixAndSuffix.SuffixLen())
    1070           1 :         i.keyIter.Init(maxKeyLength, i.transforms.SyntheticPrefix())
    1071           1 :         i.row = -1
    1072           1 :         i.kv = base.InternalKV{}
    1073           1 :         i.kvRow = math.MinInt
    1074           1 :         i.nextObsoletePoint = 0
    1075           1 :         return nil
    1076             : }
    1077             : 
    1078             : // InitHandle initializes the block from the provided buffer handle. InitHandle
    1079             : // assumes that the block's metadata was initialized using
    1080             : // InitDataBlockMetadata().
    1081             : func (i *DataBlockIter) InitHandle(
    1082             :         comparer *base.Comparer, h block.BufferHandle, transforms block.IterTransforms,
    1083           2 : ) error {
    1084           2 :         i.suffixCmp = comparer.ComparePointSuffixes
    1085           2 :         i.split = comparer.Split
    1086           2 :         blockMeta := h.BlockMetadata()
    1087           2 :         i.d = (*DataBlockDecoder)(unsafe.Pointer(blockMeta))
    1088           2 :         keySeekerMeta := (*KeySeekerMetadata)(blockMeta[unsafe.Sizeof(DataBlockDecoder{}):])
    1089           2 :         i.h.Release()
    1090           2 :         i.h = h
    1091           2 : 
    1092           2 :         numRows := int(i.d.d.header.Rows)
    1093           2 :         i.maxRow = numRows - 1
    1094           2 : 
    1095           2 :         i.transforms = transforms
    1096           2 :         if i.transforms.HideObsoletePoints && i.d.isObsolete.SeekSetBitGE(0) == numRows {
    1097           2 :                 // There are no obsolete points in the block; don't bother checking.
    1098           2 :                 i.transforms.HideObsoletePoints = false
    1099           2 :         }
    1100           2 :         i.noTransforms = i.transforms.NoTransforms()
    1101           2 : 
    1102           2 :         // The worst case is when the largest key in the block has no suffix.
    1103           2 :         maxKeyLength := int(i.transforms.SyntheticPrefixAndSuffix.PrefixLen() + i.d.maximumKeyLength + i.transforms.SyntheticPrefixAndSuffix.SuffixLen())
    1104           2 :         i.keyIter.Init(maxKeyLength, i.transforms.SyntheticPrefix())
    1105           2 :         i.row = -1
    1106           2 :         i.kv = base.InternalKV{}
    1107           2 :         i.kvRow = math.MinInt
    1108           2 :         i.nextObsoletePoint = 0
    1109           2 :         i.keySeeker = i.keySchema.KeySeeker(keySeekerMeta)
    1110           2 :         return nil
    1111             : }
    1112             : 
    1113             : // Handle returns the handle to the block.
    1114           2 : func (i *DataBlockIter) Handle() block.BufferHandle {
    1115           2 :         return i.h
    1116           2 : }
    1117             : 
    1118             : // Valid returns true if the iterator is currently positioned at a valid KV.
    1119           2 : func (i *DataBlockIter) Valid() bool {
    1120           2 :         return i.row >= 0 && i.row <= i.maxRow && !i.IsDataInvalidated()
    1121           2 : }
    1122             : 
    1123             : // KV returns the key-value pair at the current iterator position. The
    1124             : // iterator must be positioned over a valid KV.
    1125           2 : func (i *DataBlockIter) KV() *base.InternalKV {
    1126           2 :         return &i.kv
    1127           2 : }
    1128             : 
    1129             : // Invalidate invalidates the block iterator, removing references to the block
    1130             : // it was initialized with. The iterator may continue to be used after
    1131             : // a call to Invalidate, but all positioning methods should return false.
    1132             : // Valid() must also return false.
    1133           2 : func (i *DataBlockIter) Invalidate() {
    1134           2 :         i.d = nil
    1135           2 : }
    1136             : 
    1137             : // IsDataInvalidated returns true when the iterator has been invalidated
    1138             : // using an Invalidate call.
    1139           2 : func (i *DataBlockIter) IsDataInvalidated() bool {
    1140           2 :         return i.d == nil
    1141           2 : }
    1142             : 
    1143             : // IsLowerBound implements the block.DataBlockIterator interface.
    1144           2 : func (i *DataBlockIter) IsLowerBound(k []byte) bool {
    1145           2 :         if i.transforms.HasSyntheticPrefix() {
    1146           1 :                 var keyPrefix []byte
    1147           1 :                 keyPrefix, k = splitKey(k, len(i.transforms.SyntheticPrefix()))
    1148           1 :                 if cmp := bytes.Compare(keyPrefix, i.transforms.SyntheticPrefix()); cmp != 0 {
    1149           1 :                         return cmp < 0
    1150           1 :                 }
    1151             :         }
    1152             :         // If we are hiding obsolete points, it is possible that all points < k are
    1153             :         // hidden.
    1154             :         // Note: we ignore HideObsoletePoints, but false negatives are allowed.
    1155           2 :         return i.keySeeker.IsLowerBound(k, i.transforms.SyntheticSuffix())
    1156             : }
    1157             : 
    1158             : // splitKey splits a key into k[:at] and k[at:].
    1159           2 : func splitKey(k []byte, at int) (before, after []byte) {
    1160           2 :         if len(k) <= at {
    1161           2 :                 return k, nil
    1162           2 :         }
    1163           2 :         return k[:at], k[at:]
    1164             : }
    1165             : 
    1166             : // seekGEInternal is a wrapper around keySeeker.SeekGE which takes into account
    1167             : // the synthetic prefix and suffix.
    1168           2 : func (i *DataBlockIter) seekGEInternal(key []byte, boundRow int, searchDir int8) (row int) {
    1169           2 :         if i.transforms.HasSyntheticPrefix() {
    1170           2 :                 var keyPrefix []byte
    1171           2 :                 keyPrefix, key = splitKey(key, len(i.transforms.SyntheticPrefix()))
    1172           2 :                 if cmp := bytes.Compare(keyPrefix, i.transforms.SyntheticPrefix()); cmp != 0 {
    1173           1 :                         if cmp < 0 {
    1174           1 :                                 return 0
    1175           1 :                         }
    1176           1 :                         return i.maxRow + 1
    1177             :                 }
    1178             :         }
    1179           2 :         if i.transforms.HasSyntheticSuffix() {
    1180           2 :                 n := i.split(key)
    1181           2 :                 row, eq := i.keySeeker.SeekGE(key[:n], boundRow, searchDir)
    1182           2 :                 if eq && i.suffixCmp(key[n:], i.transforms.SyntheticSuffix()) > 0 {
    1183           2 :                         row = i.d.prefixChanged.SeekSetBitGE(row + 1)
    1184           2 :                 }
    1185           2 :                 return row
    1186             :         }
    1187           2 :         row, _ = i.keySeeker.SeekGE(key, boundRow, searchDir)
    1188           2 :         return row
    1189             : }
    1190             : 
    1191             : // SeekGE implements the base.InternalIterator interface.
    1192           2 : func (i *DataBlockIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
    1193           2 :         if i.d == nil {
    1194           1 :                 return nil
    1195           1 :         }
    1196           2 :         searchDir := int8(0)
    1197           2 :         if flags.TrySeekUsingNext() {
    1198           0 :                 searchDir = +1
    1199           0 :         }
    1200           2 :         if i.noTransforms {
    1201           2 :                 // Fast path.
    1202           2 :                 i.row, _ = i.keySeeker.SeekGE(key, i.row, searchDir)
    1203           2 :                 return i.decodeRow()
    1204           2 :         }
    1205           2 :         i.row = i.seekGEInternal(key, i.row, searchDir)
    1206           2 :         if i.transforms.HideObsoletePoints {
    1207           2 :                 i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(i.row)
    1208           2 :                 if i.atObsoletePointForward() {
    1209           2 :                         i.skipObsoletePointsForward()
    1210           2 :                         if i.row > i.maxRow {
    1211           2 :                                 return nil
    1212           2 :                         }
    1213             :                 }
    1214             :         }
    1215           2 :         return i.decodeRow()
    1216             : }
    1217             : 
    1218             : // SeekPrefixGE implements the base.InternalIterator interface.
    1219           0 : func (i *DataBlockIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
    1220           0 :         // This should never be called as prefix iteration is handled by
    1221           0 :         // sstable.Iterator.
    1222           0 : 
    1223           0 :         // TODO(jackson): We can implement this and avoid propagating keys without
    1224           0 :         // the prefix up to the merging iterator. It will avoid unnecessary key
    1225           0 :         // comparisons fixing up the merging iterator heap. We can also short
    1226           0 :         // circuit the search if the prefix isn't found within the prefix column.
    1227           0 :         // There's some subtlety around ensuring we continue to benefit from the
    1228           0 :         // TrySeekUsingNext optimization.
    1229           0 :         panic("pebble: SeekPrefixGE unimplemented")
    1230             : }
    1231             : 
    1232             : // SeekLT implements the base.InternalIterator interface.
    1233           2 : func (i *DataBlockIter) SeekLT(key []byte, _ base.SeekLTFlags) *base.InternalKV {
    1234           2 :         if i.d == nil {
    1235           1 :                 return nil
    1236           1 :         }
    1237           2 :         i.row = i.seekGEInternal(key, i.row, 0 /* searchDir */) - 1
    1238           2 :         if i.transforms.HideObsoletePoints {
    1239           2 :                 i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(max(i.row, 0))
    1240           2 :                 if i.atObsoletePointBackward() {
    1241           2 :                         i.skipObsoletePointsBackward()
    1242           2 :                         if i.row < 0 {
    1243           2 :                                 return nil
    1244           2 :                         }
    1245             :                 }
    1246             :         }
    1247           2 :         return i.decodeRow()
    1248             : }
    1249             : 
    1250             : // First implements the base.InternalIterator interface.
    1251           2 : func (i *DataBlockIter) First() *base.InternalKV {
    1252           2 :         if i.d == nil {
    1253           1 :                 return nil
    1254           1 :         }
    1255           2 :         i.row = 0
    1256           2 :         if i.transforms.HideObsoletePoints {
    1257           2 :                 i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(0)
    1258           2 :                 if i.atObsoletePointForward() {
    1259           2 :                         i.skipObsoletePointsForward()
    1260           2 :                         if i.row > i.maxRow {
    1261           2 :                                 return nil
    1262           2 :                         }
    1263             :                 }
    1264             :         }
    1265           2 :         return i.decodeRow()
    1266             : }
    1267             : 
    1268             : // Last implements the base.InternalIterator interface.
    1269           2 : func (i *DataBlockIter) Last() *base.InternalKV {
    1270           2 :         if i.d == nil {
    1271           1 :                 return nil
    1272           1 :         }
    1273           2 :         i.row = i.maxRow
    1274           2 :         if i.transforms.HideObsoletePoints {
    1275           2 :                 i.nextObsoletePoint = i.maxRow + 1
    1276           2 :                 if i.atObsoletePointBackward() {
    1277           2 :                         i.skipObsoletePointsBackward()
    1278           2 :                         if i.row < 0 {
    1279           1 :                                 return nil
    1280           1 :                         }
    1281             :                 }
    1282             :         }
    1283           2 :         return i.decodeRow()
    1284             : }
    1285             : 
    1286             : // Next advances to the next KV pair in the block.
    1287           2 : func (i *DataBlockIter) Next() *base.InternalKV {
    1288           2 :         if i.d == nil {
    1289           2 :                 return nil
    1290           2 :         }
    1291             :         // Inline decodeRow, but avoiding unnecessary checks against i.row.
    1292           2 :         if i.row >= i.maxRow {
    1293           2 :                 i.row = i.maxRow + 1
    1294           2 :                 return nil
    1295           2 :         }
    1296           2 :         i.row++
    1297           2 :         // Inline decodeKey(), adding obsolete points logic.
    1298           2 :         if i.noTransforms {
    1299           2 :                 // Fast path.
    1300           2 :                 i.kv.K = base.InternalKey{
    1301           2 :                         UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row),
    1302           2 :                         Trailer: base.InternalKeyTrailer(i.d.trailers.At(i.row)),
    1303           2 :                 }
    1304           2 :         } else {
    1305           2 :                 if i.transforms.HideObsoletePoints && i.atObsoletePointForward() {
    1306           2 :                         i.skipObsoletePointsForward()
    1307           2 :                         if i.row > i.maxRow {
    1308           2 :                                 return nil
    1309           2 :                         }
    1310             :                 }
    1311           2 :                 if i.transforms.HasSyntheticSuffix() {
    1312           2 :                         i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix(
    1313           2 :                                 &i.keyIter, i.transforms.SyntheticSuffix(), i.kvRow, i.row,
    1314           2 :                         )
    1315           2 :                 } else {
    1316           2 :                         i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row)
    1317           2 :                 }
    1318           2 :                 i.kv.K.Trailer = base.InternalKeyTrailer(i.d.trailers.At(i.row))
    1319           2 :                 if n := i.transforms.SyntheticSeqNum; n != 0 {
    1320           2 :                         i.kv.K.SetSeqNum(base.SeqNum(n))
    1321           2 :                 }
    1322             :         }
    1323             :         // Inline i.d.values.At(row).
    1324           2 :         v := i.d.values.slice(i.d.values.offsets.At2(i.row))
    1325           2 :         if i.d.isValueExternal.At(i.row) {
    1326           2 :                 i.kv.V = i.getLazyValuer.GetLazyValueForPrefixAndValueHandle(v)
    1327           2 :         } else {
    1328           2 :                 i.kv.V = base.MakeInPlaceValue(v)
    1329           2 :         }
    1330           2 :         i.kvRow = i.row
    1331           2 :         return &i.kv
    1332             : }
    1333             : 
    1334             : // NextPrefix moves the iterator to the next row with a different prefix than
    1335             : // the key at the current iterator position.
    1336             : //
    1337             : // The columnar block implementation uses a newPrefix bitmap to identify the
    1338             : // next row with a differing prefix from the current row's key. If newPrefix[i]
    1339             : // is set then row's i key prefix is different that row i-1. The bitmap is
    1340             : // organized as a slice of 64-bit words. If a 64-bit word in the bitmap is zero
    1341             : // then all of the rows corresponding to the bits in that word have the same
    1342             : // prefix and we can skip ahead. If a row is non-zero a small bit of bit
    1343             : // shifting and masking combined with bits.TrailingZeros64 can identify the
    1344             : // next bit that is set after the current row. The bitmap uses 1 bit/row (0.125
    1345             : // bytes/row). A 32KB block containing 1300 rows (25 bytes/row) would need a
    1346             : // bitmap of 21 64-bit words. Even in the worst case where every word is 0 this
    1347             : // bitmap can be scanned in ~20 ns (1 ns/word) leading to a total NextPrefix
    1348             : // time of ~30 ns if a row is found and decodeRow are called. In more normal
    1349             : // cases, NextPrefix takes ~15% longer that a single Next call.
    1350             : //
    1351             : // For comparison, the rowblk nextPrefixV3 optimizations work by setting a bit
    1352             : // in the value prefix byte that indicates that the current key has the same
    1353             : // prefix as the previous key. Additionally, a bit is stolen from the restart
    1354             : // table entries indicating whether a restart table entry has the same key
    1355             : // prefix as the previous entry. Checking the value prefix byte bit requires
    1356             : // locating that byte which requires decoding 3 varints per key/value pair.
    1357           2 : func (i *DataBlockIter) NextPrefix(_ []byte) *base.InternalKV {
    1358           2 :         if i.d == nil {
    1359           0 :                 return nil
    1360           0 :         }
    1361           2 :         i.row = i.d.prefixChanged.SeekSetBitGE(i.row + 1)
    1362           2 :         if i.transforms.HideObsoletePoints {
    1363           1 :                 i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(i.row)
    1364           1 :                 if i.atObsoletePointForward() {
    1365           1 :                         i.skipObsoletePointsForward()
    1366           1 :                 }
    1367             :         }
    1368             : 
    1369           2 :         return i.decodeRow()
    1370             : }
    1371             : 
    1372             : // Prev moves the iterator to the previous KV pair in the block.
    1373           2 : func (i *DataBlockIter) Prev() *base.InternalKV {
    1374           2 :         if i.d == nil {
    1375           2 :                 return nil
    1376           2 :         }
    1377           2 :         i.row--
    1378           2 :         if i.transforms.HideObsoletePoints && i.atObsoletePointBackward() {
    1379           2 :                 i.skipObsoletePointsBackward()
    1380           2 :                 if i.row < 0 {
    1381           2 :                         return nil
    1382           2 :                 }
    1383             :         }
    1384           2 :         return i.decodeRow()
    1385             : }
    1386             : 
    1387             : // atObsoletePointForward returns true if i.row is an obsolete point. It is
    1388             : // separate from skipObsoletePointsForward() because that method does not
    1389             : // inline. It can only be used during forward iteration (i.e. i.row was
    1390             : // incremented).
    1391             : //
    1392             : //gcassert:inline
    1393           2 : func (i *DataBlockIter) atObsoletePointForward() bool {
    1394           2 :         if invariants.Enabled && i.row > i.nextObsoletePoint {
    1395           0 :                 panic("invalid nextObsoletePoint")
    1396             :         }
    1397           2 :         return i.row == i.nextObsoletePoint && i.row <= i.maxRow
    1398             : }
    1399             : 
    1400           2 : func (i *DataBlockIter) skipObsoletePointsForward() {
    1401           2 :         if invariants.Enabled {
    1402           2 :                 i.atObsoletePointCheck()
    1403           2 :         }
    1404           2 :         i.row = i.d.isObsolete.SeekUnsetBitGE(i.row)
    1405           2 :         i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(i.row)
    1406             : }
    1407             : 
    1408             : // atObsoletePointBackward returns true if i.row is an obsolete point. It is
    1409             : // separate from skipObsoletePointsBackward() because that method does not
    1410             : // inline. It can only be used during reverse iteration (i.e. i.row was
    1411             : // decremented).
    1412             : //
    1413             : //gcassert:inline
    1414           2 : func (i *DataBlockIter) atObsoletePointBackward() bool {
    1415           2 :         return i.row >= 0 && i.d.isObsolete.At(i.row)
    1416           2 : }
    1417             : 
    1418           2 : func (i *DataBlockIter) skipObsoletePointsBackward() {
    1419           2 :         if invariants.Enabled {
    1420           2 :                 i.atObsoletePointCheck()
    1421           2 :         }
    1422           2 :         i.row = i.d.isObsolete.SeekUnsetBitLE(i.row)
    1423           2 :         i.nextObsoletePoint = i.row + 1
    1424             : }
    1425             : 
    1426           2 : func (i *DataBlockIter) atObsoletePointCheck() {
    1427           2 :         // We extract this code into a separate function to avoid getting a spurious
    1428           2 :         // error from GCAssert about At not being inlined because it is compiled out
    1429           2 :         // altogether in non-invariant builds.
    1430           2 :         if !i.transforms.HideObsoletePoints || !i.d.isObsolete.At(i.row) {
    1431           0 :                 panic("expected obsolete point")
    1432             :         }
    1433             : }
    1434             : 
    1435             : // Error implements the base.InternalIterator interface. A DataBlockIter is
    1436             : // infallible and always returns a nil error.
    1437           2 : func (i *DataBlockIter) Error() error {
    1438           2 :         return nil // infallible
    1439           2 : }
    1440             : 
    1441             : // SetBounds implements the base.InternalIterator interface.
    1442           0 : func (i *DataBlockIter) SetBounds(lower, upper []byte) {
    1443           0 :         // This should never be called as bounds are handled by sstable.Iterator.
    1444           0 :         panic("pebble: SetBounds unimplemented")
    1445             : }
    1446             : 
    1447             : // SetContext implements the base.InternalIterator interface.
    1448           0 : func (i *DataBlockIter) SetContext(_ context.Context) {}
    1449             : 
    1450             : var dataBlockTypeString string = fmt.Sprintf("%T", (*DataBlockIter)(nil))
    1451             : 
    1452             : // String implements the base.InternalIterator interface.
    1453           0 : func (i *DataBlockIter) String() string {
    1454           0 :         return dataBlockTypeString
    1455           0 : }
    1456             : 
    1457             : // DebugTree is part of the InternalIterator interface.
    1458           0 : func (i *DataBlockIter) DebugTree(tp treeprinter.Node) {
    1459           0 :         tp.Childf("%T(%p)", i, i)
    1460           0 : }
    1461             : 
    1462             : // decodeRow decodes i.row into i.kv. If i.row is invalid, it returns nil.
    1463           2 : func (i *DataBlockIter) decodeRow() *base.InternalKV {
    1464           2 :         switch {
    1465           2 :         case i.row < 0 || i.row > i.maxRow:
    1466           2 :                 return nil
    1467           2 :         case i.kvRow == i.row:
    1468           2 :                 // Already synthesized the kv at row.
    1469           2 :                 return &i.kv
    1470           2 :         default:
    1471           2 :                 // Inline decodeKey().
    1472           2 :                 if i.noTransforms {
    1473           2 :                         // Fast path.
    1474           2 :                         i.kv.K = base.InternalKey{
    1475           2 :                                 UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row),
    1476           2 :                                 Trailer: base.InternalKeyTrailer(i.d.trailers.At(i.row)),
    1477           2 :                         }
    1478           2 :                 } else {
    1479           2 :                         if i.transforms.HasSyntheticSuffix() {
    1480           2 :                                 i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix(
    1481           2 :                                         &i.keyIter, i.transforms.SyntheticSuffix(), i.kvRow, i.row,
    1482           2 :                                 )
    1483           2 :                         } else {
    1484           2 :                                 i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row)
    1485           2 :                         }
    1486           2 :                         i.kv.K.Trailer = base.InternalKeyTrailer(i.d.trailers.At(i.row))
    1487           2 :                         if n := i.transforms.SyntheticSeqNum; n != 0 {
    1488           2 :                                 i.kv.K.SetSeqNum(base.SeqNum(n))
    1489           2 :                         }
    1490             :                 }
    1491             :                 // Inline i.d.values.At(row).
    1492           2 :                 startOffset := i.d.values.offsets.At(i.row)
    1493           2 :                 v := unsafe.Slice((*byte)(i.d.values.ptr(startOffset)), i.d.values.offsets.At(i.row+1)-startOffset)
    1494           2 :                 if i.d.isValueExternal.At(i.row) {
    1495           2 :                         i.kv.V = i.getLazyValuer.GetLazyValueForPrefixAndValueHandle(v)
    1496           2 :                 } else {
    1497           2 :                         i.kv.V = base.MakeInPlaceValue(v)
    1498           2 :                 }
    1499           2 :                 i.kvRow = i.row
    1500           2 :                 return &i.kv
    1501             :         }
    1502             : }
    1503             : 
    1504             : // decodeKey updates i.kv.K to the key for i.row (which must be valid).
    1505             : // This function does not inline, so we copy its code verbatim. For any updates
    1506             : // to this code, all code preceded by "Inline decodeKey" must be updated.
    1507           0 : func (i *DataBlockIter) decodeKey() {
    1508           0 :         if i.noTransforms {
    1509           0 :                 // Fast path.
    1510           0 :                 i.kv.K = base.InternalKey{
    1511           0 :                         UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row),
    1512           0 :                         Trailer: base.InternalKeyTrailer(i.d.trailers.At(i.row)),
    1513           0 :                 }
    1514           0 :         } else {
    1515           0 :                 if i.transforms.HasSyntheticSuffix() {
    1516           0 :                         i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix(
    1517           0 :                                 &i.keyIter, i.transforms.SyntheticSuffix(), i.kvRow, i.row,
    1518           0 :                         )
    1519           0 :                 } else {
    1520           0 :                         i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row)
    1521           0 :                 }
    1522           0 :                 i.kv.K.Trailer = base.InternalKeyTrailer(i.d.trailers.At(i.row))
    1523           0 :                 if n := i.transforms.SyntheticSeqNum; n != 0 {
    1524           0 :                         i.kv.K.SetSeqNum(base.SeqNum(n))
    1525           0 :                 }
    1526             :         }
    1527             : }
    1528             : 
    1529             : var _ = (*DataBlockIter).decodeKey
    1530             : 
    1531             : // Close implements the base.InternalIterator interface.
    1532           2 : func (i *DataBlockIter) Close() error {
    1533           2 :         i.keySeeker = nil
    1534           2 :         i.d = nil
    1535           2 :         i.h.Release()
    1536           2 :         i.h = block.BufferHandle{}
    1537           2 :         i.transforms = block.IterTransforms{}
    1538           2 :         i.kv = base.InternalKV{}
    1539           2 :         return nil
    1540           2 : }

Generated by: LCOV version 1.14