LCOV - 2024-10-20 08:17Z ced78c34 - meta test only.lcov - pebble/sstable/rowblk/rowblk

LCOV - code coverage report

Current view:	top level - pebble/sstable/rowblk - rowblk_iter.go (source / functions)		Hit	Total	Coverage
Test:	2024-10-20 08:17Z ced78c34 - meta test only.lcov	Lines:	850	1347	63.1 %
Date:	2024-10-20 08:18:47	Functions:	0	0	-

          Line data    Source code

       1             : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
       2             : // of this source code is governed by a BSD-style license that can be found in
       3             : // the LICENSE file.
       4             : 
       5             : package rowblk
       6             : 
       7             : import (
       8             :         "bytes"
       9             :         "context"
      10             :         "encoding/binary"
      11             :         "io"
      12             :         "slices"
      13             :         "sort"
      14             :         "unsafe"
      15             : 
      16             :         "github.com/cockroachdb/errors"
      17             :         "github.com/cockroachdb/pebble/internal/base"
      18             :         "github.com/cockroachdb/pebble/internal/invariants"
      19             :         "github.com/cockroachdb/pebble/internal/manual"
      20             :         "github.com/cockroachdb/pebble/internal/treeprinter"
      21             :         "github.com/cockroachdb/pebble/sstable/block"
      22             : )
      23             : 
      24             : // Iter is an iterator over a single block of data.
      25             : //
      26             : // An Iter provides an additional guarantee around key stability when a block
      27             : // has a restart interval of 1 (i.e. when there is no prefix compression). Key
      28             : // stability refers to whether the InternalKey.UserKey bytes returned by a
      29             : // positioning call will remain stable after a subsequent positioning call. The
      30             : // normal case is that a positioning call will invalidate any previously
      31             : // returned InternalKey.UserKey. If a block has a restart interval of 1 (no
      32             : // prefix compression), Iter guarantees that InternalKey.UserKey will point to
      33             : // the key as stored in the block itself which will remain valid until the Iter
      34             : // is closed. The key stability guarantee is used by the range tombstone and
      35             : // range key code, which knows that the respective blocks are always encoded
      36             : // with a restart interval of 1. This per-block key stability guarantee is
      37             : // sufficient for range tombstones and range keys as they are always encoded
      38             : // in a single block. Note: this stability guarantee no longer holds for a block
      39             : // iter with synthetic prefix/suffix replacement, but we don't use the synthetic
      40             : // suffix/prefix functionality of Iter for range keys.
      41             : //
      42             : // An Iter also provides a value stability guarantee for range deletions and
      43             : // range keys since there is only a single range deletion and range key block
      44             : // per sstable and the Iter will not release the bytes for the block until it is
      45             : // closed.
      46             : //
      47             : // Note on why Iter knows about lazyValueHandling:
      48             : //
      49             : // Iter's positioning functions (that return a LazyValue) are too
      50             : // complex to inline even prior to lazyValueHandling. Iter.Next and
      51             : // Iter.First were by far the cheapest and had costs 195 and 180
      52             : // respectively, which exceeds the budget of 80. We initially tried to keep
      53             : // the lazyValueHandling logic out of Iter by wrapping it with a
      54             : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
      55             : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
      56             : // they called the corresponding Iter func and then decided whether the
      57             : // value was in fact in-place (so return immediately) or needed further
      58             : // handling. But these also turned out too costly for mid-stack inlining since
      59             : // simple calls like the following have a high cost that is barely under the
      60             : // budget of 80
      61             : //
      62             : //      k, v := i.data.SeekGE(key, flags)  // cost 74
      63             : //      k, v := i.data.Next()              // cost 72
      64             : //
      65             : // We have 2 options for minimizing performance regressions:
      66             : //   - Include the lazyValueHandling logic in the already non-inlineable
      67             : //     Iter functions: Since most of the time is spent in data block iters,
      68             : //     it is acceptable to take the small hit of unnecessary branching (which
      69             : //     hopefully branch prediction will predict correctly) for other kinds of
      70             : //     blocks.
      71             : //   - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
      72             : //     v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
      73             : //     v3 sstable. We would want to manage these copies via code generation.
      74             : //
      75             : // We have picked the first option here.
      76             : type Iter struct {
      77             :         cmp   base.Compare // key comparison function supplied to Init
      78             :         split base.Split // key split function supplied to Init
      79             : 
      80             :         // Iterator transforms.
      81             :         //
      82             :         // SyntheticSuffix, if set, will replace the decoded ikey.UserKey suffix
      83             :         // before the key is returned to the user. A sequence of iter operations on a
      84             :         // block with a syntheticSuffix rule should return keys as if those operations
      85             :         // ran on a block with keys that all had the syntheticSuffix. As an example:
      86             :         // any sequence of block iter cmds should return the same keys for the
      87             :         // following two blocks:
      88             :         //
      89             :         // blockA: a@3,b@3,c@3
      90             :         // blockB: a@1,b@2,c@1 with syntheticSuffix=3
      91             :         //
      92             :         // To ensure this, suffix replacement will not change the ordering of keys in
      93             :         // the block because the iter assumes that no two keys in the block share the
      94             :         // same prefix. Furthermore, during SeekGE and SeekLT operations, the block
      95             :         // iterator handles "off by one" errors (explained in more detail in those
      96             :         // functions) when, for a given key, originalSuffix < searchSuffix <
      97             :         // replacementSuffix, with integer comparison. To handle these cases, the
      98             :         // iterator assumes:
      99             :         //
     100             :         //  pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0
     101             :         //  for keys with a suffix.
     102             :         //
     103             :         //  NB: it is possible for a block iter to add a synthetic suffix on a key
     104             :         //  without a suffix, which implies
     105             :         //  pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0 ,
     106             :         //  however, the iterator would never need to handle an off by one error in
     107             :         //  this case since originalSuffix (empty) > searchSuffix (non empty), with
     108             :         //  integer comparison.
     109             :         //
     110             :         //
     111             :         // In addition, we also assume that any block with rangekeys will not contain
     112             :         // a synthetic suffix.
     113             :         transforms block.IterTransforms
     114             : 
     115             :         // offset is the byte index that marks where the current key/value is
     116             :         // encoded in the block.
     117             :         offset int32
     118             :         // nextOffset is the byte index where the next key/value is encoded in the
     119             :         // block.
     120             :         nextOffset int32
     121             :         // A "restart point" in a block is a point where the full key is encoded,
     122             :         // instead of just having a suffix of the key encoded. See readEntry() for
     123             :         // how prefix compression of keys works. Keys in between two restart points
     124             :         // only have a suffix encoded in the block. When restart interval is 1, no
     125             :         // prefix compression of keys happens. This is the case with range tombstone
     126             :         // blocks.
     127             :         //
     128             :         // All restart offsets are listed in increasing order in
     129             :         // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
     130             :         // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
     131             :         // therefore be seen as the point where data in the block ends, and a list
     132             :         // of offsets of all restart points begins.
     133             :         restarts int32
     134             :         // Number of restart points in this block. Encoded at the end of the block
     135             :         // as a uint32.
     136             :         numRestarts int32
     137             :         ptr         unsafe.Pointer // address of data[0]; set by Init and used for raw pointer arithmetic in readEntry
     138             :         data        []byte // the serialized block passed to Init; set to nil by Invalidate
     139             :         // key contains the raw key the iterator is currently pointed at. This may
     140             :         // point directly to data stored in the block (for a key which has no prefix
     141             :         // compression), to fullKey (for a prefix compressed key), or to a slice of
     142             :         // data stored in cachedBuf (during reverse iteration).
     143             :         //
     144             :         // NB: In general, key contains the same logical content as ikey
     145             :         // (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
     146             :         // replacement rule, this will not be the case. Therefore, key should never
     147             :         // be used after ikey is set.
     148             :         key []byte
     149             :         // fullKey is a buffer used for key prefix decompression. Note that if
     150             :         // transforms.SyntheticPrefix is not nil, fullKey always starts with that
     151             :         // prefix.
     152             :         fullKey []byte
     153             :         // val contains the value the iterator is currently pointed at. If non-nil,
     154             :         // this points to a slice of the block data.
     155             :         val []byte
     156             :         // ikv contains the decoded internal KV the iterator is currently positioned
     157             :         // at.
     158             :         //
     159             :         // ikv.InternalKey contains the decoded InternalKey the iterator is
     160             :         // currently pointed at. Note that the memory backing ikv.UserKey is either
     161             :         // data stored directly in the block, fullKey, or cachedBuf. The key
     162             :         // stability guarantee for blocks built with a restart interval of 1 is
     163             :         // achieved by having ikv.UserKey always point to data stored directly in
     164             :         // the block.
     165             :         //
     166             :         // ikv.LazyValue is val turned into a LazyValue, whenever a positioning
     167             :         // method returns a non-nil key-value pair.
     168             :         ikv base.InternalKV
     169             :         // cached and cachedBuf are used during reverse iteration. They are needed
     170             :         // because we can't perform prefix decoding in reverse, only in the forward
     171             :         // direction. In order to iterate in reverse, we decode and cache the entries
     172             :         // between two restart points.
     173             :         //
     174             :         // Note that cached[len(cached)-1] contains the previous entry to the one the
     175             :         // Iter is currently pointed at. As usual, nextOffset will contain the
     176             :         // offset of the next entry. During reverse iteration, nextOffset will be
     177             :         // updated to point to offset, and we'll set the Iter to point at the
     178             :         // entry cached[len(cached)-1]. See Prev() for more details.
     179             :         //
     180             :         // For a block encoded with a restart interval of 1, cached and cachedBuf
     181             :         // will not be used as there are no prefix compressed entries between the
     182             :         // restart points.
     183             :         cached    []blockEntry
     184             :         cachedBuf []byte
     185             :         handle    block.BufferHandle // buffer handle stored by InitHandle and returned by Handle
     186             :         // for block iteration for already loaded blocks.
     187             :         firstUserKey      []byte // user key of the block's first entry, computed by readFirstKey; nil when the block is empty
     188             :         lazyValueHandling struct {
     189             :                 getValue       block.GetLazyValueForPrefixAndValueHandler // installed by SetGetLazyValuer
     190             :                 hasValuePrefix bool // installed by SetHasValuePrefix
     191             :         }
     192             :         synthSuffixBuf            []byte // truncated to length 0 by Init; presumably scratch space for synthetic-suffix keys (confirm at use sites)
     193             :         firstUserKeyWithPrefixBuf []byte // backing storage for firstUserKey when a synthetic prefix is set (see readFirstKey)
     194             : }
     195             : 
     196             : type blockEntry struct { // blockEntry is the element type of Iter.cached, which holds entries decoded for reverse iteration.
     197             :         offset   int32 // presumably the entry's byte offset in the block (cf. Iter.offset); confirm at use sites
     198             :         keyStart int32 // keyStart/keyEnd presumably delimit the cached key bytes; confirm at use sites
     199             :         keyEnd   int32
     200             :         valStart int32 // valStart/valSize presumably locate the entry's value; confirm at use sites
     201             :         valSize  int32
     202             : }
     203             : 
     204             : // Compile-time assertion that *Iter implements the block.DataBlockIterator interface.
     205             : var _ block.DataBlockIterator = (*Iter)(nil)
     206             : 
     207             : // NewIter allocates a new row-oriented block iterator and initializes it over the provided serialized block via Init.
     208             : func NewIter(
     209             :         cmp base.Compare, split base.Split, block []byte, transforms block.IterTransforms,
     210           0 : ) (*Iter, error) {
     211           0 :         i := &Iter{}
     212           0 :         return i, i.Init(cmp, split, block, transforms) // NB: the (non-nil) iterator is returned even when Init reports an error
     213           0 : }
     214             : 
     215             : // String implements fmt.Stringer. It always returns the constant "block".
     216           0 : func (i *Iter) String() string {
     217           0 :         return "block"
     218           0 : }
     219             : 
     220             : // Init initializes the block iterator from the provided block. It returns an error if the block records zero restart points or if readFirstKey rejects the first key.
     221             : func (i *Iter) Init(
     222             :         cmp base.Compare, split base.Split, blk []byte, transforms block.IterTransforms,
     223           1 : ) error {
     224           1 :         numRestarts := int32(binary.LittleEndian.Uint32(blk[len(blk)-4:])) // restart count is the little-endian uint32 in the last 4 bytes; NOTE(review): panics if len(blk) < 4
     225           1 :         if numRestarts == 0 {
     226           0 :                 return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
     227           0 :         }
     228           1 :         i.transforms = transforms
     229           1 :         i.synthSuffixBuf = i.synthSuffixBuf[:0] // keep the buffer's capacity across re-initialization
     230           1 :         i.split = split
     231           1 :         i.cmp = cmp
     232           1 :         i.restarts = int32(len(blk)) - 4*(1+numRestarts) // offset where the restart array begins, i.e. where entry data ends
     233           1 :         i.numRestarts = numRestarts
     234           1 :         i.ptr = unsafe.Pointer(&blk[0])
     235           1 :         i.data = blk
     236           1 :         if i.transforms.SyntheticPrefix.IsSet() {
     237           1 :                 i.fullKey = append(i.fullKey[:0], i.transforms.SyntheticPrefix...) // fullKey always begins with the synthetic prefix; readEntry offsets its shared length accordingly
     238           1 :         } else {
     239           1 :                 i.fullKey = i.fullKey[:0]
     240           1 :         }
     241           1 :         i.val = nil
     242           1 :         i.clearCache()
     243           1 :         if i.restarts > 0 { // at least one byte of entry data precedes the restart array; NOTE(review): a negative restarts (oversized numRestarts) also lands in the else branch
     244           1 :                 if err := i.readFirstKey(); err != nil {
     245           0 :                         return err
     246           0 :                 }
     247           1 :         } else {
     248           1 :                 // Block is empty.
     249           1 :                 i.firstUserKey = nil
     250           1 :         }
     251           1 :         return nil
     252             : }
     253             : 
     254             : // InitHandle initializes an iterator from the provided block handle.
     255             : // NB: two cases of hideObsoletePoints:
     256             : //   - Local sstable iteration: syntheticSeqNum will be set iff the sstable was
     257             : //     ingested.
     258             : //   - Foreign sstable iteration: syntheticSeqNum is always set.
     259             : func (i *Iter) InitHandle(
     260             :         cmp base.Compare, split base.Split, block block.BufferHandle, transforms block.IterTransforms,
     261           1 : ) error {
     262           1 :         i.handle.Release() // release whatever handle the iterator held before storing the new one
     263           1 :         i.handle = block // NB: within the body the parameter shadows the imported block package
     264           1 :         return i.Init(cmp, split, block.BlockData(), transforms) // the handle stays stored in i.handle even if Init returns an error
     265           1 : }
     266             : 
     267             : // SetHasValuePrefix sets whether or not the block iterator should expect values
     268             : // corresponding to Set keys to have a prefix byte.
     269           1 : func (i *Iter) SetHasValuePrefix(hasValuePrefix bool) {
     270           1 :         i.lazyValueHandling.hasValuePrefix = hasValuePrefix // stored only; no other iterator state is touched
     271           1 : }
     272             : 
     273             : // SetGetLazyValuer sets the value block reader the iterator should use to get
     274             : // lazy values when the value encodes a value prefix.
     275           1 : func (i *Iter) SetGetLazyValuer(g block.GetLazyValueForPrefixAndValueHandler) {
     276           1 :         i.lazyValueHandling.getValue = g // stored only; no other iterator state is touched
     277           1 : 
     278           1 : }
     279             : 
     280             : // Handle returns the underlying block buffer handle, if the iterator was
     281             : // initialized with one.
     282           1 : func (i *Iter) Handle() block.BufferHandle {
     283           1 :         return i.handle // in the code visible here, handle is only assigned by InitHandle
     284           1 : }
     285             : 
     286             : // Invalidate invalidates the block iterator, removing references to the block
     287             : // it was initialized with.
     288           1 : func (i *Iter) Invalidate() {
     289           1 :         i.clearCache()
     290           1 :         i.offset = 0
     291           1 :         i.nextOffset = 0
     292           1 :         i.restarts = 0
     293           1 :         i.numRestarts = 0
     294           1 :         i.data = nil // IsDataInvalidated keys off this; NOTE(review): ptr, key, val and handle are not reset here (unless clearCache does so)
     295           1 : }
     296             : 
     297             : // IsDataInvalidated returns true when the Iter has been invalidated
     298             : // using an Invalidate call. NB: this is different from Iter.Valid
     299             : // which is part of the InternalIterator implementation.
     300           1 : func (i *Iter) IsDataInvalidated() bool {
     301           1 :         return i.data == nil // also true for a zero-value Iter and after ResetForReuse, neither of which sets data
     302           1 : }
     303             : 
     304             : // ResetForReuse resets the Iter for reuse, retaining buffers to avoid
     305             : // future allocations.
     306           1 : func (i *Iter) ResetForReuse() {
     307           1 :         fullKey := i.fullKey[:0] // truncate but keep capacity
     308           1 :         cached := i.cached[:0]
     309           1 :         cachedBuf := i.cachedBuf[:0]
     310           1 :         firstUserKeyWithPrefixBuf := i.firstUserKeyWithPrefixBuf[:0]
     311           1 :         *i = Iter{ // every field not listed below is zeroed; NOTE(review): synthSuffixBuf is not retained, and handle is dropped without a Release call here
     312           1 :                 fullKey:                   fullKey,
     313           1 :                 cached:                    cached,
     314           1 :                 cachedBuf:                 cachedBuf,
     315           1 :                 firstUserKeyWithPrefixBuf: firstUserKeyWithPrefixBuf,
     316           1 :         }
     317           1 : }
     318             : 
     319           1 : func (i *Iter) readEntry() { // readEntry decodes the entry at i.offset, setting i.fullKey, i.key, i.val and i.nextOffset.
     320           1 :         ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset)) // address of the entry within the block; the raw reads below are not bounds-checked
     321           1 : 
     322           1 :         // This is an ugly performance hack. Reading entries from blocks is one of
     323           1 :         // the inner-most routines and decoding the 3 varints per-entry takes
     324           1 :         // significant time. Neither go1.11 or go1.12 will inline decodeVarint for
     325           1 :         // us, so we do it manually. This provides a 10-15% performance improvement
     326           1 :         // on blockIter benchmarks on both go1.11 and go1.12.
     327           1 :         //
     328           1 :         // TODO(peter): remove this hack if go:inline is ever supported.
     329           1 : 
     330           1 :         var shared uint32 // first varint: number of leading bytes of the previous fullKey that this key reuses
     331           1 :         if a := *((*uint8)(ptr)); a < 128 { // high bit clear: the varint is a single byte
     332           1 :                 shared = uint32(a)
     333           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     334           1 :         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { // each further branch masks off the continuation bit and reads one more byte
     335           0 :                 shared = uint32(b)<<7 | uint32(a)
     336           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     337           0 :         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     338           0 :                 shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     339           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     340           0 :         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     341           0 :                 shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     342           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     343           0 :         } else { // fifth byte is taken as the last one; its continuation bit is neither masked nor checked
     344           0 :                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     345           0 :                 shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     346           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     347           0 :         }
     348             : 
     349           1 :         var unshared uint32 // second varint: number of key bytes stored in this entry (decoded exactly like shared)
     350           1 :         if a := *((*uint8)(ptr)); a < 128 {
     351           1 :                 unshared = uint32(a)
     352           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     353           1 :         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     354           0 :                 unshared = uint32(b)<<7 | uint32(a)
     355           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     356           0 :         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     357           0 :                 unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     358           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     359           0 :         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     360           0 :                 unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     361           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     362           0 :         } else {
     363           0 :                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     364           0 :                 unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     365           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     366           0 :         }
     367             : 
     368           1 :         var value uint32 // third varint: length of the value in bytes (decoded exactly like shared)
     369           1 :         if a := *((*uint8)(ptr)); a < 128 {
     370           1 :                 value = uint32(a)
     371           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     372           1 :         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     373           1 :                 value = uint32(b)<<7 | uint32(a)
     374           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     375           1 :         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     376           0 :                 value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     377           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     378           0 :         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     379           0 :                 value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     380           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     381           0 :         } else {
     382           0 :                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     383           0 :                 value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     384           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     385           0 :         }
     386           1 :         shared += uint32(len(i.transforms.SyntheticPrefix)) // fullKey begins with the synthetic prefix (see Init), so the reused length is shifted by it
     387           1 :         unsharedKey := getBytes(ptr, int(unshared)) // ptr now addresses the stored key bytes
     388           1 :         // TODO(sumeer): move this into the else block below.
     389           1 :         i.fullKey = append(i.fullKey[:shared], unsharedKey...) // keep the reused bytes of the previous key and append this entry's own bytes
     390           1 :         if shared == 0 { // NB: never true when a non-empty synthetic prefix is set, since shared then includes its length
     391           1 :                 // Provide stability for the key across positioning calls if the key
     392           1 :                 // doesn't share a prefix with the previous key. This removes requiring the
     393           1 :                 // key to be copied if the caller knows the block has a restart interval of
     394           1 :                 // 1. An important example of this is range-del blocks.
     395           1 :                 i.key = unsharedKey
     396           1 :         } else {
     397           1 :                 i.key = i.fullKey
     398           1 :         }
     399           1 :         ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared)) // step over the key bytes to the value
     400           1 :         i.val = getBytes(ptr, int(value))
     401           1 :         i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value) // block offset just past this entry's value
     402             : }
     403             : 
     404           1 : func (i *Iter) readFirstKey() error { // readFirstKey decodes the key of the entry at the start of the block and stores its user-key portion in i.firstUserKey.
     405           1 :         ptr := i.ptr // the first entry starts at the beginning of the block
     406           1 : 
     407           1 :         // This is an ugly performance hack. Reading entries from blocks is one of
     408           1 :         // the inner-most routines and decoding the 3 varints per-entry takes
     409           1 :         // significant time. Neither go1.11 or go1.12 will inline decodeVarint for
     410           1 :         // us, so we do it manually. This provides a 10-15% performance improvement
     411           1 :         // on blockIter benchmarks on both go1.11 and go1.12.
     412           1 :         //
     413           1 :         // TODO(peter): remove this hack if go:inline is ever supported.
     414           1 : 
     415           1 :         if shared := *((*uint8)(ptr)); shared == 0 { // only the first byte is examined: a zero shared length encodes as a single zero byte
     416           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     417           1 :         } else {
     418           0 :                 // The shared length is != 0, which is invalid.
     419           0 :                 panic("first key in block must have zero shared length") // NOTE(review): panics, whereas the too-short-key case below returns a corruption error
     420             :         }
     421             : 
     422           1 :         var unshared uint32 // varint: number of key bytes stored for the first entry
     423           1 :         if a := *((*uint8)(ptr)); a < 128 {
     424           1 :                 unshared = uint32(a)
     425           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     426           1 :         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     427           0 :                 unshared = uint32(b)<<7 | uint32(a)
     428           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     429           0 :         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     430           0 :                 unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     431           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     432           0 :         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     433           0 :                 unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     434           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     435           0 :         } else {
     436           0 :                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     437           0 :                 unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     438           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     439           0 :         }
     440             : 
     441             :         // Skip the value length. Only the continuation bits are inspected; the decoded length is not needed here.
     442           1 :         if a := *((*uint8)(ptr)); a < 128 {
     443           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     444           1 :         } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
     445           1 :                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     446           1 :         } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
     447           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     448           0 :         } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
     449           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     450           0 :         } else {
     451           0 :                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     452           0 :         }
     453             : 
     454           1 :         firstKey := getBytes(ptr, int(unshared)) // the stored key bytes of the first entry
     455           1 :         // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
     456           1 :         // BlockIter benchmarks.
     457           1 :         if n := len(firstKey) - 8; n >= 0 { // the final 8 bytes of an encoded key are its trailer (cf. decodeInternalKey)
     458           1 :                 i.firstUserKey = firstKey[:n:n] // capacity is capped at n so an append cannot write into the bytes that follow
     459           1 :         } else {
     460           0 :                 i.firstUserKey = nil
     461           0 :                 return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
     462           0 :         }
     463           1 :         if i.transforms.SyntheticPrefix != nil { // NOTE(review): Init tests SyntheticPrefix.IsSet() instead of != nil; confirm the two agree for an empty, non-nil prefix
     464           1 :                 i.firstUserKeyWithPrefixBuf = slices.Grow(i.firstUserKeyWithPrefixBuf[:0], len(i.transforms.SyntheticPrefix)+len(i.firstUserKey))
     465           1 :                 i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.transforms.SyntheticPrefix...)
     466           1 :                 i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.firstUserKey...)
     467           1 :                 i.firstUserKey = i.firstUserKeyWithPrefixBuf // firstUserKey now refers to the reusable buffer rather than to block data
     468           1 :         }
     469           1 :         return nil
     470             : }
     471             : 
     472           1 : func (i *Iter) decodeInternalKey(key []byte) (hiddenPoint bool) {
     473           1 :         // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
     474           1 :         // BlockIter benchmarks.
     475           1 :         if n := len(key) - 8; n >= 0 {
     476           1 :                 trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(key[n:]))
     477           1 :                 hiddenPoint = i.transforms.HideObsoletePoints &&
     478           1 :                         (trailer&TrailerObsoleteBit != 0)
     479           1 :                 i.ikv.K.Trailer = trailer & TrailerObsoleteMask
     480           1 :                 i.ikv.K.UserKey = key[:n:n]
     481           1 :                 if n := i.transforms.SyntheticSeqNum; n != 0 {
     482           1 :                         i.ikv.K.SetSeqNum(base.SeqNum(n))
     483           1 :                 }
     484           1 :         } else {
     485           1 :                 i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
     486           1 :                 i.ikv.K.UserKey = nil
     487           1 :         }
     488           1 :         return hiddenPoint
     489             : }
     490             : 
     491             : // maybeReplaceSuffix replaces the suffix in i.ikv.K.UserKey with
     492             : // i.transforms.SyntheticSuffix.
     493           1 : func (i *Iter) maybeReplaceSuffix() {
     494           1 :         if i.transforms.SyntheticSuffix.IsSet() && i.ikv.K.UserKey != nil {
     495           1 :                 prefixLen := i.split(i.ikv.K.UserKey)
     496           1 :                 // If ikey is cached or may get cached, we must copy
     497           1 :                 // UserKey to a new buffer before suffix replacement.
     498           1 :                 i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
     499           1 :                 i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
     500           1 :                 i.ikv.K.UserKey = i.synthSuffixBuf
     501           1 :         }
     502             : }
     503             : 
     504           1 : func (i *Iter) clearCache() {
     505           1 :         i.cached = i.cached[:0]
     506           1 :         i.cachedBuf = i.cachedBuf[:0]
     507           1 : }
     508             : 
     509           1 : func (i *Iter) cacheEntry() {
     510           1 :         var valStart int32
     511           1 :         valSize := int32(len(i.val))
     512           1 :         if valSize > 0 {
     513           1 :                 valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
     514           1 :         }
     515             : 
     516           1 :         i.cached = append(i.cached, blockEntry{
     517           1 :                 offset:   i.offset,
     518           1 :                 keyStart: int32(len(i.cachedBuf)),
     519           1 :                 keyEnd:   int32(len(i.cachedBuf) + len(i.key)),
     520           1 :                 valStart: valStart,
     521           1 :                 valSize:  valSize,
     522           1 :         })
     523           1 :         i.cachedBuf = append(i.cachedBuf, i.key...)
     524             : }
     525             : 
     526             : // IsLowerBound implements the block.DataBlockIterator interface.
     527           1 : func (i *Iter) IsLowerBound(k []byte) bool {
     528           1 :         // Note: we ignore HideObsoletePoints, but false negatives are allowed.
     529           1 :         return i.cmp(i.firstUserKey, k) >= 0
     530           1 : }
     531             : 
     532             : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
     533             : // package.
     534           1 : func (i *Iter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
     535           1 :         if invariants.Enabled && i.IsDataInvalidated() {
     536           0 :                 panic(errors.AssertionFailedf("invalidated blockIter used"))
     537             :         }
     538           1 :         searchKey := key
     539           1 :         if i.transforms.SyntheticPrefix != nil {
     540           1 :                 if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
     541           0 :                         // The seek key is before or after the entire block of keys that start
     542           0 :                         // with SyntheticPrefix. To determine which, we need to compare against a
     543           0 :                         // valid key in the block. We use firstUserKey which has the synthetic
     544           0 :                         // prefix.
     545           0 :                         if i.cmp(i.firstUserKey, key) >= 0 {
     546           0 :                                 return i.First()
     547           0 :                         }
     548             :                         // Set the offset to the end of the block to mimic the offset of an
     549             :                         // invalid iterator. This ensures a subsequent i.Prev() returns a valid
     550             :                         // result.
     551           0 :                         i.offset = i.restarts
     552           0 :                         i.nextOffset = i.restarts
     553           0 :                         return nil
     554             :                 }
     555           1 :                 searchKey = key[len(i.transforms.SyntheticPrefix):]
     556             :         }
     557             : 
     558           1 :         i.clearCache()
     559           1 :         // Find the index of the smallest restart point whose key is >= the key
     560           1 :         // sought; index will be numRestarts if there is no such restart point.
     561           1 :         i.offset = 0
     562           1 :         var index int32
     563           1 : 
     564           1 :         {
     565           1 :                 // NB: manually inlined sort.Search is ~5% faster.
     566           1 :                 //
     567           1 :                 // Define f(-1) == false and f(n) == true.
     568           1 :                 // Invariant: f(index-1) == false, f(upper) == true.
     569           1 :                 upper := i.numRestarts
     570           1 :                 for index < upper {
     571           1 :                         h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
     572           1 :                         // index ≤ h < upper
     573           1 :                         offset := decodeRestart(i.data[i.restarts+4*h:])
     574           1 :                         // For a restart point, there are 0 bytes shared with the previous key.
     575           1 :                         // The varint encoding of 0 occupies 1 byte.
     576           1 :                         ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
     577           1 : 
     578           1 :                         // Decode the key at that restart point, and compare it to the key
     579           1 :                         // sought. See the comment in readEntry for why we manually inline the
     580           1 :                         // varint decoding.
     581           1 :                         var v1 uint32
     582           1 :                         if a := *((*uint8)(ptr)); a < 128 {
     583           1 :                                 v1 = uint32(a)
     584           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     585           1 :                         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     586           0 :                                 v1 = uint32(b)<<7 | uint32(a)
     587           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     588           0 :                         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     589           0 :                                 v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     590           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     591           0 :                         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     592           0 :                                 v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     593           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     594           0 :                         } else {
     595           0 :                                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     596           0 :                                 v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     597           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     598           0 :                         }
     599             : 
     600           1 :                         if *((*uint8)(ptr)) < 128 {
     601           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     602           1 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
     603           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     604           0 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
     605           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     606           0 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
     607           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     608           0 :                         } else {
     609           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     610           0 :                         }
     611             : 
     612             :                         // Manually inlining part of base.DecodeInternalKey provides a 5-10%
     613             :                         // speedup on BlockIter benchmarks.
     614           1 :                         s := getBytes(ptr, int(v1))
     615           1 :                         var k []byte
     616           1 :                         if n := len(s) - 8; n >= 0 {
     617           1 :                                 k = s[:n:n]
     618           1 :                         }
     619             :                         // Else k is invalid, and left as nil
     620             : 
     621           1 :                         if i.cmp(searchKey, k) > 0 {
     622           1 :                                 // The search key is greater than the user key at this restart point.
     623           1 :                                 // Search beyond this restart point, since we are trying to find the
     624           1 :                                 // first restart point with a user key >= the search key.
     625           1 :                                 index = h + 1 // preserves f(i-1) == false
     626           1 :                         } else {
     627           1 :                                 // k >= search key, so prune everything after index (since index
     628           1 :                                 // satisfies the property we are looking for).
     629           1 :                                 upper = h // preserves f(j) == true
     630           1 :                         }
     631             :                 }
     632             :                 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
     633             :                 // => answer is index.
     634             :         }
     635             : 
     636             :         // index is the first restart point with key >= search key. Define the keys
     637             :         // between a restart point and the next restart point as belonging to that
     638             :         // restart point.
     639             :         //
     640             :         // Since keys are strictly increasing, if index > 0 then the restart point
     641             :         // at index-1 will be the first one that has some keys belonging to it that
     642             :         // could be equal to the search key.  If index == 0, then all keys in this
     643             :         // block are larger than the key sought, and offset remains at zero.
     644           1 :         if index > 0 {
     645           1 :                 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
     646           1 :         }
     647           1 :         i.readEntry()
     648           1 :         hiddenPoint := i.decodeInternalKey(i.key)
     649           1 : 
     650           1 :         // Iterate from that restart point to somewhere >= the key sought.
     651           1 :         if !i.Valid() {
     652           0 :                 return nil
     653           0 :         }
     654             : 
     655             :         // A note on seeking in a block with a suffix replacement rule: even though
     656             :         // the binary search above was conducted on keys without suffix replacement,
     657             :         // Seek will still return the correct suffix replaced key. A binary
     658             :         // search without suffix replacement will land on a key that is _less_ than
     659             :         // the key the search would have landed on if all keys were already suffix
     660             :         // replaced. Since Seek then conducts forward iteration to the first suffix
     661             :         // replaced user key that is greater than or equal to the search key, the
     662             :         // correct key is still returned.
     663             :         //
     664             :         // As an example, consider the following block with a restart interval of 1,
     665             :         // with a replacement suffix of "4":
     666             :         // - Pre-suffix replacement: apple@1, banana@3
     667             :         // - Post-suffix replacement: apple@4, banana@4
     668             :         //
     669             :         // Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
     670             :         // chronological order (i.e. apple@1>apple@3), the binary search without
     671             :         // suffix replacement would return apple@1. A binary search with suffix
     672             :         // replacement would return banana@4. After beginning forward iteration from
     673             :         // either returned restart point, forward iteration would
     674             :         // always return the correct key, banana@4.
     675             :         //
     676             :         // Further, if the user searched with apple@0 (i.e. a suffix less than the
     677             :         // pre replacement suffix) or with apple@5 (a suffix larger than the post
     678             :         // replacement suffix), the binary search with or without suffix replacement
     679             :         // would land on the same key, as we assume the following:
     680             :         // (1) no two keys in the sst share the same prefix.
     681             :         // (2) pebble.Compare(replacementSuffix,originalSuffix) > 0
     682             : 
     683           1 :         i.maybeReplaceSuffix()
     684           1 : 
     685           1 :         if !hiddenPoint && i.cmp(i.ikv.K.UserKey, key) >= 0 {
     686           1 :                 // Initialize i.ikv.V
     687           1 :                 if !i.lazyValueHandling.hasValuePrefix ||
     688           1 :                         i.ikv.K.Kind() != base.InternalKeyKindSet {
     689           1 :                         i.ikv.V = base.MakeInPlaceValue(i.val)
     690           1 :                 } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
     691           1 :                         i.ikv.V = base.MakeInPlaceValue(i.val[1:])
     692           1 :                 } else {
     693           1 :                         i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
     694           1 :                 }
     695           1 :                 return &i.ikv
     696             :         }
     697           1 :         for i.Next(); i.Valid(); i.Next() {
     698           1 :                 if i.cmp(i.ikv.K.UserKey, key) >= 0 {
     699           1 :                         // i.Next() has already initialized i.ikv.LazyValue.
     700           1 :                         return &i.ikv
     701           1 :                 }
     702             :         }
     703           1 :         return nil
     704             : }
     705             : 
     706             : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
     707             : // pebble package.
     708           0 : func (i *Iter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
     709           0 :         // This should never be called as prefix iteration is handled by sstable.Iterator.
     710           0 :         panic("pebble: SeekPrefixGE unimplemented")
     711             : }
     712             : 
     713             : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
     714             : // package.
     715           1 : func (i *Iter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
     716           1 :         if invariants.Enabled && i.IsDataInvalidated() {
     717           0 :                 panic(errors.AssertionFailedf("invalidated blockIter used"))
     718             :         }
     719           1 :         searchKey := key
     720           1 :         if i.transforms.SyntheticPrefix != nil {
     721           1 :                 if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
     722           1 :                         // The seek key is before or after the entire block of keys that start
     723           1 :                         // with SyntheticPrefix. To determine which, we need to compare against a
     724           1 :                         // valid key in the block. We use firstUserKey which has the synthetic
     725           1 :                         // prefix.
     726           1 :                         if i.cmp(i.firstUserKey, key) < 0 {
     727           1 :                                 return i.Last()
     728           1 :                         }
     729             :                         // Set the offset to the beginning of the block to mimic an exhausted
     730             :                         // iterator that has conducted backward iteration. This ensures a
     731             :                         // subsequent Next() call returns the first key in the block.
     732           1 :                         i.offset = -1
     733           1 :                         i.nextOffset = 0
     734           1 :                         return nil
     735             :                 }
     736           1 :                 searchKey = key[len(i.transforms.SyntheticPrefix):]
     737             :         }
     738             : 
     739           1 :         i.clearCache()
     740           1 :         // Find the index of the smallest restart point whose key is >= the key
     741           1 :         // sought; index will be numRestarts if there is no such restart point.
     742           1 :         i.offset = 0
     743           1 :         var index int32
     744           1 : 
     745           1 :         {
     746           1 :                 // NB: manually inlined sort.Search is ~5% faster.
     747           1 :                 //
     748           1 :                 // Define f(-1) == false and f(n) == true.
     749           1 :                 // Invariant: f(index-1) == false, f(upper) == true.
     750           1 :                 upper := i.numRestarts
     751           1 :                 for index < upper {
     752           1 :                         h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
     753           1 :                         // index ≤ h < upper
     754           1 :                         offset := decodeRestart(i.data[i.restarts+4*h:])
     755           1 :                         // For a restart point, there are 0 bytes shared with the previous key.
     756           1 :                         // The varint encoding of 0 occupies 1 byte.
     757           1 :                         ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
     758           1 : 
     759           1 :                         // Decode the key at that restart point, and compare it to the key
     760           1 :                         // sought. See the comment in readEntry for why we manually inline the
     761           1 :                         // varint decoding.
     762           1 :                         var v1 uint32
     763           1 :                         if a := *((*uint8)(ptr)); a < 128 {
     764           1 :                                 v1 = uint32(a)
     765           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     766           1 :                         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
     767           0 :                                 v1 = uint32(b)<<7 | uint32(a)
     768           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     769           0 :                         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
     770           0 :                                 v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     771           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     772           0 :                         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
     773           0 :                                 v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     774           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     775           0 :                         } else {
     776           0 :                                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
     777           0 :                                 v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
     778           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     779           0 :                         }
     780             : 
     781           1 :                         if *((*uint8)(ptr)) < 128 {
     782           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 1)
     783           1 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
     784           1 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 2)
     785           1 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
     786           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 3)
     787           0 :                         } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
     788           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 4)
     789           0 :                         } else {
     790           0 :                                 ptr = unsafe.Pointer(uintptr(ptr) + 5)
     791           0 :                         }
     792             : 
     793             :                         // Manually inlining part of base.DecodeInternalKey provides a 5-10%
     794             :                         // speedup on BlockIter benchmarks.
     795           1 :                         s := getBytes(ptr, int(v1))
     796           1 :                         var k []byte
     797           1 :                         if n := len(s) - 8; n >= 0 {
     798           1 :                                 k = s[:n:n]
     799           1 :                         }
     800             :                         // Else k is invalid, and left as nil
     801             : 
     802           1 :                         if i.cmp(searchKey, k) > 0 {
     803           1 :                                 // The search key is greater than the user key at this restart point.
     804           1 :                                 // Search beyond this restart point, since we are trying to find the
     805           1 :                                 // first restart point with a user key >= the search key.
     806           1 :                                 index = h + 1 // preserves f(i-1) == false
     807           1 :                         } else {
     808           1 :                                 // k >= search key, so prune everything after index (since index
     809           1 :                                 // satisfies the property we are looking for).
     810           1 :                                 upper = h // preserves f(j) == true
     811           1 :                         }
     812             :                 }
     813             :                 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
     814             :                 // => answer is index.
     815             :         }
     816             : 
     817           1 :         if index == 0 {
     818           1 :                 if i.transforms.SyntheticSuffix.IsSet() {
     819           1 :                         // The binary search was conducted on keys without suffix replacement,
     820           1 :                         // implying the first key in the block may be less than the search key. To
     821           1 :                         // double check, get the first key in the block with suffix replacement
     822           1 :                         // and compare to the search key. Consider the following example: suppose
     823           1 :                         // the user searches with a@3, the first key in the block is a@2 and the
     824           1 :                         // block contains a suffix replacement rule of 4. Since a@3 sorts before
     825           1 :                         // a@2, the binary search would return index==0. Without conducting the
     826           1 :                         // suffix replacement, the SeekLT would incorrectly return nil. With
     827           1 :                         // suffix replacement though, a@4 should be returned as a@4 sorts before
     828           1 :                         // a@3.
     829           1 :                         ikv := i.First()
     830           1 :                         if i.cmp(ikv.K.UserKey, key) < 0 {
     831           1 :                                 return ikv
     832           1 :                         }
     833             :                 }
     834             :                 // If index == 0 then all keys in this block are larger than the key
     835             :                 // sought, so there is no match.
     836           1 :                 i.offset = -1
     837           1 :                 i.nextOffset = 0
     838           1 :                 return nil
     839             :         }
     840             : 
     841             :         // INVARIANT: index > 0
     842             : 
     843             :         // Ignoring suffix replacement, index is the first restart point with key >=
     844             :         // search key. Define the keys between a restart point and the next restart
     845             :         // point as belonging to that restart point. Note that index could be equal to
     846             :         // i.numRestarts, i.e., we are past the last restart.  Since keys are strictly
     847             :         // increasing, then the restart point at index-1 will be the first one that
     848             :         // has some keys belonging to it that are less than the search key.
     849             :         //
     850             :         // Next, we will search between the restart at index-1 and the restart point
     851             :         // at index, for the first key >= key, and then on finding it, return
     852             :         // i.Prev(). We need to know when we have hit the offset for index, since then
     853             :         // we can stop searching. targetOffset encodes that offset for index.
     854           1 :         targetOffset := i.restarts
     855           1 :         i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
     856           1 :         if index < i.numRestarts {
     857           1 :                 targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
     858           1 : 
     859           1 :                 if i.transforms.SyntheticSuffix.IsSet() {
     860           0 :                         // The binary search was conducted on keys without suffix replacement,
     861           0 :                         // implying the returned restart point (index) may be less than the search
     862           0 :                         // key, breaking the assumption described above.
     863           0 :                         //
     864           0 :                         // For example: consider this block with a replacement ts of 4, and
     865           0 :                         // restart interval of 1: - pre replacement: a@3,b@2,c@3 - post
     866           0 :                         // replacement: a@4,b@4,c@4
     867           0 :                         //
     868           0 :                         // Suppose the client calls SeekLT(b@3), SeekLT must return b@4.
     869           0 :                         //
     870           0 :                         // If the client calls SeekLT(b@3), the binary search would return b@2,
     871           0 :                         // the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
     872           0 :                         // begin forward iteration from a@3, the previous restart point, to
     873           0 :                         // b{suffix}. The iteration stops when it encounters a key geq to the
     874           0 :                         // search key or if it reaches the upper bound. Without suffix
     875           0 :                         // replacement, we can assume that the upper bound of this forward
     876           0 :                         // iteration, b{suffix}, is greater than the search key, as implied by the
     877           0 :                         // binary search.
     878           0 :                         //
     879           0 :                         // If we naively hold this assumption with suffix replacement, the
     880           0 :                         // iteration would terminate at the upper bound, b@4, call i.Prev, and
     881           0 :                         // incorrectly return a@4. To correct for this, if the original returned
     882           0 :                         // index is less than the search key, shift our forward iteration to begin
     883           0 :                         // at index instead of index -1. With suffix replacement the key at index
     884           0 :                         // is guaranteed to be the highest restart point less than the search key
     885           0 :                         // (i.e. the same property of index-1 for a block without suffix
     886           0 :                         // replacement). This property holds because of the invariant that a block
     887           0 :                         // with suffix replacement will not have two keys that share the same
     888           0 :                         // prefix. To consider the above example, binary searching with b@3 landed
     889           0 :                         // naively at a@3, but since b@4<b@3, we shift our forward iteration to
     890           0 :                         // begin at b@4. We never need to shift by more than one restart point
     891           0 :                         // (i.e. to c@4) because it's impossible for the search key to be greater
     892           0 :                         // than the key at the next restart point in the block because that
     893           0 :                         // key will always have a different prefix. Put another way, because no
     894           0 :                         // key in the block shares the same prefix, naive binary search should
     895           0 :                         // always land at most 1 restart point off the correct one.
     896           0 : 
     897           0 :                         naiveOffset := i.offset
     898           0 :                         // Shift up to the original binary search result and decode the key.
     899           0 :                         i.offset = targetOffset
     900           0 :                         i.readEntry()
     901           0 :                         i.decodeInternalKey(i.key)
     902           0 :                         i.maybeReplaceSuffix()
     903           0 : 
     904           0 :                         // If the binary search point is actually less than the search key, post
     905           0 :                         // replacement, bump the target offset.
     906           0 :                         if i.cmp(i.ikv.K.UserKey, key) < 0 {
     907           0 :                                 i.offset = targetOffset
     908           0 :                                 if index+1 < i.numRestarts {
     909           0 :                                         // if index+1 is within the i.data bounds, use it to find the target
     910           0 :                                         // offset.
     911           0 :                                         targetOffset = decodeRestart(i.data[i.restarts+4*(index+1):])
     912           0 :                                 } else {
     913           0 :                                         targetOffset = i.restarts
     914           0 :                                 }
     915           0 :                         } else {
     916           0 :                                 i.offset = naiveOffset
     917           0 :                         }
     918             :                 }
     919             :         }
     920             : 
     921             :         // Init nextOffset for the forward iteration below.
     922           1 :         i.nextOffset = i.offset
     923           1 : 
     924           1 :         for {
     925           1 :                 i.offset = i.nextOffset
     926           1 :                 i.readEntry()
     927           1 :                 // When hidden keys are common, there is additional optimization possible
     928           1 :                 // by not caching entries that are hidden (note that some calls to
     929           1 :                 // cacheEntry don't decode the internal key before caching, but checking
     930           1 :                 // whether a key is hidden does not require full decoding). However, we do
     931           1 :                 // need to use the blockEntry.offset in the cache for the first entry at
     932           1 :                 // the restart point to do the binary search when the cache is empty -- so
     933           1 :                 // we would need to cache that first entry (though not the key) even if
     934           1 :                 // it was hidden. Our current assumption is that if there are large numbers
     935           1 :                 // of hidden keys we will be able to skip whole blocks (using block
     936           1 :                 // property filters) so we don't bother optimizing.
     937           1 :                 hiddenPoint := i.decodeInternalKey(i.key)
     938           1 :                 i.maybeReplaceSuffix()
     939           1 : 
     940           1 :                 // NB: we don't use the hiddenPoint return value of decodeInternalKey
     941           1 :                 // since we want to stop as soon as we reach a key >= ikey.UserKey, so
     942           1 :                 // that we can reverse.
     943           1 :                 if i.cmp(i.ikv.K.UserKey, key) >= 0 {
     944           1 :                         // The current key is greater than or equal to our search key. Back up to
     945           1 :                         // the previous key which was less than our search key. Note that this for
     946           1 :                         // loop will execute at least once with this if-block not being true, so
     947           1 :                         // the key we are backing up to is the last one this loop cached.
     948           1 :                         return i.Prev()
     949           1 :                 }
     950             : 
     951           1 :                 if i.nextOffset >= targetOffset {
     952           1 :                         // We've reached the end of the current restart block. Return the
     953           1 :                         // current key if not hidden, else call Prev().
     954           1 :                         //
     955           1 :                         // When the restart interval is 1, the first iteration of the for loop
     956           1 :                         // will bring us here. In that case ikey is backed by the block so we
     957           1 :                         // get the desired key stability guarantee for the lifetime of the
     958           1 :                         // blockIter. That is, we never cache anything and therefore never
     959           1 :                         // return a key backed by cachedBuf.
     960           1 :                         if hiddenPoint {
     961           1 :                                 return i.Prev()
     962           1 :                         }
     963           1 :                         break
     964             :                 }
     965           1 :                 i.cacheEntry()
     966             :         }
     967             : 
     968           1 :         if !i.Valid() {
     969           1 :                 return nil
     970           1 :         }
     971           1 :         if !i.lazyValueHandling.hasValuePrefix ||
     972           1 :                 i.ikv.K.Kind() != base.InternalKeyKindSet {
     973           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val)
     974           1 :         } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
     975           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val[1:])
     976           1 :         } else {
     977           1 :                 i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
     978           1 :         }
     979           1 :         return &i.ikv
     980             : }
     981             : 
     982             : // First implements internalIterator.First, as documented in the pebble
     983             : // package.
     984           1 : func (i *Iter) First() *base.InternalKV {
     985           1 :         if invariants.Enabled && i.IsDataInvalidated() {
     986           0 :                 panic(errors.AssertionFailedf("invalidated blockIter used"))
     987             :         }
     988             : 
     989           1 :         i.offset = 0
     990           1 :         if !i.Valid() {
     991           1 :                 return nil
     992           1 :         }
     993           1 :         i.clearCache()
     994           1 :         i.readEntry()
     995           1 :         hiddenPoint := i.decodeInternalKey(i.key)
     996           1 :         if hiddenPoint {
     997           1 :                 return i.Next()
     998           1 :         }
     999           1 :         i.maybeReplaceSuffix()
    1000           1 :         if !i.lazyValueHandling.hasValuePrefix ||
    1001           1 :                 i.ikv.K.Kind() != base.InternalKeyKindSet {
    1002           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val)
    1003           1 :         } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
    1004           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val[1:])
    1005           1 :         } else {
    1006           1 :                 i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
    1007           1 :         }
    1008           1 :         return &i.ikv
    1009             : }
    1010             : 
    1011             : const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111
    1012             : const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000
    1013             : 
    1014           1 : func decodeRestart(b []byte) int32 {
    1015           1 :         _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
    1016           1 :         return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
    1017           1 :                 uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
    1018           1 : }
    1019             : 
    1020             : // Last implements internalIterator.Last, as documented in the pebble package.
    1021           1 : func (i *Iter) Last() *base.InternalKV {
    1022           1 :         if invariants.Enabled && i.IsDataInvalidated() {
    1023           0 :                 panic(errors.AssertionFailedf("invalidated blockIter used"))
    1024             :         }
    1025             : 
    1026             :         // Seek forward from the last restart point.
    1027           1 :         i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
    1028           1 :         if !i.Valid() {
    1029           1 :                 return nil
    1030           1 :         }
    1031             : 
    1032           1 :         i.readEntry()
    1033           1 :         i.clearCache()
    1034           1 : 
    1035           1 :         for i.nextOffset < i.restarts {
    1036           1 :                 i.cacheEntry()
    1037           1 :                 i.offset = i.nextOffset
    1038           1 :                 i.readEntry()
    1039           1 :         }
    1040             : 
    1041           1 :         hiddenPoint := i.decodeInternalKey(i.key)
    1042           1 :         if hiddenPoint {
    1043           1 :                 return i.Prev()
    1044           1 :         }
    1045           1 :         i.maybeReplaceSuffix()
    1046           1 :         if !i.lazyValueHandling.hasValuePrefix ||
    1047           1 :                 i.ikv.K.Kind() != base.InternalKeyKindSet {
    1048           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val)
    1049           1 :         } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
    1050           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val[1:])
    1051           1 :         } else {
    1052           1 :                 i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
    1053           1 :         }
    1054           1 :         return &i.ikv
    1055             : }
    1056             : 
    1057             : // Next implements internalIterator.Next, as documented in the pebble
    1058             : // package.
    1059           1 : func (i *Iter) Next() *base.InternalKV {
    1060           1 :         if len(i.cachedBuf) > 0 {
    1061           1 :                 // We're switching from reverse iteration to forward iteration. We need to
    1062           1 :                 // populate i.fullKey with the current key we're positioned at so that
    1063           1 :                 // readEntry() can use i.fullKey for key prefix decompression. Note that we
    1064           1 :                 // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
    1065           1 :                 // SeekLT was the previous call, i.key may be backed by i.fullKey), but
    1066           1 :                 // copying into i.fullKey works for both cases.
    1067           1 :                 //
    1068           1 :                 // TODO(peter): Rather than clearing the cache, we could instead use the
    1069           1 :                 // cache until it is exhausted. This would likely be faster than falling
    1070           1 :                 // through to the normal forward iteration code below.
    1071           1 :                 i.fullKey = append(i.fullKey[:0], i.key...)
    1072           1 :                 i.clearCache()
    1073           1 :         }
    1074             : 
    1075             : start:
    1076           1 :         i.offset = i.nextOffset
    1077           1 :         if !i.Valid() {
    1078           1 :                 return nil
    1079           1 :         }
    1080           1 :         i.readEntry()
    1081           1 :         // Manually inlined version of i.decodeInternalKey(i.key).
    1082           1 :         if n := len(i.key) - 8; n >= 0 {
    1083           1 :                 trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(i.key[n:]))
    1084           1 :                 hiddenPoint := i.transforms.HideObsoletePoints &&
    1085           1 :                         (trailer&TrailerObsoleteBit != 0)
    1086           1 :                 i.ikv.K.Trailer = trailer & TrailerObsoleteMask
    1087           1 :                 i.ikv.K.UserKey = i.key[:n:n]
    1088           1 :                 if n := i.transforms.SyntheticSeqNum; n != 0 {
    1089           1 :                         i.ikv.K.SetSeqNum(base.SeqNum(n))
    1090           1 :                 }
    1091           1 :                 if hiddenPoint {
    1092           1 :                         goto start
    1093             :                 }
    1094           1 :                 if i.transforms.SyntheticSuffix.IsSet() {
    1095           1 :                         // Inlined version of i.maybeReplaceSuffix()
    1096           1 :                         prefixLen := i.split(i.ikv.K.UserKey)
    1097           1 :                         i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
    1098           1 :                         i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
    1099           1 :                         i.ikv.K.UserKey = i.synthSuffixBuf
    1100           1 :                 }
    1101           0 :         } else {
    1102           0 :                 i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
    1103           0 :                 i.ikv.K.UserKey = nil
    1104           0 :         }
    1105           1 :         if !i.lazyValueHandling.hasValuePrefix ||
    1106           1 :                 i.ikv.K.Kind() != base.InternalKeyKindSet {
    1107           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val)
    1108           1 :         } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
    1109           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val[1:])
    1110           1 :         } else {
    1111           1 :                 i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
    1112           1 :         }
    1113           1 :         return &i.ikv
    1114             : }
    1115             : 
    1116             : // NextPrefix implements (base.InternalIterator).NextPrefix.
    1117           1 : func (i *Iter) NextPrefix(succKey []byte) *base.InternalKV {
    1118           1 :         if i.lazyValueHandling.hasValuePrefix {
    1119           1 :                 return i.nextPrefixV3(succKey)
    1120           1 :         }
    1121           1 :         const nextsBeforeSeek = 3
    1122           1 :         kv := i.Next()
    1123           1 :         for j := 1; kv != nil && i.cmp(kv.K.UserKey, succKey) < 0; j++ {
    1124           1 :                 if j >= nextsBeforeSeek {
    1125           1 :                         return i.SeekGE(succKey, base.SeekGEFlagsNone)
    1126           1 :                 }
    1127           1 :                 kv = i.Next()
    1128             :         }
    1129           1 :         return kv
    1130             : }
    1131             : 
    1132           1 : func (i *Iter) nextPrefixV3(succKey []byte) *base.InternalKV {
    1133           1 :         // Doing nexts that involve a key comparison can be expensive (and the cost
    1134           1 :         // depends on the key length), so we use the same threshold of 3 that we use
    1135           1 :         // for TableFormatPebblev2 in blockIter.nextPrefix above. The next fast path
    1136           1 :         // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
    1137           1 :         // than doing a SeekGE within the block, so we do this 16 times
    1138           1 :         // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
    1139           1 :         // search for the restart consumes > 100ns. If the number of versions is >
    1140           1 :         // 17, we will increment nextFastCount to 17, then do a binary search, and
    1141           1 :         // on average need to find a key between two restarts, so another 8 steps
    1142           1 :         // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
    1143           1 :         // steps.
    1144           1 :         //
    1145           1 :         // TODO(sumeer): use the configured restartInterval for the sstable when it
    1146           1 :         // was written (which we don't currently store) instead of the default value
    1147           1 :         // of 16.
    1148           1 :         const nextCmpThresholdBeforeSeek = 3
    1149           1 :         const nextFastThresholdBeforeRestarts = 16
    1150           1 :         nextCmpCount := 0
    1151           1 :         nextFastCount := 0
    1152           1 :         usedRestarts := false
    1153           1 :         // INVARIANT: blockIter is valid.
    1154           1 :         if invariants.Enabled && !i.Valid() {
    1155           0 :                 panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
    1156             :         }
    1157           1 :         prevKeyIsSet := i.ikv.Kind() == base.InternalKeyKindSet
    1158           1 :         for {
    1159           1 :                 i.offset = i.nextOffset
    1160           1 :                 if !i.Valid() {
    1161           1 :                         return nil
    1162           1 :                 }
    1163             :                 // Need to decode the length integers, so we can compute nextOffset.
    1164           1 :                 ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
    1165           1 :                 // This is an ugly performance hack. Reading entries from blocks is one of
    1166           1 :                 // the inner-most routines and decoding the 3 varints per-entry takes
    1167           1 :                 // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
    1168           1 :                 // us, so we do it manually. This provides a 10-15% performance improvement
    1169           1 :                 // on blockIter benchmarks on both go1.11 and go1.12.
    1170           1 :                 //
    1171           1 :                 // TODO(peter): remove this hack if go:inline is ever supported.
    1172           1 : 
    1173           1 :                 // Decode the shared key length integer.
    1174           1 :                 var shared uint32
    1175           1 :                 if a := *((*uint8)(ptr)); a < 128 {
    1176           1 :                         shared = uint32(a)
    1177           1 :                         ptr = unsafe.Pointer(uintptr(ptr) + 1)
    1178           1 :                 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
    1179           0 :                         shared = uint32(b)<<7 | uint32(a)
    1180           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 2)
    1181           0 :                 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
    1182           0 :                         shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1183           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 3)
    1184           0 :                 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
    1185           0 :                         shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1186           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 4)
    1187           0 :                 } else {
    1188           0 :                         d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
    1189           0 :                         shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1190           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 5)
    1191           0 :                 }
    1192             :                 // Decode the unshared key length integer.
    1193           1 :                 var unshared uint32
    1194           1 :                 if a := *((*uint8)(ptr)); a < 128 {
    1195           1 :                         unshared = uint32(a)
    1196           1 :                         ptr = unsafe.Pointer(uintptr(ptr) + 1)
    1197           1 :                 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
    1198           0 :                         unshared = uint32(b)<<7 | uint32(a)
    1199           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 2)
    1200           0 :                 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
    1201           0 :                         unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1202           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 3)
    1203           0 :                 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
    1204           0 :                         unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1205           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 4)
    1206           0 :                 } else {
    1207           0 :                         d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
    1208           0 :                         unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1209           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 5)
    1210           0 :                 }
    1211             :                 // Decode the value length integer.
    1212           1 :                 var value uint32
    1213           1 :                 if a := *((*uint8)(ptr)); a < 128 {
    1214           1 :                         value = uint32(a)
    1215           1 :                         ptr = unsafe.Pointer(uintptr(ptr) + 1)
    1216           1 :                 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
    1217           0 :                         value = uint32(b)<<7 | uint32(a)
    1218           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 2)
    1219           0 :                 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
    1220           0 :                         value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1221           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 3)
    1222           0 :                 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
    1223           0 :                         value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1224           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 4)
    1225           0 :                 } else {
    1226           0 :                         d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
    1227           0 :                         value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
    1228           0 :                         ptr = unsafe.Pointer(uintptr(ptr) + 5)
    1229           0 :                 }
    1230           1 :                 if i.transforms.SyntheticPrefix != nil {
    1231           0 :                         shared += uint32(len(i.transforms.SyntheticPrefix))
    1232           0 :                 }
    1233             :                 // The starting position of the value.
    1234           1 :                 valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
    1235           1 :                 i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
    1236           1 :                 if invariants.Enabled && unshared < 8 {
    1237           0 :                         // This should not happen since only the key prefix is shared, so even
    1238           0 :                         // if the prefix length is the same as the user key length, the unshared
    1239           0 :                         // will include the trailer.
    1240           0 :                         panic(errors.AssertionFailedf("unshared %d is too small", unshared))
    1241             :                 }
    1242             :                 // The trailer is written in little endian, so the key kind is the first
    1243             :                 // byte in the trailer that is encoded in the slice [unshared-8:unshared].
    1244           1 :                 keyKind := base.InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
    1245           1 :                 keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
    1246           1 :                 prefixChanged := false
    1247           1 :                 if keyKind == base.InternalKeyKindSet {
    1248           1 :                         if invariants.Enabled && value == 0 {
    1249           0 :                                 panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
    1250             :                         }
    1251           1 :                         valPrefix := *((*block.ValuePrefix)(valuePtr))
    1252           1 :                         if valPrefix.SetHasSamePrefix() {
    1253           1 :                                 // Fast-path. No need to assemble i.fullKey, or update i.key. We know
    1254           1 :                                 // that subsequent keys will not have a shared length that is greater
    1255           1 :                                 // than the prefix of the current key, which is also the prefix of
    1256           1 :                                 // i.key. Since we are continuing to iterate, we don't need to
    1257           1 :                                 // initialize i.ikey and i.lazyValue (these are initialized before
    1258           1 :                                 // returning).
    1259           1 :                                 nextFastCount++
    1260           1 :                                 if nextFastCount > nextFastThresholdBeforeRestarts {
    1261           0 :                                         if usedRestarts {
    1262           0 :                                                 // Exhausted iteration budget. This will never happen unless
    1263           0 :                                                 // someone is using a restart interval > 16. It is just to guard
    1264           0 :                                                 // against long restart intervals causing too much iteration.
    1265           0 :                                                 break
    1266             :                                         }
    1267             :                                         // Haven't used restarts yet, so find the first restart at or beyond
    1268             :                                         // the current offset.
    1269           0 :                                         targetOffset := i.offset
    1270           0 :                                         var index int32
    1271           0 :                                         {
    1272           0 :                                                 // NB: manually inlined sort.Search is ~5% faster.
    1273           0 :                                                 //
    1274           0 :                                                 // f defined for a restart point is true iff the offset >=
    1275           0 :                                                 // targetOffset.
    1276           0 :                                                 // Define f(-1) == false and f(i.numRestarts) == true.
    1277           0 :                                                 // Invariant: f(index-1) == false, f(upper) == true.
    1278           0 :                                                 upper := i.numRestarts
    1279           0 :                                                 for index < upper {
    1280           0 :                                                         h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
    1281           0 :                                                         // index ≤ h < upper
    1282           0 :                                                         offset := decodeRestart(i.data[i.restarts+4*h:])
    1283           0 :                                                         if offset < targetOffset {
    1284           0 :                                                                 index = h + 1 // preserves f(index-1) == false
    1285           0 :                                                         } else {
    1286           0 :                                                                 upper = h // preserves f(upper) == true
    1287           0 :                                                         }
    1288             :                                                 }
    1289             :                                                 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
    1290             :                                                 // => answer is index.
    1291             :                                         }
    1292           0 :                                         usedRestarts = true
    1293           0 :                                         nextFastCount = 0
    1294           0 :                                         if index == i.numRestarts {
    1295           0 :                                                 // Already past the last real restart, so iterate a bit more until
    1296           0 :                                                 // we are done with the block.
    1297           0 :                                                 continue
    1298             :                                         }
    1299             :                                         // Have some real restarts after index. NB: index is the first
    1300             :                                         // restart at or beyond the current offset.
    1301           0 :                                         startingIndex := index
    1302           0 :                                         for index != i.numRestarts &&
    1303           0 :                                                 // The restart at index is 4 bytes written in little endian format
    1304           0 :                                                 // starting at i.restart+4*index. The 0th byte is the least
    1305           0 :                                                 // significant and the 3rd byte is the most significant. Since the
    1306           0 :                                                 // most significant bit of the 3rd byte is what we use for
    1307           0 :                                                 // encoding the set-has-same-prefix information, the indexing
    1308           0 :                                                 // below has +3.
    1309           0 :                                                 i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
    1310           0 :                                                 // We still have the same prefix, so move to the next restart.
    1311           0 :                                                 index++
    1312           0 :                                         }
    1313             :                                         // index is the first restart that did not have the same prefix.
    1314           0 :                                         if index != startingIndex {
    1315           0 :                                                 // Managed to skip past at least one restart. Resume iteration
    1316           0 :                                                 // from index-1. Since nextFastCount has been reset to 0, we
    1317           0 :                                                 // should be able to iterate to the next prefix.
    1318           0 :                                                 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
    1319           0 :                                                 i.readEntry()
    1320           0 :                                         }
    1321             :                                         // Else, unable to skip past any restart. Resume iteration. Since
    1322             :                                         // nextFastCount has been reset to 0, we should be able to iterate
    1323             :                                         // to the next prefix.
    1324           0 :                                         continue
    1325             :                                 }
    1326           1 :                                 continue
    1327           1 :                         } else if prevKeyIsSet {
    1328           1 :                                 prefixChanged = true
    1329           1 :                         }
    1330           1 :                 } else {
    1331           1 :                         prevKeyIsSet = false
    1332           1 :                 }
    1333             :                 // Slow-path cases:
    1334             :                 // - (Likely) The prefix has changed.
    1335             :                 // - (Unlikely) The prefix has not changed.
    1336             :                 // We assemble the key etc. under the assumption that it is the likely
    1337             :                 // case.
    1338           1 :                 unsharedKey := getBytes(ptr, int(unshared))
    1339           1 :                 // TODO(sumeer): move this into the else block below. This is a bit tricky
    1340           1 :                 // since the current logic assumes we have always copied the latest key
    1341           1 :                 // into fullKey, which is why when we get to the next key we can (a)
    1342           1 :                 // access i.fullKey[:shared], (b) append only the unsharedKey to
    1343           1 :                 // i.fullKey. For (a), we can access i.key[:shared] since that memory is
    1344           1 :                 // valid (even if unshared). For (b), we will need to remember whether
    1345           1 :                 // i.key refers to i.fullKey or not, and can append the unsharedKey only
    1346           1 :                 // in the former case and for the latter case need to copy the shared part
    1347           1 :                 // too. This same comment applies to the other place where we can do this
    1348           1 :                 // optimization, in readEntry().
    1349           1 :                 i.fullKey = append(i.fullKey[:shared], unsharedKey...)
    1350           1 :                 i.val = getBytes(valuePtr, int(value))
    1351           1 :                 if shared == 0 {
    1352           1 :                         // Provide stability for the key across positioning calls if the key
    1353           1 :                         // doesn't share a prefix with the previous key. This removes requiring the
    1354           1 :                         // key to be copied if the caller knows the block has a restart interval of
    1355           1 :                         // 1. An important example of this is range-del blocks.
    1356           1 :                         i.key = unsharedKey
    1357           1 :                 } else {
    1358           1 :                         i.key = i.fullKey
    1359           1 :                 }
    1360             :                 // Manually inlined version of i.decodeInternalKey(i.key).
    1361           1 :                 hiddenPoint := false
    1362           1 :                 if n := len(i.key) - 8; n >= 0 {
    1363           1 :                         trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(i.key[n:]))
    1364           1 :                         hiddenPoint = i.transforms.HideObsoletePoints &&
    1365           1 :                                 (trailer&TrailerObsoleteBit != 0)
    1366           1 :                         i.ikv.K = base.InternalKey{
    1367           1 :                                 Trailer: trailer & TrailerObsoleteMask,
    1368           1 :                                 UserKey: i.key[:n:n],
    1369           1 :                         }
    1370           1 :                         if n := i.transforms.SyntheticSeqNum; n != 0 {
    1371           1 :                                 i.ikv.K.SetSeqNum(base.SeqNum(n))
    1372           1 :                         }
    1373           1 :                         if i.transforms.SyntheticSuffix.IsSet() {
    1374           0 :                                 // Inlined version of i.maybeReplaceSuffix()
    1375           0 :                                 prefixLen := i.split(i.ikv.K.UserKey)
    1376           0 :                                 i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
    1377           0 :                                 i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
    1378           0 :                                 i.ikv.K.UserKey = i.synthSuffixBuf
    1379           0 :                         }
    1380           0 :                 } else {
    1381           0 :                         i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
    1382           0 :                         i.ikv.K.UserKey = nil
    1383           0 :                 }
    1384           1 :                 nextCmpCount++
    1385           1 :                 if invariants.Enabled && prefixChanged && i.cmp(i.ikv.K.UserKey, succKey) < 0 {
    1386           0 :                         panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
    1387           0 :                                 i.ikv.K.UserKey, succKey))
    1388             :                 }
    1389           1 :                 if prefixChanged || i.cmp(i.ikv.K.UserKey, succKey) >= 0 {
    1390           1 :                         // Prefix has changed.
    1391           1 :                         if hiddenPoint {
    1392           1 :                                 return i.Next()
    1393           1 :                         }
    1394           1 :                         if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
    1395           0 :                                 panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
    1396             :                         }
    1397           1 :                         if i.ikv.K.Kind() != base.InternalKeyKindSet {
    1398           1 :                                 i.ikv.V = base.MakeInPlaceValue(i.val)
    1399           1 :                         } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
    1400           1 :                                 i.ikv.V = base.MakeInPlaceValue(i.val[1:])
    1401           1 :                         } else {
    1402           0 :                                 i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
    1403           0 :                         }
    1404           1 :                         return &i.ikv
    1405             :                 }
    1406             :                 // Else prefix has not changed.
    1407             : 
    1408           1 :                 if nextCmpCount >= nextCmpThresholdBeforeSeek {
    1409           0 :                         break
    1410             :                 }
    1411             :         }
    1412           0 :         return i.SeekGE(succKey, base.SeekGEFlagsNone)
    1413             : }
    1414             : 
    1415             : // Prev implements internalIterator.Prev, as documented in the pebble
    1416             : // package. It first consumes entries cached by an earlier backward scan and otherwise rescans forward from the preceding restart point.
    1417           1 : func (i *Iter) Prev() *base.InternalKV {
    1418           1 : start:
    1419           1 :         for n := len(i.cached) - 1; n >= 0; n-- {
    1420           1 :                 i.nextOffset = i.offset
    1421           1 :                 e := &i.cached[n]
    1422           1 :                 i.offset = e.offset
    1423           1 :                 i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
    1424           1 :                 // Manually inlined version of i.decodeInternalKey(i.key).
    1425           1 :                 i.key = i.cachedBuf[e.keyStart:e.keyEnd]
    1426           1 :                 if n := len(i.key) - 8; n >= 0 {
    1427           1 :                         trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(i.key[n:]))
    1428           1 :                         hiddenPoint := i.transforms.HideObsoletePoints &&
    1429           1 :                                 (trailer&TrailerObsoleteBit != 0)
    1430           1 :                         if hiddenPoint {
    1431           1 :                                 continue
    1432             :                         }
    1433           1 :                         i.ikv.K = base.InternalKey{
    1434           1 :                                 Trailer: trailer & TrailerObsoleteMask,
    1435           1 :                                 UserKey: i.key[:n:n],
    1436           1 :                         }
    1437           1 :                         if n := i.transforms.SyntheticSeqNum; n != 0 {
    1438           1 :                                 i.ikv.K.SetSeqNum(base.SeqNum(n))
    1439           1 :                         }
    1440           1 :                         if i.transforms.SyntheticSuffix.IsSet() {
    1441           1 :                                 // Inlined version of i.maybeReplaceSuffix()
    1442           1 :                                 prefixLen := i.split(i.ikv.K.UserKey)
    1443           1 :                                 // If ikey is cached or may get cached, we must de-reference
    1444           1 :                                 // UserKey before suffix replacement.
    1445           1 :                                 i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
    1446           1 :                                 i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
    1447           1 :                                 i.ikv.K.UserKey = i.synthSuffixBuf
    1448           1 :                         }
    1449           0 :                 } else {
    1450           0 :                         i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
    1451           0 :                         i.ikv.K.UserKey = nil
    1452           0 :                 }
    1453           1 :                 i.cached = i.cached[:n]
    1454           1 :                 if !i.lazyValueHandling.hasValuePrefix ||
    1455           1 :                         i.ikv.K.Kind() != base.InternalKeyKindSet {
    1456           1 :                         i.ikv.V = base.MakeInPlaceValue(i.val)
    1457           1 :                 } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
    1458           1 :                         i.ikv.V = base.MakeInPlaceValue(i.val[1:])
    1459           1 :                 } else {
    1460           1 :                         i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
    1461           1 :                 }
    1462           1 :                 return &i.ikv
    1463             :         }
    1464             : 
    1465           1 :         i.clearCache()
    1466           1 :         if i.offset <= 0 {
    1467           1 :                 i.offset = -1
    1468           1 :                 i.nextOffset = 0
    1469           1 :                 return nil
    1470           1 :         }
    1471             : 
    1472           1 :         targetOffset := i.offset
    1473           1 :         var index int32
    1474           1 : 
    1475           1 :         {
    1476           1 :                 // NB: manually inlined sort.Search is ~5% faster.
    1477           1 :                 // f(h) is true iff the restart at index h has an offset >= targetOffset.
    1478           1 :                 // Define f(-1) == false and f(i.numRestarts) == true.
    1479           1 :                 // Invariant: f(index-1) == false, f(upper) == true.
    1480           1 :                 upper := i.numRestarts
    1481           1 :                 for index < upper {
    1482           1 :                         h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
    1483           1 :                         // index ≤ h < upper
    1484           1 :                         offset := decodeRestart(i.data[i.restarts+4*h:])
    1485           1 :                         if offset < targetOffset {
    1486           1 :                                 // Looking for the first restart that has offset >= targetOffset, so
    1487           1 :                                 // ignore h and earlier.
    1488           1 :                                 index = h + 1 // preserves f(index-1) == false
    1489           1 :                         } else {
    1490           1 :                                 upper = h // preserves f(upper) == true
    1491           1 :                         }
    1492             :                 }
    1493             :                 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
    1494             :                 // => answer is index.
    1495             :         }
    1496             : 
    1497             :         // index is first restart with offset >= targetOffset. Note that
    1498             :         // targetOffset may not be at a restart point since one can call Prev()
    1499             :         // after Next() (so the cache was not populated) and targetOffset refers to
    1500             :         // the current entry. index-1 must have an offset < targetOffset (it can't
    1501             :         // be equal to targetOffset since the binary search would have selected that
    1502             :         // as the index).
    1503           1 :         i.offset = 0
    1504           1 :         if index > 0 {
    1505           1 :                 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
    1506           1 :         }
    1507             :         // TODO(sumeer): why is the else case not an error given targetOffset is a
    1508             :         // valid offset.
    1509             : 
    1510           1 :         i.readEntry()
    1511           1 : 
    1512           1 :         // We stop when i.nextOffset == targetOffset since the targetOffset is the
    1513           1 :         // entry we are stepping back from, and we don't need to cache the entry
    1514           1 :         // before it, since it is the candidate to return.
    1515           1 :         for i.nextOffset < targetOffset {
    1516           1 :                 i.cacheEntry()
    1517           1 :                 i.offset = i.nextOffset
    1518           1 :                 i.readEntry()
    1519           1 :         }
    1520             : 
    1521           1 :         hiddenPoint := i.decodeInternalKey(i.key)
    1522           1 :         if hiddenPoint {
    1523           1 :                 // The candidate is hidden; retry using the entries cached by the loop above.
    1524           1 :                 goto start
    1525             :         }
    1526           1 :         if i.transforms.SyntheticSuffix.IsSet() {
    1527           1 :                 // Inlined version of i.maybeReplaceSuffix()
    1528           1 :                 prefixLen := i.split(i.ikv.K.UserKey)
    1529           1 :                 // If ikey is cached or may get cached, we must de-reference
    1530           1 :                 // UserKey before suffix replacement.
    1531           1 :                 i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
    1532           1 :                 i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
    1533           1 :                 i.ikv.K.UserKey = i.synthSuffixBuf
    1534           1 :         }
    1535           1 :         if !i.lazyValueHandling.hasValuePrefix ||
    1536           1 :                 i.ikv.K.Kind() != base.InternalKeyKindSet {
    1537           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val)
    1538           1 :         } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
    1539           1 :                 i.ikv.V = base.MakeInPlaceValue(i.val[1:])
    1540           1 :         } else {
    1541           1 :                 i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
    1542           1 :         }
    1543           1 :         return &i.ikv
    1544             : }
    1545             : 
    1546             : // Key returns a pointer to the iterator's internal key (i.ikv.K) for the current position.
    1547           0 : func (i *Iter) Key() *base.InternalKey {
    1548           0 :         return &i.ikv.K
    1549           0 : }
    1550             : 
    1551             : // KV returns a pointer to the iterator's internal KV (i.ikv) for the current position.
    1552           1 : func (i *Iter) KV() *base.InternalKV {
    1553           1 :         return &i.ikv
    1554           1 : }
    1555             : 
    1556             : // Value returns the value (as a base.LazyValue) at the current iterator position.
    1557           0 : func (i *Iter) Value() base.LazyValue {
    1558           0 :         return i.ikv.V
    1559           0 : }
    1560             : 
    1561             : // Error implements internalIterator.Error, as documented in the pebble
    1562             : // package. It always returns nil.
    1563           1 : func (i *Iter) Error() error {
    1564           1 :         return nil // infallible
    1565           1 : }
    1566             : 
    1567             : // Close implements internalIterator.Close, as documented in the pebble
    1568             : // package. It releases the block handle, clears the value/KV/getValue references held by the iterator, and always returns nil.
    1569           1 : func (i *Iter) Close() error {
    1570           1 :         i.handle.Release()
    1571           1 :         i.handle = block.BufferHandle{}
    1572           1 :         i.val = nil
    1573           1 :         i.ikv = base.InternalKV{}
    1574           1 :         i.lazyValueHandling.getValue = nil
    1575           1 :         return nil
    1576           1 : }
    1577             : 
    1578             : // SetBounds implements base.InternalIterator. It panics, as bounds should
    1579             : // always be handled by the parent sstable iterator.
    1580           0 : func (i *Iter) SetBounds(lower, upper []byte) {
    1581           0 :         // This should never be called as bounds are handled by sstable.Iterator.
    1582           0 :         panic("pebble: SetBounds unimplemented")
    1583             : }
    1584             : 
    1585             : // SetContext implements base.InternalIterator. It is a no-op.
    1586           0 : func (i *Iter) SetContext(_ context.Context) {}
    1587             : 
    1588             : // Valid returns true if the iterator is currently positioned at a valid KV, i.e. its offset lies in [0, i.restarts).
    1589           1 : func (i *Iter) Valid() bool {
    1590           1 :         return i.offset >= 0 && i.offset < i.restarts
    1591           1 : }
    1592             : 
    1593             : // DebugTree is part of the InternalIterator interface. It adds one child node showing the iterator's type and address.
    1594           0 : func (i *Iter) DebugTree(tp treeprinter.Node) {
    1595           0 :         tp.Childf("%T(%p)", i, i)
    1596           0 : }
    1597             : 
    1598           0 : func (i *Iter) getRestart(idx int) int32 { // getRestart returns the idx'th restart entry, read as a little-endian uint32 from the restart array.
    1599           0 :         return int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*int32(idx):])) // NOTE(review): the word is returned unmasked, while nextPrefixV3 tests the high bit of byte 3 as a same-prefix flag — confirm callers never see that bit set.
    1600           0 : }
    1601             : 
    1602           0 : func (i *Iter) isRestartPoint() bool { // isRestartPoint reports whether i.offset equals one of the block's restart offsets.
    1603           0 :         j := sort.Search(int(i.numRestarts), func(j int) bool { // binary search for the first restart entry >= i.offset
    1604           0 :                 return i.getRestart(j) >= i.offset
    1605           0 :         })
    1606           0 :         return j < int(i.numRestarts) && i.getRestart(j) == i.offset
    1607             : }
    1608             : 
    1609             : // DescribeKV is a function that formats a key-value pair, writing the
    1610             : // description to w; enc describes how the pair is encoded within the block.
    1611             : type DescribeKV func(w io.Writer, key *base.InternalKey, val []byte, enc KVEncoding)
    1612             : 
    1613             : // KVEncoding describes the encoding of a key-value pair within the block.
    1614             : type KVEncoding struct {
    1615             :         // IsRestart is true if the KV's offset is one of the block's restart points.
    1616             :         IsRestart bool
    1617             :         // Offset is the position within the block at which the key-value pair is
    1618             :         // encoded.
    1619             :         Offset int32
    1620             :         // Length is the total length of the KV pair as it is encoded in the block
    1621             :         // format (the distance from Offset to the next entry's offset).
    1622             :         Length int32
    1623             :         // KeyShared is the number of bytes this KV's user key shared with its predecessor.
    1624             :         KeyShared uint32
    1625             :         // KeyUnshared is the number of bytes this KV's user key did not share with
    1626             :         // its predecessor.
    1627             :         KeyUnshared uint32
    1628             :         // ValueLen is the length of the internal value.
    1629             :         ValueLen uint32
    1630             : }
    1631             : 
    1632             : // Describe describes the contents of a block, writing the description to tp.
    1633             : // It invokes fmtKV to describe each key-value pair. It repositions the iterator (First, then Next until exhausted).
    1634           0 : func (i *Iter) Describe(tp treeprinter.Node, fmtKV DescribeKV) {
    1635           0 :         var buf bytes.Buffer
    1636           0 :         for kv := i.First(); kv != nil; kv = i.Next() {
    1637           0 :                 enc := KVEncoding{
    1638           0 :                         IsRestart: i.isRestartPoint(),
    1639           0 :                         Offset:    i.offset,
    1640           0 :                         Length:    int32(i.nextOffset - i.offset),
    1641           0 :                 }
    1642           0 :                 ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset)) // re-decode the entry's three varint headers: shared, unshared, value length
    1643           0 :                 enc.KeyShared, ptr = decodeVarint(ptr)
    1644           0 :                 enc.KeyUnshared, ptr = decodeVarint(ptr)
    1645           0 :                 enc.ValueLen, _ = decodeVarint(ptr)
    1646           0 :                 buf.Reset()
    1647           0 :                 fmtKV(&buf, &kv.K, kv.V.ValueOrHandle, enc)
    1648           0 :                 tp.Child(buf.String())
    1649           0 :         }
    1650             :         // Format the restart points.
    1651           0 :         n := tp.Child("restart points")
    1652           0 :         // Each child shows the position of the restart entry within the block and the offset it stores.
    1653           0 :         for j := 0; j < int(i.numRestarts); j++ {
    1654           0 :                 offset := i.getRestart(j)
    1655           0 :                 n.Childf("%05d [restart %d]", uint64(i.restarts+4*int32(j)), offset)
    1656           0 :         }
    1657             : }
    1658             : 
    1659             : // RawIter is an iterator over a single block of data. Unlike Iter,
    1660             : // keys are stored in "raw" format (i.e. not as internal keys). Note that there
    1661             : // is significant similarity between this code and the code in Iter. Yet
    1662             : // reducing duplication is difficult due to Iter being performance
    1663             : // critical. RawIter must only be used for blocks where the value is
    1664             : // stored together with the key.
    1665             : type RawIter struct {
    1666             :         cmp         base.Compare
    1667             :         offset      int32
    1668             :         nextOffset  int32
    1669             :         restarts    int32
    1670             :         numRestarts int32
    1671             :         ptr         unsafe.Pointer
    1672             :         data        []byte
    1673             :         key, val    []byte
    1674             :         ikey        base.InternalKey
    1675             :         cached      []blockEntry
    1676             :         cachedBuf   []byte
    1677             : }
    1678             : 
    1679             : // NewRawIter constructs a new raw block iterator and initializes it over block via Init, returning Init's error.
    1680           1 : func NewRawIter(cmp base.Compare, block []byte) (*RawIter, error) {
    1681           1 :         i := &RawIter{}
    1682           1 :         return i, i.Init(cmp, block)
    1683           1 : }
    1684             : 
    1685             : // Init initializes the raw block iterator over blk, returning a corruption error if the block's trailing word records zero restart points.
    1686           1 : func (i *RawIter) Init(cmp base.Compare, blk []byte) error {
    1687           1 :         numRestarts := int32(binary.LittleEndian.Uint32(blk[len(blk)-4:])) // NOTE(review): this slice expression panics when len(blk) < 4 — confirm callers validate the block length.
    1688           1 :         if numRestarts == 0 {
    1689           0 :                 return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
    1690           0 :         }
    1691           1 :         i.cmp = cmp
    1692           1 :         i.restarts = int32(len(blk)) - 4*(1+numRestarts) // start of the restart array: numRestarts 4-byte entries followed by the 4-byte count
    1693           1 :         i.numRestarts = numRestarts
    1694           1 :         i.ptr = unsafe.Pointer(&blk[0])
    1695           1 :         i.data = blk
    1696           1 :         if i.key == nil {
    1697           1 :                 i.key = make([]byte, 0, 256)
    1698           1 :         } else {
    1699           0 :                 i.key = i.key[:0] // reuse the existing key buffer on re-initialization
    1700           0 :         }
    1701           1 :         i.val = nil
    1702           1 :         i.clearCache()
    1703           1 :         return nil
    1704             : }
    1705             : 
    1706           1 : func (i *RawIter) readEntry() { // readEntry decodes the entry at i.offset, setting i.key, i.val and i.nextOffset.
    1707           1 :         ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
    1708           1 :         shared, ptr := decodeVarint(ptr)
    1709           1 :         unshared, ptr := decodeVarint(ptr)
    1710           1 :         value, ptr := decodeVarint(ptr)
    1711           1 :         i.key = append(i.key[:shared], getBytes(ptr, int(unshared))...) // keep the first `shared` bytes of the previous key and append the unshared suffix
    1712           1 :         i.key = i.key[:len(i.key):len(i.key)] // clamp capacity to length so a later append to this slice must reallocate
    1713           1 :         ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
    1714           1 :         i.val = getBytes(ptr, int(value))
    1715           1 :         i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
    1716           1 : }
    1717             : 
    1718           1 : func (i *RawIter) loadEntry() { // loadEntry reads the entry at i.offset and exposes its raw key as i.ikey.UserKey.
    1719           1 :         i.readEntry()
    1720           1 :         i.ikey.UserKey = i.key
    1721           1 : }
    1722             : 
    1723           1 : func (i *RawIter) clearCache() { // clearCache empties the cached entries and key buffer while keeping their capacity.
    1724           1 :         i.cached = i.cached[:0]
    1725           1 :         i.cachedBuf = i.cachedBuf[:0]
    1726           1 : }
    1727             : 
                       // cacheEntry records the most recently decoded entry (i.offset, i.key,
                       // i.val) in the cache. The key bytes are copied to the end of i.cachedBuf
                       // and a blockEntry describing the entry is appended to i.cached. The value
                       // is not copied; only its position relative to i.ptr and its size are
                       // stored.
    1728           0 : func (i *RawIter) cacheEntry() {
    1729           0 :         var valStart int32
    1730           0 :         valSize := int32(len(i.val))
                               // Only take the address of i.val[0] when the value is non-empty;
                               // for an empty value valStart stays 0.
    1731           0 :         if valSize > 0 {
    1732           0 :                 valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
    1733           0 :         }
    1734             : 
                               // keyStart/keyEnd index into i.cachedBuf, where the key is appended
                               // below.
    1735           0 :         i.cached = append(i.cached, blockEntry{
    1736           0 :                 offset:   i.offset,
    1737           0 :                 keyStart: int32(len(i.cachedBuf)),
    1738           0 :                 keyEnd:   int32(len(i.cachedBuf) + len(i.key)),
    1739           0 :                 valStart: valStart,
    1740           0 :                 valSize:  valSize,
    1741           0 :         })
    1742           0 :         i.cachedBuf = append(i.cachedBuf, i.key...)
    1743             : }
    1744             : 
    1745             : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
    1746             : // package.
                       //
                       // It binary-searches the restart points using i.cmp, then scans forward from
                       // the chosen restart point until it reaches an entry whose key is >= key or
                       // runs past the last entry. The return value is i.Valid() at the final
                       // position.
    1747           0 : func (i *RawIter) SeekGE(key []byte) bool {
    1748           0 :         // Find the index of the smallest restart point whose key is > the key
    1749           0 :         // sought; index will be numRestarts if there is no such restart point.
    1750           0 :         i.offset = 0
    1751           0 :         index := sort.Search(int(i.numRestarts), func(j int) bool {
    1752           0 :                 offset := int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*j:]))
    1753           0 :                 // For a restart point, there are 0 bytes shared with the previous key.
    1754           0 :                 // The varint encoding of 0 occupies 1 byte.
    1755           0 :                 ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
    1756           0 :                 // Decode the key at that restart point, and compare it to the key sought.
                                       // v1 is the unshared key length; the second varint (the value
                                       // length) is decoded only to advance ptr to the key bytes.
    1757           0 :                 v1, ptr := decodeVarint(ptr)
    1758           0 :                 _, ptr = decodeVarint(ptr)
    1759           0 :                 s := getBytes(ptr, int(v1))
    1760           0 :                 return i.cmp(key, s) < 0
    1761           0 :         })
    1762             : 
    1763             :         // Since keys are strictly increasing, if index > 0 then the restart point at
    1764             :         // index-1 will be the largest whose key is <= the key sought.  If index ==
    1765             :         // 0, then all keys in this block are larger than the key sought, and offset
    1766             :         // remains at zero.
    1767           0 :         if index > 0 {
    1768           0 :                 i.offset = int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*(index-1):]))
    1769           0 :         }
                               // Note that the entry is decoded before validity is checked.
    1770           0 :         i.loadEntry()
    1771           0 : 
    1772           0 :         // Iterate from that restart point to somewhere >= the key sought.
    1773           0 :         for valid := i.Valid(); valid; valid = i.Next() {
    1774           0 :                 if i.cmp(key, i.key) <= 0 {
    1775           0 :                         break
    1776             :                 }
    1777             :         }
    1778           0 :         return i.Valid()
    1779             : }
    1780             : 
    1781             : // First implements internalIterator.First, as documented in the pebble
    1782             : // package.
                       //
                       // It moves to offset 0, decodes the entry there, and reports whether that
                       // position is valid.
    1783           1 : func (i *RawIter) First() bool {
    1784           1 :         i.offset = 0
    1785           1 :         i.loadEntry()
    1786           1 :         return i.Valid()
    1787           1 : }
    1788             : 
    1789             : // Last implements internalIterator.Last, as documented in the pebble package.
                       //
                       // It starts at the final restart point and decodes entries forward until the
                       // next entry would begin at or beyond the restart array. The cache is reset
                       // and every entry decoded along the way is added to it, which is what the
                       // cache-based path in Prev consumes.
    1790           0 : func (i *RawIter) Last() bool {
    1791           0 :         // Seek forward from the last restart point.
    1792           0 :         i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(i.numRestarts-1):]))
    1793           0 : 
    1794           0 :         i.readEntry()
    1795           0 :         i.clearCache()
    1796           0 :         i.cacheEntry()
    1797           0 : 
                               // i.restarts is the offset at which the restart array begins, i.e.
                               // the end of the entry data.
    1798           0 :         for i.nextOffset < i.restarts {
    1799           0 :                 i.offset = i.nextOffset
    1800           0 :                 i.readEntry()
    1801           0 :                 i.cacheEntry()
    1802           0 :         }
    1803             : 
                               // readEntry does not touch i.ikey, so publish the final key here.
    1804           0 :         i.ikey.UserKey = i.key
    1805           0 :         return i.Valid()
    1806             : }
    1807             : 
    1808             : // Next implements internalIterator.Next, as documented in the pebble
    1809             : // package.
                       //
                       // It advances i.offset to i.nextOffset. If the new position is not valid it
                       // returns false without decoding anything; otherwise it decodes the entry
                       // and returns true.
    1810           1 : func (i *RawIter) Next() bool {
    1811           1 :         i.offset = i.nextOffset
    1812           1 :         if !i.Valid() {
    1813           1 :                 return false
    1814           1 :         }
    1815           1 :         i.loadEntry()
    1816           1 :         return true
    1817             : }
    1818             : 
    1819             : // Prev implements internalIterator.Prev, as documented in the pebble
    1820             : // package.
                       //
                       // Entries can only be decoded in the forward direction, so Prev works in one
                       // of three ways:
                       //   - If the last cached entry is the current entry and an earlier cached
                       //     entry exists, it steps back using the cache.
                       //   - If the iterator is at offset 0, it moves before the first entry and
                       //     returns false.
                       //   - Otherwise it locates the restart point preceding the current entry
                       //     and decodes forward from there, rebuilding the cache, until it reaches
                       //     the entry immediately before the current one.
    1821           0 : func (i *RawIter) Prev() bool {
                               // NOTE(review): this path repoints i.ikey.UserKey at the cached key
                               // but leaves i.key untouched, while readEntry rebuilds keys from
                               // i.key[:shared]. Confirm that a Next following a cache-served Prev
                               // is safe for blocks whose entries share key prefixes.
    1822           0 :         if n := len(i.cached) - 1; n > 0 && i.cached[n].offset == i.offset {
    1823           0 :                 i.nextOffset = i.offset
    1824           0 :                 e := &i.cached[n-1]
    1825           0 :                 i.offset = e.offset
    1826           0 :                 i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
    1827           0 :                 i.ikey.UserKey = i.cachedBuf[e.keyStart:e.keyEnd]
                                       // Drop the entry we just moved off of from the cache.
    1828           0 :                 i.cached = i.cached[:n]
    1829           0 :                 return true
    1830           0 :         }
    1831             : 
                               // Already at the first entry: move before it. An offset of -1 makes
                               // Valid report false, and nextOffset = 0 lets a subsequent Next land
                               // on the first entry.
    1832           0 :         if i.offset == 0 {
    1833           0 :                 i.offset = -1
    1834           0 :                 i.nextOffset = 0
    1835           0 :                 return false
    1836           0 :         }
    1837             : 
                               // Find the first restart point at or after the current entry; the
                               // restart point before it (or offset 0 if there is none) is where
                               // the forward scan begins.
    1838           0 :         targetOffset := i.offset
    1839           0 :         index := sort.Search(int(i.numRestarts), func(j int) bool {
    1840           0 :                 offset := int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*j:]))
    1841           0 :                 return offset >= targetOffset
    1842           0 :         })
    1843           0 :         i.offset = 0
    1844           0 :         if index > 0 {
    1845           0 :                 i.offset = int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*(index-1):]))
    1846           0 :         }
    1847             : 
    1848           0 :         i.readEntry()
    1849           0 :         i.clearCache()
    1850           0 :         i.cacheEntry()
    1851           0 : 
                               // Decode forward, caching each entry, and stop at the entry whose
                               // successor starts at targetOffset.
    1852           0 :         for i.nextOffset < targetOffset {
    1853           0 :                 i.offset = i.nextOffset
    1854           0 :                 i.readEntry()
    1855           0 :                 i.cacheEntry()
    1856           0 :         }
    1857             : 
    1858           0 :         i.ikey.UserKey = i.key
    1859           0 :         return true
    1860             : }
    1861             : 
    1862             : // Key implements internalIterator.Key, as documented in the pebble package.
                       //
                       // It returns i.ikey by value; the UserKey bytes it refers to are not copied.
    1863           1 : func (i *RawIter) Key() base.InternalKey {
    1864           1 :         return i.ikey
    1865           1 : }
    1866             : 
    1867             : // Value implements internalIterator.Value, as documented in the pebble
    1868             : // package.
                       //
                       // It returns i.val as-is, without copying.
    1869           1 : func (i *RawIter) Value() []byte {
    1870           1 :         return i.val
    1871           1 : }
    1872             : 
    1873             : // Valid implements internalIterator.Valid, as documented in the pebble
    1874             : // package.
                       //
                       // The iterator is valid when i.offset lies in [0, i.restarts), i.e. it is
                       // neither before the first entry (offset -1, as set by Prev) nor at or past
                       // the start of the restart array.
    1875           1 : func (i *RawIter) Valid() bool {
    1876           1 :         return i.offset >= 0 && i.offset < i.restarts
    1877           1 : }
    1878             : 
    1879             : // Error implements internalIterator.Error, as documented in the pebble
    1880             : // package.
                       //
                       // It always returns nil.
    1881           0 : func (i *RawIter) Error() error {
    1882           0 :         return nil
    1883           0 : }
    1884             : 
    1885             : // Close implements internalIterator.Close, as documented in the pebble
    1886             : // package.
                       //
                       // It drops the reference held in i.val and always returns nil. No other
                       // iterator state is reset here.
    1887           1 : func (i *RawIter) Close() error {
    1888           1 :         i.val = nil
    1889           1 :         return nil
    1890           1 : }
    1891             : 
    1892             : // DebugTree is part of the InternalIterator interface.
                       //
                       // It adds a single child to tp showing the iterator's type and pointer
                       // value.
    1893           0 : func (i *RawIter) DebugTree(tp treeprinter.Node) {
    1894           0 :         tp.Childf("%T(%p)", i, i)
    1895           0 : }
    1896             : 
                       // getRestart returns the idx-th restart offset. Restart offsets are stored
                       // as consecutive little-endian uint32 values in i.data beginning at
                       // i.restarts. idx is not range-checked against i.numRestarts here.
    1897           0 : func (i *RawIter) getRestart(idx int) int32 {
    1898           0 :         return int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*int32(idx):]))
    1899           0 : }
    1900             : 
                       // isRestartPoint reports whether the current i.offset is one of the block's
                       // restart offsets. It binary-searches the restart array for the first
                       // offset >= i.offset and checks for an exact match.
    1901           0 : func (i *RawIter) isRestartPoint() bool {
    1902           0 :         j := sort.Search(int(i.numRestarts), func(j int) bool {
    1903           0 :                 return i.getRestart(j) >= i.offset
    1904           0 :         })
    1905           0 :         return j < int(i.numRestarts) && i.getRestart(j) == i.offset
    1906             : }
    1907             : 
    1908             : // Describe describes the contents of a block, adding the description to tp.
    1909             : // It invokes fmtKV to describe each key-value pair.
                       //
                       // The iterator is repositioned: Describe walks the block from First to the
                       // end. Each key-value pair becomes one child of tp (suffixed with
                       // " [restart]" when the entry sits on a restart point), followed by a
                       // "restart points" child that lists every restart array slot.
    1910           0 : func (i *RawIter) Describe(tp treeprinter.Node, fmtKV DescribeKV) {
    1911           0 :         var buf bytes.Buffer
    1912           0 :         for valid := i.First(); valid; valid = i.Next() {
    1913           0 :                 enc := KVEncoding{
    1914           0 :                         IsRestart: i.isRestartPoint(),
    1915           0 :                         Offset:    i.offset,
    1916           0 :                         Length:    int32(i.nextOffset - i.offset),
    1917           0 :                 }
                                       // Re-decode the entry header to report the raw shared, unshared
                                       // and value lengths.
    1918           0 :                 ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
    1919           0 :                 enc.KeyShared, ptr = decodeVarint(ptr)
    1920           0 :                 enc.KeyUnshared, ptr = decodeVarint(ptr)
    1921           0 :                 enc.ValueLen, _ = decodeVarint(ptr)
    1922           0 :                 buf.Reset()
    1923           0 :                 fmtKV(&buf, &i.ikey, i.val, enc)
                                       // Same check that populated enc.IsRestart above.
    1924           0 :                 if i.isRestartPoint() {
    1925           0 :                         buf.WriteString(" [restart]")
    1926           0 :                 }
    1927           0 :                 tp.Child(buf.String())
    1928             :         }
    1929           0 :         n := tp.Child("restart points")
    1930           0 :         // Format the restart points.
                               // Each line shows the byte position of the restart slot within the
                               // block, followed by the entry offset stored in that slot.
    1931           0 :         for j := 0; j < int(i.numRestarts); j++ {
    1932           0 :                 offset := i.getRestart(j)
    1933           0 :                 n.Childf("%05d [restart %d]", uint64(i.restarts+4*int32(j)), offset)
    1934           0 :         }
    1935             : }
    1936             : 
                       // getBytes returns a byte slice of the given length (and equal capacity)
                       // that views the memory starting at ptr. Nothing is copied and no bounds
                       // check against the underlying block is performed; the caller must ensure
                       // that ptr and length describe readable memory.
    1937           1 : func getBytes(ptr unsafe.Pointer, length int) []byte {
    1938           1 :         return (*[manual.MaxArrayLen]byte)(ptr)[:length:length]
    1939           1 : }
    1940             : 
                       // decodeVarint decodes an unsigned varint of at most five bytes starting at
                       // ptr and returns the value together with a pointer just past the last byte
                       // consumed.
                       //
                       // Each byte contributes its low 7 bits, least-significant group first; a set
                       // high bit means another byte follows. The decoder is unrolled, one branch
                       // per encoded length. The fifth byte is used without inspecting its high
                       // bit, and any of its bits shifted beyond bit 31 are discarded by the uint32
                       // arithmetic. The memory at ptr is read without bounds checks.
    1941           1 : func decodeVarint(ptr unsafe.Pointer) (uint32, unsafe.Pointer) {
    1942           1 :         if a := *((*uint8)(ptr)); a < 128 {
    1943           1 :                 return uint32(a),
    1944           1 :                         unsafe.Pointer(uintptr(ptr) + 1)
    1945           1 :         } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
    1946           0 :                 return uint32(b)<<7 | uint32(a),
    1947           0 :                         unsafe.Pointer(uintptr(ptr) + 2)
    1948           0 :         } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
    1949           0 :                 return uint32(c)<<14 | uint32(b)<<7 | uint32(a),
    1950           0 :                         unsafe.Pointer(uintptr(ptr) + 3)
    1951           0 :         } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
    1952           0 :                 return uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a),
    1953           0 :                         unsafe.Pointer(uintptr(ptr) + 4)
    1954           0 :         } else {
    1955           0 :                 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
    1956           0 :                 return uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a),
    1957           0 :                         unsafe.Pointer(uintptr(ptr) + 5)
    1958           0 :         }
    1959             : }

Generated by: LCOV version 1.14