Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "encoding/binary"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/errors"
14 : "github.com/cockroachdb/pebble/internal/base"
15 : "github.com/cockroachdb/pebble/internal/invariants"
16 : "github.com/cockroachdb/pebble/internal/manual"
17 : "golang.org/x/exp/slices"
18 : )
19 :
20 : // blockIter is an iterator over a single block of data.
21 : //
22 : // A blockIter provides an additional guarantee around key stability when a
23 : // block has a restart interval of 1 (i.e. when there is no prefix
24 : // compression). Key stability refers to whether the InternalKey.UserKey bytes
25 : // returned by a positioning call will remain stable after a subsequent
26 : // positioning call. The normal case is that a positioning call will invalidate
27 : // any previously returned InternalKey.UserKey. If a block has a restart
28 : // interval of 1 (no prefix compression), blockIter guarantees that
29 : // InternalKey.UserKey will point to the key as stored in the block itself
30 : // which will remain valid until the blockIter is closed. The key stability
31 : // guarantee is used by the range tombstone and range key code, which knows that
32 : // the respective blocks are always encoded with a restart interval of 1. This
33 : // per-block key stability guarantee is sufficient for range tombstones and
34 : // range keys as they are always encoded in a single block. Note: this
35 : // stability guarantee no longer holds for a block iter with synthetic suffix
36 : // replacement, but this doesn't matter, as the user will not open
37 : // an iterator with a synthetic suffix on a block with rangekeys (for now).
38 : //
39 : // A blockIter also provides a value stability guarantee for range deletions and
40 : // range keys since there is only a single range deletion and range key block
41 : // per sstable and the blockIter will not release the bytes for the block until
42 : // it is closed.
43 : //
44 : // Note on why blockIter knows about lazyValueHandling:
45 : //
46 : // blockIter's positioning functions (that return a LazyValue), are too
47 : // complex to inline even prior to lazyValueHandling. blockIter.Next and
48 : // blockIter.First were by far the cheapest and had costs 195 and 180
49 : // respectively, which exceed the budget of 80. We initially tried to keep
50 : // the lazyValueHandling logic out of blockIter by wrapping it with a
51 : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
52 : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
53 : // they called the corresponding blockIter func and then decided whether the
54 : // value was in fact in-place (so return immediately) or needed further
55 : // handling. But these also turned out too costly for mid-stack inlining since
56 : // simple calls like the following have a high cost that is barely under the
57 : // budget of 80
58 : //
59 : // k, v := i.data.SeekGE(key, flags) // cost 74
60 : // k, v := i.data.Next() // cost 72
61 : //
62 : // We have 2 options for minimizing performance regressions:
63 : // - Include the lazyValueHandling logic in the already non-inlineable
64 : // blockIter functions: Since most of the time is spent in data block iters,
65 : // it is acceptable to take the small hit of unnecessary branching (which
66 : // hopefully branch prediction will predict correctly) for other kinds of
67 : // blocks.
68 : // - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
69 : // v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
70 : // v3 sstable. We would want to manage these copies via code generation.
71 : //
72 : // We have picked the first option here.
73 : type blockIter struct {
74 : cmp Compare
75 : split Split
76 :
77 : // Iterator transforms.
78 : //
79 : // SyntheticSuffix, if set, will replace the decoded ikey.UserKey suffix
80 : // before the key is returned to the user. A sequence of iter operations on a
81 : // block with a syntheticSuffix rule should return keys as if those operations
82 : // ran on a block with keys that all had the syntheticSuffix. As an example:
83 : // any sequence of block iter cmds should return the same keys for the
84 : // following two blocks:
85 : //
86 : // blockA: a@3,b@3,c@3
87 : // blockB: a@1,b@2,c@1 with syntheticSuffix=3
88 : //
89 : // To ensure this, suffix replacement will not change the ordering of keys in
90 : // the block because the iter assumes that no two keys in the block share the
91 : // same prefix. Furthermore, during SeekGE and SeekLT operations, the block
92 : // iterator handles "off by one" errors (explained in more detail in those
93 : // functions) when, for a given key, originalSuffix < searchSuffix <
94 : // replacementSuffix, with integer comparison. To handle these cases, the
95 : // iterator assumes:
96 : //
97 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0
98 : // for keys with a suffix.
99 : //
100 : // NB: it is possible for a block iter to add a synthetic suffix on a key
101 : // without a suffix, which implies
102 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0 ,
103 : // however, the iterator would never need to handle an off by one error in
104 : // this case since originalSuffix (empty) > searchSuffix (non empty), with
105 : // integer comparison.
106 : //
107 : //
108 : // In addition, we also assume that any block with rangekeys will not contain
109 : // a synthetic suffix.
110 : transforms IterTransforms
111 :
112 : // offset is the byte index that marks where the current key/value is
113 : // encoded in the block.
114 : offset int32
115 : // nextOffset is the byte index where the next key/value is encoded in the
116 : // block.
117 : nextOffset int32
118 : // A "restart point" in a block is a point where the full key is encoded,
119 : // instead of just having a suffix of the key encoded. See readEntry() for
120 : // how prefix compression of keys works. Keys in between two restart points
121 : // only have a suffix encoded in the block. When restart interval is 1, no
122 : // prefix compression of keys happens. This is the case with range tombstone
123 : // blocks.
124 : //
125 : // All restart offsets are listed in increasing order in
126 : // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
127 : // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
128 : // therefore be seen as the point where data in the block ends, and a list
129 : // of offsets of all restart points begins.
130 : restarts int32
131 : // Number of restart points in this block. Encoded at the end of the block
132 : // as a uint32.
133 : numRestarts int32
134 : ptr unsafe.Pointer
135 : data []byte
136 : // key contains the raw key the iterator is currently pointed at. This may
137 : // point directly to data stored in the block (for a key which has no prefix
138 : // compression), to fullKey (for a prefix compressed key), or to a slice of
139 : // data stored in cachedBuf (during reverse iteration).
140 : //
141 : // NB: In general, key contains the same logical content as ikey
142 : // (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
143 : // replacement rule, this will not be the case. Therefore, key should never
144 : // be used after ikey is set.
145 : key []byte
146 : // fullKey is a buffer used for key prefix decompression. Note that if
147 : // transforms.SyntheticPrefix is not nil, fullKey always starts with that
148 : // prefix.
149 : fullKey []byte
150 : // val contains the value the iterator is currently pointed at. If non-nil,
151 : // this points to a slice of the block data.
152 : val []byte
153 : // ikv contains the decoded internal KV the iterator is currently positioned
154 : // at.
155 : //
156 : // ikv.InternalKey contains the decoded InternalKey the iterator is
157 : // currently pointed at. Note that the memory backing ikv.UserKey is either
158 : // data stored directly in the block, fullKey, or cachedBuf. The key
159 : // stability guarantee for blocks built with a restart interval of 1 is
160 : // achieved by having ikv.UserKey always point to data stored directly in
161 : // the block.
162 : //
163 : // ikv.LazyValue is val turned into a LazyValue, whenever a positioning
164 : // method returns a non-nil key-value pair.
165 : ikv base.InternalKV
166 : // cached and cachedBuf are used during reverse iteration. They are needed
167 : // because we can't perform prefix decoding in reverse, only in the forward
168 : // direction. In order to iterate in reverse, we decode and cache the entries
169 : // between two restart points.
170 : //
171 : // Note that cached[len(cached)-1] contains the previous entry to the one the
172 : // blockIter is currently pointed at. As usual, nextOffset will contain the
173 : // offset of the next entry. During reverse iteration, nextOffset will be
174 : // updated to point to offset, and we'll set the blockIter to point at the
175 : // entry cached[len(cached)-1]. See Prev() for more details.
176 : //
177 : // For a block encoded with a restart interval of 1, cached and cachedBuf
178 : // will not be used as there are no prefix compressed entries between the
179 : // restart points.
180 : cached []blockEntry
181 : cachedBuf []byte
182 : handle bufferHandle
183 : // firstUserKey is the first user key in the block; it supports block iteration over already-loaded blocks.
184 : firstUserKey []byte
185 : lazyValueHandling struct {
186 : vbr *valueBlockReader
187 : hasValuePrefix bool
188 : }
189 : synthSuffixBuf []byte
190 : firstUserKeyWithPrefixBuf []byte
191 : }
192 :
193 : type blockEntry struct {
194 : offset int32
195 : keyStart int32
196 : keyEnd int32
197 : valStart int32
198 : valSize int32
199 : }
200 :
201 : // blockIter implements the base.InternalIterator interface.
202 : var _ base.InternalIterator = (*blockIter)(nil)
203 :
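// A minimal usage sketch, assuming blockData is a serialized block produced
// elsewhere in this package and cmp/split come from the comparer the block
// was written with; process is a hypothetical caller-side function:
//
//	it, err := newBlockIter(cmp, split, blockData, IterTransforms{})
//	if err != nil {
//		return err
//	}
//	for kv := it.First(); kv != nil; kv = it.Next() {
//		// kv.K.UserKey is only stable across positioning calls if the block
//		// was written with a restart interval of 1 (see the type comment).
//		process(kv)
//	}
//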
204 : func newBlockIter(
205 : cmp Compare, split Split, block block, transforms IterTransforms,
206 2 : ) (*blockIter, error) {
207 2 : i := &blockIter{}
208 2 : return i, i.init(cmp, split, block, transforms)
209 2 : }
210 :
211 0 : func (i *blockIter) String() string {
212 0 : return "block"
213 0 : }
214 :
215 2 : func (i *blockIter) init(cmp Compare, split Split, block block, transforms IterTransforms) error {
216 2 : numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
217 2 : if numRestarts == 0 {
218 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
219 0 : }
220 2 : i.transforms = transforms
221 2 : i.synthSuffixBuf = i.synthSuffixBuf[:0]
222 2 : i.split = split
223 2 : i.cmp = cmp
224 2 : i.restarts = int32(len(block)) - 4*(1+numRestarts)
225 2 : i.numRestarts = numRestarts
226 2 : i.ptr = unsafe.Pointer(&block[0])
227 2 : i.data = block
228 2 : if i.transforms.SyntheticPrefix.IsSet() {
229 2 : i.fullKey = append(i.fullKey[:0], i.transforms.SyntheticPrefix...)
230 2 : } else {
231 2 : i.fullKey = i.fullKey[:0]
232 2 : }
233 2 : i.val = nil
234 2 : i.clearCache()
235 2 : if i.restarts > 0 {
236 2 : if err := i.readFirstKey(); err != nil {
237 0 : return err
238 0 : }
239 2 : } else {
240 2 : // Block is empty.
241 2 : i.firstUserKey = nil
242 2 : }
243 2 : return nil
244 : }
245 :
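// The tail arithmetic in init follows the block layout: entries, then
// numRestarts 4-byte restart offsets, then a final uint32 holding
// numRestarts. A sketch of the same computation (block is a []byte):
//
//	numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
//	restarts := int32(len(block)) - 4*(1+numRestarts) // start of the restart array
//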
246 : // NB: two cases of hideObsoletePoints:
247 : // - Local sstable iteration: syntheticSeqNum will be set iff the sstable was
248 : // ingested.
249 : // - Foreign sstable iteration: syntheticSeqNum is always set.
250 : func (i *blockIter) initHandle(
251 : cmp Compare, split Split, block bufferHandle, transforms IterTransforms,
252 2 : ) error {
253 2 : i.handle.Release()
254 2 : i.handle = block
255 2 : return i.init(cmp, split, block.Get(), transforms)
256 2 : }
257 :
258 2 : func (i *blockIter) invalidate() {
259 2 : i.clearCache()
260 2 : i.offset = 0
261 2 : i.nextOffset = 0
262 2 : i.restarts = 0
263 2 : i.numRestarts = 0
264 2 : i.data = nil
265 2 : }
266 :
267 : // isDataInvalidated returns true when the blockIter has been invalidated
268 : // using an invalidate call. NB: this is different from blockIter.Valid
269 : // which is part of the InternalIterator implementation.
270 2 : func (i *blockIter) isDataInvalidated() bool {
271 2 : return i.data == nil
272 2 : }
273 :
274 2 : func (i *blockIter) resetForReuse() blockIter {
275 2 : return blockIter{
276 2 : fullKey: i.fullKey[:0],
277 2 : cached: i.cached[:0],
278 2 : cachedBuf: i.cachedBuf[:0],
279 2 : firstUserKeyWithPrefixBuf: i.firstUserKeyWithPrefixBuf[:0],
280 2 : data: nil,
281 2 : }
282 2 : }
283 :
284 2 : func (i *blockIter) readEntry() {
285 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
286 2 :
287 2 : // This is an ugly performance hack. Reading entries from blocks is one of
288 2 : // the inner-most routines and decoding the 3 varints per-entry takes
289 2 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
290 2 : // us, so we do it manually. This provides a 10-15% performance improvement
291 2 : // on blockIter benchmarks on both go1.11 and go1.12.
292 2 : //
293 2 : // TODO(peter): remove this hack if go:inline is ever supported.
294 2 :
295 2 : var shared uint32
296 2 : if a := *((*uint8)(ptr)); a < 128 {
297 2 : shared = uint32(a)
298 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
299 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
300 0 : shared = uint32(b)<<7 | uint32(a)
301 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
302 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
303 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
304 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
305 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
306 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
307 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
308 0 : } else {
309 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
310 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
311 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
312 0 : }
313 :
314 2 : var unshared uint32
315 2 : if a := *((*uint8)(ptr)); a < 128 {
316 2 : unshared = uint32(a)
317 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
318 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
319 0 : unshared = uint32(b)<<7 | uint32(a)
320 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
321 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
322 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
323 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
324 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
325 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
326 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
327 0 : } else {
328 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
329 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
330 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
331 0 : }
332 :
333 2 : var value uint32
334 2 : if a := *((*uint8)(ptr)); a < 128 {
335 2 : value = uint32(a)
336 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
337 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
338 2 : value = uint32(b)<<7 | uint32(a)
339 2 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
340 2 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
341 1 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
342 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
343 1 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
344 1 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
345 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
346 1 : } else {
347 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
348 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
349 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
350 0 : }
351 2 : shared += uint32(len(i.transforms.SyntheticPrefix))
352 2 : unsharedKey := getBytes(ptr, int(unshared))
353 2 : // TODO(sumeer): move this into the else block below.
354 2 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
355 2 : if shared == 0 {
356 2 : // Provide stability for the key across positioning calls if the key
357 2 : // doesn't share a prefix with the previous key. This removes requiring the
358 2 : // key to be copied if the caller knows the block has a restart interval of
359 2 : // 1. An important example of this is range-del blocks.
360 2 : i.key = unsharedKey
361 2 : } else {
362 2 : i.key = i.fullKey
363 2 : }
364 2 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
365 2 : i.val = getBytes(ptr, int(value))
366 2 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
367 : }
368 :
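// For reference, the layout that readEntry decodes is three uvarints (shared
// key length, unshared key length, value length) followed by the unshared key
// bytes and the value bytes. A sketch of the corresponding encoding side,
// using encoding/binary (the buffer and lengths are illustrative):
//
//	buf := make([]byte, 0, 64)
//	var tmp [binary.MaxVarintLen32]byte
//	buf = append(buf, tmp[:binary.PutUvarint(tmp[:], uint64(shared))]...)
//	buf = append(buf, tmp[:binary.PutUvarint(tmp[:], uint64(len(unsharedKey)))]...)
//	buf = append(buf, tmp[:binary.PutUvarint(tmp[:], uint64(len(value)))]...)
//	buf = append(buf, unsharedKey...)
//	buf = append(buf, value...)
//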
369 2 : func (i *blockIter) readFirstKey() error {
370 2 : ptr := i.ptr
371 2 :
372 2 : // This is an ugly performance hack. Reading entries from blocks is one of
373 2 : // the inner-most routines and decoding the 3 varints per-entry takes
374 2 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
375 2 : // us, so we do it manually. This provides a 10-15% performance improvement
376 2 : // on blockIter benchmarks on both go1.11 and go1.12.
377 2 : //
378 2 : // TODO(peter): remove this hack if go:inline is ever supported.
379 2 :
380 2 : if shared := *((*uint8)(ptr)); shared == 0 {
381 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
382 2 : } else {
383 0 : // The shared length is != 0, which is invalid.
384 0 : panic("first key in block must have zero shared length")
385 : }
386 :
387 2 : var unshared uint32
388 2 : if a := *((*uint8)(ptr)); a < 128 {
389 2 : unshared = uint32(a)
390 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
391 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
392 0 : unshared = uint32(b)<<7 | uint32(a)
393 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
394 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
395 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
396 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
397 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
398 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
399 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
400 0 : } else {
401 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
402 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
403 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
404 0 : }
405 :
406 : // Skip the value length.
407 2 : if a := *((*uint8)(ptr)); a < 128 {
408 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
409 2 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
410 2 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
411 2 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
412 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
413 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
414 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
415 1 : } else {
416 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
417 0 : }
418 :
419 2 : firstKey := getBytes(ptr, int(unshared))
420 2 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
421 2 : // BlockIter benchmarks.
422 2 : if n := len(firstKey) - 8; n >= 0 {
423 2 : i.firstUserKey = firstKey[:n:n]
424 2 : } else {
425 0 : i.firstUserKey = nil
426 0 : return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
427 0 : }
428 2 : if i.transforms.SyntheticPrefix != nil {
429 2 : i.firstUserKeyWithPrefixBuf = slices.Grow(i.firstUserKeyWithPrefixBuf[:0], len(i.transforms.SyntheticPrefix)+len(i.firstUserKey))
430 2 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.transforms.SyntheticPrefix...)
431 2 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.firstUserKey...)
432 2 : i.firstUserKey = i.firstUserKeyWithPrefixBuf
433 2 : }
434 2 : return nil
435 : }
436 :
437 : // The sstable internal obsolete bit is set when writing a block and unset by
438 : // blockIter, so no code outside block writing/reading code ever sees it.
439 : const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
440 : const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)
441 :
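// A trailer packs the sequence number and kind byte as seqNum<<8 | kind, so
// trailerObsoleteMask keeps the full sequence number plus the kind bits while
// clearing only the obsolete bit. A sketch of how decodeInternalKey below
// applies these constants (key and hideObsoletePoints are illustrative):
//
//	trailer := binary.LittleEndian.Uint64(key[len(key)-8:])
//	hidden := hideObsoletePoints && trailer&trailerObsoleteBit != 0
//	visible := trailer & trailerObsoleteMask // what ends up in ikv.K.Trailer
//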
442 2 : func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
443 2 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
444 2 : // BlockIter benchmarks.
445 2 : if n := len(key) - 8; n >= 0 {
446 2 : trailer := binary.LittleEndian.Uint64(key[n:])
447 2 : hiddenPoint = i.transforms.HideObsoletePoints &&
448 2 : (trailer&trailerObsoleteBit != 0)
449 2 : i.ikv.K.Trailer = trailer & trailerObsoleteMask
450 2 : i.ikv.K.UserKey = key[:n:n]
451 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
452 2 : i.ikv.K.SetSeqNum(uint64(n))
453 2 : }
454 2 : } else {
455 2 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
456 2 : i.ikv.K.UserKey = nil
457 2 : }
458 2 : return hiddenPoint
459 : }
460 :
461 : // maybeReplaceSuffix replaces the suffix in i.ikey.UserKey with
462 : // i.transforms.syntheticSuffix.
463 2 : func (i *blockIter) maybeReplaceSuffix() {
464 2 : if i.transforms.SyntheticSuffix.IsSet() && i.ikv.K.UserKey != nil {
465 2 : prefixLen := i.split(i.ikv.K.UserKey)
466 2 : // If ikey is cached or may get cached, we must copy
467 2 : // UserKey to a new buffer before suffix replacement.
468 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
469 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
470 2 : i.ikv.K.UserKey = i.synthSuffixBuf
471 2 : }
472 : }
473 :
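// A sketch of the suffix replacement above, assuming "@"-separated MVCC-style
// keys and a split function that returns the prefix length including the "@"
// (both are assumptions for illustration, not this package's comparer):
//
//	// userKey = "a@1", syntheticSuffix = "7", split(userKey) = 2
//	prefixLen := split(userKey)                    // prefix is "a@"
//	buf := append(buf[:0], userKey[:prefixLen]...) // copy; the block's bytes must not be modified
//	buf = append(buf, syntheticSuffix...)          // result "a@7" is the key surfaced to the caller
//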
474 2 : func (i *blockIter) clearCache() {
475 2 : i.cached = i.cached[:0]
476 2 : i.cachedBuf = i.cachedBuf[:0]
477 2 : }
478 :
479 2 : func (i *blockIter) cacheEntry() {
480 2 : var valStart int32
481 2 : valSize := int32(len(i.val))
482 2 : if valSize > 0 {
483 2 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
484 2 : }
485 :
486 2 : i.cached = append(i.cached, blockEntry{
487 2 : offset: i.offset,
488 2 : keyStart: int32(len(i.cachedBuf)),
489 2 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
490 2 : valStart: valStart,
491 2 : valSize: valSize,
492 2 : })
493 2 : i.cachedBuf = append(i.cachedBuf, i.key...)
494 : }
495 :
496 2 : func (i *blockIter) getFirstUserKey() []byte {
497 2 : return i.firstUserKey
498 2 : }
499 :
500 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
501 : // package.
502 2 : func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
503 2 : if invariants.Enabled && i.isDataInvalidated() {
504 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
505 : }
506 2 : searchKey := key
507 2 : if i.transforms.SyntheticPrefix != nil {
508 2 : // The seek key is before or after the entire block of keys that start with
509 2 : // SyntheticPrefix. To determine which, we need to compare against a valid
510 2 : // key in the block. We use firstUserKey which has the synthetic prefix.
511 2 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
512 1 : if i.cmp(i.firstUserKey, key) >= 0 {
513 1 : return i.First()
514 1 : }
515 : // Set the offset to the end of the block to mimic the offset of an
516 : // invalid iterator. This ensures a subsequent i.Prev() returns a valid
517 : // result.
518 1 : i.offset = i.restarts
519 1 : i.nextOffset = i.restarts
520 1 : return nil
521 : }
522 2 : searchKey = key[len(i.transforms.SyntheticPrefix):]
523 : }
524 :
525 2 : i.clearCache()
526 2 : // Find the index of the smallest restart point whose key is > the key
527 2 : // sought; index will be numRestarts if there is no such restart point.
528 2 : i.offset = 0
529 2 : var index int32
530 2 :
531 2 : {
532 2 : // NB: manually inlined sort.Search is ~5% faster.
533 2 : //
534 2 : // Define f(-1) == false and f(n) == true.
535 2 : // Invariant: f(index-1) == false, f(upper) == true.
536 2 : upper := i.numRestarts
537 2 : for index < upper {
538 2 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
539 2 : // index ≤ h < upper
540 2 : offset := decodeRestart(i.data[i.restarts+4*h:])
541 2 : // For a restart point, there are 0 bytes shared with the previous key.
542 2 : // The varint encoding of 0 occupies 1 byte.
543 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
544 2 :
545 2 : // Decode the key at that restart point, and compare it to the key
546 2 : // sought. See the comment in readEntry for why we manually inline the
547 2 : // varint decoding.
548 2 : var v1 uint32
549 2 : if a := *((*uint8)(ptr)); a < 128 {
550 2 : v1 = uint32(a)
551 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
552 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
553 0 : v1 = uint32(b)<<7 | uint32(a)
554 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
555 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
556 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
557 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
558 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
559 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
560 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
561 0 : } else {
562 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
563 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
564 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
565 0 : }
566 :
567 2 : if *((*uint8)(ptr)) < 128 {
568 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
569 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
570 2 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
571 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
572 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
573 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
574 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
575 0 : } else {
576 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
577 0 : }
578 :
579 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
580 : // speedup on BlockIter benchmarks.
581 2 : s := getBytes(ptr, int(v1))
582 2 : var k []byte
583 2 : if n := len(s) - 8; n >= 0 {
584 2 : k = s[:n:n]
585 2 : }
586 : // Else k is invalid, and left as nil
587 :
588 2 : if i.cmp(searchKey, k) > 0 {
589 2 : // The search key is greater than the user key at this restart point.
590 2 : // Search beyond this restart point, since we are trying to find the
591 2 : // first restart point with a user key >= the search key.
592 2 : index = h + 1 // preserves f(i-1) == false
593 2 : } else {
594 2 : // k >= search key, so prune everything after index (since index
595 2 : // satisfies the property we are looking for).
596 2 : upper = h // preserves f(j) == true
597 2 : }
598 : }
599 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
600 : // => answer is index.
601 : }
602 :
603 : // index is the first restart point with key >= search key. Define the keys
604 : // between a restart point and the next restart point as belonging to that
605 : // restart point.
606 : //
607 : // Since keys are strictly increasing, if index > 0 then the restart point
608 : // at index-1 will be the first one that has some keys belonging to it that
609 : // could be equal to the search key. If index == 0, then all keys in this
610 : // block are larger than the key sought, and offset remains at zero.
611 2 : if index > 0 {
612 2 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
613 2 : }
614 2 : i.readEntry()
615 2 : hiddenPoint := i.decodeInternalKey(i.key)
616 2 :
617 2 : // Iterate from that restart point to somewhere >= the key sought.
618 2 : if !i.valid() {
619 1 : return nil
620 1 : }
621 :
622 : // A note on seeking in a block with a suffix replacement rule: even though
623 : // the binary search above was conducted on keys without suffix replacement,
624 : // Seek will still return the correct suffix replaced key. A binary
625 : // search without suffix replacement will land on a key that is _less_ than
626 : // the key the search would have landed on if all keys were already suffix
627 : // replaced. Since Seek then conducts forward iteration to the first suffix
628 : // replaced user key that is greater than or equal to the search key, the
629 : // correct key is still returned.
630 : //
631 : // As an example, consider the following block with a restart interval of 1,
632 : // with a replacement suffix of "4":
633 : // - Pre-suffix replacement: apple@1, banana@3
634 : // - Post-suffix replacement: apple@4, banana@4
635 : //
636 : // Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
637 : // chronological order (i.e. apple@1>apple@3), the binary search without
638 : // suffix replacement would return apple@1. A binary search with suffix
639 : // replacement would return banana@4. After beginning forward iteration from
640 : // either returned restart point, forward iteration would
641 : // always return the correct key, banana@4.
642 : //
643 : // Further, if the user searched with apple@0 (i.e. a suffix less than the
644 : // pre replacement suffix) or with apple@5 (a suffix larger than the post
645 : // replacement suffix), the binary search with or without suffix replacement
646 : // would land on the same key, as we assume the following:
647 : // (1) no two keys in the sst share the same prefix.
648 : // (2) pebble.Compare(replacementSuffix,originalSuffix) > 0
649 :
650 2 : i.maybeReplaceSuffix()
651 2 :
652 2 : if !hiddenPoint && i.cmp(i.ikv.K.UserKey, key) >= 0 {
653 2 : // Initialize i.lazyValue
654 2 : if !i.lazyValueHandling.hasValuePrefix ||
655 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
656 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
657 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
658 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
659 2 : } else {
660 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
661 2 : }
662 2 : return &i.ikv
663 : }
664 2 : for i.Next(); i.valid(); i.Next() {
665 2 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
666 2 : // i.Next() has already initialized i.ikv.LazyValue.
667 2 : return &i.ikv
668 2 : }
669 : }
670 2 : return nil
671 : }
672 :
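// The manually inlined restart-point binary search in SeekGE (and in SeekLT
// below) is equivalent to the following sort.Search formulation, kept out of
// the hot path purely for speed. decodeRestartKey is a hypothetical helper
// that returns the user key stored at a restart offset:
//
//	index := sort.Search(int(i.numRestarts), func(j int) bool {
//		offset := decodeRestart(i.data[i.restarts+4*int32(j):])
//		return i.cmp(searchKey, decodeRestartKey(i.data, offset)) <= 0
//	})
//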
673 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
674 : // pebble package.
675 0 : func (i *blockIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
676 0 : // This should never be called as prefix iteration is handled by sstable.Iterator.
677 0 : panic("pebble: SeekPrefixGE unimplemented")
678 : }
679 :
680 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
681 : // package.
682 2 : func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
683 2 : if invariants.Enabled && i.isDataInvalidated() {
684 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
685 : }
686 2 : searchKey := key
687 2 : if i.transforms.SyntheticPrefix != nil {
688 2 : // The seek key is before or after the entire block of keys that start with
689 2 : // SyntheticPrefix. To determine which, we need to compare against a valid
690 2 : // key in the block. We use firstUserKey which has the synthetic prefix.
691 2 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
692 1 : if i.cmp(i.firstUserKey, key) < 0 {
693 1 : return i.Last()
694 1 : }
695 : // Set the offset to the beginning of the block to mimic an exhausted
696 : // iterator that has conducted backward iteration. This ensures a
697 : // subsequent Next() call returns the first key in the block.
698 1 : i.offset = -1
699 1 : i.nextOffset = 0
700 1 : return nil
701 : }
702 2 : searchKey = key[len(i.transforms.SyntheticPrefix):]
703 : }
704 :
705 2 : i.clearCache()
706 2 : // Find the index of the smallest restart point whose key is >= the key
707 2 : // sought; index will be numRestarts if there is no such restart point.
708 2 : i.offset = 0
709 2 : var index int32
710 2 :
711 2 : {
712 2 : // NB: manually inlined sort.Search is ~5% faster.
713 2 : //
714 2 : // Define f(-1) == false and f(n) == true.
715 2 : // Invariant: f(index-1) == false, f(upper) == true.
716 2 : upper := i.numRestarts
717 2 : for index < upper {
718 2 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
719 2 : // index ≤ h < upper
720 2 : offset := decodeRestart(i.data[i.restarts+4*h:])
721 2 : // For a restart point, there are 0 bytes shared with the previous key.
722 2 : // The varint encoding of 0 occupies 1 byte.
723 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
724 2 :
725 2 : // Decode the key at that restart point, and compare it to the key
726 2 : // sought. See the comment in readEntry for why we manually inline the
727 2 : // varint decoding.
728 2 : var v1 uint32
729 2 : if a := *((*uint8)(ptr)); a < 128 {
730 2 : v1 = uint32(a)
731 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
732 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
733 0 : v1 = uint32(b)<<7 | uint32(a)
734 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
735 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
736 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
737 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
738 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
739 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
740 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
741 0 : } else {
742 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
743 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
744 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
745 0 : }
746 :
747 2 : if *((*uint8)(ptr)) < 128 {
748 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
749 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
750 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
751 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
752 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
753 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
754 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
755 0 : } else {
756 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
757 0 : }
758 :
759 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
760 : // speedup on BlockIter benchmarks.
761 2 : s := getBytes(ptr, int(v1))
762 2 : var k []byte
763 2 : if n := len(s) - 8; n >= 0 {
764 2 : k = s[:n:n]
765 2 : }
766 : // Else k is invalid, and left as nil
767 :
768 2 : if i.cmp(searchKey, k) > 0 {
769 2 : // The search key is greater than the user key at this restart point.
770 2 : // Search beyond this restart point, since we are trying to find the
771 2 : // first restart point with a user key >= the search key.
772 2 : index = h + 1 // preserves f(i-1) == false
773 2 : } else {
774 2 : // k >= search key, so prune everything after index (since index
775 2 : // satisfies the property we are looking for).
776 2 : upper = h // preserves f(j) == true
777 2 : }
778 : }
779 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
780 : // => answer is index.
781 : }
782 :
783 2 : if index == 0 {
784 2 : if i.transforms.SyntheticSuffix.IsSet() {
785 2 : // The binary search was conducted on keys without suffix replacement,
786 2 : // implying the first key in the block may be less than the search key. To
787 2 : // double check, get the first key in the block with suffix replacement
788 2 : // and compare to the search key. Consider the following example: suppose
789 2 : // the user searches with a@3, the first key in the block is a@2 and the
790 2 : // block contains a suffix replacement rule of 4. Since a@3 sorts before
791 2 : // a@2, the binary search would return index==0. Without conducting the
792 2 : // suffix replacement, the SeekLT would incorrectly return nil. With
793 2 : // suffix replacement though, a@4 should be returned as a@4 sorts before
794 2 : // a@3.
795 2 : ikv := i.First()
796 2 : if i.cmp(ikv.K.UserKey, key) < 0 {
797 2 : return ikv
798 2 : }
799 : }
800 : // If index == 0 then all keys in this block are larger than the key
801 : // sought, so there is no match.
802 2 : i.offset = -1
803 2 : i.nextOffset = 0
804 2 : return nil
805 : }
806 :
807 : // INVARIANT: index > 0
808 :
809 : // Ignoring suffix replacement, index is the first restart point with key >=
810 : // search key. Define the keys between a restart point and the next restart
811 : // point as belonging to that restart point. Note that index could be equal to
812 : // i.numRestarts, i.e., we are past the last restart. Since keys are strictly
813 : // increasing, then the restart point at index-1 will be the first one that
814 : // has some keys belonging to it that are less than the search key.
815 : //
816 : // Next, we will search between the restart at index-1 and the restart point
817 : // at index, for the first key >= key, and then on finding it, return
818 : // i.Prev(). We need to know when we have hit the offset for index, since then
819 : // we can stop searching. targetOffset encodes that offset for index.
820 2 : targetOffset := i.restarts
821 2 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
822 2 : if index < i.numRestarts {
823 2 : targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
824 2 :
825 2 : if i.transforms.SyntheticSuffix.IsSet() {
826 1 : // The binary search was conducted on keys without suffix replacement,
827 1 : // implying the returned restart point (index) may be less than the search
828 1 : // key, breaking the assumption described above.
829 1 : //
830 1 : // For example, consider this block with a replacement ts of 4 and a
831 1 : // restart interval of 1: pre replacement: a@3,b@2,c@3; post
832 1 : // replacement: a@4,b@4,c@4.
833 1 : //
834 1 : // Suppose the client calls SeekLT(b@3); SeekLT must return b@4.
835 1 : //
836 1 : // If the client calls SeekLT(b@3), the binary search would return b@2,
837 1 : // the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
838 1 : // begin forward iteration from a@3, the previous restart point, to
839 1 : // b{suffix}. The iteration stops when it encounters a key geq to the
840 1 : // search key or if it reaches the upper bound. Without suffix
841 1 : // replacement, we can assume that the upper bound of this forward
842 1 : // iteration, b{suffix}, is greater than the search key, as implied by the
843 1 : // binary search.
844 1 : //
845 1 : // If we naively hold this assumption with suffix replacement, the
846 1 : // iteration would terminate at the upper bound, b@4, call i.Prev, and
847 1 : // incorrectly return a@4. To correct for this, if the original returned
848 1 : // index is less than the search key, shift our forward iteration to begin
849 1 : // at index instead of index -1. With suffix replacement the key at index
850 1 : // is guaranteed to be the highest restart point less than the search key
851 1 : // (i.e. the same property of index-1 for a block without suffix
852 1 : // replacement). This property holds because of the invariant that a block
853 1 : // with suffix replacement will not have two keys that share the same
854 1 : // prefix. To consider the above example, binary searching with b@3 landed
855 1 : // naively at a@3, but since b@4<b@3, we shift our forward iteration to
856 1 : // begin at b@4. We never need to shift by more than one restart point
857 1 : // (i.e. to c@4) because it's impossible for the search key to be greater
858 1 : // than the key at the next restart point in the block because that
859 1 : // key will always have a different prefix. Put another way, because no
860 1 : // key in the block shares the same prefix, naive binary search should
861 1 : // always land at most 1 restart point off the correct one.
862 1 :
863 1 : naiveOffset := i.offset
864 1 : // Shift up to the original binary search result and decode the key.
865 1 : i.offset = targetOffset
866 1 : i.readEntry()
867 1 : i.decodeInternalKey(i.key)
868 1 : i.maybeReplaceSuffix()
869 1 :
870 1 : // If the binary search point is actually less than the search key, post
871 1 : // replacement, bump the target offset.
872 1 : if i.cmp(i.ikv.K.UserKey, key) < 0 {
873 1 : i.offset = targetOffset
874 1 : if index+1 < i.numRestarts {
875 1 : // if index+1 is within the i.data bounds, use it to find the target
876 1 : // offset.
877 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index+1):])
878 1 : } else {
879 1 : targetOffset = i.restarts
880 1 : }
881 1 : } else {
882 1 : i.offset = naiveOffset
883 1 : }
884 : }
885 : }
886 :
887 : // Init nextOffset for the forward iteration below.
888 2 : i.nextOffset = i.offset
889 2 :
890 2 : for {
891 2 : i.offset = i.nextOffset
892 2 : i.readEntry()
893 2 : // When hidden keys are common, there is additional optimization possible
894 2 : // by not caching entries that are hidden (note that some calls to
895 2 : // cacheEntry don't decode the internal key before caching, but checking
896 2 : // whether a key is hidden does not require full decoding). However, we do
897 2 : // need to use the blockEntry.offset in the cache for the first entry at
898 2 : // the restart point to do the binary search when the cache is empty -- so
899 2 : // we would need to cache that first entry (though not the key) even if it
900 2 : // was hidden. Our current assumption is that if there are large numbers
901 2 : // of hidden keys we will be able to skip whole blocks (using block
902 2 : // property filters) so we don't bother optimizing.
903 2 : hiddenPoint := i.decodeInternalKey(i.key)
904 2 : i.maybeReplaceSuffix()
905 2 :
906 2 : // NB: we don't use the hiddenPoint return value of decodeInternalKey
907 2 : // since we want to stop as soon as we reach a key >= ikey.UserKey, so
908 2 : // that we can reverse.
909 2 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
910 2 : // The current key is greater than or equal to our search key. Back up to
911 2 : // the previous key which was less than our search key. Note that this for
912 2 : // loop will execute at least once with this if-block not being true, so
913 2 : // the key we are backing up to is the last one this loop cached.
914 2 : return i.Prev()
915 2 : }
916 :
917 2 : if i.nextOffset >= targetOffset {
918 2 : // We've reached the end of the current restart block. Return the
919 2 : // current key if not hidden, else call Prev().
920 2 : //
921 2 : // When the restart interval is 1, the first iteration of the for loop
922 2 : // will bring us here. In that case ikey is backed by the block so we
923 2 : // get the desired key stability guarantee for the lifetime of the
924 2 : // blockIter. That is, we never cache anything and therefore never
925 2 : // return a key backed by cachedBuf.
926 2 : if hiddenPoint {
927 2 : return i.Prev()
928 2 : }
929 2 : break
930 : }
931 2 : i.cacheEntry()
932 : }
933 :
934 2 : if !i.valid() {
935 2 : return nil
936 2 : }
937 2 : if !i.lazyValueHandling.hasValuePrefix ||
938 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
939 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
940 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
941 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
942 2 : } else {
943 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
944 2 : }
945 2 : return &i.ikv
946 : }
947 :
948 : // First implements internalIterator.First, as documented in the pebble
949 : // package.
950 2 : func (i *blockIter) First() *base.InternalKV {
951 2 : if invariants.Enabled && i.isDataInvalidated() {
952 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
953 : }
954 :
955 2 : i.offset = 0
956 2 : if !i.valid() {
957 2 : return nil
958 2 : }
959 2 : i.clearCache()
960 2 : i.readEntry()
961 2 : hiddenPoint := i.decodeInternalKey(i.key)
962 2 : if hiddenPoint {
963 2 : return i.Next()
964 2 : }
965 2 : i.maybeReplaceSuffix()
966 2 : if !i.lazyValueHandling.hasValuePrefix ||
967 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
968 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
969 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
970 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
971 2 : } else {
972 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
973 2 : }
974 2 : return &i.ikv
975 : }
976 :
977 2 : func decodeRestart(b []byte) int32 {
978 2 : _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
979 2 : return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
980 2 : uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
981 2 : }
982 :
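// Each restart is a 4-byte little-endian offset stored in the block's tail,
// with the most significant bit of the high byte borrowed for the
// set-has-same-prefix flag used by nextPrefixV3; decodeRestart masks that bit
// off. A sketch of the same decoding using encoding/binary (h illustrative):
//
//	word := binary.LittleEndian.Uint32(i.data[i.restarts+4*h:])
//	offset := int32(word &^ (1 << 31)) // what decodeRestart returns
//	samePrefix := word&(1<<31) != 0    // consumed by nextPrefixV3
//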
983 : // Last implements internalIterator.Last, as documented in the pebble package.
984 2 : func (i *blockIter) Last() *base.InternalKV {
985 2 : if invariants.Enabled && i.isDataInvalidated() {
986 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
987 : }
988 :
989 : // Seek forward from the last restart point.
990 2 : i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
991 2 : if !i.valid() {
992 2 : return nil
993 2 : }
994 :
995 2 : i.readEntry()
996 2 : i.clearCache()
997 2 :
998 2 : for i.nextOffset < i.restarts {
999 2 : i.cacheEntry()
1000 2 : i.offset = i.nextOffset
1001 2 : i.readEntry()
1002 2 : }
1003 :
1004 2 : hiddenPoint := i.decodeInternalKey(i.key)
1005 2 : if hiddenPoint {
1006 2 : return i.Prev()
1007 2 : }
1008 2 : i.maybeReplaceSuffix()
1009 2 : if !i.lazyValueHandling.hasValuePrefix ||
1010 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1011 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1012 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1013 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1014 2 : } else {
1015 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1016 2 : }
1017 2 : return &i.ikv
1018 : }
1019 :
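// Reverse iteration sketch: Last seeks to the final restart point and scans
// forward, caching the prefix-decompressed entries in cached/cachedBuf so
// that Prev (used above) can walk them backwards without re-decoding. process
// is a hypothetical caller-side function:
//
//	for kv := it.Last(); kv != nil; kv = it.Prev() {
//		process(kv)
//	}
//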
1020 : // Next implements internalIterator.Next, as documented in the pebble
1021 : // package.
1022 2 : func (i *blockIter) Next() *base.InternalKV {
1023 2 : if len(i.cachedBuf) > 0 {
1024 2 : // We're switching from reverse iteration to forward iteration. We need to
1025 2 : // populate i.fullKey with the current key we're positioned at so that
1026 2 : // readEntry() can use i.fullKey for key prefix decompression. Note that we
1027 2 : // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
1028 2 : // SeekLT was the previous call, i.key may be backed by i.fullKey), but
1029 2 : // copying into i.fullKey works for both cases.
1030 2 : //
1031 2 : // TODO(peter): Rather than clearing the cache, we could instead use the
1032 2 : // cache until it is exhausted. This would likely be faster than falling
1033 2 : // through to the normal forward iteration code below.
1034 2 : i.fullKey = append(i.fullKey[:0], i.key...)
1035 2 : i.clearCache()
1036 2 : }
1037 :
1038 : start:
1039 2 : i.offset = i.nextOffset
1040 2 : if !i.valid() {
1041 2 : return nil
1042 2 : }
1043 2 : i.readEntry()
1044 2 : // Manually inlined version of i.decodeInternalKey(i.key).
1045 2 : if n := len(i.key) - 8; n >= 0 {
1046 2 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1047 2 : hiddenPoint := i.transforms.HideObsoletePoints &&
1048 2 : (trailer&trailerObsoleteBit != 0)
1049 2 : i.ikv.K.Trailer = trailer & trailerObsoleteMask
1050 2 : i.ikv.K.UserKey = i.key[:n:n]
1051 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1052 2 : i.ikv.K.SetSeqNum(uint64(n))
1053 2 : }
1054 2 : if hiddenPoint {
1055 2 : goto start
1056 : }
1057 2 : if i.transforms.SyntheticSuffix.IsSet() {
1058 2 : // Inlined version of i.maybeReplaceSuffix()
1059 2 : prefixLen := i.split(i.ikv.K.UserKey)
1060 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1061 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1062 2 : i.ikv.K.UserKey = i.synthSuffixBuf
1063 2 : }
1064 0 : } else {
1065 0 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
1066 0 : i.ikv.K.UserKey = nil
1067 0 : }
1068 2 : if !i.lazyValueHandling.hasValuePrefix ||
1069 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1070 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1071 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1072 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1073 2 : } else {
1074 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1075 2 : }
1076 2 : return &i.ikv
1077 : }
1078 :
1079 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1080 2 : func (i *blockIter) NextPrefix(succKey []byte) *base.InternalKV {
1081 2 : if i.lazyValueHandling.hasValuePrefix {
1082 2 : return i.nextPrefixV3(succKey)
1083 2 : }
1084 2 : const nextsBeforeSeek = 3
1085 2 : kv := i.Next()
1086 2 : for j := 1; kv != nil && i.cmp(kv.K.UserKey, succKey) < 0; j++ {
1087 2 : if j >= nextsBeforeSeek {
1088 2 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1089 2 : }
1090 2 : kv = i.Next()
1091 : }
1092 2 : return kv
1093 : }
1094 :
1095 2 : func (i *blockIter) nextPrefixV3(succKey []byte) *base.InternalKV {
1096 2 : // Doing nexts that involve a key comparison can be expensive (and the cost
1097 2 : // depends on the key length), so we use the same threshold of 3 that we use
1098 2 : // for TableFormatPebblev2 in blockIter.nextPrefix above. The next fast path
1099 2 : // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
1100 2 : // than doing a SeekGE within the block, so we do this 16 times
1101 2 : // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
1102 2 : // search for the restart consumes > 100ns. If the number of versions is >
1103 2 : // 17, we will increment nextFastCount to 17, then do a binary search, and
1104 2 : // on average need to find a key between two restarts, so another 8 steps
1105 2 : // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
1106 2 : // steps.
1107 2 : //
1108 2 : // TODO(sumeer): use the configured restartInterval for the sstable when it
1109 2 : // was written (which we don't currently store) instead of the default value
1110 2 : // of 16.
1111 2 : const nextCmpThresholdBeforeSeek = 3
1112 2 : const nextFastThresholdBeforeRestarts = 16
1113 2 : nextCmpCount := 0
1114 2 : nextFastCount := 0
1115 2 : usedRestarts := false
1116 2 : // INVARIANT: blockIter is valid.
1117 2 : if invariants.Enabled && !i.valid() {
1118 0 : panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
1119 : }
1120 2 : prevKeyIsSet := i.ikv.Kind() == InternalKeyKindSet
1121 2 : for {
1122 2 : i.offset = i.nextOffset
1123 2 : if !i.valid() {
1124 2 : return nil
1125 2 : }
1126 : // Need to decode the length integers, so we can compute nextOffset.
1127 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1128 2 : // This is an ugly performance hack. Reading entries from blocks is one of
1129 2 : // the inner-most routines and decoding the 3 varints per-entry takes
1130 2 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
1131 2 : // us, so we do it manually. This provides a 10-15% performance improvement
1132 2 : // on blockIter benchmarks on both go1.11 and go1.12.
1133 2 : //
1134 2 : // TODO(peter): remove this hack if go:inline is ever supported.
1135 2 :
1136 2 : // Decode the shared key length integer.
1137 2 : var shared uint32
1138 2 : if a := *((*uint8)(ptr)); a < 128 {
1139 2 : shared = uint32(a)
1140 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1141 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1142 0 : shared = uint32(b)<<7 | uint32(a)
1143 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1144 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1145 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1146 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1147 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1148 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1149 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1150 0 : } else {
1151 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1152 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1153 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1154 0 : }
1155 : // Decode the unshared key length integer.
1156 2 : var unshared uint32
1157 2 : if a := *((*uint8)(ptr)); a < 128 {
1158 2 : unshared = uint32(a)
1159 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1160 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1161 0 : unshared = uint32(b)<<7 | uint32(a)
1162 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1163 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1164 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1165 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1166 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1167 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1168 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1169 0 : } else {
1170 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1171 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1172 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1173 0 : }
1174 : // Decode the value length integer.
1175 2 : var value uint32
1176 2 : if a := *((*uint8)(ptr)); a < 128 {
1177 2 : value = uint32(a)
1178 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1179 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1180 0 : value = uint32(b)<<7 | uint32(a)
1181 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1182 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1183 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1184 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1185 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1186 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1187 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1188 0 : } else {
1189 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1190 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1191 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1192 0 : }
1193 2 : if i.transforms.SyntheticPrefix != nil {
1194 0 : shared += uint32(len(i.transforms.SyntheticPrefix))
1195 0 : }
1196 : // The starting position of the value.
1197 2 : valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1198 2 : i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
1199 2 : if invariants.Enabled && unshared < 8 {
1200 0 : // This should not happen since only the key prefix is shared, so even
1201 0 : // if the prefix length is the same as the user key length, the unshared
1202 0 : // will include the trailer.
1203 0 : panic(errors.AssertionFailedf("unshared %d is too small", unshared))
1204 : }
1205 : // The trailer is written in little endian, so the key kind is the first
1206 : // byte in the trailer that is encoded in the slice [unshared-8:unshared].
1207 2 : keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
1208 2 : keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
1209 2 : prefixChanged := false
1210 2 : if keyKind == InternalKeyKindSet {
1211 2 : if invariants.Enabled && value == 0 {
1212 0 : panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
1213 : }
1214 2 : valPrefix := *((*valuePrefix)(valuePtr))
1215 2 : if setHasSamePrefix(valPrefix) {
1216 2 : // Fast-path. No need to assemble i.fullKey, or update i.key. We know
1217 2 : // that subsequent keys will not have a shared length that is greater
1218 2 : // than the prefix of the current key, which is also the prefix of
1219 2 : // i.key. Since we are continuing to iterate, we don't need to
1220 2 : // initialize i.ikey and i.lazyValue (these are initialized before
1221 2 : // returning).
1222 2 : nextFastCount++
1223 2 : if nextFastCount > nextFastThresholdBeforeRestarts {
1224 1 : if usedRestarts {
1225 0 : // Exhausted iteration budget. This will never happen unless
1226 0 : // someone is using a restart interval > 16. It is just to guard
1227 0 : // against long restart intervals causing too much iteration.
1228 0 : break
1229 : }
1230 : // Haven't used restarts yet, so find the first restart at or beyond
1231 : // the current offset.
1232 1 : targetOffset := i.offset
1233 1 : var index int32
1234 1 : {
1235 1 : // NB: manually inlined sort.Search is ~5% faster.
1236 1 : //
1237 1 : // f defined for a restart point is true iff the offset >=
1238 1 : // targetOffset.
1239 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1240 1 : // Invariant: f(index-1) == false, f(upper) == true.
1241 1 : upper := i.numRestarts
1242 1 : for index < upper {
1243 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1244 1 : // index ≤ h < upper
1245 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1246 1 : if offset < targetOffset {
1247 1 : index = h + 1 // preserves f(index-1) == false
1248 1 : } else {
1249 1 : upper = h // preserves f(upper) == true
1250 1 : }
1251 : }
1252 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1253 : // => answer is index.
1254 : }
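// The scope above is a hand-inlined form of the following, shown only as an
// illustrative equivalent:
//
//	index = int32(sort.Search(int(i.numRestarts), func(j int) bool {
//		return decodeRestart(i.data[i.restarts+4*int32(j):]) >= targetOffset
//	}))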
1255 1 : usedRestarts = true
1256 1 : nextFastCount = 0
1257 1 : if index == i.numRestarts {
1258 1 : // Already past the last real restart, so iterate a bit more until
1259 1 : // we are done with the block.
1260 1 : continue
1261 : }
1262 : // Have some real restarts after index. NB: index is the first
1263 : // restart at or beyond the current offset.
1264 1 : startingIndex := index
1265 1 : for index != i.numRestarts &&
1266 1 : // The restart at index is 4 bytes written in little endian format
1267 1 : // starting at i.restarts+4*index. The 0th byte is the least
1268 1 : // significant and the 3rd byte is the most significant. Since the
1269 1 : // most significant bit of the 3rd byte is what we use for
1270 1 : // encoding the set-has-same-prefix information, the indexing
1271 1 : // below has +3.
1272 1 : i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
1273 1 : // We still have the same prefix, so move to the next restart.
1274 1 : index++
1275 1 : }
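// A sketch of the same check on the whole 32-bit restart word, under the
// assumption that the set-has-same-prefix flag occupies the most significant
// bit of the little-endian encoded restart:
//
//	word := binary.LittleEndian.Uint32(i.data[i.restarts+4*index:])
//	sameSetPrefix := word&(1<<31) != 0
//	offset := word &^ (1 << 31) // the offset decodeRestart recovers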
1276 : // index is the first restart that did not have the same prefix.
1277 1 : if index != startingIndex {
1278 1 : // Managed to skip past at least one restart. Resume iteration
1279 1 : // from index-1. Since nextFastCount has been reset to 0, we
1280 1 : // should be able to iterate to the next prefix.
1281 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1282 1 : i.readEntry()
1283 1 : }
1284 : // Else, unable to skip past any restart. Resume iteration. Since
1285 : // nextFastCount has been reset to 0, we should be able to iterate
1286 : // to the next prefix.
1287 1 : continue
1288 : }
1289 2 : continue
1290 2 : } else if prevKeyIsSet {
1291 2 : prefixChanged = true
1292 2 : }
1293 2 : } else {
1294 2 : prevKeyIsSet = false
1295 2 : }
1296 : // Slow-path cases:
1297 : // - (Likely) The prefix has changed.
1298 : // - (Unlikely) The prefix has not changed.
1299 : // We assemble the key etc. under the assumption that it is the likely
1300 : // case.
1301 2 : unsharedKey := getBytes(ptr, int(unshared))
1302 2 : // TODO(sumeer): move this into the else block below. This is a bit tricky
1303 2 : // since the current logic assumes we have always copied the latest key
1304 2 : // into fullKey, which is why when we get to the next key we can (a)
1305 2 : // access i.fullKey[:shared], (b) append only the unsharedKey to
1306 2 : // i.fullKey. For (a), we can access i.key[:shared] since that memory is
1307 2 : // valid (even if unshared). For (b), we will need to remember whether
1308 2 : // i.key refers to i.fullKey or not, and can append the unsharedKey only
1309 2 : // in the former case and for the latter case need to copy the shared part
1310 2 : // too. This same comment applies to the other place where we can do this
1311 2 : // optimization, in readEntry().
1312 2 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
1313 2 : i.val = getBytes(valuePtr, int(value))
1314 2 : if shared == 0 {
1315 2 : // Provide stability for the key across positioning calls if the key
1316 2 : // doesn't share a prefix with the previous key. This removes the need to
1317 2 : // copy the key if the caller knows the block has a restart interval of
1318 2 : // 1. An important example of this is range-del blocks.
1319 2 : i.key = unsharedKey
1320 2 : } else {
1321 2 : i.key = i.fullKey
1322 2 : }
1323 : // Manually inlined version of i.decodeInternalKey(i.key).
1324 2 : hiddenPoint := false
1325 2 : if n := len(i.key) - 8; n >= 0 {
1326 2 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1327 2 : hiddenPoint = i.transforms.HideObsoletePoints &&
1328 2 : (trailer&trailerObsoleteBit != 0)
1329 2 : i.ikv.K = base.InternalKey{
1330 2 : Trailer: trailer & trailerObsoleteMask,
1331 2 : UserKey: i.key[:n:n],
1332 2 : }
1333 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1334 1 : i.ikv.K.SetSeqNum(uint64(n))
1335 1 : }
1336 2 : if i.transforms.SyntheticSuffix.IsSet() {
1337 0 : // Inlined version of i.maybeReplaceSuffix()
1338 0 : prefixLen := i.split(i.ikv.K.UserKey)
1339 0 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1340 0 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1341 0 : i.ikv.K.UserKey = i.synthSuffixBuf
1342 0 : }
1343 0 : } else {
1344 0 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
1345 0 : i.ikv.K.UserKey = nil
1346 0 : }
1347 2 : nextCmpCount++
1348 2 : if invariants.Enabled && prefixChanged && i.cmp(i.ikv.K.UserKey, succKey) < 0 {
1349 0 : panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
1350 0 : i.ikv.K.UserKey, succKey))
1351 : }
1352 2 : if prefixChanged || i.cmp(i.ikv.K.UserKey, succKey) >= 0 {
1353 2 : // Prefix has changed.
1354 2 : if hiddenPoint {
1355 1 : return i.Next()
1356 1 : }
1357 2 : if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
1358 0 : panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
1359 : }
1360 2 : if base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1361 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1362 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1363 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1364 2 : } else {
1365 0 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1366 0 : }
1367 2 : return &i.ikv
1368 : }
1369 : // Else prefix has not changed.
1370 :
1371 2 : if nextCmpCount >= nextCmpThresholdBeforeSeek {
1372 1 : break
1373 : }
1374 : }
1375 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1376 : }
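// A rough usage sketch of the prefix-skipping fast path above, from the
// perspective of an enclosing iterator (the names below are illustrative
// assumptions, not part of this package's API):
//
//	kv := it.SeekGE(key, base.SeekGEFlagsNone)
//	// succKey must be the immediate successor of the current key's prefix,
//	// e.g. produced via the comparer's ImmediateSuccessor.
//	kv = it.NextPrefix(succKey)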
1377 :
1378 : // Prev implements internalIterator.Prev, as documented in the pebble
1379 : // package.
1380 2 : func (i *blockIter) Prev() *base.InternalKV {
1381 2 : start:
1382 2 : for n := len(i.cached) - 1; n >= 0; n-- {
1383 2 : i.nextOffset = i.offset
1384 2 : e := &i.cached[n]
1385 2 : i.offset = e.offset
1386 2 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1387 2 : // Manually inlined version of i.decodeInternalKey(i.key).
1388 2 : i.key = i.cachedBuf[e.keyStart:e.keyEnd]
1389 2 : if n := len(i.key) - 8; n >= 0 {
1390 2 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1391 2 : hiddenPoint := i.transforms.HideObsoletePoints &&
1392 2 : (trailer&trailerObsoleteBit != 0)
1393 2 : if hiddenPoint {
1394 2 : continue
1395 : }
1396 2 : i.ikv.K = base.InternalKey{
1397 2 : Trailer: trailer & trailerObsoleteMask,
1398 2 : UserKey: i.key[:n:n],
1399 2 : }
1400 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1401 2 : i.ikv.K.SetSeqNum(uint64(n))
1402 2 : }
1403 2 : if i.transforms.SyntheticSuffix.IsSet() {
1404 2 : // Inlined version of i.maybeReplaceSuffix()
1405 2 : prefixLen := i.split(i.ikv.K.UserKey)
1406 2 : // If the key is cached or may get cached, we must copy UserKey
1407 2 : // before suffix replacement rather than modifying it in place.
1408 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1409 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1410 2 : i.ikv.K.UserKey = i.synthSuffixBuf
1411 2 : }
1412 0 : } else {
1413 0 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
1414 0 : i.ikv.K.UserKey = nil
1415 0 : }
1416 2 : i.cached = i.cached[:n]
1417 2 : if !i.lazyValueHandling.hasValuePrefix ||
1418 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1419 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1420 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1421 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1422 2 : } else {
1423 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1424 2 : }
1425 2 : return &i.ikv
1426 : }
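// Each cached element records where the entry's key and value bytes live; a
// rough sketch of the shape of these entries (the struct name and field types
// are assumptions for illustration; the field names appear in the loop above):
//
//	type blockEntry struct {
//		offset   int32 // offset of the entry within the block
//		keyStart int32 // start of the key within i.cachedBuf
//		keyEnd   int32 // end of the key within i.cachedBuf
//		valStart int32 // offset of the value within the block data
//		valSize  int32 // length of the value in bytes
//	}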
1427 :
1428 2 : i.clearCache()
1429 2 : if i.offset <= 0 {
1430 2 : i.offset = -1
1431 2 : i.nextOffset = 0
1432 2 : return nil
1433 2 : }
1434 :
1435 2 : targetOffset := i.offset
1436 2 : var index int32
1437 2 :
1438 2 : {
1439 2 : // NB: manually inlined sort.Search is ~5% faster.
1440 2 : //
1441 2 : // Define f(-1) == false and f(n) == true.
1442 2 : // Invariant: f(index-1) == false, f(upper) == true.
1443 2 : upper := i.numRestarts
1444 2 : for index < upper {
1445 2 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1446 2 : // index ≤ h < upper
1447 2 : offset := decodeRestart(i.data[i.restarts+4*h:])
1448 2 : if offset < targetOffset {
1449 2 : // Looking for the first restart that has offset >= targetOffset, so
1450 2 : // ignore h and earlier.
1451 2 : index = h + 1 // preserves f(index-1) == false
1452 2 : } else {
1453 2 : upper = h // preserves f(upper) == true
1454 2 : }
1455 : }
1456 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1457 : // => answer is index.
1458 : }
1459 :
1460 : // index is the first restart with offset >= targetOffset. Note that
1461 : // targetOffset may not be at a restart point since one can call Prev()
1462 : // after Next() (so the cache was not populated) and targetOffset refers to
1463 : // the current entry. index-1 must have an offset < targetOffset (it can't
1464 : // be equal to targetOffset since the binary search would have selected that
1465 : // as the index).
1466 2 : i.offset = 0
1467 2 : if index > 0 {
1468 2 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1469 2 : }
1470 : // TODO(sumeer): why is the else case not an error, given that targetOffset
1471 : // is a valid offset?
1472 :
1473 2 : i.readEntry()
1474 2 :
1475 2 : // We stop when i.nextOffset == targetOffset since targetOffset is the
1476 2 : // offset of the entry we are stepping back from. We don't need to cache
1477 2 : // the entry just before it, since it is the candidate to return.
1478 2 : for i.nextOffset < targetOffset {
1479 2 : i.cacheEntry()
1480 2 : i.offset = i.nextOffset
1481 2 : i.readEntry()
1482 2 : }
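// In outline, this slow path of Prev (reached once the cache of previously
// read entries is exhausted) is:
//
//	1. Binary search the restart array for the first restart whose offset is
//	   >= targetOffset, the offset of the entry we are stepping back from.
//	2. Jump to the restart before it (or to offset 0) and read forward,
//	   caching each entry while i.nextOffset < targetOffset.
//	3. The final entry read is the predecessor of targetOffset; it is decoded
//	   and returned below without being cached.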
1483 :
1484 2 : hiddenPoint := i.decodeInternalKey(i.key)
1485 2 : if hiddenPoint {
1486 2 : // Use the cache.
1487 2 : goto start
1488 : }
1489 2 : if i.transforms.SyntheticSuffix.IsSet() {
1490 2 : // Inlined version of i.maybeReplaceSuffix()
1491 2 : prefixLen := i.split(i.ikv.K.UserKey)
1492 2 : // If ikey is cached or may get cached, we must de-reference
1493 2 : // UserKey before suffix replacement.
1494 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1495 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1496 2 : i.ikv.K.UserKey = i.synthSuffixBuf
1497 2 : }
1498 2 : if !i.lazyValueHandling.hasValuePrefix ||
1499 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1500 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1501 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1502 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1503 2 : } else {
1504 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1505 2 : }
1506 2 : return &i.ikv
1507 : }
1508 :
1509 : // Key returns the internal key at the current iterator position.
1510 2 : func (i *blockIter) Key() *InternalKey {
1511 2 : return &i.ikv.K
1512 2 : }
1513 :
1514 : // KV returns the internal KV at the current iterator position.
1515 2 : func (i *blockIter) KV() *base.InternalKV {
1516 2 : return &i.ikv
1517 2 : }
1518 :
1519 2 : func (i *blockIter) value() base.LazyValue {
1520 2 : return i.ikv.V
1521 2 : }
1522 :
1523 : // Error implements internalIterator.Error, as documented in the pebble
1524 : // package.
1525 2 : func (i *blockIter) Error() error {
1526 2 : return nil // infallible
1527 2 : }
1528 :
1529 : // Close implements internalIterator.Close, as documented in the pebble
1530 : // package.
1531 2 : func (i *blockIter) Close() error {
1532 2 : i.handle.Release()
1533 2 : i.handle = bufferHandle{}
1534 2 : i.val = nil
1535 2 : i.ikv = base.InternalKV{}
1536 2 : i.lazyValueHandling.vbr = nil
1537 2 : return nil
1538 2 : }
1539 :
1540 0 : func (i *blockIter) SetBounds(lower, upper []byte) {
1541 0 : // This should never be called as bounds are handled by sstable.Iterator.
1542 0 : panic("pebble: SetBounds unimplemented")
1543 : }
1544 :
1545 0 : func (i *blockIter) SetContext(_ context.Context) {}
1546 :
1547 2 : func (i *blockIter) valid() bool {
1548 2 : return i.offset >= 0 && i.offset < i.restarts
1549 2 : }